diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 9db0e09e6..c1b03d002 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -507,11 +507,14 @@ public class GenomeAnalysisEngine { ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals, Integer maxIterations) { - final long SHARD_SIZE = 100000L; + long SHARD_SIZE = 100000L; ShardStrategy shardStrategy = null; ShardStrategyFactory.SHATTER_STRATEGY shardType; + if (walker instanceof LocusWalker) { + if ( walker instanceof RodWalker ) SHARD_SIZE *= 100; + if (intervals != null) { shardType = (walker.isReduceByInterval()) ? ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : @@ -525,7 +528,6 @@ public class GenomeAnalysisEngine { shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, maxIterations); - } else if (walker instanceof ReadWalker || walker instanceof DuplicateWalker) { diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java index 58c5486aa..d89708bbc 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusView.java @@ -22,19 +22,6 @@ import org.apache.log4j.Logger; */ public class CoveredLocusView extends LocusView { - /** - * Gets the position to which the last seek was requested. - */ - private GenomeLoc seekPoint; - - /** - * What's the context for the last locus accessed? - * @param provider - */ - private AlignmentContext nextLocusContext = null; - - private static Logger logger = Logger.getLogger(CoveredLocusView.class); - /** * Create a new queue of locus contexts. * @param provider diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java similarity index 92% rename from java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java rename to java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index c9e2a0263..2da895d01 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -25,11 +25,11 @@ import java.util.Collections; /** * A view into the reference-ordered data in the provider. */ -public class ReferenceOrderedView implements View { +public class ManagingReferenceOrderedView implements ReferenceOrderedView { /** * The provider that's supplying our backing data. */ - private final ShardDataProvider provider; + //private final ShardDataProvider provider; /** * The data sources along with their current states. @@ -40,8 +40,8 @@ public class ReferenceOrderedView implements View { * Create a new view of reference-ordered data. * @param provider */ - public ReferenceOrderedView( ShardDataProvider provider ) { - this.provider = provider; + public ManagingReferenceOrderedView( ShardDataProvider provider ) { + //this.provider = provider; for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) ); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java new file mode 100644 index 000000000..d9483d2bc --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -0,0 +1,220 @@ +package org.broadinstitute.sting.gatk.datasources.providers; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RODIterator; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MergingIterator; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.*; + +import net.sf.samtools.SAMRecord; +/** + * User: hanna + * Date: May 21, 2009 + * Time: 2:49:17 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A view into the reference-ordered data in the provider. + */ +public class RodLocusView extends LocusView implements ReferenceOrderedView { + /** + * The provider that's supplying our backing data. + */ + //private final ShardDataProvider provider; + + /** + * The data sources along with their current states. + */ + private MergingIterator rodQueue = null; + + RefMetaDataTracker tracker = null; + GenomeLoc lastLoc = null; + ReferenceOrderedDatum interval = null; + + // broken support for multi-locus rods + //List multiLocusRODs = new LinkedList(); + + /** + * Enable debugging output -- todo remove me + */ + final static boolean DEBUG = false; + + final static String INTERVAL_ROD_NAME = "interval"; + + /** + * Create a new view of reference-ordered data. + * + * @param provider + */ + public RodLocusView( ShardDataProvider provider ) { + super(provider); + + GenomeLoc loc = provider.getShard().getGenomeLoc(); + + List> iterators = new LinkedList>(); + for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { + if ( DEBUG ) System.out.printf("Shard is %s%n", loc); + RODIterator it = (RODIterator)dataSource.seek(provider.getShard()); + ReferenceOrderedDatum x = it.seekForward(loc); + + // we need to special case the interval so we don't always think there's a rod at the first location + if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { + if ( interval != null ) + throw new RuntimeException("BUG: interval local variable already assigned " + interval); + interval = x; + } else { + iterators.add( (Iterator)it ); + } + } + + rodQueue = new MergingIterator(iterators); + } + + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { + return tracker; + } + + public boolean hasNext() { + if ( ! rodQueue.hasNext() ) + return false; + else { + ReferenceOrderedDatum peeked = rodQueue.peek(); + return ! peeked.getLocation().isPast(shard.getGenomeLoc()); + } + } + + /** + * Returns the next covered locus context in the shard. + * @return Next covered locus context in the shard. + * @throw NoSuchElementException if no such element exists. + */ + public AlignmentContext next() { + if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n"); + ReferenceOrderedDatum datum = rodQueue.next(); + if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation()); + + if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); + + // Update the tracker here for use + Collection allRODsHere = getSpanningRods(datum); + tracker = createTracker(allRODsHere); + + GenomeLoc rodSite = datum.getLocation(); + GenomeLoc site = GenomeLocParser.createGenomeLoc( rodSite.getContigIndex(), rodSite.getStart(), rodSite.getStart()); + + if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site); + + // calculate the number of skipped bases, and update lastLoc so we can do that again in the next() + long skippedBases = getSkippedBases( rodSite ); + lastLoc = site; + return new AlignmentContext(site, new ArrayList(), new ArrayList(), skippedBases); + } + + private RefMetaDataTracker createTracker( Collection allRodsHere ) { + RefMetaDataTracker t = new RefMetaDataTracker(); + for ( ReferenceOrderedDatum element : allRodsHere ) { + if ( ! t.hasROD(element.getName()) ) + t.bind(element.getName(), element); + } + + // special case the interval again -- add it into the ROD + if ( interval != null ) { t.bind(interval.getName(), interval); } + + return t; + } + + private Collection getSpanningRods(ReferenceOrderedDatum marker) { + return rodQueue.allElementsLTE(marker); + } + +// private Collection getSpanningRods(ReferenceOrderedDatum marker) { +// Collection allFromQueue = rodQueue.allElementsLTE(marker); +// Collection all = new LinkedList(allFromQueue); +// +// Iterator it = multiLocusRODs.iterator(); +// while ( it.hasNext() ) { +// ReferenceOrderedDatum mlr = it.next(); +// if ( mlr.getLocation().overlapsP(marker.getLocation()) ) { +// all.add(mlr); +// } else { +// it.remove(); // no long covering this site +// } +// } +// +// for ( ReferenceOrderedDatum datum : allFromQueue ) { +// if ( ! datum.getLocation().isSingleBP() ) { +// System.out.printf("Adding multi-locus %s%n", datum); +// multiLocusRODs.add(datum); +// } +// } +// +// return all; +// } + + private long getSkippedBases( GenomeLoc currentPos ) { + long skippedBases = 0; + + if ( lastLoc == null ) { + // special case -- we're at the start + //System.out.printf("Cur=%s, shard=%s%n", currentPos, shard.getGenomeLoc()); + skippedBases = currentPos.getStart() - shard.getGenomeLoc().getStart(); + } else { + //System.out.printf("Cur=%s, last=%s%n", currentPos, lastLoc); + skippedBases = currentPos.minus(lastLoc) - 1; + } + + if ( skippedBases < -1 ) { // minus 1 value is ok + throw new RuntimeException(String.format("BUG: skipped bases is < 0: cur=%s vs. last=%s", currentPos, lastLoc)); + } + return Math.max(skippedBases, 0); + } + + /** + * Get the location one after the last position we will traverse through + * @return + */ + public GenomeLoc getLocOneBeyondShard() { + return GenomeLocParser.createGenomeLoc( shard.getGenomeLoc().getContigIndex(), + shard.getGenomeLoc().getStop()+1, + shard.getGenomeLoc().getStop()+1); + } + + /** + * How many bases are we skipping from the current location to the end of the interval / shard + * if we have no more elements + * + * @return + */ + public long getLastSkippedBases() { + if ( hasNext() ) + throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining."); + + return getSkippedBases(getLocOneBeyondShard()); + } + + public RefMetaDataTracker getTracker() { + return tracker; + } + + /** + * Closes the current view. + */ + public void close() { + //rodQueue.close(); + + rodQueue = null; + tracker = null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java index a4ac03b19..0f45c90c3 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java @@ -35,6 +35,7 @@ public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatu public abstract GenomeLoc getLocation(); public int compareTo( ReferenceOrderedDatum that ) { + //System.out.printf("Comparing %s to %s => %d%n", getLocation(), that.getLocation(), getLocation().compareTo(that.getLocation())); return getLocation().compareTo(that.getLocation()); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 0c4a25c2c..24abc4106 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -158,7 +158,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria String contig = parts[1]; long start = Long.parseLong(parts[2]) + 1; // The final is 0 based long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based - loc = GenomeLocParser.parseGenomeLoc(contig, start, stop-1); + loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop-1)); name = parts[4]; strand = parts[6]; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java index 1d9038c83..646f0aebd 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -19,6 +20,8 @@ import java.util.ArrayList; * A simple solution to iterating over all reference positions over a series of genomic locations. */ public class TraverseLoci extends TraversalEngine { + final private static String UNIT_STRING = "sites"; + /** * our log, which we want to capture anything from this class @@ -45,32 +48,59 @@ public class TraverseLoci extends TraversalEngine { LocusWalker locusWalker = (LocusWalker)walker; LocusView locusView = getLocusView( walker, dataProvider ); - LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - // We keep processing while the next reference location is within the interval - while( locusView.hasNext() ) { - AlignmentContext locus = locusView.next(); - TraversalStatistics.nRecords++; + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) + throw new RuntimeException("Engine currently doesn't support RodWalkers"); - // Iterate forward to get all reference ordered data covering this locus - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation()); + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; - final boolean keepMeP = locusWalker.filter(tracker, refContext, locus); - if (keepMeP) { - M x = locusWalker.map(tracker, refContext, locus); + LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + // We keep processing while the next reference location is within the interval + while( locusView.hasNext() ) { + AlignmentContext locus = locusView.next(); + + TraversalStatistics.nRecords++; + + // Iterate forward to get all reference ordered data covering this locus + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); + + ReferenceContext refContext = referenceView.getReferenceContext(locus.getLocation()); + + final boolean keepMeP = locusWalker.filter(tracker, refContext, locus); + if (keepMeP) { + M x = locusWalker.map(tracker, refContext, locus); + sum = locusWalker.reduce(x, sum); + } + + if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); + break; + } + + printProgress(UNIT_STRING, locus.getLocation()); + } + } + + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) { + RodLocusView rodLocusView = (RodLocusView)locusView; + long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + // no sense in making the call if you don't have anything interesting to say + AlignmentContext ac = new AlignmentContext(rodLocusView.getLocOneBeyondShard(), null, null, nSkipped); + M x = locusWalker.map(null, null, ac); sum = locusWalker.reduce(x, sum); } - - if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) { - logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); - break; - } - - printProgress("loci", locus.getLocation()); } return sum; @@ -78,12 +108,12 @@ public class TraverseLoci extends TraversalEngine { /** * Temporary override of printOnTraversalDone. - * TODO: Add some sort of TE.getName() function once all TraversalEngines are ported. + * * @param sum Result of the computation. * @param Type of the result. */ public void printOnTraversalDone( T sum ) { - printOnTraversalDone( "loci", sum ); + printOnTraversalDone( UNIT_STRING, sum ); } /** @@ -97,6 +127,8 @@ public class TraverseLoci extends TraversalEngine { return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); else throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java index 80d04cdb3..2ea1f9a4c 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLocusWindows.java @@ -2,10 +2,7 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -39,7 +36,7 @@ public class TraverseLocusWindows extends TraversalEngine { ReadView readView = new ReadView( dataProvider ); LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); AlignmentContext locus = getLocusContext(readView.iterator(), interval); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/CountRodByRefWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/CountRodByRefWalker.java new file mode 100644 index 000000000..9106d527a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/CountRodByRefWalker.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.ArrayList; +import java.util.List; + +/** + * Same as CountRod but this one is a reference walker + */ +public class CountRodByRefWalker extends RefWalker, Long>> { + @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) + public boolean verbose = false; + + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + public boolean showSkipped = false; + + CountRodWalker crw = new CountRodWalker(); + + public void initialize() { + crw.verbose = verbose; + crw.showSkipped = showSkipped; + crw.out = out; + crw.err = err; + } + + public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return crw.map(tracker, ref, context); + } + + public Pair, Long> reduceInit() { + return crw.reduceInit(); + } + + public Pair, Long> reduce(CountRodWalker.Datum point, Pair, Long> sum) { + return crw.reduce(point, sum); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java new file mode 100644 index 000000000..fe778235c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java @@ -0,0 +1,110 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.ArrayList; +import java.util.List; +import java.util.Collection; +import java.util.LinkedList; + +public class CountRodWalker extends RodWalker, Long>> { + @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) + public boolean verbose = false; + + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + public boolean showSkipped = false; + + public class Datum { + public long nRodsAtThisLocation = 0; + public long nSkippedBases =0; + public long nTotalBases = 0; + + public Datum( long nRodsAtThisLocation, long nSkippedBases, long nTotalBases ) { + this.nRodsAtThisLocation = nRodsAtThisLocation; + this.nSkippedBases = nSkippedBases; + this.nTotalBases = nTotalBases; + } + + public String toString() { + return String.format("<%d %d %d>", nRodsAtThisLocation, nSkippedBases, nTotalBases); + } + } + + public Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + GenomeLoc cur = context.getLocation(); + + if ( verbose && showSkipped ) { + for ( int i = 0; i < context.getSkippedBases(); i++ ) { + out.printf("%s: skipped%n", GenomeLocParser.incPos(cur, i - context.getSkippedBases())); + } + } + + long nRodsHere = 0; + long nTotalBases = 0; + + if ( ref == null ) { + // we're getting the last skipped update + if ( verbose ) + out.printf("Last position was %s: skipping %d bases%n", + context.getLocation(), context.getSkippedBases() ); + nRodsHere = -1; // don't update this + nTotalBases = context.getSkippedBases(); + } else { + Collection rods = new LinkedList(); + for ( ReferenceOrderedDatum rod : tracker.getBoundRods() ) { + //System.out.printf("Considering rod %s%n", rod); + if ( rod.getLocation().getStart() == context.getLocation().getStart() && ! rod.getName().equals("interval") ) { + // only consider the first element + //System.out.printf("adding it%n"); + rods.add(rod); + } + } + + nRodsHere = rods.size(); + + if ( nRodsHere > 0 ) { + if ( verbose ) { + List names = new ArrayList(); + for ( ReferenceOrderedDatum rod : rods ) { + names.add(rod.getName()); + } + + //System.out.printf("context is %s", context.getSkippedBases()); + out.printf("At %s: found %d rod(s) [%s] after skipping %d bases%n", + context.getLocation(), nRodsHere, Utils.join(",", names), context.getSkippedBases() ); + } + } + + nTotalBases = context.getSkippedBases() + 1; + } + + return new Datum(nRodsHere, context.getSkippedBases(), nTotalBases); + } + + public Pair, Long> reduceInit() { + return new Pair, Long>(new ExpandingArrayList(), 0l); + } + + private void updateCounts(ExpandingArrayList counts, long nRods, long nObs) { + if ( nRods >= 0 ) { + long prev = counts.get((int)nRods) == null ? 0l : counts.get((int)nRods); + counts.set((int)nRods, nObs + prev); + } + } + + public Pair, Long> reduce(Datum point, Pair, Long> sum) { + ExpandingArrayList counts = sum.getFirst(); + updateCounts(counts, point.nRodsAtThisLocation, 1); + updateCounts(counts, 0, point.nSkippedBases); + + Pair, Long> r = new Pair, Long>(counts, point.nTotalBases + sum.getSecond()); + + //System.out.printf("Reduce: %s %s => %s%n", point, sum, r); + return r; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java b/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java index 0f130c3c9..a152ab137 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java @@ -18,5 +18,6 @@ package org.broadinstitute.sting.gatk.walkers; public enum DataSource { READS, REFERENCE, - REFERENCE_BASES // Do I actually need the reference bases passed to the walker? + REFERENCE_BASES, // Do I actually need the reference bases passed to the walker? + REFERENCE_ORDERED_DATA } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java new file mode 100644 index 000000000..c37e063dc --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/RodWalker.java @@ -0,0 +1,14 @@ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +@By(DataSource.REFERENCE_ORDERED_DATA) +@Requires({DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA}) +@Allows({DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA}) +public abstract class RodWalker extends LocusWalker { +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewTest.java index 0157bd3d9..62cd58446 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewTest.java @@ -54,7 +54,7 @@ public class ReferenceOrderedViewTest extends BaseTest { public void testNoBindings() { Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30)); ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Collections.emptyList()); - ReferenceOrderedView view = new ReferenceOrderedView( provider ); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",10)); Assert.assertNull("The tracker should not have produced any data", tracker.lookup("tableTest",null)); @@ -72,7 +72,7 @@ public class ReferenceOrderedViewTest extends BaseTest { Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30)); ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Collections.singletonList(dataSource)); - ReferenceOrderedView view = new ReferenceOrderedView( provider ); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20)); TabularROD datum = (TabularROD)tracker.lookup("tableTest",null); @@ -98,7 +98,7 @@ public class ReferenceOrderedViewTest extends BaseTest { Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc("chrM",1,30)); ShardDataProvider provider = new ShardDataProvider(shard, null, seq, Arrays.asList(dataSource1,dataSource2)); - ReferenceOrderedView view = new ReferenceOrderedView( provider ); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20)); TabularROD datum1 = (TabularROD)tracker.lookup("tableTest1",null); diff --git a/python/Verify1KGArchiveBAMs.py b/python/Verify1KGArchiveBAMs.py index fc02bb5e6..cc996e509 100644 --- a/python/Verify1KGArchiveBAMs.py +++ b/python/Verify1KGArchiveBAMs.py @@ -23,17 +23,20 @@ class Status: def __init__(self, file, exists, size): self.file = file self.exists = exists - self.size = size + self._size = size if not exists: self.status = "missing" if size == 0: self.status = "no-data" - else: self.status = "exists: bytes=" + str(self.size) + else: self.status = "exists: bytes=" + str(self.size()) def __str__(self): - return self.status + return self.file + " " + self.status + + def size(self): + return self._size def viewSize(self): - return MergeBAMsUtils.greek(self.size) + return MergeBAMsUtils.greek(self.size()) def md5(file): @@ -50,10 +53,10 @@ class ComparedFiles: self.ftpStat = ftpStat def size(self): - if self.localStat.size <> 0: - return self.localStat.size - if self.ftpStat.size <> 0: - return self.ftpStat.size + if self.localStat.size() <> 0: + return self.localStat.size() + if self.ftpStat.size() <> 0: + return self.ftpStat.size() else: return 0 @@ -67,7 +70,7 @@ def modTimeStr(t): if t == 0: return 'N/A' else: - return time.strftime("%m/%d/%y", time.localtime(t)) + return time.strftime("%m/%d/%y", time.localtime(int(t))) def getSizeForFile(dir, filename): global CACHED_LIST @@ -141,9 +144,10 @@ def validateFile(relPath, localRoot, ftpRoot): return compared def compareFileStatus(localStat, ftpStat): + if DEBUG: print 'comparing', localStat, ftpStat if localStat.exists: if ftpStat.exists: - if localStat.size == ftpStat.size: + if localStat.size() == ftpStat.size(): status = 'in-sync' else: status = 'size-mismatch' @@ -165,6 +169,8 @@ def filesInLocalPath(root, subdir): if subdir <> None: for fullroot, dirs, files in os.walk(os.path.join(root, subdir)): for file in filter( regex.match, files ): + #if file <> "NA12761.SLX.WUGSC.Mosaik.SRP000033.2009_08.bam.bai": + # continue fullpath = os.path.join(fullroot, file) path = fullpath.split(root)[1] #print 'adding relpath=', path, 'fullpath=', fullpath @@ -266,12 +272,15 @@ if __name__ == "__main__": statusForFileListing = ['size-mismatch', 'local-file-missing'] maxFilesToList = 10 if status in statusForFileListing: - print 'SUMMARY: listing the first', maxFilesToList, 'of', n + print 'SUMMARY: listing the first', min(maxFilesToList, n), 'of', n for file in itertools.islice(filesOfStatus, maxFilesToList): - print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file) + if status == 'size-mismatch': + print 'SUMMARY: File: ftp=%d bytes local=%d bytes %12s %s' % ( file.ftpStat.size(), file.localStat.size() , modTimeStr(file.modTime()), file.file) + else: + print 'SUMMARY: File: %8s %12s %s' % ( MergeBAMsUtils.greek(file.size()), modTimeStr(file.modTime()), file.file) if n > 0: - fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 )) - mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus ))) - - print 'SUMMARY: total size %s' % ( fileSizes ) - print 'SUMMARY: last modification time %s' % ( mostRecentMod ) + fileSizes = MergeBAMsUtils.greek(reduce(operator.__add__, map( ComparedFiles.size, filesOfStatus ), 0 )) + mostRecentMod = modTimeStr(apply(max, map( ComparedFiles.modTime, filesOfStatus ) + [0])) + + print 'SUMMARY: total size %s' % ( fileSizes ) + print 'SUMMARY: last modification time %s' % ( mostRecentMod ) diff --git a/testdata/ValidatingPileupTargets.list b/testdata/ValidatingPileupTargets.list index fd07dc132..8cf777f2a 100755 --- a/testdata/ValidatingPileupTargets.list +++ b/testdata/ValidatingPileupTargets.list @@ -2,7 +2,7 @@ /humgen/gsa-scr1/GATK_Data/Validation_Data/MV1994.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/Escherichia_coli_K12_MG1655.fasta * # daughter chrom 6 deep coverage first 10 MB -/broad/1KG/DCC/ftp/pilot_data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam /broad/1KG/reference/human_b36_both.fasta 6:1-10,000,00 +/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam /broad/1KG/reference/human_b36_both.fasta 1:10,500,000-11,000,000 # daughter chrom 1 deep coverage /broad/1KG/DCC/ftp/pilot_data/NA12878/alignment/NA12878.chrom1.SLX.SRP000032.2009_06.bam /broad/1KG/reference/human_b36_both.fasta *