From 3c3cd5bb6452b30fac8a057141c9637f8af7ad33 Mon Sep 17 00:00:00 2001 From: aaron Date: Wed, 27 May 2009 18:24:31 +0000 Subject: [PATCH] Moving some of the data sharding around. A new shard category now exists, INTERVAL. This saved a lot of code that was mirroring the same approach in both the read and locus shard strategies. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@840 348d0f76-0448-11de-a6fe-93d51630548a --- .../shards/ExpGrowthLocusShardStrategy.java | 31 +++-- ...ervalReadShard.java => IntervalShard.java} | 18 +-- .../shards/LinearLocusShardStrategy.java | 31 +++-- .../shards/LocusIntervalShardStrategy.java | 66 ---------- .../shards/LocusShardStrategy.java | 5 +- .../shards/ReadIntervalShardStrategy.java | 118 ------------------ .../dataSources/shards/ReadShardStrategy.java | 29 +++-- .../sting/gatk/dataSources/shards/Shard.java | 2 +- .../shards/ShardStrategyFactory.java | 8 +- .../simpleDataSources/SAMDataSource.java | 6 +- .../sting/gatk/executive/MicroScheduler.java | 100 ++++++++------- .../sting/gatk/traversals/TraverseReads.java | 8 +- ...st.java => IntervalShardStrategyTest.java} | 37 ++++-- ...dShardTest.java => IntervalShardTest.java} | 13 +- ...java => LinearLocusShardStrategyTest.java} | 62 ++++++--- .../shards/ShardStrategyFactoryTest.java | 8 +- 16 files changed, 217 insertions(+), 325 deletions(-) rename java/src/org/broadinstitute/sting/gatk/dataSources/shards/{IntervalReadShard.java => IntervalShard.java} (81%) delete mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java rename java/test/org/broadinstitute/sting/gatk/dataSources/shards/{ReadIntervalShardStrategyTest.java => IntervalShardStrategyTest.java} (77%) rename java/test/org/broadinstitute/sting/gatk/dataSources/shards/{IntervalReadShardTest.java => IntervalShardTest.java} (85%) rename 
java/test/org/broadinstitute/sting/gatk/dataSources/shards/{LocusIntervalShardStrategyTest.java => LinearLocusShardStrategyTest.java} (56%) diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java index de8d47993..f80142ce5 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java @@ -6,20 +6,29 @@ import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.List; -/** +/* + * Copyright (c) 2009 The Broad Institute * - * User: aaron - * Date: Apr 6, 2009 - * Time: 8:23:19 PM + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShard.java similarity index 81% rename from java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java rename to java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShard.java index 21c70bb78..7530cb672 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShard.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.dataSources.shards; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLoc; @@ -31,17 +30,18 @@ import org.broadinstitute.sting.utils.GenomeLoc; /** * @author aaron - *

- * Class IntervalReadShard - *

- * This is the read shard that knowns about genomic intervals + *

+ * Class IntervalShard + *

+ * the base interval shard. All interval shards are generally the same, + * but must return their ShardType individually. */ -public class IntervalReadShard implements Shard { +public class IntervalShard implements Shard { /** a collection of genomic locations to interate over */ private GenomeLoc mSet; - IntervalReadShard(GenomeLoc myLocation) { + IntervalShard(GenomeLoc myLocation) { mSet = myLocation.clone(); } @@ -55,7 +55,7 @@ public class IntervalReadShard implements Shard { * * @return READ, indicating the shard type */ - public ShardType getShardType() { - return Shard.ShardType.READ; + public Shard.ShardType getShardType() { + return ShardType.INTERVAL; } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java index 5d91c0319..a11791439 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java @@ -6,20 +6,29 @@ import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.List; -/** +/* + * Copyright (c) 2009 The Broad Institute * - * User: aaron - * Date: Apr 6, 2009 - * Time: 7:18:19 PM + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. 
- * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java deleted file mode 100755 index c7f5a6291..000000000 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java +++ /dev/null @@ -1,66 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.shards; - -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; - -import java.util.List; - -/** - * - * User: aaron - * Date: May 14, 2009 - * Time: 3:28:50 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date May 14, 2009 - *

- * Class LocusWindowShardStrategy - *

- * This function knows how to shard on a genome loc boundry. It guarantees - * a one-to-one mapping between a GenomeLoc and shard. - */ -public class LocusIntervalShardStrategy extends LocusShardStrategy { - /** - * the constructor, taking a seq dictionary to parse out contigs - * - * @param dic the seq dictionary - * @param intervals file - */ - LocusIntervalShardStrategy(SAMSequenceDictionary dic, GenomeLocSortedSet intervals) { - super(dic, intervals); - } - - /** - * This is how the various shards strategies implements their approach, adjusting this value - * - * @return the next shard size - */ - protected long nextShardSize() { - long nextSize = this.getCurrentInterval().getStop() - this.getCurrentInterval().getStart(); - return nextSize; - } - - /** - * set the next shards size - * - * @param size adjust the next size to this - */ - public void adjustNextShardSize(long size) { - //To change body of implemented methods use File | Settings | File Templates. - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java index 084dc2946..c663da43e 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java @@ -7,7 +7,6 @@ import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.StingException; import java.util.Iterator; -import java.util.List; /** * * User: aaron @@ -159,11 +158,11 @@ public abstract class LocusShardStrategy implements ShardStrategy { if (loc.getStop() - loc.getStart() <= proposedSize) { intervals.removeRegion(loc); - return new IntervalReadShard(loc); + return new IntervalShard(loc); } else { GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStart() + proposedSize - 1); intervals.removeRegion(subLoc); - return new 
IntervalReadShard(subLoc); + return new IntervalShard(subLoc); } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java deleted file mode 100755 index 0dbde857d..000000000 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java +++ /dev/null @@ -1,118 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.shards; - -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.StingException; - -import java.util.Iterator; -import java.util.List; - -/** - * - * User: aaron - * Date: May 21, 2009 - * Time: 4:13:53 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - *

- * Class ReadByIntervalShardStrategy - *

- * Impliments the sharding strategy for reads, given a list - * of genomic locations. Shards returned will be bounded by the interval, - * but each provided interval may be split into a number of smaller regions. - */ -public class ReadIntervalShardStrategy implements ShardStrategy { - - /** our storage of the genomic locations they'd like to shard over */ - private final GenomeLocSortedSet regions; - - /** their prefered size of the shard, we can modify this based on what we see in the shards */ - private long size; - - /** the sequence dictionary we'll use to lookup the contigs */ - private final SAMSequenceDictionary dict; - - /** - * change the recommended shard size for the next shard we generate. The code will do it's - * best to respect this value, but there are no guarantees. - * - * @param size the next recommended shard size. - */ - public void adjustNextShardSize(long size) { - this.size = size; - } - - /** - * the default constructor - * - * @param dict the sequence dictionary to use - * @param size the read count to iterate over - */ - ReadIntervalShardStrategy(SAMSequenceDictionary dict, long size, GenomeLocSortedSet locations) { - if (locations == null || locations.isEmpty()) { - throw new StingException("ReadIntervalShardStrategy: genomic regions list is empty."); - } - this.regions = locations.clone(); - this.size = size; - this.dict = dict; - } - - /** - * returns true if there are additional shards - * @return false if we're done processing shards - */ - public boolean hasNext() { - return (!regions.isEmpty()); - } - - /** - * gets the next Shard - * @return the next shard - */ - public Shard next() { - if ((this.regions == null) || (regions.isEmpty())) { - throw new StingException("ReadIntervalShardStrategy: genomic regions list is empty in next() function."); - } - - // get the first region in the list - GenomeLoc loc = regions.iterator().next(); - - if (loc.getStop() - loc.getStart() <= this.size) { - regions.removeRegion(loc); - return new 
IntervalReadShard(loc); - } else { - GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1); - regions.removeRegion(subLoc); - return new IntervalReadShard(subLoc); - } - - } - - /** - * we don't support the remove command - */ - public void remove() { - throw new UnsupportedOperationException("ShardStrategies don't support remove()"); - } - - /** - * makes the ReadIntervalShard iterable, i.e. usable in a for loop. - * @return - */ - public Iterator iterator() { - return this; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java index 3e4ebc62e..ffa2f2685 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java @@ -4,16 +4,29 @@ import net.sf.samtools.SAMSequenceDictionary; import java.util.Iterator; -/** +/* + * Copyright (c) 2009 The Broad Institute * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
+ * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ @@ -60,7 +73,7 @@ public class ReadShardStrategy implements ShardStrategy { } public Shard next() { - return new ReadShard((int)readCount, this); + return new ReadShard((int)readCount, this); } public void remove() { diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java index 0c94b6e37..bed3ffae1 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java @@ -30,7 +30,7 @@ import java.io.Serializable; */ public interface Shard extends Serializable { enum ShardType { - READ, LOCUS + READ, LOCUS, INTERVAL } /** @return the genome location represented by this shard */ diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java index 3677bcb75..f9b4b3364 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java @@ -2,12 +2,9 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; -import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import java.util.List; - /** * * User: aaron @@ -81,10 +78,9 @@ public class ShardStrategyFactory { return new LinearLocusShardStrategy(dic, startingSize, lst); case EXPONENTIAL: return new ExpGrowthLocusShardStrategy(dic, startingSize, lst); - case READS: - return new ReadIntervalShardStrategy(dic, startingSize, lst); case INTERVAL: - return new LocusIntervalShardStrategy(dic, lst); + case READS: + return new IntervalShardStrategy(startingSize, lst); default: throw new StingException("Strategy: " + strat + " isn't implemented"); } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index cb9db9df2..5571028a3 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -148,14 +148,16 @@ public class SAMDataSource implements SimpleDataSource { reads.getDownsamplingFraction(), reads.getMaxOnTheFlySorts(), reads.getSafetyChecking()); - } else if (shard.getShardType() == Shard.ShardType.LOCUS) { + } else if (shard.getShardType() == Shard.ShardType.LOCUS || + shard.getShardType() == Shard.ShardType.INTERVAL) { iterator = seekLocus(shard.getGenomeLoc()); iterator = TraversalEngine.applyDecoratingIterators(false, iterator, reads.getDownsamplingFraction(), reads.getMaxOnTheFlySorts(), reads.getSafetyChecking()); - } else { + } + else { throw new StingException("seek: Unknown shard type"); } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 90bd909a3..c85bff529 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java 
+++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -38,11 +38,9 @@ import java.util.ArrayList; * To change this template use File | Settings | File Templates. */ -/** - * Shards and schedules data in manageable chunks. - */ +/** Shards and schedules data in manageable chunks. */ public abstract class MicroScheduler { - private static long SHARD_SIZE = 100000L; + private static long SHARD_SIZE = 100000L; protected static Logger logger = Logger.getLogger(MicroScheduler.class); @@ -59,14 +57,13 @@ public abstract class MicroScheduler { * @param nThreadsToUse Number of threads to utilize. * @return The best-fit microscheduler. */ - public static MicroScheduler create( Walker walker, Reads reads, File ref, List> rods, int nThreadsToUse ) { - if( walker instanceof TreeReducible && nThreadsToUse > 1 ) { + public static MicroScheduler create(Walker walker, Reads reads, File ref, List> rods, int nThreadsToUse) { + if (walker instanceof TreeReducible && nThreadsToUse > 1) { logger.info("Creating hierarchical microscheduler"); - return new HierarchicalMicroScheduler( walker, reads, ref, rods, nThreadsToUse ); - } - else { + return new HierarchicalMicroScheduler(walker, reads, ref, rods, nThreadsToUse); + } else { logger.info("Creating linear microscheduler"); - return new LinearMicroScheduler( walker, reads, ref, rods ); + return new LinearMicroScheduler(walker, reads, ref, rods); } } @@ -75,16 +72,16 @@ public abstract class MicroScheduler { * @param reads The reads. * @param refFile File pointer to the reference. 
*/ - protected MicroScheduler( Walker walker, Reads reads, File refFile, List> rods ) { + protected MicroScheduler(Walker walker, Reads reads, File refFile, List> rods) { if (walker instanceof ReadWalker) { traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods); } else { traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods); } - this.reads = getReadsDataSource( reads ); - this.reference = openReferenceSequenceFile( refFile ); - this.rods = getReferenceOrderedDataSources( rods ); + this.reads = getReadsDataSource(reads); + this.reference = openReferenceSequenceFile(refFile); + this.rods = getReferenceOrderedDataSources(rods); } /** @@ -102,7 +99,7 @@ public abstract class MicroScheduler { * @param intervals A list of intervals over which to walk. Null for whole dataset. * @return the return type of the walker */ - public abstract Object execute( Walker walker, GenomeLocSortedSet intervals); + public abstract Object execute(Walker walker, GenomeLocSortedSet intervals); /** * Get the sharding strategy given a driving data source. @@ -111,35 +108,42 @@ public abstract class MicroScheduler { * @param intervals Intervals to use when limiting sharding. * @return Sharding strategy for this driving data source. */ - protected ShardStrategy getShardStrategy( Walker walker, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals ) { + protected ShardStrategy getShardStrategy(Walker walker, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { ShardStrategy shardStrategy = null; + ShardStrategyFactory.SHATTER_STRATEGY shardType; + if (walker instanceof LocusWalker) { + if (intervals != null) { + shardType = (walker.isReduceByInterval()) ? + ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : + ShardStrategyFactory.SHATTER_STRATEGY.LINEAR; - if( walker instanceof LocusWalker ) { - if( intervals != null ) { - ShardStrategyFactory.SHATTER_STRATEGY shardType = (walker.isReduceByInterval()) ? 
- ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : - ShardStrategyFactory.SHATTER_STRATEGY.LINEAR; + shardStrategy = ShardStrategyFactory.shatter(shardType, + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE, + intervals); + } else + shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE); - shardStrategy = ShardStrategyFactory.shatter( shardType, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, - intervals ); + } else if (walker instanceof ReadWalker) { + + shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS; + + if (intervals != null) { + shardStrategy = ShardStrategyFactory.shatter(shardType, + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE, + intervals); + } else { + shardStrategy = ShardStrategyFactory.shatter(shardType, + drivingDataSource.getSequenceDictionary(), + SHARD_SIZE); } - else - shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE ); - - } - else if( walker instanceof ReadWalker ) { - shardStrategy = ShardStrategyFactory.shatter( ShardStrategyFactory.SHATTER_STRATEGY.READS, - drivingDataSource.getSequenceDictionary(), - SHARD_SIZE ); - } - else + } else throw new StingException("Unable to support walker of type" + walker.getClass().getName()); - return shardStrategy; + return shardStrategy; } /** @@ -147,20 +151,20 @@ public abstract class MicroScheduler { * @param shard The section of data to view. * @return An accessor for all the data in this shard. */ - protected ShardDataProvider getShardDataProvider( Shard shard ) { - return new ShardDataProvider( shard, reads, reference, rods ); + protected ShardDataProvider getShardDataProvider(Shard shard) { + return new ShardDataProvider(shard, reads, reference, rods); } /** * Gets a data source for the given set of reads. * @return A data source for the given set of reads. 
*/ - private SAMDataSource getReadsDataSource( Reads reads ) { + private SAMDataSource getReadsDataSource(Reads reads) { // By reference traversals are happy with no reads. Make sure that case is handled. - if( reads.getReadsFiles().size() == 0 ) + if (reads.getReadsFiles().size() == 0) return null; - SAMDataSource dataSource = new SAMDataSource( reads ); + SAMDataSource dataSource = new SAMDataSource(reads); // Side effect: initialize the traversal engine with reads data. // TODO: Give users a dedicated way of getting the header so that the MicroScheduler @@ -174,10 +178,10 @@ public abstract class MicroScheduler { * Open the reference-ordered data sources. * @return A list of reference-ordered data sources. */ - private List getReferenceOrderedDataSources( List> rods) { + private List getReferenceOrderedDataSources(List> rods) { List dataSources = new ArrayList(); - for( ReferenceOrderedData rod: rods ) - dataSources.add( new ReferenceOrderedDataSource(rod) ); + for (ReferenceOrderedData rod : rods) + dataSources.add(new ReferenceOrderedDataSource(rod)); return dataSources; } @@ -186,12 +190,12 @@ public abstract class MicroScheduler { * @param refFile Handle to a reference sequence file. Non-null. * @return A thread-safe file wrapper. 
*/ - private IndexedFastaSequenceFile openReferenceSequenceFile( File refFile ) { + private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) { IndexedFastaSequenceFile ref = null; try { ref = new IndexedFastaSequenceFile(refFile); } - catch( FileNotFoundException ex ) { + catch (FileNotFoundException ex) { throw new RuntimeException("File not found opening fasta file; please do this check before MicroManaging", ex); } GenomeLoc.setupRefContigOrdering(ref); diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index 7f7c842bb..4646de4d9 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.dataSources.providers.ReadView; import org.broadinstitute.sting.gatk.dataSources.providers.ReadReferenceView; import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.gatk.dataSources.shards.IntervalShard; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -79,7 +80,12 @@ public class TraverseReads extends TraversalEngine { ShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard) shard).getSize())); + if (shard instanceof ReadShard) { + logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard) shard).getSize())); + } else if (shard instanceof IntervalShard) { + logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((IntervalShard) shard).getGenomeLoc())); + } + if (!(walker instanceof ReadWalker)) throw new IllegalArgumentException("Walker 
isn't a read walker!"); diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java similarity index 77% rename from java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java rename to java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java index 0b6351ba7..4330ea46a 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java @@ -44,7 +44,7 @@ import net.sf.samtools.SAMFileHeader; *

* Tests the ReadIntervalShardStrategy class */ -public class ReadIntervalShardStrategyTest extends BaseTest { +public class IntervalShardStrategyTest extends BaseTest { private GenomeLocSortedSet mSortedSet = null; private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); @@ -60,19 +60,21 @@ public class ReadIntervalShardStrategyTest extends BaseTest { @Test(expected = StingException.class) public void testExceptionOnEmpty() { - ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + IntervalShardStrategy strat = new IntervalShardStrategy(100, mSortedSet); } @Test public void testSingleChromosomeFunctionality() { GenomeLoc loc = new GenomeLoc(1, 1, 1000); mSortedSet.add(loc); - ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + IntervalShardStrategy strat = new IntervalShardStrategy(100, mSortedSet); int counter = 0; + Shard d = null; while (strat.hasNext()) { - Shard d = strat.next(); + d = strat.next(); counter++; } + assertTrue(d instanceof IntervalShard); assertEquals(10, counter); } @@ -82,12 +84,14 @@ public class ReadIntervalShardStrategyTest extends BaseTest { GenomeLoc loc = new GenomeLoc(x, 1, 1000); mSortedSet.add(loc); } - ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + IntervalShardStrategy strat = new IntervalShardStrategy(100, mSortedSet); int counter = 0; + Shard d = null; while (strat.hasNext()) { - Shard d = strat.next(); + d = strat.next(); counter++; } + assertTrue(d instanceof IntervalShard); assertEquals(50, counter); } @@ -97,7 +101,7 @@ public class ReadIntervalShardStrategyTest extends BaseTest { GenomeLoc loc = new GenomeLoc(x, 1, 1000); mSortedSet.add(loc); } - ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 789, 
mSortedSet); + IntervalShardStrategy strat = new IntervalShardStrategy(789, mSortedSet); int counter = 0; while (strat.hasNext()) { Shard d = strat.next(); @@ -113,11 +117,28 @@ public class ReadIntervalShardStrategyTest extends BaseTest { assertEquals(10, counter); } + + @Test + public void testInfiniteShardSize() { + for (int x = 0; x < 5; x++) { + GenomeLoc loc = new GenomeLoc(x, 1, 1000); + mSortedSet.add(loc); + } + IntervalShardStrategy strat = new IntervalShardStrategy(Long.MAX_VALUE, mSortedSet); + int counter = 0; + while (strat.hasNext()) { + Shard d = strat.next(); + assertEquals(1000, d.getGenomeLoc().getStop()); + counter++; + } + assertEquals(5, counter); + } + @Test(expected = UnsupportedOperationException.class) public void testRemove() { GenomeLoc loc = new GenomeLoc(1, 1, 1000); mSortedSet.add(loc); - ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + IntervalShardStrategy strat = new IntervalShardStrategy(100, mSortedSet); strat.remove(); } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardTest.java similarity index 85% rename from java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java rename to java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardTest.java index ed17fd4b9..d727661b7 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; import org.junit.Before; import org.junit.Test; @@ -42,9 +41,9 @@ 
import net.sf.samtools.SAMFileHeader; *

* Tests for the IntervalReadShard class. */ -public class IntervalReadShardTest extends BaseTest { +public class IntervalShardTest extends BaseTest { - private IntervalReadShard shard = null; + private IntervalShard intervalShard = null; private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); private static final int NUMBER_OF_CHROMOSOMES = 5; private static final int STARTING_CHROMOSOME = 1; @@ -59,15 +58,15 @@ public class IntervalReadShardTest extends BaseTest { @Test public void simpleReturn() { GenomeLoc loc = new GenomeLoc(1, 1, 100); - shard = new IntervalReadShard(loc); - assertTrue(shard.getGenomeLoc().equals(loc)); + intervalShard = new IntervalShard(loc); + assertTrue(intervalShard.getGenomeLoc().equals(loc)); } @Test public void ensureNotReference() { GenomeLoc loc = new GenomeLoc(1, 1, 100); - shard = new IntervalReadShard(loc); - assertTrue(shard.getGenomeLoc() != loc && shard.getGenomeLoc().equals(loc)); + intervalShard = new IntervalShard(loc); + assertTrue(intervalShard.getGenomeLoc() != loc && intervalShard.getGenomeLoc().equals(loc)); } } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategyTest.java similarity index 56% rename from java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java rename to java/test/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategyTest.java index 69c55d46e..00b45e731 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategyTest.java @@ -8,6 +8,7 @@ import org.junit.Before; import org.junit.Test; import static org.junit.Assert.assertTrue; import net.sf.samtools.SAMFileHeader; +import 
net.sf.samtools.SAMSequenceDictionary; /* @@ -38,42 +39,65 @@ import net.sf.samtools.SAMFileHeader; /** * @author aaron *

- * Class LocusIntervalShardStrategyTest + * Class LinearLocusShardStrategyTest *

- * Tests the LocusIntervalShardStrategy class. + * Test for the Locus Shard Strategy */ -public class LocusIntervalShardStrategyTest extends BaseTest { +public class LinearLocusShardStrategyTest extends BaseTest { + private GenomeLocSortedSet mSortedSet = null; private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); private static final int NUMBER_OF_CHROMOSOMES = 5; private static final int STARTING_CHROMOSOME = 1; private static final int CHROMOSOME_SIZE = 1000; - private LocusIntervalShardStrategy strat = null; @Before public void setup() { GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary()); - mSortedSet = new GenomeLocSortedSet(); } @Test - public void testOneToOneness() { - for (int x = 0; x < 100; x++) { - GenomeLoc loc = new GenomeLoc(0,(x*10)+1, (x*10)+8); - mSortedSet.add(loc); - } - strat = new LocusIntervalShardStrategy(header.getSequenceDictionary(),mSortedSet); + public void testSetup() { + LinearLocusShardStrategy strat = new LinearLocusShardStrategy(header.getSequenceDictionary(), 500); int counter = 0; - while (strat.hasNext()) { + while(strat.hasNext()) { + Shard d = strat.next(); + assertTrue(d instanceof LocusShard); + assertTrue(d.getGenomeLoc().getStop() - d.getGenomeLoc().getStart() == 499); ++counter; - GenomeLoc loc = strat.next().getGenomeLoc(); - long stop = loc.getStop(); - long start = loc.getStart(); - long length = stop - start; - assertTrue(length == 7); } - assertTrue(counter == 100); - + assertTrue(counter == 10); } + @Test + public void testAdjustSize() { + LinearLocusShardStrategy strat = new LinearLocusShardStrategy(header.getSequenceDictionary(), 500); + strat.adjustNextShardSize(1000); + int counter = 0; + while(strat.hasNext()) { + Shard d = strat.next(); + assertTrue(d instanceof LocusShard); + assertTrue(d.getGenomeLoc().getStop() - d.getGenomeLoc().getStart() == 999); + ++counter; + } + assertTrue(counter == 5); + } + + + @Test 
+ public void testUnevenSplit() { + LinearLocusShardStrategy strat = new LinearLocusShardStrategy(header.getSequenceDictionary(), 600); + int counter = 0; + while(strat.hasNext()) { + Shard d = strat.next(); + assertTrue(d instanceof LocusShard); + if (counter % 2 == 0) { + assertTrue(d.getGenomeLoc().getStop() - d.getGenomeLoc().getStart() == 599); + } else { + assertTrue(d.getGenomeLoc().getStop() - d.getGenomeLoc().getStart() == 399); + } + ++counter; + } + assertTrue(counter == 10); + } } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java index af1eccd5c..8141a4660 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java @@ -2,20 +2,14 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import static junit.framework.Assert.assertEquals; import static junit.framework.Assert.fail; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; -import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import org.junit.*; import static org.junit.Assert.assertTrue; -import java.io.File; -import java.util.ArrayList; - /** * * User: aaron @@ -63,7 +57,7 @@ public class ShardStrategyFactoryTest extends BaseTest { GenomeLoc l = new GenomeLoc(0,1,100); set.add(l); ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set); - assertTrue(st instanceof ReadIntervalShardStrategy); + assertTrue(st instanceof IntervalShardStrategy); } @Test