From d994544c47bdab6d29178ce912d1901f8cdb10d8 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 26 May 2009 20:57:46 +0000 Subject: [PATCH] Added back end code support for Sharding based on genomic location for reads. Changed the sharding code to take GenomeLocSortedSet instead of a list, and added a bunch of much simplier and cleaner test cases. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@816 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 8 +- .../shards/ExpGrowthLocusShardStrategy.java | 3 +- .../dataSources/shards/IntervalReadShard.java | 61 +++++++ .../shards/LinearLocusShardStrategy.java | 3 +- ...y.java => LocusIntervalShardStrategy.java} | 9 +- .../gatk/dataSources/shards/LocusShard.java | 2 +- .../shards/LocusShardStrategy.java | 72 +++----- .../shards/ReadIntervalShardStrategy.java | 118 ++++++++++++ .../gatk/dataSources/shards/ReadShard.java | 22 ++- .../dataSources/shards/ReadShardStrategy.java | 11 +- .../shards/ShardStrategyFactory.java | 37 +--- .../ReferenceDataSource.java | 73 -------- .../simpleDataSources/SAMDataSource.java | 128 ++++++------- .../executive/HierarchicalMicroScheduler.java | 3 +- .../gatk/executive/LinearMicroScheduler.java | 3 +- .../sting/gatk/executive/MicroScheduler.java | 5 +- .../broadinstitute/sting/utils/GenomeLoc.java | 2 +- .../sting/utils/GenomeLocSortedSet.java | 83 ++++++--- .../shards/IntervalReadShardTest.java | 73 ++++++++ .../shards/IntervalShardStrategyTest.java | 142 --------------- .../LocusIntervalShardStrategyTest.java | 79 ++++++++ .../shards/ReadIntervalShardStrategyTest.java | 124 +++++++++++++ .../shards/ShardStrategyFactoryTest.java | 169 +++++------------- .../simpleDataSources/SAMByReadsTest.java | 2 +- .../sting/utils/GenomeLocSortedSetTest.java | 27 ++- 25 files changed, 732 insertions(+), 527 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java rename 
java/src/org/broadinstitute/sting/gatk/dataSources/shards/{IntervalShardStrategy.java => LocusIntervalShardStrategy.java} (84%) create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java create mode 100755 java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java delete mode 100755 java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java create mode 100755 java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java create mode 100755 java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 3b07fcc07..9ba375483 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.cmdLine.ArgumentException; import java.util.ArrayList; @@ -110,7 +111,10 @@ public class GenomeAnalysisEngine { genericEngineSetup(strictness); // parse out any genomic location they've provided - List locs = setupIntervalRegion(); + List locationsList = setupIntervalRegion(); + GenomeLocSortedSet locs = null; + if (locationsList != null) + locs = GenomeLocSortedSet.createSetFromList(locationsList); // excute the microscheduler microScheduler.execute(my_walker, locs); @@ -192,7 +196,7 @@ public class GenomeAnalysisEngine { 
engine.setMaxReads(Integer.parseInt(argCollection.maximumReads)); - // we default interval files over the genome region strin + // we default interval files over the genome region string if (argCollection.intervals != null) { engine.setLocation(setupIntervalRegion()); } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java index 441f7ae80..de8d47993 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.List; @@ -66,7 +67,7 @@ public class ExpGrowthLocusShardStrategy extends LocusShardStrategy { * @param startSize the starting size of the shard * @param lst locations to iterate from */ - ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List lst) { + ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, GenomeLocSortedSet lst) { super(dic, lst); this.baseSize = startSize; this.currentExp = 0; diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java new file mode 100755 index 000000000..21c70bb78 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShard.java @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.GenomeLoc; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to 
any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class IntervalReadShard + *

+ * This is the read shard that knows about genomic intervals + */ +public class IntervalReadShard implements Shard { + + /** a collection of genomic locations to iterate over */ + private GenomeLoc mSet; + + IntervalReadShard(GenomeLoc myLocation) { + mSet = myLocation.clone(); + } + + /** @return the genome location represented by this shard */ + public GenomeLoc getGenomeLoc() { + return mSet; + } + + /** + * returns the type of shard, READ + * + * @return READ, indicating the shard type + */ + public ShardType getShardType() { + return Shard.ShardType.READ; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java index 5c4dece8d..5d91c0319 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.List; @@ -63,7 +64,7 @@ class LinearLocusShardStrategy extends LocusShardStrategy { * @param startSize the starting size of the shard * @param lst locations to iterate from */ - LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List lst) { + LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, GenomeLocSortedSet lst) { super(dic, lst); this.nextShardSize = startSize; } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java similarity index 84% rename from java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java rename to 
java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java index 70838af86..c7f5a6291 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategy.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.List; @@ -29,17 +30,17 @@ import java.util.List; *

* Class LocusWindowShardStrategy *

- * This function knows how to shard on a genome loc boundry. It guarantee's - * a one-to-one mapping between a GenomeLoc and hte + * This function knows how to shard on a genome loc boundry. It guarantees + * a one-to-one mapping between a GenomeLoc and shard. */ -public class IntervalShardStrategy extends LocusShardStrategy { +public class LocusIntervalShardStrategy extends LocusShardStrategy { /** * the constructor, taking a seq dictionary to parse out contigs * * @param dic the seq dictionary * @param intervals file */ - IntervalShardStrategy(SAMSequenceDictionary dic, List intervals) { + LocusIntervalShardStrategy(SAMSequenceDictionary dic, GenomeLocSortedSet intervals) { super(dic, intervals); } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShard.java index 011840ba5..af0143810 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShard.java @@ -26,7 +26,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; *

* Class Shard *

- * This is the base class for shards. Right now it does little more then + * This is the base class for locus shards. Right now it does little more then * wrap GenomeLoc (actually nothing more), but it's good to have the class * in place so it's easier to change guts later. */ diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java index 5907fd7f4..084dc2946 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.StingException; import java.util.Iterator; import java.util.List; @@ -25,11 +27,6 @@ import java.util.List; /** * @author aaron * @version 1.0 - * @date Apr 6, 2009 - *

- * Interface Shard - *

- * The shard interface, which controls how data is divided for loci */ public abstract class LocusShardStrategy implements ShardStrategy { @@ -50,10 +47,7 @@ public abstract class LocusShardStrategy implements ShardStrategy { private boolean nextContig = false; /** our interal list * */ - private List intervals = null; - - /** our interal list * */ - private int currentInterval = -1; + private GenomeLocSortedSet intervals = null; /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(LocusShardStrategy.class); @@ -92,15 +86,15 @@ public abstract class LocusShardStrategy implements ShardStrategy { * @param dic the seq dictionary * @param intervals file */ - LocusShardStrategy(SAMSequenceDictionary dic, List intervals) { + LocusShardStrategy(SAMSequenceDictionary dic, GenomeLocSortedSet intervals) { this.dic = dic; - this.intervals = intervals; - this.currentInterval = 0; + this.intervals = intervals.clone(); // set the starting point to the beginning interval if (intervals.size() < 1) { throw new IllegalArgumentException("Interval files must contain at least one interval"); } - mLoc = new GenomeLoc(intervals.get(0).getContig(),intervals.get(0).getStart()-1,intervals.get(0).getStart()-1); + GenomeLoc loc = intervals.iterator().next(); + mLoc = new GenomeLoc(loc.getContig(), loc.getStart() - 1, loc.getStart() - 1); if (dic.getSequences().size() > 0) { nextContig = true; } @@ -139,11 +133,11 @@ public abstract class LocusShardStrategy implements ShardStrategy { long proposedSize = nextShardSize(); long nextStart = mLoc.getStop() + 1; - // if we don't have an interval file, use the non interval based approach. Simple, eh? + // if we don't have an interval set, use the non interval based approach. Simple, eh? 
if (this.intervals == null) { return nonIntervaledNext(length, proposedSize, nextStart); } else { - return intervaledNext(proposedSize, nextStart); + return intervaledNext(proposedSize); } } @@ -152,36 +146,24 @@ public abstract class LocusShardStrategy implements ShardStrategy { * Interval based next processing * * @param proposedSize the proposed size - * @param nextStart where we start from + * * @return the shard that represents this data */ - private Shard intervaledNext(long proposedSize, long nextStart) { - // get the current genome location - GenomeLoc loc = intervals.get(currentInterval); - if (nextStart + proposedSize >= loc.getStop()) { - // we need to get the rest of the current loc in a shard (return it), and move to the next location - proposedSize = loc.getStop() - nextStart; - lastGenomeLocSize = proposedSize; + private Shard intervaledNext(long proposedSize) { + if ((this.intervals == null) || (intervals.isEmpty())) { + throw new StingException("LocusShardStrategy: genomic regions list is empty in next() function."); + } - // the next sequence should start at the begining of the next contig - Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize)); - - ++currentInterval; - if (intervals.size() > currentInterval) { - mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), intervals.get(currentInterval).getStart() - 1, intervals.get(currentInterval).getStart() - 1); - } - return ret;// return + // get the first region in the list + GenomeLoc loc = intervals.iterator().next(); + if (loc.getStop() - loc.getStart() <= proposedSize) { + intervals.removeRegion(loc); + return new IntervalReadShard(loc); } else { - // we need to move the next interval - lastGenomeLocSize = proposedSize; - - // the next sequence should start at the begining of the next contig - Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, 
nextStart + proposedSize - 1)); - - mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1); - - return ret;// return + GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStart() + proposedSize - 1); + intervals.removeRegion(subLoc); + return new IntervalReadShard(subLoc); } } @@ -191,6 +173,7 @@ public abstract class LocusShardStrategy implements ShardStrategy { * @param length the length of the contig * @param proposedSize the proposed size * @param nextStart the next start location + * * @return the shard to return to the user */ private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) { @@ -241,11 +224,11 @@ public abstract class LocusShardStrategy implements ShardStrategy { * @return */ public boolean hasNext() { - // if we don't have an interval file, use the non interval based approach. Simple, eh? + // if we don't have an interval file, use the non interval based approach. if (this.intervals == null) { return nextContig; } else { - return (this.currentInterval < this.intervals.size()); + return (this.intervals.size() > 0); } } @@ -267,13 +250,14 @@ public abstract class LocusShardStrategy implements ShardStrategy { /** * this allows a shard strategy to get the current interval. It's kind of a hack, but for the * locusWindowShardStrategy it was the best approach. 
+ * * @return */ protected GenomeLoc getCurrentInterval() { - if (this.intervals == null || currentInterval < 0) { + if (this.intervals == null || intervals.size() < 1) { return null; } - return intervals.get(currentInterval); + return intervals.iterator().next(); } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java new file mode 100755 index 000000000..0dbde857d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategy.java @@ -0,0 +1,118 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.StingException; + +import java.util.Iterator; +import java.util.List; + +/** + * + * User: aaron + * Date: May 21, 2009 + * Time: 4:13:53 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + *

+ * Class ReadByIntervalShardStrategy + *

+ * Implements the sharding strategy for reads, given a list + * of genomic locations. Shards returned will be bounded by the interval, + * but each provided interval may be split into a number of smaller regions. + */ +public class ReadIntervalShardStrategy implements ShardStrategy { + + /** our storage of the genomic locations they'd like to shard over */ + private final GenomeLocSortedSet regions; + + /** their preferred size of the shard, we can modify this based on what we see in the shards */ + private long size; + + /** the sequence dictionary we'll use to lookup the contigs */ + private final SAMSequenceDictionary dict; + + /** + * change the recommended shard size for the next shard we generate. The code will do its + * best to respect this value, but there are no guarantees. + * + * @param size the next recommended shard size. + */ + public void adjustNextShardSize(long size) { + this.size = size; + } + + /** + * the default constructor + * + * @param dict the sequence dictionary to use + * @param size the read count to iterate over + */ + ReadIntervalShardStrategy(SAMSequenceDictionary dict, long size, GenomeLocSortedSet locations) { + if (locations == null || locations.isEmpty()) { + throw new StingException("ReadIntervalShardStrategy: genomic regions list is empty."); + } + this.regions = locations.clone(); + this.size = size; + this.dict = dict; + } + + /** + * returns true if there are additional shards + * @return false if we're done processing shards + */ + public boolean hasNext() { + return (!regions.isEmpty()); + } + + /** + * gets the next Shard + * @return the next shard + */ + public Shard next() { + if ((this.regions == null) || (regions.isEmpty())) { + throw new StingException("ReadIntervalShardStrategy: genomic regions list is empty in next() function."); + } + + // get the first region in the list + GenomeLoc loc = regions.iterator().next(); + + if (loc.getStop() - loc.getStart() <= this.size) { + regions.removeRegion(loc); + return new 
IntervalReadShard(loc); + } else { + GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1); + regions.removeRegion(subLoc); + return new IntervalReadShard(subLoc); + } + + } + + /** + * we don't support the remove command + */ + public void remove() { + throw new UnsupportedOperationException("ShardStrategies don't support remove()"); + } + + /** + * makes the ReadIntervalShard iterable, i.e. usable in a for loop. + * @return + */ + public Iterator iterator() { + return this; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java index 9d08081f5..677f00d33 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java @@ -21,19 +21,20 @@ import org.broadinstitute.sting.utils.GenomeLoc; /** * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *

- * Class ReadShard - *

- * A class for sharded reads. + *

+ * ReadShard + *

+ * the base class for read shards. */ public class ReadShard implements Shard { // the count of the reads we want to copy off private int size = 0; - // this is going to get gross + /** + * our tie in for the shard strategy. This allows us to signal to the shard + * strategy that we've finished process, so it can indicate that we're out of reads + */ private final ReadShardStrategy str; // the reference back to our read shard strategy @@ -63,7 +64,7 @@ public class ReadShard implements Shard { /** @return the genome location represented by this shard */ public GenomeLoc getGenomeLoc() { - throw new UnsupportedOperationException("Reads based sharding isn't genome loc aware"); + throw new UnsupportedOperationException("ReadShard isn't genome loc aware"); } /** @return the genome location represented by this shard */ @@ -71,7 +72,10 @@ public class ReadShard implements Shard { return size; } - + /** + * this method is used as a backend, to signal to the sharding strategy that we've + * finished processing. When we move to a more read-aware bam system this method could disappear. + */ public void signalDone() { strat.signalDone(); } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java index 2343a320b..3e4ebc62e 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShardStrategy.java @@ -5,10 +5,6 @@ import net.sf.samtools.SAMSequenceDictionary; import java.util.Iterator; /** - * - * User: aaron - * Date: Apr 14, 2009 - * Time: 1:34:28 PM * * The Broad Institute * SOFTWARE COPYRIGHT NOTICE AGREEMENT @@ -28,7 +24,8 @@ import java.util.Iterator; *

* Class ReadShardStrategy *

- * A descriptions should go here. Blame aaron if it's missing. + * The sharding strategy for reads using a simple counting mechanism. Each read shard + * has a specific number of reads (default to 100K) which is configured in the constructor. */ public class ReadShardStrategy implements ShardStrategy { @@ -46,7 +43,7 @@ public class ReadShardStrategy implements ShardStrategy { /** * the default constructor - * @param dic the dictionary + * @param dic the sequence dictionary to use * @param size the read count to iterate over */ ReadShardStrategy(SAMSequenceDictionary dic, long size) { @@ -63,7 +60,7 @@ public class ReadShardStrategy implements ShardStrategy { } public Shard next() { - return new ReadShard((int)readCount, this); + return new ReadShard((int)readCount, this); } public void remove() { diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java index c4d1b63c6..3677bcb75 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.List; @@ -65,25 +66,6 @@ public class ShardStrategyFactory { } - /** - * convert between types - * - * @param strat the strategy - * @param convertFrom convert from this strategy - * @return - */ - static public ShardStrategy transitionToShardStrategy(SHATTER_STRATEGY strat, LocusShardStrategy convertFrom) { - switch (strat) { - case LINEAR: - return new LinearLocusShardStrategy(convertFrom); - case EXPONENTIAL: - return new ExpGrowthLocusShardStrategy(convertFrom); - default: - throw new StingException("Strategy: 
" + strat + " isn't implemented"); - - } - } - /** * get a new shatter strategy @@ -93,31 +75,20 @@ public class ShardStrategyFactory { * @param startingSize the starting size * @return */ - static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List lst) { + static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst) { switch (strat) { case LINEAR: return new LinearLocusShardStrategy(dic, startingSize, lst); case EXPONENTIAL: return new ExpGrowthLocusShardStrategy(dic, startingSize, lst); case READS: - // return new ReadShardStrategy(dic, startingSize); - throw new StingException("Strategy: " + strat + " isn't implemented for intervals"); + return new ReadIntervalShardStrategy(dic, startingSize, lst); case INTERVAL: - return new IntervalShardStrategy(dic, lst); + return new LocusIntervalShardStrategy(dic, lst); default: throw new StingException("Strategy: " + strat + " isn't implemented"); } } - /** - * setup a reads shattering strategy - * - * @param readCount the number of reads to include in each shard - * @return - */ - static public ShardStrategy shatterByReadCount(SAMSequenceDictionary dic, long readCount) { - return new ReadShardStrategy(dic, readCount); - } - } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java deleted file mode 100644 index 8a92c3da3..000000000 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java +++ /dev/null @@ -1,73 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; - -import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.iterators.BoundedReferenceIterator; -import org.broadinstitute.sting.utils.StingException; -import 
org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; - -import java.io.File; -import java.io.FileNotFoundException; - -/** - * - * User: aaron - * Date: Apr 6, 2009 - * Time: 3:55:21 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 6, 2009 - *

- * Class ReferenceDataSource - *

- * A descriptions should go here. Blame aaron if it's missing. - */ -public class ReferenceDataSource implements SimpleDataSource { - - final protected IndexedFastaSequenceFile refFile; - - /** - * Query the data source for a region of interest, specified by the genome location. - * The iterator will generate successive calls - * - * @param shard the genome location to extract data for - * @return an iterator of the appropriate type, that is limited by the region - */ - public BoundedReferenceIterator seek(Shard shard) { - if (shard.getShardType() == Shard.ShardType.LOCUS) { - BoundedReferenceIterator ret = new BoundedReferenceIterator(refFile, shard.getGenomeLoc()); - return ret; - } else { - throw new StingException("ReferenceDataSource can only take LocusShards"); - } - - } - - public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException { - if (refFileName == null) { - throw new SimpleDataSourceLoadException("ReferenceDataSource: refFileName passed in is null"); - } - File infile = new File(refFileName); - if (!infile.canRead()) { - throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName); - } - try { - refFile = new IndexedFastaSequenceFile(new File(refFileName)); - } - catch( FileNotFoundException ex ) { - throw new SimpleDataSourceLoadException( "Unable to find reference file", ex ); - } - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index 0d3808781..cb9db9df2 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -35,9 +35,7 @@ import java.util.List; * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. 
*/ public class SAMDataSource implements SimpleDataSource { - /** - * Backing support for reads. - */ + /** Backing support for reads. */ private Reads reads = null; /** our SAM data files */ @@ -52,9 +50,7 @@ public class SAMDataSource implements SimpleDataSource { // our list of readers private final List samFileList = new ArrayList(); - /** - * SAM header file. - */ + /** SAM header file. */ private final SAMFileHeader header; // used for the reads case, the last count of reads retrieved @@ -90,14 +86,14 @@ public class SAMDataSource implements SimpleDataSource { } - header = createHeaderMerger().getMergedHeader(); + header = createHeaderMerger().getMergedHeader(); } /** - * Load up a sam file. + * Load a SAM/BAM, given an input file. * * @param samFile the file name - * @return a SAMFileReader for the file + * @return a SAMFileReader for the file, null if we're attempting to read a list */ private SAMFileReader initializeSAMFile(final File samFile) { if (samFile.toString().endsWith(".list")) { @@ -115,7 +111,7 @@ public class SAMDataSource implements SimpleDataSource { /** *

- * seek + * seekLocus *

* * @param location the genome location to extract data for @@ -123,17 +119,16 @@ public class SAMDataSource implements SimpleDataSource { */ public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException { - // right now this is pretty damn heavy, it copies the file list into a reader list every time + // right now this is very heavy, it copies the file list into a reader list every time SamFileHeaderMerger headerMerger = createHeaderMerger(); // make a merging iterator for this record MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); - // we do different things for locus and read modes iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1); // return the iterator - return StingSAMIteratorAdapter.adapt( reads, iter ); + return StingSAMIteratorAdapter.adapt(reads, iter); } /** @@ -149,17 +144,17 @@ public class SAMDataSource implements SimpleDataSource { if (shard.getShardType() == Shard.ShardType.READ) { iterator = seekRead((ReadShard) shard); iterator = TraversalEngine.applyDecoratingIterators(true, - iterator, - reads.getDownsamplingFraction(), - reads.getMaxOnTheFlySorts(), - reads.getSafetyChecking()); + iterator, + reads.getDownsamplingFraction(), + reads.getMaxOnTheFlySorts(), + reads.getSafetyChecking()); } else if (shard.getShardType() == Shard.ShardType.LOCUS) { iterator = seekLocus(shard.getGenomeLoc()); iterator = TraversalEngine.applyDecoratingIterators(false, - iterator, - reads.getDownsamplingFraction(), - reads.getMaxOnTheFlySorts(), - reads.getSafetyChecking()); + iterator, + reads.getDownsamplingFraction(), + reads.getMaxOnTheFlySorts(), + reads.getSafetyChecking()); } else { throw new StingException("seek: Unknown shard type"); } @@ -168,26 +163,26 @@ public class SAMDataSource implements SimpleDataSource { } - /** - * If we're in by-read mode, this indicates if we want - * to see unmapped reads too. 
Only seeing mapped reads - * is much faster, but most BAM files have significant - * unmapped read counts. - * - * @param seeUnMappedReads true to see unmapped reads, false otherwise - */ - public void viewUnmappedReads(boolean seeUnMappedReads) { - includeUnmappedReads = seeUnMappedReads; - } - /** * Gets the (potentially merged) SAM file header. + * * @return SAM file header. */ public SAMFileHeader getHeader() { - return header; + return header; } + /** + * create the merging header. + * + * @return a SamFileHeaderMerger that includes the set of SAM files we were created with + */ + private SamFileHeaderMerger createHeaderMerger() { + List lst = GetReaderList(); + return new SamFileHeaderMerger(lst, SORT_ORDER); + } + + /** *

* seek @@ -203,10 +198,8 @@ public class SAMDataSource implements SimpleDataSource { MergingSamRecordIterator2 iter = null; if (!intoUnmappedReads) { - // make a merging iterator for this record iter = new MergingSamRecordIterator2(headerMerger); - - bound = fastMappedReadSeek(shard.getSize(), iter); + bound = fastMappedReadSeek(shard.getSize(), iter); } if ((bound == null || intoUnmappedReads) && includeUnmappedReads) { if (iter != null) { @@ -218,18 +211,21 @@ public class SAMDataSource implements SimpleDataSource { if (bound == null) { shard.signalDone(); - bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), 0); + bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0); } return bound; } - private SamFileHeaderMerger createHeaderMerger() { - // TODO: make extremely less horrible - List lst = GetReaderList(); - - // now merge the headers - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER); - return headerMerger; + /** + * If we're in by-read mode, this indicates if we want + * to see unmapped reads too. Only seeing mapped reads + * is much faster, but most BAM files have significant + * unmapped read counts. + * + * @param seeUnMappedReads true to see unmapped reads, false otherwise + */ + public void viewUnmappedReads(boolean seeUnMappedReads) { + includeUnmappedReads = seeUnMappedReads; } /** @@ -242,7 +238,6 @@ public class SAMDataSource implements SimpleDataSource { * @throws SimpleDataSourceLoadException */ private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException { - BoundedReadIterator bound;// is this the first time we're doing this? 
int count = 0; SAMRecord d = null; while (iter.hasNext()) { @@ -270,15 +265,15 @@ public class SAMDataSource implements SimpleDataSource { return null; } - // we're good, increment our read cout + // we're not out of unmapped reads, so increment our read cout this.readsTaken += readCount; - return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount); + return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); } /** - * unmapped reads. + * A seek function for unmapped reads. * * @param readCount how many reads to retrieve * @param iter the iterator to use @@ -286,16 +281,10 @@ public class SAMDataSource implements SimpleDataSource { * @throws SimpleDataSourceLoadException */ private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException { - BoundedReadIterator bound;// is this the first time we're doing this? if (lastReadPos == null) { - lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0); - iter.queryContained(lastReadPos.getContig(), 1, -1); - bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount); - this.readsTaken = readCount; - } - // we're not at the beginning, not at the end, so we move forward with our ghastly plan... 
- else { - + return InitialReadIterator(readCount, iter); + } else { + BoundedReadIterator bound; iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1); // move the number of reads we read from the last pos @@ -338,7 +327,7 @@ public class SAMDataSource implements SimpleDataSource { SamFileHeaderMerger mg = createHeaderMerger(); iter = new MergingSamRecordIterator2(mg); iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE); - return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter),readCount); + return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); } } } @@ -363,11 +352,28 @@ public class SAMDataSource implements SimpleDataSource { throw new StingException("Danger: weve run out reads in fastMappedReadSeek"); //return null; } - bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount); + bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); + + // return the iterator + return bound; } - // return the iterator + } + + /** + * set the initial iterator + * + * @param readCount the number of reads + * @param iter the merging iterator + * @return a bounded read iterator at the first read position in the file. 
+ */ + private BoundedReadIterator InitialReadIterator(long readCount, MergingSamRecordIterator2 iter) { + BoundedReadIterator bound; + lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0); + iter.queryContained(lastReadPos.getContig(), 1, -1); + bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount); + this.readsTaken = readCount; return bound; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index cb545c207..4f748a490 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.io.File; @@ -61,7 +62,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Reduce this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); } - public Object execute( Walker walker, List intervals ) { + public Object execute( Walker walker, GenomeLocSortedSet intervals ) { // Fast fail for walkers not supporting TreeReducible interface. 
if( !(walker instanceof TreeReducible) ) throw new IllegalArgumentException("Hierarchical microscheduler only works with TreeReducible walkers"); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index ce6db40b0..5a579698a 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.io.File; import java.util.List; @@ -31,7 +32,7 @@ public class LinearMicroScheduler extends MicroScheduler { * @param walker Computation to perform over dataset. * @param locations Subset of the dataset over which to walk. 
*/ - public Object execute(Walker walker, List locations) { + public Object execute(Walker walker, GenomeLocSortedSet locations) { ShardStrategy shardStrategy = getShardStrategy(walker, reference, locations); walker.initialize(); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 95032dfad..90bd909a3 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -20,6 +20,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; @@ -101,7 +102,7 @@ public abstract class MicroScheduler { * @param intervals A list of intervals over which to walk. Null for whole dataset. * @return the return type of the walker */ - public abstract Object execute( Walker walker, List intervals); + public abstract Object execute( Walker walker, GenomeLocSortedSet intervals); /** * Get the sharding strategy given a driving data source. @@ -110,7 +111,7 @@ public abstract class MicroScheduler { * @param intervals Intervals to use when limiting sharding. * @return Sharding strategy for this driving data source. 
*/ - protected ShardStrategy getShardStrategy( Walker walker, ReferenceSequenceFile drivingDataSource, List intervals ) { + protected ShardStrategy getShardStrategy( Walker walker, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals ) { ShardStrategy shardStrategy = null; if( walker instanceof LocusWalker ) { diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 21171abf4..656c974a6 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -479,7 +479,7 @@ public class GenomeLoc implements Comparable, Cloneable { * @return A GenomeLoc with the same contents as the current loc. */ @Override - public Object clone() { + public GenomeLoc clone() { return new GenomeLoc(this); } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 850278f91..6e57f6a49 100755 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -6,6 +6,7 @@ import net.sf.samtools.SAMSequenceRecord; import java.util.AbstractSet; import java.util.ArrayList; import java.util.Iterator; +import java.util.List; /** * @@ -26,22 +27,22 @@ import java.util.Iterator; /** * @author aaron - * @version 1.0 - * @date May 22, 2009 - *

- * Class GenomeLocCollection - *

- * a set of genome locations. This collection is self sorting, - * and will merge genome locations that are overlapping. The remove function - * will also remove a region from the list, if the region to remove is a - * partial interval of a region in the collection it will remove the region from - * that element. + *

+ * Class GenomeLocCollection + *

+ * a set of genome locations. This collection is self sorting, + * and will merge genome locations that are overlapping. The remove function + * will also remove a region from the list, if the region to remove is a + * partial interval of a region in the collection it will remove the region from + * that element. */ public class GenomeLocSortedSet extends AbstractSet { // our private storage for the GenomeLoc's private final ArrayList mArray = new ArrayList(); - public GenomeLocSortedSet() {} + /** default constructor */ + public GenomeLocSortedSet() { + } /** * get an iterator over this collection @@ -72,7 +73,9 @@ public class GenomeLocSortedSet extends AbstractSet { /** * add a genomeLoc to the collection, simply inserting in order into the set + * * @param e the GenomeLoc to add + * * @return true */ public boolean add(GenomeLoc e) { @@ -82,7 +85,7 @@ public class GenomeLocSortedSet extends AbstractSet { int index = 0; while (index < mArray.size()) { if (!e.isPast(mArray.get(index))) { - mArray.add(index,e); + mArray.add(index, e); return true; } ++index; @@ -96,6 +99,7 @@ public class GenomeLocSortedSet extends AbstractSet { * If it's not overlapping then we add it in sorted order. * * @param e the GenomeLoc to add to the collection + * * @return true, if the GenomeLoc could be added to the collection */ public boolean addRegion(GenomeLoc e) { @@ -112,7 +116,7 @@ public class GenomeLocSortedSet extends AbstractSet { for (GenomeLoc g : mArray) { if (g.contiguousP(e)) { GenomeLoc c = g.merge(e); - mArray.set(mArray.indexOf(g),c); + mArray.set(mArray.indexOf(g), c); haveAdded = true; } else if ((g.getContigIndex() == e.getContigIndex()) && (e.getStart() < g.getStart()) && !haveAdded) { @@ -132,7 +136,9 @@ public class GenomeLocSortedSet extends AbstractSet { /** * remove an element from the set. Given a specific genome location, this function will * remove all regions in the element set that overlap the specified region. 
+ * * @param e the genomic range to remove + * * @return true if a removal action was performed, false if the collection was unchanged. */ public boolean removeRegion(GenomeLoc e) { @@ -148,7 +154,7 @@ public class GenomeLocSortedSet extends AbstractSet { */ for (GenomeLoc g : mArray) { if (g.overlapsP(e)) { - if (g.compareTo(e) == 0) { + if (g.equals(e)) { mArray.remove(mArray.indexOf(g)); return true; } else if (g.containsP(e)) { @@ -162,11 +168,15 @@ public class GenomeLocSortedSet extends AbstractSet { * |------| + |--------| * */ - GenomeLoc before = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1); + GenomeLoc before = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); GenomeLoc after = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); int index = mArray.indexOf(g); - mArray.add(index, after); - mArray.add(index, before); + if (after.getStop() - after.getStart() > 0) { + mArray.add(index, after); + } + if (before.getStop() - before.getStart() > 0) { + mArray.add(index, before); + } mArray.remove(mArray.indexOf(g)); return true; } else if (e.containsP(g)) { @@ -194,12 +204,12 @@ public class GenomeLocSortedSet extends AbstractSet { * |------------- g ----------| * |------------ e -----------| * - */ + */ if (e.getStart() < g.getStart()) { - l = new GenomeLoc(g.getContigIndex(), e.getStop()+1, g.getStop()); + l = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); } else { - l = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1); + l = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); } // replace g with the new region mArray.set(mArray.indexOf(g), l); @@ -212,14 +222,45 @@ public class GenomeLocSortedSet extends AbstractSet { /** * create a list of genomic locations, given a reference sequence + * * @param dict the sequence dictionary to create a collection from + * * @return the GenomeLocSet of all references sequences as GenomeLoc's */ public static GenomeLocSortedSet 
createSetFromSequenceDictionary(SAMSequenceDictionary dict) { GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(); for (SAMSequenceRecord record : dict.getSequences()) { - returnSortedSet.add(new GenomeLoc(record.getSequenceIndex(),1,record.getSequenceLength())); + returnSortedSet.add(new GenomeLoc(record.getSequenceIndex(), 1, record.getSequenceLength())); } return returnSortedSet; } + + /** + * Create a sorted genome location set from a list of GenomeLocs. + * @param locs the list + * @return the sorted genome loc list + */ + public static GenomeLocSortedSet createSetFromList(List locs) { + GenomeLocSortedSet set = new GenomeLocSortedSet(); + for (GenomeLoc l: locs) { + set.add(l); + } + return set; + } + + + /** + * return a deep copy of this collection. + * + * @return a new GenomeLocSortedSet, indentical to the current GenomeLocSortedSet. + */ + public GenomeLocSortedSet clone() { + GenomeLocSortedSet ret = new GenomeLocSortedSet(); + for (GenomeLoc loc : this.mArray) { + // ensure a deep copy + ret.mArray.add(new GenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStop())); + } + return ret; + } + } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java new file mode 100755 index 000000000..ed17fd4b9 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalReadShardTest.java @@ -0,0 +1,73 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; +import org.junit.Before; +import org.junit.Test; +import static org.junit.Assert.assertTrue; +import net.sf.samtools.SAMFileHeader; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of 
charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class IntervalReadShardTest + *

+ * Tests for the IntervalReadShard class. + */ +public class IntervalReadShardTest extends BaseTest { + + private IntervalReadShard shard = null; + private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); + private static final int NUMBER_OF_CHROMOSOMES = 5; + private static final int STARTING_CHROMOSOME = 1; + private static final int CHROMOSOME_SIZE = 1000; + + @Before + public void setup() { + GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary()); + } + + + @Test + public void simpleReturn() { + GenomeLoc loc = new GenomeLoc(1, 1, 100); + shard = new IntervalReadShard(loc); + assertTrue(shard.getGenomeLoc().equals(loc)); + } + + @Test + public void ensureNotReference() { + GenomeLoc loc = new GenomeLoc(1, 1, 100); + shard = new IntervalReadShard(loc); + assertTrue(shard.getGenomeLoc() != loc && shard.getGenomeLoc().equals(loc)); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java deleted file mode 100755 index 38ffca578..000000000 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java +++ /dev/null @@ -1,142 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.shards; - -import static junit.framework.Assert.assertEquals; -import static junit.framework.Assert.fail; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; -import org.junit.*; - -import java.io.File; -import java.util.ArrayList; - -/** - * - * User: aaron - * Date: May 14, 2009 - * Time: 3:52:57 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 
by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date May 14, 2009 - *

- * Class LocusWindowShardStrategyTest - *

- * LocusWindowShardStrategy tests - */ -public class IntervalShardStrategyTest extends BaseTest { - - private static FastaSequenceFile2 seq; - - /** - * This function (because of the @BeforeClass tag) gets called only once ever, - * before any tests are run - */ - @BeforeClass - public static void doBeforeAnyTests() { - seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterClass - public static void doAfterAllTests() { - - } - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @Before - public void doForEachTest() { - - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @After - public void undoForEachTest() { - - } - - /** Tests that we got a string parameter in correctly */ - @Test - public void testIntervalGenomeCycle() throws InterruptedException { - logger.warn("Executing testIntervalGenomeCycle"); - - SAMSequenceDictionary dic = seq.getSequenceDictionary(); - - - // setup a list of genome locs that represent the whole file - SAMSequenceRecord s = dic.getSequence(1); - int stop = s.getSequenceLength(); - int size = 10000; - int location = 1; - - GenomeLoc.setupRefContigOrdering(dic); - // keep track of the number of genome locs we build - int genomeLocs = 0; - ArrayList locations = new ArrayList(); - try { - while (location + size < stop) { - // lets make up some fake locations - GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1); - logger.debug("loc = " + location); - - // let's move the location up, with a size space - location += (size * 2); - - // add our current location to the list - locations.add(gl); - - // add another genome location - ++genomeLocs; - } - } catch (Exception e) { - e.printStackTrace(); - } - logger.debug("Location count = " + genomeLocs); - ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, seq.getSequenceDictionary(), 0, locations); - int shardCount = 0; - try { - for (Shard sh : strategy) { - GenomeLoc l = sh.getGenomeLoc(); - GenomeLoc truth = locations.get(shardCount); - if (l.compareTo(truth) != 0) { - String truthStr = truth.getContig() + ":" + truth.getStart() + ":" + truth.getStop(); - String lStr = l.getContig() + ":" + l.getStart() + ":" + l.getStop(); - fail("Genome loc " + truthStr + " doesn't equal " + lStr); - } - shardCount++; - } - assertEquals(shardCount, genomeLocs); - - } catch (Exception e) { - e.printStackTrace(); - fail("testIntervalGenomeCycle: ne exception expected"); - } - } - -} diff --git 
a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java new file mode 100755 index 000000000..69c55d46e --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/LocusIntervalShardStrategyTest.java @@ -0,0 +1,79 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; +import org.broadinstitute.sting.BaseTest; +import org.junit.Before; +import org.junit.Test; +import static org.junit.Assert.assertTrue; +import net.sf.samtools.SAMFileHeader; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class LocusIntervalShardStrategyTest + *

+ * Tests the LocusIntervalShardStrategy class. + */ +public class LocusIntervalShardStrategyTest extends BaseTest { + private GenomeLocSortedSet mSortedSet = null; + private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); + private static final int NUMBER_OF_CHROMOSOMES = 5; + private static final int STARTING_CHROMOSOME = 1; + private static final int CHROMOSOME_SIZE = 1000; + private LocusIntervalShardStrategy strat = null; + + @Before + public void setup() { + GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary()); + mSortedSet = new GenomeLocSortedSet(); + } + + @Test + public void testOneToOneness() { + for (int x = 0; x < 100; x++) { + GenomeLoc loc = new GenomeLoc(0,(x*10)+1, (x*10)+8); + mSortedSet.add(loc); + } + strat = new LocusIntervalShardStrategy(header.getSequenceDictionary(),mSortedSet); + int counter = 0; + while (strat.hasNext()) { + ++counter; + GenomeLoc loc = strat.next().getGenomeLoc(); + long stop = loc.getStop(); + long start = loc.getStart(); + long length = stop - start; + assertTrue(length == 7); + } + assertTrue(counter == 100); + + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java new file mode 100755 index 000000000..0b6351ba7 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ReadIntervalShardStrategyTest.java @@ -0,0 +1,124 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import org.junit.Test; +import org.junit.Before; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; +import 
org.broadinstitute.sting.BaseTest; +import net.sf.samtools.SAMFileHeader; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class ReadIntervalShardStrategyTest + *

+ * Tests the ReadIntervalShardStrategy class + */ +public class ReadIntervalShardStrategyTest extends BaseTest { + + private GenomeLocSortedSet mSortedSet = null; + private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); + private static final int NUMBER_OF_CHROMOSOMES = 5; + private static final int STARTING_CHROMOSOME = 1; + private static final int CHROMOSOME_SIZE = 1000; + + @Before + public void setup() { + GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary()); + mSortedSet = new GenomeLocSortedSet(); + } + + @Test(expected = StingException.class) + public void testExceptionOnEmpty() { + ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + } + + @Test + public void testSingleChromosomeFunctionality() { + GenomeLoc loc = new GenomeLoc(1, 1, 1000); + mSortedSet.add(loc); + ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + int counter = 0; + while (strat.hasNext()) { + Shard d = strat.next(); + counter++; + } + assertEquals(10, counter); + } + + @Test + public void testMultipleChromosomeFunctionality() { + for (int x = 0; x < 5; x++) { + GenomeLoc loc = new GenomeLoc(x, 1, 1000); + mSortedSet.add(loc); + } + ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + int counter = 0; + while (strat.hasNext()) { + Shard d = strat.next(); + counter++; + } + assertEquals(50, counter); + } + + @Test + public void testOddSizeShardFunctionality() { + for (int x = 0; x < 5; x++) { + GenomeLoc loc = new GenomeLoc(x, 1, 1000); + mSortedSet.add(loc); + } + ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 789, mSortedSet); + int counter = 0; + while (strat.hasNext()) { + Shard d = strat.next(); + if (counter % 2 == 0) { + assertEquals(1, 
d.getGenomeLoc().getStart()); + assertEquals(789, d.getGenomeLoc().getStop()); + } else { + assertEquals(790, d.getGenomeLoc().getStart()); + assertEquals(1000, d.getGenomeLoc().getStop()); + } + counter++; + } + assertEquals(10, counter); + } + + @Test(expected = UnsupportedOperationException.class) + public void testRemove() { + GenomeLoc loc = new GenomeLoc(1, 1, 1000); + mSortedSet.add(loc); + ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet); + strat.remove(); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java index 88f4d069f..af1eccd5c 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java @@ -4,10 +4,14 @@ import static junit.framework.Assert.assertEquals; import static junit.framework.Assert.fail; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import org.junit.*; +import static org.junit.Assert.assertTrue; import java.io.File; import java.util.ArrayList; @@ -32,139 +36,62 @@ import java.util.ArrayList; /** * @author aaron * @version 1.0 - * @date Apr 8, 2009 - *

- * Class ShardFactoryTest - *

- * Tests the shard strategy factory. This tests the whole sharding interface, and should be - * split in the future into seperate test cases. - * TODO: split out for the seperate sharding classes */ public class ShardStrategyFactoryTest extends BaseTest { - private static FastaSequenceFile2 seq; + private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); + private static final int NUMBER_OF_CHROMOSOMES = 5; + private static final int STARTING_CHROMOSOME = 1; + private static final int CHROMOSOME_SIZE = 1000; + private GenomeLocSortedSet set = null; - /** - * This function (because of the @BeforeClass tag) gets called only once ever, - * before any tests are run - */ - @BeforeClass - public static void doBeforeAnyTests() { - seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); - } - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterClass - public static void doAfterAllTests() { - - } - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ @Before - public void doForEachTest() { - + public void setup() { + GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary()); + set = new GenomeLocSortedSet(); } - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @After - public void undoForEachTest() { - - } - - /** Tests that we got a string parameter in correctly */ @Test - public void testFullGenomeCycle() { - logger.warn("Executing testFullGenomeCycle"); - - GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); - - ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); - int shardCount = 0; - try { - - for (Shard s : strategy) { - GenomeLoc l = s.getGenomeLoc(); - //logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig()); - shardCount++; - } - - // check to make sure we got apple shards - //logger.debug("shardCount : " + shardCount + " seq size = " + seq.getSequenceDictionary().size()); - - } catch (Exception e) { - e.printStackTrace(); - fail("We Shouldn't of seen an exception! : " + e.getMessage() + "; shard count " + shardCount); - } + public void testReadNonInterval() { /* READS strategy with no interval set: the factory result must be a ReadShardStrategy. */ + ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100); + assertTrue(st instanceof ReadShardStrategy); } - - /** Tests that we got a string parameter in correctly */ @Test - public void testIntervalGenomeCycle() throws InterruptedException { - logger.warn("Executing testIntervalGenomeCycle"); - - SAMSequenceDictionary dic = seq.getSequenceDictionary(); - SAMSequenceRecord s = dic.getSequence(1); - // Character stream writing - - - int stop = s.getSequenceLength(); - int size = 10000; - int location = 1; - GenomeLoc.setupRefContigOrdering(dic); - // keep track of the number of genome locs we build - int genomeLocs = 0; - ArrayList locations = new ArrayList(); - - try { - while (location + size < stop) { - logger.debug("s = " + s.getSequenceName() + " " + location + " " + size); - // lets make up some fake locations - GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1); 
- logger.debug("loc = " + location); - - // let's move the location up, with a size space - location += (size * 2); - - // add our current location to the list - locations.add(gl); - - // add another genome location - ++genomeLocs; - } - } catch (Exception e) { - e.printStackTrace(); - } - logger.debug("Location count = " + genomeLocs); - ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations); - int shardCount = 0; - try { - for (Shard sh : strategy) { - GenomeLoc l = sh.getGenomeLoc(); - - logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig()); - shardCount++; - } - - logger.debug("Shard count = " + shardCount); - assertEquals(shardCount, genomeLocs * 2); - - } catch (Exception e) { - e.printStackTrace(); - fail("testIntervalGenomeCycle: ne exception expected"); - } + public void testReadInterval() { /* READS strategy plus an interval set holding GenomeLoc(0,1,100): the result must be a ReadIntervalShardStrategy. NOTE(review): the set field is declared outside this hunk; confirm it is reset between tests. */ + GenomeLoc l = new GenomeLoc(0,1,100); + set.add(l); + ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set); + assertTrue(st instanceof ReadIntervalShardStrategy); } + @Test + public void testLinearNonInterval() { /* LINEAR strategy with no interval set: the result must be a LinearLocusShardStrategy. */ + ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100); + assertTrue(st instanceof LinearLocusShardStrategy); + } + + @Test + public void testExpNonInterval() { /* EXPONENTIAL strategy with no interval set: the result must be an ExpGrowthLocusShardStrategy. */ + ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100); + assertTrue(st instanceof ExpGrowthLocusShardStrategy); + } + + @Test + public void testExpInterval() { /* EXPONENTIAL strategy plus an interval set holding GenomeLoc(0,1,100): the result must still be an ExpGrowthLocusShardStrategy. */ + GenomeLoc l = new GenomeLoc(0,1,100); + set.add(l); + ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100,set); + assertTrue(st instanceof ExpGrowthLocusShardStrategy); + } + + @Test + public 
void testLinearInterval() { /* LINEAR strategy plus an interval set holding GenomeLoc(0,1,100): the result must still be a LinearLocusShardStrategy. */ + GenomeLoc l = new GenomeLoc(0,1,100); + set.add(l); + ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100,set); + assertTrue(st instanceof LinearLocusShardStrategy); + } + } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java index 193ea0f42..5edcd0394 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMByReadsTest.java @@ -75,7 +75,7 @@ public class SAMByReadsTest extends BaseTest { final int targetReadCount = 5000; - ShardStrategy shardStrategy = ShardStrategyFactory.shatterByReadCount(seq.getSequenceDictionary(),targetReadCount); + ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,seq.getSequenceDictionary(),targetReadCount); /* replaces the removed shatterByReadCount call with the generic shatter entry point and the READS strategy */ try { SAMDataSource data = new SAMDataSource(reads); diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java index 46aaf22de..641ee0bd8 100755 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java @@ -29,7 +29,6 @@ import java.util.Iterator; /** * @author aaron * @version 1.0 - * @date May 22, 2009 *

* Class GenomeLocSetTest *

@@ -142,6 +141,32 @@ public class GenomeLocSortedSetTest extends BaseTest { assertTrue(loc.getContigIndex() == 1); } + @Test + public void deleteAllByRegion() { /* Adds GenomeLoc(1, 1, 100), then removes the single-position regions x = 1..100 one at a time; the set must end up empty. */ + GenomeLoc e = new GenomeLoc(1, 1, 100); + mSortedSet.add(e); + for (int x = 1; x < 101; x++) { + GenomeLoc del = new GenomeLoc(1,x,x); + mSortedSet.removeRegion(del); + } + assertTrue(mSortedSet.isEmpty()); + } + @Test + public void deleteSomeByRegion() { /* Adds GenomeLoc(1, 1, 100), then removes the single-position regions x = 1..49; exactly one interval must remain, with start 50 and stop 100. */ + GenomeLoc e = new GenomeLoc(1, 1, 100); + mSortedSet.add(e); + for (int x = 1; x < 50; x++) { + GenomeLoc del = new GenomeLoc(1,x,x); + mSortedSet.removeRegion(del); + } + assertTrue(!mSortedSet.isEmpty()); + assertTrue(mSortedSet.size() == 1); + GenomeLoc loc = mSortedSet.iterator().next(); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getStart() == 50); + + } + @Test public void deleteSuperRegion() { GenomeLoc e = new GenomeLoc(1, 10, 20);