diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java new file mode 100755 index 000000000..70838af86 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategy.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; + +/** + * + * User: aaron + * Date: May 14, 2009 + * Time: 3:28:50 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 14, 2009 + *

+ * Class IntervalShardStrategy + *

+ * This class knows how to shard on genome loc boundaries. It is intended to guarantee + * a one-to-one mapping between a GenomeLoc interval and the shard produced for it. + */ +public class IntervalShardStrategy extends LocusShardStrategy { + /** + * the constructor, taking a seq dictionary to parse out contigs + * + * @param dic the seq dictionary + * @param intervals the list of intervals to shard on + */ + IntervalShardStrategy(SAMSequenceDictionary dic, List intervals) { + super(dic, intervals); + } + + /** + * Proposes the size of the next shard from the current interval. NOTE(review): getCurrentInterval() is dereferenced without a null check - confirm it cannot return null here. + * + * @return the current interval's stop minus its start + */ + protected long nextShardSize() { + long nextSize = this.getCurrentInterval().getStop() - this.getCurrentInterval().getStart(); + return nextSize; + } + + /** + * Currently a no-op: the body is empty, so the requested size is ignored. + * + * @param size the requested next shard size (unused) + */ + public void adjustNextShardSize(long size) { + //To change body of implemented methods use File | Settings | File Templates. + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java index 5ddfaebaa..5907fd7f4 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java @@ -143,7 +143,7 @@ public abstract class LocusShardStrategy implements ShardStrategy { if (this.intervals == null) { return nonIntervaledNext(length, proposedSize, nextStart); } else { - return intervaledNext(length, proposedSize, nextStart); + return intervaledNext(proposedSize, nextStart); } } @@ -151,16 +151,15 @@ public abstract class LocusShardStrategy implements ShardStrategy { /** * Interval based next processing * - * @param length the length of the sequence * @param proposedSize the proposed size * @param nextStart where we start from * @return the shard that represents this data */ - private Shard intervaledNext(long length, long 
proposedSize, long nextStart) { + private Shard intervaledNext(long proposedSize, long nextStart) { // get the current genome location GenomeLoc loc = intervals.get(currentInterval); - if (nextStart + proposedSize > loc.getStop()) { - // we need to move the next interval + if (nextStart + proposedSize >= loc.getStop()) { + // we need to get the rest of the current loc in a shard (return it), and move to the next location proposedSize = loc.getStop() - nextStart; lastGenomeLocSize = proposedSize; @@ -265,4 +264,16 @@ public abstract class LocusShardStrategy implements ShardStrategy { return this; } + /** + * Lets a shard strategy subclass read the interval currently being processed. It's kind of a hack, but for the + * IntervalShardStrategy it was the best approach. NOTE(review): currentInterval is not checked against intervals.size(). + * @return the interval at index currentInterval, or null when intervals is null or currentInterval is negative + */ + protected GenomeLoc getCurrentInterval() { + if (this.intervals == null || currentInterval < 0) { + return null; + } + return intervals.get(currentInterval); + } + } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java index a5b0d3745..c4d1b63c6 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; import java.util.List; @@ -35,7 +36,7 @@ import java.util.List; */ public class ShardStrategyFactory { public enum SHATTER_STRATEGY { - LINEAR, EXPONENTIAL, READS + LINEAR, EXPONENTIAL, READS, INTERVAL } /** our log, which we want to capture anything from this class */ @@ -59,7 +60,7 @@ public class ShardStrategyFactory { case READS: return new ReadShardStrategy(dic, startingSize); default: - 
throw new RuntimeException("Strategy: " + strat + " isn't implemented"); + throw new StingException("Strategy: " + strat + " isn't implemented for this type of shatter request"); } } @@ -78,7 +79,7 @@ public class ShardStrategyFactory { case EXPONENTIAL: return new ExpGrowthLocusShardStrategy(convertFrom); default: - throw new RuntimeException("Strategy: " + strat + " isn't implemented"); + throw new StingException("Strategy: " + strat + " isn't implemented"); } } @@ -100,15 +101,17 @@ public class ShardStrategyFactory { return new ExpGrowthLocusShardStrategy(dic, startingSize, lst); case READS: // return new ReadShardStrategy(dic, startingSize); - throw new RuntimeException("Strategy: " + strat + " isn't implemented for intervals"); + throw new StingException("Strategy: " + strat + " isn't implemented for intervals"); + case INTERVAL: + return new IntervalShardStrategy(dic, lst); default: - throw new RuntimeException("Strategy: " + strat + " isn't implemented"); + throw new StingException("Strategy: " + strat + " isn't implemented"); } } /** - * convert between types + * setup a reads shattering strategy * * @param readCount the number of reads to include in each shard * @return diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java new file mode 100755 index 000000000..38ffca578 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/IntervalShardStrategyTest.java @@ -0,0 +1,142 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.fail; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.junit.*; + 
+import java.io.File; +import java.util.ArrayList; + +/** + * + * User: aaron + * Date: May 14, 2009 + * Time: 3:52:57 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 14, 2009 + *

+ * Class IntervalShardStrategyTest + *

+ * IntervalShardStrategy tests + */ +public class IntervalShardStrategyTest extends BaseTest { + + private static FastaSequenceFile2 seq; + + /** + * Creates the FastaSequenceFile2 for the reference FASTA and stores it in seq. This function (because of the @BeforeClass tag) gets called only once ever, + * before any tests are run + */ + @BeforeClass + public static void doBeforeAnyTests() { + seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + } + + /** + * Class-level teardown hook (because of the @AfterClass tag); the body is currently empty. + *

+ * Called once, after all test case methods. + */ + @AfterClass + public static void doAfterAllTests() { + + } + + /** + * Per-test setup hook; the body is currently empty. + *

+ * Called before every test case method. + */ + @Before + public void doForEachTest() { + + } + + /** + * Per-test teardown hook; the body is currently empty. + *

+ * Called after every test case method. + */ + @After + public void undoForEachTest() { + + } + + /** Tests that the INTERVAL shatter strategy yields one shard per supplied interval, each comparing equal to the interval at the same index */ + @Test + public void testIntervalGenomeCycle() throws InterruptedException { + logger.warn("Executing testIntervalGenomeCycle"); + + SAMSequenceDictionary dic = seq.getSequenceDictionary(); + + + // setup a list of evenly spaced genome locs along the sequence at dictionary index 1 + SAMSequenceRecord s = dic.getSequence(1); + int stop = s.getSequenceLength(); + int size = 10000; + int location = 1; + + GenomeLoc.setupRefContigOrdering(dic); + // keep track of the number of genome locs we build + int genomeLocs = 0; + ArrayList locations = new ArrayList(); + try { + while (location + size < stop) { + // lets make up some fake locations + GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1); + logger.debug("loc = " + location); + + // let's move the location up, with a size space + location += (size * 2); + + // add our current location to the list + locations.add(gl); + + // add another genome location + ++genomeLocs; + } + } catch (Exception e) { + e.printStackTrace(); + } + logger.debug("Location count = " + genomeLocs); + ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, seq.getSequenceDictionary(), 0, locations); + int shardCount = 0; + try { + for (Shard sh : strategy) { + GenomeLoc l = sh.getGenomeLoc(); + GenomeLoc truth = locations.get(shardCount); + if (l.compareTo(truth) != 0) { + String truthStr = truth.getContig() + ":" + truth.getStart() + ":" + truth.getStop(); + String lStr = l.getContig() + ":" + l.getStart() + ":" + l.getStop(); + fail("Genome loc " + truthStr + " doesn't equal " + lStr); + } + shardCount++; + } + assertEquals(shardCount, genomeLocs); + + } catch (Exception e) { + e.printStackTrace(); + fail("testIntervalGenomeCycle: ne exception expected"); + } + } + +} diff --git 
a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java index a2d69ed62..88f4d069f 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java @@ -124,11 +124,10 @@ public class ShardStrategyFactoryTest extends BaseTest { int size = 10000; int location = 1; GenomeLoc.setupRefContigOrdering(dic); - logger.debug("done to sleep"); // keep track of the number of genome locs we build int genomeLocs = 0; ArrayList locations = new ArrayList(); - logger.debug("done to sleep2"); + try { while (location + size < stop) { logger.debug("s = " + s.getSequenceName() + " " + location + " " + size);