diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthShardStrategy.java
index 5a7e479a5..af206f144 100644
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthShardStrategy.java
@@ -1,6 +1,9 @@
 package org.broadinstitute.sting.gatk.dataSources.shards;
 
 import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;
 
 /**
  *
@@ -45,6 +48,19 @@ public class ExpGrowthShardStrategy extends ShardStrategy {
         currentExp = 0;
     }
 
+    /**
+     * the constructor, taking a seq dictionary and a list of intervals to shard over
+     *
+     * @param dic       the seq dictionary
+     * @param startSize the starting (base) shard size
+     * @param lst       the list of genome locations to iterate over
+     */
+    ExpGrowthShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
+        super(dic, lst);
+        this.baseSize = startSize;
+        currentExp = 0;
+    }
+
     /**
      * the constructor, taking a seq dictionary to parse out contigs
      *
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShardStrategy.java
index c435fb36e..bdc0956f5 100644
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShardStrategy.java
@@ -1,6 +1,9 @@
 package org.broadinstitute.sting.gatk.dataSources.shards;
 
 import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;
 
 /**
  *
@@ -53,7 +56,17 @@ class LinearShardStrategy extends ShardStrategy {
         this.nextShardSize = strat.nextShardSize();
     }
 
-
+    /**
+     * the constructor, taking a seq dictionary and a list of intervals to shard over
+     *
+     * @param dic       the seq dictionary
+     * @param startSize the size to use for the next shard
+     * @param lst       the list of genome locations to iterate over
+     */
+    LinearShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
+        super(dic, lst);
+        this.nextShardSize = startSize;
+    }
     /**
      * set the next shards size
      *
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java
index d1416f3b0..e35e92254 100644
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java
@@ -6,6 +6,8 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.GenomeLoc;
 
 import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
 /**
  *
  * User: aaron
@@ -52,6 +54,10 @@ public abstract class ShardStrategy implements Iterator, Iterable
 
     /** our log, which we want to capture anything from this class */
     private static Logger logger = Logger.getLogger(ShardStrategy.class);
+    /** the intervals to shard over; null when we shard whole contigs instead */
+    private List<GenomeLoc> intervals = null;
+    /** index into intervals of the interval currently being sharded */
+    private int currentInterval = -1;
 
     /**
      * the constructor, taking a seq dictionary to parse out contigs
@@ -79,6 +85,28 @@ public abstract class ShardStrategy implements Iterator, Iterable
         this.nextContig = old.nextContig;
     }
 
+    /**
+     * the constructor, taking a seq dictionary and the intervals to shard over
+     *
+     * @param dic       the seq dictionary
+     * @param intervals the genome locations to iterate over; must contain at least one interval
+     */
+    ShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
+        if (intervals == null || intervals.isEmpty()) {
+            throw new IllegalArgumentException("ShardStrategy requires at least one interval");
+        }
+        this.dic = dic;
+        this.intervals = intervals;
+        this.currentInterval = 0;
+
+        // park mLoc one base before the first interval, so next() starts on the interval's first base
+        GenomeLoc first = intervals.get(0);
+        mLoc = new GenomeLoc(first.getContig(), first.getStart() - 1, first.getStart() - 1);
+        if (dic.getSequences().size() > 0) {
+            nextContig = true;
+        }
+    }
+
     /**
      *
      * Abstract methods that each strategy has to implement
@@ -110,31 +138,92 @@ public abstract class ShardStrategy implements Iterator, Iterable
     /**
      * get the next shard, based on the return size of nextShardSize
      *
-     * @return
+     * @return the next shard
      */
     public Shard next() {
+        // lets get some background info on the problem
         long length = dic.getSequence(seqLoc).getSequenceLength();
         long proposedSize = nextShardSize();
         long nextStart = mLoc.getStop() + 1;
 
+        // without an interval list we shard whole contigs; otherwise we shard within the intervals
+        if (this.intervals == null) {
+            return nonIntervaledNext(length, proposedSize, nextStart);
+        } else {
+            return intervaledNext(length, proposedSize, nextStart);
+        }
+    }
+
+    /**
+     * Get the next shard when traversing a list of intervals.  Shards never cross an interval
+     * boundary: a shard that would run past the end of the current interval is clipped to the
+     * interval's last base, and the following shard starts on the first base of the next interval.
+     *
+     * @param length       the length of the current contig (not used when traversing intervals)
+     * @param proposedSize the proposed size
+     * @param nextStart    the next start location
+     * @return the shard to return to the user
+     */
+    private Shard intervaledNext(long length, long proposedSize, long nextStart) {
+        if (currentInterval >= intervals.size()) {
+            throw new NoSuchElementException("No intervals left to shard");
+        }
+        GenomeLoc loc = intervals.get(currentInterval);
+
+        // start and stop are both inclusive, so proposedSize bases end at nextStart + proposedSize - 1
+        long stop = nextStart + proposedSize - 1;
+        if (stop >= loc.getStop()) {
+            // clip to the last base of this interval, then move on to the next interval
+            stop = loc.getStop();
+            ++currentInterval;
+            if (currentInterval < intervals.size()) {
+                // park mLoc one base before the next interval so the following shard starts on its first base
+                GenomeLoc upcoming = intervals.get(currentInterval);
+                mLoc = new GenomeLoc(upcoming.getContig(), upcoming.getStart() - 1, upcoming.getStart() - 1);
+            }
+        } else {
+            // still inside the current interval; remember where this shard ended
+            mLoc = new GenomeLoc(loc.getContig(), nextStart, stop);
+        }
+
+        lastGenomeLocSize = stop - nextStart + 1;
+        return Shard.toShard(new GenomeLoc(loc.getContig(), nextStart, stop));
+    }
+
+    /**
+     * Get the next shard, if we don't have intervals to traverse over
+     *
+     * @param length       the length of the contig
+     * @param proposedSize the proposed size
+     * @param nextStart    the next start location
+     * @return the shard to return to the user
+     */
+    private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
         // can we fit it into the current seq size?
         if (nextStart + proposedSize - 1 < length) {
             lastGenomeLocSize = proposedSize;
-            mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1);
-            return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1));
+            mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize - 1);
+            return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize - 1));
         }
         // else we can't make it in the current location, we have to stitch one together
         else {
-            long overflow = nextStart + proposedSize -1 - length;
+            // lets find out the remaining size of the current contig
+            long overflow = nextStart + proposedSize - 1 - length;
             logger.debug("Overflow = " + overflow + " length: " + length);
+
+            // set our last size counter to the remaining size
             lastGenomeLocSize = proposedSize - overflow;
 
+            // move to the next contig
             // the next sequence should start at the begining of the next contig
             Shard ret = Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + lastGenomeLocSize));
+
+            // now jump ahead to the next contig
             jumpContig();
+
+            // return the shard
             return ret;
         }
-
     }
 
     /** jump to the next contig */
@@ -149,7 +238,6 @@ public abstract class ShardStrategy implements Iterator, Iterable
 
 
         mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0);
-
     }
 
     /**
@@ -158,7 +246,12 @@ public abstract class ShardStrategy implements Iterator, Iterable
      * @return
      */
     public boolean hasNext() {
-        return nextContig;
+        // if we don't have an interval list, use the non interval based approach
+        if (this.intervals == null) {
+            return nextContig;
+        } else {
+            return (this.currentInterval < this.intervals.size());
+        }
     }
 
     /** we don't support remove */
@@ -177,4 +270,5 @@ public abstract class ShardStrategy implements Iterator, Iterable
 
     }
 
+
 }
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java
index 31717205e..3c6877980 100644
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java
@@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
 
 import net.sf.samtools.SAMSequenceDictionary;
 import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;
 
 /**
  *
@@ -59,6 +62,26 @@ public class ShardStrategyFactory {
 
     }
 
+    /**
+     * get a new shatter strategy that shards over a list of intervals
+     *
+     * @param strat        what's our strategy - SHATTER_STRATEGY type
+     * @param dic          the seq dictionary
+     * @param startingSize the starting size
+     * @param lst          the list of genome locations to iterate over
+     * @return a shard strategy of the requested type, restricted to the given intervals
+     */
+    public static ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List<GenomeLoc> lst) {
+        switch (strat) {
+            case LINEAR:
+                return new LinearShardStrategy(dic, startingSize, lst);
+            case EXPONENTIAL:
+                return new ExpGrowthShardStrategy(dic, startingSize, lst);
+            default:
+                throw new RuntimeException("Strategy: " + strat + " isn't implemented");
+        }
+    }
+
     /**
      * convert between types
      *
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
index 2c2a913bd..94dac4697 100644
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
@@ -31,9 +31,6 @@ public class SAMBAMDataSource implements SimpleDataSource {
     /** our log, which we want to capture anything from this class */
     protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);
 
-    // our sam file readers
-    private final ArrayList readers = new ArrayList();
-
     // are we set to locus mode or read mode for dividing
     private boolean locusMode = true;
 
diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java
index fa15e9773..05f7df91d 100755
--- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java
@@ -1,11 +1,16 @@
 package org.broadinstitute.sting.gatk.dataSources.shards;
 
+import static junit.framework.Assert.assertEquals;
 import static junit.framework.Assert.fail;
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
 import org.broadinstitute.sting.utils.FastaSequenceFile2;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.junit.*;
 
 import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  *
@@ -94,7 +99,44 @@ public class ShardStrategyFactoryTest {
 
         } catch (Exception e) {
             e.printStackTrace();
             fail("We Shouldn't of seen an exception! : " + e.getMessage() + "; shard count " + shardCount);
+        }
+    }
+
+    /** Tests that an interval list is split into shards that start and stop exactly on the interval boundaries */
+    @Test
+    public void testIntervalGenomeCycle() {
+        SAMSequenceDictionary dic = seq.getSequenceDictionary();
+        SAMSequenceRecord s = dic.getSequence(1);
+        int stop = s.getSequenceLength();
+        int size = 10000;
+        int location = 1;
+
+        // make up some fake intervals: size bases long, separated by gaps of size bases
+        List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
+        while (location + size < stop) {
+            locations.add(new GenomeLoc(s.getSequenceName(), location, location + size - 1));
+            location += (size * 2);
+        }
+
+        // shards are half an interval long, so each interval should yield exactly two shards
+        ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, dic, size / 2, locations);
+        int shardCount = 0;
+        try {
+            for (Shard sh : strategy) {
+                GenomeLoc l = sh.getGenomeLoc();
+                GenomeLoc interval = locations.get(shardCount / 2);
+                if (shardCount % 2 == 0) {
+                    assertEquals(interval.getStart(), l.getStart());
+                } else {
+                    assertEquals(interval.getStop(), l.getStop());
+                }
+                shardCount++;
+            }
+            assertEquals(locations.size() * 2, shardCount);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail("testIntervalGenomeCycle: no exception expected");
         }
     }
 