diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/AdaptiveShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/AdaptiveShard.java new file mode 100644 index 000000000..90180f02a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/AdaptiveShard.java @@ -0,0 +1,58 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import net.sf.samtools.SAMSequenceDictionary; + +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 7:18:19 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Class AdaptiveShard + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +class AdaptiveShard extends Shard { + + // default the next size to 100,000 + private long nextShardSize = 100000; + + /** + * the constructor, taking a seq dictionary to parse out contigs + * + * @param dic the seq dictionary + */ + AdaptiveShard(SAMSequenceDictionary dic, long startSize) { + super(dic); + this.nextShardSize = startSize; + } + + public void setNextShardSize(long size) { + nextShardSize = size; + } + + /** + * This is how the various shards strategies implements their approach + * + * @return the next shard size + */ + protected long nextShardSize() { + return nextShardSize; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShard.java new file mode 100644 index 000000000..8287ea51f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearShard.java @@ -0,0 +1,54 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import net.sf.samtools.SAMSequenceDictionary; + +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 8:23:19 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Class LinearShard + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class LinearShard extends Shard { + + // fixed size + private long nextShardSize = 100000; + + /** + * the constructor, taking a seq dictionary to parse out contigs + * + * @param dic the seq dictionary + */ + LinearShard(SAMSequenceDictionary dic, long startSize) { + super(dic); + this.nextShardSize = startSize; + } + + /** + * This is how the various shards strategies implements their approach + * + * @return the next shard size + */ + protected long nextShardSize() { + return nextShardSize; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java new file mode 100644 index 000000000..9eeb3241c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/Shard.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Iterator; +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 11:23:17 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Abstract class Shard + *

+ * The shard interface, which controls how data is divided + */ +public abstract class Shard implements Iterator { + + // this stores the seq dictionary, which is a reference for the + // lengths and names of contigs, which you need to generate an iterative stratagy + protected static SAMSequenceDictionary dic = null; + + // the current genome location + protected GenomeLoc mLoc = null; + + // current seq location + protected int seqLoc = 0; + + // the actual last size; this can change based on contig endings + protected long lastGenomeLocSize = 0; + + // do we have another contig? + private boolean nextContig = false; + + + /** + * the constructor, taking a seq dictionary to parse out contigs + * + * @param dic the seq dictionary + */ + Shard(SAMSequenceDictionary dic) { + this.dic = dic; + mLoc = new GenomeLoc(dic.getSequence(0).getSequenceName(), 0, 0); + if (dic.getSequences().size() > 0) { + nextContig = true; + } + } + + + /** + * This is how the various shards strategies implements their approach + * + * @return the next shard size + */ + protected abstract long nextShardSize(); + + /** + * get the next shard, based on the return size of nextShardSize + * + * @return + */ + public GenomeLoc next() { + // lets get some background info on the problem + long length = dic.getSequence(seqLoc).getSequenceLength(); + long proposedSize = nextShardSize(); + long nextStart = mLoc.getStop() + 1; + // can we fit it into the current seq size? 
+ if (nextStart + proposedSize < length) { + lastGenomeLocSize = proposedSize; + mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize); + return new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize); + } + // else we can't make it in the current location, we have to stitch one together + else { + lastGenomeLocSize = nextStart + proposedSize - length; + + + // move to the next contig + jumpContig(); + return new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, lastGenomeLocSize); + } + + } + + /** jump to the next contig */ + private void jumpContig() { + ++seqLoc; + if (dic.getSequences().size() <= seqLoc) { + nextContig = false; + return; + } + + // the next sequence should start at the begining of the next contig + mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0); + + } + + /** + * is there another GenomeLoc to get? + * + * @return + */ + public boolean hasNext() { + return nextContig; + } + + /** we don't support remove */ + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a shard iterator!"); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardFactory.java new file mode 100644 index 000000000..04ff96b3a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardFactory.java @@ -0,0 +1,66 @@ +package org.broadinstitute.sting.gatk.dataSources.shards; + +import net.sf.samtools.SAMSequenceDictionary; + +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 7:09:22 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. 
Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Class ShardFactory + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class ShardFactory { + public enum SHATTER_STRATEGY { + ADAPTIVE, LINEAR + } + + /** + * get a new shatter strategy + * + * @param strat what's our strategy - SHATTER_STRATEGY type + * @param dic the seq dictionary + * @param startingSize the starting size + * @return + */ + static public Shard shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize) { + Shard d = null; + switch (strat) { + case ADAPTIVE: + d = new AdaptiveShard(dic, startingSize); + default: + d = new LinearShard(dic, startingSize); // default + } + return d; + } + + /** + * if you know what you want + * + * @param dic the seq dictionary + * @param startingSize the starting size + * @return + */ + static public AdaptiveShard getAdaptiveShard(SAMSequenceDictionary dic, long startingSize) { + return new AdaptiveShard(dic, startingSize); + } + +}