The new shatter method, independent of the underlying data. The only thing needed to create a Shard is the reference seq dictionary, which may be a problem in reference-less traversals, so the builder class is there so that we can support different construction schemes.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@308 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0baa8c0f76
commit
b42d8df646
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* User: aaron
|
||||||
|
* Date: Apr 6, 2009
|
||||||
|
* Time: 7:18:19 PM
|
||||||
|
*
|
||||||
|
* The Broad Institute
|
||||||
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||||
|
* This software and its documentation are copyright 2009 by the
|
||||||
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||||
|
*
|
||||||
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||||
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* @version 1.0
|
||||||
|
* @date Apr 6, 2009
|
||||||
|
* <p/>
|
||||||
|
* Class AdaptiveShard
|
||||||
|
* <p/>
|
||||||
|
* A shard strategy whose next shard size can be changed between calls through setNextShardSize.
|
||||||
|
*/
|
||||||
|
class AdaptiveShard extends Shard {
|
||||||
|
|
||||||
|
// default the next size to 100,000
|
||||||
|
private long nextShardSize = 100000;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the constructor, taking a seq dictionary to parse out contigs
|
||||||
|
*
|
||||||
|
* @param dic the seq dictionary
|
||||||
|
*/
|
||||||
|
AdaptiveShard(SAMSequenceDictionary dic, long startSize) {
|
||||||
|
super(dic);
|
||||||
|
this.nextShardSize = startSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNextShardSize(long size) {
|
||||||
|
nextShardSize = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is how the various shards strategies implements their approach
|
||||||
|
*
|
||||||
|
* @return the next shard size
|
||||||
|
*/
|
||||||
|
protected long nextShardSize() {
|
||||||
|
return nextShardSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* User: aaron
|
||||||
|
* Date: Apr 6, 2009
|
||||||
|
* Time: 8:23:19 PM
|
||||||
|
*
|
||||||
|
* The Broad Institute
|
||||||
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||||
|
* This software and its documentation are copyright 2009 by the
|
||||||
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||||
|
*
|
||||||
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||||
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* @version 1.0
|
||||||
|
* @date Apr 6, 2009
|
||||||
|
* <p/>
|
||||||
|
* Class LinearShard
|
||||||
|
* <p/>
|
||||||
|
* A shard strategy that reports the same shard size, given at construction, for every shard.
|
||||||
|
*/
|
||||||
|
public class LinearShard extends Shard {
|
||||||
|
|
||||||
|
// fixed size
|
||||||
|
private long nextShardSize = 100000;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the constructor, taking a seq dictionary to parse out contigs
|
||||||
|
*
|
||||||
|
* @param dic the seq dictionary
|
||||||
|
*/
|
||||||
|
LinearShard(SAMSequenceDictionary dic, long startSize) {
|
||||||
|
super(dic);
|
||||||
|
this.nextShardSize = startSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is how the various shards strategies implements their approach
|
||||||
|
*
|
||||||
|
* @return the next shard size
|
||||||
|
*/
|
||||||
|
protected long nextShardSize() {
|
||||||
|
return nextShardSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,126 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* User: aaron
|
||||||
|
* Date: Apr 6, 2009
|
||||||
|
* Time: 11:23:17 AM
|
||||||
|
*
|
||||||
|
* The Broad Institute
|
||||||
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||||
|
* This software and its documentation are copyright 2009 by the
|
||||||
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||||
|
*
|
||||||
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||||
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* @version 1.0
|
||||||
|
* @date Apr 6, 2009
|
||||||
|
* <p/>
|
||||||
|
* Class Shard
|
||||||
|
* <p/>
|
||||||
|
* The abstract shard base class: an iterator of GenomeLoc intervals that controls how data is divided
|
||||||
|
*/
|
||||||
|
public abstract class Shard implements Iterator<GenomeLoc> {
|
||||||
|
|
||||||
|
// this stores the seq dictionary, which is a reference for the
|
||||||
|
// lengths and names of contigs, which you need to generate an iterative stratagy
|
||||||
|
protected static SAMSequenceDictionary dic = null;
|
||||||
|
|
||||||
|
// the current genome location
|
||||||
|
protected GenomeLoc mLoc = null;
|
||||||
|
|
||||||
|
// current seq location
|
||||||
|
protected int seqLoc = 0;
|
||||||
|
|
||||||
|
// the actual last size; this can change based on contig endings
|
||||||
|
protected long lastGenomeLocSize = 0;
|
||||||
|
|
||||||
|
// do we have another contig?
|
||||||
|
private boolean nextContig = false;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the constructor, taking a seq dictionary to parse out contigs
|
||||||
|
*
|
||||||
|
* @param dic the seq dictionary
|
||||||
|
*/
|
||||||
|
Shard(SAMSequenceDictionary dic) {
|
||||||
|
this.dic = dic;
|
||||||
|
mLoc = new GenomeLoc(dic.getSequence(0).getSequenceName(), 0, 0);
|
||||||
|
if (dic.getSequences().size() > 0) {
|
||||||
|
nextContig = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is how the various shards strategies implements their approach
|
||||||
|
*
|
||||||
|
* @return the next shard size
|
||||||
|
*/
|
||||||
|
protected abstract long nextShardSize();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the next shard, based on the return size of nextShardSize
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public GenomeLoc next() {
|
||||||
|
// lets get some background info on the problem
|
||||||
|
long length = dic.getSequence(seqLoc).getSequenceLength();
|
||||||
|
long proposedSize = nextShardSize();
|
||||||
|
long nextStart = mLoc.getStop() + 1;
|
||||||
|
// can we fit it into the current seq size?
|
||||||
|
if (nextStart + proposedSize < length) {
|
||||||
|
lastGenomeLocSize = proposedSize;
|
||||||
|
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize);
|
||||||
|
return new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize);
|
||||||
|
}
|
||||||
|
// else we can't make it in the current location, we have to stitch one together
|
||||||
|
else {
|
||||||
|
lastGenomeLocSize = nextStart + proposedSize - length;
|
||||||
|
|
||||||
|
|
||||||
|
// move to the next contig
|
||||||
|
jumpContig();
|
||||||
|
return new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, lastGenomeLocSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** jump to the next contig */
|
||||||
|
private void jumpContig() {
|
||||||
|
++seqLoc;
|
||||||
|
if (dic.getSequences().size() <= seqLoc) {
|
||||||
|
nextContig = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the next sequence should start at the begining of the next contig
|
||||||
|
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* is there another GenomeLoc to get?
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return nextContig;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** we don't support remove */
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("Can not remove records from a shard iterator!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,66 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* User: aaron
|
||||||
|
* Date: Apr 6, 2009
|
||||||
|
* Time: 7:09:22 PM
|
||||||
|
*
|
||||||
|
* The Broad Institute
|
||||||
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||||
|
* This software and its documentation are copyright 2009 by the
|
||||||
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||||
|
*
|
||||||
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||||
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* @version 1.0
|
||||||
|
* @date Apr 6, 2009
|
||||||
|
* <p/>
|
||||||
|
* Class ShardFactory
|
||||||
|
* <p/>
|
||||||
|
* Creates Shard iterators over a sequence dictionary for a requested shatter strategy.
|
||||||
|
*/
|
||||||
|
public class ShardFactory {
|
||||||
|
public enum SHATTER_STRATEGY {
|
||||||
|
ADAPTIVE, LINEAR
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get a new shatter strategy
|
||||||
|
*
|
||||||
|
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||||
|
* @param dic the seq dictionary
|
||||||
|
* @param startingSize the starting size
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
static public Shard shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize) {
|
||||||
|
Shard d = null;
|
||||||
|
switch (strat) {
|
||||||
|
case ADAPTIVE:
|
||||||
|
d = new AdaptiveShard(dic, startingSize);
|
||||||
|
default:
|
||||||
|
d = new LinearShard(dic, startingSize); // default
|
||||||
|
}
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* if you know what you want
|
||||||
|
*
|
||||||
|
* @param dic the seq dictionary
|
||||||
|
* @param startingSize the starting size
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
static public AdaptiveShard getAdaptiveShard(SAMSequenceDictionary dic, long startingSize) {
|
||||||
|
return new AdaptiveShard(dic, startingSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue