Add interval support to the
.__ __ __
_____| |__ _____ _/ |__/ |_ ___________
/ ___/ | \\__ \\ __\ __\/ __ \_ __ \
\___ \| Y \/ __ \| | | | \ ___/| | \/
/____ >___| (____ /__| |__| \___ >__|
\/ \/ \/ \/
classes!
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@352 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c5220c0822
commit
9afa101465
|
|
@ -1,6 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -45,6 +48,17 @@ public class ExpGrowthShardStrategy extends ShardStrategy {
|
|||
currentExp = 0;
|
||||
}
|
||||
|
||||
/**
 * the constructor, taking a seq dictionary to parse out contigs, plus the
 * list of intervals to traverse
 *
 * @param dic       the seq dictionary
 * @param startSize the starting shard size, stored as baseSize
 * @param lst       the list of genome locations, passed through to the ShardStrategy constructor
 */
ExpGrowthShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
    super(dic, lst);
    // begin with currentExp at zero
    this.currentExp = 0;
    this.baseSize = startSize;
}
|
||||
|
||||
/**
|
||||
* the constructor, taking a seq dictionary to parse out contigs
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -53,7 +56,16 @@ class LinearShardStrategy extends ShardStrategy {
|
|||
this.nextShardSize = strat.nextShardSize();
|
||||
}
|
||||
|
||||
|
||||
/**
 * the constructor, taking a seq dictionary to parse out contigs, plus the
 * list of intervals to traverse
 *
 * @param dic       the seq dictionary
 * @param startSize the size handed out for the next shard
 * @param lst       the list of genome locations to iterate over
 */
LinearShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
    super(dic, lst);
    nextShardSize = startSize;
}
|
||||
/**
|
||||
* set the next shards size
|
||||
*
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
|
|
@ -52,6 +53,10 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
|||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(ShardStrategy.class);
|
||||
|
||||
/** our interval list */
|
||||
private List<GenomeLoc> intervals = null;
|
||||
/** index into the interval list of the interval we're currently on */
|
||||
private int currentInterval = -1;
|
||||
|
||||
/**
|
||||
* the constructor, taking a seq dictionary to parse out contigs
|
||||
|
|
@ -79,6 +84,23 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
|||
this.nextContig = old.nextContig;
|
||||
}
|
||||
|
||||
/**
|
||||
* the constructor, taking a seq dictionary to parse out contigs
|
||||
*
|
||||
* @param dic the seq dictionary
|
||||
* @param intervals file
|
||||
*/
|
||||
ShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
|
||||
this.dic = dic;
|
||||
this.intervals = intervals;
|
||||
this.currentInterval = 0;
|
||||
|
||||
mLoc = new GenomeLoc(intervals.get(0).getContig(), intervals.get(0).getStart() - 1, intervals.get(0).getStart() - 1);
|
||||
if (dic.getSequences().size() > 0) {
|
||||
nextContig = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Abstract methods that each strategy has to implement
|
||||
|
|
@ -110,31 +132,88 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
|||
/**
|
||||
* get the next shard, based on the return size of nextShardSize
|
||||
*
|
||||
* @return
|
||||
* @return the next shard
|
||||
*/
|
||||
public Shard next() {
|
||||
|
||||
// lets get some background info on the problem
|
||||
long length = dic.getSequence(seqLoc).getSequenceLength();
|
||||
long proposedSize = nextShardSize();
|
||||
long nextStart = mLoc.getStop() + 1;
|
||||
|
||||
// if we don't have an interval file, use the non interval based approach. Simple, eh?
|
||||
if (this.intervals == null) {
|
||||
return nonIntervaledNext(length, proposedSize, nextStart);
|
||||
} else {
|
||||
return intervaledNext(length, proposedSize, nextStart);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Shard intervaledNext(long length, long proposedSize, long nextStart) {
|
||||
// get the current genome location
|
||||
GenomeLoc loc = intervals.get(currentInterval);
|
||||
if (nextStart + proposedSize > loc.getStop()) {
|
||||
// we need to move the next interval
|
||||
proposedSize = loc.getStop() - nextStart;
|
||||
lastGenomeLocSize = proposedSize;
|
||||
|
||||
// the next sequence should start at the begining of the next contig
|
||||
Shard ret = Shard.toShard(new GenomeLoc(intervals.get(currentInterval).getContig(), nextStart, nextStart + proposedSize - 1));
|
||||
|
||||
++currentInterval;
|
||||
if (intervals.size() > currentInterval) {
|
||||
mLoc = new GenomeLoc(intervals.get(currentInterval).getContig(), intervals.get(currentInterval).getStart() - 1, intervals.get(currentInterval).getStart() - 1);
|
||||
}
|
||||
return ret;// return
|
||||
|
||||
} else {
|
||||
// we need to move the next interval
|
||||
lastGenomeLocSize = proposedSize;
|
||||
|
||||
// the next sequence should start at the begining of the next contig
|
||||
Shard ret = Shard.toShard(new GenomeLoc(intervals.get(currentInterval).getContig(), nextStart, nextStart + proposedSize - 1));
|
||||
|
||||
mLoc = new GenomeLoc(intervals.get(currentInterval).getContig(), nextStart, nextStart + proposedSize - 1);
|
||||
|
||||
return ret;// return
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the next shard, if we don't have intervals to traverse over
|
||||
*
|
||||
* @param length the length of the contig
|
||||
* @param proposedSize the proposed size
|
||||
* @param nextStart the next start location
|
||||
* @return the shard to return to the user
|
||||
*/
|
||||
private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
|
||||
// can we fit it into the current seq size?
|
||||
if (nextStart + proposedSize - 1 < length) {
|
||||
lastGenomeLocSize = proposedSize;
|
||||
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1);
|
||||
return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize-1));
|
||||
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize - 1);
|
||||
return Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize - 1));
|
||||
}
|
||||
// else we can't make it in the current location, we have to stitch one together
|
||||
else {
|
||||
long overflow = nextStart + proposedSize -1 - length;
|
||||
// lets find out the remaining size of the current contig
|
||||
long overflow = nextStart + proposedSize - 1 - length;
|
||||
logger.debug("Overflow = " + overflow + " length: " + length);
|
||||
|
||||
// set our last size counter to the remaining size
|
||||
lastGenomeLocSize = proposedSize - overflow;
|
||||
|
||||
// move to the next contig
|
||||
// the next sequence should start at the beginning of the next contig
|
||||
Shard ret = Shard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + lastGenomeLocSize));
|
||||
|
||||
// now jump ahead to the next contig
|
||||
jumpContig();
|
||||
|
||||
// return the shard
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** jump to the next contig */
|
||||
|
|
@ -149,7 +228,6 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
|||
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -158,7 +236,12 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
|||
* @return
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return nextContig;
|
||||
// if we don't have an interval file, use the non interval based approach. Simple, eh?
|
||||
if (this.intervals == null) {
|
||||
return nextContig;
|
||||
} else {
|
||||
return (this.currentInterval < this.intervals.size());
|
||||
}
|
||||
}
|
||||
|
||||
/** we don't support remove */
|
||||
|
|
@ -177,4 +260,5 @@ public abstract class ShardStrategy implements Iterator<Shard>, Iterable<Shard>
|
|||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
|
|||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -59,6 +62,26 @@ public class ShardStrategyFactory {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return
|
||||
*/
|
||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List<GenomeLoc> lst) {
|
||||
switch (strat) {
|
||||
case LINEAR:
|
||||
return new LinearShardStrategy(dic, startingSize, lst);
|
||||
case EXPONENTIAL:
|
||||
return new ExpGrowthShardStrategy(dic, startingSize, lst);
|
||||
default:
|
||||
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* convert between types
|
||||
*
|
||||
|
|
|
|||
|
|
@ -31,9 +31,6 @@ public class SAMBAMDataSource implements SimpleDataSource {
|
|||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);
|
||||
|
||||
// our sam file readers
|
||||
private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
||||
|
||||
// are we set to locus mode or read mode for dividing
|
||||
private boolean locusMode = true;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,16 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static junit.framework.Assert.fail;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.junit.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -94,7 +99,62 @@ public class ShardStrategyFactoryTest {
|
|||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
fail("We Shouldn't of seen an exception! : " + e.getMessage() + "; shard count " + shardCount);
|
||||
fail("We Shouldn't of seen an exception! : " + e.getMessage() + "; shard count " + shardCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Tests that we got a string parameter in correctly */
|
||||
@Test
|
||||
public void testIntervalGenomeCycle() {
|
||||
SAMSequenceDictionary dic = seq.getSequenceDictionary();
|
||||
SAMSequenceRecord s = dic.getSequence(1);
|
||||
// Character stream writing
|
||||
|
||||
System.err.println("Trying to sleep");
|
||||
try {
|
||||
Thread.sleep(5000);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
int stop = s.getSequenceLength();
|
||||
int size = 10000;
|
||||
int location = 1;
|
||||
System.err.println("done to sleep");
|
||||
// keep track of the number of genome locs we build
|
||||
int genomeLocs = 0;
|
||||
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
while (location + size < stop) {
|
||||
// lets make up some fake locations
|
||||
GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
|
||||
|
||||
// let's move the location up, with a size space
|
||||
location += (size * 2);
|
||||
|
||||
// add our current location to the list
|
||||
locations.add(gl);
|
||||
|
||||
// add another genome location
|
||||
++genomeLocs;
|
||||
}
|
||||
|
||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations);
|
||||
int shardCount = 0;
|
||||
try {
|
||||
FileWriter writer = new FileWriter("myfile.txt");
|
||||
for (Shard sh : strategy) {
|
||||
GenomeLoc l = sh.getGenomeLoc();
|
||||
|
||||
writer.write("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
||||
//logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
||||
shardCount++;
|
||||
}
|
||||
writer.close();
|
||||
assertEquals(shardCount, genomeLocs * 2);
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
fail("testIntervalGenomeCycle: ne exception expected");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue