Added a shard strategy for the reduce-by-interval traversals. Also fixed bugs that I found along the way.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@718 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0f8e6061b6
commit
50f32b7f61
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 14, 2009
|
||||
* Time: 3:28:50 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date May 14, 2009
|
||||
* <p/>
|
||||
* Class IntervalShardStrategy
|
||||
* <p/>
|
||||
* This class knows how to shard on a genome loc boundary. It guarantees
|
||||
* a one-to-one mapping between a GenomeLoc and the shard built from it.
|
||||
*/
|
||||
public class IntervalShardStrategy extends LocusShardStrategy {
|
||||
/**
|
||||
* the constructor, taking a seq dictionary to parse out contigs
|
||||
*
|
||||
* @param dic the seq dictionary
|
||||
* @param intervals the list of genome locations (intervals) to shard over
|
||||
*/
|
||||
IntervalShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
|
||||
super(dic, intervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* This is how the various shard strategies implement their approach: by adjusting this value.
|
||||
*
|
||||
* @return the next shard size
|
||||
*/
|
||||
protected long nextShardSize() {
|
||||
long nextSize = this.getCurrentInterval().getStop() - this.getCurrentInterval().getStart();
|
||||
return nextSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the next shard's size
|
||||
*
|
||||
* @param size adjust the next size to this
|
||||
*/
|
||||
public void adjustNextShardSize(long size) {
|
||||
// No-op: nextShardSize() derives the size from the current interval, so the requested size is ignored.
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -143,7 +143,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
if (this.intervals == null) {
|
||||
return nonIntervaledNext(length, proposedSize, nextStart);
|
||||
} else {
|
||||
return intervaledNext(length, proposedSize, nextStart);
|
||||
return intervaledNext(proposedSize, nextStart);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -151,16 +151,15 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
/**
|
||||
* Interval based next processing
|
||||
*
|
||||
* @param length the length of the sequence
|
||||
* @param proposedSize the proposed size
|
||||
* @param nextStart where we start from
|
||||
* @return the shard that represents this data
|
||||
*/
|
||||
private Shard intervaledNext(long length, long proposedSize, long nextStart) {
|
||||
private Shard intervaledNext(long proposedSize, long nextStart) {
|
||||
// get the current genome location
|
||||
GenomeLoc loc = intervals.get(currentInterval);
|
||||
if (nextStart + proposedSize > loc.getStop()) {
|
||||
// we need to move the next interval
|
||||
if (nextStart + proposedSize >= loc.getStop()) {
|
||||
// we need to get the rest of the current loc in a shard (return it), and move to the next location
|
||||
proposedSize = loc.getStop() - nextStart;
|
||||
lastGenomeLocSize = proposedSize;
|
||||
|
||||
|
|
@ -265,4 +264,16 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* this allows a shard strategy to get the current interval. It's kind of a hack, but for the
|
||||
* IntervalShardStrategy it was the best approach.
|
||||
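* @return the interval at the current interval index, or null if no intervals were supplied or the index is negative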
|
||||
*/
|
||||
protected GenomeLoc getCurrentInterval() {
|
||||
if (this.intervals == null || currentInterval < 0) {
|
||||
return null;
|
||||
}
|
||||
return intervals.get(currentInterval);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
|
|||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -35,7 +36,7 @@ import java.util.List;
|
|||
*/
|
||||
public class ShardStrategyFactory {
|
||||
public enum SHATTER_STRATEGY {
|
||||
LINEAR, EXPONENTIAL, READS
|
||||
LINEAR, EXPONENTIAL, READS, INTERVAL
|
||||
}
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
|
|
@ -59,7 +60,7 @@ public class ShardStrategyFactory {
|
|||
case READS:
|
||||
return new ReadShardStrategy(dic, startingSize);
|
||||
default:
|
||||
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
|
||||
throw new StingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -78,7 +79,7 @@ public class ShardStrategyFactory {
|
|||
case EXPONENTIAL:
|
||||
return new ExpGrowthLocusShardStrategy(convertFrom);
|
||||
default:
|
||||
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
|
||||
throw new StingException("Strategy: " + strat + " isn't implemented");
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -100,15 +101,17 @@ public class ShardStrategyFactory {
|
|||
return new ExpGrowthLocusShardStrategy(dic, startingSize, lst);
|
||||
case READS:
|
||||
// return new ReadShardStrategy(dic, startingSize);
|
||||
throw new RuntimeException("Strategy: " + strat + " isn't implemented for intervals");
|
||||
throw new StingException("Strategy: " + strat + " isn't implemented for intervals");
|
||||
case INTERVAL:
|
||||
return new IntervalShardStrategy(dic, lst);
|
||||
default:
|
||||
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
|
||||
throw new StingException("Strategy: " + strat + " isn't implemented");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* convert between types
|
||||
* setup a reads shattering strategy
|
||||
*
|
||||
* @param readCount the number of reads to include in each shard
|
||||
* @return
|
||||
|
|
|
|||
|
|
@ -0,0 +1,142 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static junit.framework.Assert.fail;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.junit.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 14, 2009
|
||||
* Time: 3:52:57 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date May 14, 2009
|
||||
* <p/>
|
||||
* Class IntervalShardStrategyTest
|
||||
* <p/>
|
||||
* IntervalShardStrategy tests
|
||||
*/
|
||||
public class IntervalShardStrategyTest extends BaseTest {
|
||||
|
||||
private static FastaSequenceFile2 seq;
|
||||
|
||||
/**
|
||||
* This function (because of the @BeforeClass tag) gets called only once ever,
|
||||
* before any tests are run
|
||||
*/
|
||||
@BeforeClass
|
||||
public static void doBeforeAnyTests() {
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tears down the shared test fixture once all tests have run.
|
||||
* <p/>
|
||||
* Called only once (because of the @AfterClass tag), after the last test case method.
|
||||
*/
|
||||
@AfterClass
|
||||
public static void doAfterAllTests() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* This function does the setup of our parser, before each method call.
|
||||
* <p/>
|
||||
* Called before every test case method.
|
||||
*/
|
||||
@Before
|
||||
public void doForEachTest() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Tears down the test fixture after each call.
|
||||
* <p/>
|
||||
* Called after every test case method.
|
||||
*/
|
||||
@After
|
||||
public void undoForEachTest() {
|
||||
|
||||
}
|
||||
|
||||
/** Tests that the INTERVAL strategy yields exactly one shard per genome loc, in the same order as the input list */
|
||||
@Test
|
||||
public void testIntervalGenomeCycle() throws InterruptedException {
|
||||
logger.warn("Executing testIntervalGenomeCycle");
|
||||
|
||||
SAMSequenceDictionary dic = seq.getSequenceDictionary();
|
||||
|
||||
|
||||
// setup a list of genome locs that represent the whole file
|
||||
SAMSequenceRecord s = dic.getSequence(1);
|
||||
int stop = s.getSequenceLength();
|
||||
int size = 10000;
|
||||
int location = 1;
|
||||
|
||||
GenomeLoc.setupRefContigOrdering(dic);
|
||||
// keep track of the number of genome locs we build
|
||||
int genomeLocs = 0;
|
||||
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
try {
|
||||
while (location + size < stop) {
|
||||
// lets make up some fake locations
|
||||
GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
|
||||
logger.debug("loc = " + location);
|
||||
|
||||
// let's move the location up, with a size space
|
||||
location += (size * 2);
|
||||
|
||||
// add our current location to the list
|
||||
locations.add(gl);
|
||||
|
||||
// add another genome location
|
||||
++genomeLocs;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
logger.debug("Location count = " + genomeLocs);
|
||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, seq.getSequenceDictionary(), 0, locations);
|
||||
int shardCount = 0;
|
||||
try {
|
||||
for (Shard sh : strategy) {
|
||||
GenomeLoc l = sh.getGenomeLoc();
|
||||
GenomeLoc truth = locations.get(shardCount);
|
||||
if (l.compareTo(truth) != 0) {
|
||||
String truthStr = truth.getContig() + ":" + truth.getStart() + ":" + truth.getStop();
|
||||
String lStr = l.getContig() + ":" + l.getStart() + ":" + l.getStop();
|
||||
fail("Genome loc " + truthStr + " doesn't equal " + lStr);
|
||||
}
|
||||
shardCount++;
|
||||
}
|
||||
assertEquals(shardCount, genomeLocs);
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
fail("testIntervalGenomeCycle: ne exception expected");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -124,11 +124,10 @@ public class ShardStrategyFactoryTest extends BaseTest {
|
|||
int size = 10000;
|
||||
int location = 1;
|
||||
GenomeLoc.setupRefContigOrdering(dic);
|
||||
logger.debug("done to sleep");
|
||||
// keep track of the number of genome locs we build
|
||||
int genomeLocs = 0;
|
||||
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
logger.debug("done to sleep2");
|
||||
|
||||
try {
|
||||
while (location + size < stop) {
|
||||
logger.debug("s = " + s.getSequenceName() + " " + location + " " + size);
|
||||
|
|
|
|||
Loading…
Reference in New Issue