Added a shard strategy for the reduce-by-interval traversals. Also fixed bugs that I found along the way.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@718 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-05-14 21:20:18 +00:00
parent 0f8e6061b6
commit 50f32b7f61
5 changed files with 233 additions and 13 deletions

View File

@ -0,0 +1,65 @@
package org.broadinstitute.sting.gatk.dataSources.shards;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
/**
*
* User: aaron
* Date: May 14, 2009
* Time: 3:28:50 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
* @date May 14, 2009
* <p/>
* Class IntervalShardStrategy
* <p/>
* This class knows how to shard on a genome loc boundary. It guarantees
* a one-to-one mapping between a GenomeLoc and the shard produced for it.
*/
public class IntervalShardStrategy extends LocusShardStrategy {

    /**
     * the constructor, taking a seq dictionary to parse out contigs and the
     * intervals to shard over; both are handed straight to the parent strategy.
     *
     * @param dic       the seq dictionary
     * @param intervals the genome locations to shard over
     */
    IntervalShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
        super(dic, intervals);
    }

    /**
     * Sizes the next shard from the interval the parent strategy is currently on,
     * so that the shard size tracks the interval instead of a fixed or growing value.
     *
     * @return the stop of the current interval minus its start
     * @throws IllegalStateException if the parent strategy has no current interval
     */
    protected long nextShardSize() {
        // fetch once: getCurrentInterval() can return null, and dereferencing it
        // blindly would surface as an unexplained NullPointerException
        final GenomeLoc current = this.getCurrentInterval();
        if (current == null) {
            throw new IllegalStateException("IntervalShardStrategy: no current interval is available to size the next shard");
        }
        return current.getStop() - current.getStart();
    }

    /**
     * set the next shards size. This strategy ignores the request: nextShardSize()
     * always derives the size from the current interval.
     *
     * @param size the requested size (unused)
     */
    public void adjustNextShardSize(long size) {
        // intentionally empty, see the method comment
    }
}

View File

@ -143,7 +143,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
if (this.intervals == null) {
return nonIntervaledNext(length, proposedSize, nextStart);
} else {
return intervaledNext(length, proposedSize, nextStart);
return intervaledNext(proposedSize, nextStart);
}
}
@ -151,16 +151,15 @@ public abstract class LocusShardStrategy implements ShardStrategy {
/**
* Interval based next processing
*
* @param length the length of the sequence
* @param proposedSize the proposed size
* @param nextStart where we start from
* @return the shard that represents this data
*/
private Shard intervaledNext(long length, long proposedSize, long nextStart) {
private Shard intervaledNext(long proposedSize, long nextStart) {
// get the current genome location
GenomeLoc loc = intervals.get(currentInterval);
if (nextStart + proposedSize > loc.getStop()) {
// we need to move the next interval
if (nextStart + proposedSize >= loc.getStop()) {
// we need to get the rest of the current loc in a shard (return it), and move to the next location
proposedSize = loc.getStop() - nextStart;
lastGenomeLocSize = proposedSize;
@ -265,4 +264,16 @@ public abstract class LocusShardStrategy implements ShardStrategy {
return this;
}
/**
* This allows a shard strategy to get the current interval. It's kind of a hack, but for the
* IntervalShardStrategy it was the best approach.
* @return the current interval, or null if there are no intervals or the current interval index is negative
*/
protected GenomeLoc getCurrentInterval() {
    // an interval can only be reported when a list exists and the index is non-negative
    final boolean hasInterval = this.intervals != null && currentInterval >= 0;
    return hasInterval ? intervals.get(currentInterval) : null;
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.List;
@ -35,7 +36,7 @@ import java.util.List;
*/
public class ShardStrategyFactory {
public enum SHATTER_STRATEGY {
LINEAR, EXPONENTIAL, READS
LINEAR, EXPONENTIAL, READS, INTERVAL
}
/** our log, which we want to capture anything from this class */
@ -59,7 +60,7 @@ public class ShardStrategyFactory {
case READS:
return new ReadShardStrategy(dic, startingSize);
default:
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
throw new StingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
}
}
@ -78,7 +79,7 @@ public class ShardStrategyFactory {
case EXPONENTIAL:
return new ExpGrowthLocusShardStrategy(convertFrom);
default:
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
throw new StingException("Strategy: " + strat + " isn't implemented");
}
}
@ -100,15 +101,17 @@ public class ShardStrategyFactory {
return new ExpGrowthLocusShardStrategy(dic, startingSize, lst);
case READS:
// return new ReadShardStrategy(dic, startingSize);
throw new RuntimeException("Strategy: " + strat + " isn't implemented for intervals");
throw new StingException("Strategy: " + strat + " isn't implemented for intervals");
case INTERVAL:
return new IntervalShardStrategy(dic, lst);
default:
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
throw new StingException("Strategy: " + strat + " isn't implemented");
}
}
/**
* convert between types
* setup a reads shattering strategy
*
* @param readCount the number of reads to include in each shard
* @return

View File

@ -0,0 +1,142 @@
package org.broadinstitute.sting.gatk.dataSources.shards;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.junit.*;
import java.io.File;
import java.util.ArrayList;
/**
*
* User: aaron
* Date: May 14, 2009
* Time: 3:52:57 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
* @date May 14, 2009
* <p/>
* Class IntervalShardStrategyTest
* <p/>
* IntervalShardStrategy tests
*/
public class IntervalShardStrategyTest extends BaseTest {

    /** the reference sequence file, opened once in doBeforeAnyTests() */
    private static FastaSequenceFile2 seq;

    /**
     * This function (because of the @BeforeClass tag) gets called only once ever,
     * before any tests are run. It opens the reference sequence file.
     */
    @BeforeClass
    public static void doBeforeAnyTests() {
        seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
    }

    /**
     * Called once (because of the @AfterClass tag), after all of the tests have run.
     * There is currently nothing to tear down.
     */
    @AfterClass
    public static void doAfterAllTests() {
    }

    /**
     * Called before every test case method. There is currently no per-test setup.
     */
    @Before
    public void doForEachTest() {
    }

    /**
     * Called after every test case method. There is currently no per-test teardown.
     */
    @After
    public void undoForEachTest() {
    }

    /**
     * Builds a list of evenly spaced intervals along one contig, shards them with the
     * INTERVAL strategy, and checks that each shard matches the interval at the same
     * position in the list and that the shard count equals the interval count.
     */
    @Test
    public void testIntervalGenomeCycle() throws InterruptedException {
        logger.warn("Executing testIntervalGenomeCycle");

        SAMSequenceDictionary dic = seq.getSequenceDictionary();

        // the contig we lay our fake intervals along
        SAMSequenceRecord s = dic.getSequence(1);
        int stop = s.getSequenceLength();
        int size = 10000;
        int location = 1;

        GenomeLoc.setupRefContigOrdering(dic);

        // keep track of the number of genome locs we build
        int genomeLocs = 0;
        ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
        try {
            while (location + size < stop) {
                // lets make up some fake locations
                GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
                logger.debug("loc = " + location);

                // let's move the location up, leaving a gap of one interval's size
                location += (size * 2);

                // add our current location to the list
                locations.add(gl);
                ++genomeLocs;
            }
        } catch (Exception e) {
            // a broken fixture must fail the test rather than let it run on partial data
            e.printStackTrace();
            fail("testIntervalGenomeCycle: unable to build the interval list: " + e);
        }

        logger.debug("Location count = " + genomeLocs);
        ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, seq.getSequenceDictionary(), 0, locations);

        int shardCount = 0;
        try {
            for (Shard sh : strategy) {
                // report surplus shards explicitly instead of through an index error
                if (shardCount >= locations.size()) {
                    fail("The strategy produced more shards than the " + locations.size() + " intervals supplied");
                }
                GenomeLoc l = sh.getGenomeLoc();
                GenomeLoc truth = locations.get(shardCount);
                if (l.compareTo(truth) != 0) {
                    String truthStr = truth.getContig() + ":" + truth.getStart() + ":" + truth.getStop();
                    String lStr = l.getContig() + ":" + l.getStart() + ":" + l.getStop();
                    fail("Genome loc " + truthStr + " doesn't equal " + lStr);
                }
                shardCount++;
            }
            // expected value first, so a failure message reads the right way around
            assertEquals(genomeLocs, shardCount);
        } catch (Exception e) {
            e.printStackTrace();
            fail("testIntervalGenomeCycle: no exception expected, but got: " + e);
        }
    }
}

View File

@ -124,11 +124,10 @@ public class ShardStrategyFactoryTest extends BaseTest {
int size = 10000;
int location = 1;
GenomeLoc.setupRefContigOrdering(dic);
logger.debug("done to sleep");
// keep track of the number of genome locs we build
int genomeLocs = 0;
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
logger.debug("done to sleep2");
try {
while (location + size < stop) {
logger.debug("s = " + s.getSequenceName() + " " + location + " " + size);