A bunch of updates to the SAM/BAM data source, along with test cases for the merging of multiple files (it works!).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@399 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
30121534ed
commit
eb4b4a053b
|
|
@ -1,6 +1,9 @@
|
||||||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
|
@ -56,6 +59,20 @@ public class ExpGrowthLocusShardStrategy extends LocusShardStrategy {
|
||||||
currentExp = 0;
|
currentExp = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The constructor, for a genomic list, start size, and a reference dictionary
|
||||||
|
*
|
||||||
|
* @param dic the reference dictionary
|
||||||
|
* @param startSize the starting size of the shard
|
||||||
|
* @param lst locations to iterate from
|
||||||
|
*/
|
||||||
|
ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
|
||||||
|
super(dic, lst);
|
||||||
|
this.baseSize = startSize;
|
||||||
|
this.currentExp = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the next shards size
|
* set the next shards size
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
|
@ -44,7 +47,7 @@ class LinearLocusShardStrategy extends LocusShardStrategy {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the constructor, taking a seq dictionary to parse out contigs
|
* the constructor, constructing from another Locus strategy
|
||||||
*
|
*
|
||||||
* @param strat the shatter to convert from
|
* @param strat the shatter to convert from
|
||||||
*/
|
*/
|
||||||
|
|
@ -54,6 +57,18 @@ class LinearLocusShardStrategy extends LocusShardStrategy {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The constructor, for a genomic list, start size, and a reference dictionary
|
||||||
|
* @param dic the reference dictionary
|
||||||
|
* @param startSize the starting size of the shard
|
||||||
|
* @param lst locations to iterate from
|
||||||
|
*/
|
||||||
|
LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
|
||||||
|
super(dic, lst);
|
||||||
|
this.nextShardSize = startSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the next shards size
|
* set the next shards size
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* User: aaron
|
* User: aaron
|
||||||
|
|
@ -29,7 +31,7 @@ import java.util.Iterator;
|
||||||
* <p/>
|
* <p/>
|
||||||
* The shard interface, which controls how data is divided
|
* The shard interface, which controls how data is divided
|
||||||
*/
|
*/
|
||||||
public abstract class LocusShardStrategy implements ShardStrategy {
|
public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
// this stores the seq dictionary, which is a reference for the
|
// this stores the seq dictionary, which is a reference for the
|
||||||
// lengths and names of contigs, which you need to generate an iterative stratagy
|
// lengths and names of contigs, which you need to generate an iterative stratagy
|
||||||
|
|
@ -47,6 +49,15 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
// do we have another contig?
|
// do we have another contig?
|
||||||
private boolean nextContig = false;
|
private boolean nextContig = false;
|
||||||
|
|
||||||
|
/** our interal list * */
|
||||||
|
private List<GenomeLoc> intervals = null;
|
||||||
|
|
||||||
|
/** our interal list * */
|
||||||
|
private int currentInterval = -1;
|
||||||
|
|
||||||
|
/** our log, which we want to capture anything from this class */
|
||||||
|
private static Logger logger = Logger.getLogger(LocusShardStrategy.class);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the constructor, taking a seq dictionary to parse out contigs
|
* the constructor, taking a seq dictionary to parse out contigs
|
||||||
|
|
@ -55,7 +66,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
*/
|
*/
|
||||||
LocusShardStrategy(SAMSequenceDictionary dic) {
|
LocusShardStrategy(SAMSequenceDictionary dic) {
|
||||||
this.dic = dic;
|
this.dic = dic;
|
||||||
mLoc = new GenomeLoc(dic.getSequence(0).getSequenceName(), 0, 0);
|
mLoc = new GenomeLoc(0,0,0);
|
||||||
if (dic.getSequences().size() > 0) {
|
if (dic.getSequences().size() > 0) {
|
||||||
nextContig = true;
|
nextContig = true;
|
||||||
}
|
}
|
||||||
|
|
@ -74,6 +85,24 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
this.nextContig = old.nextContig;
|
this.nextContig = old.nextContig;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the constructor, taking a seq dictionary to parse out contigs
|
||||||
|
*
|
||||||
|
* @param dic the seq dictionary
|
||||||
|
* @param intervals file
|
||||||
|
*/
|
||||||
|
LocusShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
|
||||||
|
this.dic = dic;
|
||||||
|
this.intervals = intervals;
|
||||||
|
this.currentInterval = 0;
|
||||||
|
|
||||||
|
mLoc = new GenomeLoc(0, 0, 0);
|
||||||
|
if (dic.getSequences().size() > 0) {
|
||||||
|
nextContig = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* Abstract methods that each strategy has to implement
|
* Abstract methods that each strategy has to implement
|
||||||
|
|
@ -102,44 +131,115 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the next shard, based on the return size of nextShardSize
|
* get the next shard, based on the return size of nextShardSize
|
||||||
*
|
*
|
||||||
* @return
|
* @return the next shard
|
||||||
*/
|
*/
|
||||||
public LocusShard next() {
|
public Shard next() {
|
||||||
|
|
||||||
// lets get some background info on the problem
|
// lets get some background info on the problem
|
||||||
long length = dic.getSequence(seqLoc).getSequenceLength();
|
long length = dic.getSequence(seqLoc).getSequenceLength();
|
||||||
long proposedSize = nextShardSize();
|
long proposedSize = nextShardSize();
|
||||||
long nextStart = mLoc.getStop() + 1;
|
long nextStart = mLoc.getStop() + 1;
|
||||||
// can we fit it into the current seq size?
|
|
||||||
if (nextStart + proposedSize < length) {
|
// if we don't have an interval file, use the non interval based approach. Simple, eh?
|
||||||
|
if (this.intervals == null) {
|
||||||
|
return nonIntervaledNext(length, proposedSize, nextStart);
|
||||||
|
} else {
|
||||||
|
return intervaledNext(length, proposedSize, nextStart);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interval based next processing
|
||||||
|
*
|
||||||
|
* @param length the length of the sequence
|
||||||
|
* @param proposedSize the proposed size
|
||||||
|
* @param nextStart where we start from
|
||||||
|
* @return the shard that represents this data
|
||||||
|
*/
|
||||||
|
private Shard intervaledNext(long length, long proposedSize, long nextStart) {
|
||||||
|
// get the current genome location
|
||||||
|
GenomeLoc loc = intervals.get(currentInterval);
|
||||||
|
if (nextStart + proposedSize > loc.getStop()) {
|
||||||
|
// we need to move the next interval
|
||||||
|
proposedSize = loc.getStop() - nextStart;
|
||||||
lastGenomeLocSize = proposedSize;
|
lastGenomeLocSize = proposedSize;
|
||||||
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize);
|
|
||||||
return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize));
|
// the next sequence should start at the begining of the next contig
|
||||||
|
Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1));
|
||||||
|
|
||||||
|
++currentInterval;
|
||||||
|
if (intervals.size() > currentInterval) {
|
||||||
|
mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), intervals.get(currentInterval).getStart() - 1, intervals.get(currentInterval).getStart() - 1);
|
||||||
|
}
|
||||||
|
return ret;// return
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// we need to move the next interval
|
||||||
|
lastGenomeLocSize = proposedSize;
|
||||||
|
|
||||||
|
// the next sequence should start at the begining of the next contig
|
||||||
|
Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1));
|
||||||
|
|
||||||
|
mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1);
|
||||||
|
|
||||||
|
return ret;// return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the next shard, if we don't have intervals to traverse over
|
||||||
|
*
|
||||||
|
* @param length the length of the contig
|
||||||
|
* @param proposedSize the proposed size
|
||||||
|
* @param nextStart the next start location
|
||||||
|
* @return the shard to return to the user
|
||||||
|
*/
|
||||||
|
private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
|
||||||
|
// can we fit it into the current seq size?
|
||||||
|
if (nextStart + proposedSize - 1 < length) {
|
||||||
|
lastGenomeLocSize = proposedSize;
|
||||||
|
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1);
|
||||||
|
return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1));
|
||||||
}
|
}
|
||||||
// else we can't make it in the current location, we have to stitch one together
|
// else we can't make it in the current location, we have to stitch one together
|
||||||
else {
|
else {
|
||||||
lastGenomeLocSize = nextStart + proposedSize - length;
|
// lets find out the remaining size of the current contig
|
||||||
|
long overflow = nextStart + proposedSize - 1 - length;
|
||||||
|
logger.debug("Overflow = " + overflow + " length: " + length);
|
||||||
|
|
||||||
|
// set our last size counter to the remaining size
|
||||||
|
lastGenomeLocSize = proposedSize - overflow;
|
||||||
|
|
||||||
// move to the next contig
|
// move to the next contig
|
||||||
jumpContig();
|
// the next sequence should start at the begining of the next contig
|
||||||
return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, lastGenomeLocSize));
|
Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize));
|
||||||
}
|
|
||||||
|
|
||||||
|
// now jump ahead to the next contig
|
||||||
|
jumpContig();
|
||||||
|
|
||||||
|
// return the shard
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** jump to the next contig */
|
/** jump to the next contig */
|
||||||
private void jumpContig() {
|
private void jumpContig() {
|
||||||
++seqLoc;
|
++seqLoc;
|
||||||
if (dic.getSequences().size() <= seqLoc) {
|
|
||||||
|
if (!(seqLoc < dic.getSequences().size())) {
|
||||||
nextContig = false;
|
nextContig = false;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
logger.debug("Next contig, index = " + dic.getSequence(seqLoc).getSequenceIndex());
|
||||||
|
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), 0, 0);
|
||||||
|
|
||||||
// the next sequence should start at the begining of the next contig
|
|
||||||
mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -149,7 +249,12 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
return nextContig;
|
// if we don't have an interval file, use the non interval based approach. Simple, eh?
|
||||||
|
if (this.intervals == null) {
|
||||||
|
return nextContig;
|
||||||
|
} else {
|
||||||
|
return (this.currentInterval < this.intervals.size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** we don't support remove */
|
/** we don't support remove */
|
||||||
|
|
@ -167,5 +272,4 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -39,9 +39,7 @@ public class ShardStrategyFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
private static Logger logger = Logger.getLogger(ShardStrategyFactory.class);
|
private static Logger logger = Logger.getLogger(ShardStrategyFactory.class);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -95,9 +93,9 @@ public class ShardStrategyFactory {
|
||||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List<GenomeLoc> lst) {
|
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List<GenomeLoc> lst) {
|
||||||
switch (strat) {
|
switch (strat) {
|
||||||
case LINEAR:
|
case LINEAR:
|
||||||
return new LinearLocusShardStrategy(dic, startingSize); // , lst);
|
return new LinearLocusShardStrategy(dic, startingSize , lst);
|
||||||
case EXPONENTIAL:
|
case EXPONENTIAL:
|
||||||
return new ExpGrowthLocusShardStrategy(dic, startingSize); // , lst);
|
return new ExpGrowthLocusShardStrategy(dic, startingSize , lst);
|
||||||
default:
|
default:
|
||||||
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
|
throw new RuntimeException("Strategy: " + strat + " isn't implemented");
|
||||||
}
|
}
|
||||||
|
|
@ -111,7 +109,7 @@ public class ShardStrategyFactory {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
static public ShardStrategy shatterByReadCount(long readCount) {
|
static public ShardStrategy shatterByReadCount(long readCount) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ public class SAMBAMDataSource implements SimpleDataSource {
|
||||||
protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);
|
protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);
|
||||||
|
|
||||||
// are we set to locus mode or read mode for dividing
|
// are we set to locus mode or read mode for dividing
|
||||||
private boolean locusMode = true;
|
private boolean locusMode = false;
|
||||||
|
|
||||||
// How strict should we be with SAM/BAM parsing?
|
// How strict should we be with SAM/BAM parsing?
|
||||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
|
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.junit.*;
|
import org.junit.*;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -84,6 +83,8 @@ public class ShardStrategyFactoryTest {
|
||||||
/** Tests that we got a string parameter in correctly */
|
/** Tests that we got a string parameter in correctly */
|
||||||
@Test
|
@Test
|
||||||
public void testFullGenomeCycle() {
|
public void testFullGenomeCycle() {
|
||||||
|
GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||||
|
|
||||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
int shardCount = 0;
|
int shardCount = 0;
|
||||||
try {
|
try {
|
||||||
|
|
@ -106,7 +107,7 @@ public class ShardStrategyFactoryTest {
|
||||||
|
|
||||||
/** Tests that we got a string parameter in correctly */
|
/** Tests that we got a string parameter in correctly */
|
||||||
@Test
|
@Test
|
||||||
public void testIntervalGenomeCycle() {
|
public void testIntervalGenomeCycle() throws InterruptedException {
|
||||||
SAMSequenceDictionary dic = seq.getSequenceDictionary();
|
SAMSequenceDictionary dic = seq.getSequenceDictionary();
|
||||||
SAMSequenceRecord s = dic.getSequence(1);
|
SAMSequenceRecord s = dic.getSequence(1);
|
||||||
// Character stream writing
|
// Character stream writing
|
||||||
|
|
@ -115,13 +116,18 @@ public class ShardStrategyFactoryTest {
|
||||||
int stop = s.getSequenceLength();
|
int stop = s.getSequenceLength();
|
||||||
int size = 10000;
|
int size = 10000;
|
||||||
int location = 1;
|
int location = 1;
|
||||||
|
GenomeLoc.setupRefContigOrdering(dic);
|
||||||
System.err.println("done to sleep");
|
System.err.println("done to sleep");
|
||||||
// keep track of the number of genome locs we build
|
// keep track of the number of genome locs we build
|
||||||
int genomeLocs = 0;
|
int genomeLocs = 0;
|
||||||
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||||
while (location + size < stop) {
|
System.err.println("done to sleep2");
|
||||||
|
try {
|
||||||
|
while (location + size < stop) {
|
||||||
|
System.err.println("s = " + s.getSequenceName() + " " + location + " " + size);
|
||||||
// lets make up some fake locations
|
// lets make up some fake locations
|
||||||
GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
|
GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
|
||||||
|
System.err.println("loc = " + location);
|
||||||
|
|
||||||
// let's move the location up, with a size space
|
// let's move the location up, with a size space
|
||||||
location += (size * 2);
|
location += (size * 2);
|
||||||
|
|
@ -132,19 +138,21 @@ public class ShardStrategyFactoryTest {
|
||||||
// add another genome location
|
// add another genome location
|
||||||
++genomeLocs;
|
++genomeLocs;
|
||||||
}
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
System.err.println("Location count = " + genomeLocs);
|
||||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations);
|
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations);
|
||||||
int shardCount = 0;
|
int shardCount = 0;
|
||||||
try {
|
try {
|
||||||
FileWriter writer = new FileWriter("myfile.txt");
|
|
||||||
for (Shard sh : strategy) {
|
for (Shard sh : strategy) {
|
||||||
GenomeLoc l = sh.getGenomeLoc();
|
GenomeLoc l = sh.getGenomeLoc();
|
||||||
|
|
||||||
writer.write("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
System.err.println("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
||||||
//logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
|
||||||
shardCount++;
|
shardCount++;
|
||||||
}
|
}
|
||||||
writer.close();
|
|
||||||
|
System.err.println("Shard count = " + shardCount);
|
||||||
assertEquals(shardCount, genomeLocs * 2);
|
assertEquals(shardCount, genomeLocs * 2);
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
||||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
@ -54,10 +55,10 @@ public class SAMBAMDataSourceTest {
|
||||||
@Before
|
@Before
|
||||||
public void doForEachTest() {
|
public void doForEachTest() {
|
||||||
fl = new ArrayList<String>();
|
fl = new ArrayList<String>();
|
||||||
fl.add("/humgen/gsa-scr1/aaron/stink/NA12892.bam");
|
|
||||||
|
|
||||||
// sequence
|
// sequence
|
||||||
seq = new FastaSequenceFile2(new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
seq = new FastaSequenceFile2(new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||||
|
GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -67,17 +68,22 @@ public class SAMBAMDataSourceTest {
|
||||||
*/
|
*/
|
||||||
@After
|
@After
|
||||||
public void undoForEachTest() {
|
public void undoForEachTest() {
|
||||||
|
seq = null;
|
||||||
|
fl.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Test out that we can shard the file and iterate over every read */
|
/** Test out that we can shard the file and iterate over every read */
|
||||||
@Test
|
//@Test
|
||||||
public void testLinearBreakIterateAll() {
|
public void testLinearBreakIterateAll() {
|
||||||
// the sharding strat.
|
// the sharding strat.
|
||||||
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
||||||
|
// setup the data
|
||||||
|
fl.add("/humgen/gsa-scr1/aaron/stink/NA12892.bam");
|
||||||
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
|
|
@ -103,4 +109,96 @@ public class SAMBAMDataSourceTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Test out that we can shard the file and iterate over every read */
|
||||||
|
@Test
|
||||||
|
public void testMergingTwoBAMFiles() {
|
||||||
|
// the sharding strat.
|
||||||
|
ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
|
|
||||||
|
|
||||||
|
// setup the test files
|
||||||
|
fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188.aligned.duplicates_marked.bam");
|
||||||
|
|
||||||
|
ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
|
||||||
|
ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
// count up the first hundred shards
|
||||||
|
int shardsToCount = 100;
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||||
|
for (Shard sh : strat) {
|
||||||
|
int readCount = 0;
|
||||||
|
count++;
|
||||||
|
if (count > shardsToCount) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
|
||||||
|
|
||||||
|
for (SAMRecord r : datum) {
|
||||||
|
readCount++;
|
||||||
|
|
||||||
|
}
|
||||||
|
readcountPerShard.add(readCount);
|
||||||
|
System.out.println("read count = " + readCount);
|
||||||
|
datum.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SimpleDataSourceLoadException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// setup the data and the counter before our second run
|
||||||
|
fl.clear();
|
||||||
|
fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188-01A-01W.aligned.duplicates_marked.bam");
|
||||||
|
fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188-10B-01W.aligned.duplicates_marked.bam");
|
||||||
|
count = 0;
|
||||||
|
// the sharding strat.
|
||||||
|
strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
||||||
|
|
||||||
|
System.err.println("Pile two:");
|
||||||
|
try {
|
||||||
|
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||||
|
for (Shard sh : strat) {
|
||||||
|
int readCount = 0;
|
||||||
|
count++;
|
||||||
|
|
||||||
|
// can we leave?
|
||||||
|
if (count > shardsToCount) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
|
||||||
|
|
||||||
|
for (SAMRecord r : datum) {
|
||||||
|
readCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
readcountPerShard2.add(readCount);
|
||||||
|
System.out.println("read count = " + readCount);
|
||||||
|
datum.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SimpleDataSourceLoadException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException");
|
||||||
|
}
|
||||||
|
|
||||||
|
int pos = 0;
|
||||||
|
for (; pos < 100; pos++) {
|
||||||
|
if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
|
||||||
|
fail("Shard number " + pos + " in the two approaches had different read counts");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue