From eb4b4a053b93bae2b443776f8e1330ae49534d01 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 14 Apr 2009 14:19:20 +0000 Subject: [PATCH] A bunch of updates to the SAM/BAM data source, along with test cases for the merging of multiple files (it works!). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@399 348d0f76-0448-11de-a6fe-93d51630548a --- .../shards/ExpGrowthLocusShardStrategy.java | 17 +++ .../shards/LinearLocusShardStrategy.java | 17 ++- .../shards/LocusShardStrategy.java | 138 +++++++++++++++--- .../shards/ShardStrategyFactory.java | 10 +- .../simpleDataSources/SAMBAMDataSource.java | 2 +- .../shards/ShardStrategyFactoryTest.java | 24 ++- .../SAMBAMDataSourceTest.java | 104 ++++++++++++- 7 files changed, 276 insertions(+), 36 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java index bf6d72537..441f7ae80 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java @@ -1,6 +1,9 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; /** * @@ -56,6 +59,20 @@ public class ExpGrowthLocusShardStrategy extends LocusShardStrategy { currentExp = 0; } + /** + * The constructor, for a genomic list, start size, and a reference dictionary + * + * @param dic the reference dictionary + * @param startSize the starting size of the shard + * @param lst locations to iterate from + */ + ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List lst) { + super(dic, lst); + this.baseSize = startSize; + this.currentExp = 0; + } + + /** * set the next shards size * diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java index 0b29755ec..5c4dece8d 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java @@ -1,6 +1,9 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; /** * @@ -44,7 +47,7 @@ class LinearLocusShardStrategy extends LocusShardStrategy { } /** - * the constructor, taking a seq dictionary to parse out contigs + * the constructor, constructing from another Locus strategy * * @param strat the shatter to convert from */ @@ -54,6 +57,18 @@ class LinearLocusShardStrategy extends LocusShardStrategy { } + /** + * The constructor, for a genomic list, start size, and a reference dictionary + * @param dic the reference dictionary + * @param startSize the starting size of the shard + * @param lst locations to iterate from + */ + LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List lst) { + super(dic, lst); + this.nextShardSize = startSize; + } + + /** * set the next shards size * diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java index feeacaa3b..60049d374 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java @@ -1,9 +1,11 @@ package org.broadinstitute.sting.gatk.dataSources.shards; import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.Iterator; +import java.util.List; /** * * User: aaron @@ -29,7 +31,7 @@ import java.util.Iterator; *

* The shard interface, which controls how data is divided */ -public abstract class LocusShardStrategy implements ShardStrategy { +public abstract class LocusShardStrategy implements ShardStrategy { // this stores the seq dictionary, which is a reference for the // lengths and names of contigs, which you need to generate an iterative stratagy @@ -47,6 +49,15 @@ public abstract class LocusShardStrategy implements ShardStrategy { // do we have another contig? private boolean nextContig = false; + /** our interal list * */ + private List intervals = null; + + /** our interal list * */ + private int currentInterval = -1; + + /** our log, which we want to capture anything from this class */ + private static Logger logger = Logger.getLogger(LocusShardStrategy.class); + /** * the constructor, taking a seq dictionary to parse out contigs @@ -55,7 +66,7 @@ public abstract class LocusShardStrategy implements ShardStrategy { */ LocusShardStrategy(SAMSequenceDictionary dic) { this.dic = dic; - mLoc = new GenomeLoc(dic.getSequence(0).getSequenceName(), 0, 0); + mLoc = new GenomeLoc(0,0,0); if (dic.getSequences().size() > 0) { nextContig = true; } @@ -74,6 +85,24 @@ public abstract class LocusShardStrategy implements ShardStrategy { this.nextContig = old.nextContig; } + + /** + * the constructor, taking a seq dictionary to parse out contigs + * + * @param dic the seq dictionary + * @param intervals file + */ + LocusShardStrategy(SAMSequenceDictionary dic, List intervals) { + this.dic = dic; + this.intervals = intervals; + this.currentInterval = 0; + + mLoc = new GenomeLoc(0, 0, 0); + if (dic.getSequences().size() > 0) { + nextContig = true; + } + } + /** * * Abstract methods that each strategy has to implement @@ -102,44 +131,115 @@ public abstract class LocusShardStrategy implements ShardStrategy { * */ + + + /** * get the next shard, based on the return size of nextShardSize * - * @return + * @return the next shard */ - public LocusShard next() { + public Shard next() { + // lets get some background info on the problem long length = dic.getSequence(seqLoc).getSequenceLength(); long proposedSize = nextShardSize(); long nextStart = mLoc.getStop() + 1; - // can we fit it into the current seq size? - if (nextStart + proposedSize < length) { + + // if we don't have an interval file, use the non interval based approach. Simple, eh? + if (this.intervals == null) { + return nonIntervaledNext(length, proposedSize, nextStart); + } else { + return intervaledNext(length, proposedSize, nextStart); + } + + } + + /** + * Interval based next processing + * + * @param length the length of the sequence + * @param proposedSize the proposed size + * @param nextStart where we start from + * @return the shard that represents this data + */ + private Shard intervaledNext(long length, long proposedSize, long nextStart) { + // get the current genome location + GenomeLoc loc = intervals.get(currentInterval); + if (nextStart + proposedSize > loc.getStop()) { + // we need to move the next interval + proposedSize = loc.getStop() - nextStart; lastGenomeLocSize = proposedSize; - mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize); - return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize)); + + // the next sequence should start at the begining of the next contig + Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1)); + + ++currentInterval; + if (intervals.size() > currentInterval) { + mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), intervals.get(currentInterval).getStart() - 1, intervals.get(currentInterval).getStart() - 1); + } + return ret;// return + + } else { + // we need to move the next interval + lastGenomeLocSize = proposedSize; + + // the next sequence should start at the begining of the next contig + Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1)); + + mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1); + + return ret;// return + } + } + + /** + * Get the next shard, if we don't have intervals to traverse over + * + * @param length the length of the contig + * @param proposedSize the proposed size + * @param nextStart the next start location + * @return the shard to return to the user + */ + private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) { + // can we fit it into the current seq size? + if (nextStart + proposedSize - 1 < length) { + lastGenomeLocSize = proposedSize; + mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1); + return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1)); } // else we can't make it in the current location, we have to stitch one together else { - lastGenomeLocSize = nextStart + proposedSize - length; + // lets find out the remaining size of the current contig + long overflow = nextStart + proposedSize - 1 - length; + logger.debug("Overflow = " + overflow + " length: " + length); + // set our last size counter to the remaining size + lastGenomeLocSize = proposedSize - overflow; // move to the next contig - jumpContig(); - return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, lastGenomeLocSize)); - } + // the next sequence should start at the begining of the next contig + Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize)); + // now jump ahead to the next contig + jumpContig(); + + // return the shard + return ret; + } } /** jump to the next contig */ private void jumpContig() { ++seqLoc; - if (dic.getSequences().size() <= seqLoc) { + + if (!(seqLoc < dic.getSequences().size())) { nextContig = false; return; } + logger.debug("Next contig, index = " + dic.getSequence(seqLoc).getSequenceIndex()); + mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), 0, 0); - // the next sequence should start at the begining of the next contig - mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0); } @@ -149,7 +249,12 @@ public abstract class LocusShardStrategy implements ShardStrategy { * @return */ public boolean hasNext() { - return nextContig; + // if we don't have an interval file, use the non interval based approach. Simple, eh? + if (this.intervals == null) { + return nextContig; + } else { + return (this.currentInterval < this.intervals.size()); + } } /** we don't support remove */ @@ -167,5 +272,4 @@ public abstract class LocusShardStrategy implements ShardStrategy { return this; } - } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java index 4b63b55f8..bb773caaa 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java @@ -39,9 +39,7 @@ public class ShardStrategyFactory { } /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(ShardStrategyFactory.class); - - + private static Logger logger = Logger.getLogger(ShardStrategyFactory.class); /** @@ -95,9 +93,9 @@ public class ShardStrategyFactory { static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List lst) { switch (strat) { case LINEAR: - return new LinearLocusShardStrategy(dic, startingSize); // , lst); + return new LinearLocusShardStrategy(dic, startingSize , lst); case EXPONENTIAL: - return new ExpGrowthLocusShardStrategy(dic, startingSize); // , lst); + return new ExpGrowthLocusShardStrategy(dic, startingSize , lst); default: throw new RuntimeException("Strategy: " + strat + " isn't implemented"); } @@ -111,7 +109,7 @@ public class ShardStrategyFactory { * @return */ static public ShardStrategy shatterByReadCount(long readCount) { - return null; + return null; } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java index 35a55efc9..e657a8e5d 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java @@ -32,7 +32,7 @@ public class SAMBAMDataSource implements SimpleDataSource { protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class); // are we set to locus mode or read mode for dividing - private boolean locusMode = true; + private boolean locusMode = false; // How strict should we be with SAM/BAM parsing? protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT; diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java index 62bb0d5b7..9c06162c4 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java @@ -9,7 +9,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.junit.*; import java.io.File; -import java.io.FileWriter; import java.util.ArrayList; /** @@ -84,6 +83,8 @@ public class ShardStrategyFactoryTest { /** Tests that we got a string parameter in correctly */ @Test public void testFullGenomeCycle() { + GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); + ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); int shardCount = 0; try { @@ -106,7 +107,7 @@ public class ShardStrategyFactoryTest { /** Tests that we got a string parameter in correctly */ @Test - public void testIntervalGenomeCycle() { + public void testIntervalGenomeCycle() throws InterruptedException { SAMSequenceDictionary dic = seq.getSequenceDictionary(); SAMSequenceRecord s = dic.getSequence(1); // Character stream writing @@ -115,13 +116,18 @@ public class ShardStrategyFactoryTest { int stop = s.getSequenceLength(); int size = 10000; int location = 1; + GenomeLoc.setupRefContigOrdering(dic); System.err.println("done to sleep"); // keep track of the number of genome locs we build int genomeLocs = 0; ArrayList locations = new ArrayList(); - while (location + size < stop) { + System.err.println("done to sleep2"); + try { + while (location + size < stop) { + System.err.println("s = " + s.getSequenceName() + " " + location + " " + size); // lets make up some fake locations GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1); + System.err.println("loc = " + location); // let's move the location up, with a size space location += (size * 2); @@ -132,19 +138,21 @@ public class ShardStrategyFactoryTest { // add another genome location ++genomeLocs; } - + } catch (Exception e) { + e.printStackTrace(); + } + System.err.println("Location count = " + genomeLocs); ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations); int shardCount = 0; try { - FileWriter writer = new FileWriter("myfile.txt"); for (Shard sh : strategy) { GenomeLoc l = sh.getGenomeLoc(); - writer.write("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig()); - //logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig()); + System.err.println("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig()); shardCount++; } - writer.close(); + + System.err.println("Shard count = " + shardCount); assertEquals(shardCount, genomeLocs * 2); } catch (Exception e) { diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java index 281cebd32..e38794e46 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; import org.broadinstitute.sting.utils.FastaSequenceFile2; +import org.broadinstitute.sting.utils.GenomeLoc; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -54,10 +55,10 @@ public class SAMBAMDataSourceTest { @Before public void doForEachTest() { fl = new ArrayList(); - fl.add("/humgen/gsa-scr1/aaron/stink/NA12892.bam"); // sequence seq = new FastaSequenceFile2(new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); } /** @@ -67,17 +68,22 @@ public class SAMBAMDataSourceTest { */ @After public void undoForEachTest() { - + seq = null; + fl.clear(); } /** Test out that we can shard the file and iterate over every read */ - @Test + //@Test public void testLinearBreakIterateAll() { // the sharding strat. ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); int count = 0; + // setup the data + fl.add("/humgen/gsa-scr1/aaron/stink/NA12892.bam"); + + try { SAMBAMDataSource data = new SAMBAMDataSource(fl); for (Shard sh : strat) { @@ -103,4 +109,96 @@ public class SAMBAMDataSourceTest { } } + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testMergingTwoBAMFiles() { + // the sharding strat. + ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); + + + // setup the test files + fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188.aligned.duplicates_marked.bam"); + + ArrayList readcountPerShard = new ArrayList(); + ArrayList readcountPerShard2 = new ArrayList(); + + // count up the first hundred shards + int shardsToCount = 100; + int count = 0; + + try { + SAMBAMDataSource data = new SAMBAMDataSource(fl); + for (Shard sh : strat) { + int readCount = 0; + count++; + if (count > shardsToCount) { + break; + } + + MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc()); + + for (SAMRecord r : datum) { + readCount++; + + } + readcountPerShard.add(readCount); + System.out.println("read count = " + readCount); + datum.close(); + } + } + catch (SimpleDataSourceLoadException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException"); + } + + + // setup the data and the counter before our second run + fl.clear(); + fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188-01A-01W.aligned.duplicates_marked.bam"); + fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188-10B-01W.aligned.duplicates_marked.bam"); + count = 0; + // the sharding strat. + strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000); + + System.err.println("Pile two:"); + try { + SAMBAMDataSource data = new SAMBAMDataSource(fl); + for (Shard sh : strat) { + int readCount = 0; + count++; + + // can we leave? + if (count > shardsToCount) { + break; + } + + MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc()); + + for (SAMRecord r : datum) { + readCount++; + } + + readcountPerShard2.add(readCount); + System.out.println("read count = " + readCount); + datum.close(); + } + } + catch (SimpleDataSourceLoadException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException"); + } + + int pos = 0; + for (; pos < 100; pos++) { + if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) { + fail("Shard number " + pos + " in the two approaches had different read counts"); + } + } + + } + + + + }