A bunch of updates to the SAM/BAM data source, along with test cases for the merging of multiple files (it works!).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@399 348d0f76-0448-11de-a6fe-93d51630548a
2009-04-14 14:19:20 +00:00 · 2009-04-14 14:19:20 +00:00 · eb4b4a053b
parent 30121534ed
commit eb4b4a053b
7 changed files with 276 additions and 36 deletions
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ExpGrowthLocusShardStrategy.java
@ -1,6 +1,9 @@
 package org.broadinstitute.sting.gatk.dataSources.shards;

 import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;

 /**
 *
@ -56,6 +59,20 @@ public class ExpGrowthLocusShardStrategy extends LocusShardStrategy {
        currentExp = 0;
    }

+    /**
+     * The constructor, for a genomic list, start size, and a reference dictionary
+     *
+     * @param dic       the reference dictionary
+     * @param startSize the starting size of the shard
+     * @param lst       locations to iterate from
+     */
+    ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
+        super(dic, lst);
+        this.baseSize = startSize;
+        this.currentExp = 0;
+    }
+
+
    /**
     * set the next shards size
     *
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LinearLocusShardStrategy.java
@ -1,6 +1,9 @@
 package org.broadinstitute.sting.gatk.dataSources.shards;

 import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;

 /**
 *
@ -44,7 +47,7 @@ class LinearLocusShardStrategy extends LocusShardStrategy {
    }

    /**
-     * the constructor, taking a seq dictionary to parse out contigs
+     * the constructor, constructing from another Locus strategy
     *
     * @param strat the shatter to convert from
     */
@ -54,6 +57,18 @@ class LinearLocusShardStrategy extends LocusShardStrategy {
    }


+    /**
+     * The constructor, for a genomic list, start size, and a reference dictionary 
+     * @param dic the reference dictionary
+     * @param startSize the starting size of the shard
+     * @param lst locations to iterate from
+     */
+    LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
+        super(dic, lst);
+        this.nextShardSize = startSize;
+    }
+
+
    /**
     * set the next shards size
     *
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java
@ -1,9 +1,11 @@
 package org.broadinstitute.sting.gatk.dataSources.shards;

 import net.sf.samtools.SAMSequenceDictionary;
+import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.GenomeLoc;

 import java.util.Iterator;
+import java.util.List;
 /**
 *
 * User: aaron
@ -29,7 +31,7 @@ import java.util.Iterator;
 * <p/>
 * The shard interface, which controls how data is divided
 */
-public abstract class LocusShardStrategy implements ShardStrategy  {
+public abstract class LocusShardStrategy implements ShardStrategy {

    // this stores the seq dictionary, which is a reference for the
    // lengths and names of contigs, which you need to generate an iterative stratagy
@ -47,6 +49,15 @@ public abstract class LocusShardStrategy implements ShardStrategy  {
    // do we have another contig?
    private boolean nextContig = false;

+    /** our interal list * */
+    private List<GenomeLoc> intervals = null;
+
+    /** our interal list * */
+    private int currentInterval = -1;
+
+    /** our log, which we want to capture anything from this class */
+    private static Logger logger = Logger.getLogger(LocusShardStrategy.class);
+

    /**
     * the constructor, taking a seq dictionary to parse out contigs
@ -55,7 +66,7 @@ public abstract class LocusShardStrategy implements ShardStrategy  {
     */
    LocusShardStrategy(SAMSequenceDictionary dic) {
        this.dic = dic;
-        mLoc = new GenomeLoc(dic.getSequence(0).getSequenceName(), 0, 0);
+        mLoc = new GenomeLoc(0,0,0);
        if (dic.getSequences().size() > 0) {
            nextContig = true;
        }
@ -74,6 +85,24 @@ public abstract class LocusShardStrategy implements ShardStrategy  {
        this.nextContig = old.nextContig;
    }

+
+    /**
+     * the constructor, taking a seq dictionary to parse out contigs
+     *
+     * @param dic       the seq dictionary
+     * @param intervals file
+     */
+    LocusShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
+        this.dic = dic;
+        this.intervals = intervals;
+        this.currentInterval = 0;
+
+        mLoc = new GenomeLoc(0, 0, 0);
+        if (dic.getSequences().size() > 0) {
+            nextContig = true;
+        }
+    }
+
    /**
     *
     * Abstract methods that each strategy has to implement
@ -102,44 +131,115 @@ public abstract class LocusShardStrategy implements ShardStrategy  {
     *
     */

+
+
+
    /**
     * get the next shard, based on the return size of nextShardSize
     *
-     * @return
+     * @return the next shard
     */
-    public LocusShard next() {
+    public Shard next() {
+
        // lets get some background info on the problem
        long length = dic.getSequence(seqLoc).getSequenceLength();
        long proposedSize = nextShardSize();
        long nextStart = mLoc.getStop() + 1;
-        // can we fit it into the current seq size?
-        if (nextStart + proposedSize < length) {
+
+        // if we don't have an interval file, use the non interval based approach.  Simple, eh?
+        if (this.intervals == null) {
+            return nonIntervaledNext(length, proposedSize, nextStart);
+        } else {
+            return intervaledNext(length, proposedSize, nextStart);
+        }
+
+    }
+
+    /**
+     * Interval based next processing
+     *
+     * @param length       the length of the sequence
+     * @param proposedSize the proposed size
+     * @param nextStart    where we start from
+     * @return the shard that represents this data
+     */
+    private Shard intervaledNext(long length, long proposedSize, long nextStart) {
+        // get the current genome location
+        GenomeLoc loc = intervals.get(currentInterval);
+        if (nextStart + proposedSize > loc.getStop()) {
+            // we need to move the next interval
+            proposedSize = loc.getStop() - nextStart;
            lastGenomeLocSize = proposedSize;
-            mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize);
-            return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, nextStart + proposedSize));
+
+            // the next sequence should start at the begining of the next contig
+            Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1));
+
+            ++currentInterval;
+            if (intervals.size() > currentInterval) {
+                mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), intervals.get(currentInterval).getStart() - 1, intervals.get(currentInterval).getStart() - 1);
+            }
+            return ret;// return
+
+        } else {
+            // we need to move the next interval
+            lastGenomeLocSize = proposedSize;
+
+            // the next sequence should start at the begining of the next contig
+            Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1));
+
+            mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1);
+
+            return ret;// return
+        }
+    }
+
+    /**
+     * Get the next shard, if we don't have intervals to traverse over
+     *
+     * @param length       the length of the contig
+     * @param proposedSize the proposed size
+     * @param nextStart    the next start location
+     * @return the shard to return to the user
+     */
+    private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
+        // can we fit it into the current seq size?
+        if (nextStart + proposedSize - 1 < length) {
+            lastGenomeLocSize = proposedSize;
+            mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1);
+            return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1));
        }
        // else we can't make it in the current location, we have to stitch one together
        else {
-            lastGenomeLocSize = nextStart + proposedSize - length;
+            // lets find out the remaining size of the current contig
+            long overflow = nextStart + proposedSize - 1 - length;
+            logger.debug("Overflow = " + overflow + " length: " + length);

+            // set our last size counter to the remaining size
+            lastGenomeLocSize = proposedSize - overflow;

            // move to the next contig
-            jumpContig();
-            return LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), nextStart, lastGenomeLocSize));
-        }
+            // the next sequence should start at the begining of the next contig
+            Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize));

+            // now  jump ahead to the next contig
+            jumpContig();
+
+            // return the shard
+            return ret;
+        }
    }

    /** jump to the next contig */
    private void jumpContig() {
        ++seqLoc;
-        if (dic.getSequences().size() <= seqLoc) {
+
+        if (!(seqLoc < dic.getSequences().size())) {
            nextContig = false;
            return;
        }
+        logger.debug("Next contig, index = " + dic.getSequence(seqLoc).getSequenceIndex());
+        mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), 0, 0);

-        // the next sequence should start at the begining of the next contig
-        mLoc = new GenomeLoc(dic.getSequence(seqLoc).getSequenceName(), 0, 0);

    }

@ -149,7 +249,12 @@ public abstract class LocusShardStrategy implements ShardStrategy  {
     * @return
     */
    public boolean hasNext() {
-        return nextContig;
+        // if we don't have an interval file, use the non interval based approach.  Simple, eh?
+        if (this.intervals == null) {
+            return nextContig;
+        } else {
+            return (this.currentInterval < this.intervals.size());
+        }
    }

    /** we don't support remove */
@ -167,5 +272,4 @@ public abstract class LocusShardStrategy implements ShardStrategy  {
        return this;
    }

-
 }
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java
@ -39,9 +39,7 @@ public class ShardStrategyFactory {
    }

    /** our log, which we want to capture anything from this class */
-        private static Logger logger = Logger.getLogger(ShardStrategyFactory.class);
-
-
+    private static Logger logger = Logger.getLogger(ShardStrategyFactory.class);


    /**
@ -95,9 +93,9 @@ public class ShardStrategyFactory {
    static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List<GenomeLoc> lst) {
        switch (strat) {
            case LINEAR:
-                return new LinearLocusShardStrategy(dic, startingSize); // , lst);
+                return new LinearLocusShardStrategy(dic, startingSize , lst);
            case EXPONENTIAL:
-                return new ExpGrowthLocusShardStrategy(dic, startingSize); // , lst);
+                return new ExpGrowthLocusShardStrategy(dic, startingSize , lst);
            default:
                throw new RuntimeException("Strategy: " + strat + " isn't implemented");
        }
@ -111,7 +109,7 @@ public class ShardStrategyFactory {
     * @return
     */
    static public ShardStrategy shatterByReadCount(long readCount) {
-        return null;    
+        return null;
    }

 }
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
@ -32,7 +32,7 @@ public class SAMBAMDataSource implements SimpleDataSource {
    protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);

    // are we set to locus mode or read mode for dividing
-    private boolean locusMode = true;
+    private boolean locusMode = false;

    // How strict should we be with SAM/BAM parsing?
    protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
--- a/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactoryTest.java
@ -9,7 +9,6 @@ import org.broadinstitute.sting.utils.GenomeLoc;
 import org.junit.*;

 import java.io.File;
-import java.io.FileWriter;
 import java.util.ArrayList;

 /**
@ -84,6 +83,8 @@ public class ShardStrategyFactoryTest {
    /** Tests that we got a string parameter in correctly */
    @Test
    public void testFullGenomeCycle() {
+        GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
+
        ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
        int shardCount = 0;
        try {
@ -106,7 +107,7 @@ public class ShardStrategyFactoryTest {

    /** Tests that we got a string parameter in correctly */
    @Test
-    public void testIntervalGenomeCycle() {
+    public void testIntervalGenomeCycle() throws InterruptedException {
        SAMSequenceDictionary dic = seq.getSequenceDictionary();
        SAMSequenceRecord s = dic.getSequence(1);
        // Character stream writing
@ -115,13 +116,18 @@ public class ShardStrategyFactoryTest {
        int stop = s.getSequenceLength();
        int size = 10000;
        int location = 1;
+        GenomeLoc.setupRefContigOrdering(dic);
        System.err.println("done to sleep");
        // keep track of the number of genome locs we build
        int genomeLocs = 0;
        ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
-        while (location + size < stop) {
+        System.err.println("done to sleep2");
+        try {
+            while (location + size < stop) {
+            System.err.println("s = " + s.getSequenceName() + " " + location + " " + size);
            // lets make up some fake locations
            GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
+            System.err.println("loc = " + location);

            // let's move the location up, with a size space
            location += (size * 2);
@ -132,19 +138,21 @@ public class ShardStrategyFactoryTest {
            // add another genome location
            ++genomeLocs;
        }
-
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        System.err.println("Location count = " + genomeLocs);
        ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations);
        int shardCount = 0;
        try {
-            FileWriter writer = new FileWriter("myfile.txt");
            for (Shard sh : strategy) {
                GenomeLoc l = sh.getGenomeLoc();

-                writer.write("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
-                //logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
+                System.err.println("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
                shardCount++;
            }
-            writer.close();
+
+             System.err.println("Shard count = " + shardCount); 
            assertEquals(shardCount, genomeLocs * 2);

        } catch (Exception e) {
--- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
 import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
 import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
 import org.broadinstitute.sting.utils.FastaSequenceFile2;
+import org.broadinstitute.sting.utils.GenomeLoc;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@ -54,10 +55,10 @@ public class SAMBAMDataSourceTest {
    @Before
    public void doForEachTest() {
        fl = new ArrayList<String>();
-        fl.add("/humgen/gsa-scr1/aaron/stink/NA12892.bam");

        // sequence
        seq = new FastaSequenceFile2(new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
+        GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
    }

    /**
@ -67,17 +68,22 @@ public class SAMBAMDataSourceTest {
     */
    @After
    public void undoForEachTest() {
-
+        seq = null;
+        fl.clear();
    }


    /** Test out that we can shard the file and iterate over every read */
-    @Test
+    //@Test
    public void testLinearBreakIterateAll() {
        // the sharding strat.
        ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
        int count = 0;

+        // setup the data
+        fl.add("/humgen/gsa-scr1/aaron/stink/NA12892.bam");
+
+
        try {
            SAMBAMDataSource data = new SAMBAMDataSource(fl);
            for (Shard sh : strat) {
@ -103,4 +109,96 @@ public class SAMBAMDataSourceTest {
        }
    }

+
+    /** Test out that we can shard the file and iterate over every read */
+    @Test
+    public void testMergingTwoBAMFiles() {
+        // the sharding strat.
+        ShardStrategy strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
+
+
+        // setup the test files
+        fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188.aligned.duplicates_marked.bam");
+
+        ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
+        ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
+
+        // count up the first hundred shards
+        int shardsToCount = 100;
+        int count = 0;
+
+        try {
+            SAMBAMDataSource data = new SAMBAMDataSource(fl);
+            for (Shard sh : strat) {
+                int readCount = 0;
+                count++;
+                if (count > shardsToCount) {
+                    break;
+                }
+
+                MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
+
+                for (SAMRecord r : datum) {
+                    readCount++;
+
+                }
+                readcountPerShard.add(readCount);
+                System.out.println("read count = " + readCount);
+                datum.close();
+            }
+        }
+        catch (SimpleDataSourceLoadException e) {
+            e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
+            fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException");
+        }
+
+
+        // setup the data and the counter before our second run
+        fl.clear();
+        fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188-01A-01W.aligned.duplicates_marked.bam");
+        fl.add("/seq/dirseq/analysis/cancer_exome/sams/TCGA-06-0188-10B-01W.aligned.duplicates_marked.bam");
+        count = 0;
+        // the sharding strat.
+        strat = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
+
+        System.err.println("Pile two:");
+        try {
+            SAMBAMDataSource data = new SAMBAMDataSource(fl);
+            for (Shard sh : strat) {
+                int readCount = 0;
+                count++;
+
+                // can we leave?
+                if (count > shardsToCount) {
+                    break;
+                }
+
+                MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
+
+                for (SAMRecord r : datum) {
+                    readCount++;
+                }
+
+                readcountPerShard2.add(readCount);
+                System.out.println("read count = " + readCount);
+                datum.close();
+            }
+        }
+        catch (SimpleDataSourceLoadException e) {
+            e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
+            fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException");
+        }
+
+        int pos = 0;
+        for (; pos < 100; pos++) {
+            if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
+                fail("Shard number " + pos + " in the two approaches had different read counts");
+            }
+        }
+
+    }
+
+
+
+
 }