diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java index 60049d374..6585c24b9 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java @@ -29,7 +29,7 @@ import java.util.List; *

* Interface Shard *

- * The shard interface, which controls how data is divided + * The shard interface, which controls how data is divided for loci */ public abstract class LocusShardStrategy implements ShardStrategy { @@ -66,7 +66,7 @@ public abstract class LocusShardStrategy implements ShardStrategy { */ LocusShardStrategy(SAMSequenceDictionary dic) { this.dic = dic; - mLoc = new GenomeLoc(0,0,0); + mLoc = new GenomeLoc(0, 0, 0); if (dic.getSequences().size() > 0) { nextContig = true; } @@ -110,19 +110,11 @@ public abstract class LocusShardStrategy implements ShardStrategy { */ /** - * set the next shards size - * - * @param size adjust the next size to this - */ - public abstract void adjustNextShardSize(long size); - - - /** - * This is how the various shards strategies implements their approach + * This is how the various shards strategies implements their approach, adjusting this value * * @return the next shard size */ - abstract long nextShardSize(); + protected abstract long nextShardSize(); /** @@ -132,8 +124,6 @@ public abstract class LocusShardStrategy implements ShardStrategy { */ - - /** * get the next shard, based on the return size of nextShardSize * diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java index 30a3fd6ed..dd7908e0b 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategy.java @@ -28,4 +28,13 @@ import java.util.Iterator; * class, but not this will be an interface to accomidate read based sharding */ public interface ShardStrategy extends Iterator, Iterable { + + /** + * set the next shards size + * + * @param size adjust the next size to this + */ + public abstract void adjustNextShardSize(long size); + + } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java index bb773caaa..39af328f0 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ShardStrategyFactory.java @@ -108,7 +108,7 @@ public class ShardStrategyFactory { * @param readCount the number of reads to include in each shard * @return */ - static public ShardStrategy shatterByReadCount(long readCount) { + static public ShardStrategy shatterByReadCount(SAMSequenceDictionary dic, long readCount) { return null; } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java new file mode 100755 index 000000000..0260de781 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -0,0 +1,116 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import edu.mit.broad.picard.sam.SamFileHeaderMerger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class SAMDataSource implements SimpleDataSource { + /** our SAM data files */ + private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; + + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(SAMDataSource.class); + + // are we set to locus mode or read mode for dividing + private boolean locusMode = false; + + // How strict should we be with SAM/BAM parsing? + protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT; + + // our list of readers + private final List samFileList = new ArrayList(); + + /** + * constructor, given a single sam file + * + * @param samFiles the list of sam files + */ + public SAMDataSource(List samFiles) throws SimpleDataSourceLoadException { + for (String fileName : samFiles) { + File smFile = new File(fileName); + if (!smFile.canRead()) { + throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName); + } + samFileList.add(smFile); + + } + + //SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER); + } + + + protected SAMFileReader initializeSAMFile(final File samFile) { + if (samFile.toString().endsWith(".list")) { + return null; + } else { + SAMFileReader samReader = new SAMFileReader(samFile, true); + samReader.setValidationStringency(strictness); + + final SAMFileHeader header = samReader.getFileHeader(); + logger.info(String.format("Sort order is: " + header.getSortOrder())); + + return samReader; + } + } + + /** + *

+ * seek + *

+ * + * @param location the genome location to extract data for + * @return an iterator for that region + */ + public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException { + + // right now this is pretty damn heavy, it copies the file list into a reader list every time + List lst = new ArrayList(); + for (File f : this.samFileList) { + SAMFileReader reader = initializeSAMFile(f); + if (reader == null) { + throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + f); + } + lst.add(reader); + } + + // now merge the headers + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER); + + // make a merging iterator for this record + MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger); + + + // we do different things for locus and read modes + if (locusMode) { + iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop()); + } else { + iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop()); + } + + // return the iterator + return iter; + } + + + +} diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java index c4ce0de6e..d07dea89b 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java @@ -7,7 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource; +import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException; import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider; @@ -68,10 +68,10 @@ public class MicroManager { SHARD_SIZE ); ReferenceIterator refIter = new ReferenceIterator(ref); - SAMBAMDataSource dataSource = null; + SAMDataSource dataSource = null; try { - dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) ); + dataSource = new SAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) ); } catch( SimpleDataSourceLoadException ex ) { throw new RuntimeException( ex ); diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java index 955aa9363..653b8facd 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java @@ -87,7 +87,7 @@ public class SAMBAMDataSourceTest extends BaseTest { try { - SAMBAMDataSource data = new SAMBAMDataSource(fl); + SAMDataSource data = new SAMDataSource(fl); for (Shard sh : strat) { int readCount = 0; count++; @@ -136,7 +136,7 @@ public class SAMBAMDataSourceTest extends BaseTest { int count = 0; try { - SAMBAMDataSource data = new SAMBAMDataSource(fl); + SAMDataSource data = new SAMDataSource(fl); for (Shard sh : strat) { int readCount = 0; count++; @@ -171,7 +171,7 @@ public class SAMBAMDataSourceTest extends BaseTest { logger.debug("Pile two:"); try { - SAMBAMDataSource data = new SAMBAMDataSource(fl); + SAMDataSource data = new SAMDataSource(fl); for (Shard sh : strat) { int readCount = 0; count++; diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java index d613898d1..85b8ed066 100755 --- a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java +++ b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorTest.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource; +import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; @@ -83,7 +83,7 @@ public class BoundedReadIteratorTest extends BaseTest { long shardReadCount = 0; try { - SAMBAMDataSource data = new SAMBAMDataSource(fl); + SAMDataSource data = new SAMDataSource(fl); // make sure we have a shard if (!strat.hasNext()) {