diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java
new file mode 100644
index 000000000..292caf010
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java
@@ -0,0 +1,70 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
+import org.broadinstitute.sting.utils.FastaSequenceFile2;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.File;
+
+/**
+ *
+ * User: aaron
+ * Date: Apr 6, 2009
+ * Time: 3:55:21 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Apr 6, 2009
+ *
+ * Class ReferenceDataSource
+ *
+ * Simple data source that serves a FASTA reference file through a ReferenceIterator, seeking forward to the requested genome location.
+ */
+public class ReferenceDataSource implements SimpleDataSource {
+
+    /** the FASTA file this data source was opened on */
+    final protected FastaSequenceFile2 refFile;
+
+    /** the iterator built over refFile; seek delegates to its seekForward */
+    final protected ReferenceIterator refIter;
+
+    /**
+     * Constructor - ReferenceDataSource
+     *
+     * Opens the named reference file and builds the iterator used by seek.
+     *
+     * @param refFileName the reference file
+     * @throws SimpleDataSourceLoadException if refFileName is null or the file cannot be read
+     */
+    public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException {
+        if (refFileName == null) {
+            throw new SimpleDataSourceLoadException("ReferenceDataSource: refFileName passed in is null");
+        }
+
+        final File reference = new File(refFileName);
+        final boolean readable = reference.canRead();
+        if (!readable) {
+            throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName);
+        }
+
+        this.refFile = new FastaSequenceFile2(reference);
+        this.refIter = new ReferenceIterator(this.refFile);
+    }
+
+    /**
+     * Query the data source for a region of interest, specified by the genome location.
+     *
+     * @param location the genome location to extract data for
+     * @return the iterator returned by seeking the reference iterator forward to location
+     */
+    public ReferenceIterator seek(GenomeLoc location) {
+        return refIter.seekForward(location);
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java
new file mode 100644
index 000000000..54fbec295
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java
@@ -0,0 +1,118 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.File;
+import java.util.*;
+
+/**
+ *
+ * User: aaron
+ * Date: Apr 6, 2009
+ * Time: 4:33:10 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Apr 6, 2009
+ *
+ * Class ReferenceMetaDataSource
+ *
+ * Simple data source for reference ordered data (dbSNP and HapMap files): seek returns the datum found at a genome location in each loaded file.
+ */
+public class ReferenceMetaDataSource implements SimpleDataSource {
+
+    // our enumerated types
+    public enum RODTYPE {
+        DBSNP, HAPMAP
+    }
+
+    // the data returned by the most recent call to seek
+    private List<ReferenceOrderedDatum> myData = null;
+    // one iterator per entry of rods; built on the first call to seek
+    private List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> rodIters = null;
+    // the reference ordered data sets loaded by the constructor
+    private final List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods =
+            new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
+
+    /**
+     * Prepare the list of reference ordered data iterators for each of the rods
+     *
+     * @return A list of ROD iterators for getting data from each ROD
+     */
+    protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> initializeRODs() {
+        // set up reference ordered data
+        rodIters = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>();
+        for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
+            rodIters.add(data.iterator());
+        }
+        return rodIters;
+    }
+
+    /**
+     * Builds a list of the reference ordered datum at loc from each of the iterators. This function
+     * assumes you are accessing the data in order. You can't use this function for random access. Each
+     * successive call moves you along the file, consuming all data before loc.
+     *
+     * @param rodIters Iterators to access the RODs
+     * @param loc      The location to get the rods at
+     * @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list
+     */
+    protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> rodIters,
+                                                                         final GenomeLoc loc) {
+        List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
+        for (ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator iter : rodIters) {
+            data.add(iter.seekForward(loc));
+        }
+        return data;
+    }
+
+    /**
+     * Query the data source for a region of interest, specified by the genome location.
+     * The returned iterator walks a list holding one entry per loaded ROD.
+     *
+     * @param location the genome location to extract data for
+     * @return an iterator over the data found at location, one entry per ROD
+     */
+    public Iterator<ReferenceOrderedDatum> seek(GenomeLoc location) {
+        // the ROD iterators are created on first use; previously they were never
+        // created at all, so every call failed with a NullPointerException
+        if (rodIters == null) {
+            initializeRODs();
+        }
+        myData = getReferenceOrderedDataAtLocus(rodIters, location);
+        return myData.iterator();
+    }
+
+    /**
+     * Constructor - ReferenceMetaDataSource
+     *
+     * Loads one reference ordered data set per entry of files.
+     *
+     * @param files maps the name of each ROD file to the type of data it holds
+     * @throws IllegalArgumentException if files is null or maps a file to a null or unsupported type
+     */
+    public ReferenceMetaDataSource(HashMap<String, RODTYPE> files) {
+        if (files == null) {
+            throw new IllegalArgumentException("ReferenceMetaDataSource: files passed in is null");
+        }
+
+        // cycle through the passed in rod's, storing them in the rods field
+        // (a local list used to shadow the field, leaving it null)
+        for (Map.Entry<String, RODTYPE> entry : files.entrySet()) {
+            final String fileName = entry.getKey();
+            final RODTYPE type = entry.getValue();
+            if (type == null) {
+                throw new IllegalArgumentException("ReferenceMetaDataSource: no ROD type given for file: " + fileName);
+            }
+
+            // each case ends in break: without it a DBSNP file fell through
+            // and was loaded a second time as HAPMAP
+            switch (type) {
+                case DBSNP:
+                    rods.add(new ReferenceOrderedData<rodDbSNP>(new File(fileName), rodDbSNP.class));
+                    break;
+                case HAPMAP:
+                    rods.add(new ReferenceOrderedData<HapMapAlleleFrequenciesROD>(new File(fileName), HapMapAlleleFrequenciesROD.class));
+                    break;
+                default:
+                    throw new IllegalArgumentException("ReferenceMetaDataSource: unsupported ROD type: " + type);
+            }
+        }
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
new file mode 100644
index 000000000..795cab75f
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java
@@ -0,0 +1,114 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+import edu.mit.broad.picard.sam.SamFileHeaderMerger;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileReader;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * User: aaron
+ * Date: Mar 26, 2009
+ * Time: 2:36:16 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public class SAMBAMDataSource implements SimpleDataSource {
+    /** the sort order passed to the header merger when the input files are merged */
+    private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
+
+    /** our log, which we want to capture anything from this class */
+    protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);
+
+    // our sam file readers, one per input file that was opened
+    private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();
+
+    // do we care that the SAM files respect the sort order.
+    private boolean matchedSortOrders = true;
+
+    // our merged sam iterator for splitting up the files
+    MergingSamRecordIterator2 mergeIterator;
+
+    // are we set to locus mode or read mode for dividing
+    private boolean locusMode = true;
+
+    // How strict should we be with SAM/BAM parsing?
+    protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
+
+    /**
+     * constructor, given a list of sam files
+     *
+     * @param samFiles the list of sam files
+     * @throws SimpleDataSourceLoadException if samFiles is null or one of the files cannot be read
+     */
+    public SAMBAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
+        if (samFiles == null) {
+            throw new SimpleDataSourceLoadException("SAMBAMDataSource: samFiles passed in is null");
+        }
+
+        // fill the readers field directly; a local list of the same name used to
+        // shadow it, so the field was always left empty
+        for (String fileName : samFiles) {
+            File smFile = new File(fileName);
+            if (!smFile.canRead()) {
+                throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
+            }
+            SAMFileReader reader = initializeSAMFile(smFile);
+            if (reader != null) {
+                readers.add(reader);
+            } else {
+                // initializeSAMFile returns null for files it does not open
+                logger.warn("SAMBAMDataSource: skipping file: " + fileName);
+            }
+        }
+
+        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
+        this.mergeIterator = new MergingSamRecordIterator2(headerMerger);
+    }
+
+
+    /**
+     * Open a reader on a single SAM/BAM file, using the configured validation stringency.
+     *
+     * @param samFile the file to open
+     * @return the reader, or null if the file name ends with ".list" (such files are not opened here)
+     */
+    protected SAMFileReader initializeSAMFile(final File samFile) {
+        if (samFile.toString().endsWith(".list")) {
+            return null;
+        }
+
+        SAMFileReader samReader = new SAMFileReader(samFile, true);
+        samReader.setValidationStringency(strictness);
+
+        final SAMFileHeader header = samReader.getFileHeader();
+        // the value is passed as an argument so it is never parsed as a format string
+        logger.info(String.format("Sort order is: %s", header.getSortOrder()));
+
+        return samReader;
+    }
+
+
+    /**
+     * Choose how seek queries the merged iterator: true selects the query(...) call
+     * (by loci), false selects the queryContained(...) call (by read).
+     *
+     * @param tr true if by loci, false if by read
+     */
+    public void setToByLociMode(boolean tr) {
+        locusMode = tr;
+    }
+
+    /**
+     * Query the data source for a region of interest, specified by the genome location.
+     * A new iterator is constructed from the merged iterator on every call.
+     *
+     * @param location the genome location to extract data for
+     * @return an iterator for that region
+     */
+    public MergingSamRecordIterator2 seek(GenomeLoc location) {
+        MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(this.mergeIterator);
+        // start and stop are narrowed to int for the query calls
+        final int start = (int) location.getStart();
+        final int stop = (int) location.getStop();
+        if (locusMode) {
+            // NOTE(review): if the last argument mirrors the "contained" flag of
+            // SAMFileReader.query, passing true makes this branch equivalent to
+            // queryContained below - confirm whether locus mode should pass false
+            iter.query(location.getContig(), start, stop, true);
+        } else {
+            iter.queryContained(location.getContig(), start, stop);
+        }
+        return iter;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
deleted file mode 100644
index 61a913892..000000000
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
+++ /dev/null
@@ -1,117 +0,0 @@
-package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
-
-import edu.mit.broad.picard.io.IoUtil;
-import edu.mit.broad.picard.sam.MergingSamRecordIterator;
-import edu.mit.broad.picard.sam.SamFileHeaderMerger;
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMFileReader;
-import net.sf.samtools.SAMFileWriter;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * User: aaron
- * Date: Mar 26, 2009
- * Time: 2:36:16 PM
- *
- * The Broad Institute
- * SOFTWARE COPYRIGHT NOTICE AGREEMENT
- * This software and its documentation are copyright 2009 by the
- * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
- *
- * This software is supplied without any warranty or guaranteed support whatsoever. Neither
- * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
- */
-public class SAMDataSource implements SimpleDataSource {
- /** our SAM data files */
- private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
-
- // our sam file readers
- private final ArrayList readers = new ArrayList();
-
- // do we care that the SAM files respect the sort order.
- private boolean matchedSortOrders = true;
-
- // our record iterator, we use it to iterate over all the reads
- private MergingSamRecordIterator iterator = null;
-
- // we may want to write out the file
- private SAMFileWriter out = null;
-
- // are we set to locus mode or read mode for dividing
- private boolean locusMode = true;
-
- /**
- * constructor for multiple sam files
- *
- * @param samfiles
- */
- public SAMDataSource(ArrayList samfiles) throws FileNotFoundException {
- loadFiles(samfiles);
- }
-
- private void loadFiles(ArrayList samfiles) throws FileNotFoundException {
- // verify the list passed to the class
- ArrayList INPUT = new ArrayList();
- for (String check : samfiles) {
- File nf = new File(check);
- if (!nf.exists()) {
- throw new FileNotFoundException(check + " doesn't exist");
- }
- }
-
-
- // Open the files for reading and writing
-
- List readers = new ArrayList();
- for (File inFile : INPUT) {
- IoUtil.assertFileIsReadable(inFile);
- SAMFileReader in = new SAMFileReader(inFile);
- readers.add(in);
- matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
- }
-
- // If all the input sort orders match the output sort order then just merge them and
- // write on the fly, otherwise setup to merge and sort before writing out the final file
- if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) {
- SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
- iterator = new MergingSamRecordIterator(headerMerger);
-
- } else {
- SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted);
- iterator = new MergingSamRecordIterator(headerMerger);
- SAMFileHeader header = headerMerger.getMergedHeader();
- header.setSortOrder(SORT_ORDER);
-
- }
- }
-
- /**
- * constructor, given a single sam file
- *
- * @param samFile
- */
- public SAMDataSource(String samFile) throws FileNotFoundException {
- ArrayList samfiles = new ArrayList();
- samfiles.add(samFile);
- loadFiles(samfiles);
- }
-
- /**
- * Chunk the sam file at appropriate locations, given the chunk count
- *
- * @param chunkCount
- * @return
- */
- public void chunk(int chunkCount) {
-
- }
-
- /** set this source to divide on reads */
- public void setToReadMode() {
- locusMode = true;
- }
-}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java
index ea2916159..408672f38 100644
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java
@@ -1,6 +1,10 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
import java.io.Serializable;
+import java.util.Iterator;
+
/**
* User: aaron
@@ -14,17 +18,20 @@ import java.io.Serializable;
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
*/
+
+/** Common interface for all simple data sources; each one is queried by genome location through seek. */
public interface SimpleDataSource extends Serializable {
- /**
- * recommend how many data chunks we should be breaking the file into,
- * as a recommendated number. If not specified (and even if specified)
- * the chunking data source can make decisions to chunk differently.
- *
- * @param chunkCount
- */
- public void chunk(int chunkCount);
+ /**
+ * Query the data source for a region of interest, specified by the genome location.
+ * The returned iterator walks the data this source provides for that region.
+ *
+ * @param location the genome location to extract data for
+ * @return an iterator of the appropriate type, that is limited by the region
+ */
+ public Iterator seek(GenomeLoc location);
}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java
new file mode 100644
index 000000000..cd9ecce7c
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java
@@ -0,0 +1,33 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+/**
+ *
+ * User: aaron
+ * Date: Apr 6, 2009
+ * Time: 4:21:58 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Apr 6, 2009
+ *
+ * Class SimpleDataSourceLoadException
+ *
+ * Thrown when a simple data source cannot be loaded, e.g. when an input file name is null or the file is unreadable.
+ */
+public class SimpleDataSourceLoadException extends Exception {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Create a load exception with a message only.
+     *
+     * @param msg description of what failed to load
+     */
+    public SimpleDataSourceLoadException(String msg) {
+        super(msg);
+    }
+
+    /**
+     * Create a load exception that keeps the underlying failure as its cause.
+     *
+     * @param msg   description of what failed to load
+     * @param cause the exception that triggered this failure, may be null
+     */
+    public SimpleDataSourceLoadException(String msg, Throwable cause) {
+        super(msg, cause);
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java
new file mode 100644
index 000000000..f30e6998e
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java
@@ -0,0 +1,34 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+/**
+ *
+ * User: aaron
+ * Date: Apr 1, 2009
+ * Time: 11:08:06 AM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Apr 1, 2009
+ *
+ * Class SimpleDataSourceSplitException
+ *
+ * Signals a failure while splitting a simple data source.
+ */
+public class SimpleDataSourceSplitException extends Exception {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Create a split exception with a message only.
+     *
+     * @param msg description of the failure
+     */
+    public SimpleDataSourceSplitException(String msg) {
+        super(msg);
+    }
+
+    /**
+     * Create a split exception that keeps the underlying failure as its cause.
+     *
+     * @param msg   description of the failure
+     * @param cause the exception that triggered this failure, may be null
+     */
+    public SimpleDataSourceSplitException(String msg, Throwable cause) {
+        super(msg, cause);
+    }
+
+}