diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java
new file mode 100644
index 000000000..2ddcea799
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java
@@ -0,0 +1,21 @@
+package org.broadinstitute.sting.gatk.dataSources;
+
+import org.broadinstitute.sting.gatk.dataSources.chunks.DataShard;
+
+/**
+ * User: aaron
+ * Date: Mar 25, 2009
+ * Time: 6:20:00 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public interface DataSource {
+    /** Produces a DataShard for the given chunkCount; what the count means is left to the implementation. */
+    DataShard toChunk(int chunkCount);
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java
new file mode 100644
index 000000000..ecbf23584
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java
@@ -0,0 +1,74 @@
+package org.broadinstitute.sting.gatk.dataSources;
+
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+
+import java.util.ArrayList;
+import java.io.File;
+
+/**
+ * User: aaron
+ * Date: Mar 25, 2009
+ * Time: 4:51:39 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public class DataSourceBuilder {
+
+    // files registered so far through addDataFile
+    ArrayList passFiles = new ArrayList();
+
+    public DataSourceBuilder() {
+        // nothing to set up beyond the field initializer above
+    }
+
+    /**
+     * register a data file by path
+     *
+     * @param fileName the path, wrapped in a File before it is stored
+     */
+    public void addDataFile(String fileName) {
+        final File wrapped = new File(fileName);
+        passFiles.add(wrapped);
+    }
+
+    /**
+     * register a data file
+     *
+     * @param file the file to append to the internal list; stored as given
+     */
+    public void addDataFile(File file) {
+        // no existence or readability check is made at this point
+        passFiles.add(file);
+    }
+
+    public DataSource build(Walker inputWalker) {
+        if (inputWalker instanceof ReadWalker) {
+            // placeholder: nothing is constructed for read walkers yet
+        }
+        // every walker type currently ends up here
+        return null;
+    }
+
+
+    /*
+     * Private helpers that create a data source for a particular
+     * walker type are meant to be added below.
+     */
+
+
+    /*
+     * Planned helper for the read-walker case. It is still commented
+     * out, so nothing below is compiled.
+     */
+    //private ReadDataSource generateReadDataSource() {
+    //
+    //}
+
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java
new file mode 100644
index 000000000..711335c41
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java
@@ -0,0 +1,24 @@
+package org.broadinstitute.sting.gatk.dataSources;
+
+/**
+ * User: aaron
+ * Date: Mar 26, 2009
+ * Time: 9:25:49 AM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+
+/** Thrown when a data source cannot be generated, most likely due to an incomplete input source list. */
+public class DataSourceGenerationException extends Exception {
+    public DataSourceGenerationException() { super(); } // keeps the original no-argument form available
+    /** @param message describes what could not be generated */
+    public DataSourceGenerationException(String message) { super(message); }
+    /** @param message describes what could not be generated; @param cause the underlying failure */
+    public DataSourceGenerationException(String message, Throwable cause) { super(message, cause); }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/ReadDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/ReadDataSource.java
new file mode 100644
index 000000000..2f463a958
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/ReadDataSource.java
@@ -0,0 +1,48 @@
+package org.broadinstitute.sting.gatk.dataSources;
+
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMRecord;
+
+import java.util.Iterator;
+import java.io.File;
+
+import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator;
+import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
+import org.broadinstitute.sting.utils.FastaSequenceFile2;
+
+/**
+ * User: aaron
+ * Date: Mar 26, 2009
+ * Time: 10:35:40 AM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public class ReadDataSource {
+
+    /*
+     * SAM-side state
+     */
+    // reader for the SAM input
+    private SAMFileReader samReader;
+    // walks the records of the reads file; elements are presumably SAMRecord -- TODO confirm
+    private Iterator samReadIter;
+
+    // wrapper iterator that does checking; note it is package-visible, unlike the other fields
+    VerifyingSamIterator verifyingSamReadIter;
+
+
+    /*
+     * reference-side state
+     */
+    // reference data: the file, the sequence file read from it, and an iterator
+    private File refFileName; // a File object, despite the "name" suffix
+    //private ReferenceSequenceFile refFile = null;
+    private FastaSequenceFile2 refFile; // todo: merge FastaSequenceFile2 into picard!
+    private ReferenceIterator refIter;
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java
new file mode 100644
index 000000000..76f111674
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java
@@ -0,0 +1,36 @@
+package org.broadinstitute.sting.gatk.dataSources.chunks;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: aaronmckenna
+ * Date: Mar 29, 2009
+ * Time: 8:35:16 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class BasicDataShard<T> implements DataShard {
+    List<T> list;    // backing list, shared with the caller (not copied); remove() mutates it
+    int index = 0;   // position of the element that the next call to next() returns
+
+    public BasicDataShard(List<T> list) { this.list = list; }
+
+    /** @return true while at least one element has not been handed out yet */
+    public boolean hasNext() {
+        return index < list.size();
+    }
+
+    /** Returns the element at the cursor and advances past it, so that iteration terminates. */
+    public T next() {
+        final T current = list.get(index); // an exhausted shard throws here, before the cursor moves
+        index++;
+        return current;
+    }
+
+    /** Removes the element most recently returned by next() and re-aligns the cursor with the list. */
+    public void remove() {
+        list.remove(index - 1); // the slot next() last read; with no prior next() this is -1 and the list throws
+        index--;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java
new file mode 100644
index 000000000..d40f9b6f3
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java
@@ -0,0 +1,19 @@
+package org.broadinstitute.sting.gatk.dataSources.chunks;
+
+import java.util.Iterator;
+
+/**
+ * User: aaron
+ * Date: Mar 26, 2009
+ * Time: 2:43:04 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public interface DataShard extends Iterator { // declares no members of its own; shards are consumed through Iterator
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/LociShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/LociShard.java
new file mode 100644
index 000000000..7dc6913de
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/LociShard.java
@@ -0,0 +1,99 @@
+package org.broadinstitute.sting.gatk.dataSources.chunks;
+
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.dataSources.datum.LocusDatum;
+import org.broadinstitute.sting.gatk.iterators.LocusIterator;
+import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ *
+ * User: aaron
+ * Date: Mar 30, 2009
+ * Time: 7:01:56 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Mar 30, 2009
+ *
+ * Class LociShard
+ *
+ * This is the loci shard, which are collectively made when a shatter call is made to
+ * a data source.
+ */
+public class LociShard implements DataShard {
+
+    // iterators this shard draws from: loci, the reference, and one iterator per ROD source
+    private final LocusIterator locusIterator;
+    private final ReferenceIterator refIterator;
+    private final List<ReferenceOrderedData.RODIterator> rodIters;
+
+    // upper bound on the number of loci next() may hand out
+    private final int maxCount;
+
+    // number of loci handed out so far; advanced by next()
+    private int iterCount = 0;
+
+    public LociShard(LocusIterator locusIterator, ReferenceIterator refIterator, List<ReferenceOrderedData.RODIterator> rodIters, int maxCount) {
+        this.locusIterator = locusIterator;
+        this.refIterator = refIterator;
+        this.rodIters = rodIters;
+        this.maxCount = maxCount;
+    }
+
+    /** @return true while the locus iterator has more loci and fewer than maxCount have been handed out */
+    public boolean hasNext() {
+        return locusIterator.hasNext() && maxCount > iterCount;
+    }
+
+    /** Reads the next locus, sets its reference contig, and bundles it with the ROD data at that locus. */
+    public LocusDatum next() {
+        LocusContext locus = locusIterator.next();
+        // count the locus; without this the maxCount bound in hasNext() never takes effect
+        ++iterCount;
+        ReferenceIterator refSite = refIterator.seekForward(locus.getLocation());
+        locus.setReferenceContig(refSite.getCurrentContig());
+        // move every ROD iterator forward to this locus
+        final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation());
+        return new LocusDatum(rodData, refSite.getBaseAsChar(), locus);
+    }
+
+    /** Delegates removal to the underlying locus iterator. */
+    public void remove() {
+        locusIterator.remove();
+    }
+
+    /**
+     * Builds a list of the reference ordered datum at loc from each of the iterators. This function
+     * assumes you are accessing the data in order. You can't use this function for random access. Each
+     * successive call moves you along the file, consuming all data before loc.
+     *
+     * @param rodIters Iterators to access the RODs
+     * @param loc The location to get the rods at
+     * @return A list of ReferenceOrderedDatum at loc. ROD without a datum at loc will be null in the list
+     */
+    protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData.RODIterator> rodIters, final GenomeLoc loc) {
+        List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
+        for (ReferenceOrderedData.RODIterator iter : rodIters) {
+            data.add(iter.seekForward(loc));
+        }
+        return data;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java
new file mode 100644
index 000000000..a083ce45d
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java
@@ -0,0 +1,85 @@
+package org.broadinstitute.sting.gatk.dataSources.chunks;
+
+import edu.mit.broad.picard.sam.MergingSamRecordIterator;
+import org.broadinstitute.sting.gatk.dataSources.datum.ReadDatum;
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.Utils;
+import net.sf.samtools.SAMRecord;
+
+import java.util.List;
+import java.util.Arrays;
+
+/**
+ *
+ * User: aaron
+ * Date: Mar 30, 2009
+ * Time: 5:45:51 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Mar 30, 2009
+ *
+ * Class ReadShard
+ *
+ * A read data shard.
+ */
+public class ReadShard implements DataShard {
+
+    // merged stream of SAM records backing this shard
+    private MergingSamRecordIterator iterator;
+
+    /**
+     * wrap a merging SAM iterator as a shard; no limit is placed on how many reads are returned
+     *
+     * @param samIterator the iterator every call on this shard delegates to
+     */
+    public ReadShard(MergingSamRecordIterator samIterator) {
+        this.iterator = samIterator;
+    }
+
+    /**
+     * ask the underlying iterator whether another read is available
+     *
+     * @return true if another read can be fetched
+     */
+    public boolean hasNext() {
+        return iterator.hasNext();
+    }
+
+    /**
+     * fetch the next read and wrap it, together with a single-read locus context, in a datum
+     *
+     * @return the datum for the next read
+     */
+    public ReadDatum next() {
+        final SAMRecord record = iterator.next();
+
+        // where the read sits on the genome
+        final GenomeLoc position = Utils.genomicLocationOf(record);
+
+        // a context holding just this read; the offset of a single read is always 0
+        final LocusContext singleReadContext =
+                new LocusContext(position, Arrays.asList(record), Arrays.asList(0));
+
+        // hand back the read together with its context
+        return new ReadDatum(record, singleReadContext);
+    }
+
+    /** forward the removal request to the underlying iterator */
+    public void remove() {
+        iterator.remove();
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java
new file mode 100644
index 000000000..53b105dcf
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java
@@ -0,0 +1,48 @@
+package org.broadinstitute.sting.gatk.dataSources.chunks;
+
+import edu.mit.broad.picard.sam.MergingSamRecordIterator;
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: aaronmckenna
+ * Date: Mar 29, 2009
+ * Time: 8:47:50 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class SAMDataShard implements DataShard {
+
+    // the merged SAM record stream this shard reads from
+    final private MergingSamRecordIterator iterator;
+
+    // divide by reads or by loci (not consulted anywhere in this class yet)
+    private boolean byReads = true;
+
+    // records handed out so far, and the most this shard may hand out
+    private int lengthCount = 0;
+    private final int limiter;
+
+    public SAMDataShard(MergingSamRecordIterator iterator, int limiter) {
+        this.iterator = iterator;
+        this.limiter = limiter;
+    }
+
+    public SAMDataShard(MergingSamRecordIterator iterator) {
+        this(iterator, Integer.MAX_VALUE); // effectively unbounded
+    }
+
+    /** @return true while records remain and fewer than limiter have been handed out */
+    public boolean hasNext() {
+        // the comparison used to be "lengthCount > limiter", which is false on a fresh shard
+        return iterator.hasNext() && lengthCount < limiter;
+    }
+
+    public SAMRecord next() {
+        ++lengthCount;
+        return iterator.next();
+    }
+
+    public void remove() {
+        iterator.remove();
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java
new file mode 100644
index 000000000..6609e830a
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java
@@ -0,0 +1,35 @@
+package org.broadinstitute.sting.gatk.dataSources.datum;
+
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.Serializable;
+/**
+ *
+ * User: aaron
+ * Date: Mar 30, 2009
+ * Time: 1:32:34 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Mar 30, 2009
+ *
+ * interface Datum
+ *
+ * The interface for all Datum Types.
+ */
+public interface Datum extends Serializable {
+
+    /** @return the genome location of this datum; used for tracking where we are in a genome */
+    GenomeLoc getSequenceLocation();
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java
new file mode 100644
index 000000000..1ab986732
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java
@@ -0,0 +1,94 @@
+package org.broadinstitute.sting.gatk.dataSources.datum;
+
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.List;
+
+/**
+ *
+ * User: aaron
+ * Date: Mar 30, 2009
+ * Time: 3:08:28 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Mar 30, 2009
+ *
+ * Class LocusDatum
+ *
+ * The datum for loci. It contains the reference base, locusContext,
+ * and the reference order data.
+ */
+public class LocusDatum implements Datum {
+
+    // reference-ordered data gathered for this locus
+    private final List rodData;
+    // reference base at this locus
+    private final char ref;
+    // locus context this datum was built around
+    private final LocusContext context;
+
+    /**
+     * bundle the three pieces of per-locus information into one datum
+     *
+     * @param rodData reference-ordered data for the locus; the list is stored as given, not copied
+     * @param ref the reference base
+     * @param context the locus context, which also supplies the datum's location
+     */
+    public LocusDatum(List rodData, char ref, LocusContext context) {
+        this.context = context;
+        this.ref = ref;
+        this.rodData = rodData;
+    }
+
+    /**
+     * hand back the reference-ordered data list given at construction
+     *
+     * @return the stored list itself, so changes to it are visible through this datum
+     */
+    public List getRodData() {
+        return this.rodData;
+    }
+
+    /**
+     * hand back the reference base
+     *
+     * @return the reference base as a char
+     */
+    public char getRef() {
+        return this.ref;
+    }
+
+    /**
+     * hand back the locus context given at construction
+     *
+     * @return the stored locus context
+     */
+    public LocusContext getContext() {
+        return this.context;
+    }
+
+    /**
+     * report where this datum sits in the sequence; the answer is
+     * delegated to the locus context rather than stored separately
+     *
+     * @return the location reported by the locus context
+     */
+    public GenomeLoc getSequenceLocation() {
+        return context.getLocation();
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java
new file mode 100644
index 000000000..5552af496
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java
@@ -0,0 +1,65 @@
+package org.broadinstitute.sting.gatk.dataSources.datum;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.Utils;
+/**
+ *
+ * User: aaron
+ * Date: Mar 30, 2009
+ * Time: 2:53:37 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ *
+ */
+
+
+/**
+ * @author aaron
+ * @version 1.0
+ * @date Mar 30, 2009
+ *
+ * Class ReadDatum
+ *
+ * The base read datum class.
+ */
+public class ReadDatum implements Datum {
+
+    // the read this datum wraps
+    final private SAMRecord sam;
+
+    // locus context supplied alongside the read
+    final private LocusContext locus;
+
+    /** pair a SAM read with its locus context; both references are stored as given */
+    public ReadDatum(SAMRecord r, LocusContext locus) {
+        this.locus = locus;
+        this.sam = r;
+    }
+
+    /** @return the SAM record passed to the constructor */
+    public SAMRecord getRead() {
+        return sam;
+    }
+
+    /** @return the locus context passed to the constructor */
+    public LocusContext getLocus() {
+        return locus;
+    }
+
+    /**
+     * work out the region covered by the read; computed from the read on each call
+     *
+     * @return the genome location that Utils.genomicLocationOf derives from the read
+     */
+    public GenomeLoc getSequenceLocation() {
+        return Utils.genomicLocationOf(this.sam);
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
new file mode 100644
index 000000000..61a913892
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
@@ -0,0 +1,117 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+import edu.mit.broad.picard.io.IoUtil;
+import edu.mit.broad.picard.sam.MergingSamRecordIterator;
+import edu.mit.broad.picard.sam.SamFileHeaderMerger;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMFileWriter;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * User: aaron
+ * Date: Mar 26, 2009
+ * Time: 2:36:16 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public class SAMDataSource implements SimpleDataSource {
+    /** the sort order the input SAM files are expected to have */
+    private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
+
+    // our sam file readers, one per input file
+    private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();
+
+    // true while every file opened so far is in SORT_ORDER
+    private boolean matchedSortOrders = true;
+
+    // our record iterator, we use it to iterate over all the reads
+    private MergingSamRecordIterator iterator = null;
+
+    // we may want to write out the file (never assigned in this class yet)
+    private SAMFileWriter out = null;
+
+    // true = divide by locus, false = divide by read
+    private boolean locusMode = true;
+
+    /**
+     * constructor for multiple sam files
+     * @param samfiles paths of the SAM files to open
+     * @throws FileNotFoundException if any of the paths does not exist
+     */
+    public SAMDataSource(ArrayList<String> samfiles) throws FileNotFoundException {
+        loadFiles(samfiles);
+    }
+
+    /**
+     * check that every named file exists, open a reader on each, and build the merging iterator
+     */
+    private void loadFiles(ArrayList<String> samfiles) throws FileNotFoundException {
+        // verify the list passed to the class, keeping each file that exists
+        ArrayList<File> INPUT = new ArrayList<File>();
+        for (String check : samfiles) {
+            File nf = new File(check);
+            if (!nf.exists()) {
+                throw new FileNotFoundException(check + " doesn't exist");
+            }
+            // previously nothing was ever stored here, so no file was opened below
+            INPUT.add(nf);
+        }
+
+        // Open the files for reading; the readers go into the field rather than a shadowing local
+        for (File inFile : INPUT) {
+            IoUtil.assertFileIsReadable(inFile);
+            SAMFileReader in = new SAMFileReader(inFile);
+            readers.add(in);
+            matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
+        }
+
+        // If all the input sort orders match the expected sort order the files are merged as they
+        // are; otherwise they are merged as unsorted and the expected order is set on the merged header
+        if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) {
+            SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
+            iterator = new MergingSamRecordIterator(headerMerger);
+        } else {
+            SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted);
+            iterator = new MergingSamRecordIterator(headerMerger);
+            SAMFileHeader header = headerMerger.getMergedHeader();
+            header.setSortOrder(SORT_ORDER);
+        }
+    }
+
+    /**
+     * constructor, given a single sam file
+     * @param samFile path of the SAM file to open
+     * @throws FileNotFoundException if the path does not exist
+     */
+    public SAMDataSource(String samFile) throws FileNotFoundException {
+        ArrayList<String> samfiles = new ArrayList<String>();
+        samfiles.add(samFile);
+        loadFiles(samfiles);
+    }
+
+    /**
+     * chunking is not implemented yet: this method currently does nothing
+     * with the requested count
+     *
+     * @param chunkCount the recommended number of chunks (ignored for now)
+     */
+    public void chunk(int chunkCount) {
+
+    }
+
+    /** set this source to divide on reads */
+    public void setToReadMode() {
+        locusMode = false;
+    }
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java
new file mode 100644
index 000000000..ea2916159
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java
@@ -0,0 +1,30 @@
+package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
+
+import java.io.Serializable;
+
+/**
+ * User: aaron
+ * Date: Mar 26, 2009
+ * Time: 2:39:05 PM
+ *
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public interface SimpleDataSource extends Serializable {
+
+    /**
+     * suggest how many chunks the data should be broken into. The number is
+     * only a recommendation: whether or not one is given, the chunking data
+     * source can make decisions to chunk differently.
+     *
+     * @param chunkCount the recommended number of chunks
+     */
+    void chunk(int chunkCount);
+
+
+}