From ba99e9f6487f9e028837e7b45ef2ed2b8b065f77 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 31 Mar 2009 00:04:03 +0000 Subject: [PATCH] checking in some of the more static Data Source dependent code at this point. They don't do much on their own, but are need for the base data source code I'm writing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@231 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/dataSources/DataSource.java | 21 ++++ .../gatk/dataSources/DataSourceBuilder.java | 74 +++++++++++ .../DataSourceGenerationException.java | 24 ++++ .../gatk/dataSources/ReadDataSource.java | 48 +++++++ .../dataSources/chunks/BasicDataShard.java | 36 ++++++ .../gatk/dataSources/chunks/DataShard.java | 19 +++ .../gatk/dataSources/chunks/LociShard.java | 99 +++++++++++++++ .../gatk/dataSources/chunks/ReadShard.java | 85 +++++++++++++ .../gatk/dataSources/chunks/SAMDataShard.java | 48 +++++++ .../sting/gatk/dataSources/datum/Datum.java | 35 ++++++ .../gatk/dataSources/datum/LocusDatum.java | 94 ++++++++++++++ .../gatk/dataSources/datum/ReadDatum.java | 65 ++++++++++ .../simpleDataSources/SAMDataSource.java | 117 ++++++++++++++++++ .../simpleDataSources/SimpleDataSource.java | 30 +++++ 14 files changed, 795 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/ReadDataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/chunks/LociShard.java create mode 100644 
java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java new file mode 100644 index 000000000..2ddcea799 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSource.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.dataSources; + +import org.broadinstitute.sting.gatk.dataSources.chunks.DataShard; + +/** + * User: aaron + * Date: Mar 25, 2009 + * Time: 6:20:00 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public interface DataSource { + + public DataShard toChunk(int chunkCount); +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java new file mode 100644 index 000000000..ecbf23584 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceBuilder.java @@ -0,0 +1,74 @@ +package org.broadinstitute.sting.gatk.dataSources; + +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; + +import java.util.ArrayList; +import java.io.File; + +/** + * User: aaron + * Date: Mar 25, 2009 + * Time: 4:51:39 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class DataSourceBuilder { + + // storage for the passed file + ArrayList passFiles = new ArrayList(); + + public DataSourceBuilder() { + + } + + /** + * add a file used to generate the data sources + * + * @param fileName the filename that should be used + */ + public void addDataFile(String fileName) { + // for now, just add it to the internal file list + passFiles.add(new File(fileName)); + } + + /** + * add a file used to generate the data sources + * + * @param file the filename that should be used + */ + public void addDataFile(File file) { + // for now, just add it to the internal file list + passFiles.add(file); + } + + public DataSource build(Walker inputWalker) { + if (inputWalker instanceof ReadWalker) { + + } + + return null; + } + + + /** + * this section contains the private methods to create data sources + * based on the type of walker we're passed in. + */ + + + /** + * we know we have a read data source, let's get the + * @return + */ + //private ReadDataSource generateReadDataSource() { + // + //} + +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java new file mode 100644 index 000000000..711335c41 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/DataSourceGenerationException.java @@ -0,0 +1,24 @@ +package org.broadinstitute.sting.gatk.dataSources; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 9:25:49 AM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

/*
 * This software is supplied without any warranty or guaranteed support whatsoever. Neither
 * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
 */

/**
 * This exception is thrown when we're unable to generate a data source,
 * most likely due to an incomplete input source list.
 *
 * Besides the original no-argument form, the exception can carry a message and/or the
 * underlying cause so that the reason for the failure is not lost.
 */
public class DataSourceGenerationException extends Exception {

    // pinned so that adding constructors does not silently change the serialized form again
    private static final long serialVersionUID = 1L;

    /** Creates the exception without a message or cause. */
    public DataSourceGenerationException() {
        super();
    }

    /**
     * @param message description of why the data source could not be generated
     */
    public DataSourceGenerationException(String message) {
        super(message);
    }

    /**
     * @param message description of why the data source could not be generated
     * @param cause   the underlying failure, kept for the stack trace
     */
    public DataSourceGenerationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause the underlying failure, kept for the stack trace
     */
    public DataSourceGenerationException(Throwable cause) {
        super(cause);
    }
}

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class ReadDataSource { + + /** + * our SAM data files + */ + // our SAM reader + private SAMFileReader samReader = null; + // iterator over the sam records in the readsFile + private Iterator samReadIter = null; + + // The verifying iterator, it does checking + VerifyingSamIterator verifyingSamReadIter = null; + + + /** + * our reference data source + */ + // The reference data -- filename, refSeqFile, and iterator + private File refFileName = null; // the name of the reference file + //private ReferenceSequenceFile refFile = null; + private FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard! + private ReferenceIterator refIter = null; +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java new file mode 100644 index 000000000..76f111674 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/BasicDataShard.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.gatk.dataSources.chunks; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Mar 29, 2009 + * Time: 8:35:16 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class BasicDataShard implements DataShard { + + List list = new ArrayList(); + int index = 0; + + public BasicDataShard(List list) { + this.list = list; + } + + public boolean hasNext() { + if (list.size() > index) { + return true; + } + return false; + } + + public T next() { + return list.get(index); + } + + public void remove() { + list.remove(index); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java new file mode 100644 index 000000000..d40f9b6f3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/DataShard.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.gatk.dataSources.chunks; + +import java.util.Iterator; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:43:04 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

/*
 * This software is supplied without any warranty or guaranteed support whatsoever. Neither
 * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
 */

/**
 * A shard of data, consumed as an iterator over its elements.
 *
 * The interface adds no methods of its own; it only names the role. The element type is
 * carried by the type parameter so that implementations can return their concrete datum
 * type from {@code next()} without casts at the call site.
 *
 * @param <T> the type of element the shard yields
 */
public interface DataShard<T> extends Iterator<T> {
}

+ * Class LociShard + *

+ * This is the loci shard, which are collectively made when a shatter call is made to + * a data source. + */ +public class LociShard implements DataShard { + + // our locusIterator + private final LocusIterator locusIterator; + + // our reference locusIterator + private final ReferenceIterator refIterator; + + // Iterator over rods + private final List rodIters; + + // the max number of iterations + private final int maxCount; + + // how many iterations we've had + private int iterCount = 0; + + public LociShard(LocusIterator locusIterator, ReferenceIterator refIterator, List rodIters, int maxCount) { + this.locusIterator = locusIterator; + this.maxCount = maxCount; + this.refIterator = refIterator; + this.rodIters = rodIters; + } + + public boolean hasNext() { + return locusIterator.hasNext() && maxCount > iterCount; + } + + public LocusDatum next() { + LocusContext locus = locusIterator.next(); + ReferenceIterator refSite = refIterator.seekForward(locus.getLocation()); + locus.setReferenceContig(refSite.getCurrentContig()); + // Iterate forward to get all reference ordered data covering this locus + final List rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation()); + return new LocusDatum(rodData, refSite.getBaseAsChar(), locus); + } + + public void remove() { + locusIterator.remove(); + } + + /** + * Builds a list of the reference ordered datum at loc from each of the iterators. This function + * assumes you are accessing the data in order. You can't use this function for random access. Each + * successive call moves you along the file, consuming all data before loc. + * + * @param rodIters Iterators to access the RODs + * @param loc The location to get the rods at + * @return A list of ReferenceOrderDatum at loc. 
ROD without a datum at loc will be null in the list + */ + protected List getReferenceOrderedDataAtLocus(List rodIters, + final GenomeLoc loc) { + List data = new ArrayList(); + for (ReferenceOrderedData.RODIterator iter : rodIters) { + data.add(iter.seekForward(loc)); + } + return data; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java new file mode 100644 index 000000000..a083ce45d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/ReadShard.java @@ -0,0 +1,85 @@ +package org.broadinstitute.sting.gatk.dataSources.chunks; + +import edu.mit.broad.picard.sam.MergingSamRecordIterator; +import org.broadinstitute.sting.gatk.dataSources.datum.ReadDatum; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.Arrays; + +/** + * + * User: aaron + * Date: Mar 30, 2009 + * Time: 5:45:51 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Mar 30, 2009 + *

+ * Class ReadShard + *

+ * A read data shard. + */ +public class ReadShard implements DataShard { + + private MergingSamRecordIterator iterator; + + /** + * create the data chunk with an iterator, and a limiter + * + * @param samIterator + */ + public ReadShard(MergingSamRecordIterator samIterator) { + this.iterator = samIterator; + } + + /** + * do we have a next data point + * + * @return true if we have a data point + */ + public boolean hasNext() { + return iterator.hasNext(); + } + + public ReadDatum next() { + // get the read + final SAMRecord read = iterator.next(); + + // put the read into a list + final List reads = Arrays.asList(read); + + // put together the genome location + final GenomeLoc loc = Utils.genomicLocationOf(read); + + // Offset of a single read is always 0 + List offsets = Arrays.asList(0); + + // create the locus + final LocusContext locus = new LocusContext(loc, reads, offsets); + + // return the read datum + return new ReadDatum(read, locus); + } + + /** remove the current pointed to data source */ + public void remove() { + iterator.remove(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java new file mode 100644 index 000000000..53b105dcf --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/chunks/SAMDataShard.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.gatk.dataSources.chunks; + +import edu.mit.broad.picard.sam.MergingSamRecordIterator; +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Mar 29, 2009 + * Time: 8:47:50 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class SAMDataShard implements DataShard { + + // our iterator + final private MergingSamRecordIterator iterator; + + // divide by reads or by loci + private boolean byReads = true; + + // iterator bounds limiter + private int lengthCount = 0; + private final int limiter; + + public SAMDataShard(MergingSamRecordIterator iterator, int limiter) { + this.iterator = iterator; + this.limiter = limiter; + } + + public SAMDataShard(MergingSamRecordIterator iterator) { + this.iterator = iterator; + limiter = Integer.MAX_VALUE; + } + + + public boolean hasNext() { + return iterator.hasNext() && lengthCount > limiter; + } + + public SAMRecord next() { + ++lengthCount; + return iterator.next(); + } + + public void remove() { + iterator.remove(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java new file mode 100644 index 000000000..6609e830a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/Datum.java @@ -0,0 +1,35 @@ +package org.broadinstitute.sting.gatk.dataSources.datum; + +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.Serializable; +/** + * + * User: aaron + * Date: Mar 30, 2009 + * Time: 1:32:34 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * @author aaron + * @version 1.0 + * @date Mar 30, 2009 + *

+ * interface Datum + *

+ * The interface for all Datum Types. + */ +public interface Datum extends Serializable { + + // this function is used for tracking where we are in a genome + public GenomeLoc getSequenceLocation(); +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java new file mode 100644 index 000000000..1ab986732 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/LocusDatum.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.gatk.dataSources.datum; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; + +/** + * + * User: aaron + * Date: Mar 30, 2009 + * Time: 3:08:28 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Mar 30, 2009 + *

+ * Class LocusDatum + *

+ * The datum for loci. It contains the reference base, locusContext, + * and the reference order data. + */ +public class LocusDatum implements Datum { + + // our reference order data + private final List rodData; + // our seq base + private final char ref; + // our locus context + private final LocusContext context; + + /** + * the locus dataum constructor + * + * @param rodData our reference data + * @param ref our reference sequence base position + * @param context the genome context we're in + */ + public LocusDatum(List rodData, char ref, LocusContext context) { + this.rodData = rodData; + this.ref = ref; + this.context = context; + } + + /** + * return the Reference order data for this position + * + * @return + */ + public List getRodData() { + return rodData; + } + + /** + * return the reference base + * + * @return a character representing the reference base + */ + public char getRef() { + return ref; + } + + /** + * get the locus context at the current position + * + * @return + */ + public LocusContext getContext() { + return context; + } + + /** + * gets the current postion in the sequence, which comes + * free from underlying data types + * + * @return our current GenomeLocation + */ + public GenomeLoc getSequenceLocation() { + return this.context.getLocation(); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java new file mode 100644 index 000000000..5552af496 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.dataSources.datum; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +/** + * + * User: aaron + * Date: Mar 30, 2009 + * Time: 2:53:37 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT 
+ * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Mar 30, 2009 + *

+ * Class ReadDatum + *

+ * The base read datum class. + */ +public class ReadDatum implements Datum { + + // our SAM record + final private SAMRecord sam; + + // our locus context + final private LocusContext locus; + + // the constructor, taking a sam read and a locus + public ReadDatum(SAMRecord r, LocusContext locus) { + this.sam = r; + this.locus = locus; + } + + // get the SAMRecord + public SAMRecord getRead() { + return this.sam; + } + + // get the locus context + public LocusContext getLocus() { + return this.locus; + } + + /** + * gets the region that our read spans + * + * @return a genome loc that details the region that our read spans. + */ + public GenomeLoc getSequenceLocation() { + return Utils.genomicLocationOf(sam); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java new file mode 100644 index 000000000..61a913892 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -0,0 +1,117 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import edu.mit.broad.picard.io.IoUtil; +import edu.mit.broad.picard.sam.MergingSamRecordIterator; +import edu.mit.broad.picard.sam.SamFileHeaderMerger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileWriter; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class SAMDataSource implements SimpleDataSource { + /** our SAM data files */ + private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; + + // our sam file readers + private final ArrayList readers = new ArrayList(); + + // do we care that the SAM files respect the sort order. + private boolean matchedSortOrders = true; + + // our record iterator, we use it to iterate over all the reads + private MergingSamRecordIterator iterator = null; + + // we may want to write out the file + private SAMFileWriter out = null; + + // are we set to locus mode or read mode for dividing + private boolean locusMode = true; + + /** + * constructor for multiple sam files + * + * @param samfiles + */ + public SAMDataSource(ArrayList samfiles) throws FileNotFoundException { + loadFiles(samfiles); + } + + private void loadFiles(ArrayList samfiles) throws FileNotFoundException { + // verify the list passed to the class + ArrayList INPUT = new ArrayList(); + for (String check : samfiles) { + File nf = new File(check); + if (!nf.exists()) { + throw new FileNotFoundException(check + " doesn't exist"); + } + } + + + // Open the files for reading and writing + + List readers = new ArrayList(); + for (File inFile : INPUT) { + IoUtil.assertFileIsReadable(inFile); + SAMFileReader in = new SAMFileReader(inFile); + readers.add(in); + matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER; + } + + // If all the input sort orders match the output sort order then just merge them and + // write on the fly, otherwise setup to merge and sort before writing out the final file + if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) { + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); + iterator = new 
MergingSamRecordIterator(headerMerger); + + } else { + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted); + iterator = new MergingSamRecordIterator(headerMerger); + SAMFileHeader header = headerMerger.getMergedHeader(); + header.setSortOrder(SORT_ORDER); + + } + } + + /** + * constructor, given a single sam file + * + * @param samFile + */ + public SAMDataSource(String samFile) throws FileNotFoundException { + ArrayList samfiles = new ArrayList(); + samfiles.add(samFile); + loadFiles(samfiles); + } + + /** + * Chunk the sam file at appropriate locations, given the chunk count + * + * @param chunkCount + * @return + */ + public void chunk(int chunkCount) { + + } + + /** set this source to divide on reads */ + public void setToReadMode() { + locusMode = true; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java new file mode 100644 index 000000000..ea2916159 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import java.io.Serializable; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:39:05 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

/*
 * This software is supplied without any warranty or guaranteed support whatsoever. Neither
 * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
 */

/**
 * A serializable data source that can be told how many chunks it should be broken into.
 */
public interface SimpleDataSource extends Serializable {

    /**
     * Recommends how many data chunks the file should be broken into. The number is a
     * recommendation only: if not specified (and even if specified) the chunking data
     * source can make decisions to chunk differently.
     *
     * @param chunkCount the recommended number of chunks
     */
    void chunk(int chunkCount);
}