From 97d14abe85e2451f81620292914885b7031ae029 Mon Sep 17 00:00:00 2001 From: aaron Date: Mon, 6 Apr 2009 21:14:19 +0000 Subject: [PATCH] Interface check-in for Matt git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@300 348d0f76-0448-11de-a6fe-93d51630548a --- .../ReferenceDataSource.java | 70 +++++++++++ .../ReferenceMetaDataSource.java | 118 ++++++++++++++++++ .../simpleDataSources/SAMBAMDataSource.java | 114 +++++++++++++++++ .../simpleDataSources/SAMDataSource.java | 117 ----------------- .../simpleDataSources/SimpleDataSource.java | 23 ++-- .../SimpleDataSourceLoadException.java | 33 +++++ .../SimpleDataSourceSplitException.java | 34 +++++ 7 files changed, 384 insertions(+), 125 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java create mode 100644 java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java new file mode 100644 index 000000000..292caf010 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceDataSource.java @@ -0,0 +1,70 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.utils.FastaSequenceFile2; 
+import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; + +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 3:55:21 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Class ReferenceDataSource + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class ReferenceDataSource implements SimpleDataSource { + + final protected FastaSequenceFile2 refFile; + final protected ReferenceIterator refIter; + + /** + * Query the data source for a region of interest, specified by the genome location. + * The iterator will generate successive calls + * + * @param location the genome location to extract data for + * @return an iterator of the appropriate type, that is limited by the region + */ + public ReferenceIterator seek(GenomeLoc location) { + ReferenceIterator refSite = refIter.seekForward(location); + return refSite; + } + + /** + * Constructor - ReferenceDataSource + * + * @param refFileName the reference file + * @throws SimpleDataSourceLoadException + */ + public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException { + if (refFileName == null) { + throw new SimpleDataSourceLoadException("ReferenceDataSource: refFileName passed in is null"); + } + File infile = new File(refFileName); + if (!infile.canRead()) { + throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName); + } + refFile = new FastaSequenceFile2(new File(refFileName)); + refIter = new ReferenceIterator(this.refFile); + + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java new file mode 100644 index 000000000..54fbec295 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/ReferenceMetaDataSource.java @@ -0,0 +1,118 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import 
org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.*; + +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 4:33:10 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Class ReferenceMetaDataSource + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class ReferenceMetaDataSource implements SimpleDataSource { + + // our enumerated types + public enum RODTYPE { + DBSNP, HAPMAP + } + + // these could go on the stack, but a heap copy isn't too bad + private List myData = null; + private List.RODIterator> rodIters = null; + private List> rods = null; + + /** + * Prepare the list of reference ordered data iterators for each of the rods + * + * @return A list of ROD iterators for getting data from each ROD + */ + protected List.RODIterator> initializeRODs() { + // set up reference ordered data + rodIters = new ArrayList.RODIterator>(); + for (ReferenceOrderedData data : rods) { + rodIters.add(data.iterator()); + } + return rodIters; + } + + /** + * Builds a list of the reference ordered datum at loc from each of the iterators. This function + * assumes you are accessing the data in order. You can't use this function for random access. Each + * successive call moves you along the file, consuming all data before loc. + * + * @param rodIters Iterators to access the RODs + * @param loc The location to get the rods at + * @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list + */ + protected List getReferenceOrderedDataAtLocus(List.RODIterator> rodIters, + final GenomeLoc loc) { + List data = new ArrayList(); + for (ReferenceOrderedData.RODIterator iter : rodIters) { + data.add(iter.seekForward(loc)); + } + return data; + } + + /** + * Query the data source for a region of interest, specified by the genome location. 
+ * The iterator will generate successive calls + * + * @param location the genome location to extract data for + * @return an iterator of the appropriate type, that is limited by the region + */ + public Iterator seek(GenomeLoc location) { + myData = getReferenceOrderedDataAtLocus(rodIters, location); + return myData.iterator(); + } + + public ReferenceMetaDataSource(HashMap files) { + + // setup a rod list + List> rods = new ArrayList>(); + + // cycle through the passed in rod's + + Set fileNames = files.keySet(); + for (String file : fileNames) { + switch (files.get(file)) { + + case DBSNP: { + ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(file), rodDbSNP.class); + //dbsnp.testMe(); + rods.add(dbsnp); // { gff, dbsnp }; + } + case HAPMAP: { + ReferenceOrderedData hapmap = new ReferenceOrderedData(new File(file), HapMapAlleleFrequenciesROD.class); + //dbsnp.testMe(); + rods.add(hapmap); // { gff, dbsnp }; + } + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java new file mode 100644 index 000000000..795cab75f --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSource.java @@ -0,0 +1,114 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +import edu.mit.broad.picard.sam.SamFileHeaderMerger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

+ * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class SAMBAMDataSource implements SimpleDataSource { + /** our SAM data files */ + private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; + + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class); + + // our sam file readers + private final ArrayList readers = new ArrayList(); + + // do we care that the SAM files respect the sort order. + private boolean matchedSortOrders = true; + + // our merged sam iterator for spliting up the files + MergingSamRecordIterator2 mergeIterator; + + // are we set to locus mode or read mode for dividing + private boolean locusMode = true; + + // How strict should we be with SAM/BAM parsing? + protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT; + + /** + * constructor, given a single sam file + * + * @param samFiles the list of sam files + */ + public SAMBAMDataSource(List samFiles) throws SimpleDataSourceLoadException { + List readers = new ArrayList(); + for (String fileName : samFiles) { + File smFile = new File(fileName); + if (!smFile.canRead()) { + throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName); + } + SAMFileReader reader = initializeSAMFile(smFile); + if (reader != null) { + readers.add(reader); + } + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); + this.mergeIterator = new MergingSamRecordIterator2(headerMerger); + } + + + protected SAMFileReader initializeSAMFile(final File samFile) { + if (samFile.toString().endsWith(".list")) { + return null; + } else { + SAMFileReader samReader = new SAMFileReader(samFile, true); + samReader.setValidationStringency(strictness); + + final 
SAMFileHeader header = samReader.getFileHeader(); + logger.info(String.format("Sort order is: " + header.getSortOrder())); + + return samReader; + } + } + + + /** + * set the mode to by loci, which let's you duplicate reads, but never at a single + * locus, or false for read mode where no read is seen twice. + * + * @param tr true if by loci, false if by read + */ + public void setToByLociMode(boolean tr) { + locusMode = tr; + } + + /** + *

+ * seek - get an iterator for the requested genome region + *

+ * + * @param location the genome location to extract data for + * @return an iterator for that region + */ + public MergingSamRecordIterator2 seek(GenomeLoc location) { + MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(this.mergeIterator); + if (locusMode) { + iter.query(location.getContig(), (int) location.getStart(), (int) location.getStop(), true); + } else { + iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop()); + } + return iter; //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java deleted file mode 100644 index 61a913892..000000000 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ /dev/null @@ -1,117 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; - -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.sam.MergingSamRecordIterator; -import edu.mit.broad.picard.sam.SamFileHeaderMerger; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMFileWriter; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; - -/** - * User: aaron - * Date: Mar 26, 2009 - * Time: 2:36:16 PM - *

- * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - *

- * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -public class SAMDataSource implements SimpleDataSource { - /** our SAM data files */ - private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; - - // our sam file readers - private final ArrayList readers = new ArrayList(); - - // do we care that the SAM files respect the sort order. - private boolean matchedSortOrders = true; - - // our record iterator, we use it to iterate over all the reads - private MergingSamRecordIterator iterator = null; - - // we may want to write out the file - private SAMFileWriter out = null; - - // are we set to locus mode or read mode for dividing - private boolean locusMode = true; - - /** - * constructor for multiple sam files - * - * @param samfiles - */ - public SAMDataSource(ArrayList samfiles) throws FileNotFoundException { - loadFiles(samfiles); - } - - private void loadFiles(ArrayList samfiles) throws FileNotFoundException { - // verify the list passed to the class - ArrayList INPUT = new ArrayList(); - for (String check : samfiles) { - File nf = new File(check); - if (!nf.exists()) { - throw new FileNotFoundException(check + " doesn't exist"); - } - } - - - // Open the files for reading and writing - - List readers = new ArrayList(); - for (File inFile : INPUT) { - IoUtil.assertFileIsReadable(inFile); - SAMFileReader in = new SAMFileReader(inFile); - readers.add(in); - matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER; - } - - // If all the input sort orders match the output sort order then just merge them and - // write on the fly, otherwise setup to merge and sort before writing out the final file - if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) { - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); - iterator = new 
MergingSamRecordIterator(headerMerger); - - } else { - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted); - iterator = new MergingSamRecordIterator(headerMerger); - SAMFileHeader header = headerMerger.getMergedHeader(); - header.setSortOrder(SORT_ORDER); - - } - } - - /** - * constructor, given a single sam file - * - * @param samFile - */ - public SAMDataSource(String samFile) throws FileNotFoundException { - ArrayList samfiles = new ArrayList(); - samfiles.add(samFile); - loadFiles(samfiles); - } - - /** - * Chunk the sam file at appropriate locations, given the chunk count - * - * @param chunkCount - * @return - */ - public void chunk(int chunkCount) { - - } - - /** set this source to divide on reads */ - public void setToReadMode() { - locusMode = true; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java index ea2916159..408672f38 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSource.java @@ -1,6 +1,10 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; +import org.broadinstitute.sting.utils.GenomeLoc; + import java.io.Serializable; +import java.util.Iterator; + /** * User: aaron @@ -14,17 +18,20 @@ import java.io.Serializable; *

* This software is supplied without any warranty or guaranteed support whatsoever. Neither * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * */ + +/** This class is the interface for all data sources */ public interface SimpleDataSource extends Serializable { - /** - * recommend how many data chunks we should be breaking the file into, - * as a recommendated number. If not specified (and even if specified) - * the chunking data source can make decisions to chunk differently. - * - * @param chunkCount - */ - public void chunk(int chunkCount); + /** + * Query the data source for a region of interest, specified by the genome location. + * The iterator will generate successive calls + * + * @param location the genome location to extract data for + * @return an iterator of the appropriate type, that is limited by the region + */ + public Iterator seek(GenomeLoc location); } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java new file mode 100644 index 000000000..cd9ecce7c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceLoadException.java @@ -0,0 +1,33 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +/** + * + * User: aaron + * Date: Apr 6, 2009 + * Time: 4:21:58 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 6, 2009 + *

+ * Class SimpleDataSourceLoadException + *

+ * Generate this on a simple data source load exception + */ +public class SimpleDataSourceLoadException extends Exception { + public SimpleDataSourceLoadException(String msg) { + super(msg); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java new file mode 100644 index 000000000..f30e6998e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SimpleDataSourceSplitException.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.gatk.dataSources.simpleDataSources; + +/** + * + * User: aaron + * Date: Apr 1, 2009 + * Time: 11:08:06 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 1, 2009 + *

+ * Class SimpleDataSourceSplitException + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class SimpleDataSourceSplitException extends Exception { + public SimpleDataSourceSplitException(String msg) { + super(msg); + } + +}