Interface check-in for Matt

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@300 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-04-06 21:14:19 +00:00
parent 820cf09198
commit 97d14abe85
7 changed files with 384 additions and 125 deletions

View File

@ -0,0 +1,70 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
/**
*
* User: aaron
* Date: Apr 6, 2009
* Time: 3:55:21 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * A simple data source built on a single reference file.
 * <p/>
 * The file named at construction time is wrapped in a {@link FastaSequenceFile2},
 * and one {@link ReferenceIterator} is created over it. Every call to
 * {@link #seek(GenomeLoc)} is answered by that same iterator.
 *
 * @author aaron
 * @version 1.0
 * @date Apr 6, 2009
 */
public class ReferenceDataSource implements SimpleDataSource {

    /** the reference file this data source reads from */
    protected final FastaSequenceFile2 refFile;

    /** the single iterator over {@link #refFile} that all seeks go through */
    protected final ReferenceIterator refIter;

    /**
     * Constructor - ReferenceDataSource
     *
     * @param refFileName path of the reference file
     * @throws SimpleDataSourceLoadException if the name is null or the file is not readable
     */
    public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException {
        if (refFileName == null) {
            throw new SimpleDataSourceLoadException("ReferenceDataSource: refFileName passed in is null");
        }

        final File referencePath = new File(refFileName);
        if (!referencePath.canRead()) {
            throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName);
        }

        this.refFile = new FastaSequenceFile2(referencePath);
        this.refIter = new ReferenceIterator(this.refFile);
    }

    /**
     * Query the data source for a region of interest, specified by the genome location.
     * The request is delegated to the shared iterator's <code>seekForward</code>.
     *
     * @param location the genome location to extract data for
     * @return whatever the shared reference iterator returns from seeking forward to the location
     */
    public ReferenceIterator seek(GenomeLoc location) {
        return refIter.seekForward(location);
    }
}

View File

@ -0,0 +1,118 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
import java.util.*;
/**
*
* User: aaron
* Date: Apr 6, 2009
* Time: 4:33:10 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * A simple data source over a set of reference ordered data (ROD) files.
 * <p/>
 * Each file handed to the constructor is loaded as one {@link ReferenceOrderedData}
 * of the requested {@link RODTYPE}. A {@link #seek(GenomeLoc)} asks every ROD for its
 * datum at the location and returns the answers as one iterator.
 *
 * @author aaron
 * @version 1.0
 * @date Apr 6, 2009
 */
public class ReferenceMetaDataSource implements SimpleDataSource {

    /** the kinds of reference ordered data this source can load */
    public enum RODTYPE {
        DBSNP, HAPMAP
    }

    // the data handed out by the most recent seek
    private List<ReferenceOrderedDatum> myData = null;

    // one iterator per entry of rods; built lazily by the first seek
    private List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> rodIters = null;

    // the reference ordered data sets loaded by the constructor
    private final List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods;

    /**
     * Load one reference ordered data set per entry of the map.
     * <p/>
     * The RODs are stored in the map's iteration order, which for a HashMap is
     * unspecified; the data returned by {@link #seek(GenomeLoc)} follows the same order.
     *
     * @param files maps the path of each ROD file to the type it should be loaded as
     */
    public ReferenceMetaDataSource(HashMap<String, RODTYPE> files) {
        final List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> loaded =
                new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();

        for (Map.Entry<String, RODTYPE> entry : files.entrySet()) {
            final File rodFile = new File(entry.getKey());
            // every case ends in a break: a file is loaded as exactly one ROD type
            switch (entry.getValue()) {
                case DBSNP:
                    loaded.add(new ReferenceOrderedData<rodDbSNP>(rodFile, rodDbSNP.class));
                    break;
                case HAPMAP:
                    loaded.add(new ReferenceOrderedData<HapMapAlleleFrequenciesROD>(rodFile, HapMapAlleleFrequenciesROD.class));
                    break;
            }
        }

        // store on the instance; initializeRODs() reads this field
        this.rods = loaded;
    }

    /**
     * Prepare the list of reference ordered data iterators for each of the rods.
     * Any iterators created by an earlier call are replaced.
     *
     * @return A list of ROD iterators for getting data from each ROD
     */
    protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> initializeRODs() {
        rodIters = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>();
        for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
            rodIters.add(data.iterator());
        }
        return rodIters;
    }

    /**
     * Builds a list of the reference ordered datum at loc from each of the iterators. This function
     * assumes you are accessing the data in order. You can't use this function for random access. Each
     * successive call moves you along the file, consuming all data before loc.
     *
     * @param rodIters Iterators to access the RODs
     * @param loc      The location to get the rods at
     * @return A list of ReferenceOrderDatum at loc, one entry per iterator. ROD without a datum at loc
     *         will be null in the list
     */
    protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> rodIters,
                                                                         final GenomeLoc loc) {
        List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
        for (ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator iter : rodIters) {
            data.add(iter.seekForward(loc));
        }
        return data;
    }

    /**
     * Query the data source for a region of interest, specified by the genome location.
     * The ROD iterators are created on the first call and reused by later calls.
     *
     * @param location the genome location to extract data for
     * @return an iterator over the data found at the location, one entry per loaded ROD
     */
    public Iterator<ReferenceOrderedDatum> seek(GenomeLoc location) {
        if (rodIters == null) {
            // nothing else sets the iterators up, so do it on first use
            initializeRODs();
        }
        myData = getReferenceOrderedDataAtLocus(rodIters, location);
        return myData.iterator();
    }
}

View File

@ -0,0 +1,114 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * A simple data source over one or more SAM/BAM files.
 * <p/>
 * Every readable input file gets its own {@link SAMFileReader}; the readers are
 * combined through a {@link SamFileHeaderMerger} into one merging iterator, and each
 * {@link #seek(GenomeLoc)} queries a copy of that iterator.
 */
public class SAMBAMDataSource implements SimpleDataSource {

    /** the sort order passed to the header merger */
    private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;

    /** our log, which we want to capture anything from this class */
    protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);

    // our sam file readers, one per input file that was opened
    private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();

    // do we care that the SAM files respect the sort order.
    private boolean matchedSortOrders = true;

    // our merged sam iterator for splitting up the files
    MergingSamRecordIterator2 mergeIterator;

    // are we set to locus mode or read mode for dividing
    private boolean locusMode = true;

    // How strict should we be with SAM/BAM parsing?
    protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;

    /**
     * constructor, given a list of sam files
     *
     * @param samFiles the list of sam files
     * @throws SimpleDataSourceLoadException if the list is null, or any entry is null or not readable
     */
    public SAMBAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
        if (samFiles == null) {
            throw new SimpleDataSourceLoadException("SAMBAMDataSource: samFiles passed in is null");
        }

        // check every file before opening any, so a bad entry late in the list
        // does not leave readers for the earlier files open
        final List<File> inputs = new ArrayList<File>();
        for (String fileName : samFiles) {
            if (fileName == null) {
                throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
            }
            final File smFile = new File(fileName);
            if (!smFile.canRead()) {
                throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
            }
            inputs.add(smFile);
        }

        // collect the readers in the instance field (not a shadowing local)
        for (File smFile : inputs) {
            final SAMFileReader reader = initializeSAMFile(smFile);
            if (reader != null) {
                readers.add(reader);
            }
        }

        final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
        this.mergeIterator = new MergingSamRecordIterator2(headerMerger);
    }

    /**
     * Open a reader for one input file, apply the configured validation stringency
     * and log the sort order found in its header.
     *
     * @param samFile the file to open
     * @return the reader, or null when the file name ends in ".list" (such files are skipped)
     */
    protected SAMFileReader initializeSAMFile(final File samFile) {
        if (samFile.toString().endsWith(".list")) {
            return null;
        }
        final SAMFileReader samReader = new SAMFileReader(samFile, true);
        samReader.setValidationStringency(strictness);
        final SAMFileHeader header = samReader.getFileHeader();
        // plain concatenation: the text is not a format string
        logger.info("Sort order is: " + header.getSortOrder());
        return samReader;
    }

    /**
     * set the mode to by loci, which lets you duplicate reads, but never at a single
     * locus, or false for read mode where no read is seen twice.
     *
     * @param tr true if by loci, false if by read
     */
    public void setToByLociMode(boolean tr) {
        locusMode = tr;
    }

    /**
     * Query a copy of the merged iterator for the given region.
     * <p/>
     * In locus mode the copy's <code>query</code> is used, otherwise its
     * <code>queryContained</code>. The start and stop of the location are narrowed to int
     * before being passed on.
     *
     * @param location the genome location to extract data for
     * @return an iterator for that region
     */
    public MergingSamRecordIterator2 seek(GenomeLoc location) {
        final MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(this.mergeIterator);
        final int start = (int) location.getStart();
        final int stop = (int) location.getStop();
        if (locusMode) {
            // NOTE(review): if the last argument means "contained only", as it does on
            // SAMFileReader.query, this branch behaves like the read-mode branch - confirm.
            iter.query(location.getContig(), start, stop, true);
        } else {
            iter.queryContained(location.getContig(), start, stop);
        }
        return iter;
    }
}

View File

@ -1,117 +0,0 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.sam.MergingSamRecordIterator;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * A simple data source over one or more SAM files, merged into a single
 * {@link MergingSamRecordIterator}.
 */
public class SAMDataSource implements SimpleDataSource {

    /** the sort order the merged data is expected to have */
    private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;

    // our sam file readers
    private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();

    // do we care that the SAM files respect the sort order.
    private boolean matchedSortOrders = true;

    // our record iterator, we use it to iterate over all the reads
    private MergingSamRecordIterator iterator = null;

    // we may want to write out the file
    private SAMFileWriter out = null;

    // are we set to locus mode or read mode for dividing
    private boolean locusMode = true;

    /**
     * constructor for multiple sam files
     *
     * @param samfiles the paths of the sam files to load
     * @throws FileNotFoundException if any of the files does not exist
     */
    public SAMDataSource(ArrayList<String> samfiles) throws FileNotFoundException {
        loadFiles(samfiles);
    }

    /**
     * constructor, given a single sam file
     *
     * @param samFile the path of the sam file to load
     * @throws FileNotFoundException if the file does not exist
     */
    public SAMDataSource(String samFile) throws FileNotFoundException {
        ArrayList<String> samfiles = new ArrayList<String>();
        samfiles.add(samFile);
        loadFiles(samfiles);
    }

    /**
     * Verify the files, open a reader for each and build the merging iterator.
     *
     * @param samfiles the paths of the sam files to load
     * @throws FileNotFoundException if any of the files does not exist
     */
    private void loadFiles(ArrayList<String> samfiles) throws FileNotFoundException {
        // verify the list passed to the class, remembering every file that exists
        final List<File> inputs = new ArrayList<File>();
        for (String check : samfiles) {
            final File nf = new File(check);
            if (!nf.exists()) {
                throw new FileNotFoundException(check + " doesn't exist");
            }
            inputs.add(nf);
        }

        // open the files for reading, collecting the readers in the instance field
        for (File inFile : inputs) {
            IoUtil.assertFileIsReadable(inFile);
            final SAMFileReader in = new SAMFileReader(inFile);
            readers.add(in);
            matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
        }

        // If all the input sort orders match the expected sort order then just merge them,
        // otherwise merge as unsorted and stamp the expected order on the merged header
        if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) {
            final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
            iterator = new MergingSamRecordIterator(headerMerger);
        } else {
            final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted);
            iterator = new MergingSamRecordIterator(headerMerger);
            final SAMFileHeader header = headerMerger.getMergedHeader();
            header.setSortOrder(SORT_ORDER);
        }
    }

    /**
     * Chunk the sam file at appropriate locations, given the chunk count.
     * Not implemented: this method currently does nothing.
     *
     * @param chunkCount the requested number of chunks (ignored)
     */
    public void chunk(int chunkCount) {
    }

    /** set this source to divide on reads */
    public void setToReadMode() {
        // locusMode == true is the by-locus default, so read mode is false
        locusMode = false;
    }
}

View File

@ -1,6 +1,10 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.Serializable;
import java.util.Iterator;
/**
* User: aaron
@ -14,17 +18,20 @@ import java.io.Serializable;
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/** The interface implemented by all simple data sources. */
public interface SimpleDataSource extends Serializable {
/**
* Recommend how many data chunks the file should be broken into.
* The count is advisory only: whether or not it is specified, the
* chunking data source can make decisions to chunk differently.
*
* @param chunkCount the recommended number of chunks
*/
public void chunk(int chunkCount);
/**
* Query the data source for a region of interest, specified by the genome location.
*
* @param location the genome location to extract data for
* @return an iterator of the appropriate type, that is limited by the region.
*         NOTE(review): declared as a raw Iterator; the element type is not fixed here.
*/
public Iterator seek(GenomeLoc location);
}

View File

@ -0,0 +1,33 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
/**
*
* User: aaron
* Date: Apr 6, 2009
* Time: 4:21:58 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Checked exception raised when a simple data source fails to load its input.
 * <p/>
 * Use the two-argument constructor when the failure was triggered by another
 * exception, so the original cause stays attached.
 *
 * @author aaron
 * @version 1.0
 * @date Apr 6, 2009
 */
public class SimpleDataSourceLoadException extends Exception {

    /**
     * Create a load exception with a description of what failed.
     *
     * @param msg the detail message
     */
    public SimpleDataSourceLoadException(String msg) {
        super(msg);
    }

    /**
     * Create a load exception that wraps the underlying failure.
     *
     * @param msg   the detail message
     * @param cause the exception that caused the load to fail; may be null
     */
    public SimpleDataSourceLoadException(String msg, Throwable cause) {
        super(msg, cause);
    }
}

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
/**
*
* User: aaron
* Date: Apr 1, 2009
* Time: 11:08:06 AM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Class SimpleDataSourceSplitException
 * <p/>
 * Checked exception for a failure to split a simple data source.
 * NOTE(review): no code that throws it is visible alongside this class, so the
 * exact conditions are to be confirmed against its callers.
 * <p/>
 * Use the two-argument constructor when the failure was triggered by another
 * exception, so the original cause stays attached.
 *
 * @author aaron
 * @version 1.0
 * @date Apr 1, 2009
 */
public class SimpleDataSourceSplitException extends Exception {

    /**
     * Create a split exception with a description of what failed.
     *
     * @param msg the detail message
     */
    public SimpleDataSourceSplitException(String msg) {
        super(msg);
    }

    /**
     * Create a split exception that wraps the underlying failure.
     *
     * @param msg   the detail message
     * @param cause the exception that caused the split to fail; may be null
     */
    public SimpleDataSourceSplitException(String msg, Throwable cause) {
        super(msg, cause);
    }
}