Interface check-in for Matt
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@300 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
820cf09198
commit
97d14abe85
|
|
@ -0,0 +1,70 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.utils.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 6, 2009
|
||||
* Time: 3:55:21 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 6, 2009
|
||||
* <p/>
|
||||
* Class ReferenceDataSource
|
||||
* <p/>
|
||||
* A descriptions should go here. Blame aaron if it's missing.
|
||||
*/
|
||||
public class ReferenceDataSource implements SimpleDataSource {
|
||||
|
||||
final protected FastaSequenceFile2 refFile;
|
||||
final protected ReferenceIterator refIter;
|
||||
|
||||
/**
|
||||
* Query the data source for a region of interest, specified by the genome location.
|
||||
* The iterator will generate successive calls
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
* @return an iterator of the appropriate type, that is limited by the region
|
||||
*/
|
||||
public ReferenceIterator seek(GenomeLoc location) {
|
||||
ReferenceIterator refSite = refIter.seekForward(location);
|
||||
return refSite;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor - ReferenceDataSource
|
||||
*
|
||||
* @param refFileName the reference file
|
||||
* @throws SimpleDataSourceLoadException
|
||||
*/
|
||||
public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException {
|
||||
if (refFileName == null) {
|
||||
throw new SimpleDataSourceLoadException("ReferenceDataSource: refFileName passed in is null");
|
||||
}
|
||||
File infile = new File(refFileName);
|
||||
if (!infile.canRead()) {
|
||||
throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName);
|
||||
}
|
||||
refFile = new FastaSequenceFile2(new File(refFileName));
|
||||
refIter = new ReferenceIterator(this.refFile);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 6, 2009
|
||||
* Time: 4:33:10 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 6, 2009
|
||||
* <p/>
|
||||
* Class ReferenceMetaDataSource
|
||||
* <p/>
|
||||
* A descriptions should go here. Blame aaron if it's missing.
|
||||
*/
|
||||
public class ReferenceMetaDataSource implements SimpleDataSource {
|
||||
|
||||
// our enumerated types
|
||||
public enum RODTYPE {
|
||||
DBSNP, HAPMAP
|
||||
}
|
||||
|
||||
// these could go on the stack, but a heap copy isn't too bad
|
||||
private List<ReferenceOrderedDatum> myData = null;
|
||||
private List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> rodIters = null;
|
||||
private List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = null;
|
||||
|
||||
/**
|
||||
* Prepare the list of reference ordered data iterators for each of the rods
|
||||
*
|
||||
* @return A list of ROD iterators for getting data from each ROD
|
||||
*/
|
||||
protected List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> initializeRODs() {
|
||||
// set up reference ordered data
|
||||
rodIters = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator>();
|
||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum> data : rods) {
|
||||
rodIters.add(data.iterator());
|
||||
}
|
||||
return rodIters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a list of the reference ordered datum at loc from each of the iterators. This function
|
||||
* assumes you are accessing the data in order. You can't use this function for random access. Each
|
||||
* successive call moves you along the file, consuming all data before loc.
|
||||
*
|
||||
* @param rodIters Iterators to access the RODs
|
||||
* @param loc The location to get the rods at
|
||||
* @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list
|
||||
*/
|
||||
protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator> rodIters,
|
||||
final GenomeLoc loc) {
|
||||
List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
|
||||
for (ReferenceOrderedData<? extends ReferenceOrderedDatum>.RODIterator iter : rodIters) {
|
||||
data.add(iter.seekForward(loc));
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Query the data source for a region of interest, specified by the genome location.
|
||||
* The iterator will generate successive calls
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
* @return an iterator of the appropriate type, that is limited by the region
|
||||
*/
|
||||
public Iterator<ReferenceOrderedDatum> seek(GenomeLoc location) {
|
||||
myData = getReferenceOrderedDataAtLocus(rodIters, location);
|
||||
return myData.iterator();
|
||||
}
|
||||
|
||||
public ReferenceMetaDataSource(HashMap<String, RODTYPE> files) {
|
||||
|
||||
// setup a rod list
|
||||
List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
|
||||
|
||||
// cycle through the passed in rod's
|
||||
|
||||
Set<String> fileNames = files.keySet();
|
||||
for (String file : fileNames) {
|
||||
switch (files.get(file)) {
|
||||
|
||||
case DBSNP: {
|
||||
ReferenceOrderedData<rodDbSNP> dbsnp = new ReferenceOrderedData<rodDbSNP>(new File(file), rodDbSNP.class);
|
||||
//dbsnp.testMe();
|
||||
rods.add(dbsnp); // { gff, dbsnp };
|
||||
}
|
||||
case HAPMAP: {
|
||||
ReferenceOrderedData<HapMapAlleleFrequenciesROD> hapmap = new ReferenceOrderedData<HapMapAlleleFrequenciesROD>(new File(file), HapMapAlleleFrequenciesROD.class);
|
||||
//dbsnp.testMe();
|
||||
rods.add(hapmap); // { gff, dbsnp };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
* Date: Mar 26, 2009
|
||||
* Time: 2:36:16 PM
|
||||
* <p/>
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class SAMBAMDataSource implements SimpleDataSource {
|
||||
/** our SAM data files */
|
||||
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMBAMDataSource.class);
|
||||
|
||||
// our sam file readers
|
||||
private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
||||
|
||||
// do we care that the SAM files respect the sort order.
|
||||
private boolean matchedSortOrders = true;
|
||||
|
||||
// our merged sam iterator for spliting up the files
|
||||
MergingSamRecordIterator2 mergeIterator;
|
||||
|
||||
// are we set to locus mode or read mode for dividing
|
||||
private boolean locusMode = true;
|
||||
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
|
||||
|
||||
/**
|
||||
* constructor, given a single sam file
|
||||
*
|
||||
* @param samFiles the list of sam files
|
||||
*/
|
||||
public SAMBAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
|
||||
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
||||
for (String fileName : samFiles) {
|
||||
File smFile = new File(fileName);
|
||||
if (!smFile.canRead()) {
|
||||
throw new SimpleDataSourceLoadException("SAMBAMDataSource: Unable to load file: " + fileName);
|
||||
}
|
||||
SAMFileReader reader = initializeSAMFile(smFile);
|
||||
if (reader != null) {
|
||||
readers.add(reader);
|
||||
}
|
||||
}
|
||||
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
|
||||
this.mergeIterator = new MergingSamRecordIterator2(headerMerger);
|
||||
}
|
||||
|
||||
|
||||
protected SAMFileReader initializeSAMFile(final File samFile) {
|
||||
if (samFile.toString().endsWith(".list")) {
|
||||
return null;
|
||||
} else {
|
||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
logger.info(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
return samReader;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* set the mode to by loci, which let's you duplicate reads, but never at a single
|
||||
* locus, or false for read mode where no read is seen twice.
|
||||
*
|
||||
* @param tr true if by loci, false if by read
|
||||
*/
|
||||
public void setToByLociMode(boolean tr) {
|
||||
locusMode = tr;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* getQueryRegionIterator
|
||||
* </p>
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public MergingSamRecordIterator2 seek(GenomeLoc location) {
|
||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(this.mergeIterator);
|
||||
if (locusMode) {
|
||||
iter.query(location.getContig(), (int) location.getStart(), (int) location.getStop(), true);
|
||||
} else {
|
||||
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop());
|
||||
}
|
||||
return iter; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.sam.MergingSamRecordIterator;
|
||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
* Date: Mar 26, 2009
|
||||
* Time: 2:36:16 PM
|
||||
* <p/>
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class SAMDataSource implements SimpleDataSource {
|
||||
/** our SAM data files */
|
||||
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
// our sam file readers
|
||||
private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
||||
|
||||
// do we care that the SAM files respect the sort order.
|
||||
private boolean matchedSortOrders = true;
|
||||
|
||||
// our record iterator, we use it to iterate over all the reads
|
||||
private MergingSamRecordIterator iterator = null;
|
||||
|
||||
// we may want to write out the file
|
||||
private SAMFileWriter out = null;
|
||||
|
||||
// are we set to locus mode or read mode for dividing
|
||||
private boolean locusMode = true;
|
||||
|
||||
/**
|
||||
* constructor for multiple sam files
|
||||
*
|
||||
* @param samfiles
|
||||
*/
|
||||
public SAMDataSource(ArrayList<String> samfiles) throws FileNotFoundException {
|
||||
loadFiles(samfiles);
|
||||
}
|
||||
|
||||
private void loadFiles(ArrayList<String> samfiles) throws FileNotFoundException {
|
||||
// verify the list passed to the class
|
||||
ArrayList<File> INPUT = new ArrayList<File>();
|
||||
for (String check : samfiles) {
|
||||
File nf = new File(check);
|
||||
if (!nf.exists()) {
|
||||
throw new FileNotFoundException(check + " doesn't exist");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Open the files for reading and writing
|
||||
|
||||
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
||||
for (File inFile : INPUT) {
|
||||
IoUtil.assertFileIsReadable(inFile);
|
||||
SAMFileReader in = new SAMFileReader(inFile);
|
||||
readers.add(in);
|
||||
matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
|
||||
}
|
||||
|
||||
// If all the input sort orders match the output sort order then just merge them and
|
||||
// write on the fly, otherwise setup to merge and sort before writing out the final file
|
||||
if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) {
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
|
||||
iterator = new MergingSamRecordIterator(headerMerger);
|
||||
|
||||
} else {
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted);
|
||||
iterator = new MergingSamRecordIterator(headerMerger);
|
||||
SAMFileHeader header = headerMerger.getMergedHeader();
|
||||
header.setSortOrder(SORT_ORDER);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* constructor, given a single sam file
|
||||
*
|
||||
* @param samFile
|
||||
*/
|
||||
public SAMDataSource(String samFile) throws FileNotFoundException {
|
||||
ArrayList<String> samfiles = new ArrayList<String>();
|
||||
samfiles.add(samFile);
|
||||
loadFiles(samfiles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk the sam file at appropriate locations, given the chunk count
|
||||
*
|
||||
* @param chunkCount
|
||||
* @return
|
||||
*/
|
||||
public void chunk(int chunkCount) {
|
||||
|
||||
}
|
||||
|
||||
/** set this source to divide on reads */
|
||||
public void setToReadMode() {
|
||||
locusMode = true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,10 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
|
|
@ -14,17 +18,20 @@ import java.io.Serializable;
|
|||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
/** This class is the interface for all data sources */
|
||||
public interface SimpleDataSource extends Serializable {
|
||||
|
||||
/**
|
||||
* recommend how many data chunks we should be breaking the file into,
|
||||
* as a recommendated number. If not specified (and even if specified)
|
||||
* the chunking data source can make decisions to chunk differently.
|
||||
*
|
||||
* @param chunkCount
|
||||
*/
|
||||
public void chunk(int chunkCount);
|
||||
|
||||
/**
|
||||
* Query the data source for a region of interest, specified by the genome location.
|
||||
* The iterator will generate successive calls
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
* @return an iterator of the appropriate type, that is limited by the region
|
||||
*/
|
||||
public Iterator seek(GenomeLoc location);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,33 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 6, 2009
|
||||
* Time: 4:21:58 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
 * Thrown when a simple data source cannot be loaded.
 *
 * @author aaron
 * @version 1.0
 * @date Apr 6, 2009
 */
public class SimpleDataSourceLoadException extends Exception {

    // Exception is Serializable; pin the id so it does not depend on the compiler
    private static final long serialVersionUID = 1L;

    /**
     * Create a load exception with a message.
     *
     * @param msg a description of what failed to load
     */
    public SimpleDataSourceLoadException(String msg) {
        super(msg);
    }

    /**
     * Create a load exception that keeps the underlying failure.
     *
     * @param msg   a description of what failed to load
     * @param cause the exception that caused the load to fail; may be null
     */
    public SimpleDataSourceLoadException(String msg, Throwable cause) {
        super(msg, cause);
    }
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 1, 2009
|
||||
* Time: 11:08:06 AM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
 * Class SimpleDataSourceSplitException
 * <p/>
 * A checked exception for reporting that a simple data source could not be split.
 *
 * @author aaron
 * @version 1.0
 * @date Apr 1, 2009
 */
public class SimpleDataSourceSplitException extends Exception {

    // Exception is Serializable; pin the id so it does not depend on the compiler
    private static final long serialVersionUID = 1L;

    /**
     * Create a split exception with a message.
     *
     * @param msg a description of what failed to split
     */
    public SimpleDataSourceSplitException(String msg) {
        super(msg);
    }

    /**
     * Create a split exception that keeps the underlying failure.
     *
     * @param msg   a description of what failed to split
     * @param cause the exception that caused the split to fail; may be null
     */
    public SimpleDataSourceSplitException(String msg, Throwable cause) {
        super(msg, cause);
    }

}
|
||||
Loading…
Reference in New Issue