Checking in some of the more static data-source-dependent code at this point. These classes don't do much on their own, but they are needed for the base data source code I'm writing.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@231 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-03-31 00:04:03 +00:00
parent 7fda409f4e
commit ba99e9f648
14 changed files with 795 additions and 0 deletions

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.gatk.dataSources;
import org.broadinstitute.sting.gatk.dataSources.chunks.DataShard;
/**
* User: aaron
* Date: Mar 25, 2009
* Time: 6:20:00 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * A source of data that can hand out its contents as a shard.
 */
public interface DataSource {

    /**
     * Produce a shard of this data source.
     *
     * @param chunkCount the chunk count requested by the caller; how it is
     *                   interpreted is left to the implementation
     * @return a {@link DataShard} over the data
     */
    DataShard toChunk(int chunkCount);
}

View File

@ -0,0 +1,74 @@
package org.broadinstitute.sting.gatk.dataSources;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import java.util.ArrayList;
import java.io.File;
/**
* User: aaron
* Date: Mar 25, 2009
* Time: 4:51:39 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Collects the input files that a data source will eventually be generated from.
 * <p/>
 * The actual construction step ({@link #build(Walker)}) is still a stub and
 * always returns null.
 */
public class DataSourceBuilder {

    // every file handed to us so far, in the order it was added
    ArrayList<File> passFiles = new ArrayList<File>();

    /** create a builder with an empty file list */
    public DataSourceBuilder() {
    }

    /**
     * register a data file by name
     *
     * @param fileName path of the file that should be used
     */
    public void addDataFile(String fileName) {
        // for now the path is simply wrapped and remembered
        File wrapped = new File(fileName);
        passFiles.add(wrapped);
    }

    /**
     * register a data file
     *
     * @param file the file that should be used
     */
    public void addDataFile(File file) {
        // for now the file is simply remembered
        passFiles.add(file);
    }

    /**
     * build a data source suited to the given walker
     *
     * @param inputWalker the walker the data source is intended for
     * @return currently always null; no data source types are wired in yet
     */
    public DataSource build(Walker inputWalker) {
        if (inputWalker instanceof ReadWalker) {
            // placeholder: a read-based data source will be generated here
        }
        return null;
    }

    // The private helpers that create data sources based on the type of walker
    // passed in will live below this point, e.g.:
    //
    //private ReadDataSource generateReadDataSource() {
    //
    //}
}

View File

@ -0,0 +1,24 @@
package org.broadinstitute.sting.gatk.dataSources;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 9:25:49 AM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * This exception is thrown when we're unable to generate a data source,
 * most likely due to an incomplete input source list.
 * <p/>
 * The no-argument constructor is kept for existing callers; the other
 * constructors let the thrower attach a description and/or the underlying cause.
 */
public class DataSourceGenerationException extends Exception {

    private static final long serialVersionUID = 1L;

    /** create the exception without a message or cause */
    public DataSourceGenerationException() {
        super();
    }

    /**
     * create the exception with a description of what went wrong
     *
     * @param message the detail message
     */
    public DataSourceGenerationException(String message) {
        super(message);
    }

    /**
     * create the exception with a description and the underlying failure
     *
     * @param message the detail message
     * @param cause   the exception that prevented the data source from being generated
     */
    public DataSourceGenerationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * create the exception from the underlying failure
     *
     * @param cause the exception that prevented the data source from being generated
     */
    public DataSourceGenerationException(Throwable cause) {
        super(cause);
    }
}

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.gatk.dataSources;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import java.io.File;
import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.FastaSequenceFile2;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 10:35:40 AM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Holder for the handles needed to read SAM records alongside a reference.
 * At this point the class only declares state; every field starts out null.
 */
public class ReadDataSource {

    // ---- SAM data ----

    // reader for the SAM file
    private SAMFileReader samReader;

    // iterator over the sam records in the reads file
    private Iterator<SAMRecord> samReadIter;

    // the verifying iterator, which does checking on the records
    VerifyingSamIterator verifyingSamReadIter;

    // ---- reference data: file name, sequence file, and iterator ----

    // the name of the reference file
    private File refFileName;

    //private ReferenceSequenceFile refFile = null;
    // todo: merge FastaSequenceFile2 into picard!
    private FastaSequenceFile2 refFile;

    // iterator over the reference
    private ReferenceIterator refIter;
}

View File

@ -0,0 +1,36 @@
package org.broadinstitute.sting.gatk.dataSources.chunks;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Mar 29, 2009
* Time: 8:35:16 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * A shard backed by an in-memory list; it walks the list from front to back.
 * <p/>
 * The list passed to the constructor is used directly (not copied), so
 * {@link #remove()} modifies the caller's list.
 *
 * @param <T> the element type held by the shard
 */
public class BasicDataShard<T> implements DataShard {

    // the backing list; always assigned by the constructor
    List<T> list;

    // position of the element the next call to next() will return
    int index = 0;

    // true only between a call to next() and a following call to remove()
    private boolean removable = false;

    /**
     * create a shard over the given list
     *
     * @param list the elements to iterate over
     */
    public BasicDataShard(List<T> list) {
        this.list = list;
    }

    /**
     * @return true if there are elements left that have not been returned
     */
    public boolean hasNext() {
        return index < list.size();
    }

    /**
     * return the current element and advance to the following one
     *
     * @return the next element of the list
     * @throws java.util.NoSuchElementException if every element has already been returned
     */
    public T next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("no more elements in shard");
        }
        removable = true;
        // post-increment: without advancing, hasNext() would stay true forever
        return list.get(index++);
    }

    /**
     * remove the element most recently returned by next() from the backing list
     *
     * @throws IllegalStateException if next() has not been called since
     *                               construction or since the last remove()
     */
    public void remove() {
        if (!removable) {
            throw new IllegalStateException("next() must be called before remove()");
        }
        // the last returned element sits just before index; stepping back also keeps
        // index pointing at the element that follows it once the list shifts left
        list.remove(--index);
        removable = false;
    }
}

View File

@ -0,0 +1,19 @@
package org.broadinstitute.sting.gatk.dataSources.chunks;
import java.util.Iterator;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:43:04 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * The common type for all data shards. A shard is an {@link Iterator}; this
 * interface declares no methods of its own.
 * <p/>
 * Note that the raw Iterator type is extended, so the element type is not
 * checked by the compiler at this level; each implementation narrows the
 * return type of next() itself.
 */
public interface DataShard extends Iterator {
}

View File

@ -0,0 +1,99 @@
package org.broadinstitute.sting.gatk.dataSources.chunks;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.dataSources.datum.LocusDatum;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.ArrayList;
import java.util.List;
/**
*
* User: aaron
* Date: Mar 30, 2009
* Time: 7:01:56 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Mar 30, 2009
 * <p/>
 * Class LociShard
 * <p/>
 * This is the loci shard; shards of this kind are made collectively when a
 * shatter call is made to a data source. Each call to next() pairs one locus
 * with its reference base and reference ordered data, and the shard stops
 * after maxCount loci have been handed out.
 */
public class LociShard implements DataShard {

    // our locusIterator
    private final LocusIterator locusIterator;

    // our reference locusIterator
    private final ReferenceIterator refIterator;

    // Iterator over rods
    private final List<ReferenceOrderedData.RODIterator> rodIters;

    // the max number of iterations
    private final int maxCount;

    // how many iterations we've had
    private int iterCount = 0;

    /**
     * create the shard
     *
     * @param locusIterator iterator supplying the loci
     * @param refIterator   iterator over the reference, advanced to each locus in turn
     * @param rodIters      iterators over the reference ordered data
     * @param maxCount      the maximum number of loci this shard will return
     */
    public LociShard(LocusIterator locusIterator, ReferenceIterator refIterator, List<ReferenceOrderedData.RODIterator> rodIters, int maxCount) {
        this.locusIterator = locusIterator;
        this.maxCount = maxCount;
        this.refIterator = refIterator;
        this.rodIters = rodIters;
    }

    /**
     * @return true if the underlying locus iterator has more loci and fewer than
     *         maxCount loci have been returned so far
     */
    public boolean hasNext() {
        return locusIterator.hasNext() && maxCount > iterCount;
    }

    /**
     * get the next locus, together with its reference base and reference ordered data
     *
     * @return the datum for the next locus
     * @throws java.util.NoSuchElementException if the shard is exhausted, either because
     *                                          the loci ran out or maxCount was reached
     */
    public LocusDatum next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("loci shard is exhausted");
        }
        LocusContext locus = locusIterator.next();
        // count the locus we just consumed; without this the maxCount bound in
        // hasNext() is never reached
        ++iterCount;

        ReferenceIterator refSite = refIterator.seekForward(locus.getLocation());
        locus.setReferenceContig(refSite.getCurrentContig());

        // Iterate forward to get all reference ordered data covering this locus
        final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation());
        return new LocusDatum(rodData, refSite.getBaseAsChar(), locus);
    }

    /** remove is delegated to the underlying locus iterator */
    public void remove() {
        locusIterator.remove();
    }

    /**
     * Builds a list of the reference ordered datum at loc from each of the iterators. This function
     * assumes you are accessing the data in order. You can't use this function for random access. Each
     * successive call moves you along the file, consuming all data before loc.
     *
     * @param rodIters Iterators to access the RODs
     * @param loc      The location to get the rods at
     * @return A list holding, for each iterator in order, the result of seeking it forward to loc.
     *         The original documentation states that a ROD without a datum at loc is
     *         represented by null in the list.
     */
    protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData.RODIterator> rodIters,
                                                                         final GenomeLoc loc) {
        List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
        for (ReferenceOrderedData.RODIterator iter : rodIters) {
            data.add(iter.seekForward(loc));
        }
        return data;
    }
}

View File

@ -0,0 +1,85 @@
package org.broadinstitute.sting.gatk.dataSources.chunks;
import edu.mit.broad.picard.sam.MergingSamRecordIterator;
import org.broadinstitute.sting.gatk.dataSources.datum.ReadDatum;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.Arrays;
/**
*
* User: aaron
* Date: Mar 30, 2009
* Time: 5:45:51 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Mar 30, 2009
 * <p/>
 * Class ReadShard
 * <p/>
 * A read data shard: it turns each record of a merging SAM iterator into a
 * ReadDatum.
 */
public class ReadShard implements DataShard {

    // the source of SAM records for this shard
    private MergingSamRecordIterator iterator;

    /**
     * create the data shard over an iterator
     *
     * @param samIterator the iterator the reads are pulled from
     */
    public ReadShard(MergingSamRecordIterator samIterator) {
        iterator = samIterator;
    }

    /**
     * is there another read available
     *
     * @return true if the underlying iterator has another record
     */
    public boolean hasNext() {
        return iterator.hasNext();
    }

    /**
     * pull the next record and package it with a single-read locus context
     *
     * @return the datum for the next read
     */
    public ReadDatum next() {
        final SAMRecord record = iterator.next();

        // the context holds just this read ...
        final List<SAMRecord> singleRead = Arrays.asList(record);

        // ... located where the read itself is ...
        final GenomeLoc readLocation = Utils.genomicLocationOf(record);

        // ... and the offset of a single read is always 0
        final List<Integer> zeroOffset = Arrays.asList(0);

        final LocusContext context = new LocusContext(readLocation, singleRead, zeroOffset);
        return new ReadDatum(record, context);
    }

    /** remove is delegated to the underlying iterator */
    public void remove() {
        iterator.remove();
    }
}

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.gatk.dataSources.chunks;
import edu.mit.broad.picard.sam.MergingSamRecordIterator;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Mar 29, 2009
* Time: 8:47:50 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * A shard over a merging SAM record iterator that hands out at most a fixed
 * number of records.
 */
public class SAMDataShard implements DataShard {

    // our iterator
    final private MergingSamRecordIterator iterator;

    // divide by reads or by loci (not read anywhere in this class yet)
    private boolean byReads = true;

    // how many records have been returned so far
    private int lengthCount = 0;

    // the maximum number of records this shard will return
    private final int limiter;

    /**
     * create a shard that returns at most limiter records
     *
     * @param iterator the iterator the records are pulled from
     * @param limiter  the maximum number of records to return
     */
    public SAMDataShard(MergingSamRecordIterator iterator, int limiter) {
        this.iterator = iterator;
        this.limiter = limiter;
    }

    /**
     * create a shard bounded only by Integer.MAX_VALUE records
     *
     * @param iterator the iterator the records are pulled from
     */
    public SAMDataShard(MergingSamRecordIterator iterator) {
        this(iterator, Integer.MAX_VALUE);
    }

    /**
     * @return true if the underlying iterator has another record and the
     *         limit has not been reached yet
     */
    public boolean hasNext() {
        // the count must stay BELOW the limit; the previous '>' comparison made
        // the shard report itself empty for every non-negative limiter
        return iterator.hasNext() && lengthCount < limiter;
    }

    /**
     * @return the next record from the underlying iterator
     * @throws java.util.NoSuchElementException if the shard is exhausted, either because
     *                                          the records ran out or the limit was reached
     */
    public SAMRecord next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("SAM data shard is exhausted");
        }
        final SAMRecord record = iterator.next();
        ++lengthCount;
        return record;
    }

    /** remove is delegated to the underlying iterator */
    public void remove() {
        iterator.remove();
    }
}

View File

@ -0,0 +1,35 @@
package org.broadinstitute.sting.gatk.dataSources.datum;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.Serializable;
/**
*
* User: aaron
* Date: Mar 30, 2009
* Time: 1:32:34 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Mar 30, 2009
 * <p/>
 * interface Datum
 * <p/>
 * The interface shared by all Datum types. Every datum is Serializable.
 */
public interface Datum extends Serializable {

    /**
     * used for tracking where we are in a genome
     *
     * @return the genome location associated with this datum
     */
    GenomeLoc getSequenceLocation();
}

View File

@ -0,0 +1,94 @@
package org.broadinstitute.sting.gatk.dataSources.datum;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.List;
/**
*
* User: aaron
* Date: Mar 30, 2009
* Time: 3:08:28 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Mar 30, 2009
 * <p/>
 * Class LocusDatum
 * <p/>
 * The datum for a locus. It bundles the reference base, the locus context,
 * and the reference ordered data; all three are fixed at construction.
 */
public class LocusDatum implements Datum {

    // the locus context this datum describes
    private final LocusContext context;

    // the reference base at this locus
    private final char ref;

    // the reference ordered data at this locus
    private final List<ReferenceOrderedDatum> rodData;

    /**
     * the locus datum constructor
     *
     * @param rodData the reference ordered data for this position
     * @param ref     the reference base at this position
     * @param context the locus context we're in
     */
    public LocusDatum(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
        this.context = context;
        this.ref = ref;
        this.rodData = rodData;
    }

    /**
     * get the reference ordered data for this position
     *
     * @return the list supplied at construction (not copied)
     */
    public List<ReferenceOrderedDatum> getRodData() {
        return this.rodData;
    }

    /**
     * get the reference base
     *
     * @return a character representing the reference base
     */
    public char getRef() {
        return this.ref;
    }

    /**
     * get the locus context at the current position
     *
     * @return the locus context supplied at construction
     */
    public LocusContext getContext() {
        return this.context;
    }

    /**
     * gets the current position in the sequence, which comes for free
     * from the underlying locus context
     *
     * @return our current GenomeLoc, as reported by the locus context
     */
    public GenomeLoc getSequenceLocation() {
        return context.getLocation();
    }
}

View File

@ -0,0 +1,65 @@
package org.broadinstitute.sting.gatk.dataSources.datum;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
/**
*
* User: aaron
* Date: Mar 30, 2009
* Time: 2:53:37 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 * @date Mar 30, 2009
 * <p/>
 * Class ReadDatum
 * <p/>
 * The base read datum class: one SAM record plus the locus context built for it.
 */
public class ReadDatum implements Datum {

    // the SAM record this datum wraps
    private final SAMRecord sam;

    // the locus context that accompanies the record
    private final LocusContext locus;

    /**
     * the read datum constructor
     *
     * @param r     the sam read
     * @param locus the locus context for the read
     */
    public ReadDatum(SAMRecord r, LocusContext locus) {
        sam = r;
        this.locus = locus;
    }

    /**
     * @return the SAMRecord supplied at construction
     */
    public SAMRecord getRead() {
        return sam;
    }

    /**
     * @return the locus context supplied at construction
     */
    public LocusContext getLocus() {
        return locus;
    }

    /**
     * gets the region that our read spans, computed from the SAM record
     *
     * @return a genome loc that details the region that our read spans
     */
    public GenomeLoc getSequenceLocation() {
        final GenomeLoc span = Utils.genomicLocationOf(sam);
        return span;
    }
}

View File

@ -0,0 +1,117 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.sam.MergingSamRecordIterator;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * A data source over one or more SAM files. The files are opened at
 * construction time and exposed through a single merging record iterator.
 */
public class SAMDataSource implements SimpleDataSource {

    // the sort order we expect the SAM files to be in
    private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;

    // our sam file readers, one per input file
    private final ArrayList<SAMFileReader> readers = new ArrayList<SAMFileReader>();

    // true as long as every file opened so far reports SORT_ORDER in its header
    private boolean matchedSortOrders = true;

    // our record iterator, we use it to iterate over all the reads
    private MergingSamRecordIterator iterator = null;

    // we may want to write out the file
    private SAMFileWriter out = null;

    // are we set to locus mode (true) or read mode (false) for dividing
    private boolean locusMode = true;

    /**
     * constructor for multiple sam files
     *
     * @param samfiles the paths of the SAM files to load
     * @throws FileNotFoundException if any of the paths does not exist
     */
    public SAMDataSource(ArrayList<String> samfiles) throws FileNotFoundException {
        loadFiles(samfiles);
    }

    /**
     * constructor, given a single sam file
     *
     * @param samFile the path of the SAM file to load
     * @throws FileNotFoundException if the path does not exist
     */
    public SAMDataSource(String samFile) throws FileNotFoundException {
        ArrayList<String> samfiles = new ArrayList<String>();
        samfiles.add(samFile);
        loadFiles(samfiles);
    }

    /**
     * Check that every named file exists, open a reader on each of them, and
     * set up the merging iterator over all readers.
     *
     * @param samfiles the paths of the SAM files to load
     * @throws FileNotFoundException if any of the paths does not exist
     */
    private void loadFiles(ArrayList<String> samfiles) throws FileNotFoundException {
        // verify the list passed to the class; nothing is opened until every path checks out
        List<File> inputFiles = new ArrayList<File>();
        for (String check : samfiles) {
            File nf = new File(check);
            if (!nf.exists()) {
                throw new FileNotFoundException(check + " doesn't exist");
            }
            // remember the verified file; previously it was dropped here, so no
            // reader was ever opened
            inputFiles.add(nf);
        }

        // Open the files for reading. The readers go into the field rather than a
        // local list of the same name, which used to leave the field empty.
        for (File inFile : inputFiles) {
            IoUtil.assertFileIsReadable(inFile);
            SAMFileReader in = new SAMFileReader(inFile);
            readers.add(in);
            matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
        }

        // If all the input sort orders match the expected sort order then just merge them
        // on the fly, otherwise merge as unsorted and mark the merged header with the
        // expected sort order
        if (matchedSortOrders || SORT_ORDER == SAMFileHeader.SortOrder.unsorted) {
            SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
            iterator = new MergingSamRecordIterator(headerMerger);
        } else {
            SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SAMFileHeader.SortOrder.unsorted);
            iterator = new MergingSamRecordIterator(headerMerger);
            SAMFileHeader header = headerMerger.getMergedHeader();
            header.setSortOrder(SORT_ORDER);
        }
    }

    /**
     * Chunk the sam file at appropriate locations, given the chunk count.
     * Not implemented yet; this currently does nothing.
     *
     * @param chunkCount the recommended number of chunks
     */
    public void chunk(int chunkCount) {
    }

    /** set this source to divide on reads */
    public void setToReadMode() {
        // read mode is the opposite of locus mode; assigning true here was a no-op
        locusMode = false;
    }
}

View File

@ -0,0 +1,30 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import java.io.Serializable;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:39:05 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public interface SimpleDataSource extends Serializable {
/**
* recommend how many data chunks we should be breaking the file into,
* as a recommendated number. If not specified (and even if specified)
* the chunking data source can make decisions to chunk differently.
*
* @param chunkCount
*/
public void chunk(int chunkCount);
}