Changes to the interface to the simple data source rippled out to a bunch of files.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@572 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-04-30 20:35:56 +00:00
parent 19e4e97f21
commit 63403d32cd
12 changed files with 113 additions and 58 deletions

View File

@ -36,14 +36,18 @@ public class ReadShard implements Shard {
// this is going to get gross
private final ReadShardStrategy str;
// the reference back to our read shard strategy
private final ReadShardStrategy strat;
/**
* create a read shard, given a read size
*
* @param size
*/
public ReadShard(int size) {
ReadShard(int size, ReadShardStrategy strat) {
this.str = null;
this.size = size;
this.strat = strat;
}
/**
@ -51,9 +55,10 @@ public class ReadShard implements Shard {
*
* @param size
*/
ReadShard(ReadShardStrategy caller, int size) {
ReadShard(ReadShardStrategy caller, int size, ReadShardStrategy strat) {
this.str = caller;
this.size = size;
this.strat = strat;
}
/** @return the genome location represented by this shard */
@ -67,6 +72,10 @@ public class ReadShard implements Shard {
}
public void signalDone() {
strat.signalDone();
}
/**
* what kind of shard do we return
*

View File

@ -41,6 +41,9 @@ public class ReadShardStrategy implements ShardStrategy {
// our sequence dictionary
final private SAMSequenceDictionary dic;
// our hasnext flag
boolean hasNext = true;
/**
* the default constructor
* @param dic the dictionary
@ -56,11 +59,11 @@ public class ReadShardStrategy implements ShardStrategy {
* @return
*/
public boolean hasNext() {
return true;
return hasNext;
}
public Shard next() {
return new ReadShard((int)readCount); //To change body of implemented methods use File | Settings | File Templates.
return new ReadShard((int)readCount, this);
}
public void remove() {
@ -79,4 +82,14 @@ public class ReadShardStrategy implements ShardStrategy {
public void adjustNextShardSize(long size) {
readCount = size;
}
/**
* this function is a work-around for the fact that
* we don't know when we're out of reads until the SAM data source
* tells us so.
*/
public void signalDone() {
hasNext = false;
}
}

View File

@ -1,9 +1,8 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.BoundedReferenceIterator;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
@ -43,12 +42,17 @@ public class ReferenceDataSource implements SimpleDataSource {
* Query the data source for a region of interest, specified by the genome location.
* The iterator will generate successive calls
*
* @param location the genome location to extract data for
* @param shard the genome location to extract data for
* @return an iterator of the appropriate type, that is limited by the region
*/
public BoundedReferenceIterator seek(GenomeLoc location) {
BoundedReferenceIterator ret = new BoundedReferenceIterator(refFile, location);
return ret;
public BoundedReferenceIterator seek(Shard shard) {
if (shard.getShardType() == Shard.ShardType.LOCUS) {
BoundedReferenceIterator ret = new BoundedReferenceIterator(refFile, shard.getGenomeLoc());
return ret;
} else {
throw new StingException("ReferenceDataSource can only take LocusShards");
}
}
public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException {

View File

@ -2,7 +2,9 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.ArrayList;
import java.util.HashMap;
@ -83,12 +85,16 @@ public class ReferenceMetaDataSource implements SimpleDataSource {
* Query the data source for a region of interest, specified by the genome location.
* The iterator will generate successive calls
*
* @param location the genome location to extract data for
* @param shard the genome location to extract data for
* @return an iterator of the appropriate type, that is limited by the region
*/
public Iterator<ReferenceOrderedDatum> seek(GenomeLoc location) {
myData = getReferenceOrderedDataAtLocus(rodIters, location);
return myData.iterator();
public Iterator<ReferenceOrderedDatum> seek(Shard shard) {
if (shard.getShardType() == Shard.ShardType.LOCUS) {
myData = getReferenceOrderedDataAtLocus(rodIters, shard.getGenomeLoc());
return myData.iterator();
} else {
throw new StingException("ReferenceMetaDataSource can only take LocusShards");
}
}
public ReferenceMetaDataSource(HashMap<String, RODTYPE> files) {

View File

@ -5,10 +5,14 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.util.ArrayList;
@ -114,7 +118,7 @@ public class SAMDataSource implements SimpleDataSource {
* @param location the genome location to extract data for
* @return an iterator for that region
*/
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
public MergingSamRecordIterator2 seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
// right now this is pretty damn heavy, it copies the file list into a reader list every time
List<SAMFileReader> lst = GetReaderList();
@ -137,6 +141,26 @@ public class SAMDataSource implements SimpleDataSource {
return iter;
}
/**
* <p>
* seek
* </p>
*
* @param shard the shard to get data for
* @return an iterator for that region
*/
public CloseableIterator<SAMRecord> seek(Shard shard) throws SimpleDataSourceLoadException {
if (shard.getShardType() == Shard.ShardType.READ) {
return seekRead((ReadShard)shard);
}
else if (shard.getShardType() == Shard.ShardType.LOCUS) {
return seekLocus(shard.getGenomeLoc());
}
else {
throw new StingException("seek: Unknown shard type");
}
}
/**
* If we're in by-read mode, this indicates if we want
@ -156,14 +180,14 @@ public class SAMDataSource implements SimpleDataSource {
* seek
* </p>
*
* @param readCount the length or reads to extract
* @param shard the read shard to extract from
* @return an iterator for that region
*/
public BoundedReadIterator seek(long readCount) throws SimpleDataSourceLoadException {
private BoundedReadIterator seekRead(ReadShard shard) throws SimpleDataSourceLoadException {
// TODO: make extremely less horrible
List<SAMFileReader> lst = GetReaderList();
BoundedReadIterator bound;
BoundedReadIterator bound = null;
// now merge the headers
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
@ -171,12 +195,15 @@ public class SAMDataSource implements SimpleDataSource {
// make a merging iterator for this record
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
if (!includeUnmappedReads) {
return fastMappedReadSeek(readCount, iter);
bound = fastMappedReadSeek(shard.getSize(), iter);
} else {
return unmappedReadSeek(readCount, iter);
bound = unmappedReadSeek(shard.getSize(), iter);
}
if (bound == null) {
shard.signalDone();
}
return bound;
}
/**
@ -211,7 +238,7 @@ public class SAMDataSource implements SimpleDataSource {
/**
* Seek, if we only want mapped reads. This method will be faster then the unmapped read method, but you cannot extract the
* Seek, if we want only mapped reads. This method will be faster then the unmapped read method, but you cannot extract the
* unmapped reads.
*
* @param readCount how many reads to retrieve

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import java.io.Serializable;
import java.util.Iterator;
@ -29,9 +29,9 @@ public interface SimpleDataSource extends Serializable {
* Query the data source for a region of interest, specified by the genome location.
* The iterator will generate successive calls
*
* @param location the genome location to extract data for
* @param shard the region
* @return an iterator of the appropriate type, that is limited by the region
*/
public Iterator seek(GenomeLoc location) throws SimpleDataSourceLoadException;
public Iterator seek(Shard shard) throws SimpleDataSourceLoadException;
}

View File

@ -50,17 +50,16 @@ public class LinearMicroScheduler extends MicroScheduler {
Object accumulator = null;
for(Shard shard: shardStrategy) {
GenomeLoc span = shard.getGenomeLoc();
MergingSamRecordIterator2 readShard = null;
try {
readShard = dataSource.seek( span );
readShard = (MergingSamRecordIterator2)dataSource.seek( shard );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );
}
ReferenceProvider referenceProvider = new ReferenceProvider( reference, span );
ReferenceProvider referenceProvider = new ReferenceProvider( reference, shard.getGenomeLoc() );
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
// set the sam header of the traversal engine

View File

@ -1,15 +1,14 @@
package org.broadinstitute.sting.gatk.executive;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.concurrent.Callable;
@ -48,18 +47,17 @@ public class ShardTraverser implements Callable {
}
public Object call() {
GenomeLoc span = shard.getGenomeLoc();
Object accumulator = ((LocusWalker<?,?>)walker).reduceInit();
MergingSamRecordIterator2 readShard = null;
try {
readShard = reads.seek( span );
readShard = (MergingSamRecordIterator2)reads.seek( shard );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );
}
ReferenceProvider referenceProvider = new ReferenceProvider( reference, span );
ReferenceProvider referenceProvider = new ReferenceProvider( reference, shard.getGenomeLoc() );
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator );

View File

@ -3,7 +3,6 @@ package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
@ -14,8 +13,9 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
*
@ -44,7 +44,7 @@ import java.util.Arrays;
* This class handles traversing by reads in the new shardable style
*/
public class TraverseReads extends TraversalEngine {
final ArrayList<String> x = new ArrayList<String>();
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(TraverseReads.class);
@ -64,21 +64,20 @@ public class TraverseReads extends TraversalEngine {
/**
* Traverse by reads, given the data and the walker
*
* @param walker the walker to execute over
* @param shard the shard of data to feed the walker
* @param locusProvider the factory for loci
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @param shard the shard of data to feed the walker
* @param sum of type T, the return from the walker
* @param <M> the generic type
* @param <T> the return type of the reduce function
* @return
*/
public <M, T> T traverse(Walker<M, T> walker,
Shard shard,
LocusContextProvider locusProvider,
BoundedReadIterator iter,
T sum) {
logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard)shard).getSize()));
logger.debug(String.format("TraverseReads.traverse Genomic interval is %s", ((ReadShard) shard).getSize()));
if (!(walker instanceof ReadWalker))
throw new IllegalArgumentException("Walker isn't a read walker!");
@ -88,7 +87,7 @@ public class TraverseReads extends TraversalEngine {
int readCNT = 0;
// while we still have more reads
for (SAMRecord read: iter) {
for (SAMRecord read : iter) {
// get the genome loc from the read
GenomeLoc site = new GenomeLoc(read);
@ -99,9 +98,10 @@ public class TraverseReads extends TraversalEngine {
// update the number of reads we've seen
TraversalStatistics.nRecords++;
// we still have to fix the locus context provider to take care of this problem with > 1 length contexts
// LocusContext locus = locusProvider.getLocusContext(site);
final boolean keepMeP = readWalker.filter(locus, read);
if (keepMeP) {
M x = readWalker.map(locus, read);
@ -110,7 +110,7 @@ public class TraverseReads extends TraversalEngine {
printProgress("loci", locus.getLocation());
}
System.err.println(TraversalStatistics.nRecords);
return sum;
}

View File

@ -94,7 +94,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
logger.debug("Start : " + sh.getGenomeLoc().getStart() + " stop : " + sh.getGenomeLoc().getStop() + " contig " + sh.getGenomeLoc().getContig());
logger.debug("count = " + count);
MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sh);
// for the first couple of shards make sure we can see the reads
if (count < 5) {
@ -144,7 +144,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
break;
}
MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sh);
for (SAMRecord r : datum) {
readCount++;
@ -181,7 +181,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
break;
}
MergingSamRecordIterator2 datum = data.seek(sh.getGenomeLoc());
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sh);
for (SAMRecord r : datum) {
readCount++;

View File

@ -1,12 +1,10 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import static org.junit.Assert.assertEquals;
import org.junit.Before;
import org.junit.Test;
@ -90,7 +88,7 @@ public class SAMByReadsTest extends BaseTest {
int readsSeen = 0;
BoundedReadIterator iter;
/*
while ((iter = data.seek(targetReadCount)) != null) {
int readcnt = 0;
@ -111,6 +109,7 @@ public class SAMByReadsTest extends BaseTest {
}
// make sure we've seen all the reads
assertEquals(totalReads,readsSeen);
*/
}
catch (SimpleDataSourceLoadException e) {

View File

@ -92,8 +92,8 @@ public class BoundedReadIteratorTest extends BaseTest {
Shard sd = strat.next();
MergingSamRecordIterator2 datum = data.seek(sd.getGenomeLoc());
MergingSamRecordIterator2 datum2 = data.seek(sd.getGenomeLoc());
MergingSamRecordIterator2 datum = (MergingSamRecordIterator2)data.seek(sd);
MergingSamRecordIterator2 datum2 = (MergingSamRecordIterator2)data.seek(sd);
// check the reads in the shard
for (SAMRecord r : datum) {