Added changes to shattering, refactored SAMBAM into SAM

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@426 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-04-15 13:52:56 +00:00
parent 182626576f
commit 6db9127f90
7 changed files with 138 additions and 23 deletions

View File

@ -29,7 +29,7 @@ import java.util.List;
* <p/>
* Interface Shard
* <p/>
* The shard interface, which controls how data is divided
* The shard interface, which controls how data is divided for loci
*/
public abstract class LocusShardStrategy implements ShardStrategy {
@ -66,7 +66,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
*/
LocusShardStrategy(SAMSequenceDictionary dic) {
this.dic = dic;
mLoc = new GenomeLoc(0,0,0);
mLoc = new GenomeLoc(0, 0, 0);
if (dic.getSequences().size() > 0) {
nextContig = true;
}
@ -110,19 +110,11 @@ public abstract class LocusShardStrategy implements ShardStrategy {
*/
/**
* set the next shards size
*
* @param size adjust the next size to this
*/
public abstract void adjustNextShardSize(long size);
/**
* This is how the various shards strategies implements their approach
* This is how the various shard strategies implement their approach, adjusting this value
*
* @return the next shard size
*/
abstract long nextShardSize();
protected abstract long nextShardSize();
/**
@ -132,8 +124,6 @@ public abstract class LocusShardStrategy implements ShardStrategy {
*/
/**
* get the next shard, based on the return size of nextShardSize
*

View File

@ -28,4 +28,13 @@ import java.util.Iterator;
* class, but now this will be an interface to accommodate read based sharding
*/
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {

    /**
     * Adjusts the size that will be used for the next shard.
     *
     * @param size the size the next shard should be adjusted to
     */
    void adjustNextShardSize(long size);
}

View File

@ -108,7 +108,7 @@ public class ShardStrategyFactory {
* @param readCount the number of reads to include in each shard
* @return
*/
static public ShardStrategy shatterByReadCount(long readCount) {
static public ShardStrategy shatterByReadCount(SAMSequenceDictionary dic, long readCount) {
return null;
}

View File

@ -0,0 +1,116 @@
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class SAMDataSource implements SimpleDataSource {
    // Holds a list of SAM/BAM files and, on each seek(), opens a reader per file and
    // returns a single merging iterator positioned on the requested region.

    /** the sort order handed to the header merger when the input files are combined */
    private static final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;

    /** our log, which we want to capture anything from this class */
    protected static Logger logger = Logger.getLogger(SAMDataSource.class);

    // are we set to locus mode or read mode for dividing.
    // NOTE: nothing in this class ever sets this to true, so seek() currently always
    // takes the read-mode (queryContained) branch.
    private boolean locusMode = false;

    // How strict should we be with SAM/BAM parsing?
    protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;

    // the SAM/BAM files backing this data source; readers are created from these on every seek()
    private final List<File> samFileList = new ArrayList<File>();

    /**
     * constructor, given a list of sam file names
     *
     * @param samFiles the list of sam files
     * @throws SimpleDataSourceLoadException if any of the named files cannot be read
     */
    public SAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
        for (String fileName : samFiles) {
            File smFile = new File(fileName);
            if (!smFile.canRead()) {
                throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName);
            }
            samFileList.add(smFile);
        }
    }

    /**
     * Open a reader on a single SAM/BAM file, apply the configured validation
     * stringency and log the sort order found in its header.
     *
     * @param samFile the file to open
     * @return the opened reader, or null if the file name ends with ".list"
     *         (such files are not handled here)
     */
    protected SAMFileReader initializeSAMFile(final File samFile) {
        if (samFile.toString().endsWith(".list")) {
            return null;
        }
        SAMFileReader samReader = new SAMFileReader(samFile, true);
        samReader.setValidationStringency(strictness);
        final SAMFileHeader header = samReader.getFileHeader();
        // plain concatenation: the message must not be used as a format string,
        // otherwise a '%' in the value would be interpreted as a format specifier
        logger.info("Sort order is: " + header.getSortOrder());
        return samReader;
    }

    /**
     * <p>
     * seek
     * </p>
     * Opens a fresh reader for every file in this data source, merges their headers
     * and returns a merging iterator queried for the given location. If anything
     * fails before the iterator is returned, the readers opened so far are closed.
     *
     * @param location the genome location to extract data for
     * @return an iterator for that region
     * @throws SimpleDataSourceLoadException if a reader cannot be created for one of the files
     */
    public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
        // right now this is pretty heavy, it copies the file list into a reader list every time
        final List<SAMFileReader> readers = new ArrayList<SAMFileReader>(samFileList.size());
        boolean handedOff = false;
        try {
            for (File samFile : samFileList) {
                final SAMFileReader reader = initializeSAMFile(samFile);
                if (reader == null) {
                    throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + samFile);
                }
                readers.add(reader);
            }

            // now merge the headers
            SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);

            // make a merging iterator for this record
            MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);

            // we do different things for locus and read modes.
            // NOTE(review): start/stop are narrowed to int for the query calls; positions
            // beyond Integer.MAX_VALUE would be truncated - confirm that cannot occur.
            if (locusMode) {
                iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop());
            } else {
                iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop());
            }

            // from here on the returned iterator is the only handle on the readers
            handedOff = true;
            return iter;
        } finally {
            if (!handedOff) {
                closeReaders(readers);
            }
        }
    }

    /**
     * Close every reader in the list, logging (not propagating) any failure so that
     * the exception that triggered the cleanup is the one the caller sees.
     *
     * @param readers the readers to close
     */
    private static void closeReaders(final List<SAMFileReader> readers) {
        for (SAMFileReader reader : readers) {
            try {
                reader.close();
            } catch (RuntimeException e) {
                logger.warn("SAMDataSource: failed to close reader during cleanup", e);
            }
        }
    }
}

View File

@ -7,7 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
@ -68,10 +68,10 @@ public class MicroManager {
SHARD_SIZE );
ReferenceIterator refIter = new ReferenceIterator(ref);
SAMBAMDataSource dataSource = null;
SAMDataSource dataSource = null;
try {
dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
dataSource = new SAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );

View File

@ -87,7 +87,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
try {
SAMBAMDataSource data = new SAMBAMDataSource(fl);
SAMDataSource data = new SAMDataSource(fl);
for (Shard sh : strat) {
int readCount = 0;
count++;
@ -136,7 +136,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
int count = 0;
try {
SAMBAMDataSource data = new SAMBAMDataSource(fl);
SAMDataSource data = new SAMDataSource(fl);
for (Shard sh : strat) {
int readCount = 0;
count++;
@ -171,7 +171,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
logger.debug("Pile two:");
try {
SAMBAMDataSource data = new SAMBAMDataSource(fl);
SAMDataSource data = new SAMDataSource(fl);
for (Shard sh : strat) {
int readCount = 0;
count++;

View File

@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
@ -83,7 +83,7 @@ public class BoundedReadIteratorTest extends BaseTest {
long shardReadCount = 0;
try {
SAMBAMDataSource data = new SAMBAMDataSource(fl);
SAMDataSource data = new SAMDataSource(fl);
// make sure we have a shard
if (!strat.hasNext()) {