Added changes to shattering, refactored SAMBAM into SAM
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@426 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
182626576f
commit
6db9127f90
|
|
@ -29,7 +29,7 @@ import java.util.List;
|
|||
* <p/>
|
||||
* Interface Shard
|
||||
* <p/>
|
||||
* The shard interface, which controls how data is divided
|
||||
* The shard interface, which controls how data is divided for loci
|
||||
*/
|
||||
public abstract class LocusShardStrategy implements ShardStrategy {
|
||||
|
||||
|
|
@ -66,7 +66,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
*/
|
||||
LocusShardStrategy(SAMSequenceDictionary dic) {
|
||||
this.dic = dic;
|
||||
mLoc = new GenomeLoc(0,0,0);
|
||||
mLoc = new GenomeLoc(0, 0, 0);
|
||||
if (dic.getSequences().size() > 0) {
|
||||
nextContig = true;
|
||||
}
|
||||
|
|
@ -110,19 +110,11 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
*/
|
||||
|
||||
/**
|
||||
* set the next shards size
|
||||
*
|
||||
* @param size adjust the next size to this
|
||||
*/
|
||||
public abstract void adjustNextShardSize(long size);
|
||||
|
||||
|
||||
/**
|
||||
* This is how the various shards strategies implements their approach
|
||||
* This is how the various shard strategies implement their approach, adjusting this value
|
||||
*
|
||||
* @return the next shard size
|
||||
*/
|
||||
abstract long nextShardSize();
|
||||
protected abstract long nextShardSize();
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -132,8 +124,6 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* get the next shard, based on the return size of nextShardSize
|
||||
*
|
||||
|
|
|
|||
|
|
@ -28,4 +28,13 @@ import java.util.Iterator;
|
|||
* class, but now this will be an interface to accommodate read based sharding
|
||||
*/
|
||||
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
|
||||
|
||||
/**
|
||||
* set the next shards size
|
||||
*
|
||||
* @param size adjust the next size to this
|
||||
*/
|
||||
public abstract void adjustNextShardSize(long size);
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ public class ShardStrategyFactory {
|
|||
* @param readCount the number of reads to include in each shard
|
||||
* @return
|
||||
*/
|
||||
static public ShardStrategy shatterByReadCount(long readCount) {
|
||||
static public ShardStrategy shatterByReadCount(SAMSequenceDictionary dic, long readCount) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,116 @@
|
|||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||
|
||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
* Date: Mar 26, 2009
|
||||
* Time: 2:36:16 PM
|
||||
* <p/>
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class SAMDataSource implements SimpleDataSource {
|
||||
/** our SAM data files */
|
||||
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
||||
// are we set to locus mode or read mode for dividing
|
||||
private boolean locusMode = false;
|
||||
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT;
|
||||
|
||||
// our list of readers
|
||||
private final List<File> samFileList = new ArrayList<File>();
|
||||
|
||||
/**
|
||||
* constructor, given a single sam file
|
||||
*
|
||||
* @param samFiles the list of sam files
|
||||
*/
|
||||
public SAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
|
||||
for (String fileName : samFiles) {
|
||||
File smFile = new File(fileName);
|
||||
if (!smFile.canRead()) {
|
||||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName);
|
||||
}
|
||||
samFileList.add(smFile);
|
||||
|
||||
}
|
||||
|
||||
//SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER);
|
||||
}
|
||||
|
||||
|
||||
protected SAMFileReader initializeSAMFile(final File samFile) {
|
||||
if (samFile.toString().endsWith(".list")) {
|
||||
return null;
|
||||
} else {
|
||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
logger.info(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
return samReader;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* seek
|
||||
* </p>
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public MergingSamRecordIterator2 seek(GenomeLoc location) throws SimpleDataSourceLoadException {
|
||||
|
||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
||||
for (File f : this.samFileList) {
|
||||
SAMFileReader reader = initializeSAMFile(f);
|
||||
if (reader == null) {
|
||||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + f);
|
||||
}
|
||||
lst.add(reader);
|
||||
}
|
||||
|
||||
// now merge the headers
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
|
||||
|
||||
// make a merging iterator for this record
|
||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
||||
|
||||
|
||||
// we do different things for locus and read modes
|
||||
if (locusMode) {
|
||||
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop());
|
||||
} else {
|
||||
iter.queryContained(location.getContig(), (int) location.getStart(), (int) location.getStop());
|
||||
}
|
||||
|
||||
// return the iterator
|
||||
return iter;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -7,7 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
|
||||
|
|
@ -68,10 +68,10 @@ public class MicroManager {
|
|||
SHARD_SIZE );
|
||||
|
||||
ReferenceIterator refIter = new ReferenceIterator(ref);
|
||||
SAMBAMDataSource dataSource = null;
|
||||
SAMDataSource dataSource = null;
|
||||
|
||||
try {
|
||||
dataSource = new SAMBAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
|
||||
dataSource = new SAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
|
||||
}
|
||||
catch( SimpleDataSourceLoadException ex ) {
|
||||
throw new RuntimeException( ex );
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
|
||||
|
||||
try {
|
||||
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||
SAMDataSource data = new SAMDataSource(fl);
|
||||
for (Shard sh : strat) {
|
||||
int readCount = 0;
|
||||
count++;
|
||||
|
|
@ -136,7 +136,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
int count = 0;
|
||||
|
||||
try {
|
||||
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||
SAMDataSource data = new SAMDataSource(fl);
|
||||
for (Shard sh : strat) {
|
||||
int readCount = 0;
|
||||
count++;
|
||||
|
|
@ -171,7 +171,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
|
||||
logger.debug("Pile two:");
|
||||
try {
|
||||
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||
SAMDataSource data = new SAMDataSource(fl);
|
||||
for (Shard sh : strat) {
|
||||
int readCount = 0;
|
||||
count++;
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMBAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
|
@ -83,7 +83,7 @@ public class BoundedReadIteratorTest extends BaseTest {
|
|||
long shardReadCount = 0;
|
||||
|
||||
try {
|
||||
SAMBAMDataSource data = new SAMBAMDataSource(fl);
|
||||
SAMDataSource data = new SAMDataSource(fl);
|
||||
|
||||
// make sure we have a shard
|
||||
if (!strat.hasNext()) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue