We're no longer in the read-dropping business.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@901 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4d880477d6
commit
109bef6c08
|
|
@ -56,16 +56,17 @@ public class IntervalShardStrategy implements ShardStrategy {
|
|||
*
|
||||
* @param size the next recommended shard size.
|
||||
*/
|
||||
public void adjustNextShardSize(long size) {
|
||||
public void adjustNextShardSize( long size ) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
/**
|
||||
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
||||
*
|
||||
* @param size
|
||||
* @param locations
|
||||
*/
|
||||
IntervalShardStrategy(long size, GenomeLocSortedSet locations) {
|
||||
IntervalShardStrategy( long size, GenomeLocSortedSet locations ) {
|
||||
if (locations == null || locations.isEmpty()) {
|
||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
|
||||
}
|
||||
|
|
@ -73,12 +74,12 @@ public class IntervalShardStrategy implements ShardStrategy {
|
|||
this.size = size;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* the default constructor
|
||||
*
|
||||
* @param dict the sequence dictionary to use
|
||||
* @param dict the sequence dictionary to use
|
||||
*/
|
||||
IntervalShardStrategy(SAMSequenceDictionary dict, GenomeLocSortedSet locations) {
|
||||
IntervalShardStrategy( SAMSequenceDictionary dict, GenomeLocSortedSet locations ) {
|
||||
if (locations == null || locations.isEmpty()) {
|
||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
|
||||
}
|
||||
|
|
@ -88,44 +89,39 @@ public class IntervalShardStrategy implements ShardStrategy {
|
|||
|
||||
/**
|
||||
* returns true if there are additional shards
|
||||
*
|
||||
* @return false if we're done processing shards
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return (!regions.isEmpty());
|
||||
return ( !regions.isEmpty() );
|
||||
}
|
||||
|
||||
/**
|
||||
* gets the next Shard
|
||||
*
|
||||
* @return the next shard
|
||||
*/
|
||||
public Shard next() {
|
||||
if ((this.regions == null) || (regions.isEmpty())) {
|
||||
if (( this.regions == null ) || ( regions.isEmpty() )) {
|
||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
|
||||
}
|
||||
|
||||
// get the first region in the list
|
||||
GenomeLoc loc = regions.iterator().next();
|
||||
|
||||
if (loc.getStop() - loc.getStart() <= this.size) {
|
||||
regions.removeRegion(loc);
|
||||
return new IntervalShard(loc);
|
||||
} else {
|
||||
GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1);
|
||||
regions.removeRegion(subLoc);
|
||||
return new IntervalShard(subLoc);
|
||||
}
|
||||
|
||||
regions.removeRegion(loc);
|
||||
return new IntervalShard(loc);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* we don't support the remove command
|
||||
*/
|
||||
/** we don't support the remove command */
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
||||
}
|
||||
|
||||
/**
|
||||
* makes the ReadIntervalShard iterable, i.e. usable in a for loop.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
|
|
|
|||
|
|
@ -5,13 +5,11 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -35,43 +33,53 @@ import java.util.List;
|
|||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class SAMDataSource implements SimpleDataSource {
|
||||
|
||||
|
||||
/** Backing support for reads. */
|
||||
private Reads reads = null;
|
||||
|
||||
/** our SAM data files */
|
||||
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
|
||||
|
||||
// our list of readers
|
||||
private final List<File> samFileList = new ArrayList<File>();
|
||||
|
||||
/** SAM header file. */
|
||||
private final SAMFileHeader header;
|
||||
|
||||
// used for the reads case, the last count of reads retrieved
|
||||
private long readsTaken = 0;
|
||||
long readsTaken = 0;
|
||||
|
||||
// our last genome loc position
|
||||
private GenomeLoc lastReadPos = null;
|
||||
GenomeLoc lastReadPos = null;
|
||||
|
||||
// do we take unmapped reads
|
||||
private boolean includeUnmappedReads = true;
|
||||
|
||||
// reads based traversal variables
|
||||
private boolean intoUnmappedReads = false;
|
||||
private int readsAtLastPos = 0;
|
||||
private int readsSeenAtLastPos = 0;
|
||||
|
||||
|
||||
/**
|
||||
* package protected getter and setter for the iterator generator
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
IteratorGenerator getIterGen() {
|
||||
return iterGen;
|
||||
}
|
||||
|
||||
void setIterGen( IteratorGenerator iterGen ) {
|
||||
this.iterGen = iterGen;
|
||||
}
|
||||
|
||||
// where we get out iterators from
|
||||
private IteratorGenerator iterGen;
|
||||
|
||||
/**
|
||||
* constructor, given sam files
|
||||
*
|
||||
* @param reads the list of sam files
|
||||
* @param byReads are we a by reads traversal, or a loci traversal. We could delete this field
|
||||
* if we passed in iterGen, which would be a better (although more complicated for the
|
||||
* consumers of SAMDataSources).
|
||||
*/
|
||||
public SAMDataSource(Reads reads) throws SimpleDataSourceLoadException {
|
||||
public SAMDataSource( Reads reads, boolean byReads ) throws SimpleDataSourceLoadException {
|
||||
this.reads = reads;
|
||||
|
||||
// check the length
|
||||
|
|
@ -82,32 +90,11 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
if (!smFile.canRead()) {
|
||||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
|
||||
}
|
||||
samFileList.add(smFile);
|
||||
|
||||
}
|
||||
iterGen = new MSR2IteratorGenerator(reads, byReads);
|
||||
|
||||
header = createHeaderMerger().getMergedHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a SAM/BAM, given an input file.
|
||||
*
|
||||
* @param samFile the file name
|
||||
* @return a SAMFileReader for the file, null if we're attempting to read a list
|
||||
*/
|
||||
private SAMFileReader initializeSAMFile(final File samFile) {
|
||||
if (samFile.toString().endsWith(".list")) {
|
||||
return null;
|
||||
} else {
|
||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
return samReader;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
|
|
@ -115,18 +102,12 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* </p>
|
||||
*
|
||||
* @param location the genome location to extract data for
|
||||
*
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
|
||||
|
||||
// right now this is very heavy, it copies the file list into a reader list every time
|
||||
SamFileHeaderMerger headerMerger = createHeaderMerger();
|
||||
|
||||
// make a merging iterator for this record
|
||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
||||
|
||||
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
|
||||
|
||||
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
|
||||
// make a merging iterator for this record
|
||||
CloseableIterator<SAMRecord> iter = iterGen.seek(location);
|
||||
// return the iterator
|
||||
return StingSAMIteratorAdapter.adapt(reads, iter);
|
||||
}
|
||||
|
|
@ -137,9 +118,10 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* </p>
|
||||
*
|
||||
* @param shard the shard to get data for
|
||||
*
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public StingSAMIterator seek(Shard shard) throws SimpleDataSourceLoadException {
|
||||
public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException {
|
||||
StingSAMIterator iterator = null;
|
||||
if (shard.getShardType() == Shard.ShardType.READ) {
|
||||
iterator = seekRead((ReadShard) shard);
|
||||
|
|
@ -149,8 +131,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
reads.getMaxOnTheFlySorts(),
|
||||
reads.getFilterZeroMappingQualityReads(),
|
||||
reads.getSafetyChecking());
|
||||
} else if (shard.getShardType() == Shard.ShardType.LOCUS ||
|
||||
shard.getShardType() == Shard.ShardType.INTERVAL) {
|
||||
} else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) {
|
||||
iterator = seekLocus(shard.getGenomeLoc());
|
||||
iterator = TraversalEngine.applyDecoratingIterators(false,
|
||||
iterator,
|
||||
|
|
@ -158,8 +139,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
reads.getMaxOnTheFlySorts(),
|
||||
reads.getFilterZeroMappingQualityReads(),
|
||||
reads.getSafetyChecking());
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
throw new StingException("seek: Unknown shard type");
|
||||
}
|
||||
|
||||
|
|
@ -173,17 +153,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* @return SAM file header.
|
||||
*/
|
||||
public SAMFileHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the merging header.
|
||||
*
|
||||
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
|
||||
*/
|
||||
private SamFileHeaderMerger createHeaderMerger() {
|
||||
List<SAMFileReader> lst = GetReaderList();
|
||||
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
|
||||
return this.iterGen.getHeader();
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -193,26 +163,33 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* </p>
|
||||
*
|
||||
* @param shard the read shard to extract from
|
||||
*
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
private BoundedReadIterator seekRead(ReadShard shard) throws SimpleDataSourceLoadException {
|
||||
private BoundedReadIterator seekRead( ReadShard shard ) throws SimpleDataSourceLoadException {
|
||||
|
||||
BoundedReadIterator bound = null;
|
||||
SamFileHeaderMerger headerMerger = createHeaderMerger();
|
||||
MergingSamRecordIterator2 iter = null;
|
||||
CloseableIterator<SAMRecord> iter = null;
|
||||
|
||||
if (!intoUnmappedReads) {
|
||||
iter = new MergingSamRecordIterator2(headerMerger);
|
||||
bound = fastMappedReadSeek(shard.getSize(), iter);
|
||||
if (lastReadPos == null) {
|
||||
lastReadPos = new GenomeLoc(iterGen.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
|
||||
iter = iterGen.seek(lastReadPos);
|
||||
return InitialReadIterator(shard.getSize(), iter);
|
||||
} else {
|
||||
lastReadPos.setStop(-1);
|
||||
iter = iterGen.seek(lastReadPos);
|
||||
bound = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iter));
|
||||
}
|
||||
}
|
||||
if ((bound == null || intoUnmappedReads) && includeUnmappedReads) {
|
||||
|
||||
if (( bound == null || intoUnmappedReads ) && includeUnmappedReads) {
|
||||
if (iter != null) {
|
||||
iter.close();
|
||||
}
|
||||
iter = new MergingSamRecordIterator2(createHeaderMerger());
|
||||
bound = toUnmappedReads(shard.getSize(), iter);
|
||||
iter = iterGen.seek(null);
|
||||
bound = toUnmappedReads(shard.getSize(), (PeekingStingIterator) iter);
|
||||
}
|
||||
|
||||
if (bound == null) {
|
||||
shard.signalDone();
|
||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0);
|
||||
|
|
@ -228,7 +205,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
*
|
||||
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
||||
*/
|
||||
public void viewUnmappedReads(boolean seeUnMappedReads) {
|
||||
public void viewUnmappedReads( boolean seeUnMappedReads ) {
|
||||
includeUnmappedReads = seeUnMappedReads;
|
||||
}
|
||||
|
||||
|
|
@ -238,19 +215,21 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
*
|
||||
* @param readCount how many reads to retrieve
|
||||
* @param iter the iterator to use
|
||||
*
|
||||
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||
* @throws SimpleDataSourceLoadException
|
||||
*/
|
||||
private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
||||
<U extends PeekingStingIterator> BoundedReadIterator toUnmappedReads( long readCount, U iter ) throws SimpleDataSourceLoadException {
|
||||
int count = 0;
|
||||
int cnt = 0;
|
||||
SAMRecord d = null;
|
||||
while (iter.hasNext()) {
|
||||
d = iter.peek();
|
||||
int x = d.getReferenceIndex();
|
||||
if (x < 0 || x >= d.getHeader().getSequenceDictionary().getSequences().size()) {
|
||||
if (x < 0)
|
||||
// we have the magic read that starts the unmapped read segment!
|
||||
break;
|
||||
}
|
||||
cnt++;
|
||||
iter.next();
|
||||
}
|
||||
|
||||
|
|
@ -262,6 +241,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
// now walk until we've taken the unmapped read count
|
||||
while (iter.hasNext() && count < this.readsTaken) {
|
||||
iter.next();
|
||||
count++;
|
||||
}
|
||||
|
||||
// check to see what happened, did we run out of reads?
|
||||
|
|
@ -280,108 +260,164 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* A seek function for unmapped reads.
|
||||
*
|
||||
* @param readCount how many reads to retrieve
|
||||
* @param iter the iterator to use
|
||||
* @param iter the iterator to use, seeked to the correct start location
|
||||
*
|
||||
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||
* @throws SimpleDataSourceLoadException
|
||||
*/
|
||||
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
||||
if (lastReadPos == null) {
|
||||
BoundedReadIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
|
||||
BoundedReadIterator bound;
|
||||
correctForReadPileupSeek(iter);
|
||||
if (readsTaken == 0) {
|
||||
return InitialReadIterator(readCount, iter);
|
||||
} else {
|
||||
BoundedReadIterator bound;
|
||||
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
|
||||
}
|
||||
int x = 0;
|
||||
SAMRecord rec = null;
|
||||
int lastPos = 0;
|
||||
|
||||
// move the number of reads we read from the last pos
|
||||
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
|
||||
while (iter.hasNext() && this.readsAtLastPos > 0) {
|
||||
iter.next();
|
||||
--readsAtLastPos;
|
||||
atLeastOneReadSeen = true;
|
||||
}
|
||||
if (readsAtLastPos != 0 && atLeastOneReadSeen) {
|
||||
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
||||
}
|
||||
|
||||
int x = 0;
|
||||
SAMRecord rec = null;
|
||||
int lastPos = 0;
|
||||
|
||||
while (x < readsTaken) {
|
||||
if (iter.hasNext()) {
|
||||
rec = iter.next();
|
||||
if (lastPos == rec.getAlignmentStart()) {
|
||||
++this.readsAtLastPos;
|
||||
} else {
|
||||
this.readsAtLastPos = 1;
|
||||
}
|
||||
lastPos = rec.getAlignmentStart();
|
||||
++x;
|
||||
while (x < readsTaken) {
|
||||
if (iter.hasNext()) {
|
||||
rec = iter.next();
|
||||
if (lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos;
|
||||
else this.readsSeenAtLastPos = 1;
|
||||
lastPos = rec.getAlignmentStart();
|
||||
++x;
|
||||
} else {
|
||||
// jump contigs
|
||||
if (lastReadPos.toNextContig() == false) {
|
||||
// check to see if we're using unmapped reads, if not return, we're done
|
||||
readsTaken = 0;
|
||||
intoUnmappedReads = true;
|
||||
return null;
|
||||
} else {
|
||||
// jump contigs
|
||||
if (lastReadPos.toNextContig() == false) {
|
||||
// check to see if we're using unmapped reads, if not return, we're done
|
||||
readsTaken = 0;
|
||||
intoUnmappedReads = true;
|
||||
return null;
|
||||
|
||||
} else {
|
||||
iter.close();
|
||||
// now merge the headers
|
||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||
SamFileHeaderMerger mg = createHeaderMerger();
|
||||
iter = new MergingSamRecordIterator2(mg);
|
||||
iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE);
|
||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||
}
|
||||
readsTaken = readCount;
|
||||
readsSeenAtLastPos = 0;
|
||||
lastReadPos.setStop(-1);
|
||||
CloseableIterator<SAMRecord> ret = iterGen.seek(lastReadPos);
|
||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// if we're off the end of the last contig (into unmapped territory)
|
||||
if (rec != null && rec.getAlignmentStart() == 0) {
|
||||
readsTaken += readCount;
|
||||
intoUnmappedReads = true;
|
||||
}
|
||||
// else we're not off the end, store our location
|
||||
else if (rec != null) {
|
||||
int stopPos = rec.getAlignmentStart();
|
||||
if (stopPos < lastReadPos.getStart()) {
|
||||
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
||||
} else {
|
||||
lastReadPos.setStop(rec.getAlignmentStart());
|
||||
}
|
||||
}
|
||||
// in case we're run out of reads, get out
|
||||
else {
|
||||
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
|
||||
//return null;
|
||||
}
|
||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||
|
||||
// return the iterator
|
||||
return bound;
|
||||
}
|
||||
|
||||
// if we're off the end of the last contig (into unmapped territory)
|
||||
if (rec != null && rec.getAlignmentStart() == 0) {
|
||||
readsTaken += readCount;
|
||||
intoUnmappedReads = true;
|
||||
}
|
||||
// else we're not off the end, store our location
|
||||
else if (rec != null) {
|
||||
int stopPos = rec.getAlignmentStart();
|
||||
if (stopPos < lastReadPos.getStart()) {
|
||||
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
||||
} else {
|
||||
lastReadPos.setStart(rec.getAlignmentStart());
|
||||
}
|
||||
}
|
||||
// in case we're run out of reads, get out
|
||||
else {
|
||||
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
|
||||
//return null;
|
||||
}
|
||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||
|
||||
|
||||
// return the iterator
|
||||
return bound;
|
||||
}
|
||||
|
||||
/**
|
||||
* Even though the iterator has seeked to the correct location, there may be multiple reads at that location,
|
||||
* and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable
|
||||
*
|
||||
* @param iter the iterator
|
||||
*/
|
||||
private void correctForReadPileupSeek( StingSAMIterator iter ) {
|
||||
// move the number of reads we read from the last pos
|
||||
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
|
||||
while (iter.hasNext() && this.readsSeenAtLastPos > 0) {
|
||||
iter.next();
|
||||
--readsSeenAtLastPos;
|
||||
atLeastOneReadSeen = true;
|
||||
}
|
||||
if (readsSeenAtLastPos != 0 && atLeastOneReadSeen) {
|
||||
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* set the initial iterator
|
||||
*
|
||||
* @param readCount the number of reads
|
||||
* @param iter the merging iterator
|
||||
*
|
||||
* @return a bounded read iterator at the first read position in the file.
|
||||
*/
|
||||
private BoundedReadIterator InitialReadIterator(long readCount, MergingSamRecordIterator2 iter) {
|
||||
private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator<SAMRecord> iter ) {
|
||||
BoundedReadIterator bound;
|
||||
lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
|
||||
iter.queryContained(lastReadPos.getContig(), 1, -1);
|
||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||
this.readsTaken = readCount;
|
||||
return bound;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* iterator generator
|
||||
* <p/>
|
||||
* This class generates iterators for the SAMDataSource. This class was introduced for testing purposes,
|
||||
* since it became increasingly hard to test the SAM data source code. The class defines two abstraact
|
||||
* methods:
|
||||
* <p/>
|
||||
* -seek( GenomeLoc ) which returns an iterator seeked to the genome loc, and if null is passed to the default
|
||||
* location (which is implementation specific).
|
||||
* <p/>
|
||||
* -getHeader(), which returns a SAMFileHeader for the specified IteratorGenerator. I hope we can phase this
|
||||
* method out, since it doesn't seem necessary, and it would be much cleaner with out it.
|
||||
*/
|
||||
abstract class IteratorGenerator {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
||||
/**
|
||||
* seek to a location
|
||||
*
|
||||
* @param seekTo the genome loc to seek to
|
||||
*
|
||||
* @return StingSAMIterator
|
||||
*/
|
||||
public abstract CloseableIterator<SAMRecord> seek( GenomeLoc seekTo );
|
||||
|
||||
/**
|
||||
* get the merged header
|
||||
*
|
||||
* @return the merged header
|
||||
*/
|
||||
public abstract SAMFileHeader getHeader();
|
||||
|
||||
/**
|
||||
* Load a SAM/BAM, given an input file.
|
||||
*
|
||||
* @param samFile the file name
|
||||
*
|
||||
* @return a SAMFileReader for the file, null if we're attempting to read a list
|
||||
*/
|
||||
protected static SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) {
|
||||
if (samFile.toString().endsWith(".list")) {
|
||||
return null;
|
||||
} else {
|
||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
return samReader;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A private function that, given the internal file list, generates a SamFileReader
|
||||
* list of validated files.
|
||||
|
|
@ -389,11 +425,11 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* @return a list of SAMFileReaders that represent the stored file names
|
||||
* @throws SimpleDataSourceLoadException if there's a problem loading the files
|
||||
*/
|
||||
private List<SAMFileReader> GetReaderList() throws SimpleDataSourceLoadException {
|
||||
protected static List<SAMFileReader> GetReaderList( Reads reads, SAMFileReader.ValidationStringency strictness ) throws SimpleDataSourceLoadException {
|
||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
||||
for (File f : this.samFileList) {
|
||||
SAMFileReader reader = initializeSAMFile(f);
|
||||
for (File f : reads.getReadsFiles()) {
|
||||
SAMFileReader reader = initializeSAMFile(f, strictness);
|
||||
|
||||
if (reader.getFileHeader().getReadGroups().size() < 1) {
|
||||
//logger.warn("Setting header in reader " + f.getName());
|
||||
|
|
@ -412,4 +448,65 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
return lst;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the merging header.
|
||||
*
|
||||
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
|
||||
*/
|
||||
protected static SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileReader.ValidationStringency strictness, SAMFileHeader.SortOrder SORT_ORDER ) {
|
||||
List<SAMFileReader> lst = GetReaderList(reads, strictness);
|
||||
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* MSR2IteratorGenerator
|
||||
* <p/>
|
||||
* generates a MerginsSAMIterator2, given a genomic location. The constructor takes the reads structure,
|
||||
* and a flag indicating if we're dealing with reads or loci (to determine the correct query function).
|
||||
*/
|
||||
class MSR2IteratorGenerator extends IteratorGenerator {
|
||||
/** our read pile */
|
||||
private Reads reads;
|
||||
|
||||
private SamFileHeaderMerger header;
|
||||
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
|
||||
|
||||
// are we by reads or by loci
|
||||
protected boolean byReads = true;
|
||||
|
||||
/** our SAM data files */
|
||||
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
public MSR2IteratorGenerator( Reads reads, boolean byReads ) {
|
||||
this.reads = reads;
|
||||
this.header = IteratorGenerator.createHeaderMerger(reads, strictness, sortOrder);
|
||||
this.byReads = byReads;
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
|
||||
SamFileHeaderMerger mg = createHeaderMerger(reads, strictness, sortOrder);
|
||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(mg, reads);
|
||||
if (seekTo != null) {
|
||||
if (byReads)
|
||||
iter.queryContained(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
|
||||
else
|
||||
iter.queryOverlapping(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
|
||||
}
|
||||
return iter;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the merged header
|
||||
*
|
||||
* @return the merged header
|
||||
*/
|
||||
public SAMFileHeader getHeader() {
|
||||
return header.getMergedHeader();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -75,11 +75,13 @@ public abstract class MicroScheduler {
|
|||
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||
if (walker instanceof ReadWalker) {
|
||||
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
|
||||
this.reads = getReadsDataSource(reads,true);
|
||||
} else {
|
||||
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
|
||||
this.reads = getReadsDataSource(reads,false);
|
||||
}
|
||||
|
||||
this.reads = getReadsDataSource(reads);
|
||||
|
||||
this.reference = openReferenceSequenceFile(refFile);
|
||||
this.rods = getReferenceOrderedDataSources(rods);
|
||||
}
|
||||
|
|
@ -159,12 +161,12 @@ public abstract class MicroScheduler {
|
|||
* Gets a data source for the given set of reads.
|
||||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource getReadsDataSource(Reads reads) {
|
||||
private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
|
||||
// By reference traversals are happy with no reads. Make sure that case is handled.
|
||||
if (reads.getReadsFiles().size() == 0)
|
||||
return null;
|
||||
|
||||
SAMDataSource dataSource = new SAMDataSource(reads);
|
||||
SAMDataSource dataSource = new SAMDataSource(reads, byReads);
|
||||
|
||||
// Side effect: initialize the traversal engine with reads data.
|
||||
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler
|
||||
|
|
|
|||
|
|
@ -7,20 +7,29 @@ import java.util.Iterator;
|
|||
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
|
||||
/**
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 14, 2009
|
||||
* Time: 5:27:31 PM
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ import java.util.PriorityQueue;
|
|||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||
* the requested output format is unsorted, in which case any combination is valid.
|
||||
*/
|
||||
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
|
||||
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, PeekingStingIterator {
|
||||
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
|
||||
protected final SamFileHeaderMerger samHeaderMerger;
|
||||
protected final SAMFileHeader.SortOrder sortOrder;
|
||||
|
|
@ -43,7 +43,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
* provided by the header merger parameter.
|
||||
*/
|
||||
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
|
||||
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) {
|
||||
this.samHeaderMerger = headerMerger;
|
||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
|
||||
|
|
@ -275,6 +275,16 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets source information for the reads. Contains information about the original reads
|
||||
* files, plus information about downsampling, etc.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Reads getSourceInfo() {
|
||||
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
}
|
||||
|
||||
// Should replace picard class with the same name
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Interface Peekable
|
||||
* <p/>
|
||||
* This interface indicates that
|
||||
*/
|
||||
public interface PeekingStingIterator extends StingSAMIterator {
|
||||
/**
|
||||
* peek, given the specified type
|
||||
* @return
|
||||
*/
|
||||
SAMRecord peek();
|
||||
}
|
||||
|
|
@ -337,34 +337,7 @@ public abstract class TraversalEngine {
|
|||
return true;
|
||||
}
|
||||
|
||||
protected Iterator<SAMRecord> initializeReads() {
|
||||
if( readsFiles.size() == 1 )
|
||||
samReader = initializeSAMFile(readsFiles.get(0));
|
||||
else
|
||||
samReader = null;
|
||||
return wrapReadsIterator(getReadsIterator(samReader), true);
|
||||
}
|
||||
|
||||
protected Iterator<SAMRecord> getReadsIterator(final SAMFileReader samReader) {
|
||||
// If the file has an index, querying functions are available. Use them if possible...
|
||||
if ( samReader == null && readsFiles.size() > 0 ) {
|
||||
SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
||||
|
||||
for ( File readsFile: readsFiles ) {
|
||||
SAMFileReader reader = initializeSAMFile(readsFile);
|
||||
readers.add(reader);
|
||||
}
|
||||
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
|
||||
return new MergingSamRecordIterator2(headerMerger);
|
||||
}
|
||||
else {
|
||||
return samReader.iterator();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Deprecated
|
||||
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
|
||||
// Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to
|
||||
|
|
|
|||
|
|
@ -246,39 +246,7 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
return sum;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// old style interface to the system
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Override
|
||||
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
|
||||
if ( walker instanceof DuplicateWalker) {
|
||||
Walker x = walker;
|
||||
DuplicateWalker<?, ?> dupWalker = (DuplicateWalker<?, ?>)x;
|
||||
return (T)this.traverseByRead(dupWalker, locations);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we deleted at the soonist possible opportunity
|
||||
*/
|
||||
public <M, T> Object traverseByRead(DuplicateWalker<M, T> walker, List<GenomeLoc> locations) {
|
||||
samReadIter = initializeReads();
|
||||
|
||||
// Initialize the walker
|
||||
walker.initialize();
|
||||
|
||||
// Initialize the sum
|
||||
FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc());
|
||||
T sum = actuallyTraverse(walker, filterIter, walker.reduceInit());
|
||||
|
||||
//printOnTraversalDone("reads", sum);
|
||||
walker.onTraversalDone(sum);
|
||||
return sum;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
|||
|
|
@ -0,0 +1,165 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/** this fake iterator allows us to look at how specific piles of reads are handled */
|
||||
public class ArtificialSAMIterator implements PeekingStingIterator {
|
||||
|
||||
|
||||
protected int currentChromo = 0;
|
||||
protected int currentRead = 0;
|
||||
protected int totalReadCount = 0;
|
||||
protected boolean done = false;
|
||||
// the next record
|
||||
protected SAMRecord next = null;
|
||||
protected SAMFileHeader header = null;
|
||||
|
||||
// the passed in parameters
|
||||
protected final int sChr;
|
||||
protected final int eChromosomeCount;
|
||||
protected final int rCount;
|
||||
protected int uMappedReadCount;
|
||||
|
||||
// let us know to make a read, we need this to help out the fake sam query iterator
|
||||
private boolean initialized = false;
|
||||
|
||||
/**
|
||||
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||
*
|
||||
* @param startingChr the starting chromosome
|
||||
* @param endingChr the ending chromosome
|
||||
* @param readCount the number of reads in each chromosome
|
||||
* @param header the associated header
|
||||
*/
|
||||
ArtificialSAMIterator( int startingChr, int endingChr, int readCount, SAMFileHeader header ) {
|
||||
sChr = startingChr;
|
||||
eChromosomeCount = (endingChr - startingChr) + 1;
|
||||
rCount = readCount;
|
||||
this.header = header;
|
||||
this.currentChromo = 0;
|
||||
uMappedReadCount = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||
*
|
||||
* @param startingChr the starting chromosome
|
||||
* @param endingChr the ending chromosome
|
||||
* @param readCount the number of reads in each chromosome
|
||||
* @param header the associated header
|
||||
*/
|
||||
ArtificialSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) {
|
||||
sChr = startingChr;
|
||||
eChromosomeCount = (endingChr - startingChr) + 1;
|
||||
rCount = readCount;
|
||||
this.header = header;
|
||||
this.currentChromo = 0;
|
||||
this.uMappedReadCount = unmappedReadCount;
|
||||
}
|
||||
|
||||
|
||||
public Reads getSourceInfo() {
|
||||
throw new UnsupportedOperationException("We don't support this");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
// done
|
||||
currentChromo = Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if (!initialized){
|
||||
initialized = true;
|
||||
createNextRead();
|
||||
}
|
||||
if (this.next != null) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean createNextRead() {
|
||||
if (currentRead >= rCount) {
|
||||
currentChromo++;
|
||||
currentRead = 0;
|
||||
}
|
||||
// check for end condition, have we finished the chromosome listing, and have no unmapped reads
|
||||
if (currentChromo >= eChromosomeCount) {
|
||||
if (uMappedReadCount < 1) {
|
||||
this.next = null;
|
||||
return false;
|
||||
} else {
|
||||
++totalReadCount;
|
||||
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), -1, -1, 50);
|
||||
--uMappedReadCount;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
++totalReadCount;
|
||||
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), currentChromo, currentRead, 50);
|
||||
++currentRead;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public SAMRecord next() {
|
||||
SAMRecord ret = next;
|
||||
createNextRead();
|
||||
return ret;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
|
||||
}
|
||||
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* some instrumentation methods
|
||||
*/
|
||||
public int readsTaken() {
|
||||
return totalReadCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* peek at the next sam record
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public SAMRecord peek() {
|
||||
return this.next;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class ArtificialSAMQueryIterator
|
||||
* <p/>
|
||||
* A descriptions should go here. Blame aaron if it's missing.
|
||||
*/
|
||||
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
|
||||
|
||||
// get the next positon
|
||||
protected int finalPos = 0;
|
||||
protected int startPos = 0;
|
||||
protected int contigIndex = -1;
|
||||
protected boolean overlapping = false;
|
||||
protected int startingChr = 0;
|
||||
protected boolean seeked = false;
|
||||
|
||||
/**
|
||||
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||
*
|
||||
* @param startingChr the starting chromosome
|
||||
* @param endingChr the ending chromosome
|
||||
* @param readCount the number of reads in each chromosome
|
||||
* @param header the associated header
|
||||
*/
|
||||
ArtificialSAMQueryIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) {
|
||||
super(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||
this.startingChr = startingChr;
|
||||
}
|
||||
|
||||
/**
|
||||
* query containing - get reads contained by the specified interval
|
||||
*
|
||||
* @param contig the contig index string
|
||||
* @param start the start position
|
||||
* @param stop the stop position
|
||||
*/
|
||||
public void queryContained( String contig, int start, int stop ) {
|
||||
this.overlapping = false;
|
||||
initialize(contig, start, stop);
|
||||
}
|
||||
|
||||
/**
|
||||
* query containing - get reads contained by the specified interval
|
||||
*
|
||||
* @param contig the contig index string
|
||||
* @param start the start position
|
||||
* @param stop the stop position
|
||||
*/
|
||||
public void queryOverlapping( String contig, int start, int stop ) {
|
||||
this.overlapping = true;
|
||||
initialize(contig, start, stop);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* initialize the query iterator
|
||||
*
|
||||
* @param contig the contig
|
||||
* @param start the start position
|
||||
* @param stop the stop postition
|
||||
*/
|
||||
private void initialize( String contig, int start, int stop ) {
|
||||
ensureUntouched();
|
||||
finalPos = stop;
|
||||
startPos = start;
|
||||
if (finalPos < 0) {
|
||||
finalPos = Integer.MAX_VALUE;
|
||||
}
|
||||
// sanity check that we have the contig
|
||||
contigIndex = -1;
|
||||
List<SAMSequenceRecord> list = header.getSequenceDictionary().getSequences();
|
||||
for (SAMSequenceRecord rec : list) {
|
||||
if (rec.getSequenceName().equals(contig)) {
|
||||
contigIndex = rec.getSequenceIndex();
|
||||
}
|
||||
}
|
||||
if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); }
|
||||
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
|
||||
super.next();
|
||||
}
|
||||
if (!super.hasNext()) {
|
||||
throw new StingException("Unable to find the target chromosome");
|
||||
}
|
||||
while (super.hasNext() && this.peek().getAlignmentStart() < start) {
|
||||
super.next();
|
||||
}
|
||||
// sanity check that we have an actual matching read next
|
||||
SAMRecord rec = this.peek();
|
||||
if (!matches(rec)) {
|
||||
throw new StingException("The next read doesn't match");
|
||||
}
|
||||
// set the seeked variable to true
|
||||
seeked = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* given a read and the query type, check if it matches our regions
|
||||
*
|
||||
* @param rec the read
|
||||
*
|
||||
* @return true if it belongs in our region
|
||||
*/
|
||||
public boolean matches( SAMRecord rec ) {
|
||||
if (rec.getReferenceIndex() != this.contigIndex) {
|
||||
return false;
|
||||
}
|
||||
if (!overlapping) {
|
||||
// if the start or the end are somewhere within our range
|
||||
if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (( rec.getAlignmentStart() <= finalPos && rec.getAlignmentStart() >= startPos ) ||
|
||||
( rec.getAlignmentEnd() <= finalPos && rec.getAlignmentEnd() >= startPos )) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* override the hasNext, to incorportate our limiting factor
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
boolean res = super.hasNext();
|
||||
if (!seeked) {
|
||||
return res;
|
||||
}
|
||||
if (res && matches(this.next)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** make sure we haven't been used as an iterator yet; this is to miror the MergingSamIterator2 action. */
|
||||
public void ensureUntouched() {
|
||||
if (this.currentChromo != 0 || this.currentRead > 0) {
|
||||
throw new UnsupportedOperationException("We've already been used as an iterator; you can't query after that");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -5,7 +5,7 @@ import net.sf.samtools.*;
|
|||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
|
||||
/**
|
||||
|
|
@ -28,11 +28,6 @@ import org.broadinstitute.sting.gatk.Reads;
|
|||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date May 21, 2009
|
||||
* <p/>
|
||||
* Class samUtils
|
||||
* <p/>
|
||||
* A collection of utilities for making sam and bam files. Mostly these are for creating sam and bam files for testing purposes.
|
||||
*/
|
||||
public class ArtificialSamUtils {
|
||||
public static final int DEFAULT_READ_LENGTH = 50;
|
||||
|
|
@ -46,7 +41,7 @@ public class ArtificialSamUtils {
|
|||
* @param chromosomeSize how large each chromosome is
|
||||
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
|
||||
*/
|
||||
public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) {
|
||||
public static void createArtificialBamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
||||
File outFile = new File(filename);
|
||||
|
||||
|
|
@ -70,7 +65,7 @@ public class ArtificialSamUtils {
|
|||
* @param chromosomeSize how large each chromosome is
|
||||
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
|
||||
*/
|
||||
public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) {
|
||||
public static void createArtificialSamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
||||
File outFile = new File(filename);
|
||||
|
||||
|
|
@ -91,14 +86,15 @@ public class ArtificialSamUtils {
|
|||
* @param numberOfChromosomes the number of chromosomes to create
|
||||
* @param startingChromosome the starting number for the chromosome (most likely set to 1)
|
||||
* @param chromosomeSize the length of each chromosome
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) {
|
||||
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
// make up some sequence records
|
||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */);
|
||||
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + ( x ), chromosomeSize /* size */);
|
||||
rec.setSequenceLength(chromosomeSize);
|
||||
dict.addSequence(rec);
|
||||
}
|
||||
|
|
@ -114,9 +110,10 @@ public class ArtificialSamUtils {
|
|||
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
||||
* @param alignmentStart where to start the alignment
|
||||
* @param length the length of the read
|
||||
*
|
||||
* @return the artificial read
|
||||
*/
|
||||
public static SAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
|
||||
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||
SAMRecord record = new SAMRecord(header);
|
||||
record.setReadName(name);
|
||||
record.setReferenceIndex(refIndex);
|
||||
|
|
@ -131,95 +128,64 @@ public class ArtificialSamUtils {
|
|||
|
||||
/**
|
||||
* create an iterator containing the specified read piles
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
*/
|
||||
public static StingSAMIterator unmappedReadIterator(int startingChr, int endingChr, int readCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
Map<Integer, Integer> map = new HashMap<Integer, Integer>();
|
||||
for (int x = startingChr; x < endingChr; x++) {
|
||||
map.put(x,readCount);
|
||||
}
|
||||
return new ArtificialSAMIterator(startingChr, endingChr, readCount,header);
|
||||
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
return new ArtificialSAMIterator(startingChr, endingChr, readCount, header);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* this fake iterator allows us to look at how specific piles of reads are handled
|
||||
*/
|
||||
class ArtificialSAMIterator implements StingSAMIterator {
|
||||
|
||||
|
||||
private int currentChromo = 0;
|
||||
private int currentRead = 0;
|
||||
private int readCount = 0;
|
||||
private boolean done = false;
|
||||
// the next record
|
||||
private SAMRecord next = null;
|
||||
SAMFileHeader header = null;
|
||||
|
||||
// the passed in parameters
|
||||
final int sChr;
|
||||
final int eChr;
|
||||
final int rCount;
|
||||
|
||||
/**
|
||||
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||
* @param startingChr the starting chromosome
|
||||
* @param endingChr the ending chromosome
|
||||
* @param readCount the number of reads in each chromosome
|
||||
* @param header the associated header
|
||||
* create an iterator containing the specified read piles
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
*/
|
||||
ArtificialSAMIterator(int startingChr, int endingChr, int readCount, SAMFileHeader header) {
|
||||
sChr = startingChr;
|
||||
eChr = endingChr;
|
||||
rCount = readCount;
|
||||
this.header = header;
|
||||
this.currentChromo = startingChr;
|
||||
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||
}
|
||||
|
||||
public Reads getSourceInfo() {
|
||||
throw new UnsupportedOperationException("We don't support this");
|
||||
/**
|
||||
* create an ArtificialSAMQueryIterator containing the specified read piles
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
*/
|
||||
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
// done
|
||||
currentChromo = Integer.MAX_VALUE;
|
||||
}
|
||||
/**
|
||||
* create an ArtificialSAMQueryIterator containing the specified read piles
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
*/
|
||||
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
public boolean hasNext() {
|
||||
if (currentRead >= rCount) {
|
||||
currentChromo++;
|
||||
currentRead = 0;
|
||||
}
|
||||
if (currentChromo >= eChr) {
|
||||
return false;
|
||||
}
|
||||
int alignment = 0;
|
||||
if (currentChromo >= 0) {
|
||||
alignment = currentRead;
|
||||
} else {
|
||||
alignment = 0;
|
||||
}
|
||||
|
||||
this.next = ArtificialSamUtils.createArtificialRead(this.header,String.valueOf(readCount), currentChromo, alignment, 50);
|
||||
currentRead++;
|
||||
return true;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
return next;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
|
||||
}
|
||||
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -43,6 +43,12 @@ import net.sf.samtools.SAMFileHeader;
|
|||
* Class ReadIntervalShardStrategyTest
|
||||
* <p/>
|
||||
* Tests the ReadIntervalShardStrategy class
|
||||
*
|
||||
*
|
||||
* TODO: this test has been changed, since we now force interval shards to not subdivide if they're large,
|
||||
* so you should always get one shard back, reguardless of the specified length. If this behavior changes
|
||||
* back in the future, the expected number of shards should change from 1 to > 1.
|
||||
*
|
||||
*/
|
||||
public class IntervalShardStrategyTest extends BaseTest {
|
||||
|
||||
|
|
@ -75,7 +81,7 @@ public class IntervalShardStrategyTest extends BaseTest {
|
|||
counter++;
|
||||
}
|
||||
assertTrue(d instanceof IntervalShard);
|
||||
assertEquals(10, counter);
|
||||
assertEquals(1, counter);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -92,7 +98,7 @@ public class IntervalShardStrategyTest extends BaseTest {
|
|||
counter++;
|
||||
}
|
||||
assertTrue(d instanceof IntervalShard);
|
||||
assertEquals(50, counter);
|
||||
assertEquals(5, counter);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -105,16 +111,11 @@ public class IntervalShardStrategyTest extends BaseTest {
|
|||
int counter = 0;
|
||||
while (strat.hasNext()) {
|
||||
Shard d = strat.next();
|
||||
if (counter % 2 == 0) {
|
||||
assertEquals(1, d.getGenomeLoc().getStart());
|
||||
assertEquals(789, d.getGenomeLoc().getStop());
|
||||
} else {
|
||||
assertEquals(790, d.getGenomeLoc().getStart());
|
||||
assertEquals(1000, d.getGenomeLoc().getStop());
|
||||
}
|
||||
counter++;
|
||||
}
|
||||
assertEquals(10, counter);
|
||||
assertEquals(5, counter);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
Reads reads = new Reads(fl);
|
||||
|
||||
try {
|
||||
SAMDataSource data = new SAMDataSource(reads);
|
||||
SAMDataSource data = new SAMDataSource(reads,false);
|
||||
for (Shard sh : strat) {
|
||||
int readCount = 0;
|
||||
count++;
|
||||
|
|
@ -139,7 +139,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
int count = 0;
|
||||
|
||||
try {
|
||||
SAMDataSource data = new SAMDataSource(reads);
|
||||
SAMDataSource data = new SAMDataSource(reads,false);
|
||||
for (Shard sh : strat) {
|
||||
int readCount = 0;
|
||||
count++;
|
||||
|
|
@ -176,7 +176,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
|
||||
logger.debug("Pile two:");
|
||||
try {
|
||||
SAMDataSource data = new SAMDataSource(reads);
|
||||
SAMDataSource data = new SAMDataSource(reads,false);
|
||||
for (Shard sh : strat) {
|
||||
int readCount = 0;
|
||||
count++;
|
||||
|
|
|
|||
|
|
@ -2,15 +2,20 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
|||
|
||||
import static junit.framework.Assert.fail;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
|
|
@ -38,16 +43,14 @@ import java.util.List;
|
|||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 15, 2009
|
||||
* <p/>
|
||||
* Class SAMByReadsTest
|
||||
* <p/>
|
||||
* Test sam traversal by reads FIX ME
|
||||
*/
|
||||
public class SAMByReadsTest extends BaseTest {
|
||||
|
||||
private FastaSequenceFile2 seq;
|
||||
|
||||
private List<File> fl;
|
||||
ShardStrategy shardStrategy;
|
||||
Reads reads;
|
||||
private int targetReadCount = 14;
|
||||
|
||||
/**
|
||||
* This function does the setup of our parser, before each method call.
|
||||
|
|
@ -59,56 +62,42 @@ public class SAMByReadsTest extends BaseTest {
|
|||
fl = new ArrayList<File>();
|
||||
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
|
||||
GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||
//seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
|
||||
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||
|
||||
// setup the test files
|
||||
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
|
||||
reads = new Reads(fl);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** Test out that we can shard the file and iterate over every read */
|
||||
@Test
|
||||
public void testTotalReadCount() {
|
||||
logger.warn("Executing testTotalReadCount");
|
||||
|
||||
// setup the test files
|
||||
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
|
||||
Reads reads = new Reads(fl);
|
||||
|
||||
final int targetReadCount = 5000;
|
||||
|
||||
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,seq.getSequenceDictionary(),targetReadCount);
|
||||
|
||||
public void testToUnmappedReads() {
|
||||
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
|
||||
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
|
||||
try {
|
||||
SAMDataSource data = new SAMDataSource(reads);
|
||||
|
||||
// check the total read count
|
||||
final int totalReads = 10000;
|
||||
|
||||
int readsSeen = 0;
|
||||
BoundedReadIterator iter;
|
||||
|
||||
|
||||
for (Shard sd : shardStrategy) {
|
||||
int readcnt = 0;
|
||||
|
||||
iter = (BoundedReadIterator)data.seek(sd);
|
||||
|
||||
for (SAMRecord r : iter) {
|
||||
|
||||
readcnt++;
|
||||
|
||||
int unmappedReadsSeen = 0;
|
||||
int iterations = 0;
|
||||
SAMDataSource data = new SAMDataSource(reads,true);
|
||||
for (int x = 0; x < 10; x++) {
|
||||
++iterations;
|
||||
PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10, 1000);
|
||||
BoundedReadIterator ret = data.toUnmappedReads(100, iter);
|
||||
// count the reads we've gotten back
|
||||
if (ret == null) {
|
||||
fail("On iteration " + iterations + " we were returned a null pointer, after seeing " + unmappedReadsSeen + " reads out of a 1000");
|
||||
}
|
||||
while (ret.hasNext()) {
|
||||
ret.next();
|
||||
unmappedReadsSeen++;
|
||||
}
|
||||
|
||||
|
||||
readsSeen += readcnt;
|
||||
//logger.warn("Seen " + readsSeen + " reads.");
|
||||
|
||||
}
|
||||
// make sure we've seen all the reads
|
||||
assertEquals(totalReads,readsSeen);
|
||||
logger.warn("Success " + readsSeen + " equals target count of " + totalReads);
|
||||
|
||||
assertEquals(1000,unmappedReadsSeen);
|
||||
}
|
||||
|
||||
|
||||
catch (SimpleDataSourceLoadException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
|
||||
|
|
@ -116,4 +105,145 @@ public class SAMByReadsTest extends BaseTest {
|
|||
|
||||
|
||||
}
|
||||
|
||||
/** Test out that we can shard the file and iterate over every read */
|
||||
@Test
|
||||
public void testShardingOfReadsSize14() {
|
||||
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
|
||||
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
|
||||
targetReadCount = 14;
|
||||
try {
|
||||
int iterations = 0;
|
||||
int readCount = 0;
|
||||
SAMDataSource data = new SAMDataSource(reads,true);
|
||||
|
||||
|
||||
data.setIterGen(gen);
|
||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
||||
while (shardStrategy.hasNext()) {
|
||||
|
||||
|
||||
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
|
||||
assertTrue(ret != null);
|
||||
while (ret.hasNext()) {
|
||||
ret.next();
|
||||
readCount++;
|
||||
}
|
||||
ret.close();
|
||||
iterations++;
|
||||
}
|
||||
|
||||
// assert that we saw 2000 reads
|
||||
assertEquals(2000,readCount);
|
||||
|
||||
/**
|
||||
* this next assertion is based on the following logic:
|
||||
* 14 reads per shard = 8 shards per each 100 read chromosome
|
||||
* 10 chromosomes = 8 * 10 = 80
|
||||
* 1000 unmapped reads / 14 = 72
|
||||
* 1 iteration at the end to know we're done
|
||||
* 80 + 72 + 1 = 153
|
||||
*/
|
||||
assertEquals(153,iterations);
|
||||
|
||||
}
|
||||
|
||||
catch (SimpleDataSourceLoadException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/** Test out that we can shard the file and iterate over every read */
|
||||
@Test
|
||||
public void testShardingOfReadsSize25() {
|
||||
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
|
||||
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
|
||||
targetReadCount = 25;
|
||||
try {
|
||||
int iterations = 0;
|
||||
int readCount = 0;
|
||||
SAMDataSource data = new SAMDataSource(reads,true);
|
||||
|
||||
|
||||
data.setIterGen(gen);
|
||||
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
||||
while (shardStrategy.hasNext()) {
|
||||
|
||||
|
||||
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
|
||||
assertTrue(ret != null);
|
||||
while (ret.hasNext()) {
|
||||
ret.next();
|
||||
readCount++;
|
||||
}
|
||||
ret.close();
|
||||
iterations++;
|
||||
}
|
||||
|
||||
// assert that we saw 2000 reads
|
||||
assertEquals(2000,readCount);
|
||||
|
||||
/**
|
||||
* this next assertion is based on the following logic:
|
||||
* 25 reads per shard = 5 shards (1 on the end to realize we're done)
|
||||
* 10 chromosomes = 5 * 10 = 50
|
||||
* 1000 unmapped reads / 25 = 40
|
||||
* 1 iteration at the end to know we're done
|
||||
* 50 + 40 + 1 = 91
|
||||
*/
|
||||
assertEquals(91,iterations);
|
||||
|
||||
}
|
||||
|
||||
catch (SimpleDataSourceLoadException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* use this to inject into SAMDataSource for testing
|
||||
*/
|
||||
class ArtificialIteratorGenerator extends IteratorGenerator {
|
||||
// How strict should we be with SAM/BAM parsing?
|
||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
|
||||
|
||||
// the header
|
||||
private SAMFileHeader header;
|
||||
private int endingChr;
|
||||
private int startingChr;
|
||||
private int readCount;
|
||||
private int readSize;
|
||||
/** our SAM data files */
|
||||
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||
|
||||
public ArtificialIteratorGenerator( int startingChr, int endingChr, int readCount, int readSize) {
|
||||
header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + readSize);
|
||||
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 10, 100, 1000);
|
||||
if (seekTo != null) {
|
||||
iter.queryContained(seekTo.getContig(), (int)seekTo.getStart(), (int)seekTo.getStop());
|
||||
}
|
||||
return iter;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the merged header
|
||||
*
|
||||
* @return the merged header
|
||||
*/
|
||||
public SAMFileHeader getHeader() {
|
||||
return this.header;
|
||||
}
|
||||
}
|
||||
|
|
@ -85,7 +85,7 @@ public class BoundedReadIteratorTest extends BaseTest {
|
|||
long shardReadCount = 0;
|
||||
|
||||
try {
|
||||
SAMDataSource data = new SAMDataSource(reads);
|
||||
SAMDataSource data = new SAMDataSource(reads,true);
|
||||
|
||||
// make sure we have a shard
|
||||
if (!strat.hasNext()) {
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ public class TraverseReadsTest extends BaseTest {
|
|||
ref.getSequenceDictionary(),
|
||||
readSize);
|
||||
|
||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
|
||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true);
|
||||
dataSource.viewUnmappedReads(false);
|
||||
|
||||
countReadWalker.initialize();
|
||||
|
|
@ -170,7 +170,7 @@ public class TraverseReadsTest extends BaseTest {
|
|||
ref.getSequenceDictionary(),
|
||||
readSize);
|
||||
|
||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
|
||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true);
|
||||
dataSource.viewUnmappedReads(true);
|
||||
|
||||
countReadWalker.initialize();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,115 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import static junit.framework.Assert.assertTrue;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class ArtificialSAMQueryIteratorTest
|
||||
* <p/>
|
||||
* a test for the ArtificialSAMQueryIterator class.
|
||||
*/
|
||||
public class ArtificialSAMQueryIteratorTest extends BaseTest {
|
||||
|
||||
@Test
|
||||
public void testWholeChromosomeQuery() {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||
iter.queryContained("chr1", 1, -1);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
count++;
|
||||
}
|
||||
assertEquals(100, count);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContainedQueryStart() {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||
iter.queryContained("chr1", 1, 50);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
count++;
|
||||
}
|
||||
assertEquals(1, count);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOverlappingQueryStart() {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||
iter.queryOverlapping("chr1", 1, 50);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
count++;
|
||||
}
|
||||
assertEquals(50, count);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContainedQueryMiddle() {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||
iter.queryContained("chr1", 25, 74);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
count++;
|
||||
}
|
||||
assertEquals(1, count);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOverlappingQueryMiddle() {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||
iter.queryOverlapping("chr1", 25, 74);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
count++;
|
||||
}
|
||||
assertEquals(50, count);
|
||||
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testUnknownChromosome() {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||
iter.queryOverlapping("chr621", 25, 74);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,109 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.fail;
|
||||
import static junit.framework.Assert.assertTrue;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaronmckenna
|
||||
* Date: Jun 3, 2009
|
||||
* Time: 3:09:34 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ArtificialSamUtilsTest extends BaseTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void basicReadIteratorTest() {
|
||||
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
count++;
|
||||
}
|
||||
assertEquals(100 * 100, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tenPerChromosome() {
|
||||
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
|
||||
assertEquals(Integer.valueOf(Math.round(count / 10)), rec.getReferenceIndex());
|
||||
count++;
|
||||
}
|
||||
assertEquals(100 * 10, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void onePerChromosome() {
|
||||
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 1);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
|
||||
assertEquals(Integer.valueOf(count), rec.getReferenceIndex());
|
||||
count++;
|
||||
}
|
||||
assertEquals(100 * 1, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void basicUnmappedIteratorTest() {
|
||||
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100, 1000);
|
||||
int count = 0;
|
||||
for (int x = 0; x < (100* 100); x++ ) {
|
||||
if (!iter.hasNext()) {
|
||||
fail ("we didn't get the expected number of reads");
|
||||
}
|
||||
SAMRecord rec = iter.next();
|
||||
assertTrue(rec.getReferenceIndex() >= 0);
|
||||
count++;
|
||||
}
|
||||
assertEquals(count, 100 * 100);
|
||||
|
||||
// now we should have 1000 unmapped reads
|
||||
count = 0;
|
||||
while (iter.hasNext()) {
|
||||
SAMRecord rec = iter.next();
|
||||
assertTrue(rec.getReferenceIndex() < 0);
|
||||
count++;
|
||||
}
|
||||
assertEquals(1000, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPeeking() {
|
||||
PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken();
|
||||
|
||||
// peek the record
|
||||
SAMRecord rec = iter.peek();
|
||||
assertTrue(rec.getReferenceIndex() >= 0);
|
||||
|
||||
// next the record
|
||||
SAMRecord rec2 = iter.next();
|
||||
assertTrue(rec2.getReadName() == rec.getReadName());
|
||||
assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart());
|
||||
|
||||
// find out how many reads we've taken now
|
||||
int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken();
|
||||
|
||||
count++;
|
||||
if (count < 100*100) assertEquals(readCnt + 1, readCnt2);
|
||||
else assertEquals(readCnt, readCnt2);
|
||||
}
|
||||
assertEquals(100 * 100, count );
|
||||
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue