We're no longer in the read-dropping business.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@901 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4d880477d6
commit
109bef6c08
|
|
@ -56,16 +56,17 @@ public class IntervalShardStrategy implements ShardStrategy {
|
||||||
*
|
*
|
||||||
* @param size the next recommended shard size.
|
* @param size the next recommended shard size.
|
||||||
*/
|
*/
|
||||||
public void adjustNextShardSize(long size) {
|
public void adjustNextShardSize( long size ) {
|
||||||
this.size = size;
|
this.size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
||||||
|
*
|
||||||
* @param size
|
* @param size
|
||||||
* @param locations
|
* @param locations
|
||||||
*/
|
*/
|
||||||
IntervalShardStrategy(long size, GenomeLocSortedSet locations) {
|
IntervalShardStrategy( long size, GenomeLocSortedSet locations ) {
|
||||||
if (locations == null || locations.isEmpty()) {
|
if (locations == null || locations.isEmpty()) {
|
||||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
|
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
|
||||||
}
|
}
|
||||||
|
|
@ -78,7 +79,7 @@ public class IntervalShardStrategy implements ShardStrategy {
|
||||||
*
|
*
|
||||||
* @param dict the sequence dictionary to use
|
* @param dict the sequence dictionary to use
|
||||||
*/
|
*/
|
||||||
IntervalShardStrategy(SAMSequenceDictionary dict, GenomeLocSortedSet locations) {
|
IntervalShardStrategy( SAMSequenceDictionary dict, GenomeLocSortedSet locations ) {
|
||||||
if (locations == null || locations.isEmpty()) {
|
if (locations == null || locations.isEmpty()) {
|
||||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
|
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
|
||||||
}
|
}
|
||||||
|
|
@ -88,44 +89,39 @@ public class IntervalShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* returns true if there are additional shards
|
* returns true if there are additional shards
|
||||||
|
*
|
||||||
* @return false if we're done processing shards
|
* @return false if we're done processing shards
|
||||||
*/
|
*/
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
return (!regions.isEmpty());
|
return ( !regions.isEmpty() );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* gets the next Shard
|
* gets the next Shard
|
||||||
|
*
|
||||||
* @return the next shard
|
* @return the next shard
|
||||||
*/
|
*/
|
||||||
public Shard next() {
|
public Shard next() {
|
||||||
if ((this.regions == null) || (regions.isEmpty())) {
|
if (( this.regions == null ) || ( regions.isEmpty() )) {
|
||||||
throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
|
throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the first region in the list
|
// get the first region in the list
|
||||||
GenomeLoc loc = regions.iterator().next();
|
GenomeLoc loc = regions.iterator().next();
|
||||||
|
|
||||||
if (loc.getStop() - loc.getStart() <= this.size) {
|
|
||||||
regions.removeRegion(loc);
|
regions.removeRegion(loc);
|
||||||
return new IntervalShard(loc);
|
return new IntervalShard(loc);
|
||||||
} else {
|
|
||||||
GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1);
|
|
||||||
regions.removeRegion(subLoc);
|
|
||||||
return new IntervalShard(subLoc);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** we don't support the remove command */
|
||||||
* we don't support the remove command
|
|
||||||
*/
|
|
||||||
public void remove() {
|
public void remove() {
|
||||||
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* makes the ReadIntervalShard iterable, i.e. usable in a for loop.
|
* makes the ReadIntervalShard iterable, i.e. usable in a for loop.
|
||||||
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Iterator<Shard> iterator() {
|
public Iterator<Shard> iterator() {
|
||||||
|
|
|
||||||
|
|
@ -5,13 +5,11 @@ import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMFileReader;
|
import net.sf.samtools.SAMFileReader;
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
|
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||||
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
@ -35,43 +33,53 @@ import java.util.List;
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
*/
|
*/
|
||||||
public class SAMDataSource implements SimpleDataSource {
|
public class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
|
|
||||||
/** Backing support for reads. */
|
/** Backing support for reads. */
|
||||||
private Reads reads = null;
|
private Reads reads = null;
|
||||||
|
|
||||||
/** our SAM data files */
|
|
||||||
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
|
||||||
|
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||||
|
|
||||||
// How strict should we be with SAM/BAM parsing?
|
|
||||||
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
|
|
||||||
|
|
||||||
// our list of readers
|
|
||||||
private final List<File> samFileList = new ArrayList<File>();
|
|
||||||
|
|
||||||
/** SAM header file. */
|
|
||||||
private final SAMFileHeader header;
|
|
||||||
|
|
||||||
// used for the reads case, the last count of reads retrieved
|
// used for the reads case, the last count of reads retrieved
|
||||||
private long readsTaken = 0;
|
long readsTaken = 0;
|
||||||
|
|
||||||
// our last genome loc position
|
// our last genome loc position
|
||||||
private GenomeLoc lastReadPos = null;
|
GenomeLoc lastReadPos = null;
|
||||||
|
|
||||||
// do we take unmapped reads
|
// do we take unmapped reads
|
||||||
private boolean includeUnmappedReads = true;
|
private boolean includeUnmappedReads = true;
|
||||||
|
|
||||||
// reads based traversal variables
|
// reads based traversal variables
|
||||||
private boolean intoUnmappedReads = false;
|
private boolean intoUnmappedReads = false;
|
||||||
private int readsAtLastPos = 0;
|
private int readsSeenAtLastPos = 0;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* package protected getter and setter for the iterator generator
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
IteratorGenerator getIterGen() {
|
||||||
|
return iterGen;
|
||||||
|
}
|
||||||
|
|
||||||
|
void setIterGen( IteratorGenerator iterGen ) {
|
||||||
|
this.iterGen = iterGen;
|
||||||
|
}
|
||||||
|
|
||||||
|
// where we get out iterators from
|
||||||
|
private IteratorGenerator iterGen;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* constructor, given sam files
|
* constructor, given sam files
|
||||||
*
|
*
|
||||||
* @param reads the list of sam files
|
* @param reads the list of sam files
|
||||||
|
* @param byReads are we a by reads traversal, or a loci traversal. We could delete this field
|
||||||
|
* if we passed in iterGen, which would be a better (although more complicated for the
|
||||||
|
* consumers of SAMDataSources).
|
||||||
*/
|
*/
|
||||||
public SAMDataSource(Reads reads) throws SimpleDataSourceLoadException {
|
public SAMDataSource( Reads reads, boolean byReads ) throws SimpleDataSourceLoadException {
|
||||||
this.reads = reads;
|
this.reads = reads;
|
||||||
|
|
||||||
// check the length
|
// check the length
|
||||||
|
|
@ -82,32 +90,11 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
if (!smFile.canRead()) {
|
if (!smFile.canRead()) {
|
||||||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
|
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
|
||||||
}
|
}
|
||||||
samFileList.add(smFile);
|
}
|
||||||
|
iterGen = new MSR2IteratorGenerator(reads, byReads);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
header = createHeaderMerger().getMergedHeader();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Load a SAM/BAM, given an input file.
|
|
||||||
*
|
|
||||||
* @param samFile the file name
|
|
||||||
* @return a SAMFileReader for the file, null if we're attempting to read a list
|
|
||||||
*/
|
|
||||||
private SAMFileReader initializeSAMFile(final File samFile) {
|
|
||||||
if (samFile.toString().endsWith(".list")) {
|
|
||||||
return null;
|
|
||||||
} else {
|
|
||||||
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
|
||||||
samReader.setValidationStringency(strictness);
|
|
||||||
|
|
||||||
final SAMFileHeader header = samReader.getFileHeader();
|
|
||||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
|
||||||
|
|
||||||
return samReader;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
|
|
@ -115,18 +102,12 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param location the genome location to extract data for
|
* @param location the genome location to extract data for
|
||||||
|
*
|
||||||
* @return an iterator for that region
|
* @return an iterator for that region
|
||||||
*/
|
*/
|
||||||
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
|
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
|
||||||
|
|
||||||
// right now this is very heavy, it copies the file list into a reader list every time
|
|
||||||
SamFileHeaderMerger headerMerger = createHeaderMerger();
|
|
||||||
|
|
||||||
// make a merging iterator for this record
|
// make a merging iterator for this record
|
||||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
CloseableIterator<SAMRecord> iter = iterGen.seek(location);
|
||||||
|
|
||||||
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
|
|
||||||
|
|
||||||
// return the iterator
|
// return the iterator
|
||||||
return StingSAMIteratorAdapter.adapt(reads, iter);
|
return StingSAMIteratorAdapter.adapt(reads, iter);
|
||||||
}
|
}
|
||||||
|
|
@ -137,9 +118,10 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param shard the shard to get data for
|
* @param shard the shard to get data for
|
||||||
|
*
|
||||||
* @return an iterator for that region
|
* @return an iterator for that region
|
||||||
*/
|
*/
|
||||||
public StingSAMIterator seek(Shard shard) throws SimpleDataSourceLoadException {
|
public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException {
|
||||||
StingSAMIterator iterator = null;
|
StingSAMIterator iterator = null;
|
||||||
if (shard.getShardType() == Shard.ShardType.READ) {
|
if (shard.getShardType() == Shard.ShardType.READ) {
|
||||||
iterator = seekRead((ReadShard) shard);
|
iterator = seekRead((ReadShard) shard);
|
||||||
|
|
@ -149,8 +131,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
reads.getMaxOnTheFlySorts(),
|
reads.getMaxOnTheFlySorts(),
|
||||||
reads.getFilterZeroMappingQualityReads(),
|
reads.getFilterZeroMappingQualityReads(),
|
||||||
reads.getSafetyChecking());
|
reads.getSafetyChecking());
|
||||||
} else if (shard.getShardType() == Shard.ShardType.LOCUS ||
|
} else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) {
|
||||||
shard.getShardType() == Shard.ShardType.INTERVAL) {
|
|
||||||
iterator = seekLocus(shard.getGenomeLoc());
|
iterator = seekLocus(shard.getGenomeLoc());
|
||||||
iterator = TraversalEngine.applyDecoratingIterators(false,
|
iterator = TraversalEngine.applyDecoratingIterators(false,
|
||||||
iterator,
|
iterator,
|
||||||
|
|
@ -158,8 +139,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
reads.getMaxOnTheFlySorts(),
|
reads.getMaxOnTheFlySorts(),
|
||||||
reads.getFilterZeroMappingQualityReads(),
|
reads.getFilterZeroMappingQualityReads(),
|
||||||
reads.getSafetyChecking());
|
reads.getSafetyChecking());
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
throw new StingException("seek: Unknown shard type");
|
throw new StingException("seek: Unknown shard type");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -173,17 +153,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* @return SAM file header.
|
* @return SAM file header.
|
||||||
*/
|
*/
|
||||||
public SAMFileHeader getHeader() {
|
public SAMFileHeader getHeader() {
|
||||||
return header;
|
return this.iterGen.getHeader();
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* create the merging header.
|
|
||||||
*
|
|
||||||
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
|
|
||||||
*/
|
|
||||||
private SamFileHeaderMerger createHeaderMerger() {
|
|
||||||
List<SAMFileReader> lst = GetReaderList();
|
|
||||||
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -193,26 +163,33 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param shard the read shard to extract from
|
* @param shard the read shard to extract from
|
||||||
|
*
|
||||||
* @return an iterator for that region
|
* @return an iterator for that region
|
||||||
*/
|
*/
|
||||||
private BoundedReadIterator seekRead(ReadShard shard) throws SimpleDataSourceLoadException {
|
private BoundedReadIterator seekRead( ReadShard shard ) throws SimpleDataSourceLoadException {
|
||||||
|
|
||||||
BoundedReadIterator bound = null;
|
BoundedReadIterator bound = null;
|
||||||
SamFileHeaderMerger headerMerger = createHeaderMerger();
|
CloseableIterator<SAMRecord> iter = null;
|
||||||
MergingSamRecordIterator2 iter = null;
|
|
||||||
|
|
||||||
if (!intoUnmappedReads) {
|
if (!intoUnmappedReads) {
|
||||||
iter = new MergingSamRecordIterator2(headerMerger);
|
if (lastReadPos == null) {
|
||||||
bound = fastMappedReadSeek(shard.getSize(), iter);
|
lastReadPos = new GenomeLoc(iterGen.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
|
||||||
|
iter = iterGen.seek(lastReadPos);
|
||||||
|
return InitialReadIterator(shard.getSize(), iter);
|
||||||
|
} else {
|
||||||
|
lastReadPos.setStop(-1);
|
||||||
|
iter = iterGen.seek(lastReadPos);
|
||||||
|
bound = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iter));
|
||||||
}
|
}
|
||||||
if ((bound == null || intoUnmappedReads) && includeUnmappedReads) {
|
}
|
||||||
|
|
||||||
|
if (( bound == null || intoUnmappedReads ) && includeUnmappedReads) {
|
||||||
if (iter != null) {
|
if (iter != null) {
|
||||||
iter.close();
|
iter.close();
|
||||||
}
|
}
|
||||||
iter = new MergingSamRecordIterator2(createHeaderMerger());
|
iter = iterGen.seek(null);
|
||||||
bound = toUnmappedReads(shard.getSize(), iter);
|
bound = toUnmappedReads(shard.getSize(), (PeekingStingIterator) iter);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bound == null) {
|
if (bound == null) {
|
||||||
shard.signalDone();
|
shard.signalDone();
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0);
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0);
|
||||||
|
|
@ -228,7 +205,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
*
|
*
|
||||||
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
||||||
*/
|
*/
|
||||||
public void viewUnmappedReads(boolean seeUnMappedReads) {
|
public void viewUnmappedReads( boolean seeUnMappedReads ) {
|
||||||
includeUnmappedReads = seeUnMappedReads;
|
includeUnmappedReads = seeUnMappedReads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -238,19 +215,21 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
*
|
*
|
||||||
* @param readCount how many reads to retrieve
|
* @param readCount how many reads to retrieve
|
||||||
* @param iter the iterator to use
|
* @param iter the iterator to use
|
||||||
|
*
|
||||||
* @return the bounded iterator that you can use to get the intervaled reads from
|
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||||
* @throws SimpleDataSourceLoadException
|
* @throws SimpleDataSourceLoadException
|
||||||
*/
|
*/
|
||||||
private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
<U extends PeekingStingIterator> BoundedReadIterator toUnmappedReads( long readCount, U iter ) throws SimpleDataSourceLoadException {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
int cnt = 0;
|
||||||
SAMRecord d = null;
|
SAMRecord d = null;
|
||||||
while (iter.hasNext()) {
|
while (iter.hasNext()) {
|
||||||
d = iter.peek();
|
d = iter.peek();
|
||||||
int x = d.getReferenceIndex();
|
int x = d.getReferenceIndex();
|
||||||
if (x < 0 || x >= d.getHeader().getSequenceDictionary().getSequences().size()) {
|
if (x < 0)
|
||||||
// we have the magic read that starts the unmapped read segment!
|
// we have the magic read that starts the unmapped read segment!
|
||||||
break;
|
break;
|
||||||
}
|
cnt++;
|
||||||
iter.next();
|
iter.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -262,6 +241,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
// now walk until we've taken the unmapped read count
|
// now walk until we've taken the unmapped read count
|
||||||
while (iter.hasNext() && count < this.readsTaken) {
|
while (iter.hasNext() && count < this.readsTaken) {
|
||||||
iter.next();
|
iter.next();
|
||||||
|
count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check to see what happened, did we run out of reads?
|
// check to see what happened, did we run out of reads?
|
||||||
|
|
@ -280,28 +260,17 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* A seek function for unmapped reads.
|
* A seek function for unmapped reads.
|
||||||
*
|
*
|
||||||
* @param readCount how many reads to retrieve
|
* @param readCount how many reads to retrieve
|
||||||
* @param iter the iterator to use
|
* @param iter the iterator to use, seeked to the correct start location
|
||||||
|
*
|
||||||
* @return the bounded iterator that you can use to get the intervaled reads from
|
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||||
* @throws SimpleDataSourceLoadException
|
* @throws SimpleDataSourceLoadException
|
||||||
*/
|
*/
|
||||||
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
BoundedReadIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
|
||||||
if (lastReadPos == null) {
|
|
||||||
return InitialReadIterator(readCount, iter);
|
|
||||||
} else {
|
|
||||||
BoundedReadIterator bound;
|
BoundedReadIterator bound;
|
||||||
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
|
correctForReadPileupSeek(iter);
|
||||||
|
if (readsTaken == 0) {
|
||||||
// move the number of reads we read from the last pos
|
return InitialReadIterator(readCount, iter);
|
||||||
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
|
|
||||||
while (iter.hasNext() && this.readsAtLastPos > 0) {
|
|
||||||
iter.next();
|
|
||||||
--readsAtLastPos;
|
|
||||||
atLeastOneReadSeen = true;
|
|
||||||
}
|
}
|
||||||
if (readsAtLastPos != 0 && atLeastOneReadSeen) {
|
|
||||||
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
|
||||||
}
|
|
||||||
|
|
||||||
int x = 0;
|
int x = 0;
|
||||||
SAMRecord rec = null;
|
SAMRecord rec = null;
|
||||||
int lastPos = 0;
|
int lastPos = 0;
|
||||||
|
|
@ -309,11 +278,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
while (x < readsTaken) {
|
while (x < readsTaken) {
|
||||||
if (iter.hasNext()) {
|
if (iter.hasNext()) {
|
||||||
rec = iter.next();
|
rec = iter.next();
|
||||||
if (lastPos == rec.getAlignmentStart()) {
|
if (lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos;
|
||||||
++this.readsAtLastPos;
|
else this.readsSeenAtLastPos = 1;
|
||||||
} else {
|
|
||||||
this.readsAtLastPos = 1;
|
|
||||||
}
|
|
||||||
lastPos = rec.getAlignmentStart();
|
lastPos = rec.getAlignmentStart();
|
||||||
++x;
|
++x;
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -323,20 +289,16 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
readsTaken = 0;
|
readsTaken = 0;
|
||||||
intoUnmappedReads = true;
|
intoUnmappedReads = true;
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
iter.close();
|
readsTaken = readCount;
|
||||||
// now merge the headers
|
readsSeenAtLastPos = 0;
|
||||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
lastReadPos.setStop(-1);
|
||||||
SamFileHeaderMerger mg = createHeaderMerger();
|
CloseableIterator<SAMRecord> ret = iterGen.seek(lastReadPos);
|
||||||
iter = new MergingSamRecordIterator2(mg);
|
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
|
||||||
iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE);
|
|
||||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// if we're off the end of the last contig (into unmapped territory)
|
// if we're off the end of the last contig (into unmapped territory)
|
||||||
if (rec != null && rec.getAlignmentStart() == 0) {
|
if (rec != null && rec.getAlignmentStart() == 0) {
|
||||||
readsTaken += readCount;
|
readsTaken += readCount;
|
||||||
|
|
@ -348,7 +310,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
if (stopPos < lastReadPos.getStart()) {
|
if (stopPos < lastReadPos.getStart()) {
|
||||||
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
|
||||||
} else {
|
} else {
|
||||||
lastReadPos.setStop(rec.getAlignmentStart());
|
lastReadPos.setStart(rec.getAlignmentStart());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// in case we're run out of reads, get out
|
// in case we're run out of reads, get out
|
||||||
|
|
@ -358,30 +320,104 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
}
|
}
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
|
|
||||||
|
|
||||||
// return the iterator
|
// return the iterator
|
||||||
return bound;
|
return bound;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Even though the iterator has seeked to the correct location, there may be multiple reads at that location,
|
||||||
|
* and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable
|
||||||
|
*
|
||||||
|
* @param iter the iterator
|
||||||
|
*/
|
||||||
|
private void correctForReadPileupSeek( StingSAMIterator iter ) {
|
||||||
|
// move the number of reads we read from the last pos
|
||||||
|
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
|
||||||
|
while (iter.hasNext() && this.readsSeenAtLastPos > 0) {
|
||||||
|
iter.next();
|
||||||
|
--readsSeenAtLastPos;
|
||||||
|
atLeastOneReadSeen = true;
|
||||||
}
|
}
|
||||||
|
if (readsSeenAtLastPos != 0 && atLeastOneReadSeen) {
|
||||||
|
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the initial iterator
|
* set the initial iterator
|
||||||
*
|
*
|
||||||
* @param readCount the number of reads
|
* @param readCount the number of reads
|
||||||
* @param iter the merging iterator
|
* @param iter the merging iterator
|
||||||
|
*
|
||||||
* @return a bounded read iterator at the first read position in the file.
|
* @return a bounded read iterator at the first read position in the file.
|
||||||
*/
|
*/
|
||||||
private BoundedReadIterator InitialReadIterator(long readCount, MergingSamRecordIterator2 iter) {
|
private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator<SAMRecord> iter ) {
|
||||||
BoundedReadIterator bound;
|
BoundedReadIterator bound;
|
||||||
lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
|
|
||||||
iter.queryContained(lastReadPos.getContig(), 1, -1);
|
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
this.readsTaken = readCount;
|
this.readsTaken = readCount;
|
||||||
return bound;
|
return bound;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* iterator generator
|
||||||
|
* <p/>
|
||||||
|
* This class generates iterators for the SAMDataSource. This class was introduced for testing purposes,
|
||||||
|
* since it became increasingly hard to test the SAM data source code. The class defines two abstraact
|
||||||
|
* methods:
|
||||||
|
* <p/>
|
||||||
|
* -seek( GenomeLoc ) which returns an iterator seeked to the genome loc, and if null is passed to the default
|
||||||
|
* location (which is implementation specific).
|
||||||
|
* <p/>
|
||||||
|
* -getHeader(), which returns a SAMFileHeader for the specified IteratorGenerator. I hope we can phase this
|
||||||
|
* method out, since it doesn't seem necessary, and it would be much cleaner with out it.
|
||||||
|
*/
|
||||||
|
abstract class IteratorGenerator {
|
||||||
|
/** our log, which we want to capture anything from this class */
|
||||||
|
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* seek to a location
|
||||||
|
*
|
||||||
|
* @param seekTo the genome loc to seek to
|
||||||
|
*
|
||||||
|
* @return StingSAMIterator
|
||||||
|
*/
|
||||||
|
public abstract CloseableIterator<SAMRecord> seek( GenomeLoc seekTo );
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the merged header
|
||||||
|
*
|
||||||
|
* @return the merged header
|
||||||
|
*/
|
||||||
|
public abstract SAMFileHeader getHeader();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load a SAM/BAM, given an input file.
|
||||||
|
*
|
||||||
|
* @param samFile the file name
|
||||||
|
*
|
||||||
|
* @return a SAMFileReader for the file, null if we're attempting to read a list
|
||||||
|
*/
|
||||||
|
protected static SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) {
|
||||||
|
if (samFile.toString().endsWith(".list")) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
SAMFileReader samReader = new SAMFileReader(samFile, true);
|
||||||
|
samReader.setValidationStringency(strictness);
|
||||||
|
|
||||||
|
final SAMFileHeader header = samReader.getFileHeader();
|
||||||
|
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||||
|
|
||||||
|
return samReader;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A private function that, given the internal file list, generates a SamFileReader
|
* A private function that, given the internal file list, generates a SamFileReader
|
||||||
* list of validated files.
|
* list of validated files.
|
||||||
|
|
@ -389,11 +425,11 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* @return a list of SAMFileReaders that represent the stored file names
|
* @return a list of SAMFileReaders that represent the stored file names
|
||||||
* @throws SimpleDataSourceLoadException if there's a problem loading the files
|
* @throws SimpleDataSourceLoadException if there's a problem loading the files
|
||||||
*/
|
*/
|
||||||
private List<SAMFileReader> GetReaderList() throws SimpleDataSourceLoadException {
|
protected static List<SAMFileReader> GetReaderList( Reads reads, SAMFileReader.ValidationStringency strictness ) throws SimpleDataSourceLoadException {
|
||||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
||||||
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
|
||||||
for (File f : this.samFileList) {
|
for (File f : reads.getReadsFiles()) {
|
||||||
SAMFileReader reader = initializeSAMFile(f);
|
SAMFileReader reader = initializeSAMFile(f, strictness);
|
||||||
|
|
||||||
if (reader.getFileHeader().getReadGroups().size() < 1) {
|
if (reader.getFileHeader().getReadGroups().size() < 1) {
|
||||||
//logger.warn("Setting header in reader " + f.getName());
|
//logger.warn("Setting header in reader " + f.getName());
|
||||||
|
|
@ -412,4 +448,65 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
return lst;
|
return lst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the merging header.
|
||||||
|
*
|
||||||
|
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
|
||||||
|
*/
|
||||||
|
protected static SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileReader.ValidationStringency strictness, SAMFileHeader.SortOrder SORT_ORDER ) {
|
||||||
|
List<SAMFileReader> lst = GetReaderList(reads, strictness);
|
||||||
|
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MSR2IteratorGenerator
|
||||||
|
* <p/>
|
||||||
|
* generates a MerginsSAMIterator2, given a genomic location. The constructor takes the reads structure,
|
||||||
|
* and a flag indicating if we're dealing with reads or loci (to determine the correct query function).
|
||||||
|
*/
|
||||||
|
class MSR2IteratorGenerator extends IteratorGenerator {
|
||||||
|
/** our read pile */
|
||||||
|
private Reads reads;
|
||||||
|
|
||||||
|
private SamFileHeaderMerger header;
|
||||||
|
|
||||||
|
// How strict should we be with SAM/BAM parsing?
|
||||||
|
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
|
||||||
|
|
||||||
|
// are we by reads or by loci
|
||||||
|
protected boolean byReads = true;
|
||||||
|
|
||||||
|
/** our SAM data files */
|
||||||
|
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||||
|
|
||||||
|
public MSR2IteratorGenerator( Reads reads, boolean byReads ) {
|
||||||
|
this.reads = reads;
|
||||||
|
this.header = IteratorGenerator.createHeaderMerger(reads, strictness, sortOrder);
|
||||||
|
this.byReads = byReads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
|
||||||
|
SamFileHeaderMerger mg = createHeaderMerger(reads, strictness, sortOrder);
|
||||||
|
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(mg, reads);
|
||||||
|
if (seekTo != null) {
|
||||||
|
if (byReads)
|
||||||
|
iter.queryContained(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
|
||||||
|
else
|
||||||
|
iter.queryOverlapping(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
|
||||||
|
}
|
||||||
|
return iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the merged header
|
||||||
|
*
|
||||||
|
* @return the merged header
|
||||||
|
*/
|
||||||
|
public SAMFileHeader getHeader() {
|
||||||
|
return header.getMergedHeader();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -75,11 +75,13 @@ public abstract class MicroScheduler {
|
||||||
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
|
||||||
if (walker instanceof ReadWalker) {
|
if (walker instanceof ReadWalker) {
|
||||||
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
|
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
|
||||||
|
this.reads = getReadsDataSource(reads,true);
|
||||||
} else {
|
} else {
|
||||||
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
|
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
|
||||||
|
this.reads = getReadsDataSource(reads,false);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.reads = getReadsDataSource(reads);
|
|
||||||
this.reference = openReferenceSequenceFile(refFile);
|
this.reference = openReferenceSequenceFile(refFile);
|
||||||
this.rods = getReferenceOrderedDataSources(rods);
|
this.rods = getReferenceOrderedDataSources(rods);
|
||||||
}
|
}
|
||||||
|
|
@ -159,12 +161,12 @@ public abstract class MicroScheduler {
|
||||||
* Gets a data source for the given set of reads.
|
* Gets a data source for the given set of reads.
|
||||||
* @return A data source for the given set of reads.
|
* @return A data source for the given set of reads.
|
||||||
*/
|
*/
|
||||||
private SAMDataSource getReadsDataSource(Reads reads) {
|
private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
|
||||||
// By reference traversals are happy with no reads. Make sure that case is handled.
|
// By reference traversals are happy with no reads. Make sure that case is handled.
|
||||||
if (reads.getReadsFiles().size() == 0)
|
if (reads.getReadsFiles().size() == 0)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
SAMDataSource dataSource = new SAMDataSource(reads);
|
SAMDataSource dataSource = new SAMDataSource(reads, byReads);
|
||||||
|
|
||||||
// Side effect: initialize the traversal engine with reads data.
|
// Side effect: initialize the traversal engine with reads data.
|
||||||
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler
|
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler
|
||||||
|
|
|
||||||
|
|
@ -7,20 +7,29 @@ import java.util.Iterator;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
|
||||||
/**
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
*
|
*
|
||||||
* User: aaron
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* Date: Apr 14, 2009
|
* obtaining a copy of this software and associated documentation
|
||||||
* Time: 5:27:31 PM
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
*
|
*
|
||||||
* The Broad Institute
|
* The above copyright notice and this permission notice shall be
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
* included in all copies or substantial portions of the Software.
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
||||||
*
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -31,7 +31,7 @@ import java.util.PriorityQueue;
|
||||||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||||
* the requested output format is unsorted, in which case any combination is valid.
|
* the requested output format is unsorted, in which case any combination is valid.
|
||||||
*/
|
*/
|
||||||
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
|
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, PeekingStingIterator {
|
||||||
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
|
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
|
||||||
protected final SamFileHeaderMerger samHeaderMerger;
|
protected final SamFileHeaderMerger samHeaderMerger;
|
||||||
protected final SAMFileHeader.SortOrder sortOrder;
|
protected final SAMFileHeader.SortOrder sortOrder;
|
||||||
|
|
@ -43,7 +43,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
||||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||||
* provided by the header merger parameter.
|
* provided by the header merger parameter.
|
||||||
*/
|
*/
|
||||||
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
|
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) {
|
||||||
this.samHeaderMerger = headerMerger;
|
this.samHeaderMerger = headerMerger;
|
||||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||||
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
|
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
|
||||||
|
|
@ -275,6 +275,16 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
||||||
public Iterator<SAMRecord> iterator() {
|
public Iterator<SAMRecord> iterator() {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets source information for the reads. Contains information about the original reads
|
||||||
|
* files, plus information about downsampling, etc.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Reads getSourceInfo() {
|
||||||
|
return null; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should replace picard class with the same name
|
// Should replace picard class with the same name
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Interface Peekable
|
||||||
|
* <p/>
|
||||||
|
* This interface indicates that
|
||||||
|
*/
|
||||||
|
public interface PeekingStingIterator extends StingSAMIterator {
|
||||||
|
/**
|
||||||
|
* peek, given the specified type
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
SAMRecord peek();
|
||||||
|
}
|
||||||
|
|
@ -337,33 +337,6 @@ public abstract class TraversalEngine {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Iterator<SAMRecord> initializeReads() {
|
|
||||||
if( readsFiles.size() == 1 )
|
|
||||||
samReader = initializeSAMFile(readsFiles.get(0));
|
|
||||||
else
|
|
||||||
samReader = null;
|
|
||||||
return wrapReadsIterator(getReadsIterator(samReader), true);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Iterator<SAMRecord> getReadsIterator(final SAMFileReader samReader) {
|
|
||||||
// If the file has an index, querying functions are available. Use them if possible...
|
|
||||||
if ( samReader == null && readsFiles.size() > 0 ) {
|
|
||||||
SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
|
|
||||||
|
|
||||||
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
|
|
||||||
|
|
||||||
for ( File readsFile: readsFiles ) {
|
|
||||||
SAMFileReader reader = initializeSAMFile(readsFile);
|
|
||||||
readers.add(reader);
|
|
||||||
}
|
|
||||||
|
|
||||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
|
|
||||||
return new MergingSamRecordIterator2(headerMerger);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return samReader.iterator();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Deprecated
|
@Deprecated
|
||||||
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
|
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
|
||||||
|
|
|
||||||
|
|
@ -246,39 +246,7 @@ public class TraverseDuplicates extends TraversalEngine {
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// old style interface to the system
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
|
||||||
@Override
|
|
||||||
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
|
|
||||||
if ( walker instanceof DuplicateWalker) {
|
|
||||||
Walker x = walker;
|
|
||||||
DuplicateWalker<?, ?> dupWalker = (DuplicateWalker<?, ?>)x;
|
|
||||||
return (T)this.traverseByRead(dupWalker, locations);
|
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Should we deleted at the soonist possible opportunity
|
|
||||||
*/
|
|
||||||
public <M, T> Object traverseByRead(DuplicateWalker<M, T> walker, List<GenomeLoc> locations) {
|
|
||||||
samReadIter = initializeReads();
|
|
||||||
|
|
||||||
// Initialize the walker
|
|
||||||
walker.initialize();
|
|
||||||
|
|
||||||
// Initialize the sum
|
|
||||||
FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc());
|
|
||||||
T sum = actuallyTraverse(walker, filterIter, walker.reduceInit());
|
|
||||||
|
|
||||||
//printOnTraversalDone("reads", sum);
|
|
||||||
walker.onTraversalDone(sum);
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,165 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** this fake iterator allows us to look at how specific piles of reads are handled */
|
||||||
|
public class ArtificialSAMIterator implements PeekingStingIterator {
|
||||||
|
|
||||||
|
|
||||||
|
protected int currentChromo = 0;
|
||||||
|
protected int currentRead = 0;
|
||||||
|
protected int totalReadCount = 0;
|
||||||
|
protected boolean done = false;
|
||||||
|
// the next record
|
||||||
|
protected SAMRecord next = null;
|
||||||
|
protected SAMFileHeader header = null;
|
||||||
|
|
||||||
|
// the passed in parameters
|
||||||
|
protected final int sChr;
|
||||||
|
protected final int eChromosomeCount;
|
||||||
|
protected final int rCount;
|
||||||
|
protected int uMappedReadCount;
|
||||||
|
|
||||||
|
// let us know to make a read, we need this to help out the fake sam query iterator
|
||||||
|
private boolean initialized = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||||
|
*
|
||||||
|
* @param startingChr the starting chromosome
|
||||||
|
* @param endingChr the ending chromosome
|
||||||
|
* @param readCount the number of reads in each chromosome
|
||||||
|
* @param header the associated header
|
||||||
|
*/
|
||||||
|
ArtificialSAMIterator( int startingChr, int endingChr, int readCount, SAMFileHeader header ) {
|
||||||
|
sChr = startingChr;
|
||||||
|
eChromosomeCount = (endingChr - startingChr) + 1;
|
||||||
|
rCount = readCount;
|
||||||
|
this.header = header;
|
||||||
|
this.currentChromo = 0;
|
||||||
|
uMappedReadCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||||
|
*
|
||||||
|
* @param startingChr the starting chromosome
|
||||||
|
* @param endingChr the ending chromosome
|
||||||
|
* @param readCount the number of reads in each chromosome
|
||||||
|
* @param header the associated header
|
||||||
|
*/
|
||||||
|
ArtificialSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) {
|
||||||
|
sChr = startingChr;
|
||||||
|
eChromosomeCount = (endingChr - startingChr) + 1;
|
||||||
|
rCount = readCount;
|
||||||
|
this.header = header;
|
||||||
|
this.currentChromo = 0;
|
||||||
|
this.uMappedReadCount = unmappedReadCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Reads getSourceInfo() {
|
||||||
|
throw new UnsupportedOperationException("We don't support this");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
// done
|
||||||
|
currentChromo = Integer.MAX_VALUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() {
|
||||||
|
if (!initialized){
|
||||||
|
initialized = true;
|
||||||
|
createNextRead();
|
||||||
|
}
|
||||||
|
if (this.next != null) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean createNextRead() {
|
||||||
|
if (currentRead >= rCount) {
|
||||||
|
currentChromo++;
|
||||||
|
currentRead = 0;
|
||||||
|
}
|
||||||
|
// check for end condition, have we finished the chromosome listing, and have no unmapped reads
|
||||||
|
if (currentChromo >= eChromosomeCount) {
|
||||||
|
if (uMappedReadCount < 1) {
|
||||||
|
this.next = null;
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
++totalReadCount;
|
||||||
|
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), -1, -1, 50);
|
||||||
|
--uMappedReadCount;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++totalReadCount;
|
||||||
|
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), currentChromo, currentRead, 50);
|
||||||
|
++currentRead;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public SAMRecord next() {
|
||||||
|
SAMRecord ret = next;
|
||||||
|
createNextRead();
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Iterator<SAMRecord> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* some instrumentation methods
|
||||||
|
*/
|
||||||
|
public int readsTaken() {
|
||||||
|
return totalReadCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* peek at the next sam record
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public SAMRecord peek() {
|
||||||
|
return this.next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,181 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class ArtificialSAMQueryIterator
|
||||||
|
* <p/>
|
||||||
|
* A descriptions should go here. Blame aaron if it's missing.
|
||||||
|
*/
|
||||||
|
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
|
||||||
|
|
||||||
|
// get the next positon
|
||||||
|
protected int finalPos = 0;
|
||||||
|
protected int startPos = 0;
|
||||||
|
protected int contigIndex = -1;
|
||||||
|
protected boolean overlapping = false;
|
||||||
|
protected int startingChr = 0;
|
||||||
|
protected boolean seeked = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the fake iterator, given the mapping of chromosomes and read counts
|
||||||
|
*
|
||||||
|
* @param startingChr the starting chromosome
|
||||||
|
* @param endingChr the ending chromosome
|
||||||
|
* @param readCount the number of reads in each chromosome
|
||||||
|
* @param header the associated header
|
||||||
|
*/
|
||||||
|
ArtificialSAMQueryIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) {
|
||||||
|
super(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||||
|
this.startingChr = startingChr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* query containing - get reads contained by the specified interval
|
||||||
|
*
|
||||||
|
* @param contig the contig index string
|
||||||
|
* @param start the start position
|
||||||
|
* @param stop the stop position
|
||||||
|
*/
|
||||||
|
public void queryContained( String contig, int start, int stop ) {
|
||||||
|
this.overlapping = false;
|
||||||
|
initialize(contig, start, stop);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* query containing - get reads contained by the specified interval
|
||||||
|
*
|
||||||
|
* @param contig the contig index string
|
||||||
|
* @param start the start position
|
||||||
|
* @param stop the stop position
|
||||||
|
*/
|
||||||
|
public void queryOverlapping( String contig, int start, int stop ) {
|
||||||
|
this.overlapping = true;
|
||||||
|
initialize(contig, start, stop);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* initialize the query iterator
|
||||||
|
*
|
||||||
|
* @param contig the contig
|
||||||
|
* @param start the start position
|
||||||
|
* @param stop the stop postition
|
||||||
|
*/
|
||||||
|
private void initialize( String contig, int start, int stop ) {
|
||||||
|
ensureUntouched();
|
||||||
|
finalPos = stop;
|
||||||
|
startPos = start;
|
||||||
|
if (finalPos < 0) {
|
||||||
|
finalPos = Integer.MAX_VALUE;
|
||||||
|
}
|
||||||
|
// sanity check that we have the contig
|
||||||
|
contigIndex = -1;
|
||||||
|
List<SAMSequenceRecord> list = header.getSequenceDictionary().getSequences();
|
||||||
|
for (SAMSequenceRecord rec : list) {
|
||||||
|
if (rec.getSequenceName().equals(contig)) {
|
||||||
|
contigIndex = rec.getSequenceIndex();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); }
|
||||||
|
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
|
||||||
|
super.next();
|
||||||
|
}
|
||||||
|
if (!super.hasNext()) {
|
||||||
|
throw new StingException("Unable to find the target chromosome");
|
||||||
|
}
|
||||||
|
while (super.hasNext() && this.peek().getAlignmentStart() < start) {
|
||||||
|
super.next();
|
||||||
|
}
|
||||||
|
// sanity check that we have an actual matching read next
|
||||||
|
SAMRecord rec = this.peek();
|
||||||
|
if (!matches(rec)) {
|
||||||
|
throw new StingException("The next read doesn't match");
|
||||||
|
}
|
||||||
|
// set the seeked variable to true
|
||||||
|
seeked = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* given a read and the query type, check if it matches our regions
|
||||||
|
*
|
||||||
|
* @param rec the read
|
||||||
|
*
|
||||||
|
* @return true if it belongs in our region
|
||||||
|
*/
|
||||||
|
public boolean matches( SAMRecord rec ) {
|
||||||
|
if (rec.getReferenceIndex() != this.contigIndex) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!overlapping) {
|
||||||
|
// if the start or the end are somewhere within our range
|
||||||
|
if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (( rec.getAlignmentStart() <= finalPos && rec.getAlignmentStart() >= startPos ) ||
|
||||||
|
( rec.getAlignmentEnd() <= finalPos && rec.getAlignmentEnd() >= startPos )) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* override the hasNext, to incorportate our limiting factor
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
boolean res = super.hasNext();
|
||||||
|
if (!seeked) {
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
if (res && matches(this.next)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** make sure we haven't been used as an iterator yet; this is to miror the MergingSamIterator2 action. */
|
||||||
|
public void ensureUntouched() {
|
||||||
|
if (this.currentChromo != 0 || this.currentRead > 0) {
|
||||||
|
throw new UnsupportedOperationException("We've already been used as an iterator; you can't query after that");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -5,7 +5,7 @@ import net.sf.samtools.*;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -28,11 +28,6 @@ import org.broadinstitute.sting.gatk.Reads;
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* @version 1.0
|
||||||
* @date May 21, 2009
|
|
||||||
* <p/>
|
|
||||||
* Class samUtils
|
|
||||||
* <p/>
|
|
||||||
* A collection of utilities for making sam and bam files. Mostly these are for creating sam and bam files for testing purposes.
|
|
||||||
*/
|
*/
|
||||||
public class ArtificialSamUtils {
|
public class ArtificialSamUtils {
|
||||||
public static final int DEFAULT_READ_LENGTH = 50;
|
public static final int DEFAULT_READ_LENGTH = 50;
|
||||||
|
|
@ -46,7 +41,7 @@ public class ArtificialSamUtils {
|
||||||
* @param chromosomeSize how large each chromosome is
|
* @param chromosomeSize how large each chromosome is
|
||||||
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
|
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
|
||||||
*/
|
*/
|
||||||
public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) {
|
public static void createArtificialBamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) {
|
||||||
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
||||||
File outFile = new File(filename);
|
File outFile = new File(filename);
|
||||||
|
|
||||||
|
|
@ -70,7 +65,7 @@ public class ArtificialSamUtils {
|
||||||
* @param chromosomeSize how large each chromosome is
|
* @param chromosomeSize how large each chromosome is
|
||||||
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
|
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
|
||||||
*/
|
*/
|
||||||
public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) {
|
public static void createArtificialSamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) {
|
||||||
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
||||||
File outFile = new File(filename);
|
File outFile = new File(filename);
|
||||||
|
|
||||||
|
|
@ -91,14 +86,15 @@ public class ArtificialSamUtils {
|
||||||
* @param numberOfChromosomes the number of chromosomes to create
|
* @param numberOfChromosomes the number of chromosomes to create
|
||||||
* @param startingChromosome the starting number for the chromosome (most likely set to 1)
|
* @param startingChromosome the starting number for the chromosome (most likely set to 1)
|
||||||
* @param chromosomeSize the length of each chromosome
|
* @param chromosomeSize the length of each chromosome
|
||||||
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) {
|
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
|
||||||
SAMFileHeader header = new SAMFileHeader();
|
SAMFileHeader header = new SAMFileHeader();
|
||||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||||
// make up some sequence records
|
// make up some sequence records
|
||||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||||
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */);
|
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + ( x ), chromosomeSize /* size */);
|
||||||
rec.setSequenceLength(chromosomeSize);
|
rec.setSequenceLength(chromosomeSize);
|
||||||
dict.addSequence(rec);
|
dict.addSequence(rec);
|
||||||
}
|
}
|
||||||
|
|
@ -114,9 +110,10 @@ public class ArtificialSamUtils {
|
||||||
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
||||||
* @param alignmentStart where to start the alignment
|
* @param alignmentStart where to start the alignment
|
||||||
* @param length the length of the read
|
* @param length the length of the read
|
||||||
|
*
|
||||||
* @return the artificial read
|
* @return the artificial read
|
||||||
*/
|
*/
|
||||||
public static SAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
|
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||||
SAMRecord record = new SAMRecord(header);
|
SAMRecord record = new SAMRecord(header);
|
||||||
record.setReadName(name);
|
record.setReadName(name);
|
||||||
record.setReferenceIndex(refIndex);
|
record.setReferenceIndex(refIndex);
|
||||||
|
|
@ -131,95 +128,64 @@ public class ArtificialSamUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create an iterator containing the specified read piles
|
* create an iterator containing the specified read piles
|
||||||
|
*
|
||||||
* @param startingChr the chromosome (reference ID) to start from
|
* @param startingChr the chromosome (reference ID) to start from
|
||||||
* @param endingChr the id to end with
|
* @param endingChr the id to end with
|
||||||
* @param readCount the number of reads per chromosome
|
* @param readCount the number of reads per chromosome
|
||||||
|
*
|
||||||
* @return StingSAMIterator representing the specified amount of fake data
|
* @return StingSAMIterator representing the specified amount of fake data
|
||||||
*/
|
*/
|
||||||
public static StingSAMIterator unmappedReadIterator(int startingChr, int endingChr, int readCount ) {
|
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||||
SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||||
Map<Integer, Integer> map = new HashMap<Integer, Integer>();
|
|
||||||
for (int x = startingChr; x < endingChr; x++) {
|
return new ArtificialSAMIterator(startingChr, endingChr, readCount, header);
|
||||||
map.put(x,readCount);
|
|
||||||
}
|
}
|
||||||
return new ArtificialSAMIterator(startingChr, endingChr, readCount,header);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* this fake iterator allows us to look at how specific piles of reads are handled
|
|
||||||
*/
|
|
||||||
class ArtificialSAMIterator implements StingSAMIterator {
|
|
||||||
|
|
||||||
|
|
||||||
private int currentChromo = 0;
|
|
||||||
private int currentRead = 0;
|
|
||||||
private int readCount = 0;
|
|
||||||
private boolean done = false;
|
|
||||||
// the next record
|
|
||||||
private SAMRecord next = null;
|
|
||||||
SAMFileHeader header = null;
|
|
||||||
|
|
||||||
// the passed in parameters
|
|
||||||
final int sChr;
|
|
||||||
final int eChr;
|
|
||||||
final int rCount;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create the fake iterator, given the mapping of chromosomes and read counts
|
* create an iterator containing the specified read piles
|
||||||
* @param startingChr the starting chromosome
|
*
|
||||||
* @param endingChr the ending chromosome
|
* @param startingChr the chromosome (reference ID) to start from
|
||||||
* @param readCount the number of reads in each chromosome
|
* @param endingChr the id to end with
|
||||||
* @param header the associated header
|
* @param readCount the number of reads per chromosome
|
||||||
|
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
|
||||||
|
*
|
||||||
|
* @return StingSAMIterator representing the specified amount of fake data
|
||||||
*/
|
*/
|
||||||
ArtificialSAMIterator(int startingChr, int endingChr, int readCount, SAMFileHeader header) {
|
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
|
||||||
sChr = startingChr;
|
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||||
eChr = endingChr;
|
|
||||||
rCount = readCount;
|
return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||||
this.header = header;
|
|
||||||
this.currentChromo = startingChr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Reads getSourceInfo() {
|
/**
|
||||||
throw new UnsupportedOperationException("We don't support this");
|
* create an ArtificialSAMQueryIterator containing the specified read piles
|
||||||
|
*
|
||||||
|
* @param startingChr the chromosome (reference ID) to start from
|
||||||
|
* @param endingChr the id to end with
|
||||||
|
* @param readCount the number of reads per chromosome
|
||||||
|
*
|
||||||
|
* @return StingSAMIterator representing the specified amount of fake data
|
||||||
|
*/
|
||||||
|
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||||
|
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||||
|
|
||||||
|
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() {
|
/**
|
||||||
// done
|
* create an ArtificialSAMQueryIterator containing the specified read piles
|
||||||
currentChromo = Integer.MAX_VALUE;
|
*
|
||||||
}
|
* @param startingChr the chromosome (reference ID) to start from
|
||||||
|
* @param endingChr the id to end with
|
||||||
|
* @param readCount the number of reads per chromosome
|
||||||
|
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
|
||||||
|
*
|
||||||
|
* @return StingSAMIterator representing the specified amount of fake data
|
||||||
|
*/
|
||||||
|
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
|
||||||
|
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||||
|
|
||||||
public boolean hasNext() {
|
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||||
if (currentRead >= rCount) {
|
|
||||||
currentChromo++;
|
|
||||||
currentRead = 0;
|
|
||||||
}
|
|
||||||
if (currentChromo >= eChr) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
int alignment = 0;
|
|
||||||
if (currentChromo >= 0) {
|
|
||||||
alignment = currentRead;
|
|
||||||
} else {
|
|
||||||
alignment = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
this.next = ArtificialSamUtils.createArtificialRead(this.header,String.valueOf(readCount), currentChromo, alignment, 50);
|
|
||||||
currentRead++;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SAMRecord next() {
|
|
||||||
return next;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
|
|
||||||
}
|
|
||||||
|
|
||||||
public Iterator<SAMRecord> iterator() {
|
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,12 @@ import net.sf.samtools.SAMFileHeader;
|
||||||
* Class ReadIntervalShardStrategyTest
|
* Class ReadIntervalShardStrategyTest
|
||||||
* <p/>
|
* <p/>
|
||||||
* Tests the ReadIntervalShardStrategy class
|
* Tests the ReadIntervalShardStrategy class
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* TODO: this test has been changed, since we now force interval shards to not subdivide if they're large,
|
||||||
|
* so you should always get one shard back, reguardless of the specified length. If this behavior changes
|
||||||
|
* back in the future, the expected number of shards should change from 1 to > 1.
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public class IntervalShardStrategyTest extends BaseTest {
|
public class IntervalShardStrategyTest extends BaseTest {
|
||||||
|
|
||||||
|
|
@ -75,7 +81,7 @@ public class IntervalShardStrategyTest extends BaseTest {
|
||||||
counter++;
|
counter++;
|
||||||
}
|
}
|
||||||
assertTrue(d instanceof IntervalShard);
|
assertTrue(d instanceof IntervalShard);
|
||||||
assertEquals(10, counter);
|
assertEquals(1, counter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
@ -92,7 +98,7 @@ public class IntervalShardStrategyTest extends BaseTest {
|
||||||
counter++;
|
counter++;
|
||||||
}
|
}
|
||||||
assertTrue(d instanceof IntervalShard);
|
assertTrue(d instanceof IntervalShard);
|
||||||
assertEquals(50, counter);
|
assertEquals(5, counter);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
@ -105,16 +111,11 @@ public class IntervalShardStrategyTest extends BaseTest {
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
while (strat.hasNext()) {
|
while (strat.hasNext()) {
|
||||||
Shard d = strat.next();
|
Shard d = strat.next();
|
||||||
if (counter % 2 == 0) {
|
|
||||||
assertEquals(1, d.getGenomeLoc().getStart());
|
assertEquals(1, d.getGenomeLoc().getStart());
|
||||||
assertEquals(789, d.getGenomeLoc().getStop());
|
|
||||||
} else {
|
|
||||||
assertEquals(790, d.getGenomeLoc().getStart());
|
|
||||||
assertEquals(1000, d.getGenomeLoc().getStop());
|
assertEquals(1000, d.getGenomeLoc().getStop());
|
||||||
}
|
|
||||||
counter++;
|
counter++;
|
||||||
}
|
}
|
||||||
assertEquals(10, counter);
|
assertEquals(5, counter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -89,7 +89,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
Reads reads = new Reads(fl);
|
Reads reads = new Reads(fl);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
SAMDataSource data = new SAMDataSource(reads,false);
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
count++;
|
count++;
|
||||||
|
|
@ -139,7 +139,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
SAMDataSource data = new SAMDataSource(reads,false);
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
count++;
|
count++;
|
||||||
|
|
@ -176,7 +176,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
||||||
|
|
||||||
logger.debug("Pile two:");
|
logger.debug("Pile two:");
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
SAMDataSource data = new SAMDataSource(reads,false);
|
||||||
for (Shard sh : strat) {
|
for (Shard sh : strat) {
|
||||||
int readCount = 0;
|
int readCount = 0;
|
||||||
count++;
|
count++;
|
||||||
|
|
|
||||||
|
|
@ -2,15 +2,20 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
||||||
|
|
||||||
import static junit.framework.Assert.fail;
|
import static junit.framework.Assert.fail;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||||
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.*;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
|
@ -38,16 +43,14 @@ import java.util.List;
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* @version 1.0
|
||||||
* @date Apr 15, 2009
|
|
||||||
* <p/>
|
|
||||||
* Class SAMByReadsTest
|
|
||||||
* <p/>
|
|
||||||
* Test sam traversal by reads FIX ME
|
|
||||||
*/
|
*/
|
||||||
public class SAMByReadsTest extends BaseTest {
|
public class SAMByReadsTest extends BaseTest {
|
||||||
|
|
||||||
private FastaSequenceFile2 seq;
|
|
||||||
private List<File> fl;
|
private List<File> fl;
|
||||||
|
ShardStrategy shardStrategy;
|
||||||
|
Reads reads;
|
||||||
|
private int targetReadCount = 14;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function does the setup of our parser, before each method call.
|
* This function does the setup of our parser, before each method call.
|
||||||
|
|
@ -59,53 +62,89 @@ public class SAMByReadsTest extends BaseTest {
|
||||||
fl = new ArrayList<File>();
|
fl = new ArrayList<File>();
|
||||||
|
|
||||||
// sequence
|
// sequence
|
||||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
|
//seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
|
||||||
GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||||
|
|
||||||
|
// setup the test files
|
||||||
|
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
|
||||||
|
reads = new Reads(fl);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Test out that we can shard the file and iterate over every read */
|
/** Test out that we can shard the file and iterate over every read */
|
||||||
@Test
|
@Test
|
||||||
public void testTotalReadCount() {
|
public void testToUnmappedReads() {
|
||||||
logger.warn("Executing testTotalReadCount");
|
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
|
||||||
|
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
|
||||||
// setup the test files
|
|
||||||
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
|
|
||||||
Reads reads = new Reads(fl);
|
|
||||||
|
|
||||||
final int targetReadCount = 5000;
|
|
||||||
|
|
||||||
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,seq.getSequenceDictionary(),targetReadCount);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
int unmappedReadsSeen = 0;
|
||||||
|
int iterations = 0;
|
||||||
// check the total read count
|
SAMDataSource data = new SAMDataSource(reads,true);
|
||||||
final int totalReads = 10000;
|
for (int x = 0; x < 10; x++) {
|
||||||
|
++iterations;
|
||||||
int readsSeen = 0;
|
PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10, 1000);
|
||||||
BoundedReadIterator iter;
|
BoundedReadIterator ret = data.toUnmappedReads(100, iter);
|
||||||
|
// count the reads we've gotten back
|
||||||
|
if (ret == null) {
|
||||||
for (Shard sd : shardStrategy) {
|
fail("On iteration " + iterations + " we were returned a null pointer, after seeing " + unmappedReadsSeen + " reads out of a 1000");
|
||||||
int readcnt = 0;
|
}
|
||||||
|
while (ret.hasNext()) {
|
||||||
iter = (BoundedReadIterator)data.seek(sd);
|
ret.next();
|
||||||
|
unmappedReadsSeen++;
|
||||||
for (SAMRecord r : iter) {
|
}
|
||||||
|
}
|
||||||
readcnt++;
|
assertEquals(1000,unmappedReadsSeen);
|
||||||
|
}
|
||||||
|
|
||||||
|
catch (SimpleDataSourceLoadException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
readsSeen += readcnt;
|
|
||||||
//logger.warn("Seen " + readsSeen + " reads.");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
// make sure we've seen all the reads
|
|
||||||
assertEquals(totalReads,readsSeen);
|
/** Test out that we can shard the file and iterate over every read */
|
||||||
logger.warn("Success " + readsSeen + " equals target count of " + totalReads);
|
@Test
|
||||||
|
public void testShardingOfReadsSize14() {
|
||||||
|
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
|
||||||
|
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
|
||||||
|
targetReadCount = 14;
|
||||||
|
try {
|
||||||
|
int iterations = 0;
|
||||||
|
int readCount = 0;
|
||||||
|
SAMDataSource data = new SAMDataSource(reads,true);
|
||||||
|
|
||||||
|
|
||||||
|
data.setIterGen(gen);
|
||||||
|
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
||||||
|
while (shardStrategy.hasNext()) {
|
||||||
|
|
||||||
|
|
||||||
|
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
|
||||||
|
assertTrue(ret != null);
|
||||||
|
while (ret.hasNext()) {
|
||||||
|
ret.next();
|
||||||
|
readCount++;
|
||||||
|
}
|
||||||
|
ret.close();
|
||||||
|
iterations++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// assert that we saw 2000 reads
|
||||||
|
assertEquals(2000,readCount);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this next assertion is based on the following logic:
|
||||||
|
* 14 reads per shard = 8 shards per each 100 read chromosome
|
||||||
|
* 10 chromosomes = 8 * 10 = 80
|
||||||
|
* 1000 unmapped reads / 14 = 72
|
||||||
|
* 1 iteration at the end to know we're done
|
||||||
|
* 80 + 72 + 1 = 153
|
||||||
|
*/
|
||||||
|
assertEquals(153,iterations);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -116,4 +155,95 @@ public class SAMByReadsTest extends BaseTest {
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test out that we can shard the file and iterate over every read */
|
||||||
|
@Test
|
||||||
|
public void testShardingOfReadsSize25() {
|
||||||
|
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
|
||||||
|
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
|
||||||
|
targetReadCount = 25;
|
||||||
|
try {
|
||||||
|
int iterations = 0;
|
||||||
|
int readCount = 0;
|
||||||
|
SAMDataSource data = new SAMDataSource(reads,true);
|
||||||
|
|
||||||
|
|
||||||
|
data.setIterGen(gen);
|
||||||
|
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
|
||||||
|
while (shardStrategy.hasNext()) {
|
||||||
|
|
||||||
|
|
||||||
|
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
|
||||||
|
assertTrue(ret != null);
|
||||||
|
while (ret.hasNext()) {
|
||||||
|
ret.next();
|
||||||
|
readCount++;
|
||||||
|
}
|
||||||
|
ret.close();
|
||||||
|
iterations++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// assert that we saw 2000 reads
|
||||||
|
assertEquals(2000,readCount);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this next assertion is based on the following logic:
|
||||||
|
* 25 reads per shard = 5 shards (1 on the end to realize we're done)
|
||||||
|
* 10 chromosomes = 5 * 10 = 50
|
||||||
|
* 1000 unmapped reads / 25 = 40
|
||||||
|
* 1 iteration at the end to know we're done
|
||||||
|
* 50 + 40 + 1 = 91
|
||||||
|
*/
|
||||||
|
assertEquals(91,iterations);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
catch (SimpleDataSourceLoadException e) {
|
||||||
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||||
|
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* use this to inject into SAMDataSource for testing
|
||||||
|
*/
|
||||||
|
class ArtificialIteratorGenerator extends IteratorGenerator {
|
||||||
|
// How strict should we be with SAM/BAM parsing?
|
||||||
|
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
|
||||||
|
|
||||||
|
// the header
|
||||||
|
private SAMFileHeader header;
|
||||||
|
private int endingChr;
|
||||||
|
private int startingChr;
|
||||||
|
private int readCount;
|
||||||
|
private int readSize;
|
||||||
|
/** our SAM data files */
|
||||||
|
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||||
|
|
||||||
|
public ArtificialIteratorGenerator( int startingChr, int endingChr, int readCount, int readSize) {
|
||||||
|
header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + readSize);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 10, 100, 1000);
|
||||||
|
if (seekTo != null) {
|
||||||
|
iter.queryContained(seekTo.getContig(), (int)seekTo.getStart(), (int)seekTo.getStop());
|
||||||
|
}
|
||||||
|
return iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the merged header
|
||||||
|
*
|
||||||
|
* @return the merged header
|
||||||
|
*/
|
||||||
|
public SAMFileHeader getHeader() {
|
||||||
|
return this.header;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -85,7 +85,7 @@ public class BoundedReadIteratorTest extends BaseTest {
|
||||||
long shardReadCount = 0;
|
long shardReadCount = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
SAMDataSource data = new SAMDataSource(reads,true);
|
||||||
|
|
||||||
// make sure we have a shard
|
// make sure we have a shard
|
||||||
if (!strat.hasNext()) {
|
if (!strat.hasNext()) {
|
||||||
|
|
|
||||||
|
|
@ -122,7 +122,7 @@ public class TraverseReadsTest extends BaseTest {
|
||||||
ref.getSequenceDictionary(),
|
ref.getSequenceDictionary(),
|
||||||
readSize);
|
readSize);
|
||||||
|
|
||||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
|
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true);
|
||||||
dataSource.viewUnmappedReads(false);
|
dataSource.viewUnmappedReads(false);
|
||||||
|
|
||||||
countReadWalker.initialize();
|
countReadWalker.initialize();
|
||||||
|
|
@ -170,7 +170,7 @@ public class TraverseReadsTest extends BaseTest {
|
||||||
ref.getSequenceDictionary(),
|
ref.getSequenceDictionary(),
|
||||||
readSize);
|
readSize);
|
||||||
|
|
||||||
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
|
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true);
|
||||||
dataSource.viewUnmappedReads(true);
|
dataSource.viewUnmappedReads(true);
|
||||||
|
|
||||||
countReadWalker.initialize();
|
countReadWalker.initialize();
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,115 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||||
|
import org.junit.Test;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import static junit.framework.Assert.assertTrue;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class ArtificialSAMQueryIteratorTest
|
||||||
|
* <p/>
|
||||||
|
* a test for the ArtificialSAMQueryIterator class.
|
||||||
|
*/
|
||||||
|
public class ArtificialSAMQueryIteratorTest extends BaseTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWholeChromosomeQuery() {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||||
|
iter.queryContained("chr1", 1, -1);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(100, count);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testContainedQueryStart() {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||||
|
iter.queryContained("chr1", 1, 50);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(1, count);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOverlappingQueryStart() {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||||
|
iter.queryOverlapping("chr1", 1, 50);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(50, count);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testContainedQueryMiddle() {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||||
|
iter.queryContained("chr1", 25, 74);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(1, count);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOverlappingQueryMiddle() {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||||
|
iter.queryOverlapping("chr1", 25, 74);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(50, count);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = IllegalArgumentException.class)
|
||||||
|
public void testUnknownChromosome() {
|
||||||
|
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
|
||||||
|
iter.queryOverlapping("chr621", 25, 74);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,109 @@
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||||
|
import org.junit.Test;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
import static junit.framework.Assert.assertTrue;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: aaronmckenna
|
||||||
|
* Date: Jun 3, 2009
|
||||||
|
* Time: 3:09:34 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ArtificialSamUtilsTest extends BaseTest {
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void basicReadIteratorTest() {
|
||||||
|
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(100 * 100, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void tenPerChromosome() {
|
||||||
|
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
|
||||||
|
assertEquals(Integer.valueOf(Math.round(count / 10)), rec.getReferenceIndex());
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(100 * 10, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void onePerChromosome() {
|
||||||
|
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 1);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
|
||||||
|
assertEquals(Integer.valueOf(count), rec.getReferenceIndex());
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(100 * 1, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void basicUnmappedIteratorTest() {
|
||||||
|
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100, 1000);
|
||||||
|
int count = 0;
|
||||||
|
for (int x = 0; x < (100* 100); x++ ) {
|
||||||
|
if (!iter.hasNext()) {
|
||||||
|
fail ("we didn't get the expected number of reads");
|
||||||
|
}
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
assertTrue(rec.getReferenceIndex() >= 0);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(count, 100 * 100);
|
||||||
|
|
||||||
|
// now we should have 1000 unmapped reads
|
||||||
|
count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
SAMRecord rec = iter.next();
|
||||||
|
assertTrue(rec.getReferenceIndex() < 0);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertEquals(1000, count);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPeeking() {
|
||||||
|
PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100);
|
||||||
|
int count = 0;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken();
|
||||||
|
|
||||||
|
// peek the record
|
||||||
|
SAMRecord rec = iter.peek();
|
||||||
|
assertTrue(rec.getReferenceIndex() >= 0);
|
||||||
|
|
||||||
|
// next the record
|
||||||
|
SAMRecord rec2 = iter.next();
|
||||||
|
assertTrue(rec2.getReadName() == rec.getReadName());
|
||||||
|
assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart());
|
||||||
|
|
||||||
|
// find out how many reads we've taken now
|
||||||
|
int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken();
|
||||||
|
|
||||||
|
count++;
|
||||||
|
if (count < 100*100) assertEquals(readCnt + 1, readCnt2);
|
||||||
|
else assertEquals(readCnt, readCnt2);
|
||||||
|
}
|
||||||
|
assertEquals(100 * 100, count );
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue