We're no longer in the read-dropping business.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@901 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-06-04 22:37:51 +00:00
parent 4d880477d6
commit 109bef6c08
18 changed files with 1171 additions and 406 deletions

View File

@ -56,16 +56,17 @@ public class IntervalShardStrategy implements ShardStrategy {
*
* @param size the next recommended shard size.
*/
public void adjustNextShardSize(long size) {
public void adjustNextShardSize( long size ) {
this.size = size;
}
/**
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
*
* @param size
* @param locations
*/
IntervalShardStrategy(long size, GenomeLocSortedSet locations) {
IntervalShardStrategy( long size, GenomeLocSortedSet locations ) {
if (locations == null || locations.isEmpty()) {
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
}
@ -73,12 +74,12 @@ public class IntervalShardStrategy implements ShardStrategy {
this.size = size;
}
/**
/**
* the default constructor
*
* @param dict the sequence dictionary to use
* @param dict the sequence dictionary to use
*/
IntervalShardStrategy(SAMSequenceDictionary dict, GenomeLocSortedSet locations) {
IntervalShardStrategy( SAMSequenceDictionary dict, GenomeLocSortedSet locations ) {
if (locations == null || locations.isEmpty()) {
throw new StingException("IntervalShardStrategy: genomic regions list is empty.");
}
@ -88,44 +89,39 @@ public class IntervalShardStrategy implements ShardStrategy {
/**
* returns true if there are additional shards
*
* @return false if we're done processing shards
*/
public boolean hasNext() {
return (!regions.isEmpty());
return ( !regions.isEmpty() );
}
/**
* gets the next Shard
*
* @return the next shard
*/
public Shard next() {
if ((this.regions == null) || (regions.isEmpty())) {
if (( this.regions == null ) || ( regions.isEmpty() )) {
throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
}
// get the first region in the list
GenomeLoc loc = regions.iterator().next();
if (loc.getStop() - loc.getStart() <= this.size) {
regions.removeRegion(loc);
return new IntervalShard(loc);
} else {
GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1);
regions.removeRegion(subLoc);
return new IntervalShard(subLoc);
}
regions.removeRegion(loc);
return new IntervalShard(loc);
}
/**
* we don't support the remove command
*/
/** we don't support the remove command */
public void remove() {
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
}
/**
* makes the ReadIntervalShard iterable, i.e. usable in a for loop.
*
* @return
*/
public Iterator<Shard> iterator() {

View File

@ -5,13 +5,11 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -35,43 +33,53 @@ import java.util.List;
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */
private Reads reads = null;
/** our SAM data files */
private final SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
// How strict should we be with SAM/BAM parsing?
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
// our list of readers
private final List<File> samFileList = new ArrayList<File>();
/** SAM header file. */
private final SAMFileHeader header;
// used for the reads case, the last count of reads retrieved
private long readsTaken = 0;
long readsTaken = 0;
// our last genome loc position
private GenomeLoc lastReadPos = null;
GenomeLoc lastReadPos = null;
// do we take unmapped reads
private boolean includeUnmappedReads = true;
// reads based traversal variables
private boolean intoUnmappedReads = false;
private int readsAtLastPos = 0;
private int readsSeenAtLastPos = 0;
/**
* package protected getter and setter for the iterator generator
*
* @return
*/
IteratorGenerator getIterGen() {
return iterGen;
}
void setIterGen( IteratorGenerator iterGen ) {
this.iterGen = iterGen;
}
// where we get out iterators from
private IteratorGenerator iterGen;
/**
* constructor, given sam files
*
* @param reads the list of sam files
* @param byReads are we a by reads traversal, or a loci traversal. We could delete this field
* if we passed in iterGen, which would be a better (although more complicated for the
* consumers of SAMDataSources).
*/
public SAMDataSource(Reads reads) throws SimpleDataSourceLoadException {
public SAMDataSource( Reads reads, boolean byReads ) throws SimpleDataSourceLoadException {
this.reads = reads;
// check the length
@ -82,32 +90,11 @@ public class SAMDataSource implements SimpleDataSource {
if (!smFile.canRead()) {
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
}
samFileList.add(smFile);
}
iterGen = new MSR2IteratorGenerator(reads, byReads);
header = createHeaderMerger().getMergedHeader();
}
/**
* Load a SAM/BAM, given an input file.
*
* @param samFile the file name
* @return a SAMFileReader for the file, null if we're attempting to read a list
*/
private SAMFileReader initializeSAMFile(final File samFile) {
if (samFile.toString().endsWith(".list")) {
return null;
} else {
SAMFileReader samReader = new SAMFileReader(samFile, true);
samReader.setValidationStringency(strictness);
final SAMFileHeader header = samReader.getFileHeader();
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
return samReader;
}
}
/**
* <p>
@ -115,18 +102,12 @@ public class SAMDataSource implements SimpleDataSource {
* </p>
*
* @param location the genome location to extract data for
*
* @return an iterator for that region
*/
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
// right now this is very heavy, it copies the file list into a reader list every time
SamFileHeaderMerger headerMerger = createHeaderMerger();
// make a merging iterator for this record
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
// make a merging iterator for this record
CloseableIterator<SAMRecord> iter = iterGen.seek(location);
// return the iterator
return StingSAMIteratorAdapter.adapt(reads, iter);
}
@ -137,9 +118,10 @@ public class SAMDataSource implements SimpleDataSource {
* </p>
*
* @param shard the shard to get data for
*
* @return an iterator for that region
*/
public StingSAMIterator seek(Shard shard) throws SimpleDataSourceLoadException {
public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException {
StingSAMIterator iterator = null;
if (shard.getShardType() == Shard.ShardType.READ) {
iterator = seekRead((ReadShard) shard);
@ -149,8 +131,7 @@ public class SAMDataSource implements SimpleDataSource {
reads.getMaxOnTheFlySorts(),
reads.getFilterZeroMappingQualityReads(),
reads.getSafetyChecking());
} else if (shard.getShardType() == Shard.ShardType.LOCUS ||
shard.getShardType() == Shard.ShardType.INTERVAL) {
} else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) {
iterator = seekLocus(shard.getGenomeLoc());
iterator = TraversalEngine.applyDecoratingIterators(false,
iterator,
@ -158,8 +139,7 @@ public class SAMDataSource implements SimpleDataSource {
reads.getMaxOnTheFlySorts(),
reads.getFilterZeroMappingQualityReads(),
reads.getSafetyChecking());
}
else {
} else {
throw new StingException("seek: Unknown shard type");
}
@ -173,17 +153,7 @@ public class SAMDataSource implements SimpleDataSource {
* @return SAM file header.
*/
public SAMFileHeader getHeader() {
return header;
}
/**
* create the merging header.
*
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
*/
private SamFileHeaderMerger createHeaderMerger() {
List<SAMFileReader> lst = GetReaderList();
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
return this.iterGen.getHeader();
}
@ -193,26 +163,33 @@ public class SAMDataSource implements SimpleDataSource {
* </p>
*
* @param shard the read shard to extract from
*
* @return an iterator for that region
*/
private BoundedReadIterator seekRead(ReadShard shard) throws SimpleDataSourceLoadException {
private BoundedReadIterator seekRead( ReadShard shard ) throws SimpleDataSourceLoadException {
BoundedReadIterator bound = null;
SamFileHeaderMerger headerMerger = createHeaderMerger();
MergingSamRecordIterator2 iter = null;
CloseableIterator<SAMRecord> iter = null;
if (!intoUnmappedReads) {
iter = new MergingSamRecordIterator2(headerMerger);
bound = fastMappedReadSeek(shard.getSize(), iter);
if (lastReadPos == null) {
lastReadPos = new GenomeLoc(iterGen.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
iter = iterGen.seek(lastReadPos);
return InitialReadIterator(shard.getSize(), iter);
} else {
lastReadPos.setStop(-1);
iter = iterGen.seek(lastReadPos);
bound = fastMappedReadSeek(shard.getSize(), StingSAMIteratorAdapter.adapt(reads, iter));
}
}
if ((bound == null || intoUnmappedReads) && includeUnmappedReads) {
if (( bound == null || intoUnmappedReads ) && includeUnmappedReads) {
if (iter != null) {
iter.close();
}
iter = new MergingSamRecordIterator2(createHeaderMerger());
bound = toUnmappedReads(shard.getSize(), iter);
iter = iterGen.seek(null);
bound = toUnmappedReads(shard.getSize(), (PeekingStingIterator) iter);
}
if (bound == null) {
shard.signalDone();
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0);
@ -228,7 +205,7 @@ public class SAMDataSource implements SimpleDataSource {
*
* @param seeUnMappedReads true to see unmapped reads, false otherwise
*/
public void viewUnmappedReads(boolean seeUnMappedReads) {
public void viewUnmappedReads( boolean seeUnMappedReads ) {
includeUnmappedReads = seeUnMappedReads;
}
@ -238,19 +215,21 @@ public class SAMDataSource implements SimpleDataSource {
*
* @param readCount how many reads to retrieve
* @param iter the iterator to use
*
* @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException
*/
private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
<U extends PeekingStingIterator> BoundedReadIterator toUnmappedReads( long readCount, U iter ) throws SimpleDataSourceLoadException {
int count = 0;
int cnt = 0;
SAMRecord d = null;
while (iter.hasNext()) {
d = iter.peek();
int x = d.getReferenceIndex();
if (x < 0 || x >= d.getHeader().getSequenceDictionary().getSequences().size()) {
if (x < 0)
// we have the magic read that starts the unmapped read segment!
break;
}
cnt++;
iter.next();
}
@ -262,6 +241,7 @@ public class SAMDataSource implements SimpleDataSource {
// now walk until we've taken the unmapped read count
while (iter.hasNext() && count < this.readsTaken) {
iter.next();
count++;
}
// check to see what happened, did we run out of reads?
@ -280,108 +260,164 @@ public class SAMDataSource implements SimpleDataSource {
* A seek function for unmapped reads.
*
* @param readCount how many reads to retrieve
* @param iter the iterator to use
* @param iter the iterator to use, seeked to the correct start location
*
* @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException
*/
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
if (lastReadPos == null) {
BoundedReadIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
BoundedReadIterator bound;
correctForReadPileupSeek(iter);
if (readsTaken == 0) {
return InitialReadIterator(readCount, iter);
} else {
BoundedReadIterator bound;
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
}
int x = 0;
SAMRecord rec = null;
int lastPos = 0;
// move the number of reads we read from the last pos
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
while (iter.hasNext() && this.readsAtLastPos > 0) {
iter.next();
--readsAtLastPos;
atLeastOneReadSeen = true;
}
if (readsAtLastPos != 0 && atLeastOneReadSeen) {
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
}
int x = 0;
SAMRecord rec = null;
int lastPos = 0;
while (x < readsTaken) {
if (iter.hasNext()) {
rec = iter.next();
if (lastPos == rec.getAlignmentStart()) {
++this.readsAtLastPos;
} else {
this.readsAtLastPos = 1;
}
lastPos = rec.getAlignmentStart();
++x;
while (x < readsTaken) {
if (iter.hasNext()) {
rec = iter.next();
if (lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos;
else this.readsSeenAtLastPos = 1;
lastPos = rec.getAlignmentStart();
++x;
} else {
// jump contigs
if (lastReadPos.toNextContig() == false) {
// check to see if we're using unmapped reads, if not return, we're done
readsTaken = 0;
intoUnmappedReads = true;
return null;
} else {
// jump contigs
if (lastReadPos.toNextContig() == false) {
// check to see if we're using unmapped reads, if not return, we're done
readsTaken = 0;
intoUnmappedReads = true;
return null;
} else {
iter.close();
// now merge the headers
// right now this is pretty damn heavy, it copies the file list into a reader list every time
SamFileHeaderMerger mg = createHeaderMerger();
iter = new MergingSamRecordIterator2(mg);
iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
}
readsTaken = readCount;
readsSeenAtLastPos = 0;
lastReadPos.setStop(-1);
CloseableIterator<SAMRecord> ret = iterGen.seek(lastReadPos);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
}
}
// if we're off the end of the last contig (into unmapped territory)
if (rec != null && rec.getAlignmentStart() == 0) {
readsTaken += readCount;
intoUnmappedReads = true;
}
// else we're not off the end, store our location
else if (rec != null) {
int stopPos = rec.getAlignmentStart();
if (stopPos < lastReadPos.getStart()) {
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
} else {
lastReadPos.setStop(rec.getAlignmentStart());
}
}
// in case we're run out of reads, get out
else {
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
//return null;
}
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
// return the iterator
return bound;
}
// if we're off the end of the last contig (into unmapped territory)
if (rec != null && rec.getAlignmentStart() == 0) {
readsTaken += readCount;
intoUnmappedReads = true;
}
// else we're not off the end, store our location
else if (rec != null) {
int stopPos = rec.getAlignmentStart();
if (stopPos < lastReadPos.getStart()) {
lastReadPos = new GenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
} else {
lastReadPos.setStart(rec.getAlignmentStart());
}
}
// in case we're run out of reads, get out
else {
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
//return null;
}
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
// return the iterator
return bound;
}
/**
* Even though the iterator has seeked to the correct location, there may be multiple reads at that location,
* and we may have given some of them out already. Move the iterator to the correct location using the readsAtLastPos variable
*
* @param iter the iterator
*/
private void correctForReadPileupSeek( StingSAMIterator iter ) {
// move the number of reads we read from the last pos
boolean atLeastOneReadSeen = false; // we have a problem where some chomesomes don't have a single read (i.e. the chrN_random chrom.)
while (iter.hasNext() && this.readsSeenAtLastPos > 0) {
iter.next();
--readsSeenAtLastPos;
atLeastOneReadSeen = true;
}
if (readsSeenAtLastPos != 0 && atLeastOneReadSeen) {
throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
}
}
/**
* set the initial iterator
*
* @param readCount the number of reads
* @param iter the merging iterator
*
* @return a bounded read iterator at the first read position in the file.
*/
private BoundedReadIterator InitialReadIterator(long readCount, MergingSamRecordIterator2 iter) {
private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator<SAMRecord> iter ) {
BoundedReadIterator bound;
lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
iter.queryContained(lastReadPos.getContig(), 1, -1);
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
this.readsTaken = readCount;
return bound;
}
}
/**
* iterator generator
* <p/>
* This class generates iterators for the SAMDataSource. This class was introduced for testing purposes,
* since it became increasingly hard to test the SAM data source code. The class defines two abstraact
* methods:
* <p/>
* -seek( GenomeLoc ) which returns an iterator seeked to the genome loc, and if null is passed to the default
* location (which is implementation specific).
* <p/>
* -getHeader(), which returns a SAMFileHeader for the specified IteratorGenerator. I hope we can phase this
* method out, since it doesn't seem necessary, and it would be much cleaner with out it.
*/
abstract class IteratorGenerator {
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
/**
* seek to a location
*
* @param seekTo the genome loc to seek to
*
* @return StingSAMIterator
*/
public abstract CloseableIterator<SAMRecord> seek( GenomeLoc seekTo );
/**
* get the merged header
*
* @return the merged header
*/
public abstract SAMFileHeader getHeader();
/**
* Load a SAM/BAM, given an input file.
*
* @param samFile the file name
*
* @return a SAMFileReader for the file, null if we're attempting to read a list
*/
protected static SAMFileReader initializeSAMFile( final File samFile, SAMFileReader.ValidationStringency strictness ) {
if (samFile.toString().endsWith(".list")) {
return null;
} else {
SAMFileReader samReader = new SAMFileReader(samFile, true);
samReader.setValidationStringency(strictness);
final SAMFileHeader header = samReader.getFileHeader();
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
return samReader;
}
}
/**
* A private function that, given the internal file list, generates a SamFileReader
* list of validated files.
@ -389,11 +425,11 @@ public class SAMDataSource implements SimpleDataSource {
* @return a list of SAMFileReaders that represent the stored file names
* @throws SimpleDataSourceLoadException if there's a problem loading the files
*/
private List<SAMFileReader> GetReaderList() throws SimpleDataSourceLoadException {
protected static List<SAMFileReader> GetReaderList( Reads reads, SAMFileReader.ValidationStringency strictness ) throws SimpleDataSourceLoadException {
// right now this is pretty damn heavy, it copies the file list into a reader list every time
List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
for (File f : this.samFileList) {
SAMFileReader reader = initializeSAMFile(f);
for (File f : reads.getReadsFiles()) {
SAMFileReader reader = initializeSAMFile(f, strictness);
if (reader.getFileHeader().getReadGroups().size() < 1) {
//logger.warn("Setting header in reader " + f.getName());
@ -412,4 +448,65 @@ public class SAMDataSource implements SimpleDataSource {
return lst;
}
/**
* create the merging header.
*
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
*/
protected static SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileReader.ValidationStringency strictness, SAMFileHeader.SortOrder SORT_ORDER ) {
List<SAMFileReader> lst = GetReaderList(reads, strictness);
return new SamFileHeaderMerger(lst, SORT_ORDER, true);
}
}
/**
* MSR2IteratorGenerator
* <p/>
* generates a MerginsSAMIterator2, given a genomic location. The constructor takes the reads structure,
* and a flag indicating if we're dealing with reads or loci (to determine the correct query function).
*/
class MSR2IteratorGenerator extends IteratorGenerator {
/** our read pile */
private Reads reads;
private SamFileHeaderMerger header;
// How strict should we be with SAM/BAM parsing?
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
// are we by reads or by loci
protected boolean byReads = true;
/** our SAM data files */
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
public MSR2IteratorGenerator( Reads reads, boolean byReads ) {
this.reads = reads;
this.header = IteratorGenerator.createHeaderMerger(reads, strictness, sortOrder);
this.byReads = byReads;
}
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
SamFileHeaderMerger mg = createHeaderMerger(reads, strictness, sortOrder);
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(mg, reads);
if (seekTo != null) {
if (byReads)
iter.queryContained(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
else
iter.queryOverlapping(seekTo.getContig(), (int) seekTo.getStart(), (int) seekTo.getStop());
}
return iter;
}
/**
* get the merged header
*
* @return the merged header
*/
public SAMFileHeader getHeader() {
return header.getMergedHeader();
}
}

View File

@ -75,11 +75,13 @@ public abstract class MicroScheduler {
protected MicroScheduler(Walker walker, Reads reads, File refFile, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
if (walker instanceof ReadWalker) {
traversalEngine = new TraverseReads(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads,true);
} else {
traversalEngine = new TraverseLoci(reads.getReadsFiles(), refFile, rods);
this.reads = getReadsDataSource(reads,false);
}
this.reads = getReadsDataSource(reads);
this.reference = openReferenceSequenceFile(refFile);
this.rods = getReferenceOrderedDataSources(rods);
}
@ -159,12 +161,12 @@ public abstract class MicroScheduler {
* Gets a data source for the given set of reads.
* @return A data source for the given set of reads.
*/
private SAMDataSource getReadsDataSource(Reads reads) {
private SAMDataSource getReadsDataSource(Reads reads, boolean byReads) {
// By reference traversals are happy with no reads. Make sure that case is handled.
if (reads.getReadsFiles().size() == 0)
return null;
SAMDataSource dataSource = new SAMDataSource(reads);
SAMDataSource dataSource = new SAMDataSource(reads, byReads);
// Side effect: initialize the traversal engine with reads data.
// TODO: Give users a dedicated way of getting the header so that the MicroScheduler

View File

@ -7,20 +7,29 @@ import java.util.Iterator;
import org.broadinstitute.sting.gatk.Reads;
/**
/*
* Copyright (c) 2009 The Broad Institute
*
* User: aaron
* Date: Apr 14, 2009
* Time: 5:27:31 PM
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

View File

@ -31,7 +31,7 @@ import java.util.PriorityQueue;
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord> {
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, PeekingStingIterator {
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
protected final SamFileHeaderMerger samHeaderMerger;
protected final SAMFileHeader.SortOrder sortOrder;
@ -43,7 +43,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
* Constructs a new merging iterator with the same set of readers and sort order as
* provided by the header merger parameter.
*/
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) {
this.samHeaderMerger = headerMerger;
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
@ -275,6 +275,16 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
public Iterator<SAMRecord> iterator() {
return this;
}
/**
* Gets source information for the reads. Contains information about the original reads
* files, plus information about downsampling, etc.
*
* @return
*/
public Reads getSourceInfo() {
return null; //To change body of implemented methods use File | Settings | File Templates.
}
}
// Should replace picard class with the same name

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Interface Peekable
* <p/>
* This interface indicates that
*/
public interface PeekingStingIterator extends StingSAMIterator {
/**
* peek, given the specified type
* @return
*/
SAMRecord peek();
}

View File

@ -337,34 +337,7 @@ public abstract class TraversalEngine {
return true;
}
protected Iterator<SAMRecord> initializeReads() {
if( readsFiles.size() == 1 )
samReader = initializeSAMFile(readsFiles.get(0));
else
samReader = null;
return wrapReadsIterator(getReadsIterator(samReader), true);
}
protected Iterator<SAMRecord> getReadsIterator(final SAMFileReader samReader) {
// If the file has an index, querying functions are available. Use them if possible...
if ( samReader == null && readsFiles.size() > 0 ) {
SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
for ( File readsFile: readsFiles ) {
SAMFileReader reader = initializeSAMFile(readsFile);
readers.add(reader);
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
return new MergingSamRecordIterator2(headerMerger);
}
else {
return samReader.iterator();
}
}
@Deprecated
protected StingSAMIterator wrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
// Reads sourceInfo is gone by this point in the traversal engine. Stub in a null and rely on the iterator to

View File

@ -246,39 +246,7 @@ public class TraverseDuplicates extends TraversalEngine {
return sum;
}
// --------------------------------------------------------------------------------------------------------------
//
// old style interface to the system
//
// --------------------------------------------------------------------------------------------------------------
@Override
public <M,T> T traverse(Walker<M,T> walker, List<GenomeLoc> locations) {
if ( walker instanceof DuplicateWalker) {
Walker x = walker;
DuplicateWalker<?, ?> dupWalker = (DuplicateWalker<?, ?>)x;
return (T)this.traverseByRead(dupWalker, locations);
} else {
throw new IllegalArgumentException("Walker isn't a duplicate walker!");
}
}
/**
* Should we deleted at the soonist possible opportunity
*/
public <M, T> Object traverseByRead(DuplicateWalker<M, T> walker, List<GenomeLoc> locations) {
samReadIter = initializeReads();
// Initialize the walker
walker.initialize();
// Initialize the sum
FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc());
T sum = actuallyTraverse(walker, filterIter, walker.reduceInit());
//printOnTraversalDone("reads", sum);
walker.onTraversalDone(sum);
return sum;
}
// --------------------------------------------------------------------------------------------------------------
//

View File

@ -0,0 +1,165 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.broadinstitute.sting.gatk.Reads;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/** this fake iterator allows us to look at how specific piles of reads are handled */
public class ArtificialSAMIterator implements PeekingStingIterator {
protected int currentChromo = 0;
protected int currentRead = 0;
protected int totalReadCount = 0;
protected boolean done = false;
// the next record
protected SAMRecord next = null;
protected SAMFileHeader header = null;
// the passed in parameters
protected final int sChr;
protected final int eChromosomeCount;
protected final int rCount;
protected int uMappedReadCount;
// let us know to make a read, we need this to help out the fake sam query iterator
private boolean initialized = false;
/**
* create the fake iterator, given the mapping of chromosomes and read counts
*
* @param startingChr the starting chromosome
* @param endingChr the ending chromosome
* @param readCount the number of reads in each chromosome
* @param header the associated header
*/
ArtificialSAMIterator( int startingChr, int endingChr, int readCount, SAMFileHeader header ) {
sChr = startingChr;
eChromosomeCount = (endingChr - startingChr) + 1;
rCount = readCount;
this.header = header;
this.currentChromo = 0;
uMappedReadCount = 0;
}
/**
* create the fake iterator, given the mapping of chromosomes and read counts
*
* @param startingChr the starting chromosome
* @param endingChr the ending chromosome
* @param readCount the number of reads in each chromosome
* @param header the associated header
*/
ArtificialSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) {
sChr = startingChr;
eChromosomeCount = (endingChr - startingChr) + 1;
rCount = readCount;
this.header = header;
this.currentChromo = 0;
this.uMappedReadCount = unmappedReadCount;
}
public Reads getSourceInfo() {
throw new UnsupportedOperationException("We don't support this");
}
public void close() {
// done
currentChromo = Integer.MAX_VALUE;
}
public boolean hasNext() {
if (!initialized){
initialized = true;
createNextRead();
}
if (this.next != null) {
return true;
}
return false;
}
private boolean createNextRead() {
if (currentRead >= rCount) {
currentChromo++;
currentRead = 0;
}
// check for end condition, have we finished the chromosome listing, and have no unmapped reads
if (currentChromo >= eChromosomeCount) {
if (uMappedReadCount < 1) {
this.next = null;
return false;
} else {
++totalReadCount;
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), -1, -1, 50);
--uMappedReadCount;
return true;
}
}
++totalReadCount;
this.next = ArtificialSamUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), currentChromo, currentRead, 50);
++currentRead;
return true;
}
public SAMRecord next() {
SAMRecord ret = next;
createNextRead();
return ret;
}
public void remove() {
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
}
public Iterator<SAMRecord> iterator() {
return this;
}
/**
* some instrumentation methods
*/
public int readsTaken() {
return totalReadCount;
}
/**
* peek at the next sam record
*
* @return
*/
public SAMRecord peek() {
return this.next;
}
}

View File

@ -0,0 +1,181 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.SAMRecord;
import java.util.List;
import org.broadinstitute.sting.utils.StingException;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class ArtificialSAMQueryIterator
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
// get the next positon
protected int finalPos = 0;
protected int startPos = 0;
protected int contigIndex = -1;
protected boolean overlapping = false;
protected int startingChr = 0;
protected boolean seeked = false;
/**
* create the fake iterator, given the mapping of chromosomes and read counts
*
* @param startingChr the starting chromosome
* @param endingChr the ending chromosome
* @param readCount the number of reads in each chromosome
* @param header the associated header
*/
ArtificialSAMQueryIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header ) {
super(startingChr, endingChr, readCount, unmappedReadCount, header);
this.startingChr = startingChr;
}
/**
* query containing - get reads contained by the specified interval
*
* @param contig the contig index string
* @param start the start position
* @param stop the stop position
*/
public void queryContained( String contig, int start, int stop ) {
this.overlapping = false;
initialize(contig, start, stop);
}
/**
* query containing - get reads contained by the specified interval
*
* @param contig the contig index string
* @param start the start position
* @param stop the stop position
*/
public void queryOverlapping( String contig, int start, int stop ) {
this.overlapping = true;
initialize(contig, start, stop);
}
/**
* initialize the query iterator
*
* @param contig the contig
* @param start the start position
* @param stop the stop postition
*/
private void initialize( String contig, int start, int stop ) {
ensureUntouched();
finalPos = stop;
startPos = start;
if (finalPos < 0) {
finalPos = Integer.MAX_VALUE;
}
// sanity check that we have the contig
contigIndex = -1;
List<SAMSequenceRecord> list = header.getSequenceDictionary().getSequences();
for (SAMSequenceRecord rec : list) {
if (rec.getSequenceName().equals(contig)) {
contigIndex = rec.getSequenceIndex();
}
}
if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); }
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
super.next();
}
if (!super.hasNext()) {
throw new StingException("Unable to find the target chromosome");
}
while (super.hasNext() && this.peek().getAlignmentStart() < start) {
super.next();
}
// sanity check that we have an actual matching read next
SAMRecord rec = this.peek();
if (!matches(rec)) {
throw new StingException("The next read doesn't match");
}
// set the seeked variable to true
seeked = true;
}
/**
* given a read and the query type, check if it matches our regions
*
* @param rec the read
*
* @return true if it belongs in our region
*/
public boolean matches( SAMRecord rec ) {
if (rec.getReferenceIndex() != this.contigIndex) {
return false;
}
if (!overlapping) {
// if the start or the end are somewhere within our range
if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) {
return true;
}
} else {
if (( rec.getAlignmentStart() <= finalPos && rec.getAlignmentStart() >= startPos ) ||
( rec.getAlignmentEnd() <= finalPos && rec.getAlignmentEnd() >= startPos )) {
return true;
}
}
return false;
}
/**
* override the hasNext, to incorportate our limiting factor
*
* @return
*/
public boolean hasNext() {
boolean res = super.hasNext();
if (!seeked) {
return res;
}
if (res && matches(this.next)) {
return true;
}
return false;
}
/** make sure we haven't been used as an iterator yet; this is to miror the MergingSamIterator2 action. */
public void ensureUntouched() {
if (this.currentChromo != 0 || this.currentRead > 0) {
throw new UnsupportedOperationException("We've already been used as an iterator; you can't query after that");
}
}
}

View File

@ -5,7 +5,7 @@ import net.sf.samtools.*;
import java.io.File;
import java.util.*;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.broadinstitute.sting.gatk.Reads;
/**
@ -28,11 +28,6 @@ import org.broadinstitute.sting.gatk.Reads;
/**
* @author aaron
* @version 1.0
* @date May 21, 2009
* <p/>
* Class samUtils
* <p/>
* A collection of utilities for making sam and bam files. Mostly these are for creating sam and bam files for testing purposes.
*/
public class ArtificialSamUtils {
public static final int DEFAULT_READ_LENGTH = 50;
@ -46,7 +41,7 @@ public class ArtificialSamUtils {
* @param chromosomeSize how large each chromosome is
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
*/
public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) {
public static void createArtificialBamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) {
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
File outFile = new File(filename);
@ -70,7 +65,7 @@ public class ArtificialSamUtils {
* @param chromosomeSize how large each chromosome is
* @param readsPerChomosome how many reads to make in each chromosome. They'll be aligned from position 1 to x (which is the number of reads)
*/
public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) {
public static void createArtificialSamFile( String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome ) {
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
File outFile = new File(filename);
@ -91,14 +86,15 @@ public class ArtificialSamUtils {
* @param numberOfChromosomes the number of chromosomes to create
* @param startingChromosome the starting number for the chromosome (most likely set to 1)
* @param chromosomeSize the length of each chromosome
*
* @return
*/
public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) {
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
SAMFileHeader header = new SAMFileHeader();
SAMSequenceDictionary dict = new SAMSequenceDictionary();
// make up some sequence records
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */);
SAMSequenceRecord rec = new SAMSequenceRecord("chr" + ( x ), chromosomeSize /* size */);
rec.setSequenceLength(chromosomeSize);
dict.addSequence(rec);
}
@ -114,9 +110,10 @@ public class ArtificialSamUtils {
* @param refIndex the reference index, i.e. what chromosome to associate it with
* @param alignmentStart where to start the alignment
* @param length the length of the read
*
* @return the artificial read
*/
public static SAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
SAMRecord record = new SAMRecord(header);
record.setReadName(name);
record.setReferenceIndex(refIndex);
@ -131,95 +128,64 @@ public class ArtificialSamUtils {
/**
* create an iterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static StingSAMIterator unmappedReadIterator(int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
Map<Integer, Integer> map = new HashMap<Integer, Integer>();
for (int x = startingChr; x < endingChr; x++) {
map.put(x,readCount);
}
return new ArtificialSAMIterator(startingChr, endingChr, readCount,header);
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMIterator(startingChr, endingChr, readCount, header);
}
}
/**
* this fake iterator allows us to look at how specific piles of reads are handled
*/
class ArtificialSAMIterator implements StingSAMIterator {
private int currentChromo = 0;
private int currentRead = 0;
private int readCount = 0;
private boolean done = false;
// the next record
private SAMRecord next = null;
SAMFileHeader header = null;
// the passed in parameters
final int sChr;
final int eChr;
final int rCount;
/**
* create the fake iterator, given the mapping of chromosomes and read counts
* @param startingChr the starting chromosome
* @param endingChr the ending chromosome
* @param readCount the number of reads in each chromosome
* @param header the associated header
* create an iterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
*
* @return StingSAMIterator representing the specified amount of fake data
*/
ArtificialSAMIterator(int startingChr, int endingChr, int readCount, SAMFileHeader header) {
sChr = startingChr;
eChr = endingChr;
rCount = readCount;
this.header = header;
this.currentChromo = startingChr;
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
}
public Reads getSourceInfo() {
throw new UnsupportedOperationException("We don't support this");
/**
* create an ArtificialSAMQueryIterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
}
public void close() {
// done
currentChromo = Integer.MAX_VALUE;
}
/**
* create an ArtificialSAMQueryIterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
public boolean hasNext() {
if (currentRead >= rCount) {
currentChromo++;
currentRead = 0;
}
if (currentChromo >= eChr) {
return false;
}
int alignment = 0;
if (currentChromo >= 0) {
alignment = currentRead;
} else {
alignment = 0;
}
this.next = ArtificialSamUtils.createArtificialRead(this.header,String.valueOf(readCount), currentChromo, alignment, 50);
currentRead++;
return true;
}
public SAMRecord next() {
return next;
}
public void remove() {
throw new UnsupportedOperationException("You've tried to remove on a StingSAMIterator (unsupported), not to mention that this is a fake iterator.");
}
public Iterator<SAMRecord> iterator() {
return this;
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
}
}

View File

@ -43,6 +43,12 @@ import net.sf.samtools.SAMFileHeader;
* Class ReadIntervalShardStrategyTest
* <p/>
* Tests the ReadIntervalShardStrategy class
*
*
* TODO: this test has been changed, since we now force interval shards to not subdivide if they're large,
* so you should always get one shard back, reguardless of the specified length. If this behavior changes
* back in the future, the expected number of shards should change from 1 to > 1.
*
*/
public class IntervalShardStrategyTest extends BaseTest {
@ -75,7 +81,7 @@ public class IntervalShardStrategyTest extends BaseTest {
counter++;
}
assertTrue(d instanceof IntervalShard);
assertEquals(10, counter);
assertEquals(1, counter);
}
@Test
@ -92,7 +98,7 @@ public class IntervalShardStrategyTest extends BaseTest {
counter++;
}
assertTrue(d instanceof IntervalShard);
assertEquals(50, counter);
assertEquals(5, counter);
}
@Test
@ -105,16 +111,11 @@ public class IntervalShardStrategyTest extends BaseTest {
int counter = 0;
while (strat.hasNext()) {
Shard d = strat.next();
if (counter % 2 == 0) {
assertEquals(1, d.getGenomeLoc().getStart());
assertEquals(789, d.getGenomeLoc().getStop());
} else {
assertEquals(790, d.getGenomeLoc().getStart());
assertEquals(1000, d.getGenomeLoc().getStop());
}
counter++;
}
assertEquals(10, counter);
assertEquals(5, counter);
}

View File

@ -89,7 +89,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
Reads reads = new Reads(fl);
try {
SAMDataSource data = new SAMDataSource(reads);
SAMDataSource data = new SAMDataSource(reads,false);
for (Shard sh : strat) {
int readCount = 0;
count++;
@ -139,7 +139,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
int count = 0;
try {
SAMDataSource data = new SAMDataSource(reads);
SAMDataSource data = new SAMDataSource(reads,false);
for (Shard sh : strat) {
int readCount = 0;
count++;
@ -176,7 +176,7 @@ public class SAMBAMDataSourceTest extends BaseTest {
logger.debug("Pile two:");
try {
SAMDataSource data = new SAMDataSource(reads);
SAMDataSource data = new SAMDataSource(reads,false);
for (Shard sh : strat) {
int readCount = 0;
count++;

View File

@ -2,15 +2,20 @@ package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.Test;
@ -38,16 +43,14 @@ import java.util.List;
/**
* @author aaron
* @version 1.0
* @date Apr 15, 2009
* <p/>
* Class SAMByReadsTest
* <p/>
* Test sam traversal by reads FIX ME
*/
public class SAMByReadsTest extends BaseTest {
private FastaSequenceFile2 seq;
private List<File> fl;
ShardStrategy shardStrategy;
Reads reads;
private int targetReadCount = 14;
/**
* This function does the setup of our parser, before each method call.
@ -59,56 +62,42 @@ public class SAMByReadsTest extends BaseTest {
fl = new ArrayList<File>();
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
//seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly17/v0/Homo_sapiens_assembly17.fasta"));
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
// setup the test files
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
reads = new Reads(fl);
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testTotalReadCount() {
logger.warn("Executing testTotalReadCount");
// setup the test files
fl.add(new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"));
Reads reads = new Reads(fl);
final int targetReadCount = 5000;
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,seq.getSequenceDictionary(),targetReadCount);
public void testToUnmappedReads() {
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
try {
SAMDataSource data = new SAMDataSource(reads);
// check the total read count
final int totalReads = 10000;
int readsSeen = 0;
BoundedReadIterator iter;
for (Shard sd : shardStrategy) {
int readcnt = 0;
iter = (BoundedReadIterator)data.seek(sd);
for (SAMRecord r : iter) {
readcnt++;
int unmappedReadsSeen = 0;
int iterations = 0;
SAMDataSource data = new SAMDataSource(reads,true);
for (int x = 0; x < 10; x++) {
++iterations;
PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10, 1000);
BoundedReadIterator ret = data.toUnmappedReads(100, iter);
// count the reads we've gotten back
if (ret == null) {
fail("On iteration " + iterations + " we were returned a null pointer, after seeing " + unmappedReadsSeen + " reads out of a 1000");
}
while (ret.hasNext()) {
ret.next();
unmappedReadsSeen++;
}
readsSeen += readcnt;
//logger.warn("Seen " + readsSeen + " reads.");
}
// make sure we've seen all the reads
assertEquals(totalReads,readsSeen);
logger.warn("Success " + readsSeen + " equals target count of " + totalReads);
assertEquals(1000,unmappedReadsSeen);
}
catch (SimpleDataSourceLoadException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
@ -116,4 +105,145 @@ public class SAMByReadsTest extends BaseTest {
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testShardingOfReadsSize14() {
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
targetReadCount = 14;
try {
int iterations = 0;
int readCount = 0;
SAMDataSource data = new SAMDataSource(reads,true);
data.setIterGen(gen);
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
while (shardStrategy.hasNext()) {
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
assertTrue(ret != null);
while (ret.hasNext()) {
ret.next();
readCount++;
}
ret.close();
iterations++;
}
// assert that we saw 2000 reads
assertEquals(2000,readCount);
/**
* this next assertion is based on the following logic:
* 14 reads per shard = 8 shards per each 100 read chromosome
* 10 chromosomes = 8 * 10 = 80
* 1000 unmapped reads / 14 = 72
* 1 iteration at the end to know we're done
* 80 + 72 + 1 = 153
*/
assertEquals(153,iterations);
}
catch (SimpleDataSourceLoadException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
}
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testShardingOfReadsSize25() {
ArtificialIteratorGenerator gen = new ArtificialIteratorGenerator(1,10,100,1000);
GenomeLoc.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
targetReadCount = 25;
try {
int iterations = 0;
int readCount = 0;
SAMDataSource data = new SAMDataSource(reads,true);
data.setIterGen(gen);
shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
while (shardStrategy.hasNext()) {
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
assertTrue(ret != null);
while (ret.hasNext()) {
ret.next();
readCount++;
}
ret.close();
iterations++;
}
// assert that we saw 2000 reads
assertEquals(2000,readCount);
/**
* this next assertion is based on the following logic:
* 25 reads per shard = 5 shards (1 on the end to realize we're done)
* 10 chromosomes = 5 * 10 = 50
* 1000 unmapped reads / 25 = 40
* 1 iteration at the end to know we're done
* 50 + 40 + 1 = 91
*/
assertEquals(91,iterations);
}
catch (SimpleDataSourceLoadException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
fail("testLinearBreakIterateAll: We Should not get a SimpleDataSourceLoadException");
}
}
}
/**
* use this to inject into SAMDataSource for testing
*/
class ArtificialIteratorGenerator extends IteratorGenerator {
// How strict should we be with SAM/BAM parsing?
protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;
// the header
private SAMFileHeader header;
private int endingChr;
private int startingChr;
private int readCount;
private int readSize;
/** our SAM data files */
private final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
public ArtificialIteratorGenerator( int startingChr, int endingChr, int readCount, int readSize) {
header = ArtificialSamUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + readSize);
}
public CloseableIterator<SAMRecord> seek( GenomeLoc seekTo ) {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 10, 100, 1000);
if (seekTo != null) {
iter.queryContained(seekTo.getContig(), (int)seekTo.getStart(), (int)seekTo.getStop());
}
return iter;
}
/**
* get the merged header
*
* @return the merged header
*/
public SAMFileHeader getHeader() {
return this.header;
}
}

View File

@ -85,7 +85,7 @@ public class BoundedReadIteratorTest extends BaseTest {
long shardReadCount = 0;
try {
SAMDataSource data = new SAMDataSource(reads);
SAMDataSource data = new SAMDataSource(reads,true);
// make sure we have a shard
if (!strat.hasNext()) {

View File

@ -122,7 +122,7 @@ public class TraverseReadsTest extends BaseTest {
ref.getSequenceDictionary(),
readSize);
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true);
dataSource.viewUnmappedReads(false);
countReadWalker.initialize();
@ -170,7 +170,7 @@ public class TraverseReadsTest extends BaseTest {
ref.getSequenceDictionary(),
readSize);
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList));
SAMDataSource dataSource = new SAMDataSource(new Reads(bamList),true);
dataSource.viewUnmappedReads(true);
countReadWalker.initialize();

View File

@ -0,0 +1,115 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import net.sf.samtools.SAMRecord;
import static junit.framework.Assert.assertTrue;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class ArtificialSAMQueryIteratorTest
* <p/>
* a test for the ArtificialSAMQueryIterator class.
*/
public class ArtificialSAMQueryIteratorTest extends BaseTest {
@Test
public void testWholeChromosomeQuery() {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
iter.queryContained("chr1", 1, -1);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
count++;
}
assertEquals(100, count);
}
@Test
public void testContainedQueryStart() {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
iter.queryContained("chr1", 1, 50);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
count++;
}
assertEquals(1, count);
}
@Test
public void testOverlappingQueryStart() {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
iter.queryOverlapping("chr1", 1, 50);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
count++;
}
assertEquals(50, count);
}
@Test
public void testContainedQueryMiddle() {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
iter.queryContained("chr1", 25, 74);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
count++;
}
assertEquals(1, count);
}
@Test
public void testOverlappingQueryMiddle() {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
iter.queryOverlapping("chr1", 25, 74);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
count++;
}
assertEquals(50, count);
}
@Test(expected = IllegalArgumentException.class)
public void testUnknownChromosome() {
ArtificialSAMQueryIterator iter = ArtificialSamUtils.queryReadIterator(1, 2, 100);
iter.queryOverlapping("chr621", 25, 74);
}
}

View File

@ -0,0 +1,109 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import static junit.framework.Assert.assertTrue;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Jun 3, 2009
* Time: 3:09:34 AM
* To change this template use File | Settings | File Templates.
*/
public class ArtificialSamUtilsTest extends BaseTest {
@Test
public void basicReadIteratorTest() {
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
count++;
}
assertEquals(100 * 100, count);
}
@Test
public void tenPerChromosome() {
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 10);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
assertEquals(Integer.valueOf(Math.round(count / 10)), rec.getReferenceIndex());
count++;
}
assertEquals(100 * 10, count);
}
@Test
public void onePerChromosome() {
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 1);
int count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
assertEquals(Integer.valueOf(count), rec.getReferenceIndex());
count++;
}
assertEquals(100 * 1, count);
}
@Test
public void basicUnmappedIteratorTest() {
StingSAMIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100, 1000);
int count = 0;
for (int x = 0; x < (100* 100); x++ ) {
if (!iter.hasNext()) {
fail ("we didn't get the expected number of reads");
}
SAMRecord rec = iter.next();
assertTrue(rec.getReferenceIndex() >= 0);
count++;
}
assertEquals(count, 100 * 100);
// now we should have 1000 unmapped reads
count = 0;
while (iter.hasNext()) {
SAMRecord rec = iter.next();
assertTrue(rec.getReferenceIndex() < 0);
count++;
}
assertEquals(1000, count);
}
@Test
public void testPeeking() {
PeekingStingIterator iter = ArtificialSamUtils.unmappedReadIterator(1, 100, 100);
int count = 0;
while (iter.hasNext()) {
int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken();
// peek the record
SAMRecord rec = iter.peek();
assertTrue(rec.getReferenceIndex() >= 0);
// next the record
SAMRecord rec2 = iter.next();
assertTrue(rec2.getReadName() == rec.getReadName());
assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart());
// find out how many reads we've taken now
int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken();
count++;
if (count < 100*100) assertEquals(readCnt + 1, readCnt2);
else assertEquals(readCnt, readCnt2);
}
assertEquals(100 * 100, count );
}
}