First pass at eliminating the old sharding system. Classes required for the original sharding system are gone where I could identify them, but hierarchies that were split to support two sharding systems have not yet been taken apart.
@Eric: ~4k lines.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3580 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-06-17 20:17:31 +00:00
parent b694ca9633
commit 612c3fdd9d
35 changed files with 28 additions and 3493 deletions

View File

@ -726,7 +726,7 @@ public class GenomeAnalysisEngine {
ValidationExclusion exclusions) {
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
// sharding system; it's required with the new sharding system only for locus walkers.
if(readsDataSource != null && !readsDataSource.hasIndex() && (argCollection.disableExperimentalSharding || walker instanceof LocusWalker)) {
if(readsDataSource != null && !readsDataSource.hasIndex() && walker instanceof LocusWalker) {
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM) || intervals != null)
throw new StingException("The GATK cannot currently process unindexed BAM files");
@ -756,27 +756,21 @@ public class GenomeAnalysisEngine {
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
Utils.scareUser("Locus walkers can only walk over coordinate-sorted data. Please resort your input BAM file.");
shardType = (walker.isReduceByInterval()) ?
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
!argCollection.disableExperimentalSharding ? ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL : shardType,
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE,
intervals, maxIterations);
} else
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
referenceDataSource.getReference(),
!argCollection.disableExperimentalSharding ? ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL : ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
drivingDataSource.getSequenceDictionary(),
SHARD_SIZE, maxIterations);
} else if (walker instanceof ReadWalker ||
walker instanceof DuplicateWalker) {
if(!argCollection.disableExperimentalSharding)
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
else
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS;
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
if (intervals != null && !intervals.isEmpty()) {
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
@ -793,8 +787,6 @@ public class GenomeAnalysisEngine {
SHARD_SIZE, maxIterations);
}
} else if (walker instanceof ReadPairWalker) {
if(argCollection.disableExperimentalSharding)
Utils.scareUser("Pairs traversal cannot be used in conjunction with the old sharding system.");
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
Utils.scareUser("Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
if(intervals != null && !intervals.isEmpty())
@ -822,13 +814,7 @@ public class GenomeAnalysisEngine {
if (reads.getReadsFiles().size() == 0)
return null;
SAMDataSource dataSource = null;
if(!argCollection.disableExperimentalSharding)
dataSource = new BlockDrivenSAMDataSource(reads);
else
dataSource = new IndexDrivenSAMDataSource(reads);
return dataSource;
return new BlockDrivenSAMDataSource(reads);
}
/**

View File

@ -168,10 +168,6 @@ public class GATKArgumentCollection {
// @Argument(fullName = "enableRodWalkers", shortName = "erw", doc = "Enable experimental rodWalker support. TEMPORARY HACK TO ALLOW EXPERIMENTATION WITH ROD WALKERS. [default is false]}.", required = false)
// public boolean enableRodWalkers = false;
@Element(required = false)
@Argument(fullName = "disable_experimental_sharding",shortName="ds", doc="Disable the experimental sharding strategy.", required = false)
public boolean disableExperimentalSharding = false;
@ElementList(required = false)
@Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
public List<String> readGroupBlackList = null;

View File

@ -46,24 +46,14 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa
*/
private final Map<SAMReaderID,SAMFileSpan> fileSpans;
/**
* An IndexDelimitedLocusShard can be used either for LOCUS or LOCUS_INTERVAL shard types.
* Track which type is being used.
*/
private final ShardType shardType;
/**
* Create a new locus shard, divided by index.
* @param intervals List of intervals to process.
* @param fileSpans File spans associated with that interval.
* @param shardType Type of the shard; must be either LOCUS or LOCUS_INTERVAL.
*/
IndexDelimitedLocusShard(List<GenomeLoc> intervals, Map<SAMReaderID,SAMFileSpan> fileSpans, ShardType shardType) {
IndexDelimitedLocusShard(List<GenomeLoc> intervals, Map<SAMReaderID,SAMFileSpan> fileSpans) {
super(intervals);
this.fileSpans = fileSpans;
if(shardType != ShardType.LOCUS && shardType != ShardType.LOCUS_INTERVAL)
throw new StingException("Attempted to create an IndexDelimitedLocusShard with invalid shard type: " + shardType);
this.shardType = shardType;
}
/**
@ -125,6 +115,6 @@ public class IndexDelimitedLocusShard extends LocusShard implements BAMFormatAwa
*/
@Override
public ShardType getShardType() {
return shardType;
return ShardType.LOCUS;
}
}

View File

@ -127,7 +127,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
public IndexDelimitedLocusShard next() {
FilePointer nextFilePointer = filePointerIterator.next();
Map<SAMReaderID,SAMFileSpan> fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null;
return new IndexDelimitedLocusShard(nextFilePointer.locations,fileSpansBounding,Shard.ShardType.LOCUS_INTERVAL);
return new IndexDelimitedLocusShard(nextFilePointer.locations,fileSpansBounding);
}
/** we don't support the remove command */

View File

@ -1,77 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Collections;
import java.util.List;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
 * @author aaron
 * <p/>
 * Class IntervalShard
 * <p/>
 * The base interval shard. All interval shards are generally the same,
 * but must report their ShardType individually: either LOCUS_INTERVAL
 * or READ_INTERVAL, fixed at construction time.
 */
public class IntervalShard implements Shard {
    /** The single genomic location this shard iterates over; defensively cloned at construction. */
    private final GenomeLoc mSet;

    /** Whether this shard drives locus-interval or read-interval traversal. */
    private final Shard.ShardType mType;

    /**
     * Create a new interval shard over the given location.
     *
     * @param myLocation   the genomic interval covered by this shard.
     * @param intervalType the type of the shard; must be LOCUS_INTERVAL or READ_INTERVAL.
     * @throws IllegalArgumentException if intervalType is any other shard type.
     */
    IntervalShard(GenomeLoc myLocation, Shard.ShardType intervalType) {
        if (intervalType != Shard.ShardType.LOCUS_INTERVAL && intervalType != Shard.ShardType.READ_INTERVAL)
            throw new IllegalArgumentException("The specified interval type must be either LOCUS_INTERVAL or READ_INTERVAL");
        mType = intervalType;
        mSet = myLocation.clone();
    }

    /** @return the genome location represented by this shard, as a singleton list */
    public List<GenomeLoc> getGenomeLocs() {
        return Collections.singletonList(mSet);
    }

    /**
     * Returns the type of this shard.
     *
     * @return either LOCUS_INTERVAL or READ_INTERVAL, as fixed at construction.
     */
    public Shard.ShardType getShardType() {
        return mType;
    }

    /**
     * String representation of this shard.
     * @return A string representation of the boundaries of this shard.
     */
    @Override
    public String toString() {
        return mSet.toString();
    }
}

View File

@ -1,115 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
 * @author aaron
 * <p/>
 * Class IntervalShardStrategy
 * <p/>
 * Implements the sharding strategy for a list of genomic locations.
 * Shards returned will be bounded by the interval; each provided interval
 * becomes exactly one shard, consumed in sorted order.
 */
public class IntervalShardStrategy implements ShardStrategy {
    /** Our storage of the genomic locations to shard over; intervals are removed as shards are produced. */
    private final GenomeLocSortedSet regions;

    /** The preferred shard size; stored for interface compliance but unused by this strategy. */
    private long size;

    /** The type stamped onto every emitted shard: LOCUS_INTERVAL or READ_INTERVAL. */
    private final Shard.ShardType type;

    /**
     * change the recommended shard size for the next shard we generate. The code will do its
     * best to respect this value, but there are no guarantees.
     *
     * @param size the next recommended shard size.
     */
    public void adjustNextShardSize( long size ) {
        this.size = size; // we don't do anything with the size in this case, but we should store it anyway
    }

    /**
     * construct the shard strategy from a shard size, a set of locations, and a shard type
     *
     * @param size      the recommended shard size (currently unused by this strategy)
     * @param locations the intervals to shard over; a null set yields an empty strategy
     * @param shardType the type stamped onto each emitted shard
     */
    IntervalShardStrategy( long size, GenomeLocSortedSet locations, Shard.ShardType shardType ) {
        type = shardType;
        this.regions = locations == null ? new GenomeLocSortedSet() : locations.clone();
        this.size = size;
    }

    /**
     * returns true if there are additional shards
     *
     * @return false if we're done processing shards
     */
    public boolean hasNext() {
        return !regions.isEmpty();
    }

    /**
     * gets the next Shard
     *
     * @return the next shard
     * @throws StingException if no intervals remain to be sharded
     */
    public Shard next() {
        // regions is final and assigned non-null in the constructor, so only emptiness must be checked
        if (regions.isEmpty()) {
            throw new StingException("IntervalShardStrategy: genomic regions list is empty in next() function.");
        }
        // get the first region in the list and consume it
        GenomeLoc loc = regions.iterator().next();
        regions.remove(loc);
        return new IntervalShard(loc, type);
    }

    /** we don't support the remove command */
    public void remove() {
        throw new UnsupportedOperationException("ShardStrategies don't support remove()");
    }

    /**
     * makes the strategy iterable, i.e. usable in a for loop.
     *
     * @return this strategy, which acts as its own iterator
     */
    public Iterator<Shard> iterator() {
        return this;
    }
}

View File

@ -1,99 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
 * @author aaron
 * @version 1.0
 * @date Apr 6, 2009
 * <p/>
 * Class LinearLocusShardStrategy
 * <p/>
 * A locus shard strategy whose shard length can be adjusted on the fly while
 * traversing: callers tune the size of the next shard via adjustNextShardSize().
 */
class LinearLocusShardStrategy extends LocusShardStrategy {
    /** Size handed out for the next shard; starts at 100,000 until a constructor or caller overrides it. */
    private long nextShardSize = 100000;

    /**
     * Build a whole-genome strategy from a sequence dictionary.
     *
     * @param dic          the sequence dictionary describing the contigs
     * @param startSize    the initial shard size
     * @param limitByCount cap on the traversal extent (-1 disables the limit)
     */
    LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, long limitByCount) {
        super(dic);
        limitingFactor = limitByCount;
        nextShardSize = startSize;
    }

    /**
     * Conversion constructor: adopt the traversal state of another locus strategy.
     *
     * @param strat the strategy to convert from
     */
    LinearLocusShardStrategy(LocusShardStrategy strat) {
        super(strat);
        nextShardSize = strat.nextShardSize();
    }

    /**
     * Build an interval-restricted strategy.
     *
     * @param dic          the reference dictionary
     * @param startSize    the starting size of the shard
     * @param lst          locations to iterate from
     * @param limitByCount cap on the traversal extent (-1 disables the limit)
     */
    LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, GenomeLocSortedSet lst, long limitByCount) {
        super(dic, lst);
        limitingFactor = limitByCount;
        nextShardSize = startSize;
    }

    /**
     * Adjust the size of the next shard to be generated.
     *
     * @param size the new shard size
     */
    public void adjustNextShardSize(long size) {
        nextShardSize = size;
    }

    /**
     * Report the size the next shard should have; this is how this strategy
     * implements the parent's sizing hook.
     *
     * @return the next shard size
     */
    protected long nextShardSize() {
        return nextShardSize;
    }
}

View File

@ -1,275 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.Iterator;
import java.util.Collections;
/**
*
* User: aaron
* Date: Apr 6, 2009
* Time: 11:23:17 AM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * @author aaron
 * @version 1.0
 *
 * Abstract base class for locus-based shard strategies. Walks the reference
 * either contig-by-contig (whole-genome mode) or interval-by-interval,
 * carving out shards whose size is supplied by the concrete subclass via
 * nextShardSize(). State is mutated as shards are handed out, so instances
 * are single-pass and not thread-safe.
 */
public abstract class LocusShardStrategy implements ShardStrategy {
    // this stores the seq dictionary, which is a reference for the
    // lengths and names of contigs, which you need to generate an iterative strategy
    protected final SAMSequenceDictionary dic;

    // the current genome location; its stop position marks where the next shard begins
    protected GenomeLoc mLoc = null;

    // current seq location: index of the contig being sharded in the dictionary
    protected int seqLoc = 0;

    // the actual last size; this can change based on contig endings
    protected long lastGenomeLocSize = 0;

    // do we have another contig left to traverse (whole-genome mode only)?
    private boolean nextContig = false;

    /** our internal list of intervals to shard over; null means whole-genome traversal */
    private GenomeLocSortedSet intervals = null;

    /** our log, which we want to capture anything from this class */
    private static Logger logger = Logger.getLogger(LocusShardStrategy.class);

    /** the remaining traversal budget before we stop; -1 disables the limit */
    protected long limitingFactor = -1;

    // set once the limiting factor is exhausted; forces hasNext() to return false
    private boolean stopDueToLimitingFactor = false;

    /**
     * the constructor, taking a seq dictionary to parse out contigs
     * (whole-genome mode: no interval restriction).
     *
     * @param dic the seq dictionary
     */
    LocusShardStrategy(SAMSequenceDictionary dic) {
        this.dic = dic;
        limitingFactor = -1;
        // start at a sentinel position before the first base of the first contig
        mLoc = GenomeLocParser.createGenomeLoc(0, 0, 0);
        if (dic.getSequences().size() > 0) {
            nextContig = true;
        }
    }

    /**
     * the copy constructor: adopts the traversal state of an existing strategy.
     * NOTE(review): the intervals set is NOT copied here, so the copy always
     * runs in whole-genome mode — confirm this is intentional for callers.
     *
     * @param old the old strategy
     */
    LocusShardStrategy(LocusShardStrategy old) {
        this.dic = old.dic;
        this.mLoc = old.mLoc;
        this.seqLoc = old.seqLoc;
        this.lastGenomeLocSize = old.lastGenomeLocSize;
        this.nextContig = old.nextContig;
        this.limitingFactor = old.limitingFactor;
    }

    /**
     * the constructor, taking a seq dictionary to parse out contigs and a set
     * of intervals to restrict the traversal to (interval mode).
     *
     * @param dic       the seq dictionary
     * @param intervals the intervals to shard over; must contain at least one interval
     * @throws IllegalArgumentException if the interval set is empty
     */
    LocusShardStrategy(SAMSequenceDictionary dic, GenomeLocSortedSet intervals) {
        this.dic = dic;
        // defensive copy: the set is consumed as shards are produced
        this.intervals = intervals.clone();
        // set the starting point to the beginning interval
        if (intervals.size() < 1) {
            throw new IllegalArgumentException("Interval files must contain at least one interval");
        }
        GenomeLoc loc = intervals.iterator().next();
        // position just before the first interval so the first shard starts at its first base
        mLoc = GenomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart() - 1, loc.getStart() - 1);
        if (dic.getSequences().size() > 0) {
            nextContig = true;
        }
    }

    /**
     *
     * Abstract methods that each strategy has to implement
     *
     */

    /**
     * This is how the various shard strategies implement their approach, adjusting this value
     *
     * @return the next shard size
     */
    protected abstract long nextShardSize();

    /**
     *
     * Concrete methods that each strategy does not have to implement
     *
     */

    /**
     * get the next shard, based on the return size of nextShardSize.
     * Applies the limiting factor (if any) before delegating to either the
     * interval-based or whole-genome producer.
     *
     * @return the next shard
     */
    public Shard next() {
        // lets get some background info on the problem
        long length = dic.getSequence(seqLoc).getSequenceLength();
        long proposedSize = nextShardSize();
        long nextStart = mLoc.getStop() + 1;
        if (this.limitingFactor > 0) {
            if (proposedSize < limitingFactor) {
                // spend part of the budget on this shard
                limitingFactor = limitingFactor - proposedSize;
            } else {
                // budget exhausted: truncate this shard and stop afterwards
                proposedSize = limitingFactor;
                this.stopDueToLimitingFactor = true;
            }
        }
        // if we don't have an interval set, use the non interval based approach. Simple, eh?
        if (this.intervals == null) {
            return nonIntervaledNext(length, proposedSize, nextStart);
        } else {
            return intervaledNext();
        }
    }

    /**
     * Interval based next processing: pops the first remaining interval and
     * wraps it in an IntervalShard. Note the proposed size is ignored here —
     * each interval becomes exactly one shard.
     *
     * @return the shard that represents this data
     */
    private Shard intervaledNext() {
        if ((this.intervals == null) || (intervals.isEmpty())) {
            throw new StingException("LocusShardStrategy: genomic regions list is empty in next() function.");
        }
        // get the first region in the list and consume it
        GenomeLoc loc = intervals.iterator().next();
        intervals.remove(loc);
        return new IntervalShard(loc, Shard.ShardType.LOCUS_INTERVAL);
    }

    /**
     * Get the next shard, if we don't have intervals to traverse over.
     * NOTE(review): when nextStart + proposedSize - 1 == length the overflow
     * branch is taken with overflow == 0, producing a full-size shard and then
     * jumping to the next contig — confirm the '<' (vs '<=') boundary is intended.
     *
     * @param length the length of the contig
     * @param proposedSize the proposed size
     * @param nextStart the next start location
     *
     * @return the shard to return to the user
     */
    private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
        // can we fit it into the current seq size?
        if (nextStart + proposedSize - 1 < length) {
            lastGenomeLocSize = proposedSize;
            mLoc = GenomeLocParser.createGenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + proposedSize - 1);
            return new LocusShard(Collections.singletonList(mLoc));
        }
        // else we can't make it in the current location, we have to stitch one together
        else {
            // lets find out the remaining size of the current contig
            long overflow = nextStart + proposedSize - 1 - length;
            logger.debug("Overflow = " + overflow + " length: " + length);
            // set our last size counter to the remaining size
            lastGenomeLocSize = proposedSize - overflow;
            // move to the next contig
            // the next sequence should start at the beginning of the next contig
            Shard ret = new LocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize - 1)));
            // now jump ahead to the next contig
            jumpContig();
            // return the shard
            return ret;
        }
    }

    /** jump to the next contig, clearing nextContig when the dictionary is exhausted */
    private void jumpContig() {
        ++seqLoc;
        if (!(seqLoc < dic.getSequences().size())) {
            nextContig = false;
            return;
        }
        logger.debug("Next contig, index = " + dic.getSequence(seqLoc).getSequenceIndex());
        mLoc = GenomeLocParser.createGenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), 0, 0);
    }

    /**
     * is there another GenomeLoc to get?
     *
     * @return true if another shard can be produced; false once the limiting
     *         factor fires, the contigs run out, or the intervals are consumed
     */
    public boolean hasNext() {
        if (this.stopDueToLimitingFactor) {
            return false;
        }
        // if we don't have an interval file, use the non interval based approach.
        if (this.intervals == null) {
            return nextContig;
        } else {
            return (this.intervals.size() > 0);
        }
    }

    /** we don't support remove */
    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a shard iterator!");
    }

    /**
     * to be for-each(able), we must implement this method
     *
     * @return this strategy, which acts as its own iterator
     */
    public Iterator<Shard> iterator() {
        return this;
    }

    /**
     * this allows a shard strategy to peek at the current interval. It's kind of a hack, but for the
     * locusWindowShardStrategy it was the best approach.
     *
     * @return the next un-consumed interval, or null in whole-genome mode or when exhausted
     */
    protected GenomeLoc getCurrentInterval() {
        if (this.intervals == null || intervals.size() < 1) {
            return null;
        }
        return intervals.iterator().next();
    }
}

View File

@ -31,7 +31,7 @@ import java.util.List;
*/
public interface Shard extends Serializable {
enum ShardType {
READ, LOCUS, READ_INTERVAL, LOCUS_INTERVAL
READ, LOCUS
}
/** @return the genome location represented by this shard */

View File

@ -38,9 +38,6 @@ import java.io.File;
*/
public class ShardStrategyFactory {
public enum SHATTER_STRATEGY {
LINEAR,
READS,
INTERVAL,
MONOLITHIC, // Put all of the available data into one shard.
LOCUS_EXPERIMENTAL,
READS_EXPERIMENTAL
@ -72,12 +69,6 @@ public class ShardStrategyFactory {
*/
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, long limitByCount) {
switch (strat) {
case LINEAR:
return new LinearLocusShardStrategy(dic, startingSize, limitByCount);
case READS:
return new ReadDelimitedReadShardStrategy(startingSize, limitByCount);
case INTERVAL:
throw new StingException("Requested trategy: " + strat + " doesn't work with the limiting count (-M) command line option");
case LOCUS_EXPERIMENTAL:
return new IndexDelimitedLocusShardStrategy(readsDataSource,referenceDataSource,null);
case READS_EXPERIMENTAL:
@ -116,12 +107,6 @@ public class ShardStrategyFactory {
*/
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst, long limitDataCount) {
switch (strat) {
case LINEAR:
return new LinearLocusShardStrategy(dic, startingSize, lst, limitDataCount);
case INTERVAL:
return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.LOCUS_INTERVAL);
case READS:
return new IntervalShardStrategy(startingSize, lst, Shard.ShardType.READ_INTERVAL);
case LOCUS_EXPERIMENTAL:
return new IndexDelimitedLocusShardStrategy(readsDataSource,referenceDataSource,lst);
case READS_EXPERIMENTAL:

View File

@ -1,442 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard;
import org.broadinstitute.sting.gatk.datasources.shards.ReadDelimitedReadShard;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* Converts shards to SAM iterators over the specified region
*/
public class IndexDrivenSAMDataSource extends SAMDataSource {
// used for the reads case, the last count of reads retrieved
long readsTaken = 0;
// our last genome loc position
protected GenomeLoc lastReadPos = null;
// do we take unmapped reads
private boolean includeUnmappedReads = true;
// reads based traversal variables
private boolean intoUnmappedReads = false;
private int readsSeenAtLastPos = 0;
/**
* A histogram of exactly what reads were removed from the input stream and why.
*/
private SAMReadViolationHistogram violations = new SAMReadViolationHistogram();
// A pool of SAM iterators.
private SAMResourcePool resourcePool = null;
private GenomeLoc mLastInterval = null;
/**
* Returns a histogram of reads that were screened out, grouped by the nature of the error.
* @return Histogram of reads. Will not be null.
*/
public SAMReadViolationHistogram getViolationHistogram() {
return violations;
}
/**
* constructor, given sam files
*
* @param reads the list of sam files
*/
public IndexDrivenSAMDataSource( Reads reads ) throws SimpleDataSourceLoadException {
super(reads);
resourcePool = new SAMResourcePool(reads);
}
/**
* Do all BAM files backing this data source have an index? The case where hasIndex() is false
* is supported, but only in a few extreme cases.
* @return True if an index is present; false otherwise.
*/
public boolean hasIndex() {
return resourcePool.hasIndex;
}
/**
* Retrieves the sort order of the readers.
* @return Sort order. For this datasource, MUST be coordinate.
*/
public SAMFileHeader.SortOrder getSortOrder() {
return SAMFileHeader.SortOrder.coordinate;
}
/**
* Gets the (potentially merged) SAM file header.
*
* @return SAM file header.
*/
public SAMFileHeader getHeader() {
return resourcePool.getHeader();
}
public SAMFileHeader getHeader(SAMReaderID id) {
return resourcePool.fileToReaderMap.get(id.samFile).getFileHeader();
}
/**
* Retrieves the id of the reader which built the given read.
* @param read The read to test.
* @return ID of the reader.
*/
public SAMReaderID getReaderID(SAMRecord read) {
if(resourcePool.readerToIDMap.containsKey(read.getFileSource().getReader()))
return resourcePool.readerToIDMap.get(read.getFileSource().getReader());
throw new StingException("Unable to find reader id for record.");
}
/**
* Returns Reads data structure containing information about the reads data sources placed in this pool as well as
* information about how they are downsampled, sorted, and filtered
* @return
*/
public Reads getReadsInfo() { return reads; }
/** Returns true if there are read group duplicates within the merged headers. */
public boolean hasReadGroupCollisions() {
return resourcePool.getHeaderMerger().hasReadGroupCollisions();
}
/** Returns the read group id that should be used for the input read and RG id. */
public String getReadGroupId(final SAMReaderID id, final String originalReadGroupId) {
SAMFileReader reader = resourcePool.getFileToReaderMapping().get(id.samFile);
return resourcePool.getHeaderMerger().getReadGroupId(reader,originalReadGroupId);
}
/**
*
* @param shard the shard to get data for
*
* @return an iterator for that region
*/
public StingSAMIterator seek( Shard shard ) throws SimpleDataSourceLoadException {
// setup the iterator pool if it's not setup
boolean queryOverlapping = ( shard.getShardType() == Shard.ShardType.READ ) ? false : true;
resourcePool.setQueryOverlapping(queryOverlapping);
StingSAMIterator iterator = null;
if (shard.getShardType() == Shard.ShardType.READ) {
iterator = seekRead(shard);
iterator = applyDecoratingIterators(true,
iterator,
reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters());
} else if (shard.getShardType() == Shard.ShardType.LOCUS) {
iterator = seekLocus(shard);
iterator = applyDecoratingIterators(false,
iterator,
reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters());
} else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) ||
(shard.getShardType() == Shard.ShardType.READ_INTERVAL)) {
iterator = seekLocus(shard);
iterator = applyDecoratingIterators(false,
iterator,
reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters());
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
if(shard.getGenomeLocs().size() > 1)
throw new StingException("This SAMDataSource does not support multiple intervals within a single shard");
GenomeLoc shardGenomeLoc = shard.getGenomeLocs().get(0);
if(shard.getShardType() == Shard.ShardType.READ_INTERVAL) {
if (mLastInterval != null)
iterator = new PlusOneFixIterator(shardGenomeLoc,new IntervalOverlapIterator(iterator,mLastInterval,false));
else
iterator = new PlusOneFixIterator(shardGenomeLoc,iterator);
mLastInterval = shardGenomeLoc;
}
} else {
throw new StingException("seek: Unknown shard type");
}
return iterator;
}
/**
 * Opens a read iterator over the single genomic interval carried by a
 * locus-style shard.
 *
 * @param shard the shard containing the genome location to extract data for;
 *              a MonolithicShard requests the entire stream
 * @return an iterator over the reads in that region
 * @throws StingException if the data source has no alignment information or
 *                        the shard carries more than one interval
 */
private StingSAMIterator seekLocus( Shard shard ) throws SimpleDataSourceLoadException {
    // A monolithic shard means "everything": stream the whole input.
    if (shard instanceof MonolithicShard)
        return createIterator(new EntireStream());

    // With an empty sequence dictionary there is nothing to seek into.
    if (getHeader().getSequenceDictionary().getSequences().size() == 0)
        throw new StingException("Unable to seek to the given locus; reads data source has no alignment information.");

    if (shard.getGenomeLocs().size() > 1)
        throw new StingException("This SAMDataSource does not support multiple intervals within a single shard");

    GenomeLoc target = shard.getGenomeLocs().get(0);
    return createIterator(new MappedStreamSegment(target));
}
/**
 * Seeks to the next window of reads for a by-read shard.
 *
 * Handles three regimes: monolithic shards (whole stream), mapped reads
 * (position tracked via the lastReadPos field), and the trailing
 * unmapped-read section (entered once mapped reads are exhausted).
 *
 * @param shard the read shard to extract from
 * @return an iterator over the shard's reads; may signal the shard done when
 *         the mapped (and, if requested, unmapped) reads are exhausted
 */
private StingSAMIterator seekRead( Shard shard ) throws SimpleDataSourceLoadException {
    // A monolithic shard covers the whole input; no positional bookkeeping needed.
    if(shard instanceof MonolithicShard)
        return createIterator(new EntireStream());
    ReadDelimitedReadShard readShard = (ReadDelimitedReadShard)shard;
    StingSAMIterator iter = null;
    // If there are no entries in the sequence dictionary, there can't possibly be any mapped reads. Force state to 'unmapped'.
    if( isSequenceDictionaryEmpty() )
        intoUnmappedReads = true;
    if (!intoUnmappedReads) {
        if (lastReadPos == null) {
            // First query ever: start at the beginning of the first contig in the dictionary.
            lastReadPos = GenomeLocParser.createGenomeLoc(getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, Integer.MAX_VALUE);
            iter = createIterator(new MappedStreamSegment(lastReadPos));
            return InitialReadIterator(readShard.getSize(), iter);
        } else {
            // Subsequent query: reopen at the recorded position and skip reads already handed out.
            // Presumably a stop of -1 marks an open-ended interval -- confirm GenomeLocParser.setStop semantics.
            lastReadPos = GenomeLocParser.setStop(lastReadPos,-1);
            iter = fastMappedReadSeek(readShard.getSize(), StingSAMIteratorAdapter.adapt(reads, createIterator(new MappedStreamSegment(lastReadPos))));
        }
        // Mapped reads ran out during the seek; if unmapped reads aren't wanted the traversal is finished.
        if (intoUnmappedReads && !includeUnmappedReads)
            readShard.signalDone();
    }
    if (intoUnmappedReads && includeUnmappedReads) {
        // Switch over to the unmapped-read section, discarding any mapped-read iterator.
        if (iter != null)
            iter.close();
        iter = toUnmappedReads(readShard.getSize());
        // An empty batch of unmapped reads means the traversal is complete.
        if (!iter.hasNext())
            readShard.signalDone();
    }
    return iter;
}
/**
 * Controls whether by-read traversals also return unmapped reads.
 * Traversing only mapped reads is much faster, but most BAM files carry a
 * significant number of unmapped reads.
 *
 * @param seeUnMappedReads true to include unmapped reads, false to skip them
 */
public void viewUnmappedReads( boolean seeUnMappedReads ) {
    this.includeUnmappedReads = seeUnMappedReads;
}
/**
 * Unit-test hook: swaps in a custom (possibly mock) resource pool that all
 * subsequent queries will draw iterators from.
 *
 * @param resourcePool replacement pool, typically a mock for testing
 */
void setResourcePool( SAMResourcePool resourcePool ) {
    this.resourcePool = resourcePool;
}
/**
 * Retrieves the next batch of unmapped reads, advancing the running count of
 * reads consumed from the unmapped section.
 *
 * @param readCount how many unmapped reads to retrieve
 * @return an iterator over the requested slice of unmapped reads
 */
StingSAMIterator toUnmappedReads( long readCount ) {
    // The segment starts at the current offset into the unmapped-read section.
    UnmappedStreamSegment segment = new UnmappedStreamSegment(readsTaken, readCount);
    readsTaken += readCount;
    return createIterator(segment);
}
/**
 * A seek function for mapped reads: skips past the reads previously handed
 * out (tracked in readsTaken / readsSeenAtLastPos / lastReadPos) and returns
 * a bounded iterator over the next batch.
 *
 * Mutates readsTaken, readsSeenAtLastPos, lastReadPos, and intoUnmappedReads
 * as it advances through the file.
 *
 * @param readCount how many reads to retrieve
 * @param iter the iterator to use, already seeked to the correct start location
 * @return a bounded iterator over the next readCount reads. Will be a
 *         zero-length iterator if no mapped reads are available.
 * @throws SimpleDataSourceLoadException on a seek inconsistency
 */
StingSAMIterator fastMappedReadSeek( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
    BoundedReadIterator bound;
    // Skip reads at the seek position that were already returned by a prior call.
    correctForReadPileupSeek(iter);
    if (readsTaken == 0) {
        return InitialReadIterator(readCount, iter);
    }
    int x = 0;
    SAMRecord rec = null;
    // Assuming that lastReadPos should never be null, because this is a mappedReadSeek
    // and initial queries are handled by the previous conditional.
    int lastContig = lastReadPos.getContigIndex();
    int lastPos = (int)lastReadPos.getStart();
    // Consume the readsTaken reads already handed out, tracking how many share the final position.
    while (x < readsTaken) {
        if (iter.hasNext()) {
            rec = iter.next();
            // Count consecutive reads at the same start so the next seek can skip exactly that many.
            if (lastContig == rec.getReferenceIndex() && lastPos == rec.getAlignmentStart()) ++this.readsSeenAtLastPos;
            else this.readsSeenAtLastPos = 1;
            lastPos = rec.getAlignmentStart();
            ++x;
        } else {
            iter.close();
            // jump contigs
            lastReadPos = GenomeLocParser.toNextContig(lastReadPos);
            if (lastReadPos == null) {
                // No more contigs: mapped reads are exhausted; flag the transition to unmapped reads.
                readsTaken = 0;
                intoUnmappedReads = true;
                // fastMappedReadSeek must return an iterator, even if that iterator iterates through nothing.
                return new NullSAMIterator(reads);
            } else {
                // Restart the batch at the head of the next contig (stop of -1: open-ended -- confirm setStop semantics).
                readsTaken = readCount;
                readsSeenAtLastPos = 0;
                lastReadPos = GenomeLocParser.setStop(lastReadPos,-1);
                CloseableIterator<SAMRecord> ret = createIterator(new MappedStreamSegment(lastReadPos));
                return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, ret), readCount);
            }
        }
    }
    // if we're off the end of the last contig (into unmapped territory);
    // an alignment start of 0 indicates an unavailable position per the SAM spec.
    if (rec != null && rec.getAlignmentStart() == 0) {
        readsTaken += readCount;
        intoUnmappedReads = true;
    }
    // else we're not off the end, store our location
    else if (rec != null) {
        int stopPos = rec.getAlignmentStart();
        // A start lower than the recorded one means we crossed onto the next contig.
        if (stopPos < lastReadPos.getStart()) {
            lastReadPos = GenomeLocParser.createGenomeLoc(lastReadPos.getContigIndex() + 1, stopPos, stopPos);
        } else {
            lastReadPos = GenomeLocParser.setStart(lastReadPos,rec.getAlignmentStart());
        }
    }
    // in case we're run out of reads, get out
    else {
        throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
        //return null;
    }
    bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
    // return the iterator
    return bound;
}
/**
 * Reports whether the merged BAM header carries no reference sequences at
 * all, i.e. the file set is completely unsequenced. Requires that the
 * resource pool has already been initialized.
 *
 * @return true if the sequence dictionary has no entries; false otherwise
 */
private boolean isSequenceDictionaryEmpty() {
    return getHeader().getSequenceDictionary().isEmpty();
}
/**
 * Even though the iterator has been seeked to the correct locus, several
 * reads may share that start position and some of them may have been handed
 * out already. Advance past the reads previously returned, as counted by
 * readsSeenAtLastPos.
 *
 * @param iter the iterator to advance
 * @throws SimpleDataSourceLoadException if reads were expected at the last
 *         position but the iterator produced none
 */
private void correctForReadPileupSeek( StingSAMIterator iter ) {
    // Some contigs (e.g. the chrN_random contigs) may not contain a single read.
    boolean sawARead = false;
    int skipped = 0;
    while (skipped < this.readsSeenAtLastPos && iter.hasNext()) {
        sawARead = true;
        iter.next();
        skipped++;
    }
    if (readsSeenAtLastPos > 0 && !sawARead) {
        throw new SimpleDataSourceLoadException("Seek problem: reads at last position count != 0");
    }
}
/**
 * Builds the iterator for the very first read query against the file,
 * bounding it to the requested number of reads and recording that count as
 * the number of reads consumed so far.
 *
 * NOTE(review): the UpperCamelCase method name is unconventional for a Java
 * method; left unchanged since callers depend on it.
 *
 * @param readCount the number of reads this shard should receive
 * @param iter the merging iterator, positioned at the start of the file
 * @return a bounded read iterator at the first read position in the file
 */
private BoundedReadIterator InitialReadIterator( long readCount, CloseableIterator<SAMRecord> iter ) {
    this.readsTaken = readCount;
    return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
}
/**
 * Checks an iterator out of the resource pool for the selected segment and
 * layers the standard decorators on top: malformed-record filtering followed
 * by read reformatting.
 *
 * @param segment segment over which to gather reads
 * @return a decorated iterator over just the reads in the given segment
 */
private StingSAMIterator createIterator( DataStreamSegment segment ) {
    StingSAMIterator pooled = resourcePool.iterator(segment);
    // Filter out malformed records before normalizing read formatting.
    StingSAMIterator filtered = new MalformedSAMFilteringIterator(getHeader(), pooled, violations);
    return new ReadFormattingIterator(filtered);
}
}

View File

@ -1,305 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileReader;
/**
 * Models the current state within one category of the merged read stream
 * (mapped, unmapped, or the entire stream). Subclasses decide how reads are
 * fetched for a given segment.
 * @author hanna
 * @version 0.1
 */
abstract class ReadStreamPointer {
    /** Source information describing the reads data. */
    protected final Reads sourceInfo;

    /** Open handles to the constituent readers, plus their merged header. */
    protected final SamFileHeaderMerger headerMerger;

    public ReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) {
        this.sourceInfo = sourceInfo;
        this.headerMerger = headerMerger;
    }

    /**
     * Can this pointer serve the provided segment without reinitializing?
     * @param segment Segment to test.
     * @return true when access would be quick; false when it would require
     *         some sort of reinitialization.
     */
    public abstract boolean canAccessSegmentEfficiently(DataStreamSegment segment);

    /**
     * Releases this resource, closing every underlying file handle.
     */
    public void close() {
        for (final SAMFileReader reader : headerMerger.getReaders())
            reader.close();
    }

    /**
     * @return the Reads descriptor: the reads data sources plus how they are
     *         downsampled, sorted, and filtered.
     */
    public Reads getReadsInfo() {
        return sourceInfo;
    }

    /**
     * @return the header merger, which maps original read groups to merged
     *         read groups and exposes the individual file readers (and hence
     *         headers) maintained by the system.
     */
    public SamFileHeaderMerger getHeaderMerger() {
        return headerMerger;
    }

    /**
     * Remove an iterator from service.
     * @param iterator the iterator to retire; must not be null.
     */
    public abstract void destroy( StingSAMIterator iterator );

    /**
     * Get a stream of all the reads that overlap a given segment.
     * @param segment Segment to check for overlaps.
     * @return an iterator over all reads overlapping the given segment.
     */
    public abstract StingSAMIterator getReadsOverlapping( DataStreamSegment segment );

    /**
     * Get a stream of all the reads completely contained by a given segment.
     * The segment can be mapped or unmapped.
     * @param segment Segment to check for containment.
     * @return an iterator over all reads contained by the given segment.
     */
    public abstract StingSAMIterator getReadsContainedBy( DataStreamSegment segment );
}
/**
 * Read stream pointer over indexed, coordinate-sorted data: any mapped
 * segment can be reached by random access.
 */
class MappedReadStreamPointer extends ReadStreamPointer {
    public MappedReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) {
        super( sourceInfo, headerMerger );
    }
    /**
     * MappedReadStreamPointers can access any segment efficiently. Always return true.
     * @param segment Segment to test.
     * @return True.
     */
    public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
        return true;
    }
    /**
     * {@inheritDoc}
     * Iterators over mapped data are single-use; close them on destroy.
     */
    public void destroy( StingSAMIterator iterator ) {
        iterator.close();
    }
    /**
     * {@inheritDoc}
     */
    @Override
    public StingSAMIterator getReadsOverlapping( DataStreamSegment segment ) {
        if(!(segment instanceof MappedStreamSegment))
            throw new UnsupportedOperationException("MappedReadStreamPointer cannot get reads overlapping an unmapped stream segment");
        MappedStreamSegment mappedSegment = (MappedStreamSegment)segment;
        MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo );
        // The getStop() + 1 is a hack to work around an old bug in the way Picard created SAM files where queries
        // over a given interval would occasionally not pick up the last read in that interval.
        mergingIterator.queryOverlapping( mappedSegment.locus.getContig(),
            (int)mappedSegment.locus.getStart(),
            (int)mappedSegment.locus.getStop()+1);
        return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
    }
    /**
     * {@inheritDoc}
     */
    @Override
    public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) {
        if( !(segment instanceof MappedStreamSegment) )
            throw new StingException("Trying to access unmapped content from a mapped read stream pointer");
        MappedStreamSegment mappedSegment = (MappedStreamSegment)segment;
        MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo );
        // NOTE(review): the original comment here claimed the queryOverlapping "+1" hack was deliberately
        // NOT used for contained queries (since only one-base-long reads would be missed), yet the code
        // below does add +1 to the stop position. Comment and code disagree -- confirm whether
        // queryContained should use getStop() unmodified.
        mergingIterator.queryContained( mappedSegment.locus.getContig(),
            (int)mappedSegment.locus.getStart(),
            (int)mappedSegment.locus.getStop()+1);
        return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
    }
    /**
     * Convert a mapped read stream pointer to an unmapped read stream pointer, transferring ownership
     * of the underlying file handles to the new container.
     * After doing this conversion, the source MappedReadStreamPointer should not be used.
     * @return an unmapped pointer sharing this pointer's source info and file handles.
     */
    public UnmappedReadStreamPointer toUnmappedReadStreamPointer() {
        return new UnmappedReadStreamPointer( sourceInfo, headerMerger );
    }
}
/**
 * Read stream pointer positioned within the unmapped-read section of the
 * merged stream. Behaves like a forward-only stream: later positions are
 * cheap to reach, earlier ones are not.
 */
class UnmappedReadStreamPointer extends ReadStreamPointer {
    // Tracks how far into the unmapped read stream this pointer has advanced.
    private PositionTrackingIterator unmappedIterator = null;

    public UnmappedReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) {
        super( sourceInfo, headerMerger );
        // Open a merged iterator restricted to unmapped reads, starting at offset 0.
        MergingSamRecordIterator2 unmappedSource = new MergingSamRecordIterator2( headerMerger, sourceInfo );
        unmappedSource.queryUnmappedReads();
        unmappedIterator = new PositionTrackingIterator( sourceInfo, unmappedSource, 0L );
    }

    /**
     * Forward positions in the unmapped stream are efficient to reach;
     * anything else (mapped segments, already-passed positions) is not.
     * @param segment Segment to test.
     * @return true when the segment lies at or beyond the current position.
     */
    public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
        return segment instanceof UnmappedStreamSegment
                && unmappedIterator.getPosition() <= ((UnmappedStreamSegment)segment).position;
    }

    /**
     * Unsupported: unmapped reads carry no coordinates to overlap.
     */
    @Override
    public StingSAMIterator getReadsOverlapping( DataStreamSegment segment ) {
        throw new UnsupportedOperationException("Unable to determine overlapped reads of an unmapped segment");
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) {
        if (!(segment instanceof UnmappedStreamSegment))
            throw new StingException("Trying to access mapped content from an unmapped read stream pointer");
        UnmappedStreamSegment target = (UnmappedStreamSegment)segment;
        // Roll the stream forward until it reaches the start of the requested segment.
        while (unmappedIterator.getPosition() < target.position)
            unmappedIterator.next();
        return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(sourceInfo, unmappedIterator), target.size);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void close() {
        if (unmappedIterator != null)
            unmappedIterator.close();
        super.close();
    }

    /**
     * Deliberately a no-op: the single streaming iterator is reused across requests.
     */
    public void destroy( StingSAMIterator iterator ) {
        // Intentionally blank; do not close the shared iterator.
    }
}
/**
 * A read stream pointer that can only serve the complete contents of the
 * underlying files; used when no random access is possible.
 */
class EntireReadStreamPointer extends ReadStreamPointer {
    /**
     * Create a new pointer over the complete read stream.
     * @param sourceInfo Source info for the reads.
     * @param headerMerger Header merging apparatus.
     */
    public EntireReadStreamPointer( Reads sourceInfo, SamFileHeaderMerger headerMerger ) {
        super( sourceInfo, headerMerger );
    }

    /**
     * Only whole-stream requests can be served efficiently here.
     * @param segment Segment to test.
     * @return true only when the entire stream is requested.
     */
    public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
        return segment instanceof EntireStream;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public StingSAMIterator getReadsOverlapping( DataStreamSegment segment ) {
        requireEntireStream(segment, "EntireReadStreamPointer can only get reads overlapping the entire stream.");
        return StingSAMIteratorAdapter.adapt(sourceInfo, new MergingSamRecordIterator2(headerMerger, sourceInfo));
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) {
        requireEntireStream(segment, "EntireReadStreamPointer can only get reads contained by the entire stream.");
        return StingSAMIteratorAdapter.adapt(sourceInfo, new MergingSamRecordIterator2(headerMerger, sourceInfo));
    }

    /** Rejects any request narrower than the whole stream. */
    private void requireEntireStream( DataStreamSegment segment, String message ) {
        if (!(segment instanceof EntireStream))
            throw new StingException(message);
    }

    /**
     * {@inheritDoc}
     */
    public void destroy( StingSAMIterator iterator ) {
        iterator.close();
    }
}

View File

@ -1,196 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.picard.sam.SamFileHeaderMerger;
import java.util.*;
import java.io.File;
/**
 * Represents a single stream of read data. Used to represent the state of the stream and determine
 * whether the state of this resource is such that it can field the desired query.
 * @author hanna
 * @version 0.1
 */
class ReadStreamResource {
    /** Eagerly decode BAM records as they are read. */
    final static boolean eagerDecode = true;

    /**
     * Do all the constituent components of this ReadStreamResource have indices?
     * In general, BAM files without indices are not supported, but in a few specific
     * cases we do allow this for the Picard pipeline.
     * @return true if all BAM files have indices; false otherwise.
     */
    protected boolean hasIndex() {
        for(SAMFileReader reader: readStreamPointer.getHeaderMerger().getReaders()) {
            if(!reader.hasIndex())
                return false;
        }
        return true;
    }

    /** Index availability computed once in the constructor; see also hasIndex(). */
    protected final boolean hasIndex;

    /** our log, which we want to capture anything from this class */
    // BUG FIX: was registered under ReadStreamPointer.class, attributing this class's
    // log output to the wrong logger category.
    protected static Logger logger = Logger.getLogger(ReadStreamResource.class);

    /**
     * The (possibly merged) header for the input fileset.
     */
    private final SAMFileHeader header;

    /**
     * A pointer to the current location of the file.
     */
    private ReadStreamPointer readStreamPointer = null;

    /**
     * A mapping from original input file to the reader opened over it.
     */
    private Map<File, SAMFileReader> fileToReaderMap = null;

    /**
     * A mapping from reader back to the ID uniquely identifying this input file.
     */
    private Map<SAMFileReader, SAMReaderID> readerToIDMap = null;

    public ReadStreamResource( Reads sourceInfo ) {
        SamFileHeaderMerger headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate);
        this.header = headerMerger.getMergedHeader();
        // A single unindexed input forces sequential (whole-stream) access for the entire resource.
        boolean indexPresent = true;
        for(SAMFileReader reader: headerMerger.getReaders()) {
            if(!reader.hasIndex())
                indexPresent = false;
        }
        hasIndex = indexPresent;
        if(hasIndex)
            readStreamPointer = new MappedReadStreamPointer(sourceInfo, headerMerger);
        else
            readStreamPointer = new EntireReadStreamPointer(sourceInfo, headerMerger);
    }

    /**
     * Gets the header information for the read stream.
     * @return Header information for the read stream.
     */
    public SAMFileHeader getHeader() {
        return header;
    }

    /**
     * Returns Reads data structure containing information about the reads data sources as well as
     * information about how they are downsampled, sorted, and filtered
     * @return the Reads object
     */
    public Reads getReadsInfo() { return readStreamPointer.getReadsInfo(); }

    /**
     * Returns header merger: a class that keeps the mapping between original read groups and read groups
     * of the merged stream; merger also provides access to the individual file readers (and hence headers
     * too) maintained by the system.
     * @return the header merger
     */
    public SamFileHeaderMerger getHeaderMerger() { return readStreamPointer.getHeaderMerger(); }

    /** Delegates to the current stream pointer's efficiency check. */
    public boolean canAccessSegmentEfficiently(DataStreamSegment segment) {
        return readStreamPointer.canAccessSegmentEfficiently(segment);
    }

    /** Closes the current stream pointer and its underlying file handles. */
    public void close() {
        readStreamPointer.close();
    }

    /** Retires an iterator via the current stream pointer. */
    public void destroy( StingSAMIterator iterator ) {
        readStreamPointer.destroy(iterator);
    }

    /**
     * Gets reads wholly contained by the segment. Transitions this resource from
     * mapped to unmapped mode the first time an unmapped segment is requested.
     */
    public StingSAMIterator getReadsContainedBy( DataStreamSegment segment ) {
        if( readStreamPointer instanceof MappedReadStreamPointer && segment instanceof UnmappedStreamSegment )
            readStreamPointer = ((MappedReadStreamPointer)readStreamPointer).toUnmappedReadStreamPointer();
        return readStreamPointer.getReadsContainedBy(segment);
    }

    /** Gets reads overlapping the segment via the current stream pointer. */
    public StingSAMIterator getReadsOverlapping( DataStreamSegment segment ) {
        return readStreamPointer.getReadsOverlapping(segment);
    }

    /** @return the mapping from input file to its opened reader; populated by createHeaderMerger. */
    public Map<File, SAMFileReader> getFileToReaderMapping() {
        return fileToReaderMap;
    }

    /** @return the reverse mapping from reader to its input file's unique ID. */
    public Map<SAMFileReader,SAMReaderID> getReaderToIDMapping() {
        return readerToIDMap;
    }

    /**
     * A private function that, given the internal file list, generates a merging construct for
     * all available files. Also populates fileToReaderMap and readerToIDMap as a side effect.
     * @param reads source information about the reads.
     * @param SORT_ORDER sort order for the reads.
     * @return a header merger over SAMFileReaders for the stored file names
     * @throws SimpleDataSourceLoadException if the file cannot be opened.
     */
    private SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileHeader.SortOrder SORT_ORDER )
            throws SimpleDataSourceLoadException {
        // right now this is pretty damn heavy, it copies the file list into a reader list every time
        List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
        fileToReaderMap = new HashMap<File, SAMFileReader>();
        readerToIDMap = new HashMap<SAMFileReader,SAMReaderID>();
        for (File f : reads.getReadsFiles()) {
            SAMFileReader reader = new SAMFileReader(f, eagerDecode);
            fileToReaderMap.put(f, reader);
            readerToIDMap.put(reader,new SAMReaderID(f));
            reader.setValidationStringency(reads.getValidationStringency());
            final SAMFileHeader header = reader.getFileHeader();
            // BUG FIX: previously logger.debug(String.format("Sort order is: " + ...)) -- passing a
            // concatenated value as a format string would throw on any '%' in the value.
            logger.debug("Sort order is: " + header.getSortOrder());
            if (reader.getFileHeader().getReadGroups().size() < 1) {
                // Synthesize a read group from the file name so downstream merging has one to work with.
                SAMReadGroupRecord rec = new SAMReadGroupRecord(f.getName());
                rec.setLibrary(f.getName());
                rec.setSample(f.getName());
                reader.getFileHeader().addReadGroup(rec);
            }
            lst.add(reader);
        }
        return new SamFileHeaderMerger(lst,SORT_ORDER,true);
    }
}

View File

@ -167,18 +167,6 @@ public abstract class SAMDataSource implements SimpleDataSource {
*/
public abstract StingSAMIterator seek(Shard shard);
/**
* If we're in by-read mode, this indicates if we want
* to see unmapped reads too. Only seeing mapped reads
* is much faster, but most BAM files have significant
* unmapped read counts.
*
* @param seeUnMappedReads true to see unmapped reads, false otherwise
*/
public void viewUnmappedReads( boolean seeUnMappedReads ) {
includeUnmappedReads = seeUnMappedReads;
}
/**
* Filter reads based on user-specified criteria.
*

View File

@ -1,193 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.apache.log4j.Logger;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import java.util.List;
import java.util.Map;
import java.io.File;
/**
 * Maintain a pool of resources of accessors to SAM read data. SAMFileReaders and
 * headers are actually quite expensive to open, so this class manages the mechanics
 * of keeping them open and reusing them.
 * @author hanna
 * @version 0.1
 */
@Deprecated
class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator> {
    /** our log, which we want to capture anything from this class */
    protected static Logger logger = Logger.getLogger(SAMResourcePool.class);
    /** Source information about the reads. */
    protected Reads reads;
    // Maps original read groups to merged read groups; exposes individual readers.
    protected SamFileHeaderMerger headerMerger;
    // Mapping from each input file to the reader opened over it.
    protected Map<File, SAMFileReader> fileToReaderMap;
    // Reverse mapping from each reader to the unique ID of its input file.
    protected Map<SAMFileReader, SAMReaderID> readerToIDMap;
    /**
     * Do all the constituent BAM files have indices? We support some very limited
     * cases where not all BAM files have indices available.
     */
    protected final boolean hasIndex;
    /** Is this a by-reads traversal or a by-locus? True selects overlap queries. */
    protected boolean queryOverlapping;
    /** File header for the combined file. */
    protected final SAMFileHeader header;
    public SAMResourcePool( Reads reads ) {
        this.reads = reads;
        // Default to overlap semantics; callers switch this off for by-read traversals.
        this.queryOverlapping = true;
        // Eagerly open the first resource so the merged header, index state, and
        // file/reader mappings are known up front.
        ReadStreamResource streamResource = createNewResource();
        this.header = streamResource.getHeader();
        this.headerMerger = streamResource.getHeaderMerger();
        this.hasIndex = streamResource.hasIndex();
        this.fileToReaderMap = streamResource.getFileToReaderMapping();
        this.readerToIDMap = streamResource.getReaderToIDMapping();
        // Add this resource to the pool.
        this.addNewResource(streamResource);
    }
    /** Get the combined header for all files in the iterator pool. */
    public SAMFileHeader getHeader() {
        return header;
    }
    /**
     * Returns Reads data structure containing information about the reads data sources placed in this pool as well as
     * information about how they are downsampled, sorted, and filtered
     * @return the reads descriptor
     */
    public Reads getReadsInfo() { return reads; }
    /**
     * Returns a mapping from original input files to the SAMFileReaders
     *
     * @return the mapping
     */
    public Map<File, SAMFileReader> getFileToReaderMapping() {
        return fileToReaderMap;
    }
    /**
     * Returns header merger: a class that keeps the mapping between original read groups and read groups
     * of the merged stream; merger also provides access to the individual file readers (and hence headers
     * too) maintained by the system.
     * @return the header merger
     */
    public SamFileHeaderMerger getHeaderMerger() { return headerMerger; }
    /**
     * Picks the first pooled resource that can serve the segment without reinitialization.
     * @param segment segment the caller wants to read
     * @param resources currently pooled resources
     * @return a suitable resource, or null if none can serve the segment efficiently
     */
    protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List<ReadStreamResource> resources ) {
        for (ReadStreamResource resource : resources) {
            if (resource.canAccessSegmentEfficiently(segment)) {
                return resource;
            }
        }
        return null;
    }
    /** Opens a fresh resource over the configured reads. */
    protected ReadStreamResource createNewResource() {
        return new ReadStreamResource(reads);
    }
    /**
     * Builds an iterator over the segment from the given resource, choosing containment
     * vs. overlap semantics from queryOverlapping, and wraps it so that closing the
     * iterator releases the resource back into the pool.
     */
    protected StingSAMIterator createIteratorFromResource( DataStreamSegment segment, ReadStreamResource streamResource ) {
        StingSAMIterator iterator = null;
        if (!queryOverlapping)
            iterator = streamResource.getReadsContainedBy(segment);
        else
            iterator = streamResource.getReadsOverlapping(segment);
        return new ReleasingIterator( streamResource, iterator );
    }
    /** Closes a pooled resource and its underlying file handles. */
    protected void closeResource( ReadStreamResource resource ) {
        resource.close();
    }
    /**
     * Iterator wrapper that, on close(), retires the wrapped iterator via its source
     * resource and releases that resource back into the pool.
     */
    private class ReleasingIterator implements StingSAMIterator {
        /**
         * The resource acting as the source of the data.
         */
        private final ReadStreamResource resource;
        /**
         * The iterator to wrap.
         */
        private final StingSAMIterator wrappedIterator;
        public Reads getSourceInfo() {
            return wrappedIterator.getSourceInfo();
        }
        public ReleasingIterator( ReadStreamResource resource, StingSAMIterator wrapped ) {
            this.resource = resource;
            this.wrappedIterator = wrapped;
        }
        public ReleasingIterator iterator() {
            return this;
        }
        public void remove() {
            throw new UnsupportedOperationException("Can't remove from a StingSAMIterator");
        }
        public void close() {
            // Retire the wrapped iterator first, then return the resource to the pool.
            resource.destroy(wrappedIterator);
            release(this);
        }
        public boolean hasNext() {
            return wrappedIterator.hasNext();
        }
        public SAMRecord next() {
            return wrappedIterator.next();
        }
    }
    /** @return true when queries use overlap semantics; false for containment. */
    public boolean isQueryOverlapping() {
        return queryOverlapping;
    }
    /** Selects overlap (true) vs. containment (false) query semantics. */
    public void setQueryOverlapping( boolean queryOverlapping ) {
        this.queryOverlapping = queryOverlapping;
    }
}

View File

@ -57,7 +57,7 @@ public class LinearMicroScheduler extends MicroScheduler {
for (Shard shard : shardStrategy) {
// New experimental code for managing locus intervals.
if(shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) {
if(shard.getShardType() == Shard.ShardType.LOCUS) {
LocusWalker lWalker = (LocusWalker)walker;
WindowMaker windowMaker = new WindowMaker(getReadIterator(shard), shard.getGenomeLocs(), walker.getMandatoryReadFilters(), lWalker.getDiscards());
for(WindowMaker.WindowMakerIterator iterator: windowMaker) {

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.picard.sam.MergingSamRecordIterator;
import java.util.Iterator;
@ -87,8 +88,8 @@ public class BoundedReadIterator implements StingSAMIterator {
public SAMFileHeader getHeader() {
// todo: this is bad, we need an iterface out there for samrecords that supports getting the header,
// regardless of the merging
if (iterator instanceof MergingSamRecordIterator2)
return ((MergingSamRecordIterator2)iterator).getHeader();
if (iterator instanceof MergingSamRecordIterator)
return ((MergingSamRecordIterator)iterator).getMergedHeader();
else
return null;
}

View File

@ -1,113 +0,0 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import java.util.Iterator;
/**
*
* @author aaron
*
* Class IntervalOverlapIterator
*
* Removes reads that overlap the passed-in interval (or throws, if configured to do so).
*
*/
public class IntervalOverlapIterator implements StingSAMIterator {
    /** Underlying stream of reads being filtered. */
    private final StingSAMIterator source;

    /** When true, an overlapping read raises an exception instead of being dropped. */
    private final boolean failOnOverlap;

    /** One-read lookahead buffer; null once the stream is exhausted. */
    private SAMRecord pending = null;

    /** Genomic region that emitted reads must NOT overlap. */
    private final GenomeLoc region;

    /**
     * Wraps an existing read iterator, filtering out (or failing on) reads that
     * overlap the supplied region.
     *
     * @param iter the stream of reads to filter
     * @param filterLocation the region reads may not overlap
     * @param blowUpOnDup if true, throw on an overlapping read rather than silently dropping it
     */
    public IntervalOverlapIterator(StingSAMIterator iter, GenomeLoc filterLocation, boolean blowUpOnDup) {
        source = iter;
        failOnOverlap = blowUpOnDup;
        region = filterLocation;
        // Prime the lookahead buffer; the initial (null) return value of next() is discarded.
        if (iter.hasNext()) {
            next();
        }
    }

    /**
     * Gets source information for the reads. Contains information about the original reads
     * files, plus information about downsampling, etc.
     *
     * @return the wrapped iterator's source information
     */
    public Reads getSourceInfo() {
        return source.getSourceInfo();
    }

    /** Closes the underlying iterator, if present. */
    public void close() {
        if (source != null) source.close();
    }

    /** @return true when another non-overlapping read is buffered */
    public boolean hasNext() {
        return (pending != null);
    }

    /**
     * Returns the buffered read, then advances past any overlapping reads to refill the buffer.
     *
     * @return the next read that does not overlap the filter region
     */
    public SAMRecord next() {
        final SAMRecord current = pending;
        while (source.hasNext()) {
            pending = source.next();
            if (!overlapsRegion(pending)) return current;
        }
        pending = null;
        return current;
    }

    /** Removal is unsupported. */
    public void remove() {
        throw new UnsupportedOperationException("You can't call remove, so, like, I guess please don't");
    }

    /** @return this object, allowing use in for-each loops */
    public Iterator<SAMRecord> iterator() {
        return this;
    }

    /**
     * Determines whether a read overlaps the filter region, throwing if configured to do so.
     *
     * @param rec the read to test
     * @return true if it overlaps, false otherwise
     */
    private boolean overlapsRegion(SAMRecord rec) {
        final boolean overlap = region.overlapsP(GenomeLocParser.createGenomeLoc(rec));
        if (overlap && this.failOnOverlap)
            throw new StingException("IntervalOverlapIterator found a overlapping read " + rec.getReadName() + " with overlap " + this.region.toString());
        return overlap;
    }
}

View File

@ -43,337 +43,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
/**
* Provides an iterator interface for merging multiple underlying iterators into a single
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, QueryIterator {
    // Merge queue: one entry per underlying reader, ordered so the head iterator's next
    // record is the smallest according to the merged sort order's comparator.
    protected PriorityQueue<ComparableSamRecordIterator> pq = null;
    protected final SamFileHeaderMerger samHeaderMerger;
    protected final SAMFileHeader.SortOrder sortOrder;
    protected static Logger logger = Logger.getLogger(MergingSamRecordIterator2.class);
    // Single-record lookahead buffer backing peek(); null once all readers are drained.
    private SAMRecord mNextRecord;
    // True once the queue has been populated, either lazily (plain iteration) or by one
    // of the query*() methods; a second initialization is an error.
    protected boolean initialized = false;
    protected final Reads reads;
    protected static boolean warnedUserAboutSortOrder = false; // so we only warn the user once
    /**
     * Constructs a new merging iterator with the same set of readers and sort order as
     * provided by the header merger parameter.
     *
     * @param headerMerger the header to merge
     * @param reads the reads pile
     */
    public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) {
        this.samHeaderMerger = headerMerger;
        this.reads = reads;
        this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
        this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
    }
    /**
     * we hold off initializing because we don't know if they plan to query or just iterate. This function
     * gets called if the hasNext() determines we haven't been query-ed yet, and we want to setup the iterator to start
     * at the beginning of the read pile
     */
    private void lazyInitialization() {
        if (initialized) {
            // NOTE(review): lazy init throws UnsupportedOperationException here while the
            // query*() methods throw IllegalStateException for the same condition.
            throw new UnsupportedOperationException("You cannot double initialize a MergingSamRecordIterator2");
        }
        final SAMRecordComparator comparator = getComparator();
        for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
            checkSortOrder(reader);
            final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(this.samHeaderMerger, reader, comparator);
            addIfNotEmpty(iterator);
        }
        setInitialized();
    }
    /**
     * verify the sort order
     *
     * @param reader the reader to check
     */
    private void checkSortOrder(SAMFileReader reader) {
        if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
            String msg = String.format("The GATK requires your bam have %s sort order, but your BAM file header %s. Continuing beyond this point is unsafe" +
                    "-- please update your BAM file to have a compatible sort order using samtools sort or Picard MergeBamFiles. You may overcome this error by " +
                    "passing the \'-U ALLOW_UNSET_BAM_SORT_ORDER\' on the command line; before doing so make sure that your input BAM is sorted even if it can't be marked as such.",
                    this.sortOrder, reader.getFileHeader().getAttribute("SO") == null ? "is missing the SO sort order flag" : "has an SO flag set to " + reader.getFileHeader().getAttribute("SO"));
            // Either a hard failure or a one-time warning, depending on the exclusion list.
            if (!reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.ALLOW_UNSET_BAM_SORT_ORDER)) {
                throw new PicardException(msg);
            } else if (!warnedUserAboutSortOrder) {
                warnedUserAboutSortOrder = true;
                Utils.warnUser(msg);
            }
        }
    }
    /** @return always true; this merging iterator supports positional queries. */
    public boolean supportsSeeking() {
        return true;
    }
    /** Initializes the merge queue from queryOverlapping() on every underlying reader. */
    public void queryOverlapping(final String contig, final int start, final int stop) {
        if (initialized) {
            throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
        }
        final SAMRecordComparator comparator = getComparator();
        for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
            checkSortOrder(reader);
            Iterator<SAMRecord> recordIter = reader.queryOverlapping(contig, start, stop);
            final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(this.samHeaderMerger, reader, recordIter, comparator);
            addIfNotEmpty(iterator);
        }
        setInitialized();
    }
    /** Initializes the merge queue from query() on every underlying reader. */
    public void query(final String contig, final int start, final int stop, final boolean contained) {
        if (initialized) {
            throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
        }
        final SAMRecordComparator comparator = getComparator();
        for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
            checkSortOrder(reader);
            Iterator<SAMRecord> recordIter = reader.query(contig, start, stop, contained);
            final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(this.samHeaderMerger, reader, recordIter, comparator);
            addIfNotEmpty(iterator);
        }
        setInitialized();
    }
    /** Initializes the merge queue over the unmapped reads of every underlying reader. */
    public void queryUnmappedReads() {
        if (initialized) {
            throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
        }
        final SAMRecordComparator comparator = getComparator();
        for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
            Iterator<SAMRecord> recordIter = null;
            if (reader.hasIndex()) {
                recordIter = reader.queryUnmapped();
            } else {
                // HACK: Supporting completely unmapped BAM files is easy.  Let's do a quick check to make sure
                // these BAMs aren't partially indexed.
                if (reader.getFileHeader().getSequenceDictionary().size() > 0)
                    throw new StingException("Partially mapped BAM files without indices are not supported");
                recordIter = reader.iterator();
            }
            final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(this.samHeaderMerger, reader, recordIter, comparator);
            addIfNotEmpty(iterator);
        }
        setInitialized();
    }
    /** Initializes the merge queue from queryContained() on every underlying reader. */
    public void queryContained(final String contig, final int start, final int stop) {
        if (initialized) {
            throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
        }
        final SAMRecordComparator comparator = getComparator();
        for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
            checkSortOrder(reader);
            Iterator<SAMRecord> recordIter = reader.queryContained(contig, start, stop);
            final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(this.samHeaderMerger, reader, recordIter, comparator);
            addIfNotEmpty(iterator);
        }
        setInitialized();
    }
    // Marks the queue as populated and primes the lookahead buffer with the first record.
    private void setInitialized() {
        initialized = true;
        mNextRecord = nextRecord();
    }
    /** Returns true if any of the underlying iterators has more records, otherwise false. */
    public boolean hasNext() {
        if (!initialized) {
            lazyInitialization();
        }
        // Exhausted only when both the queue and the lookahead buffer are empty.
        if (this.pq.isEmpty() && mNextRecord == null) {
            return false;
        }
        return true;
    }
    public SAMRecord next() {
        // Hand out the buffered record and refill the buffer from the queue.
        SAMRecord r = mNextRecord;
        if (!this.pq.isEmpty()) {
            mNextRecord = nextRecord();
        } else {
            mNextRecord = null;
        }
        return r;
    }
    /** @return the next record, without moving the iterator */
    public SAMRecord peek() {
        return mNextRecord;
    }
    /**
     * Returns the next record from the top most iterator during merging.
     *
     * @return the next record, null if it's unavailable
     */
    public SAMRecord nextRecord() {
        if (!initialized) {
            lazyInitialization();
        }
        // Pull the smallest-headed iterator, take its record, and re-queue it if non-empty.
        final ComparableSamRecordIterator iterator = this.pq.poll();
        if (iterator == null) {
            return null;
        }
        final SAMRecord record = iterator.next();
        addIfNotEmpty(iterator);
        // Fix the read group if needs be
        if (this.samHeaderMerger.hasReadGroupCollisions()) {
            final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
            if (oldGroupId != null ) {
                final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader(), oldGroupId);
                record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
            }
        }
        // Fix the program group if needs be
        if (this.samHeaderMerger.hasProgramGroupCollisions()) {
            final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
            if (oldGroupId != null ) {
                final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader(), oldGroupId);
                record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
            }
        }
        // if we don't have a read group, set one correctly
        if (record.getAttribute(SAMTag.RG.toString()) == null) {
            List<SAMReadGroupRecord> readGroups = iterator.getReader().getFileHeader().getReadGroups();
            if (readGroups.size() == 1) {
                record.setAttribute(SAMTag.RG.toString(), readGroups.get(0).getReadGroupId());
                record.setAttribute(SAMTag.SM.toString(), readGroups.get(0).getReadGroupId());
            } else {
                logger.warn("Unable to set read group of ungrouped read: unable to pick default group, there are " + readGroups.size() + " possible.");
            }
        }
        record.setHeader(samHeaderMerger.getMergedHeader());
        //System.out.printf("NEXT = %s %s %d%n", record.getReadName(), record.getReferenceName(), record.getAlignmentStart());
        //System.out.printf("PEEK = %s %s %d%n", this.pq.peek().peek().getReadName(), this.pq.peek().peek().getReferenceName(), this.pq.peek().peek().getAlignmentStart());
        return record;
    }
    /**
     * Adds iterator to priority queue. If the iterator has more records it is added
     * otherwise it is closed and not added.
     *
     * @param iterator the iterator to add
     */
    protected void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
        //System.out.printf("Adding %s %s %d%n", iterator.peek().getReadName(), iterator.peek().getReferenceName(), iterator.peek().getAlignmentStart());
        if (iterator.hasNext()) {
            pq.offer(iterator);
        } else {
            iterator.close();
        }
    }
    /** Unsupported operation. */
    public void remove() {
        throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
    }
    /**
     * Get the right comparator for a given sort order (coordinate, alphabetic). In the
     * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
     * ordering.
     *
     * @return the SAMRecordComparator appropriate for our sort order
     */
    protected SAMRecordComparator getComparator() {
        // For unsorted build a fake comparator that compares based on object ID
        if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
            return new SAMRecordComparator() {
                public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
                    return System.identityHashCode(lhs) - System.identityHashCode(rhs);
                }
                public int compare(final SAMRecord lhs, final SAMRecord rhs) {
                    return fileOrderCompare(lhs, rhs);
                }
            };
        }
        // Otherwise try and figure out what kind of comparator to return and build it
        // (preferring the (SAMFileHeader) constructor, falling back to the no-arg one).
        final Class<? extends SAMRecordComparator> type = this.sortOrder.getComparator();
        try {
            final Constructor<? extends SAMRecordComparator> ctor = type.getConstructor(SAMFileHeader.class);
            //System.out.printf("Getting comparator %s%n", ctor.toGenericString());
            return ctor.newInstance(this.samHeaderMerger.getMergedHeader());
        }
        catch (Exception e) {
            try {
                final Constructor<? extends SAMRecordComparator> ctor = type.getConstructor();
                return ctor.newInstance();
            }
            catch (Exception e2) {
                throw new PicardException("Could not instantiate a comparator for sort order: " + this.sortOrder, e2);
            }
        }
    }
    /**
     * Returns the merged header that the merging iterator is working from.
     *
     * @return our sam file header
     */
    public SAMFileHeader getHeader() {
        return this.samHeaderMerger.getMergedHeader();
    }
    /**
     * closes all open iterators....DO THIS or you will run out of handles
     * with sharding.
     */
    public void close() {
        for (ComparableSamRecordIterator iterator : pq)
            iterator.close();
    }
    /**
     * allows us to be used in the new style for loops
     *
     * @return this iterator, which can be used in for loop each iterations
     */
    public Iterator<SAMRecord> iterator() {
        return this;
    }
    /**
     * Gets source information for the reads. Contains information about the original reads
     * files, plus information about downsampling, etc.
     *
     * @return the read object we were created for
     */
    public Reads getSourceInfo() {
        return this.reads;
    }
}
// Should replace picard class with the same name
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator>, StingSAMIterator {
private Reads sourceInfo;

View File

@ -1,106 +0,0 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.Reads;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Nov 4, 2009
* Time: 10:41:02 PM
*/
public class PlusOneFixIterator implements StingSAMIterator {
    /**
     * The interval region being traversed.
     */
    private final GenomeLoc mInterval;
    /**
     * The underlying iterator supplying reads.
     */
    private final StingSAMIterator mIterator;
    /**
     * The next record to hand out; null once the stream is exhausted.
     */
    private SAMRecord mNextRecord;
    /**
     * This iterator filters all reads that have their start < PLUS_ONE_FIX_CONSTANT from the end of the
     * interval:
     * <p/>
     * <p/>
     * (this won't make sense in a non-fixed width font)
     * --------------
     * interval |
     * ------------------------
     * | good read|
     * --------------------------------------------
     * | overlap < PLUS_ONE_FIX_CONSTANT, we drop|
     * -------------------------------------------
     * <p/>
     * The problem is that we had to put a +1 on our queryOverlapping() stop position value to SAMTools
     * due to an indexing problem. Other iterators don't know about this adjustment though, so calculations
     * like overlap fail. This iterator eliminates them before the read is seen by other iterators.
     * <p/>
     * create a plus one fix iterator, given:
     *
     * @param interval the interval we're checking
     * @param iterator the iterator to draw reads from
     * @throws StingException if either argument is null
     */
    public PlusOneFixIterator(GenomeLoc interval, StingSAMIterator iterator) {
        if (interval == null || iterator == null)
            throw new StingException("Both parameters to PlusOneFixIterator have to != null");
        // save off the iterator and the interval
        mInterval = interval;
        mIterator = iterator;
        // prime mNextRecord; the initial (null) return value of next() is discarded
        if (mIterator.hasNext()) {
            next();
        }
    }
    @Override
    public boolean hasNext() {
        return (mNextRecord != null);
    }
    /**
     * Returns the buffered read, then refills the buffer, skipping any read whose
     * alignment start lies past the interval's stop (artifacts of the +1 query adjustment).
     *
     * @return the next in-interval read, or null at end of stream
     */
    @Override
    public SAMRecord next() {
        SAMRecord ret = mNextRecord;
        while (mIterator.hasNext()) {
            mNextRecord = mIterator.next();
            // keep reads starting at or before the interval stop; drop the +1 artifacts
            if (mNextRecord.getAlignmentStart() <= mInterval.getStop()) return ret;
        }
        mNextRecord = null;
        return ret;
    }
    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("REMOVE on PlusOneFixIterator is not supported");
    }
    /**
     * Gets source information for the reads from the wrapped iterator.
     */
    @Override
    public Reads getSourceInfo() {
        return this.mIterator.getSourceInfo();
    }
    /** Closes the underlying iterator. */
    @Override
    public void close() {
        this.mIterator.close();
    }
    /** @return this object, allowing use in for-each loops */
    @Override
    public Iterator<SAMRecord> iterator() {
        return this;
    }
}

View File

@ -1,48 +0,0 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMFileReader;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
*
* This interface indicates that the iterator is queryable
*/
public interface QueryIterator extends StingSAMIterator, PeekingIterator<SAMRecord> {
    /** Restricts the stream to reads overlapping the given region. */
    void queryOverlapping(final String contig, final int start, final int stop);

    /** Restricts the stream to reads in the region; contained selects fully-contained reads only. */
    void query(final String contig, final int start, final int stop, final boolean contained);

    /** Restricts the stream to reads without an alignment position. */
    void queryUnmappedReads();

    /** Restricts the stream to reads fully contained within the given region. */
    void queryContained(final String contig, final int start, final int stop);
}

View File

@ -7,7 +7,6 @@ import net.sf.samtools.SAMRecord;
import java.util.List;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
/*
@ -42,7 +41,7 @@ import org.broadinstitute.sting.gatk.iterators.QueryIterator;
* to test out classes that use specific intervals. The reads returned will
* all lie in order in the specified interval.
*/
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements QueryIterator {
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
// get the next position
protected int finalPos = 0;

View File

@ -1,7 +1,7 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.StingException;
import java.io.File;
@ -203,7 +203,7 @@ public class ArtificialSAMUtils {
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static QueryIterator mappedReadIterator( int startingChr, int endingChr, int readCount ) {
public static StingSAMIterator mappedReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
@ -219,7 +219,7 @@ public class ArtificialSAMUtils {
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static ArtificialSAMIterator mappedAndUnmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
public static StingSAMIterator mappedAndUnmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
@ -250,7 +250,7 @@ public class ArtificialSAMUtils {
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
public static StingSAMIterator queryReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);

View File

@ -1,149 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import org.junit.Test;
import org.junit.Before;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.BaseTest;
import net.sf.samtools.SAMFileHeader;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class IntervalShardStrategyUnitTest
* <p/>
* Tests the IntervalShardStrategy class
*
*
* TODO: this test has been changed, since we now force interval shards to not subdivide if they're large,
* so you should always get one shard back, regardless of the specified length. If this behavior changes
* back in the future, the expected number of shards should change from 1 to > 1.
*
*/
public class IntervalShardStrategyUnitTest extends BaseTest {
    private static final int NUMBER_OF_CHROMOSOMES = 5;
    private static final int STARTING_CHROMOSOME = 1;
    private static final int CHROMOSOME_SIZE = 1000;

    /** Artificial header supplying the contig dictionary for the tests. */
    private SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);

    /** Interval set fed to the strategy; rebuilt before every test. */
    private GenomeLocSortedSet intervals = null;

    @Before
    public void setup() {
        GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
        intervals = new GenomeLocSortedSet();
    }

    /** An empty interval set must be accepted without throwing. */
    @Test
    public void testNoExceptionOnEmpty() {
        IntervalShardStrategy strat = new IntervalShardStrategy(100, intervals, Shard.ShardType.LOCUS_INTERVAL);
    }

    /** A single interval on one chromosome yields exactly one IntervalShard. */
    @Test
    public void testSingleChromosomeFunctionality() {
        intervals.add(GenomeLocParser.createGenomeLoc(1, 1, 1000));
        IntervalShardStrategy strategy = new IntervalShardStrategy(100, intervals, Shard.ShardType.LOCUS_INTERVAL);
        int shardCount = 0;
        Shard lastShard = null;
        while (strategy.hasNext()) {
            lastShard = strategy.next();
            shardCount++;
        }
        assertTrue(lastShard instanceof IntervalShard);
        assertEquals(1, shardCount);
    }

    /** One interval per chromosome yields one shard per chromosome. */
    @Test
    public void testMultipleChromosomeFunctionality() {
        for (int chromosome = 0; chromosome < 5; chromosome++) {
            intervals.add(GenomeLocParser.createGenomeLoc(chromosome, 1, 1000));
        }
        IntervalShardStrategy strategy = new IntervalShardStrategy(100, intervals, Shard.ShardType.LOCUS_INTERVAL);
        int shardCount = 0;
        Shard lastShard = null;
        while (strategy.hasNext()) {
            lastShard = strategy.next();
            shardCount++;
        }
        assertTrue(lastShard instanceof IntervalShard);
        assertEquals(5, shardCount);
    }

    /** A shard size that doesn't divide the interval still yields one whole-interval shard each. */
    @Test
    public void testOddSizeShardFunctionality() {
        for (int chromosome = 0; chromosome < 5; chromosome++) {
            intervals.add(GenomeLocParser.createGenomeLoc(chromosome, 1, 1000));
        }
        IntervalShardStrategy strategy = new IntervalShardStrategy(789, intervals, Shard.ShardType.LOCUS_INTERVAL);
        int shardCount = 0;
        while (strategy.hasNext()) {
            Shard shard = strategy.next();
            assertEquals(1, shard.getGenomeLocs().size());
            assertEquals(1, shard.getGenomeLocs().get(0).getStart());
            assertEquals(1000, shard.getGenomeLocs().get(0).getStop());
            shardCount++;
        }
        assertEquals(5, shardCount);
    }

    /** An effectively unbounded shard size yields one shard per interval. */
    @Test
    public void testInfiniteShardSize() {
        for (int chromosome = 0; chromosome < 5; chromosome++) {
            intervals.add(GenomeLocParser.createGenomeLoc(chromosome, 1, 1000));
        }
        IntervalShardStrategy strategy = new IntervalShardStrategy(Long.MAX_VALUE, intervals, Shard.ShardType.LOCUS_INTERVAL);
        int shardCount = 0;
        while (strategy.hasNext()) {
            Shard shard = strategy.next();
            assertEquals(1, shard.getGenomeLocs().size());
            assertEquals(1000, shard.getGenomeLocs().get(0).getStop());
            shardCount++;
        }
        assertEquals(5, shardCount);
    }

    /** remove() is not part of the strategy's contract. */
    @Test(expected = UnsupportedOperationException.class)
    public void testRemove() {
        intervals.add(GenomeLocParser.createGenomeLoc(1, 1, 1000));
        IntervalShardStrategy strategy = new IntervalShardStrategy(100, intervals, Shard.ShardType.LOCUS_INTERVAL);
        strategy.remove();
    }
}

View File

@ -1,76 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import net.sf.samtools.SAMFileHeader;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class IntervalShardUnitTest
* <p/>
* Tests for the IntervalShard class.
*/
public class IntervalShardUnitTest extends BaseTest {
    private static final int NUMBER_OF_CHROMOSOMES = 5;
    private static final int STARTING_CHROMOSOME = 1;
    private static final int CHROMOSOME_SIZE = 1000;

    /** Artificial header supplying the contig dictionary for the tests. */
    private SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);

    /** Shard under test. */
    private IntervalShard intervalShard = null;

    @Before
    public void setup() {
        GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
    }

    /** The shard must expose exactly the single location it was built from. */
    @Test
    public void simpleReturn() {
        GenomeLoc region = GenomeLocParser.createGenomeLoc(1, 1, 100);
        intervalShard = new IntervalShard(region, Shard.ShardType.LOCUS_INTERVAL);
        assertEquals("Input parameters imply a single-locus shard", 1, intervalShard.getGenomeLocs().size());
        assertTrue(intervalShard.getGenomeLocs().get(0).equals(region));
    }

    /** The shard must hold a copy of the location, not a reference to the caller's object. */
    @Test
    public void ensureNotReference() {
        GenomeLoc region = GenomeLocParser.createGenomeLoc(1, 1, 100);
        intervalShard = new IntervalShard(region, Shard.ShardType.LOCUS_INTERVAL);
        assertEquals("Input parameters imply a single-locus shard", 1, intervalShard.getGenomeLocs().size());
        assertTrue(intervalShard.getGenomeLocs().get(0) != region && intervalShard.getGenomeLocs().get(0).equals(region));
    }
}

View File

@ -1,122 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.BaseTest;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import net.sf.samtools.SAMFileHeader;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class LinearLocusShardStrategyUnitTest
* <p/>
* Tests for the LinearLocusShardStrategy class
*/
public class LinearLocusShardStrategyUnitTest extends BaseTest {
    private static final int NUMBER_OF_CHROMOSOMES = 5;
    private static final int STARTING_CHROMOSOME = 1;
    private static final int CHROMOSOME_SIZE = 1000;

    private GenomeLocSortedSet mSortedSet = null;

    /** Artificial header supplying the contig dictionary for the tests. */
    private SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);

    @Before
    public void setup() {
        GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
    }

    /** 5 chromosomes of 1000bp at a shard size of 500 yield 10 single-locus 500bp shards. */
    @Test
    public void testSetup() {
        LinearLocusShardStrategy strategy = new LinearLocusShardStrategy(header.getSequenceDictionary(), 500, -1);
        int shardCount = 0;
        while (strategy.hasNext()) {
            Shard shard = strategy.next();
            assertTrue(shard instanceof LocusShard);
            assertEquals("Sharding strategy must emit single locus shards", 1, shard.getGenomeLocs().size());
            assertTrue(shard.getGenomeLocs().get(0).getStop() - shard.getGenomeLocs().get(0).getStart() == 499);
            ++shardCount;
        }
        assertTrue(shardCount == 10);
    }

    /** Adjusting the next shard size to 1000 halves the shard count and doubles each width. */
    @Test
    public void testAdjustSize() {
        LinearLocusShardStrategy strategy = new LinearLocusShardStrategy(header.getSequenceDictionary(), 500, -1);
        strategy.adjustNextShardSize(1000);
        int shardCount = 0;
        while (strategy.hasNext()) {
            Shard shard = strategy.next();
            assertTrue(shard instanceof LocusShard);
            assertEquals("Sharding strategy must emit single locus shards", 1, shard.getGenomeLocs().size());
            assertTrue(shard.getGenomeLocs().get(0).getStop() - shard.getGenomeLocs().get(0).getStart() == 999);
            ++shardCount;
        }
        assertTrue(shardCount == 5);
    }

    /** A shard size of 600 over 1000bp chromosomes alternates 600bp and 400bp shards. */
    @Test
    public void testUnevenSplit() {
        LinearLocusShardStrategy strategy = new LinearLocusShardStrategy(header.getSequenceDictionary(), 600, -1);
        int shardCount = 0;
        while (strategy.hasNext()) {
            Shard shard = strategy.next();
            assertTrue(shard instanceof LocusShard);
            assertEquals("Sharding strategy must emit single locus shards", 1, shard.getGenomeLocs().size());
            if (shardCount % 2 == 0) {
                assertTrue(shard.getGenomeLocs().get(0).getStop() - shard.getGenomeLocs().get(0).getStart() == 599);
            } else {
                assertTrue(shard.getGenomeLocs().get(0).getStop() - shard.getGenomeLocs().get(0).getStart() == 399);
            }
            ++shardCount;
        }
        assertTrue(shardCount == 10);
    }

    /** A maxIterations of 200 truncates the traversal to a single 200bp shard. */
    @Test
    public void testDashMOption() {
        LinearLocusShardStrategy strategy = new LinearLocusShardStrategy(header.getSequenceDictionary(), 600, 200);
        int shardCount = 0;
        while (strategy.hasNext()) {
            Shard shard = strategy.next();
            assertTrue(shard instanceof LocusShard);
            assertEquals("Sharding strategy must emit single locus shards", 1, shard.getGenomeLocs().size());
            assertEquals(199, (shard.getGenomeLocs().get(0).getStop() - shard.getGenomeLocs().get(0).getStart()));
            ++shardCount;
        }
        assertTrue(shardCount == 1);
    }
}

View File

@ -1,78 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.shards;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.*;
import static org.junit.Assert.assertTrue;
/**
*
* User: aaron
* Date: Apr 8, 2009
* Time: 11:31:04 AM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
*/
public class ShardStrategyFactoryUnitTest extends BaseTest {
    private static final int NUMBER_OF_CHROMOSOMES = 5;
    private static final int STARTING_CHROMOSOME = 1;
    private static final int CHROMOSOME_SIZE = 1000;

    // Five artificial 1000-base contigs back every factory call below.
    private SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);
    private GenomeLocSortedSet set = null;

    @Before
    public void setup() {
        GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
        set = new GenomeLocSortedSet();
    }

    /** A READS shatter with no interval list should come back as a read-based strategy. */
    @Test
    public void testReadNonInterval() {
        ShardStrategy strategy = ShardStrategyFactory.shatter(null, null, ShardStrategyFactory.SHATTER_STRATEGY.READS, header.getSequenceDictionary(), 100);
        assertTrue(strategy instanceof ReadShardStrategy);
    }

    /** A READS shatter over an interval list should come back interval-driven. */
    @Test
    public void testReadInterval() {
        set.add(GenomeLocParser.createGenomeLoc(0, 1, 100));
        ShardStrategy strategy = ShardStrategyFactory.shatter(null, null, ShardStrategyFactory.SHATTER_STRATEGY.READS, header.getSequenceDictionary(), 100, set);
        assertTrue(strategy instanceof IntervalShardStrategy);
    }

    /** A LINEAR shatter with no interval list should produce a linear locus strategy. */
    @Test
    public void testLinearNonInterval() {
        ShardStrategy strategy = ShardStrategyFactory.shatter(null, null, ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, header.getSequenceDictionary(), 100);
        assertTrue(strategy instanceof LinearLocusShardStrategy);
    }

    /** A LINEAR shatter over an interval list still produces a linear locus strategy. */
    @Test
    public void testLinearInterval() {
        set.add(GenomeLocParser.createGenomeLoc(0, 1, 100));
        ShardStrategy strategy = ShardStrategyFactory.shatter(null, null, ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, header.getSequenceDictionary(), 100, set);
        assertTrue(strategy instanceof LinearLocusShardStrategy);
    }
}

View File

@ -1,102 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.utils.sam.ArtificialSAMIterator;
import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
import java.util.Collections;
import java.io.File;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* use this to inject into SAMDataSource for testing
*/
public class ArtificialResourcePool extends SAMResourcePool {
    // How strict should we be with SAM/BAM parsing?  Silent: artificial fixtures need not be spec-clean.
    protected SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.SILENT;

    // Header whose sequence dictionary describes the artificial reads served by this pool.
    private SAMFileHeader header;

    // Source of the artificial reads; may or may not support positional queries.
    private ArtificialSAMIterator iterator;

    /**
     * Track the iterator to see whether it's venturing into unmapped reads for the first
     * time. If so, query straight there. Only works for query iterators.
     *
     * TODO: Clean up.
     */
    private boolean intoUnmappedReads = false;

    /**
     * Builds a pool backed by the given iterator rather than by BAM files.
     *
     * @param header   artificial header matching the iterator's reads
     * @param iterator source of all reads handed back by {@link #iterator(DataStreamSegment)}
     */
    public ArtificialResourcePool( SAMFileHeader header, ArtificialSAMIterator iterator ) {
        // Empty file list: this pool is driven entirely by the supplied iterator.
        super( new Reads(Collections.<File>emptyList()) );
        this.header = header;
        this.iterator = iterator;
    }

    /**
     * Positions the backing iterator over the requested segment and returns it.
     * Mapped segments require a query-capable iterator; unmapped segments are
     * bounded by the segment's declared size.
     *
     * @param segment mapped or unmapped stream segment to serve
     * @return iterator positioned at the segment
     */
    @Override
    public StingSAMIterator iterator( DataStreamSegment segment ) {
        if (segment instanceof MappedStreamSegment && iterator instanceof ArtificialSAMQueryIterator) {
            ArtificialSAMQueryIterator queryIterator = (ArtificialSAMQueryIterator)iterator;
            MappedStreamSegment mappedSegment = (MappedStreamSegment)segment;
            GenomeLoc bounds = mappedSegment.locus;
            // NOTE(review): queryOverlapping is inherited state — presumably set by the
            // data source to choose contained vs. overlapping semantics; confirm in SAMResourcePool.
            if (!this.queryOverlapping) {
                queryIterator.queryContained(bounds.getContig(), (int)bounds.getStart(), (int)bounds.getStop());
            } else {
                queryIterator.queryOverlapping(bounds.getContig(), (int)bounds.getStart(), (int)bounds.getStop());
            }
            return queryIterator;
        }
        else if (segment instanceof UnmappedStreamSegment) {
            // Jump to the unmapped-read section only on the first unmapped segment;
            // later unmapped segments continue from wherever the iterator stands.
            if( !intoUnmappedReads ) {
                if( iterator instanceof ArtificialSAMQueryIterator ) {
                    ArtificialSAMQueryIterator queryIterator = (ArtificialSAMQueryIterator)iterator;
                    queryIterator.queryUnmappedReads();
                }
                intoUnmappedReads = true;
            }
            // Cap the number of reads returned at the segment's stated size.
            return new BoundedReadIterator(iterator,((UnmappedStreamSegment)segment).size);
        }
        else
            throw new StingException("Unsupported segment type passed to test");
    }

    /**
     * get the merged header
     *
     * @return the merged header
     */
    public SAMFileHeader getHeader() {
        return this.header;
    }
}

View File

@ -50,7 +50,7 @@ import java.util.List;
public class SAMBAMDataSourceUnitTest extends BaseTest {
private List<File> fl;
private ReferenceSequenceFile seq;
private IndexedFastaSequenceFile seq;
/**
* This function does the setup of our parser, before each method call.
@ -88,8 +88,8 @@ public class SAMBAMDataSourceUnitTest extends BaseTest {
Reads reads = new Reads(fl);
// the sharding strat.
SAMDataSource data = new IndexDrivenSAMDataSource(reads);
ShardStrategy strat = ShardStrategyFactory.shatter(data,null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
SAMDataSource data = new BlockDrivenSAMDataSource(reads);
ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000);
int count = 0;
try {
@ -133,8 +133,8 @@ public class SAMBAMDataSourceUnitTest extends BaseTest {
Reads reads = new Reads(fl);
// the sharding strat.
SAMDataSource data = new IndexDrivenSAMDataSource(reads);
ShardStrategy strat = ShardStrategyFactory.shatter(data,null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
SAMDataSource data = new BlockDrivenSAMDataSource(reads);
ShardStrategy strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000);
ArrayList<Integer> readcountPerShard = new ArrayList<Integer>();
ArrayList<Integer> readcountPerShard2 = new ArrayList<Integer>();
@ -176,8 +176,8 @@ public class SAMBAMDataSourceUnitTest extends BaseTest {
count = 0;
// the sharding strat.
data = new IndexDrivenSAMDataSource(reads);
strat = ShardStrategyFactory.shatter(data,null,ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
data = new BlockDrivenSAMDataSource(reads);
strat = ShardStrategyFactory.shatter(data,seq,ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, seq.getSequenceDictionary(), 100000);
logger.debug("Pile two:");
try {

View File

@ -1,151 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
* <p/>
* Class SAMByIntervalUnitTest
* <p/>
* Test that the SAM data source behaves well given intervals
*/
public class SAMByIntervalUnitTest extends BaseTest {
    private List<File> fl;
    Reads reads;

    // Shape of the artificial data set: reads per contig, contig range, unmapped-read count.
    protected final int READ_COUNT;
    protected final int ENDING_CHROMO;
    protected final int STARTING_CHROMO;
    protected final int UNMAPPED_READ_COUNT;

    public SAMByIntervalUnitTest() {
        READ_COUNT = 100;
        ENDING_CHROMO = 10;
        STARTING_CHROMO = 1;
        UNMAPPED_READ_COUNT = 1000;
    }

    /**
     * This function does the setup of our parser, before each method call.
     * <p/>
     * Called before every test case method.
     */
    @Before
    public void doForEachTest() {
        fl = new ArrayList<File>();
        // The file itself is never read — an ArtificialResourcePool is injected below —
        // but the data source constructor requires a non-empty file list.
        fl.add(new File(validationDataLocation + "index_test.bam"));
        reads = new Reads(fl);
    }

    /**
     * Shard the artificial data by the interval [start, stop] on the first contig and
     * verify that exactly readCount reads come back.
     *
     * @param start     interval start (1-based, inclusive)
     * @param stop      interval stop (inclusive)
     * @param readCount expected number of reads overlapping the interval
     */
    private void testRead( int start, int stop, int readCount ) {
        ArtificialResourcePool gen = new ArtificialResourcePool(createArtificialSamHeader(STARTING_CHROMO, ENDING_CHROMO, READ_COUNT, UNMAPPED_READ_COUNT),
                ArtificialSAMUtils.mappedAndUnmappedReadIterator(STARTING_CHROMO, ENDING_CHROMO, READ_COUNT, UNMAPPED_READ_COUNT));
        GenomeLocParser.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());

        IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
        data.setResourcePool(gen);

        GenomeLocSortedSet set = new GenomeLocSortedSet();
        set.add(GenomeLocParser.createGenomeLoc(0, start, stop));

        ShardStrategy strat = ShardStrategyFactory.shatter(data,null,ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, gen.getHeader().getSequenceDictionary(), UNMAPPED_READ_COUNT, set);
        StingSAMIterator iter = data.seek(strat.next());
        int count = 0;
        while (iter.hasNext()) {
            iter.next();
            count++;
        }
        // Release the iterator, mirroring the other data source tests.
        iter.close();
        assertEquals(readCount, count);
    }

    /**
     * test out that we get a single read, given the specific size
     */
    @Test
    public void testSingleRead() {
        testRead(1,ArtificialSAMUtils.DEFAULT_READ_LENGTH,50);
    }

    /**
     * test out that we get the expected amount for a whole chromosome
     */
    @Test
    public void testChromosome() {
        testRead(1, READ_COUNT, READ_COUNT);
    }

    /**
     * test out that we get the expected amount for an interval in the middle of the chromosome
     */
    @Test
    public void testMiddle() {
        testRead(20, READ_COUNT-20,61);
    }

    /** Builds an artificial header spanning the requested contigs, sized to hold all reads. */
    private SAMFileHeader createArtificialSamHeader( int startingChr, int endingChr, int readCount, int readSize ) {
        return ArtificialSAMUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1,
                startingChr,
                readCount + readSize);
    }
}

View File

@ -1,175 +0,0 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
*
* User: aaron
* Date: Apr 15, 2009
* Time: 7:12:59 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
*/
public class SAMByReadsUnitTest extends BaseTest {
    private List<File> fl;
    ShardStrategy shardStrategy;
    Reads reads;
    // Per-test shard size; each test assigns this before sharding.
    private int targetReadCount;

    /**
     * This function does the setup of our parser, before each method call.
     * <p/>
     * Called before every test case method.
     */
    @Before
    public void doForEachTest() {
        fl = new ArrayList<File>();
        // The file itself is never read — an ArtificialResourcePool is injected below —
        // but the data source constructor requires a non-empty file list.
        fl.add(new File(validationDataLocation + "index_test.bam"));
        reads = new Reads(fl);
    }

    /**
     * Verify that toUnmappedReads hands back every unmapped read in the pool.
     */
    @Test
    public void testToUnmappedReads() {
        ArtificialResourcePool gen = new ArtificialResourcePool(createArtificialSamHeader(1, 1, 1, 50),
                ArtificialSAMUtils.mappedAndUnmappedReadIterator(1, 1, 1, 10));
        GenomeLocParser.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
        try {
            int unmappedReadsSeen = 0;
            IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
            data.setResourcePool(gen);
            StingSAMIterator ret = data.toUnmappedReads(100);
            if (ret == null) {
                fail("toUnmappedReads returned a null pointer after seeing " + unmappedReadsSeen + " unmapped reads");
            }
            // count the reads we've gotten back
            while (ret.hasNext()) {
                ret.next();
                unmappedReadsSeen++;
            }
            ret.close();
            assertEquals(10, unmappedReadsSeen);
        } catch (SimpleDataSourceLoadException e) {
            e.printStackTrace();
            fail("testToUnmappedReads: We Should not get a SimpleDataSourceLoadException");
        }
    }

    /**
     * Shard 10 mapped + 10 unmapped reads into even 5-read shards and iterate over every read.
     */
    @Test
    public void testShardingOfReadsEvenSplit() {
        ArtificialResourcePool gen = new ArtificialResourcePool(createArtificialSamHeader(1, 1, 10, 50),
                ArtificialSAMUtils.mappedAndUnmappedReadIterator(1, 1, 10, 10));
        GenomeLocParser.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
        targetReadCount = 5;
        try {
            int readCount = 0;
            IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
            data.setResourcePool(gen);
            shardStrategy = ShardStrategyFactory.shatter(data,null,ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
            while (shardStrategy.hasNext()) {
                StingSAMIterator ret = data.seek(shardStrategy.next());
                assertTrue(ret != null);
                while (ret.hasNext()) {
                    ret.next();
                    readCount++;
                }
                ret.close();
            }
            // 10 mapped + 10 unmapped reads.
            assertEquals(20, readCount);
        } catch (SimpleDataSourceLoadException e) {
            e.printStackTrace();
            fail("testShardingOfReadsEvenSplit: We Should not get a SimpleDataSourceLoadException");
        }
    }

    /**
     * Shard with a size (3) that does not divide the read count evenly and verify no read is lost.
     */
    @Test
    public void testShardingOfReadsOddRemainder() {
        ArtificialResourcePool gen = new ArtificialResourcePool(createArtificialSamHeader(1, 1, 10, 100),
                ArtificialSAMUtils.mappedAndUnmappedReadIterator(1, 1, 10, 10));
        GenomeLocParser.setupRefContigOrdering(gen.getHeader().getSequenceDictionary());
        targetReadCount = 3;
        try {
            int readCount = 0;
            IndexDrivenSAMDataSource data = new IndexDrivenSAMDataSource(reads);
            data.setResourcePool(gen);
            shardStrategy = ShardStrategyFactory.shatter(data,null,ShardStrategyFactory.SHATTER_STRATEGY.READS, gen.getHeader().getSequenceDictionary(), targetReadCount);
            while (shardStrategy.hasNext()) {
                StingSAMIterator ret = data.seek(shardStrategy.next());
                assertTrue(ret != null);
                while (ret.hasNext()) {
                    ret.next();
                    readCount++;
                }
                ret.close();
            }
            // assert that we saw all 20 reads (10 mapped + 10 unmapped)
            assertEquals(20, readCount);
        } catch (SimpleDataSourceLoadException e) {
            e.printStackTrace();
            fail("testShardingOfReadsOddRemainder: We Should not get a SimpleDataSourceLoadException");
        }
    }

    /** Builds an artificial header spanning the requested contigs, sized to hold all reads. */
    private SAMFileHeader createArtificialSamHeader(int startingChr, int endingChr, int readCount, int readSize) {
        return ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1,
                startingChr,
                readCount + readSize);
    }
}

View File

@ -6,17 +6,9 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Assert;

View File

@ -1,80 +0,0 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.junit.Assert;
import static org.junit.Assert.fail;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
/**
 * @author aaron
 * <p/>
 * Class IntervalOverlapIteratorUnitTest
 * <p/>
 * Tests the IntervalOverlapIterator class: walks chr1 of an indexed BAM in
 * adjacent windows and checks that the total read count matches a single query.
 */
public class IntervalOverlapIteratorUnitTest extends BaseTest {
    private final File bam = new File(validationDataLocation + "index_test.bam");
    private static IndexedFastaSequenceFile seq;
    // Expected total number of reads on chr1 of index_test.bam.
    private int chromosomeOneReadCount = 885;

    /**
     * This function does the setup of our parser, before each method call.
     * <p/>
     * Called before every test case method.
     */
    @BeforeClass
    public static void beforeAll() {
        try {
            seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
        } catch (FileNotFoundException e) {
            fail("Unexpected Exception" + e.getLocalizedMessage());
        }
        GenomeLocParser.setupRefContigOrdering(seq);
    }

    /**
     * Walk chr1 in adjacent ~470kb windows, counting reads per window, and check the
     * total equals a single whole-range query.  The first window is counted with a plain
     * queryOverlapping; subsequent windows go through IntervalOverlapIterator, which is
     * handed the PREVIOUS window ('last') — NOTE(review): presumably to filter out reads
     * already counted because they overlap the prior window; confirm against
     * IntervalOverlapIterator's constructor contract (including the 'false' flag).
     */
    @Test
    public void testOverlappingIntervals() {
        int countOfReads = 0;
        int seqLength = seq.getSequenceDictionary().getSequence("chr1").getSequenceLength();
        GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, 470535);
        // first count the initial pile of reads
        SAMFileReader reader = new SAMFileReader(bam, true);
        reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
        Iterator<SAMRecord> i = reader.queryOverlapping("chr1",1,470535);
        GenomeLoc newLoc;
        while (i.hasNext()) {
            i.next();
            countOfReads++;
        }
        reader.close();
        // Slide the window forward until it reaches the end of chr1; the final window is
        // clamped to the contig length.
        while (last.getStart() < seq.getSequenceDictionary().getSequence("chr1").getSequenceLength()) {
            reader = new SAMFileReader(bam, true);
            long stop = (last.getStop() >= seqLength) ? seqLength : last.getStop() + 470535;
            newLoc = GenomeLocParser.createGenomeLoc(last.getContigIndex(),last.getStart()+470535,stop);
            reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
            i = reader.queryOverlapping(newLoc.getContig(),(int)newLoc.getStart(),(int)newLoc.getStop());
            IntervalOverlapIterator iter = new IntervalOverlapIterator(StingSAMIteratorAdapter.adapt(null, i),last,false);
            while(iter.hasNext()) {
                countOfReads++;
                iter.next();
            }
            last = newLoc;
            reader.close();
        }
        Assert.assertEquals(chromosomeOneReadCount,countOfReads);
    }
}

View File

@ -1,120 +0,0 @@
package org.broadinstitute.sting.gatk.iterators;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.BaseTest;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.Assert;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
/**
 * Tests for PlusOneFixIterator — apparently a wrapper that corrects interval-boundary
 * behavior when iterating reads returned by a BAM query (see the tests below).
 *
 * User: aaronmckenna
 * Date: Nov 4, 2009
 * Time: 11:02:24 PM
 */
public class PlusOneFixIteratorUnitTest extends BaseTest {
private final File bam = new File(validationDataLocation + "index_test.bam");
private static IndexedFastaSequenceFile seq;
private int chromosomeOneReadCount = 885;
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
/**
* This function does the setup of our parser, before each method call.
* <p/>
* Called before every test case method.
*/
@BeforeClass
public static void beforeAll() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
fail("Unexpected Exception" + e.getLocalizedMessage());
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/**
* there's a read starting at chr1:1108664, make our interval go from chr1 to 1108664, apply the iterator,
* and we shouldn't see the read.
*/
@Test
public void testReadAtEndOfInterval() {
int countOfReads = 0;
SAMFileReader reader = new SAMFileReader(bam, true);
final int size = 1108663;
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, 1108663)));
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
while (i.hasNext()) {
SAMRecord rec = i.next();
countOfReads++;
}
Assert.assertEquals(2, countOfReads);
}
/**
* there are 4296 reads in chr1, make sure we see them all.
*
* chr1 length = 247249719 (and no reads start at the last position
*/
@Test
public void testAllReads() {
int countOfReads = 0;
SAMFileReader reader = new SAMFileReader(bam, true);
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
final int size = 247249719;
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
while (i.hasNext()) {
SAMRecord rec = i.next();
countOfReads++;
}
Assert.assertEquals(885, countOfReads);
}
/**
* there are 4296 reads in chr1, make sure we see them all.
*
* chr1 length = 247249719 (and no reads start at the last position
*/
@Test
public void testAllReadsSharded() {
int countOfReads = 0;
final int size = 247249719;
int currentStop = 0;
int incr = 100000;
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
while (currentStop < size) {
SAMFileReader reader = new SAMFileReader(bam, true);
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
int lastStop = (currentStop > 1) ? currentStop: 1;
if (currentStop + incr < size)
currentStop += incr;
else
currentStop = size;
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", lastStop, currentStop)));
while (i.hasNext()) {
SAMRecord rec = i.next();
countOfReads++;
}
reader.close();
}
Assert.assertEquals(885, countOfReads);
}
}

View File

@ -9,7 +9,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.IndexDrivenSAMDataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -103,54 +103,6 @@ public class TraverseReadsUnitTest extends BaseTest {
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testMappedReadCount() {
IndexedFastaSequenceFile ref = null;
try {
ref = new IndexedFastaSequenceFile(refFile);
}
catch (FileNotFoundException ex) {
throw new RuntimeException("File not found opening fasta file; please do this check before MicroManaging", ex);
}
GenomeLocParser.setupRefContigOrdering(ref);
SAMDataSource dataSource = new IndexDrivenSAMDataSource(new Reads(bamList));
dataSource.viewUnmappedReads(false);
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref,ShardStrategyFactory.SHATTER_STRATEGY.READS,
ref.getSequenceDictionary(),
readSize);
countReadWalker.initialize();
Object accumulator = countReadWalker.reduceInit();
while (shardStrategy.hasNext()) {
Shard shard = shardStrategy.next();
if (shard == null) {
fail("Shard == null");
}
ShardDataProvider dataProvider = new ReadShardDataProvider(shard,dataSource.seek(shard),null,null);
accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator);
dataProvider.close();
}
traversalEngine.printOnTraversalDone("reads", accumulator);
countReadWalker.onTraversalDone(accumulator);
if (!(accumulator instanceof Integer)) {
fail("Count read walker should return an interger.");
}
if (((Integer) accumulator) != 9721) {
fail("there should be 9721 mapped reads in the index file, there was " + ((Integer) accumulator) );
}
}
/** Test out that we can shard the file and iterate over every read */
@Test
public void testUnmappedReadCount() {
@ -163,9 +115,8 @@ public class TraverseReadsUnitTest extends BaseTest {
}
GenomeLocParser.setupRefContigOrdering(ref);
SAMDataSource dataSource = new IndexDrivenSAMDataSource(new Reads(bamList));
dataSource.viewUnmappedReads(true);
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref,ShardStrategyFactory.SHATTER_STRATEGY.READS,
SAMDataSource dataSource = new BlockDrivenSAMDataSource(new Reads(bamList));
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(dataSource,ref,ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
ref.getSequenceDictionary(),
readSize);