Added back-end code support for sharding reads based on genomic location. Changed the sharding
code to take a GenomeLocSortedSet instead of a List<GenomeLoc>, and added a number of much simpler and cleaner test cases.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@816 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4edcdffe45
commit
d994544c47
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -110,7 +111,10 @@ public class GenomeAnalysisEngine {
|
||||||
genericEngineSetup(strictness);
|
genericEngineSetup(strictness);
|
||||||
|
|
||||||
// parse out any genomic location they've provided
|
// parse out any genomic location they've provided
|
||||||
List<GenomeLoc> locs = setupIntervalRegion();
|
List<GenomeLoc> locationsList = setupIntervalRegion();
|
||||||
|
GenomeLocSortedSet locs = null;
|
||||||
|
if (locationsList != null)
|
||||||
|
locs = GenomeLocSortedSet.createSetFromList(locationsList);
|
||||||
|
|
||||||
// excute the microscheduler
|
// excute the microscheduler
|
||||||
microScheduler.execute(my_walker, locs);
|
microScheduler.execute(my_walker, locs);
|
||||||
|
|
@ -192,7 +196,7 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
engine.setMaxReads(Integer.parseInt(argCollection.maximumReads));
|
engine.setMaxReads(Integer.parseInt(argCollection.maximumReads));
|
||||||
|
|
||||||
// we default interval files over the genome region strin
|
// we default interval files over the genome region string
|
||||||
if (argCollection.intervals != null) {
|
if (argCollection.intervals != null) {
|
||||||
engine.setLocation(setupIntervalRegion());
|
engine.setLocation(setupIntervalRegion());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -66,7 +67,7 @@ public class ExpGrowthLocusShardStrategy extends LocusShardStrategy {
|
||||||
* @param startSize the starting size of the shard
|
* @param startSize the starting size of the shard
|
||||||
* @param lst locations to iterate from
|
* @param lst locations to iterate from
|
||||||
*/
|
*/
|
||||||
ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
|
ExpGrowthLocusShardStrategy(SAMSequenceDictionary dic, long startSize, GenomeLocSortedSet lst) {
|
||||||
super(dic, lst);
|
super(dic, lst);
|
||||||
this.baseSize = startSize;
|
this.baseSize = startSize;
|
||||||
this.currentExp = 0;
|
this.currentExp = 0;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class IntervalReadShard
|
||||||
|
* <p/>
|
||||||
|
* This is the read shard that knowns about genomic intervals
|
||||||
|
*/
|
||||||
|
public class IntervalReadShard implements Shard {
|
||||||
|
|
||||||
|
/** a collection of genomic locations to interate over */
|
||||||
|
private GenomeLoc mSet;
|
||||||
|
|
||||||
|
IntervalReadShard(GenomeLoc myLocation) {
|
||||||
|
mSet = myLocation.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @return the genome location represented by this shard */
|
||||||
|
public GenomeLoc getGenomeLoc() {
|
||||||
|
return mSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* returns the type of shard, READ
|
||||||
|
*
|
||||||
|
* @return READ, indicating the shard type
|
||||||
|
*/
|
||||||
|
public ShardType getShardType() {
|
||||||
|
return Shard.ShardType.READ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -63,7 +64,7 @@ class LinearLocusShardStrategy extends LocusShardStrategy {
|
||||||
* @param startSize the starting size of the shard
|
* @param startSize the starting size of the shard
|
||||||
* @param lst locations to iterate from
|
* @param lst locations to iterate from
|
||||||
*/
|
*/
|
||||||
LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, List<GenomeLoc> lst) {
|
LinearLocusShardStrategy(SAMSequenceDictionary dic, long startSize, GenomeLocSortedSet lst) {
|
||||||
super(dic, lst);
|
super(dic, lst);
|
||||||
this.nextShardSize = startSize;
|
this.nextShardSize = startSize;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -29,17 +30,17 @@ import java.util.List;
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class LocusWindowShardStrategy
|
* Class LocusWindowShardStrategy
|
||||||
* <p/>
|
* <p/>
|
||||||
* This function knows how to shard on a genome loc boundry. It guarantee's
|
* This function knows how to shard on a genome loc boundry. It guarantees
|
||||||
* a one-to-one mapping between a GenomeLoc and hte
|
* a one-to-one mapping between a GenomeLoc and shard.
|
||||||
*/
|
*/
|
||||||
public class IntervalShardStrategy extends LocusShardStrategy {
|
public class LocusIntervalShardStrategy extends LocusShardStrategy {
|
||||||
/**
|
/**
|
||||||
* the constructor, taking a seq dictionary to parse out contigs
|
* the constructor, taking a seq dictionary to parse out contigs
|
||||||
*
|
*
|
||||||
* @param dic the seq dictionary
|
* @param dic the seq dictionary
|
||||||
* @param intervals file
|
* @param intervals file
|
||||||
*/
|
*/
|
||||||
IntervalShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
|
LocusIntervalShardStrategy(SAMSequenceDictionary dic, GenomeLocSortedSet intervals) {
|
||||||
super(dic, intervals);
|
super(dic, intervals);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -26,7 +26,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class Shard
|
* Class Shard
|
||||||
* <p/>
|
* <p/>
|
||||||
* This is the base class for shards. Right now it does little more then
|
* This is the base class for locus shards. Right now it does little more then
|
||||||
* wrap GenomeLoc (actually nothing more), but it's good to have the class
|
* wrap GenomeLoc (actually nothing more), but it's good to have the class
|
||||||
* in place so it's easier to change guts later.
|
* in place so it's easier to change guts later.
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
@ -25,11 +27,6 @@ import java.util.List;
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* @version 1.0
|
||||||
* @date Apr 6, 2009
|
|
||||||
* <p/>
|
|
||||||
* Interface Shard
|
|
||||||
* <p/>
|
|
||||||
* The shard interface, which controls how data is divided for loci
|
|
||||||
*/
|
*/
|
||||||
public abstract class LocusShardStrategy implements ShardStrategy {
|
public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
|
|
@ -50,10 +47,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
private boolean nextContig = false;
|
private boolean nextContig = false;
|
||||||
|
|
||||||
/** our interal list * */
|
/** our interal list * */
|
||||||
private List<GenomeLoc> intervals = null;
|
private GenomeLocSortedSet intervals = null;
|
||||||
|
|
||||||
/** our interal list * */
|
|
||||||
private int currentInterval = -1;
|
|
||||||
|
|
||||||
/** our log, which we want to capture anything from this class */
|
/** our log, which we want to capture anything from this class */
|
||||||
private static Logger logger = Logger.getLogger(LocusShardStrategy.class);
|
private static Logger logger = Logger.getLogger(LocusShardStrategy.class);
|
||||||
|
|
@ -92,15 +86,15 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
* @param dic the seq dictionary
|
* @param dic the seq dictionary
|
||||||
* @param intervals file
|
* @param intervals file
|
||||||
*/
|
*/
|
||||||
LocusShardStrategy(SAMSequenceDictionary dic, List<GenomeLoc> intervals) {
|
LocusShardStrategy(SAMSequenceDictionary dic, GenomeLocSortedSet intervals) {
|
||||||
this.dic = dic;
|
this.dic = dic;
|
||||||
this.intervals = intervals;
|
this.intervals = intervals.clone();
|
||||||
this.currentInterval = 0;
|
|
||||||
// set the starting point to the beginning interval
|
// set the starting point to the beginning interval
|
||||||
if (intervals.size() < 1) {
|
if (intervals.size() < 1) {
|
||||||
throw new IllegalArgumentException("Interval files must contain at least one interval");
|
throw new IllegalArgumentException("Interval files must contain at least one interval");
|
||||||
}
|
}
|
||||||
mLoc = new GenomeLoc(intervals.get(0).getContig(),intervals.get(0).getStart()-1,intervals.get(0).getStart()-1);
|
GenomeLoc loc = intervals.iterator().next();
|
||||||
|
mLoc = new GenomeLoc(loc.getContig(), loc.getStart() - 1, loc.getStart() - 1);
|
||||||
if (dic.getSequences().size() > 0) {
|
if (dic.getSequences().size() > 0) {
|
||||||
nextContig = true;
|
nextContig = true;
|
||||||
}
|
}
|
||||||
|
|
@ -139,11 +133,11 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
long proposedSize = nextShardSize();
|
long proposedSize = nextShardSize();
|
||||||
long nextStart = mLoc.getStop() + 1;
|
long nextStart = mLoc.getStop() + 1;
|
||||||
|
|
||||||
// if we don't have an interval file, use the non interval based approach. Simple, eh?
|
// if we don't have an interval set, use the non interval based approach. Simple, eh?
|
||||||
if (this.intervals == null) {
|
if (this.intervals == null) {
|
||||||
return nonIntervaledNext(length, proposedSize, nextStart);
|
return nonIntervaledNext(length, proposedSize, nextStart);
|
||||||
} else {
|
} else {
|
||||||
return intervaledNext(proposedSize, nextStart);
|
return intervaledNext(proposedSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -152,36 +146,24 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
* Interval based next processing
|
* Interval based next processing
|
||||||
*
|
*
|
||||||
* @param proposedSize the proposed size
|
* @param proposedSize the proposed size
|
||||||
* @param nextStart where we start from
|
*
|
||||||
* @return the shard that represents this data
|
* @return the shard that represents this data
|
||||||
*/
|
*/
|
||||||
private Shard intervaledNext(long proposedSize, long nextStart) {
|
private Shard intervaledNext(long proposedSize) {
|
||||||
// get the current genome location
|
if ((this.intervals == null) || (intervals.isEmpty())) {
|
||||||
GenomeLoc loc = intervals.get(currentInterval);
|
throw new StingException("LocusShardStrategy: genomic regions list is empty in next() function.");
|
||||||
if (nextStart + proposedSize >= loc.getStop()) {
|
}
|
||||||
// we need to get the rest of the current loc in a shard (return it), and move to the next location
|
|
||||||
proposedSize = loc.getStop() - nextStart;
|
|
||||||
lastGenomeLocSize = proposedSize;
|
|
||||||
|
|
||||||
// the next sequence should start at the begining of the next contig
|
// get the first region in the list
|
||||||
Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize));
|
GenomeLoc loc = intervals.iterator().next();
|
||||||
|
|
||||||
++currentInterval;
|
|
||||||
if (intervals.size() > currentInterval) {
|
|
||||||
mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), intervals.get(currentInterval).getStart() - 1, intervals.get(currentInterval).getStart() - 1);
|
|
||||||
}
|
|
||||||
return ret;// return
|
|
||||||
|
|
||||||
|
if (loc.getStop() - loc.getStart() <= proposedSize) {
|
||||||
|
intervals.removeRegion(loc);
|
||||||
|
return new IntervalReadShard(loc);
|
||||||
} else {
|
} else {
|
||||||
// we need to move the next interval
|
GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStart() + proposedSize - 1);
|
||||||
lastGenomeLocSize = proposedSize;
|
intervals.removeRegion(subLoc);
|
||||||
|
return new IntervalReadShard(subLoc);
|
||||||
// the next sequence should start at the begining of the next contig
|
|
||||||
Shard ret = LocusShard.toShard(new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1));
|
|
||||||
|
|
||||||
mLoc = new GenomeLoc(intervals.get(currentInterval).getContigIndex(), nextStart, nextStart + proposedSize - 1);
|
|
||||||
|
|
||||||
return ret;// return
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -191,6 +173,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
* @param length the length of the contig
|
* @param length the length of the contig
|
||||||
* @param proposedSize the proposed size
|
* @param proposedSize the proposed size
|
||||||
* @param nextStart the next start location
|
* @param nextStart the next start location
|
||||||
|
*
|
||||||
* @return the shard to return to the user
|
* @return the shard to return to the user
|
||||||
*/
|
*/
|
||||||
private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
|
private Shard nonIntervaledNext(long length, long proposedSize, long nextStart) {
|
||||||
|
|
@ -241,11 +224,11 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
// if we don't have an interval file, use the non interval based approach. Simple, eh?
|
// if we don't have an interval file, use the non interval based approach.
|
||||||
if (this.intervals == null) {
|
if (this.intervals == null) {
|
||||||
return nextContig;
|
return nextContig;
|
||||||
} else {
|
} else {
|
||||||
return (this.currentInterval < this.intervals.size());
|
return (this.intervals.size() > 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -267,13 +250,14 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
||||||
/**
|
/**
|
||||||
* this allows a shard strategy to get the current interval. It's kind of a hack, but for the
|
* this allows a shard strategy to get the current interval. It's kind of a hack, but for the
|
||||||
* locusWindowShardStrategy it was the best approach.
|
* locusWindowShardStrategy it was the best approach.
|
||||||
|
*
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
protected GenomeLoc getCurrentInterval() {
|
protected GenomeLoc getCurrentInterval() {
|
||||||
if (this.intervals == null || currentInterval < 0) {
|
if (this.intervals == null || intervals.size() < 1) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return intervals.get(currentInterval);
|
return intervals.iterator().next();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,118 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* User: aaron
|
||||||
|
* Date: May 21, 2009
|
||||||
|
* Time: 4:13:53 PM
|
||||||
|
*
|
||||||
|
* The Broad Institute
|
||||||
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||||
|
* This software and its documentation are copyright 2009 by the
|
||||||
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||||
|
*
|
||||||
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||||
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class ReadByIntervalShardStrategy
|
||||||
|
* <p/>
|
||||||
|
* Impliments the sharding strategy for reads, given a list
|
||||||
|
* of genomic locations. Shards returned will be bounded by the interval,
|
||||||
|
* but each provided interval may be split into a number of smaller regions.
|
||||||
|
*/
|
||||||
|
public class ReadIntervalShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
|
/** our storage of the genomic locations they'd like to shard over */
|
||||||
|
private final GenomeLocSortedSet regions;
|
||||||
|
|
||||||
|
/** their prefered size of the shard, we can modify this based on what we see in the shards */
|
||||||
|
private long size;
|
||||||
|
|
||||||
|
/** the sequence dictionary we'll use to lookup the contigs */
|
||||||
|
private final SAMSequenceDictionary dict;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* change the recommended shard size for the next shard we generate. The code will do it's
|
||||||
|
* best to respect this value, but there are no guarantees.
|
||||||
|
*
|
||||||
|
* @param size the next recommended shard size.
|
||||||
|
*/
|
||||||
|
public void adjustNextShardSize(long size) {
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the default constructor
|
||||||
|
*
|
||||||
|
* @param dict the sequence dictionary to use
|
||||||
|
* @param size the read count to iterate over
|
||||||
|
*/
|
||||||
|
ReadIntervalShardStrategy(SAMSequenceDictionary dict, long size, GenomeLocSortedSet locations) {
|
||||||
|
if (locations == null || locations.isEmpty()) {
|
||||||
|
throw new StingException("ReadIntervalShardStrategy: genomic regions list is empty.");
|
||||||
|
}
|
||||||
|
this.regions = locations.clone();
|
||||||
|
this.size = size;
|
||||||
|
this.dict = dict;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* returns true if there are additional shards
|
||||||
|
* @return false if we're done processing shards
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return (!regions.isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* gets the next Shard
|
||||||
|
* @return the next shard
|
||||||
|
*/
|
||||||
|
public Shard next() {
|
||||||
|
if ((this.regions == null) || (regions.isEmpty())) {
|
||||||
|
throw new StingException("ReadIntervalShardStrategy: genomic regions list is empty in next() function.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the first region in the list
|
||||||
|
GenomeLoc loc = regions.iterator().next();
|
||||||
|
|
||||||
|
if (loc.getStop() - loc.getStart() <= this.size) {
|
||||||
|
regions.removeRegion(loc);
|
||||||
|
return new IntervalReadShard(loc);
|
||||||
|
} else {
|
||||||
|
GenomeLoc subLoc = new GenomeLoc(loc.getContigIndex(),loc.getStart(),loc.getStart()+size-1);
|
||||||
|
regions.removeRegion(subLoc);
|
||||||
|
return new IntervalReadShard(subLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* we don't support the remove command
|
||||||
|
*/
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* makes the ReadIntervalShard iterable, i.e. usable in a for loop.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Iterator<Shard> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -21,19 +21,20 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* <p/>
|
||||||
* @date Apr 10, 2009
|
* ReadShard
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class ReadShard
|
* the base class for read shards.
|
||||||
* <p/>
|
|
||||||
* A class for sharded reads.
|
|
||||||
*/
|
*/
|
||||||
public class ReadShard implements Shard {
|
public class ReadShard implements Shard {
|
||||||
|
|
||||||
// the count of the reads we want to copy off
|
// the count of the reads we want to copy off
|
||||||
private int size = 0;
|
private int size = 0;
|
||||||
|
|
||||||
// this is going to get gross
|
/**
|
||||||
|
* our tie in for the shard strategy. This allows us to signal to the shard
|
||||||
|
* strategy that we've finished process, so it can indicate that we're out of reads
|
||||||
|
*/
|
||||||
private final ReadShardStrategy str;
|
private final ReadShardStrategy str;
|
||||||
|
|
||||||
// the reference back to our read shard strategy
|
// the reference back to our read shard strategy
|
||||||
|
|
@ -63,7 +64,7 @@ public class ReadShard implements Shard {
|
||||||
|
|
||||||
/** @return the genome location represented by this shard */
|
/** @return the genome location represented by this shard */
|
||||||
public GenomeLoc getGenomeLoc() {
|
public GenomeLoc getGenomeLoc() {
|
||||||
throw new UnsupportedOperationException("Reads based sharding isn't genome loc aware");
|
throw new UnsupportedOperationException("ReadShard isn't genome loc aware");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return the genome location represented by this shard */
|
/** @return the genome location represented by this shard */
|
||||||
|
|
@ -71,7 +72,10 @@ public class ReadShard implements Shard {
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this method is used as a backend, to signal to the sharding strategy that we've
|
||||||
|
* finished processing. When we move to a more read-aware bam system this method could disappear.
|
||||||
|
*/
|
||||||
public void signalDone() {
|
public void signalDone() {
|
||||||
strat.signalDone();
|
strat.signalDone();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,10 +5,6 @@ import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* User: aaron
|
|
||||||
* Date: Apr 14, 2009
|
|
||||||
* Time: 1:34:28 PM
|
|
||||||
*
|
*
|
||||||
* The Broad Institute
|
* The Broad Institute
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||||
|
|
@ -28,7 +24,8 @@ import java.util.Iterator;
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class ReadShardStrategy
|
* Class ReadShardStrategy
|
||||||
* <p/>
|
* <p/>
|
||||||
* A descriptions should go here. Blame aaron if it's missing.
|
* The sharding strategy for reads using a simple counting mechanism. Each read shard
|
||||||
|
* has a specific number of reads (default to 100K) which is configured in the constructor.
|
||||||
*/
|
*/
|
||||||
public class ReadShardStrategy implements ShardStrategy {
|
public class ReadShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
|
|
@ -46,7 +43,7 @@ public class ReadShardStrategy implements ShardStrategy {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the default constructor
|
* the default constructor
|
||||||
* @param dic the dictionary
|
* @param dic the sequence dictionary to use
|
||||||
* @param size the read count to iterate over
|
* @param size the read count to iterate over
|
||||||
*/
|
*/
|
||||||
ReadShardStrategy(SAMSequenceDictionary dic, long size) {
|
ReadShardStrategy(SAMSequenceDictionary dic, long size) {
|
||||||
|
|
@ -63,7 +60,7 @@ public class ReadShardStrategy implements ShardStrategy {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Shard next() {
|
public Shard next() {
|
||||||
return new ReadShard((int)readCount, this);
|
return new ReadShard((int)readCount, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void remove() {
|
public void remove() {
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -65,25 +66,6 @@ public class ShardStrategyFactory {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* convert between types
|
|
||||||
*
|
|
||||||
* @param strat the strategy
|
|
||||||
* @param convertFrom convert from this strategy
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
static public ShardStrategy transitionToShardStrategy(SHATTER_STRATEGY strat, LocusShardStrategy convertFrom) {
|
|
||||||
switch (strat) {
|
|
||||||
case LINEAR:
|
|
||||||
return new LinearLocusShardStrategy(convertFrom);
|
|
||||||
case EXPONENTIAL:
|
|
||||||
return new ExpGrowthLocusShardStrategy(convertFrom);
|
|
||||||
default:
|
|
||||||
throw new StingException("Strategy: " + strat + " isn't implemented");
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get a new shatter strategy
|
* get a new shatter strategy
|
||||||
|
|
@ -93,31 +75,20 @@ public class ShardStrategyFactory {
|
||||||
* @param startingSize the starting size
|
* @param startingSize the starting size
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, List<GenomeLoc> lst) {
|
static public ShardStrategy shatter(SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocSortedSet lst) {
|
||||||
switch (strat) {
|
switch (strat) {
|
||||||
case LINEAR:
|
case LINEAR:
|
||||||
return new LinearLocusShardStrategy(dic, startingSize, lst);
|
return new LinearLocusShardStrategy(dic, startingSize, lst);
|
||||||
case EXPONENTIAL:
|
case EXPONENTIAL:
|
||||||
return new ExpGrowthLocusShardStrategy(dic, startingSize, lst);
|
return new ExpGrowthLocusShardStrategy(dic, startingSize, lst);
|
||||||
case READS:
|
case READS:
|
||||||
// return new ReadShardStrategy(dic, startingSize);
|
return new ReadIntervalShardStrategy(dic, startingSize, lst);
|
||||||
throw new StingException("Strategy: " + strat + " isn't implemented for intervals");
|
|
||||||
case INTERVAL:
|
case INTERVAL:
|
||||||
return new IntervalShardStrategy(dic, lst);
|
return new LocusIntervalShardStrategy(dic, lst);
|
||||||
default:
|
default:
|
||||||
throw new StingException("Strategy: " + strat + " isn't implemented");
|
throw new StingException("Strategy: " + strat + " isn't implemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* setup a reads shattering strategy
|
|
||||||
*
|
|
||||||
* @param readCount the number of reads to include in each shard
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
static public ShardStrategy shatterByReadCount(SAMSequenceDictionary dic, long readCount) {
|
|
||||||
return new ReadShardStrategy(dic, readCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,73 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.dataSources.simpleDataSources;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
|
||||||
import org.broadinstitute.sting.gatk.iterators.BoundedReferenceIterator;
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* User: aaron
|
|
||||||
* Date: Apr 6, 2009
|
|
||||||
* Time: 3:55:21 PM
|
|
||||||
*
|
|
||||||
* The Broad Institute
|
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author aaron
|
|
||||||
* @version 1.0
|
|
||||||
* @date Apr 6, 2009
|
|
||||||
* <p/>
|
|
||||||
* Class ReferenceDataSource
|
|
||||||
* <p/>
|
|
||||||
* A descriptions should go here. Blame aaron if it's missing.
|
|
||||||
*/
|
|
||||||
public class ReferenceDataSource implements SimpleDataSource {
|
|
||||||
|
|
||||||
final protected IndexedFastaSequenceFile refFile;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Query the data source for a region of interest, specified by the genome location.
|
|
||||||
* The iterator will generate successive calls
|
|
||||||
*
|
|
||||||
* @param shard the genome location to extract data for
|
|
||||||
* @return an iterator of the appropriate type, that is limited by the region
|
|
||||||
*/
|
|
||||||
public BoundedReferenceIterator seek(Shard shard) {
|
|
||||||
if (shard.getShardType() == Shard.ShardType.LOCUS) {
|
|
||||||
BoundedReferenceIterator ret = new BoundedReferenceIterator(refFile, shard.getGenomeLoc());
|
|
||||||
return ret;
|
|
||||||
} else {
|
|
||||||
throw new StingException("ReferenceDataSource can only take LocusShards");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public ReferenceDataSource(String refFileName) throws SimpleDataSourceLoadException {
|
|
||||||
if (refFileName == null) {
|
|
||||||
throw new SimpleDataSourceLoadException("ReferenceDataSource: refFileName passed in is null");
|
|
||||||
}
|
|
||||||
File infile = new File(refFileName);
|
|
||||||
if (!infile.canRead()) {
|
|
||||||
throw new SimpleDataSourceLoadException("ReferenceDataSource: Unable to load file: " + refFileName);
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
refFile = new IndexedFastaSequenceFile(new File(refFileName));
|
|
||||||
}
|
|
||||||
catch( FileNotFoundException ex ) {
|
|
||||||
throw new SimpleDataSourceLoadException( "Unable to find reference file", ex );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -35,9 +35,7 @@ import java.util.List;
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||||
*/
|
*/
|
||||||
public class SAMDataSource implements SimpleDataSource {
|
public class SAMDataSource implements SimpleDataSource {
|
||||||
/**
|
/** Backing support for reads. */
|
||||||
* Backing support for reads.
|
|
||||||
*/
|
|
||||||
private Reads reads = null;
|
private Reads reads = null;
|
||||||
|
|
||||||
/** our SAM data files */
|
/** our SAM data files */
|
||||||
|
|
@ -52,9 +50,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
// our list of readers
|
// our list of readers
|
||||||
private final List<File> samFileList = new ArrayList<File>();
|
private final List<File> samFileList = new ArrayList<File>();
|
||||||
|
|
||||||
/**
|
/** SAM header file. */
|
||||||
* SAM header file.
|
|
||||||
*/
|
|
||||||
private final SAMFileHeader header;
|
private final SAMFileHeader header;
|
||||||
|
|
||||||
// used for the reads case, the last count of reads retrieved
|
// used for the reads case, the last count of reads retrieved
|
||||||
|
|
@ -90,14 +86,14 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
header = createHeaderMerger().getMergedHeader();
|
header = createHeaderMerger().getMergedHeader();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load up a sam file.
|
* Load a SAM/BAM, given an input file.
|
||||||
*
|
*
|
||||||
* @param samFile the file name
|
* @param samFile the file name
|
||||||
* @return a SAMFileReader for the file
|
* @return a SAMFileReader for the file, null if we're attempting to read a list
|
||||||
*/
|
*/
|
||||||
private SAMFileReader initializeSAMFile(final File samFile) {
|
private SAMFileReader initializeSAMFile(final File samFile) {
|
||||||
if (samFile.toString().endsWith(".list")) {
|
if (samFile.toString().endsWith(".list")) {
|
||||||
|
|
@ -115,7 +111,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* seek
|
* seekLocus
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param location the genome location to extract data for
|
* @param location the genome location to extract data for
|
||||||
|
|
@ -123,17 +119,16 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
*/
|
*/
|
||||||
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
|
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
|
||||||
|
|
||||||
// right now this is pretty damn heavy, it copies the file list into a reader list every time
|
// right now this is very heavy, it copies the file list into a reader list every time
|
||||||
SamFileHeaderMerger headerMerger = createHeaderMerger();
|
SamFileHeaderMerger headerMerger = createHeaderMerger();
|
||||||
|
|
||||||
// make a merging iterator for this record
|
// make a merging iterator for this record
|
||||||
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
MergingSamRecordIterator2 iter = new MergingSamRecordIterator2(headerMerger);
|
||||||
|
|
||||||
// we do different things for locus and read modes
|
|
||||||
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
|
iter.queryOverlapping(location.getContig(), (int) location.getStart(), (int) location.getStop() + 1);
|
||||||
|
|
||||||
// return the iterator
|
// return the iterator
|
||||||
return StingSAMIteratorAdapter.adapt( reads, iter );
|
return StingSAMIteratorAdapter.adapt(reads, iter);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -149,17 +144,17 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
if (shard.getShardType() == Shard.ShardType.READ) {
|
if (shard.getShardType() == Shard.ShardType.READ) {
|
||||||
iterator = seekRead((ReadShard) shard);
|
iterator = seekRead((ReadShard) shard);
|
||||||
iterator = TraversalEngine.applyDecoratingIterators(true,
|
iterator = TraversalEngine.applyDecoratingIterators(true,
|
||||||
iterator,
|
iterator,
|
||||||
reads.getDownsamplingFraction(),
|
reads.getDownsamplingFraction(),
|
||||||
reads.getMaxOnTheFlySorts(),
|
reads.getMaxOnTheFlySorts(),
|
||||||
reads.getSafetyChecking());
|
reads.getSafetyChecking());
|
||||||
} else if (shard.getShardType() == Shard.ShardType.LOCUS) {
|
} else if (shard.getShardType() == Shard.ShardType.LOCUS) {
|
||||||
iterator = seekLocus(shard.getGenomeLoc());
|
iterator = seekLocus(shard.getGenomeLoc());
|
||||||
iterator = TraversalEngine.applyDecoratingIterators(false,
|
iterator = TraversalEngine.applyDecoratingIterators(false,
|
||||||
iterator,
|
iterator,
|
||||||
reads.getDownsamplingFraction(),
|
reads.getDownsamplingFraction(),
|
||||||
reads.getMaxOnTheFlySorts(),
|
reads.getMaxOnTheFlySorts(),
|
||||||
reads.getSafetyChecking());
|
reads.getSafetyChecking());
|
||||||
} else {
|
} else {
|
||||||
throw new StingException("seek: Unknown shard type");
|
throw new StingException("seek: Unknown shard type");
|
||||||
}
|
}
|
||||||
|
|
@ -168,26 +163,26 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If we're in by-read mode, this indicates if we want
|
|
||||||
* to see unmapped reads too. Only seeing mapped reads
|
|
||||||
* is much faster, but most BAM files have significant
|
|
||||||
* unmapped read counts.
|
|
||||||
*
|
|
||||||
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
|
||||||
*/
|
|
||||||
public void viewUnmappedReads(boolean seeUnMappedReads) {
|
|
||||||
includeUnmappedReads = seeUnMappedReads;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the (potentially merged) SAM file header.
|
* Gets the (potentially merged) SAM file header.
|
||||||
|
*
|
||||||
* @return SAM file header.
|
* @return SAM file header.
|
||||||
*/
|
*/
|
||||||
public SAMFileHeader getHeader() {
|
public SAMFileHeader getHeader() {
|
||||||
return header;
|
return header;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the merging header.
|
||||||
|
*
|
||||||
|
* @return a SamFileHeaderMerger that includes the set of SAM files we were created with
|
||||||
|
*/
|
||||||
|
private SamFileHeaderMerger createHeaderMerger() {
|
||||||
|
List<SAMFileReader> lst = GetReaderList();
|
||||||
|
return new SamFileHeaderMerger(lst, SORT_ORDER);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* seek
|
* seek
|
||||||
|
|
@ -203,10 +198,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
MergingSamRecordIterator2 iter = null;
|
MergingSamRecordIterator2 iter = null;
|
||||||
|
|
||||||
if (!intoUnmappedReads) {
|
if (!intoUnmappedReads) {
|
||||||
// make a merging iterator for this record
|
|
||||||
iter = new MergingSamRecordIterator2(headerMerger);
|
iter = new MergingSamRecordIterator2(headerMerger);
|
||||||
|
bound = fastMappedReadSeek(shard.getSize(), iter);
|
||||||
bound = fastMappedReadSeek(shard.getSize(), iter);
|
|
||||||
}
|
}
|
||||||
if ((bound == null || intoUnmappedReads) && includeUnmappedReads) {
|
if ((bound == null || intoUnmappedReads) && includeUnmappedReads) {
|
||||||
if (iter != null) {
|
if (iter != null) {
|
||||||
|
|
@ -218,18 +211,21 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
if (bound == null) {
|
if (bound == null) {
|
||||||
shard.signalDone();
|
shard.signalDone();
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), 0);
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), 0);
|
||||||
}
|
}
|
||||||
return bound;
|
return bound;
|
||||||
}
|
}
|
||||||
|
|
||||||
private SamFileHeaderMerger createHeaderMerger() {
|
/**
|
||||||
// TODO: make extremely less horrible
|
* If we're in by-read mode, this indicates if we want
|
||||||
List<SAMFileReader> lst = GetReaderList();
|
* to see unmapped reads too. Only seeing mapped reads
|
||||||
|
* is much faster, but most BAM files have significant
|
||||||
// now merge the headers
|
* unmapped read counts.
|
||||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst, SORT_ORDER);
|
*
|
||||||
return headerMerger;
|
* @param seeUnMappedReads true to see unmapped reads, false otherwise
|
||||||
|
*/
|
||||||
|
public void viewUnmappedReads(boolean seeUnMappedReads) {
|
||||||
|
includeUnmappedReads = seeUnMappedReads;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -242,7 +238,6 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* @throws SimpleDataSourceLoadException
|
* @throws SimpleDataSourceLoadException
|
||||||
*/
|
*/
|
||||||
private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
private BoundedReadIterator toUnmappedReads(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
||||||
BoundedReadIterator bound;// is this the first time we're doing this?
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
SAMRecord d = null;
|
SAMRecord d = null;
|
||||||
while (iter.hasNext()) {
|
while (iter.hasNext()) {
|
||||||
|
|
@ -270,15 +265,15 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// we're good, increment our read cout
|
// we're not out of unmapped reads, so increment our read cout
|
||||||
this.readsTaken += readCount;
|
this.readsTaken += readCount;
|
||||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount);
|
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* unmapped reads.
|
* A seek function for unmapped reads.
|
||||||
*
|
*
|
||||||
* @param readCount how many reads to retrieve
|
* @param readCount how many reads to retrieve
|
||||||
* @param iter the iterator to use
|
* @param iter the iterator to use
|
||||||
|
|
@ -286,16 +281,10 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
* @throws SimpleDataSourceLoadException
|
* @throws SimpleDataSourceLoadException
|
||||||
*/
|
*/
|
||||||
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
private BoundedReadIterator fastMappedReadSeek(long readCount, MergingSamRecordIterator2 iter) throws SimpleDataSourceLoadException {
|
||||||
BoundedReadIterator bound;// is this the first time we're doing this?
|
|
||||||
if (lastReadPos == null) {
|
if (lastReadPos == null) {
|
||||||
lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
|
return InitialReadIterator(readCount, iter);
|
||||||
iter.queryContained(lastReadPos.getContig(), 1, -1);
|
} else {
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount);
|
BoundedReadIterator bound;
|
||||||
this.readsTaken = readCount;
|
|
||||||
}
|
|
||||||
// we're not at the beginning, not at the end, so we move forward with our ghastly plan...
|
|
||||||
else {
|
|
||||||
|
|
||||||
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
|
iter.queryContained(lastReadPos.getContig(), (int) lastReadPos.getStop(), -1);
|
||||||
|
|
||||||
// move the number of reads we read from the last pos
|
// move the number of reads we read from the last pos
|
||||||
|
|
@ -338,7 +327,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
SamFileHeaderMerger mg = createHeaderMerger();
|
SamFileHeaderMerger mg = createHeaderMerger();
|
||||||
iter = new MergingSamRecordIterator2(mg);
|
iter = new MergingSamRecordIterator2(mg);
|
||||||
iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE);
|
iter.queryContained(lastReadPos.getContig(), 1, Integer.MAX_VALUE);
|
||||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter),readCount);
|
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -363,11 +352,28 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
|
throw new StingException("Danger: weve run out reads in fastMappedReadSeek");
|
||||||
//return null;
|
//return null;
|
||||||
}
|
}
|
||||||
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads,iter), readCount);
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
|
|
||||||
|
// return the iterator
|
||||||
|
return bound;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// return the iterator
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* set the initial iterator
|
||||||
|
*
|
||||||
|
* @param readCount the number of reads
|
||||||
|
* @param iter the merging iterator
|
||||||
|
* @return a bounded read iterator at the first read position in the file.
|
||||||
|
*/
|
||||||
|
private BoundedReadIterator InitialReadIterator(long readCount, MergingSamRecordIterator2 iter) {
|
||||||
|
BoundedReadIterator bound;
|
||||||
|
lastReadPos = new GenomeLoc(iter.getHeader().getSequenceDictionary().getSequence(0).getSequenceIndex(), 0, 0);
|
||||||
|
iter.queryContained(lastReadPos.getContig(), 1, -1);
|
||||||
|
bound = new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||||
|
this.readsTaken = readCount;
|
||||||
return bound;
|
return bound;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
|
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
@ -61,7 +62,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Reduce
|
||||||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object execute( Walker walker, List<GenomeLoc> intervals ) {
|
public Object execute( Walker walker, GenomeLocSortedSet intervals ) {
|
||||||
// Fast fail for walkers not supporting TreeReducible interface.
|
// Fast fail for walkers not supporting TreeReducible interface.
|
||||||
if( !(walker instanceof TreeReducible) )
|
if( !(walker instanceof TreeReducible) )
|
||||||
throw new IllegalArgumentException("Hierarchical microscheduler only works with TreeReducible walkers");
|
throw new IllegalArgumentException("Hierarchical microscheduler only works with TreeReducible walkers");
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
@ -31,7 +32,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
||||||
* @param walker Computation to perform over dataset.
|
* @param walker Computation to perform over dataset.
|
||||||
* @param locations Subset of the dataset over which to walk.
|
* @param locations Subset of the dataset over which to walk.
|
||||||
*/
|
*/
|
||||||
public Object execute(Walker walker, List<GenomeLoc> locations) {
|
public Object execute(Walker walker, GenomeLocSortedSet locations) {
|
||||||
ShardStrategy shardStrategy = getShardStrategy(walker, reference, locations);
|
ShardStrategy shardStrategy = getShardStrategy(walker, reference, locations);
|
||||||
|
|
||||||
walker.initialize();
|
walker.initialize();
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
@ -101,7 +102,7 @@ public abstract class MicroScheduler {
|
||||||
* @param intervals A list of intervals over which to walk. Null for whole dataset.
|
* @param intervals A list of intervals over which to walk. Null for whole dataset.
|
||||||
* @return the return type of the walker
|
* @return the return type of the walker
|
||||||
*/
|
*/
|
||||||
public abstract Object execute( Walker walker, List<GenomeLoc> intervals);
|
public abstract Object execute( Walker walker, GenomeLocSortedSet intervals);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the sharding strategy given a driving data source.
|
* Get the sharding strategy given a driving data source.
|
||||||
|
|
@ -110,7 +111,7 @@ public abstract class MicroScheduler {
|
||||||
* @param intervals Intervals to use when limiting sharding.
|
* @param intervals Intervals to use when limiting sharding.
|
||||||
* @return Sharding strategy for this driving data source.
|
* @return Sharding strategy for this driving data source.
|
||||||
*/
|
*/
|
||||||
protected ShardStrategy getShardStrategy( Walker walker, ReferenceSequenceFile drivingDataSource, List<GenomeLoc> intervals ) {
|
protected ShardStrategy getShardStrategy( Walker walker, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals ) {
|
||||||
ShardStrategy shardStrategy = null;
|
ShardStrategy shardStrategy = null;
|
||||||
|
|
||||||
if( walker instanceof LocusWalker ) {
|
if( walker instanceof LocusWalker ) {
|
||||||
|
|
|
||||||
|
|
@ -479,7 +479,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable {
|
||||||
* @return A GenomeLoc with the same contents as the current loc.
|
* @return A GenomeLoc with the same contents as the current loc.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Object clone() {
|
public GenomeLoc clone() {
|
||||||
return new GenomeLoc(this);
|
return new GenomeLoc(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ import net.sf.samtools.SAMSequenceRecord;
|
||||||
import java.util.AbstractSet;
|
import java.util.AbstractSet;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
|
@ -26,22 +27,22 @@ import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* <p/>
|
||||||
* @date May 22, 2009
|
* Class GenomeLocCollection
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class GenomeLocCollection
|
* a set of genome locations. This collection is self sorting,
|
||||||
* <p/>
|
* and will merge genome locations that are overlapping. The remove function
|
||||||
* a set of genome locations. This collection is self sorting,
|
* will also remove a region from the list, if the region to remove is a
|
||||||
* and will merge genome locations that are overlapping. The remove function
|
* partial interval of a region in the collection it will remove the region from
|
||||||
* will also remove a region from the list, if the region to remove is a
|
* that element.
|
||||||
* partial interval of a region in the collection it will remove the region from
|
|
||||||
* that element.
|
|
||||||
*/
|
*/
|
||||||
public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
// our private storage for the GenomeLoc's
|
// our private storage for the GenomeLoc's
|
||||||
private final ArrayList<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
|
private final ArrayList<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
|
||||||
|
|
||||||
public GenomeLocSortedSet() {}
|
/** default constructor */
|
||||||
|
public GenomeLocSortedSet() {
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get an iterator over this collection
|
* get an iterator over this collection
|
||||||
|
|
@ -72,7 +73,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add a genomeLoc to the collection, simply inserting in order into the set
|
* add a genomeLoc to the collection, simply inserting in order into the set
|
||||||
|
*
|
||||||
* @param e the GenomeLoc to add
|
* @param e the GenomeLoc to add
|
||||||
|
*
|
||||||
* @return true
|
* @return true
|
||||||
*/
|
*/
|
||||||
public boolean add(GenomeLoc e) {
|
public boolean add(GenomeLoc e) {
|
||||||
|
|
@ -82,7 +85,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
int index = 0;
|
int index = 0;
|
||||||
while (index < mArray.size()) {
|
while (index < mArray.size()) {
|
||||||
if (!e.isPast(mArray.get(index))) {
|
if (!e.isPast(mArray.get(index))) {
|
||||||
mArray.add(index,e);
|
mArray.add(index, e);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
++index;
|
++index;
|
||||||
|
|
@ -96,6 +99,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
* If it's not overlapping then we add it in sorted order.
|
* If it's not overlapping then we add it in sorted order.
|
||||||
*
|
*
|
||||||
* @param e the GenomeLoc to add to the collection
|
* @param e the GenomeLoc to add to the collection
|
||||||
|
*
|
||||||
* @return true, if the GenomeLoc could be added to the collection
|
* @return true, if the GenomeLoc could be added to the collection
|
||||||
*/
|
*/
|
||||||
public boolean addRegion(GenomeLoc e) {
|
public boolean addRegion(GenomeLoc e) {
|
||||||
|
|
@ -112,7 +116,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
for (GenomeLoc g : mArray) {
|
for (GenomeLoc g : mArray) {
|
||||||
if (g.contiguousP(e)) {
|
if (g.contiguousP(e)) {
|
||||||
GenomeLoc c = g.merge(e);
|
GenomeLoc c = g.merge(e);
|
||||||
mArray.set(mArray.indexOf(g),c);
|
mArray.set(mArray.indexOf(g), c);
|
||||||
haveAdded = true;
|
haveAdded = true;
|
||||||
} else if ((g.getContigIndex() == e.getContigIndex()) &&
|
} else if ((g.getContigIndex() == e.getContigIndex()) &&
|
||||||
(e.getStart() < g.getStart()) && !haveAdded) {
|
(e.getStart() < g.getStart()) && !haveAdded) {
|
||||||
|
|
@ -132,7 +136,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
/**
|
/**
|
||||||
* remove an element from the set. Given a specific genome location, this function will
|
* remove an element from the set. Given a specific genome location, this function will
|
||||||
* remove all regions in the element set that overlap the specified region.
|
* remove all regions in the element set that overlap the specified region.
|
||||||
|
*
|
||||||
* @param e the genomic range to remove
|
* @param e the genomic range to remove
|
||||||
|
*
|
||||||
* @return true if a removal action was performed, false if the collection was unchanged.
|
* @return true if a removal action was performed, false if the collection was unchanged.
|
||||||
*/
|
*/
|
||||||
public boolean removeRegion(GenomeLoc e) {
|
public boolean removeRegion(GenomeLoc e) {
|
||||||
|
|
@ -148,7 +154,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
*/
|
*/
|
||||||
for (GenomeLoc g : mArray) {
|
for (GenomeLoc g : mArray) {
|
||||||
if (g.overlapsP(e)) {
|
if (g.overlapsP(e)) {
|
||||||
if (g.compareTo(e) == 0) {
|
if (g.equals(e)) {
|
||||||
mArray.remove(mArray.indexOf(g));
|
mArray.remove(mArray.indexOf(g));
|
||||||
return true;
|
return true;
|
||||||
} else if (g.containsP(e)) {
|
} else if (g.containsP(e)) {
|
||||||
|
|
@ -162,11 +168,15 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
* |------| + |--------|
|
* |------| + |--------|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
GenomeLoc before = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1);
|
GenomeLoc before = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
||||||
GenomeLoc after = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
GenomeLoc after = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||||
int index = mArray.indexOf(g);
|
int index = mArray.indexOf(g);
|
||||||
mArray.add(index, after);
|
if (after.getStop() - after.getStart() > 0) {
|
||||||
mArray.add(index, before);
|
mArray.add(index, after);
|
||||||
|
}
|
||||||
|
if (before.getStop() - before.getStart() > 0) {
|
||||||
|
mArray.add(index, before);
|
||||||
|
}
|
||||||
mArray.remove(mArray.indexOf(g));
|
mArray.remove(mArray.indexOf(g));
|
||||||
return true;
|
return true;
|
||||||
} else if (e.containsP(g)) {
|
} else if (e.containsP(g)) {
|
||||||
|
|
@ -194,12 +204,12 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
* |------------- g ----------|
|
* |------------- g ----------|
|
||||||
* |------------ e -----------|
|
* |------------ e -----------|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (e.getStart() < g.getStart()) {
|
if (e.getStart() < g.getStart()) {
|
||||||
l = new GenomeLoc(g.getContigIndex(), e.getStop()+1, g.getStop());
|
l = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||||
} else {
|
} else {
|
||||||
l = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1);
|
l = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
||||||
}
|
}
|
||||||
// replace g with the new region
|
// replace g with the new region
|
||||||
mArray.set(mArray.indexOf(g), l);
|
mArray.set(mArray.indexOf(g), l);
|
||||||
|
|
@ -212,14 +222,45 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a list of genomic locations, given a reference sequence
|
* create a list of genomic locations, given a reference sequence
|
||||||
|
*
|
||||||
* @param dict the sequence dictionary to create a collection from
|
* @param dict the sequence dictionary to create a collection from
|
||||||
|
*
|
||||||
* @return the GenomeLocSet of all references sequences as GenomeLoc's
|
* @return the GenomeLocSet of all references sequences as GenomeLoc's
|
||||||
*/
|
*/
|
||||||
public static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) {
|
public static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) {
|
||||||
GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet();
|
GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet();
|
||||||
for (SAMSequenceRecord record : dict.getSequences()) {
|
for (SAMSequenceRecord record : dict.getSequences()) {
|
||||||
returnSortedSet.add(new GenomeLoc(record.getSequenceIndex(),1,record.getSequenceLength()));
|
returnSortedSet.add(new GenomeLoc(record.getSequenceIndex(), 1, record.getSequenceLength()));
|
||||||
}
|
}
|
||||||
return returnSortedSet;
|
return returnSortedSet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a sorted genome location set from a list of GenomeLocs.
|
||||||
|
* @param locs the list<GenomeLoc>
|
||||||
|
* @return the sorted genome loc list
|
||||||
|
*/
|
||||||
|
public static GenomeLocSortedSet createSetFromList(List<GenomeLoc> locs) {
|
||||||
|
GenomeLocSortedSet set = new GenomeLocSortedSet();
|
||||||
|
for (GenomeLoc l: locs) {
|
||||||
|
set.add(l);
|
||||||
|
}
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return a deep copy of this collection.
|
||||||
|
*
|
||||||
|
* @return a new GenomeLocSortedSet, indentical to the current GenomeLocSortedSet.
|
||||||
|
*/
|
||||||
|
public GenomeLocSortedSet clone() {
|
||||||
|
GenomeLocSortedSet ret = new GenomeLocSortedSet();
|
||||||
|
for (GenomeLoc loc : this.mArray) {
|
||||||
|
// ensure a deep copy
|
||||||
|
ret.mArray.add(new GenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStop()));
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,73 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class IntervalReadShardTest
|
||||||
|
* <p/>
|
||||||
|
* Tests for the IntervalReadShard class.
|
||||||
|
*/
|
||||||
|
public class IntervalReadShardTest extends BaseTest {
|
||||||
|
|
||||||
|
private IntervalReadShard shard = null;
|
||||||
|
private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);
|
||||||
|
private static final int NUMBER_OF_CHROMOSOMES = 5;
|
||||||
|
private static final int STARTING_CHROMOSOME = 1;
|
||||||
|
private static final int CHROMOSOME_SIZE = 1000;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() {
|
||||||
|
GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void simpleReturn() {
|
||||||
|
GenomeLoc loc = new GenomeLoc(1, 1, 100);
|
||||||
|
shard = new IntervalReadShard(loc);
|
||||||
|
assertTrue(shard.getGenomeLoc().equals(loc));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void ensureNotReference() {
|
||||||
|
GenomeLoc loc = new GenomeLoc(1, 1, 100);
|
||||||
|
shard = new IntervalReadShard(loc);
|
||||||
|
assertTrue(shard.getGenomeLoc() != loc && shard.getGenomeLoc().equals(loc));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -1,142 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.dataSources.shards;
|
|
||||||
|
|
||||||
import static junit.framework.Assert.assertEquals;
|
|
||||||
import static junit.framework.Assert.fail;
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
|
||||||
import org.junit.*;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* User: aaron
|
|
||||||
* Date: May 14, 2009
|
|
||||||
* Time: 3:52:57 PM
|
|
||||||
*
|
|
||||||
* The Broad Institute
|
|
||||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
||||||
* This software and its documentation are copyright 2009 by the
|
|
||||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
||||||
*
|
|
||||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
||||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author aaron
|
|
||||||
* @version 1.0
|
|
||||||
* @date May 14, 2009
|
|
||||||
* <p/>
|
|
||||||
* Class IntervalShardStrategyTest
|
|
||||||
* <p/>
|
|
||||||
* Tests for the INTERVAL shard strategy
|
|
||||||
*/
|
|
||||||
public class IntervalShardStrategyTest extends BaseTest {
|
|
||||||
|
|
||||||
private static FastaSequenceFile2 seq;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function (because of the @BeforeClass tag) gets called only once ever,
|
|
||||||
* before any tests are run
|
|
||||||
*/
|
|
||||||
@BeforeClass
|
|
||||||
public static void doBeforeAnyTests() {
|
|
||||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tears down the test fixture after each call.
|
|
||||||
* <p/>
|
|
||||||
* Called after every test case method.
|
|
||||||
*/
|
|
||||||
@AfterClass
|
|
||||||
public static void doAfterAllTests() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function does the setup of our parser, before each method call.
|
|
||||||
* <p/>
|
|
||||||
* Called before every test case method.
|
|
||||||
*/
|
|
||||||
@Before
|
|
||||||
public void doForEachTest() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tears down the test fixture after each call.
|
|
||||||
* <p/>
|
|
||||||
* Called after every test case method.
|
|
||||||
*/
|
|
||||||
@After
|
|
||||||
public void undoForEachTest() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Tests that we got a string parameter in correctly */
|
|
||||||
@Test
|
|
||||||
public void testIntervalGenomeCycle() throws InterruptedException {
|
|
||||||
logger.warn("Executing testIntervalGenomeCycle");
|
|
||||||
|
|
||||||
SAMSequenceDictionary dic = seq.getSequenceDictionary();
|
|
||||||
|
|
||||||
|
|
||||||
// setup a list of genome locs that represent the whole file
|
|
||||||
SAMSequenceRecord s = dic.getSequence(1);
|
|
||||||
int stop = s.getSequenceLength();
|
|
||||||
int size = 10000;
|
|
||||||
int location = 1;
|
|
||||||
|
|
||||||
GenomeLoc.setupRefContigOrdering(dic);
|
|
||||||
// keep track of the number of genome locs we build
|
|
||||||
int genomeLocs = 0;
|
|
||||||
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
|
||||||
try {
|
|
||||||
while (location + size < stop) {
|
|
||||||
// lets make up some fake locations
|
|
||||||
GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
|
|
||||||
logger.debug("loc = " + location);
|
|
||||||
|
|
||||||
// let's move the location up, with a size space
|
|
||||||
location += (size * 2);
|
|
||||||
|
|
||||||
// add our current location to the list
|
|
||||||
locations.add(gl);
|
|
||||||
|
|
||||||
// add another genome location
|
|
||||||
++genomeLocs;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
logger.debug("Location count = " + genomeLocs);
|
|
||||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL, seq.getSequenceDictionary(), 0, locations);
|
|
||||||
int shardCount = 0;
|
|
||||||
try {
|
|
||||||
for (Shard sh : strategy) {
|
|
||||||
GenomeLoc l = sh.getGenomeLoc();
|
|
||||||
GenomeLoc truth = locations.get(shardCount);
|
|
||||||
if (l.compareTo(truth) != 0) {
|
|
||||||
String truthStr = truth.getContig() + ":" + truth.getStart() + ":" + truth.getStop();
|
|
||||||
String lStr = l.getContig() + ":" + l.getStart() + ":" + l.getStop();
|
|
||||||
fail("Genome loc " + truthStr + " doesn't equal " + lStr);
|
|
||||||
}
|
|
||||||
shardCount++;
|
|
||||||
}
|
|
||||||
assertEquals(shardCount, genomeLocs);
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
fail("testIntervalGenomeCycle: ne exception expected");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,79 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class LocusIntervalShardStrategyTest
|
||||||
|
* <p/>
|
||||||
|
* Tests the LocusIntervalShardStrategy class.
|
||||||
|
*/
|
||||||
|
public class LocusIntervalShardStrategyTest extends BaseTest {
|
||||||
|
private GenomeLocSortedSet mSortedSet = null;
|
||||||
|
private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);
|
||||||
|
private static final int NUMBER_OF_CHROMOSOMES = 5;
|
||||||
|
private static final int STARTING_CHROMOSOME = 1;
|
||||||
|
private static final int CHROMOSOME_SIZE = 1000;
|
||||||
|
private LocusIntervalShardStrategy strat = null;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() {
|
||||||
|
GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary());
|
||||||
|
mSortedSet = new GenomeLocSortedSet();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOneToOneness() {
|
||||||
|
for (int x = 0; x < 100; x++) {
|
||||||
|
GenomeLoc loc = new GenomeLoc(0,(x*10)+1, (x*10)+8);
|
||||||
|
mSortedSet.add(loc);
|
||||||
|
}
|
||||||
|
strat = new LocusIntervalShardStrategy(header.getSequenceDictionary(),mSortedSet);
|
||||||
|
int counter = 0;
|
||||||
|
while (strat.hasNext()) {
|
||||||
|
++counter;
|
||||||
|
GenomeLoc loc = strat.next().getGenomeLoc();
|
||||||
|
long stop = loc.getStop();
|
||||||
|
long start = loc.getStart();
|
||||||
|
long length = stop - start;
|
||||||
|
assertTrue(length == 7);
|
||||||
|
}
|
||||||
|
assertTrue(counter == 100);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,124 @@
|
||||||
|
package org.broadinstitute.sting.gatk.dataSources.shards;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.Before;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class ReadIntervalShardStrategyTest
|
||||||
|
* <p/>
|
||||||
|
* Tests the ReadIntervalShardStrategy class
|
||||||
|
*/
|
||||||
|
public class ReadIntervalShardStrategyTest extends BaseTest {
|
||||||
|
|
||||||
|
private GenomeLocSortedSet mSortedSet = null;
|
||||||
|
private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);
|
||||||
|
private static final int NUMBER_OF_CHROMOSOMES = 5;
|
||||||
|
private static final int STARTING_CHROMOSOME = 1;
|
||||||
|
private static final int CHROMOSOME_SIZE = 1000;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() {
|
||||||
|
GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary());
|
||||||
|
mSortedSet = new GenomeLocSortedSet();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = StingException.class)
|
||||||
|
public void testExceptionOnEmpty() {
|
||||||
|
ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSingleChromosomeFunctionality() {
|
||||||
|
GenomeLoc loc = new GenomeLoc(1, 1, 1000);
|
||||||
|
mSortedSet.add(loc);
|
||||||
|
ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet);
|
||||||
|
int counter = 0;
|
||||||
|
while (strat.hasNext()) {
|
||||||
|
Shard d = strat.next();
|
||||||
|
counter++;
|
||||||
|
}
|
||||||
|
assertEquals(10, counter);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMultipleChromosomeFunctionality() {
|
||||||
|
for (int x = 0; x < 5; x++) {
|
||||||
|
GenomeLoc loc = new GenomeLoc(x, 1, 1000);
|
||||||
|
mSortedSet.add(loc);
|
||||||
|
}
|
||||||
|
ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet);
|
||||||
|
int counter = 0;
|
||||||
|
while (strat.hasNext()) {
|
||||||
|
Shard d = strat.next();
|
||||||
|
counter++;
|
||||||
|
}
|
||||||
|
assertEquals(50, counter);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOddSizeShardFunctionality() {
|
||||||
|
for (int x = 0; x < 5; x++) {
|
||||||
|
GenomeLoc loc = new GenomeLoc(x, 1, 1000);
|
||||||
|
mSortedSet.add(loc);
|
||||||
|
}
|
||||||
|
ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 789, mSortedSet);
|
||||||
|
int counter = 0;
|
||||||
|
while (strat.hasNext()) {
|
||||||
|
Shard d = strat.next();
|
||||||
|
if (counter % 2 == 0) {
|
||||||
|
assertEquals(1, d.getGenomeLoc().getStart());
|
||||||
|
assertEquals(789, d.getGenomeLoc().getStop());
|
||||||
|
} else {
|
||||||
|
assertEquals(790, d.getGenomeLoc().getStart());
|
||||||
|
assertEquals(1000, d.getGenomeLoc().getStop());
|
||||||
|
}
|
||||||
|
counter++;
|
||||||
|
}
|
||||||
|
assertEquals(10, counter);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = UnsupportedOperationException.class)
|
||||||
|
public void testRemove() {
|
||||||
|
GenomeLoc loc = new GenomeLoc(1, 1, 1000);
|
||||||
|
mSortedSet.add(loc);
|
||||||
|
ReadIntervalShardStrategy strat = new ReadIntervalShardStrategy(header.getSequenceDictionary(), 100, mSortedSet);
|
||||||
|
strat.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -4,10 +4,14 @@ import static junit.framework.Assert.assertEquals;
|
||||||
import static junit.framework.Assert.fail;
|
import static junit.framework.Assert.fail;
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||||
import org.junit.*;
|
import org.junit.*;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -32,139 +36,62 @@ import java.util.ArrayList;
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* @version 1.0
|
||||||
* @date Apr 8, 2009
|
|
||||||
* <p/>
|
|
||||||
* Class ShardFactoryTest
|
|
||||||
* <p/>
|
|
||||||
* Tests the shard strategy factory. This tests the whole sharding interface, and should be
|
|
||||||
* split in the future into seperate test cases.
|
|
||||||
* TODO: split out for the seperate sharding classes
|
|
||||||
*/
|
*/
|
||||||
public class ShardStrategyFactoryTest extends BaseTest {
|
public class ShardStrategyFactoryTest extends BaseTest {
|
||||||
|
|
||||||
private static FastaSequenceFile2 seq;
|
private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);
|
||||||
|
private static final int NUMBER_OF_CHROMOSOMES = 5;
|
||||||
|
private static final int STARTING_CHROMOSOME = 1;
|
||||||
|
private static final int CHROMOSOME_SIZE = 1000;
|
||||||
|
private GenomeLocSortedSet set = null;
|
||||||
|
|
||||||
/**
|
|
||||||
* This function (because of the @BeforeClass tag) gets called only once ever,
|
|
||||||
* before any tests are run
|
|
||||||
*/
|
|
||||||
@BeforeClass
|
|
||||||
public static void doBeforeAnyTests() {
|
|
||||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tears down the test fixture after each call.
|
|
||||||
* <p/>
|
|
||||||
* Called after every test case method.
|
|
||||||
*/
|
|
||||||
@AfterClass
|
|
||||||
public static void doAfterAllTests() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function does the setup of our parser, before each method call.
|
|
||||||
* <p/>
|
|
||||||
* Called before every test case method.
|
|
||||||
*/
|
|
||||||
@Before
|
@Before
|
||||||
public void doForEachTest() {
|
public void setup() {
|
||||||
|
GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary());
|
||||||
|
set = new GenomeLocSortedSet();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Tears down the test fixture after each call.
|
|
||||||
* <p/>
|
|
||||||
* Called after every test case method.
|
|
||||||
*/
|
|
||||||
@After
|
|
||||||
public void undoForEachTest() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Tests that we got a string parameter in correctly */
|
|
||||||
@Test
|
@Test
|
||||||
public void testFullGenomeCycle() {
|
public void testReadNonInterval() {
|
||||||
logger.warn("Executing testFullGenomeCycle");
|
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100);
|
||||||
|
assertTrue(st instanceof ReadShardStrategy);
|
||||||
GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
|
||||||
|
|
||||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 100000);
|
|
||||||
int shardCount = 0;
|
|
||||||
try {
|
|
||||||
|
|
||||||
for (Shard s : strategy) {
|
|
||||||
GenomeLoc l = s.getGenomeLoc();
|
|
||||||
//logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
|
||||||
shardCount++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// check to make sure we got apple shards
|
|
||||||
//logger.debug("shardCount : " + shardCount + " seq size = " + seq.getSequenceDictionary().size());
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
fail("We Shouldn't of seen an exception! : " + e.getMessage() + "; shard count " + shardCount);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Tests that we got a string parameter in correctly */
|
|
||||||
@Test
|
@Test
|
||||||
public void testIntervalGenomeCycle() throws InterruptedException {
|
public void testReadInterval() {
|
||||||
logger.warn("Executing testIntervalGenomeCycle");
|
GenomeLoc l = new GenomeLoc(0,1,100);
|
||||||
|
set.add(l);
|
||||||
SAMSequenceDictionary dic = seq.getSequenceDictionary();
|
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,header.getSequenceDictionary(),100,set);
|
||||||
SAMSequenceRecord s = dic.getSequence(1);
|
assertTrue(st instanceof ReadIntervalShardStrategy);
|
||||||
// Character stream writing
|
|
||||||
|
|
||||||
|
|
||||||
int stop = s.getSequenceLength();
|
|
||||||
int size = 10000;
|
|
||||||
int location = 1;
|
|
||||||
GenomeLoc.setupRefContigOrdering(dic);
|
|
||||||
// keep track of the number of genome locs we build
|
|
||||||
int genomeLocs = 0;
|
|
||||||
ArrayList<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
|
||||||
|
|
||||||
try {
|
|
||||||
while (location + size < stop) {
|
|
||||||
logger.debug("s = " + s.getSequenceName() + " " + location + " " + size);
|
|
||||||
// lets make up some fake locations
|
|
||||||
GenomeLoc gl = new GenomeLoc(s.getSequenceName(), location, location + size - 1);
|
|
||||||
logger.debug("loc = " + location);
|
|
||||||
|
|
||||||
// let's move the location up, with a size space
|
|
||||||
location += (size * 2);
|
|
||||||
|
|
||||||
// add our current location to the list
|
|
||||||
locations.add(gl);
|
|
||||||
|
|
||||||
// add another genome location
|
|
||||||
++genomeLocs;
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
logger.debug("Location count = " + genomeLocs);
|
|
||||||
ShardStrategy strategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, seq.getSequenceDictionary(), 5000, locations);
|
|
||||||
int shardCount = 0;
|
|
||||||
try {
|
|
||||||
for (Shard sh : strategy) {
|
|
||||||
GenomeLoc l = sh.getGenomeLoc();
|
|
||||||
|
|
||||||
logger.debug("Shard start: " + l.getStart() + " stop " + l.getStop() + " contig " + l.getContig());
|
|
||||||
shardCount++;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.debug("Shard count = " + shardCount);
|
|
||||||
assertEquals(shardCount, genomeLocs * 2);
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
fail("testIntervalGenomeCycle: ne exception expected");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinearNonInterval() {
|
||||||
|
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100);
|
||||||
|
assertTrue(st instanceof LinearLocusShardStrategy);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExpNonInterval() {
|
||||||
|
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100);
|
||||||
|
assertTrue(st instanceof ExpGrowthLocusShardStrategy);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExpInterval() {
|
||||||
|
GenomeLoc l = new GenomeLoc(0,1,100);
|
||||||
|
set.add(l);
|
||||||
|
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.EXPONENTIAL,header.getSequenceDictionary(),100,set);
|
||||||
|
assertTrue(st instanceof ExpGrowthLocusShardStrategy);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinearInterval() {
|
||||||
|
GenomeLoc l = new GenomeLoc(0,1,100);
|
||||||
|
set.add(l);
|
||||||
|
ShardStrategy st = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,header.getSequenceDictionary(),100,set);
|
||||||
|
assertTrue(st instanceof LinearLocusShardStrategy);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,7 @@ public class SAMByReadsTest extends BaseTest {
|
||||||
|
|
||||||
final int targetReadCount = 5000;
|
final int targetReadCount = 5000;
|
||||||
|
|
||||||
ShardStrategy shardStrategy = ShardStrategyFactory.shatterByReadCount(seq.getSequenceDictionary(),targetReadCount);
|
ShardStrategy shardStrategy = ShardStrategyFactory.shatter(ShardStrategyFactory.SHATTER_STRATEGY.READS,seq.getSequenceDictionary(),targetReadCount);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
SAMDataSource data = new SAMDataSource(reads);
|
SAMDataSource data = new SAMDataSource(reads);
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,6 @@ import java.util.Iterator;
|
||||||
/**
|
/**
|
||||||
* @author aaron
|
* @author aaron
|
||||||
* @version 1.0
|
* @version 1.0
|
||||||
* @date May 22, 2009
|
|
||||||
* <p/>
|
* <p/>
|
||||||
* Class GenomeLocSetTest
|
* Class GenomeLocSetTest
|
||||||
* <p/>
|
* <p/>
|
||||||
|
|
@ -142,6 +141,32 @@ public class GenomeLocSortedSetTest extends BaseTest {
|
||||||
assertTrue(loc.getContigIndex() == 1);
|
assertTrue(loc.getContigIndex() == 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void deleteAllByRegion() {
|
||||||
|
GenomeLoc e = new GenomeLoc(1, 1, 100);
|
||||||
|
mSortedSet.add(e);
|
||||||
|
for (int x = 1; x < 101; x++) {
|
||||||
|
GenomeLoc del = new GenomeLoc(1,x,x);
|
||||||
|
mSortedSet.removeRegion(del);
|
||||||
|
}
|
||||||
|
assertTrue(mSortedSet.isEmpty());
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void deleteSomeByRegion() {
|
||||||
|
GenomeLoc e = new GenomeLoc(1, 1, 100);
|
||||||
|
mSortedSet.add(e);
|
||||||
|
for (int x = 1; x < 50; x++) {
|
||||||
|
GenomeLoc del = new GenomeLoc(1,x,x);
|
||||||
|
mSortedSet.removeRegion(del);
|
||||||
|
}
|
||||||
|
assertTrue(!mSortedSet.isEmpty());
|
||||||
|
assertTrue(mSortedSet.size() == 1);
|
||||||
|
GenomeLoc loc = mSortedSet.iterator().next();
|
||||||
|
assertTrue(loc.getStop() == 100);
|
||||||
|
assertTrue(loc.getStart() == 50);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void deleteSuperRegion() {
|
public void deleteSuperRegion() {
|
||||||
GenomeLoc e = new GenomeLoc(1, 10, 20);
|
GenomeLoc e = new GenomeLoc(1, 10, 20);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue