added the ability to create interval lists directly from a ROD, using the command line arg '-BTI' (long name '--rodToIntervalTrackName'). The parameter to this arg is the name of the ROD track, which must be a track name specified in the -B option.
Using this feature, sites covered by the target ROD will be iterated over. This list of intevals generated is merged with any intervals from the -L and -XL args, and the Walker is run over the resulting merged list. WARNING: for very large ROD's this can be costly. Consider this experimental for now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3134 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
20cc2a85a4
commit
e148a3ac61
|
|
@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
|
|||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||
|
|
@ -176,12 +177,12 @@ public class GenomeAnalysisEngine {
|
|||
private void initializeIntervals() {
|
||||
|
||||
// return null if no interval arguments at all
|
||||
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null))
|
||||
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
|
||||
return;
|
||||
|
||||
else {
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
|
||||
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
|
||||
|
||||
|
|
@ -222,28 +223,32 @@ public class GenomeAnalysisEngine {
|
|||
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
|
||||
|
||||
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
|
||||
rawIntervals.addAll(checkRODToIntervalArgument()); // add any RODs-to-intervals we have
|
||||
|
||||
for (String argument : argList) {
|
||||
if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to
|
||||
// ensure that the arg list isn't null before looping.
|
||||
for (String argument : argList) {
|
||||
|
||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||
if (argument.equals("all")) {
|
||||
if (argList.size() != 1) {
|
||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||
if (argument.equals("all")) {
|
||||
if (argList.size() != 1) {
|
||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// separate argument on semicolon first
|
||||
for (String fileOrInterval : argument.split(";")) {
|
||||
// separate argument on semicolon first
|
||||
for (String fileOrInterval : argument.split(";")) {
|
||||
|
||||
// if it's a file, add items to raw interval list
|
||||
if (isFile(fileOrInterval))
|
||||
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
|
||||
// if it's a file, add items to raw interval list
|
||||
if (isFile(fileOrInterval))
|
||||
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
|
||||
|
||||
// otherwise treat as an interval -> parse and add to raw interval list
|
||||
else {
|
||||
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
|
||||
// otherwise treat as an interval -> parse and add to raw interval list
|
||||
else {
|
||||
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -261,6 +266,30 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
|
||||
*/
|
||||
// TODO: this function uses toLowerCase to work with the current ROD system, fix it if we make ROD names case-sensitive
|
||||
private static List<GenomeLoc> checkRODToIntervalArgument() {
|
||||
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(instance.rodDataSources);
|
||||
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
|
||||
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||
if (rodNames != null && instance.argCollection.RODToInterval != null) {
|
||||
String rodName = GenomeAnalysisEngine.instance.argCollection.RODToInterval;
|
||||
|
||||
// check to make sure we have a rod of that name
|
||||
if (!rodNames.containsKey(rodName.toLowerCase()))
|
||||
throw new StingException("--rodToIntervalTrackName (-BTI) was pass the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
|
||||
|
||||
for (String str : rodNames.keySet())
|
||||
if (str.toLowerCase().equals(rodName.toLowerCase())) {
|
||||
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData());
|
||||
ret.addAll(intervalGenerator.toGenomeLocList());
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if string argument was intented as a file
|
||||
* Accepted file extensions: .list, .interval_list, .bed, .picard
|
||||
|
|
|
|||
|
|
@ -76,6 +76,10 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form <name>,<type>,<file>", required = false)
|
||||
public ArrayList<String> RODBindings = new ArrayList<String>();
|
||||
|
||||
@Element(required = false)
|
||||
@Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false)
|
||||
public String RODToInterval = null;
|
||||
|
||||
@Element(required = false)
|
||||
@Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
|
||||
public String DBSNPFile = null;
|
||||
|
|
@ -234,6 +238,7 @@ public class GATKArgumentCollection {
|
|||
* @return true if they're equal
|
||||
*/
|
||||
public boolean equals(GATKArgumentCollection other) {
|
||||
if (other == null) return false;
|
||||
if (other.samFiles.size() != samFiles.size()) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -318,6 +323,10 @@ public class GATKArgumentCollection {
|
|||
if (other.intervalMerging != this.intervalMerging) {
|
||||
return false;
|
||||
}
|
||||
if ((other.RODToInterval == null && RODToInterval != null) ||
|
||||
(other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
|
||||
return false;
|
||||
}
|
||||
// if (other.enableRodWalkers != this.enableRodWalkers) {
|
||||
// return false;
|
||||
// }
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.utils;
|
||||
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class RMDIntervalGenerator
|
||||
*
|
||||
* Creates an interval list, given an RMDTrack
|
||||
*/
|
||||
public class RMDIntervalGenerator {
|
||||
public RMDTrack track;
|
||||
|
||||
/**
|
||||
* create a interval representation of a ROD track
|
||||
* @param track the track
|
||||
*/
|
||||
public RMDIntervalGenerator(RMDTrack track) {
|
||||
if (track == null) throw new IllegalArgumentException("Track cannot be null");
|
||||
this.track = track;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a genome location list from the interval track
|
||||
* @return a list of genome locations
|
||||
*/
|
||||
public List<GenomeLoc> toGenomeLocList() {
|
||||
Iterator<GATKFeature> iter = track.getIterator();
|
||||
List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
while (iter.hasNext()) {
|
||||
GATKFeature feature = iter.next();
|
||||
GenomeLoc loc = feature.getLocation();
|
||||
if (loc != null) locations.add(loc);
|
||||
}
|
||||
return locations;
|
||||
}
|
||||
|
||||
/**
|
||||
* return a map of reference meta data track names to RODS
|
||||
* @param sources the reference ordered data sources to get the names from
|
||||
* @return a map of reference meta data names to RODS
|
||||
*/
|
||||
public static Map<String,ReferenceOrderedDataSource> getRMDTrackNames(List<ReferenceOrderedDataSource> sources) {
|
||||
// get a list of the current rod names we're working with
|
||||
Map<String,ReferenceOrderedDataSource> rodNames = new HashMap<String,ReferenceOrderedDataSource>();
|
||||
for (ReferenceOrderedDataSource rod : sources) {
|
||||
rodNames.put(rod.getName(),rod);
|
||||
}
|
||||
return rodNames;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue