added the ability to create interval lists directly from a ROD, using the command line arg '-BTI' (long name '--rodToIntervalTrackName'). The parameter to this arg is the name of the ROD track, which must be a track name specified in the -B option.

Using this feature, sites covered by the target ROD will be iterated over.  This list of intevals generated is merged with any intervals from the -L and -XL args, and the Walker is run over the resulting merged list.

WARNING: for very large ROD's this can be costly.  Consider this experimental for now.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3134 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-04-08 05:14:41 +00:00
parent 20cc2a85a4
commit e148a3ac61
3 changed files with 114 additions and 18 deletions

View File

@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.bed.BedParser;
@ -176,12 +177,12 @@ public class GenomeAnalysisEngine {
private void initializeIntervals() {
// return null if no interval arguments at all
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null))
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
return;
else {
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
@ -222,28 +223,32 @@ public class GenomeAnalysisEngine {
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
rawIntervals.addAll(checkRODToIntervalArgument()); // add any RODs-to-intervals we have
for (String argument : argList) {
if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to
// ensure that the arg list isn't null before looping.
for (String argument : argList) {
// if any interval argument is '-L all', consider all loci by returning no intervals
if (argument.equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
// if any interval argument is '-L all', consider all loci by returning no intervals
if (argument.equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
}
return null;
}
return null;
}
// separate argument on semicolon first
for (String fileOrInterval : argument.split(";")) {
// separate argument on semicolon first
for (String fileOrInterval : argument.split(";")) {
// if it's a file, add items to raw interval list
if (isFile(fileOrInterval))
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
// if it's a file, add items to raw interval list
if (isFile(fileOrInterval))
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
// otherwise treat as an interval -> parse and add to raw interval list
else {
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
// otherwise treat as an interval -> parse and add to raw interval list
else {
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
}
}
}
}
@ -261,6 +266,30 @@ public class GenomeAnalysisEngine {
}
/**
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
*/
// TODO: this function uses toLowerCase to work with the current ROD system, fix it if we make ROD names case-sensitive
private static List<GenomeLoc> checkRODToIntervalArgument() {
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(instance.rodDataSources);
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
if (rodNames != null && instance.argCollection.RODToInterval != null) {
String rodName = GenomeAnalysisEngine.instance.argCollection.RODToInterval;
// check to make sure we have a rod of that name
if (!rodNames.containsKey(rodName.toLowerCase()))
throw new StingException("--rodToIntervalTrackName (-BTI) was pass the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
for (String str : rodNames.keySet())
if (str.toLowerCase().equals(rodName.toLowerCase())) {
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData());
ret.addAll(intervalGenerator.toGenomeLocList());
}
}
return ret;
}
/**
* Check if string argument was intented as a file
* Accepted file extensions: .list, .interval_list, .bed, .picard

View File

@ -76,6 +76,10 @@ public class GATKArgumentCollection {
@Argument(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form <name>,<type>,<file>", required = false)
public ArrayList<String> RODBindings = new ArrayList<String>();
@Element(required = false)
@Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false)
public String RODToInterval = null;
@Element(required = false)
@Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
public String DBSNPFile = null;
@ -234,6 +238,7 @@ public class GATKArgumentCollection {
* @return true if they're equal
*/
public boolean equals(GATKArgumentCollection other) {
if (other == null) return false;
if (other.samFiles.size() != samFiles.size()) {
return false;
}
@ -318,6 +323,10 @@ public class GATKArgumentCollection {
if (other.intervalMerging != this.intervalMerging) {
return false;
}
if ((other.RODToInterval == null && RODToInterval != null) ||
(other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
return false;
}
// if (other.enableRodWalkers != this.enableRodWalkers) {
// return false;
// }

View File

@ -0,0 +1,58 @@
package org.broadinstitute.sting.gatk.refdata.utils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.*;
/**
*
* @author aaron
*
* Class RMDIntervalGenerator
*
* Creates an interval list, given an RMDTrack
*/
public class RMDIntervalGenerator {
public RMDTrack track;
/**
* create a interval representation of a ROD track
* @param track the track
*/
public RMDIntervalGenerator(RMDTrack track) {
if (track == null) throw new IllegalArgumentException("Track cannot be null");
this.track = track;
}
/**
* create a genome location list from the interval track
* @return a list of genome locations
*/
public List<GenomeLoc> toGenomeLocList() {
Iterator<GATKFeature> iter = track.getIterator();
List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
while (iter.hasNext()) {
GATKFeature feature = iter.next();
GenomeLoc loc = feature.getLocation();
if (loc != null) locations.add(loc);
}
return locations;
}
/**
* return a map of reference meta data track names to RODS
* @param sources the reference ordered data sources to get the names from
* @return a map of reference meta data names to RODS
*/
public static Map<String,ReferenceOrderedDataSource> getRMDTrackNames(List<ReferenceOrderedDataSource> sources) {
// get a list of the current rod names we're working with
Map<String,ReferenceOrderedDataSource> rodNames = new HashMap<String,ReferenceOrderedDataSource>();
for (ReferenceOrderedDataSource rod : sources) {
rodNames.put(rod.getName(),rod);
}
return rodNames;
}
}