added the ability to create interval lists directly from a ROD, using the command line arg '-BTI' (long name '--rodToIntervalTrackName'). The parameter to this arg is the name of the ROD track, which must be a track name specified in the -B option.
Using this feature, sites covered by the target ROD will be iterated over. This list of intevals generated is merged with any intervals from the -L and -XL args, and the Walker is run over the resulting merged list. WARNING: for very large ROD's this can be costly. Consider this experimental for now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3134 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
20cc2a85a4
commit
e148a3ac61
|
|
@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||||
|
|
@ -176,12 +177,12 @@ public class GenomeAnalysisEngine {
|
||||||
private void initializeIntervals() {
|
private void initializeIntervals() {
|
||||||
|
|
||||||
// return null if no interval arguments at all
|
// return null if no interval arguments at all
|
||||||
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null))
|
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
else {
|
else {
|
||||||
// if include argument isn't given, create new set of all possible intervals
|
// if include argument isn't given, create new set of all possible intervals
|
||||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
|
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
|
||||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
|
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
|
||||||
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
|
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
|
||||||
|
|
||||||
|
|
@ -222,28 +223,32 @@ public class GenomeAnalysisEngine {
|
||||||
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
|
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
|
||||||
|
|
||||||
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
|
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
|
||||||
|
rawIntervals.addAll(checkRODToIntervalArgument()); // add any RODs-to-intervals we have
|
||||||
|
|
||||||
for (String argument : argList) {
|
if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to
|
||||||
|
// ensure that the arg list isn't null before looping.
|
||||||
|
for (String argument : argList) {
|
||||||
|
|
||||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||||
if (argument.equals("all")) {
|
if (argument.equals("all")) {
|
||||||
if (argList.size() != 1) {
|
if (argList.size() != 1) {
|
||||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||||
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// separate argument on semicolon first
|
// separate argument on semicolon first
|
||||||
for (String fileOrInterval : argument.split(";")) {
|
for (String fileOrInterval : argument.split(";")) {
|
||||||
|
|
||||||
// if it's a file, add items to raw interval list
|
// if it's a file, add items to raw interval list
|
||||||
if (isFile(fileOrInterval))
|
if (isFile(fileOrInterval))
|
||||||
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
|
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
|
||||||
|
|
||||||
// otherwise treat as an interval -> parse and add to raw interval list
|
// otherwise treat as an interval -> parse and add to raw interval list
|
||||||
else {
|
else {
|
||||||
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
|
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -261,6 +266,30 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
|
||||||
|
*/
|
||||||
|
// TODO: this function uses toLowerCase to work with the current ROD system, fix it if we make ROD names case-sensitive
|
||||||
|
private static List<GenomeLoc> checkRODToIntervalArgument() {
|
||||||
|
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(instance.rodDataSources);
|
||||||
|
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
|
||||||
|
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||||
|
if (rodNames != null && instance.argCollection.RODToInterval != null) {
|
||||||
|
String rodName = GenomeAnalysisEngine.instance.argCollection.RODToInterval;
|
||||||
|
|
||||||
|
// check to make sure we have a rod of that name
|
||||||
|
if (!rodNames.containsKey(rodName.toLowerCase()))
|
||||||
|
throw new StingException("--rodToIntervalTrackName (-BTI) was pass the name '"+rodName+"', which wasn't given as a ROD name in the -B option");
|
||||||
|
|
||||||
|
for (String str : rodNames.keySet())
|
||||||
|
if (str.toLowerCase().equals(rodName.toLowerCase())) {
|
||||||
|
RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData());
|
||||||
|
ret.addAll(intervalGenerator.toGenomeLocList());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if string argument was intented as a file
|
* Check if string argument was intented as a file
|
||||||
* Accepted file extensions: .list, .interval_list, .bed, .picard
|
* Accepted file extensions: .list, .interval_list, .bed, .picard
|
||||||
|
|
|
||||||
|
|
@ -76,6 +76,10 @@ public class GATKArgumentCollection {
|
||||||
@Argument(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form <name>,<type>,<file>", required = false)
|
@Argument(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form <name>,<type>,<file>", required = false)
|
||||||
public ArrayList<String> RODBindings = new ArrayList<String>();
|
public ArrayList<String> RODBindings = new ArrayList<String>();
|
||||||
|
|
||||||
|
@Element(required = false)
|
||||||
|
@Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false)
|
||||||
|
public String RODToInterval = null;
|
||||||
|
|
||||||
@Element(required = false)
|
@Element(required = false)
|
||||||
@Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
|
@Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
|
||||||
public String DBSNPFile = null;
|
public String DBSNPFile = null;
|
||||||
|
|
@ -234,6 +238,7 @@ public class GATKArgumentCollection {
|
||||||
* @return true if they're equal
|
* @return true if they're equal
|
||||||
*/
|
*/
|
||||||
public boolean equals(GATKArgumentCollection other) {
|
public boolean equals(GATKArgumentCollection other) {
|
||||||
|
if (other == null) return false;
|
||||||
if (other.samFiles.size() != samFiles.size()) {
|
if (other.samFiles.size() != samFiles.size()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
@ -318,6 +323,10 @@ public class GATKArgumentCollection {
|
||||||
if (other.intervalMerging != this.intervalMerging) {
|
if (other.intervalMerging != this.intervalMerging) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if ((other.RODToInterval == null && RODToInterval != null) ||
|
||||||
|
(other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
// if (other.enableRodWalkers != this.enableRodWalkers) {
|
// if (other.enableRodWalkers != this.enableRodWalkers) {
|
||||||
// return false;
|
// return false;
|
||||||
// }
|
// }
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author aaron
|
||||||
|
*
|
||||||
|
* Class RMDIntervalGenerator
|
||||||
|
*
|
||||||
|
* Creates an interval list, given an RMDTrack
|
||||||
|
*/
|
||||||
|
public class RMDIntervalGenerator {
|
||||||
|
public RMDTrack track;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a interval representation of a ROD track
|
||||||
|
* @param track the track
|
||||||
|
*/
|
||||||
|
public RMDIntervalGenerator(RMDTrack track) {
|
||||||
|
if (track == null) throw new IllegalArgumentException("Track cannot be null");
|
||||||
|
this.track = track;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a genome location list from the interval track
|
||||||
|
* @return a list of genome locations
|
||||||
|
*/
|
||||||
|
public List<GenomeLoc> toGenomeLocList() {
|
||||||
|
Iterator<GATKFeature> iter = track.getIterator();
|
||||||
|
List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
GATKFeature feature = iter.next();
|
||||||
|
GenomeLoc loc = feature.getLocation();
|
||||||
|
if (loc != null) locations.add(loc);
|
||||||
|
}
|
||||||
|
return locations;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return a map of reference meta data track names to RODS
|
||||||
|
* @param sources the reference ordered data sources to get the names from
|
||||||
|
* @return a map of reference meta data names to RODS
|
||||||
|
*/
|
||||||
|
public static Map<String,ReferenceOrderedDataSource> getRMDTrackNames(List<ReferenceOrderedDataSource> sources) {
|
||||||
|
// get a list of the current rod names we're working with
|
||||||
|
Map<String,ReferenceOrderedDataSource> rodNames = new HashMap<String,ReferenceOrderedDataSource>();
|
||||||
|
for (ReferenceOrderedDataSource rod : sources) {
|
||||||
|
rodNames.put(rod.getName(),rod);
|
||||||
|
}
|
||||||
|
return rodNames;
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue