From e148a3ac6145fbfe7ab5f71f29cc33192c07e97d Mon Sep 17 00:00:00 2001 From: aaron Date: Thu, 8 Apr 2010 05:14:41 +0000 Subject: [PATCH] added the ability to create interval lists directly from a ROD, using the command line arg '-BTI' (long name '--rodToIntervalTrackName'). The parameter to this arg is the name of the ROD track, which must be a track name specified in the -B option. Using this feature, sites covered by the target ROD will be iterated over. This list of intevals generated is merged with any intervals from the -L and -XL args, and the Walker is run over the resulting merged list. WARNING: for very large ROD's this can be costly. Consider this experimental for now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3134 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 65 ++++++++++++++----- .../arguments/GATKArgumentCollection.java | 9 +++ .../refdata/utils/RMDIntervalGenerator.java | 58 +++++++++++++++++ 3 files changed, 114 insertions(+), 18 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 89c7d25cc..c3b41aabf 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager; +import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.bed.BedParser; @@ -176,12 +177,12 @@ public class GenomeAnalysisEngine { private void initializeIntervals() { // return null if no interval arguments at all - if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null)) + if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null)) return; else { // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ? + GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging)); @@ -222,28 +223,32 @@ public class GenomeAnalysisEngine { public static GenomeLocSortedSet parseIntervalArguments(List argList, IntervalMergingRule mergingRule) { List rawIntervals = new ArrayList(); // running list of raw GenomeLocs + rawIntervals.addAll(checkRODToIntervalArgument()); // add any RODs-to-intervals we have - for (String argument : argList) { + if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to + // ensure that the arg list isn't null before looping. + for (String argument : argList) { - // if any interval argument is '-L all', consider all loci by returning no intervals - if (argument.equals("all")) { - if (argList.size() != 1) { - // throw error if '-L all' is not only interval - potentially conflicting commands - throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); + // if any interval argument is '-L all', consider all loci by returning no intervals + if (argument.equals("all")) { + if (argList.size() != 1) { + // throw error if '-L all' is not only interval - potentially conflicting commands + throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); + } + return null; } - return null; - } - // separate argument on semicolon first - for (String fileOrInterval : argument.split(";")) { + // separate argument on semicolon first + for (String fileOrInterval : argument.split(";")) { - // if it's a file, add items to raw interval list - if (isFile(fileOrInterval)) - rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule)); + // if it's a file, add items to raw interval list + if (isFile(fileOrInterval)) + rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule)); - // otherwise treat as an interval -> parse and add to raw interval list - else { - rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval)); + // otherwise treat as an interval -> parse and add to raw interval list + else { + rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval)); + } } } } @@ -261,6 +266,30 @@ public class GenomeAnalysisEngine { } + /** + * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs + */ + // TODO: this function uses toLowerCase to work with the current ROD system, fix it if we make ROD names case-sensitive + private static List checkRODToIntervalArgument() { + Map rodNames = RMDIntervalGenerator.getRMDTrackNames(instance.rodDataSources); + // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? + List ret = new ArrayList(); + if (rodNames != null && instance.argCollection.RODToInterval != null) { + String rodName = GenomeAnalysisEngine.instance.argCollection.RODToInterval; + + // check to make sure we have a rod of that name + if (!rodNames.containsKey(rodName.toLowerCase())) + throw new StingException("--rodToIntervalTrackName (-BTI) was pass the name '"+rodName+"', which wasn't given as a ROD name in the -B option"); + + for (String str : rodNames.keySet()) + if (str.toLowerCase().equals(rodName.toLowerCase())) { + RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str).getReferenceOrderedData()); + ret.addAll(intervalGenerator.toGenomeLocList()); + } + } + return ret; + } + /** * Check if string argument was intented as a file * Accepted file extensions: .list, .interval_list, .bed, .picard diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 816d2c71d..0e0275b74 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -76,6 +76,10 @@ public class GATKArgumentCollection { @Argument(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form ,,", required = false) public ArrayList RODBindings = new ArrayList(); + @Element(required = false) + @Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false) + public String RODToInterval = null; + @Element(required = false) @Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false) public String DBSNPFile = null; @@ -234,6 +238,7 @@ public class GATKArgumentCollection { * @return true if they're equal */ public boolean equals(GATKArgumentCollection other) { + if (other == null) return false; if (other.samFiles.size() != samFiles.size()) { return false; } @@ -318,6 +323,10 @@ public class GATKArgumentCollection { if (other.intervalMerging != this.intervalMerging) { return false; } + if ((other.RODToInterval == null && RODToInterval != null) || + (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { + return false; + } // if (other.enableRodWalkers != this.enableRodWalkers) { // return false; // } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java new file mode 100644 index 000000000..8817b8d49 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java @@ -0,0 +1,58 @@ +package org.broadinstitute.sting.gatk.refdata.utils; + +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.*; + + +/** + * + * @author aaron + * + * Class RMDIntervalGenerator + * + * Creates an interval list, given an RMDTrack + */ +public class RMDIntervalGenerator { + public RMDTrack track; + + /** + * create a interval representation of a ROD track + * @param track the track + */ + public RMDIntervalGenerator(RMDTrack track) { + if (track == null) throw new IllegalArgumentException("Track cannot be null"); + this.track = track; + } + + /** + * create a genome location list from the interval track + * @return a list of genome locations + */ + public List toGenomeLocList() { + Iterator iter = track.getIterator(); + List locations = new ArrayList(); + while (iter.hasNext()) { + GATKFeature feature = iter.next(); + GenomeLoc loc = feature.getLocation(); + if (loc != null) locations.add(loc); + } + return locations; + } + + /** + * return a map of reference meta data track names to RODS + * @param sources the reference ordered data sources to get the names from + * @return a map of reference meta data names to RODS + */ + public static Map getRMDTrackNames(List sources) { + // get a list of the current rod names we're working with + Map rodNames = new HashMap(); + for (ReferenceOrderedDataSource rod : sources) { + rodNames.put(rod.getName(),rod); + } + return rodNames; + } +}