gatk-3.8/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java

121 lines
5.4 KiB
Java

package org.broadinstitute.sting.utils.interval;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.io.File;
/**
 * Parse text representations of interval strings that
 * can appear in Sting-based applications.
 *
 * @author mhanna
 * @version 0.1
 */
public class IntervalUtils {
    /**
     * File extensions (lower case, including the leading dot) that mark a token as an interval file.
     * Matching against this table is case-insensitive.
     */
    private static final String[] INTERVAL_FILE_EXTENSIONS = {
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
    };

    /**
     * Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files,
     * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally,
     * 'all' can be supplied to indicate all possible intervals, but 'all' must be exclusive of all other interval
     * specifications.
     *
     * @param argList A list of strings containing interval data. May be null, in which case an empty list is
     *                returned. Individual elements must not be null.
     * @return an unsorted, unmerged representation of the given intervals. Null is used to indicate that all
     *         intervals should be used.
     * @throws StingException if 'all' is given together with other arguments, if an interval file cannot be
     *                        parsed, or if {@link #isIntervalFile(String)} rejects one of the tokens.
     */
    public static List<GenomeLoc> parseIntervalArguments(List<String> argList) {
        List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs

        // The argument list can legitimately be null (per the original note: this method may be reached when
        // only the ROD-to-intervals option was provided), so there is nothing to parse in that case.
        if (argList == null) {
            return rawIntervals;
        }

        for (String argument : argList) {
            // If any interval argument is '-L all', consider all loci by returning no intervals.
            // The constant is the receiver so that this comparison itself never dereferences a null element.
            if ("all".equals(argument)) {
                if (argList.size() != 1) {
                    // '-L all' combined with anything else is potentially conflicting, so reject it.
                    throw new StingException("Conflicting arguments: Intervals given along with \"-L all\"");
                }
                return null;
            }

            // Separate the argument on semicolons first; each piece is either a file name or a single interval.
            for (String fileOrInterval : argument.split(";")) {
                if (isIntervalFile(fileOrInterval)) {
                    // It is a file: add all of its entries to the raw interval list.
                    try {
                        rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval));
                    }
                    catch (Exception e) {
                        // Wrap with the file name for context; the original exception is kept as the cause.
                        throw new StingException(String.format("Interval file %s could not be parsed in either format. " +
                                "The problem is:%n%s",
                                fileOrInterval, e.getMessage()), e);
                    }
                }
                else {
                    // Otherwise treat the token as an interval: parse it and add it to the raw interval list.
                    rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
                }
            }
        }

        return rawIntervals;
    }

    /**
     * Sorts and merges an interval list. Multiple techniques are available for merging: ALL, which combines
     * all overlapping and abutting intervals into an interval that spans the union of all covered bases, and
     * OVERLAPPING_ONLY, which unions overlapping intervals but keeps abutting intervals separate.
     *
     * The list passed in is not modified; sorting is performed on a private copy.
     *
     * @param intervals A collection of intervals to merge.
     * @param mergingRule A descriptor for the type of merging to perform.
     * @return A sorted, merged version of the intervals passed in.
     */
    public static GenomeLocSortedSet sortAndMergeIntervals(List<GenomeLoc> intervals, IntervalMergingRule mergingRule) {
        // Work on a copy: the caller's list may be unmodifiable, and reordering it in place
        // would be a side effect the caller did not ask for.
        List<GenomeLoc> sortedIntervals = new ArrayList<GenomeLoc>(intervals);

        // sort raw interval list
        Collections.sort(sortedIntervals);

        // now merge raw interval list
        List<GenomeLoc> mergedIntervals = GenomeLocParser.mergeIntervalLocations(sortedIntervals, mergingRule);
        return GenomeLocSortedSet.createSetFromList(mergedIntervals);
    }

    /**
     * Check if string argument was intended as a file.
     * Accepted file extensions: .bed, .list, .picard, .interval_list, .intervals (matched case-insensitively).
     *
     * @param str token to identify as a filename.
     * @return true if the token has a supported extension and names an existing file; false if it has no
     *         supported extension and no file with that name exists.
     * @throws StingException if the token has a supported extension but the file does not exist, or if a file
     *                        with that name exists but lacks a supported extension.
     */
    public static boolean isIntervalFile(String str) {
        File file = new File(str);

        if (hasIntervalFileExtension(str)) {
            if (file.exists()) {
                return true;
            }
            throw new StingException(String.format("The interval file %s does not exist.", file.getAbsolutePath()));
        }

        // No recognized extension, yet something with this name is on disk: the user's intent is ambiguous
        // (file or interval?), so fail loudly instead of guessing.
        if (file.exists()) {
            throw new StingException(String.format("The interval file %s does not have one of " +
                    "the supported extensions (.bed, .list, .picard, .interval_list, or .intervals). " +
                    "Please rename your file with the appropriate extension. If %s is NOT supposed to be a file, " +
                    "please move or rename the file at location %s", str, str, file.getAbsolutePath()));
        }
        return false;
    }

    /**
     * Tests whether a name ends with one of the supported interval file extensions, ignoring case.
     *
     * The comparison uses {@link String#regionMatches(boolean, int, String, int, int)}, which compares
     * character by character and does not depend on the default locale. Upper-casing the whole name with
     * the default locale would turn 'i' into a dotted capital I under Turkish-style locales and make
     * extensions such as ".list" or ".intervals" fail to match.
     *
     * @param name the token to inspect.
     * @return true if the token ends with a supported extension, false otherwise.
     */
    private static boolean hasIntervalFileExtension(String name) {
        for (String extension : INTERVAL_FILE_EXTENSIONS) {
            int offset = name.length() - extension.length();
            if (offset >= 0 && name.regionMatches(true, offset, extension, 0, extension.length())) {
                return true;
            }
        }
        return false;
    }
}