Refactoring intervals, separating the process of parsing interval lists,

sorting and merging interval lists, and creating RODs from intervals.  This
gives Doug the ability to keep using our interval list parsing code when
sorting intervals on our behalf.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3159 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-04-13 15:50:38 +00:00
parent d0123956bc
commit 8573b0bc6f
13 changed files with 147 additions and 126 deletions

View File

@ -30,7 +30,8 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
@ -48,7 +49,6 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.bed.BedParser;
import org.broadinstitute.sting.utils.cmdLine.ArgumentException;
import org.broadinstitute.sting.utils.cmdLine.ArgumentSource;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
@ -184,7 +184,7 @@ public class GenomeAnalysisEngine {
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
loadIntervals(argCollection.intervals, argCollection.intervalMerging));
// if no exclude arguments, can return parseIntervalArguments directly
if (argCollection.excludeIntervals == null)
@ -192,8 +192,8 @@ public class GenomeAnalysisEngine {
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
GenomeLocSortedSet excludeSortedSet = parseIntervalArguments(argCollection.excludeIntervals, argCollection.intervalMerging);
intervals = includeSortedSet.substractRegions(excludeSortedSet);
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging);
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
long toPruneSize = includeSortedSet.coveredSize();
@ -208,62 +208,23 @@ public class GenomeAnalysisEngine {
}
/**
* Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
* Set is sorted and merged
* Loads the intervals described by the given interval arguments.
* @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
* notation, or a combination of the above.
* @param mergingRule Technique to use when merging interval data.
* @return A sorted, merged list of all intervals specified in this arg list.
*/
public static GenomeLocSortedSet parseIntervalArguments(final List<String> intervals) {
return parseIntervalArguments(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
}
/**
* Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
* Set is sorted and merged
*/
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
private GenomeLocSortedSet loadIntervals(List<String> argList, IntervalMergingRule mergingRule) {
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
// TODO: Aaron, how do we discriminate between RODs that are for inclusion and RODs that are for exclusion?
rawIntervals.addAll(checkRODToIntervalArgument()); // add any RODs-to-intervals we have
rawIntervals.addAll(IntervalUtils.parseIntervalArguments(argList));
if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to
// ensure that the arg list isn't null before looping.
for (String argument : argList) {
// if any interval argument is '-L all', consider all loci by returning no intervals
if (argument.equals("all")) {
if (argList.size() != 1) {
// throw error if '-L all' is not only interval - potentially conflicting commands
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
}
return null;
}
// separate argument on semicolon first
for (String fileOrInterval : argument.split(";")) {
// if it's a file, add items to raw interval list
if (isFile(fileOrInterval))
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
// otherwise treat as an interval -> parse and add to raw interval list
else {
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
}
}
}
}
// redundant check => default no arguments is null, not empty list
if (rawIntervals.size() == 0)
return null;
// sort raw interval list
Collections.sort(rawIntervals);
// now merge raw interval list
rawIntervals = GenomeLocParser.mergeIntervalLocations(rawIntervals, mergingRule);
return GenomeLocSortedSet.createSetFromList(rawIntervals);
return IntervalUtils.sortAndMergeIntervals(GenomeLocSortedSet.createSetFromList(rawIntervals),mergingRule);
}
/**
@ -290,26 +251,6 @@ public class GenomeAnalysisEngine {
return ret;
}
/**
 * Decides whether an interval argument was intended as a file name rather than a literal interval.
 * Recognized extensions (compared after upper-casing the token): .bed, .list, .picard,
 * .interval_list, .intervals.
 *
 * @param str the argument token to classify.
 * @return true if the token ends with a recognized extension; false if it does not and no
 *         file with that name exists.
 * @throws StingException if the token lacks a recognized extension but names an existing file.
 */
private static boolean isFile(String str) {
    // should we define list of file extensions as a public array somewhere?
    final String upperCased = str.toUpperCase();
    final boolean hasKnownExtension = upperCased.endsWith(".BED")
            || upperCased.endsWith(".LIST")
            || upperCased.endsWith(".PICARD")
            || upperCased.endsWith(".INTERVAL_LIST")
            || upperCased.endsWith(".INTERVALS");
    if (hasKnownExtension)
        return true;

    // an unrecognized token is only an error when it really is a file on disk
    if (!new File(str).exists())
        return false;

    throw new StingException("Interval argument looks like a filename, but does not have one of " +
            "the supported extensions (.bed, .list, .picard, .interval_list, or .intervals). " +
            "Please rename your file with the appropriate extension.");
}
/**
* Add additional, externally managed IO streams for walker input.
*

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.arguments;
import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.simpleframework.xml.*;
import org.simpleframework.xml.core.Persister;

View File

@ -1,17 +0,0 @@
package org.broadinstitute.sting.gatk.arguments;
/**
 * The rules that can be requested for merging the intervals passed to the GATK.
 */
public enum IntervalMergingRule {
    /** Overlapping intervals and abutting intervals are both merged. */
    ALL,

    /** Overlapping intervals are merged, but intervals that only abut each other are NOT. */
    OVERLAPPING_ONLY,

    /** Neither overlapping nor abutting intervals are merged; the list is sorted, but not merged. */
    NONE;

    /**
     * Confirms that this rule is currently supported.
     *
     * @return true for every rule other than NONE.
     * @throws UnsupportedOperationException if this rule is NONE.
     */
    public boolean check() {
        if (this != NONE) {
            return true;
        }
        throw new UnsupportedOperationException("We Currently do not support IntervalMergingRule.NONE");
    }
}

View File

@ -6,6 +6,7 @@ import java.util.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
public class IntervalRodIterator implements Iterator<IntervalRod> {
//private List<GenomeLoc> locations = null;
@ -19,7 +20,8 @@ public class IntervalRodIterator implements Iterator<IntervalRod> {
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Collections.singletonList(file.getPath()));
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(IntervalUtils.parseIntervalArguments(Collections.singletonList(file.getPath())),
GenomeAnalysisEngine.instance.getArguments().intervalMerging);
//System.out.printf(" => got %d entries %n", locs.size());
return new IntervalRodIterator(trackName, locs);
}

View File

@ -3,7 +3,8 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
@ -126,7 +127,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
// read in the intervals for cleaning
GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(IntervalUtils.parseIntervalArguments(Arrays.asList(intervalsFile)),
IntervalMergingRule.OVERLAPPING_ONLY);
intervals = locs.iterator();
currentInterval = intervals.hasNext() ? intervals.next() : null;

View File

@ -8,13 +8,12 @@ import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.utils.bed.BedParser;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
@ -291,7 +290,7 @@ public class GenomeLocParser {
* @param file_name
* @param rule also merge abutting intervals
*/
public static List<GenomeLoc> intervalFileToList(final String file_name, IntervalMergingRule rule) {
public static List<GenomeLoc> intervalFileToList(final String file_name) {
// try to open file
File inputFile = null;
try {
@ -315,7 +314,7 @@ public class GenomeLocParser {
// case: BED file
if (file_name.toUpperCase().endsWith(".BED")) {
BedParser parser = new BedParser(inputFile);
return parser.getSortedAndMergedLocations(rule);
return parser.getLocations();
}
/**

View File

@ -159,7 +159,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return true;
}
public GenomeLocSortedSet substractRegions(GenomeLocSortedSet toRemoveSet) {
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.utils.bed;
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -98,16 +98,4 @@ public class BedParser {
public List<GenomeLoc> getLocations() {
// hands back the mLocations list itself; no copy, sort or merge is performed here
return mLocations;
}
/**
 * Sorts a copy of the parsed locations and merges it using the supplied interval rule.
 * The sort is applied to the copy, not to mLocations itself.
 * @param rule the rule to merge intervals with
 * @return a list of genome locs, sorted and merged
 */
public List<GenomeLoc> getSortedAndMergedLocations(IntervalMergingRule rule) {
    final List<GenomeLoc> sortedCopy = new ArrayList<GenomeLoc>(mLocations);
    Collections.sort(sortedCopy);
    return GenomeLocParser.mergeIntervalLocations(sortedCopy, rule);
}
}

View File

@ -0,0 +1,10 @@
package org.broadinstitute.sting.utils.interval;
/**
 * The merging rules that can be applied to intervals passed to the GATK.
 * Each constant names which kinds of neighbouring intervals are combined.
 */
public enum IntervalMergingRule {
ALL, // merge both overlapping intervals and abutting intervals
OVERLAPPING_ONLY // merge intervals that overlap, but NOT ones that only abut each other
}

View File

@ -0,0 +1,105 @@
package org.broadinstitute.sting.utils.interval;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.io.File;
/**
 * Parse text representations of interval strings that
 * can appear in Sting-based applications.
 *
 * @author mhanna
 * @version 0.1
 */
public class IntervalUtils {
    /**
     * Upper-cased file extensions that mark an interval argument as a file name.
     */
    private static final String[] INTERVAL_FILE_EXTENSIONS = {
            ".BED", ".LIST", ".PICARD", ".INTERVAL_LIST", ".INTERVALS"
    };

    /**
     * Turns a set of strings describing intervals into a parsed set of intervals.  Valid string elements can be files,
     * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons.  Additionally,
     * 'all' can be supplied to indicate all possible intervals, but 'all' must be exclusive of all other interval
     * specifications.
     *
     * @param argList A list of strings containing interval data.  May be null, in which case nothing is parsed.
     * @return the parsed intervals, in the form produced by GenomeLocSortedSet.createSetFromList; no explicit sort
     *         or merge step is applied by this method.  Null is used to indicate that all intervals should be used.
     * @throws StingException if 'all' is supplied together with any other interval argument.
     */
    public static GenomeLocSortedSet parseIntervalArguments(List<String> argList) {
        List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>();    // running list of raw GenomeLocs

        // callers may arrive here with no interval arguments at all (e.g. only ROD-derived intervals),
        // so the list has to be null-checked before looping.
        if (argList != null) {
            for (String argument : argList) {
                // '-L all' means "consider all loci", which is signalled by returning null
                if (argument.equals("all")) {
                    if (argList.size() != 1) {
                        // '-L all' alongside other intervals is potentially conflicting
                        throw new StingException("Conflicting arguments: Intervals given along with \"-L all\"");
                    }
                    return null;
                }

                // a single argument may bundle several files / intervals separated by semicolons
                for (String fileOrInterval : argument.split(";")) {
                    if (isFile(fileOrInterval)) {
                        // a file contributes every interval it contains
                        rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval));
                    } else {
                        // anything else is parsed as a single interval
                        rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
                    }
                }
            }
        }

        return GenomeLocSortedSet.createSetFromList(rawIntervals);
    }

    /**
     * Sorts and merges an interval list.  Multiple techniques are available for merging: ALL, which combines
     * all overlapping and abutting intervals into an interval that spans the union of all covered bases, and
     * OVERLAPPING_ONLY, which unions overlapping intervals but keeps abutting intervals separate.
     *
     * @param intervals A collection of intervals to merge.  May be null, which is the value
     *                  {@link #parseIntervalArguments(List)} uses for "all intervals".
     * @param mergingRule A descriptor for the type of merging to perform.
     * @return A sorted, merged version of the intervals passed in, or null if intervals was null.
     */
    public static GenomeLocSortedSet sortAndMergeIntervals(GenomeLocSortedSet intervals, IntervalMergingRule mergingRule) {
        // parseIntervalArguments returns null for '-L all'; pass that marker through rather than
        // failing with a NullPointerException when the two calls are chained.
        if (intervals == null)
            return null;

        List<GenomeLoc> intervalList = intervals.toList();

        // sort raw interval list
        Collections.sort(intervalList);
        // now merge raw interval list
        intervalList = GenomeLocParser.mergeIntervalLocations(intervalList, mergingRule);

        return GenomeLocSortedSet.createSetFromList(intervalList);
    }

    /**
     * Check if string argument was intended as a file.
     * Accepted file extensions: .bed, .list, .picard, .interval_list, .intervals (case-insensitive).
     *
     * @param str token to identify as a filename.
     * @return true if the token looks like a filename, or false otherwise.
     * @throws StingException if the token has no accepted extension but names an existing file.
     */
    private static boolean isFile(String str) {
        // Upper-case once with a fixed locale: with the default locale, a Turkish JVM maps 'i' to a
        // dotted capital I and ".list" / ".picard" / ".intervals" would never match.
        final String upperCased = str.toUpperCase(java.util.Locale.ENGLISH);
        for (String extension : INTERVAL_FILE_EXTENSIONS) {
            if (upperCased.endsWith(extension))
                return true;
        }

        // an unrecognized token is only an error when it really is a file on disk
        if (new File(str).exists())
            throw new StingException("Interval argument looks like a filename, but does not have one of " +
                    "the supported extensions (.bed, .list, .picard, .interval_list, or .intervals). " +
                    "Please rename your file with the appropriate extension.");

        return false;
    }
}

View File

@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.junit.Assert.assertEquals;
@ -12,8 +11,6 @@ import static org.junit.Assert.assertTrue;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.List;
/**
* @author aaron
* <p/>

View File

@ -127,7 +127,7 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
mSortedSet.add(e);
for (int x = 1; x < 101; x++) {
GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(del));
}
assertTrue(mSortedSet.isEmpty());
}
@ -138,7 +138,7 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
mSortedSet.add(e);
for (int x = 1; x < 50; x++) {
GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(del));
}
assertTrue(!mSortedSet.isEmpty());
assertTrue(mSortedSet.size() == 1);
@ -157,7 +157,7 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(mSortedSet.size() == 2);
// now delete a region
GenomeLoc d = GenomeLocParser.createGenomeLoc(1, 15, 75);
mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(d));
mSortedSet = mSortedSet.subtractRegions(new GenomeLocSortedSet(d));
Iterator<GenomeLoc> iter = mSortedSet.iterator();
GenomeLoc loc = iter.next();
assertTrue(loc.getStart() == 10);
@ -180,7 +180,7 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
GenomeLoc r3 = GenomeLocParser.createGenomeLoc(1, 16, 18);
GenomeLocSortedSet toExclude = new GenomeLocSortedSet(Arrays.asList(r1, r2, r3));
GenomeLocSortedSet remaining = mSortedSet.substractRegions(toExclude);
GenomeLocSortedSet remaining = mSortedSet.subtractRegions(toExclude);
// logger.debug("Initial " + mSortedSet);
// logger.debug("Exclude " + toExclude);
// logger.debug("Remaining " + remaining);

View File

@ -1,7 +1,7 @@
package org.broadinstitute.sting.utils.bed;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -59,11 +59,4 @@ public class BedParserUnitTest extends BaseTest {
Assert.assertEquals(5000, location.get(2).getStop());
Assert.assertEquals(6000, location.get(3).getStop());
}
@Test
public void testLoadBedFileOverlapping() {
    // sorting and merging the test BED file with the ALL rule is expected to leave three locations
    final List<GenomeLoc> merged = new BedParser(bedFile).getSortedAndMergedLocations(IntervalMergingRule.ALL);
    Assert.assertEquals(3, merged.size());
}
}