diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index fa746a7e3..4af224752 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -227,16 +227,14 @@ public class GenomeAnalysisEngine { * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. * @return A sorted, merged list of all intervals specified in this arg list. */ - private GenomeLocSortedSet loadIntervals(List argList, IntervalMergingRule mergingRule, List additionalIntervals) { - List rawIntervals = (additionalIntervals == null) ? new ArrayList() : additionalIntervals; // running list of raw GenomeLocs + private GenomeLocSortedSet loadIntervals(List argList, + IntervalMergingRule mergingRule, + List additionalIntervals) { - rawIntervals.addAll(IntervalUtils.parseIntervalArguments(argList)); - - // redundant check => default no arguments is null, not empty list - if (rawIntervals.size() == 0) - return null; - - return IntervalUtils.sortAndMergeIntervals(rawIntervals,mergingRule); + return IntervalUtils.sortAndMergeIntervals(IntervalUtils.mergeListsBySetOperator(additionalIntervals, + IntervalUtils.parseIntervalArguments(argList), + argCollection.BTIMergeRule), + mergingRule); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 543d36bf1..90afae069 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.simpleframework.xml.*; import org.simpleframework.xml.core.Persister; import org.simpleframework.xml.stream.Format; @@ -92,7 +93,11 @@ public class GATKArgumentCollection { @Element(required = false) @Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false) - public String RODToInterval = null; + public String RODToInterval = null; + + @Element(required = false) + @Argument(fullName = "BTI_merge_rule", shortName = "BTIMR", doc = "Indicates the merging approach the interval parser should use to combine the BTI track with other -L options", required = false) + public IntervalSetRule BTIMergeRule = IntervalSetRule.UNION; @Element(required = false) @Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false) @@ -351,6 +356,9 @@ public class GATKArgumentCollection { (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { return false; } + if (BTIMergeRule != other.BTIMergeRule) + return false; + // if (other.enableRodWalkers != this.enableRodWalkers) { // return false; // } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java index 451781a42..60dbc0bb6 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.util.CloseableIterator; +import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureSource; import org.broad.tribble.source.BasicFeatureSource; import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; @@ -51,20 +52,23 @@ public class TribbleTrack extends RMDTrack implements QueryableTrack { // our sequence dictionary, which can be null private final SAMSequenceDictionary dictionary; + // our codec type + private final FeatureCodec codec; /** * Create a track * * @param type the type of track, used for track lookup - * @param recordType the type of record we produce * @param name the name of this specific track * @param file the associated file, for reference or recreating the reader * @param reader the feature reader to use as the underlying data source * @param dict the sam sequence dictionary + * @param codec the feature codec we use to decode this type */ - public TribbleTrack(Class type, Class recordType, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict) { - super(type, recordType, name, file); + public TribbleTrack(Class type, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict, FeatureCodec codec) { + super(type, codec.getFeatureType(), name, file); this.reader = reader; this.dictionary = dict; + this.codec = codec; } /** @@ -135,4 +139,8 @@ public class TribbleTrack extends RMDTrack implements QueryableTrack { public Object getHeader() { return reader.getHeader(); } + + public FeatureCodec getCodec() { + return codec; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java index 7a358db2d..b5e18e069 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java @@ -105,7 +105,7 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen // return a feature reader track Pair pair = createFeatureReader(targetClass, name, inputFile); if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile); - return new TribbleTrack(targetClass, createCodec(targetClass, name).getFeatureType(), name, inputFile, pair.first, pair.second); + return new TribbleTrack(targetClass, name, inputFile, pair.first, pair.second, createCodec(targetClass, name)); } public Pair createFeatureReader(Class targetClass, File inputFile) { diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java new file mode 100644 index 000000000..eae4f8db5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java @@ -0,0 +1,9 @@ +package org.broadinstitute.sting.utils.interval; + +/** + * set operators for combining lists of intervals + */ +public enum IntervalSetRule { + UNION, + INTERSECTION; +} diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 01396c4a8..e8ffcae62 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; +import java.util.LinkedList; import java.util.List; import java.util.ArrayList; import java.util.Collections; @@ -69,6 +70,49 @@ public class IntervalUtils { return rawIntervals; } + /** + * merge two interval lists, using an interval set rule + * @param setOne a list of genomeLocs, in order (cannot be NULL) + * @param setTwo a list of genomeLocs, also in order (cannot be NULL) + * @param rule the rule to use for merging, i.e. union, intersection, etc + * @return a list, correctly merged using the specified rule + */ + public static List mergeListsBySetOperator(List setOne, List setTwo, IntervalSetRule rule) { + // shortcut, if either set is zero, return the other set + if (setOne.size() == 0 || setTwo.size() == 0) return (setOne.size() == 0) ? setTwo : setOne; + + // if we're set to UNION, just add them all + if (rule == IntervalSetRule.UNION) { + setOne.addAll(setTwo); + return setOne; + } + + // else we're INTERSECTION, create two indexes into the lists + int iOne = 0; + int iTwo = 0; + + // our master list, since we can't guarantee removal time in a generic list + LinkedList retList = new LinkedList(); + + // merge the second into the first using the rule + while (iTwo < setTwo.size() && iOne < setOne.size()) + // if the first list is ahead, drop items off the second until we overlap + if (setTwo.get(iTwo).isBefore(setOne.get(iOne))) + iTwo++; + // if the second is ahead, drop intervals off the first until we overlap + else if (setOne.get(iOne).isBefore(setTwo.get(iTwo))) + iOne++; + // we overlap, intersect the two intervals and add the result. Then remove the interval that ends first. + else { + retList.add(setOne.get(iOne).intersect(setTwo.get(iTwo))); + if (setOne.get(iOne).getStop() < setTwo.get(iTwo).getStop()) iOne++; + else iTwo++; + } + + // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have + return retList; + } + /** * Sorts and merges an interval list. Multiple techniques are available for merging: ALL, which combines * all overlapping and abutting intervals into an interval that spans the union of all covered bases, and diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsTest.java new file mode 100644 index 000000000..e4a3a3db5 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsTest.java @@ -0,0 +1,91 @@ +package org.broadinstitute.sting.utils.interval; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + +/** + * test out the interval utility methods + */ +public class IntervalUtilsTest extends BaseTest { + // used to seed the genome loc parser with a sequence dictionary + private static ReferenceSequenceFile seq; + + + + @BeforeClass + public static void init() throws FileNotFoundException { + seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + GenomeLocParser.setupRefContigOrdering(seq); + + } + + @Test + public void testMergeListsBySetOperatorNoOverlap() { + // a couple of lists we'll use for the testing + List listEveryTwoFromOne = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(GenomeLocParser.createGenomeLoc("chr1",x,x)); + else + listEveryTwoFromOne.add(GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + Assert.assertEquals(100,ret.size()); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); + Assert.assertEquals(0,ret.size()); + } + + @Test + public void testMergeListsBySetOperatorAllOverlap() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(150,ret.size()); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(50,ret.size()); + } + + @Test + public void testMergeListsBySetOperator() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 5 == 0) { + listEveryTwoFromTwo.add(GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(40,ret.size()); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(20,ret.size()); + } +}