From 23e7f1bed9e395d0a41de659fd10d8d398dc2b82 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 8 Feb 2012 02:12:16 -0500 Subject: [PATCH] When an interval list specifies overlapping intervals merge them before scattering. --- .../queue/extensions/gatk/GATKIntervals.scala | 4 +++- .../gatk/GATKIntervalsUnitTest.scala | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 9e47f64a1..b0483f0bb 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk import java.io.File import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils} import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import net.sf.samtools.SAMFileHeader import java.util.Collections @@ -51,6 +51,8 @@ case class GATKIntervals(reference: File, intervals: List[String]) { IntervalUtils.parseIntervalArguments(parser, intervals) Collections.sort(parsedLocs) Collections.unmodifiableList(parsedLocs) + val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) + Collections.unmodifiableList(mergedLocs) } lazy val contigs = locs.map(_.getContig).distinct.toList diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index 38abe24ef..5383b3716 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser} import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils class GATKIntervalsUnitTest { private final lazy val hg18Reference = new File(BaseTest.hg18Reference) @@ -57,7 +58,7 @@ class GATKIntervalsUnitTest { // Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } - @Test(timeOut = 30000) + @Test(timeOut = 30000L) def testIntervalFile() { var gi = new GATKIntervals(hg19Reference, List(BaseTest.hg19Intervals)) Assert.assertEquals(gi.locs.size, 189894) @@ -65,7 +66,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. - //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + Assert.assertEquals(IntervalUtils.splitFixedIntervals(gi.locs, 189894).size(), 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -84,4 +85,17 @@ class GATKIntervalsUnitTest { Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1", "chr2", "chr3")).contigs, List("chr1", "chr2", "chr3")) Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, List("chr1", "chr2", "chr3")) } + + @Test + def testSortAndMergeIntervals() { + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-10", "chr1:1-10"), Seq("chr1:1-10")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:1-11", "chr1:1-12"), Seq("chr1:1-12")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"), Seq("chr1:1-10", "chr1:11-20", "chr1:21-30")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:10-20", "chr1:21-30"), Seq("chr1:1-20", "chr1:21-30")) + testSortAndMergeIntervals(Seq("chr1:1-10", "chr1:21-30", "chr1:10-20"), Seq("chr1:1-20", "chr1:21-30")) + } + + private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) { + Assert.assertEquals(new GATKIntervals(hg18Reference, actual.toList).locs.toSeq, expected.map(hg18GenomeLocParser.parseGenomeLoc(_))) + } }