From 28b897d5de5860867eb7f3a3d42648575403cd5f Mon Sep 17 00:00:00 2001 From: kshakir Date: Thu, 5 May 2011 03:32:35 +0000 Subject: [PATCH] Fixed O(N^2) operation when scattering interval files. Cleaned up intervals contig count function. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5768 348d0f76-0448-11de-a6fe-93d51630548a --- .../utils/interval/IntervalUtilsUnitTest.java | 20 +++++--- .../gatk/ContigScatterFunction.scala | 5 +- .../queue/extensions/gatk/GATKIntervals.scala | 8 ++-- .../gatk/GATKIntervalsUnitTest.scala | 46 ++++++++++++++----- 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 7c5bbe85e..fd07f4e92 100644 --- a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -149,13 +149,6 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); } - @Test(dependsOnMethods = "testParseIntervalArguments") - public void testCountIntervalsByContig() { - Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs()), 45); - Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1", "chr2", "chr3")), 3); - Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")), 3); - } - @Test public void testIsIntervalFile() { Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); @@ -528,6 +521,19 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(locs3.get(1), chr3b); } + @Test + public void testScatterContigIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + private List testFiles(String prefix, int count, String suffix) { ArrayList files = new ArrayList(); for (int i = 1; i <= count; i++) { diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index 1aa88187e..d0c1345f7 100755 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -36,12 +36,11 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { this.includeUnmapped = true protected override def maxIntervals = { - val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.countContigIntervals(gi.locs) + GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size } def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) + IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.javaLocs, this.scatterOutputFiles) } } diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index cebec8c88..078a37ab3 100755 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -42,7 +42,9 @@ case class GATKIntervals(reference: File, intervals: List[String]) { header } - lazy val locs: List[GenomeLoc] = { + lazy val locs: List[GenomeLoc] = javaLocs.toList + + lazy val javaLocs: java.util.List[GenomeLoc] = { val parser = new GenomeLocParser(referenceDataSource.getReference) val parsedLocs = if (intervals.isEmpty) @@ -50,14 +52,14 @@ case class GATKIntervals(reference: File, intervals: List[String]) { else IntervalUtils.parseIntervalArguments(parser, intervals, false) Collections.sort(parsedLocs) - parsedLocs.toList + Collections.unmodifiableList(parsedLocs) } lazy val contigs = locs.map(_.getContig).distinct def getSplits(size: Int) = { splitsBySize.getOrElse(size, { - val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) + val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(javaLocs, size) splitsBySize += size -> splits splits }) diff --git a/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index 663f6830a..f80637459 100644 --- a/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -34,30 +34,54 @@ import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser} import collection.JavaConversions._ class GATKIntervalsUnitTest { - private final lazy val reference = new File(BaseTest.hg18Reference) - private final lazy val genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(reference)) - private final lazy val referenceLocs = GenomeLocSortedSet. - createSetFromSequenceDictionary(new ReferenceDataSource(reference).getReference.getSequenceDictionary).toList.toList + private final lazy val hg18Reference = new File(BaseTest.hg18Reference) + private final lazy val hg18GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg18Reference)) + private final lazy val hg18ReferenceLocs = GenomeLocSortedSet. + createSetFromSequenceDictionary(new ReferenceDataSource(hg18Reference).getReference.getSequenceDictionary).toList.toList + + private final lazy val hg19Reference = new File(BaseTest.hg19Reference) + private final lazy val hg19GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg19Reference)) + private final lazy val hg19ReferenceLocs = GenomeLocSortedSet. + createSetFromSequenceDictionary(new ReferenceDataSource(hg19Reference).getReference.getSequenceDictionary).toList.toList @Test def testWithIntervals() { - val chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1") - val chr2 = genomeLocParser.parseGenomeInterval("chr2:2-3") - val chr3 = genomeLocParser.parseGenomeInterval("chr3:3-5") + val chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1") + val chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-3") + val chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:3-5") - val gi = new GATKIntervals(reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) + val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs, List(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } + @Test(timeOut = 30000) + def testIntervalFile() { + var gi = new GATKIntervals(hg19Reference, List(BaseTest.hg19Intervals)) + Assert.assertEquals(gi.locs.size, 189894) + // Timeout check is because of bad: + // for(Item item: javaConvertedScalaList) + // This for loop is actually an O(N^2) operation as the iterator calls the + // O(N) javaConvertedScalaList.size() for each iteration of the loop. + Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + Assert.assertEquals(gi.contigs.size, 24) + } + @Test def testEmptyIntervals() { - val gi = new GATKIntervals(reference, Nil) - Assert.assertEquals(gi.locs, referenceLocs) - Assert.assertEquals(gi.contigs.size, referenceLocs.size) + val gi = new GATKIntervals(hg18Reference, Nil) + Assert.assertEquals(gi.locs, hg18ReferenceLocs) + Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } + + @Test + def testContigCounts() { + Assert.assertEquals(new GATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig)) + Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1", "chr2", "chr3")).contigs, List("chr1", "chr2", "chr3")) + Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, List("chr1", "chr2", "chr3")) + } }