Fixed O(N^2) operation when scattering interval files.

Cleaned up intervals contig count function.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5768 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-05-05 03:32:35 +00:00
parent 3882d1b9c0
commit 28b897d5de
4 changed files with 55 additions and 24 deletions

View File

@ -149,13 +149,6 @@ public class IntervalUtilsUnitTest extends BaseTest {
Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4);
}
/**
 * Checks IntervalUtils.countContigIntervals against three inputs built by getLocs:
 * no arguments (expects 45), three whole contigs (expects 3), and four sub-intervals
 * spread over three contigs (expects 3, i.e. the two chr1 intervals count once).
 * NOTE(review): 45 is presumably the contig count of the reference that getLocs()
 * falls back to when given no intervals -- confirm against getLocs.
 */
@Test(dependsOnMethods = "testParseIntervalArguments")
public void testCountIntervalsByContig() {
// No interval arguments.
Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs()), 45);
// One interval per contig.
Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1", "chr2", "chr3")), 3);
// Two intervals on chr1 plus one each on chr2 and chr3 still yield a count of 3.
Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")), 3);
}
@Test
public void testIsIntervalFile() {
Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list"));
@ -528,6 +521,19 @@ public class IntervalUtilsUnitTest extends BaseTest {
Assert.assertEquals(locs3.get(1), chr3b);
}
/**
 * Scatters hg19ReferenceLocs across 85 interval files, then parses every file back
 * and verifies it holds exactly one loc, equal to the reference loc at the same index.
 */
@Test
public void testScatterContigIntervalsMax() {
    List<File> scatterFiles = testFiles("sg.", 85, ".intervals");
    IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, scatterFiles);

    // Walk the files in order, tracking the position for the assertion messages
    // and for the lookup of the matching reference loc.
    int index = 0;
    for (File scatterFile : scatterFiles) {
        List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(
                hg19GenomeLocParser, Arrays.asList(scatterFile.toString()), false);
        Assert.assertEquals(locs.size(), 1, "parsedLocs[" + index + "].size()");
        Assert.assertEquals(locs.get(0), hg19ReferenceLocs.get(index), "parsedLocs[" + index + "].get()");
        index++;
    }
}
private List<File> testFiles(String prefix, int count, String suffix) {
ArrayList<File> files = new ArrayList<File>();
for (int i = 1; i <= count; i++) {

View File

@ -36,12 +36,11 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction {
this.includeUnmapped = true
/**
 * Returns the number of contigs that GATKIntervals reports for this function's
 * reference sequence and intervals.
 */
protected override def maxIntervals = {
  // The count comes straight from GATKIntervals.contigs. The former
  // IntervalUtils.countContigIntervals(gi.locs) call is dropped: its result was
  // discarded, so it only added a redundant pass over the locs.
  GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size
}
/**
 * Scatters the intervals for this function's reference sequence into
 * this.scatterOutputFiles using IntervalUtils.scatterContigIntervals.
 */
def run() {
  val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
  // Scatter exactly once, handing over the native java.util.List (javaLocs).
  // Passing the Scala List (gi.locs) would wrap it as a java.util.List whose
  // size() is O(N), which makes Java-side iteration over it O(N^2).
  IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.javaLocs, this.scatterOutputFiles)
}
}

View File

@ -42,7 +42,9 @@ case class GATKIntervals(reference: File, intervals: List[String]) {
header
}
lazy val locs: List[GenomeLoc] = {
lazy val locs: List[GenomeLoc] = javaLocs.toList
lazy val javaLocs: java.util.List[GenomeLoc] = {
val parser = new GenomeLocParser(referenceDataSource.getReference)
val parsedLocs =
if (intervals.isEmpty)
@ -50,14 +52,14 @@ case class GATKIntervals(reference: File, intervals: List[String]) {
else
IntervalUtils.parseIntervalArguments(parser, intervals, false)
Collections.sort(parsedLocs)
parsedLocs.toList
Collections.unmodifiableList(parsedLocs)
}
/** Contig of every loc, in order of first appearance with repeats removed. */
lazy val contigs = locs.map(loc => loc.getContig).distinct
def getSplits(size: Int) = {
splitsBySize.getOrElse(size, {
val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(javaLocs, size)
splitsBySize += size -> splits
splits
})

View File

@ -34,30 +34,54 @@ import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser}
import collection.JavaConversions._
/**
 * Unit tests for GATKIntervals: explicit intervals, an interval file, empty
 * intervals, and the contig list, checked against the hg18 and hg19 references
 * configured in BaseTest.
 */
class GATKIntervalsUnitTest {
  // hg18 fixtures. Lazy, so each is only built the first time a test touches it.
  private final lazy val hg18Reference = new File(BaseTest.hg18Reference)
  private final lazy val hg18GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg18Reference))
  private final lazy val hg18ReferenceLocs = GenomeLocSortedSet.
    createSetFromSequenceDictionary(new ReferenceDataSource(hg18Reference).getReference.getSequenceDictionary).toList.toList

  // hg19 fixtures, built the same way as the hg18 ones.
  private final lazy val hg19Reference = new File(BaseTest.hg19Reference)
  private final lazy val hg19GenomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(hg19Reference))
  private final lazy val hg19ReferenceLocs = GenomeLocSortedSet.
    createSetFromSequenceDictionary(new ReferenceDataSource(hg19Reference).getReference.getSequenceDictionary).toList.toList

  /** Explicit interval strings yield the matching parsed locs, contigs and getSplits values. */
  @Test
  def testWithIntervals() {
    val chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1")
    val chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-3")
    val chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:3-5")

    val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
    Assert.assertEquals(gi.locs, List(chr1, chr2, chr3))
    Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
    Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
    Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
  }

  /** Loads the hg19 interval file and checks loc, split and contig counts within 30 seconds. */
  @Test(timeOut = 30000)
  def testIntervalFile() {
    val gi = new GATKIntervals(hg19Reference, List(BaseTest.hg19Intervals))
    Assert.assertEquals(gi.locs.size, 189894)
    // The timeout guards against a performance regression. A Java for-each such as
    //   for (Item item : javaConvertedScalaList)
    // over a Scala List converted to a java.util.List is an O(N^2) operation,
    // because the iterator calls the O(N) javaConvertedScalaList.size() on every
    // iteration of the loop.
    Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
    Assert.assertEquals(gi.contigs.size, 24)
  }

  /** With no intervals given, the locs equal the full set of hg18 reference locs. */
  @Test
  def testEmptyIntervals() {
    val gi = new GATKIntervals(hg18Reference, Nil)
    Assert.assertEquals(gi.locs, hg18ReferenceLocs)
    Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
    Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
    Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
  }

  /** contigs lists each contig once, even when several intervals share a contig. */
  @Test
  def testContigCounts() {
    Assert.assertEquals(new GATKIntervals(hg18Reference, Nil).contigs, hg18ReferenceLocs.map(_.getContig))
    Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1", "chr2", "chr3")).contigs, List("chr1", "chr2", "chr3"))
    Assert.assertEquals(new GATKIntervals(hg18Reference, List("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, List("chr1", "chr2", "chr3"))
  }
}