Intermediate commit cleaning up scatter intervals

-- Adding unit tests to ensure uniformity of intervals
This commit is contained in:
Mark DePristo 2011-09-09 12:56:45 -04:00
parent 507574b1c8
commit 06cb20f2a5
5 changed files with 658 additions and 575 deletions

View File

@ -334,24 +334,44 @@ public class IntervalUtils {
} }
/** /**
* Splits an interval list into multiple files. * Splits an interval list into multiple sublists.
* @param fileHeader The sam file header.
* @param locs The genome locs to split. * @param locs The genome locs to split.
* @param splits The stop points for the genome locs returned by splitFixedIntervals. * @param splits The stop points for the genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to. * @return A list of lists of genome locs, split according to splits
*/ */
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) { public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
if (splits.size() != scatterParts.size())
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
int fileIndex = 0;
int locIndex = 1; int locIndex = 1;
int start = 0; int start = 0;
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
for (Integer stop: splits) { for (Integer stop: splits) {
IntervalList intervalList = new IntervalList(fileHeader); List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
for (int i = start; i < stop; i++) for (int i = start; i < stop; i++)
intervalList.add(toInterval(locs.get(i), locIndex++)); curList.add(locs.get(i));
intervalList.write(scatterParts.get(fileIndex++));
start = stop; start = stop;
sublists.add(curList);
}
return sublists;
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
*/
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
if (splits.size() != scatterParts.size())
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
int fileIndex = 0;
int locIndex = 1;
for (final List<GenomeLoc> split : splits) {
IntervalList intervalList = new IntervalList(fileHeader);
for (final GenomeLoc loc : split)
intervalList.add(toInterval(loc, locIndex++));
intervalList.write(scatterParts.get(fileIndex++));
} }
} }
@ -361,17 +381,15 @@ public class IntervalUtils {
* @param numParts Number of parts to split the locs into. * @param numParts Number of parts to split the locs into.
* @return The stop points to split the genome locs. * @return The stop points to split the genome locs.
*/ */
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) { public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
if (locs.size() < numParts) if (locs.size() < numParts)
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
long locsSize = 0; final long locsSize = intervalSize(locs);
for (GenomeLoc loc: locs) final List<Integer> splitPoints = new ArrayList<Integer>();
locsSize += loc.size();
List<Integer> splitPoints = new ArrayList<Integer>();
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
Collections.sort(splitPoints); Collections.sort(splitPoints);
splitPoints.add(locs.size()); splitPoints.add(locs.size());
return splitPoints; return splitIntervalsToSubLists(locs, splitPoints);
} }
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) { private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
@ -441,4 +459,11 @@ public class IntervalUtils {
return merged; return merged;
} }
} }
/**
 * Computes the combined size of a list of genome locs.
 *
 * @param locs The genome locs whose sizes are added together.
 * @return The sum of {@code size()} over every loc in {@code locs}; 0 when the list is empty.
 */
public static final long intervalSize(final List<GenomeLoc> locs) {
    long total = 0;
    for ( final GenomeLoc interval : locs ) {
        total += interval.size();
    }
    return total;
}
} }

View File

@ -1,65 +1,65 @@
/* /*
* Copyright (c) 2011, The Broad Institute * Copyright (c) 2011, The Broad Institute
* *
* Permission is hereby granted, free of charge, to any person * Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation * obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without * files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, * restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell * copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the * copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following * Software is furnished to do so, subject to the following
* conditions: * conditions:
* *
* The above copyright notice and this permission notice shall be * The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software. * included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE. * OTHER DEALINGS IN THE SOFTWARE.
*/ */
package org.broadinstitute.sting.queue.extensions.gatk package org.broadinstitute.sting.queue.extensions.gatk
import java.io.File import java.io.File
import collection.JavaConversions._ import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils import org.broadinstitute.sting.utils.interval.IntervalUtils
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
import net.sf.samtools.SAMFileHeader import net.sf.samtools.SAMFileHeader
import java.util.Collections import java.util.Collections
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
case class GATKIntervals(reference: File, intervals: List[String]) { case class GATKIntervals(reference: File, intervals: List[String]) {
private lazy val referenceDataSource = new ReferenceDataSource(reference) private lazy val referenceDataSource = new ReferenceDataSource(reference)
private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] // private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
lazy val samFileHeader = { lazy val samFileHeader = {
val header = new SAMFileHeader val header = new SAMFileHeader
header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
header header
} }
lazy val locs: java.util.List[GenomeLoc] = { lazy val locs: java.util.List[GenomeLoc] = {
val parser = new GenomeLocParser(referenceDataSource.getReference) val parser = new GenomeLocParser(referenceDataSource.getReference)
val parsedLocs = val parsedLocs =
if (intervals.isEmpty) if (intervals.isEmpty)
GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
else else
IntervalUtils.parseIntervalArguments(parser, intervals, false) IntervalUtils.parseIntervalArguments(parser, intervals, false)
Collections.sort(parsedLocs) Collections.sort(parsedLocs)
Collections.unmodifiableList(parsedLocs) Collections.unmodifiableList(parsedLocs)
} }
lazy val contigs = locs.map(_.getContig).distinct.toList lazy val contigs = locs.map(_.getContig).distinct.toList
def getSplits(size: Int) = { // def getSplits(size: Int) = {
splitsBySize.getOrElse(size, { // splitsBySize.getOrElse(size, {
val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) // val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
splitsBySize += size -> splits // splitsBySize += size -> splits
splits // splits
}) // })
} // }
} }

View File

@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction
def run() { def run() {
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size)
gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
} }
} }

View File

@ -53,8 +53,8 @@ class GATKIntervalsUnitTest {
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) // Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) // Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
} }
@Test(timeOut = 30000) @Test(timeOut = 30000)
@ -65,7 +65,7 @@ class GATKIntervalsUnitTest {
// for(Item item: javaConvertedScalaList) // for(Item item: javaConvertedScalaList)
// This for loop is actually an O(N^2) operation as the iterator calls the // This for loop is actually an O(N^2) operation as the iterator calls the
// O(N) javaConvertedScalaList.size() for each iteration of the loop. // O(N) javaConvertedScalaList.size() for each iteration of the loop.
Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
Assert.assertEquals(gi.contigs.size, 24) Assert.assertEquals(gi.contigs.size, 24)
} }
@ -74,8 +74,8 @@ class GATKIntervalsUnitTest {
val gi = new GATKIntervals(hg18Reference, Nil) val gi = new GATKIntervals(hg18Reference, Nil)
Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.locs, hg18ReferenceLocs)
Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) // Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) // Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
} }
@Test @Test