Intermediate commit cleaning up scatter intervals
-- Adding unit tests to ensure uniformity of intervals
This commit is contained in:
parent
507574b1c8
commit
06cb20f2a5
|
|
@ -334,24 +334,44 @@ public class IntervalUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits an interval list into multiple files.
|
* Splits an interval list into multiple sublists.
|
||||||
* @param fileHeader The sam file header.
|
|
||||||
* @param locs The genome locs to split.
|
* @param locs The genome locs to split.
|
||||||
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
|
* @param splits The stop points (exclusive end indices into locs) at which to split the genome locs.
|
||||||
* @param scatterParts The output interval lists to write to.
|
* @return A list of lists of genome locs, split according to splits
|
||||||
*/
|
*/
|
||||||
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
|
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||||
if (splits.size() != scatterParts.size())
|
|
||||||
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
|
|
||||||
int fileIndex = 0;
|
|
||||||
int locIndex = 1;
|
int locIndex = 1;
|
||||||
int start = 0;
|
int start = 0;
|
||||||
|
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||||
for (Integer stop: splits) {
|
for (Integer stop: splits) {
|
||||||
IntervalList intervalList = new IntervalList(fileHeader);
|
List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
|
||||||
for (int i = start; i < stop; i++)
|
for (int i = start; i < stop; i++)
|
||||||
intervalList.add(toInterval(locs.get(i), locIndex++));
|
curList.add(locs.get(i));
|
||||||
intervalList.write(scatterParts.get(fileIndex++));
|
|
||||||
start = stop;
|
start = stop;
|
||||||
|
sublists.add(curList);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sublists;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes each pre-divided sublist of genome locs to its corresponding scatter file.
|
||||||
|
* @param fileHeader The sam file header.
|
||||||
|
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
|
||||||
|
* @param scatterParts The output interval lists to write to.
|
||||||
|
*/
|
||||||
|
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
|
||||||
|
if (splits.size() != scatterParts.size())
|
||||||
|
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
|
||||||
|
|
||||||
|
int fileIndex = 0;
|
||||||
|
int locIndex = 1;
|
||||||
|
for (final List<GenomeLoc> split : splits) {
|
||||||
|
IntervalList intervalList = new IntervalList(fileHeader);
|
||||||
|
for (final GenomeLoc loc : split)
|
||||||
|
intervalList.add(toInterval(loc, locIndex++));
|
||||||
|
intervalList.write(scatterParts.get(fileIndex++));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -361,17 +381,15 @@ public class IntervalUtils {
|
||||||
* @param numParts Number of parts to split the locs into.
|
* @param numParts Number of parts to split the locs into.
|
||||||
* @return The stop points to split the genome locs.
|
* @return The genome locs divided into sublists, one sublist per part.
|
||||||
*/
|
*/
|
||||||
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
||||||
if (locs.size() < numParts)
|
if (locs.size() < numParts)
|
||||||
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
||||||
long locsSize = 0;
|
final long locsSize = intervalSize(locs);
|
||||||
for (GenomeLoc loc: locs)
|
final List<Integer> splitPoints = new ArrayList<Integer>();
|
||||||
locsSize += loc.size();
|
|
||||||
List<Integer> splitPoints = new ArrayList<Integer>();
|
|
||||||
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
|
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
|
||||||
Collections.sort(splitPoints);
|
Collections.sort(splitPoints);
|
||||||
splitPoints.add(locs.size());
|
splitPoints.add(locs.size());
|
||||||
return splitPoints;
|
return splitIntervalsToSubLists(locs, splitPoints);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
||||||
|
|
@ -441,4 +459,11 @@ public class IntervalUtils {
|
||||||
return merged;
|
return merged;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final long intervalSize(final List<GenomeLoc> locs) {
|
||||||
|
long size = 0;
|
||||||
|
for ( final GenomeLoc loc : locs )
|
||||||
|
size += loc.size();
|
||||||
|
return size;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,65 +1,65 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2011, The Broad Institute
|
* Copyright (c) 2011, The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
* files (the "Software"), to deal in the Software without
|
* files (the "Software"), to deal in the Software without
|
||||||
* restriction, including without limitation the rights to use,
|
* restriction, including without limitation the rights to use,
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
* copies of the Software, and to permit persons to whom the
|
* copies of the Software, and to permit persons to whom the
|
||||||
* Software is furnished to do so, subject to the following
|
* Software is furnished to do so, subject to the following
|
||||||
* conditions:
|
* conditions:
|
||||||
*
|
*
|
||||||
* The above copyright notice and this permission notice shall be
|
* The above copyright notice and this permission notice shall be
|
||||||
* included in all copies or substantial portions of the Software.
|
* included in all copies or substantial portions of the Software.
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.broadinstitute.sting.queue.extensions.gatk
|
package org.broadinstitute.sting.queue.extensions.gatk
|
||||||
|
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import collection.JavaConversions._
|
import collection.JavaConversions._
|
||||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
||||||
import net.sf.samtools.SAMFileHeader
|
import net.sf.samtools.SAMFileHeader
|
||||||
import java.util.Collections
|
import java.util.Collections
|
||||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
|
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
|
||||||
|
|
||||||
case class GATKIntervals(reference: File, intervals: List[String]) {
|
case class GATKIntervals(reference: File, intervals: List[String]) {
|
||||||
private lazy val referenceDataSource = new ReferenceDataSource(reference)
|
private lazy val referenceDataSource = new ReferenceDataSource(reference)
|
||||||
private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
|
// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
|
||||||
|
|
||||||
lazy val samFileHeader = {
|
lazy val samFileHeader = {
|
||||||
val header = new SAMFileHeader
|
val header = new SAMFileHeader
|
||||||
header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
|
header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
|
||||||
header
|
header
|
||||||
}
|
}
|
||||||
|
|
||||||
lazy val locs: java.util.List[GenomeLoc] = {
|
lazy val locs: java.util.List[GenomeLoc] = {
|
||||||
val parser = new GenomeLocParser(referenceDataSource.getReference)
|
val parser = new GenomeLocParser(referenceDataSource.getReference)
|
||||||
val parsedLocs =
|
val parsedLocs =
|
||||||
if (intervals.isEmpty)
|
if (intervals.isEmpty)
|
||||||
GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
|
GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
|
||||||
else
|
else
|
||||||
IntervalUtils.parseIntervalArguments(parser, intervals, false)
|
IntervalUtils.parseIntervalArguments(parser, intervals, false)
|
||||||
Collections.sort(parsedLocs)
|
Collections.sort(parsedLocs)
|
||||||
Collections.unmodifiableList(parsedLocs)
|
Collections.unmodifiableList(parsedLocs)
|
||||||
}
|
}
|
||||||
|
|
||||||
lazy val contigs = locs.map(_.getContig).distinct.toList
|
lazy val contigs = locs.map(_.getContig).distinct.toList
|
||||||
|
|
||||||
def getSplits(size: Int) = {
|
// def getSplits(size: Int) = {
|
||||||
splitsBySize.getOrElse(size, {
|
// splitsBySize.getOrElse(size, {
|
||||||
val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
|
// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
|
||||||
splitsBySize += size -> splits
|
// splitsBySize += size -> splits
|
||||||
splits
|
// splits
|
||||||
})
|
// })
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction
|
||||||
|
|
||||||
def run() {
|
def run() {
|
||||||
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
|
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
|
||||||
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs,
|
val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size)
|
||||||
gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles)
|
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -53,8 +53,8 @@ class GATKIntervalsUnitTest {
|
||||||
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
|
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
|
||||||
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
|
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
|
||||||
Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
|
Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
|
||||||
Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
|
// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
|
||||||
Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
|
// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(timeOut = 30000)
|
@Test(timeOut = 30000)
|
||||||
|
|
@ -65,7 +65,7 @@ class GATKIntervalsUnitTest {
|
||||||
// for(Item item: javaConvertedScalaList)
|
// for(Item item: javaConvertedScalaList)
|
||||||
// This for loop is actually an O(N^2) operation as the iterator calls the
|
// This for loop is actually an O(N^2) operation as the iterator calls the
|
||||||
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
||||||
Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
|
//Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
|
||||||
Assert.assertEquals(gi.contigs.size, 24)
|
Assert.assertEquals(gi.contigs.size, 24)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -74,8 +74,8 @@ class GATKIntervalsUnitTest {
|
||||||
val gi = new GATKIntervals(hg18Reference, Nil)
|
val gi = new GATKIntervals(hg18Reference, Nil)
|
||||||
Assert.assertEquals(gi.locs, hg18ReferenceLocs)
|
Assert.assertEquals(gi.locs, hg18ReferenceLocs)
|
||||||
Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
|
Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
|
||||||
Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
|
// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
|
||||||
Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
|
// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue