diff --git a/build.xml b/build.xml index 1196f32dc..e5ad9daf0 100644 --- a/build.xml +++ b/build.xml @@ -852,8 +852,8 @@ - - + + diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 2cfcc19a9..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -333,6 +333,27 @@ public class IntervalUtils { throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); } + /** + * Splits an interval list into consecutive sublists. + * @param locs The genome locs to split. + * @param splits The exclusive stop index into locs for each sublist, expected in ascending order; locs at or past the last stop index are not returned. + * @return One sublist per entry in splits, in order; each sublist holds the locs from the previous stop index (0 for the first) up to, but not including, its own stop index. + */ + public static List> splitIntervalsToSubLists(List locs, List splits) { + int start = 0; + List> sublists = new ArrayList>(splits.size()); + for (Integer stop: splits) { + List curList = new ArrayList(); + for (int i = start; i < stop; i++) + curList.add(locs.get(i)); + start = stop; + sublists.add(curList); + } + + return sublists; + } + + /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. 
@@ -362,27 +383,39 @@ public class IntervalUtils { public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - final long locsSize = intervalSize(locs); - final double idealSplitSize = locsSize / numParts; - final List> splits = new ArrayList>(numParts); - final LinkedList remainingLocs = new LinkedList(locs); + final List splitPoints = new ArrayList(); + addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); + Collections.sort(splitPoints); + splitPoints.add(locs.size()); + return splitIntervalsToSubLists(locs, splitPoints); + } - for ( int i = 0; i < numParts; i++ ) { - long splitSize = 0; - List split = new ArrayList(); - while ( ! remainingLocs.isEmpty() ) { - final GenomeLoc toAdd = remainingLocs.pop(); - splitSize += toAdd.size(); - split.add(toAdd); - final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size(); - if ( splitSize + (i % 2 == 0 ? 
0 : nextEltSize) > idealSplitSize ) - break; - } - splits.add(split); + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { + if (numParts < 2) + return; + int halfParts = (numParts + 1) / 2; + Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts); + int splitIndex = splitPoint.first; + long splitSize = splitPoint.second; + splitPoints.add(splitIndex); + addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); + addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); + } + + private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) { + int splitIndex = startIndex; + long splitSize = 0; + for (int i = 0; i < minLocs; i++) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; } - - return splits; + long halfSize = locsSize / 2; + while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + return new Pair(splitIndex, splitSize); } /** diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 4809f1b5c..98b878d23 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -101,25 +100,18 @@ public class IntervalUtilsUnitTest extends BaseTest { @DataProvider(name = 
"intervalslicingdata") public Object[][] createTrees() { new IntervalSlicingTest(1, 0); - new IntervalSlicingTest(2, 0.1); - new IntervalSlicingTest(3, 0.1); - new IntervalSlicingTest(7, 0.1); - new IntervalSlicingTest(10, 0.1); - new IntervalSlicingTest(31, 0.1); - new IntervalSlicingTest(67, 0.1); - new IntervalSlicingTest(100, 0.1); - new IntervalSlicingTest(127, 0.1); - // starts to become a bit less efficiency with larger cuts - new IntervalSlicingTest(500, 0.5); + new IntervalSlicingTest(2, 1); + new IntervalSlicingTest(5, 1); + new IntervalSlicingTest(10, 1); + new IntervalSlicingTest(67, 1); + new IntervalSlicingTest(100, 1); + new IntervalSlicingTest(500, 1); new IntervalSlicingTest(1000, 1); - new IntervalSlicingTest(10000, 10); return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } - @Test(dataProvider = "intervalslicingdata") + @Test(enabled = true, dataProvider = "intervalslicingdata") public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { - Set locsSet = new HashSet(hg19exomeIntervals); - Set notFoundSet = new HashSet(hg19exomeIntervals); List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); @@ -134,15 +126,9 @@ public class IntervalUtilsUnitTest extends BaseTest { counter++; sumOfSplitSizes += splitSize; Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); - - for ( final GenomeLoc loc : split ) { - Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs"); - notFoundSet.remove(loc); - } } - Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals"); - Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in 
the split set"); + Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals"); // actual first, then expected, as in the other assertions in this class } @Test(expectedExceptions=UserException.class) @@ -246,7 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("basic.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3"); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -271,20 +258,21 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); + Assert.assertEquals(locs3.size(), 2); Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3); - 
Assert.assertEquals(locs3.get(0), chr4); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); } @Test(expectedExceptions=UserException.BadArgumentValue.class) @@ -298,7 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsMoreFiles() { List files = testFiles("more.", 3, ".intervals"); List locs = getLocs("chr1", "chr2"); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); } @Test public void testScatterFixedIntervalsStart() { @@ -311,7 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -338,7 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = 
IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -365,7 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -399,7 +391,7 @@ public class IntervalUtilsUnitTest extends BaseTest { //String splitCounts = ""; for (int i = 0; i < splits.size(); i++) { - long splitCount = splits.get(i).size(); + int splitCount = splits.get(i).size(); Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); } //System.out.println(splitCounts.substring(2)); @@ -420,7 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsMax() { List files = testFiles("sg.", 85, ".intervals"); - IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files); + List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); + IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString();