diff --git a/build.xml b/build.xml index beca6bce0..efefdd438 100644 --- a/build.xml +++ b/build.xml @@ -855,8 +855,8 @@ - - + + diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 41cbbe59f..2cfcc19a9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -333,28 +333,6 @@ public class IntervalUtils { throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); } - /** - * Splits an interval list into multiple sublists. - * @param locs The genome locs to split. - * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @return A list of lists of genome locs, split according to splits - */ - public static List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; - int start = 0; - List> sublists = new ArrayList>(splits.size()); - for (Integer stop: splits) { - List curList = new ArrayList(); - for (int i = start; i < stop; i++) - curList.add(locs.get(i)); - start = stop; - sublists.add(curList); - } - - return sublists; - } - - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. @@ -384,39 +362,27 @@ public class IntervalUtils { public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); + final long locsSize = intervalSize(locs); - final List splitPoints = new ArrayList(); - addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); - Collections.sort(splitPoints); - splitPoints.add(locs.size()); - return splitIntervalsToSubLists(locs, splitPoints); - } + final double idealSplitSize = locsSize / numParts; + final List> splits = new ArrayList>(numParts); + final LinkedList remainingLocs = new LinkedList(locs); - private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { - if (numParts < 2) - return; - int halfParts = (numParts + 1) / 2; - Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts); - int splitIndex = splitPoint.first; - long splitSize = splitPoint.second; - splitPoints.add(splitIndex); - addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); - addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); - } + for ( int i = 0; i < numParts; i++ ) { + long splitSize = 0; + List split = new ArrayList(); + while ( ! remainingLocs.isEmpty() ) { + final GenomeLoc toAdd = remainingLocs.pop(); + splitSize += toAdd.size(); + split.add(toAdd); + final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size(); + if ( splitSize + (i % 2 == 0 ? 0 : nextEltSize) > idealSplitSize ) + break; + } + splits.add(split); + } - private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) { - int splitIndex = startIndex; - long splitSize = 0; - for (int i = 0; i < minLocs; i++) { - splitSize += locs.get(splitIndex).size(); - splitIndex++; - } - long halfSize = locsSize / 2; - while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) { - splitSize += locs.get(splitIndex).size(); - splitIndex++; - } - return new Pair(splitIndex, splitSize); + return splits; } /** diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bd6bf9591..4809f1b5c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -99,19 +100,26 @@ public class IntervalUtilsUnitTest extends BaseTest { @DataProvider(name = "intervalslicingdata") public Object[][] createTrees() { -// new IntervalSlicingTest(1, 0); -// new IntervalSlicingTest(2, 0.1); - new IntervalSlicingTest(5, 0.1); -// new IntervalSlicingTest(10, 0.1); -// new IntervalSlicingTest(67, 0.1); -// new IntervalSlicingTest(100, 0.1); -// new IntervalSlicingTest(500, 0.1); -// new IntervalSlicingTest(1000, 0.1); + new IntervalSlicingTest(1, 0); + new IntervalSlicingTest(2, 0.1); + new IntervalSlicingTest(3, 0.1); + new IntervalSlicingTest(7, 0.1); + new IntervalSlicingTest(10, 0.1); + new IntervalSlicingTest(31, 0.1); + new IntervalSlicingTest(67, 0.1); + new IntervalSlicingTest(100, 0.1); + new IntervalSlicingTest(127, 0.1); + // starts to become a bit less efficiency with larger cuts + new IntervalSlicingTest(500, 0.5); + new IntervalSlicingTest(1000, 1); + new IntervalSlicingTest(10000, 10); return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } @Test(dataProvider = "intervalslicingdata") public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + Set locsSet = new HashSet(hg19exomeIntervals); + Set notFoundSet = new HashSet(hg19exomeIntervals); List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); @@ -122,501 +130,497 @@ public class IntervalUtilsUnitTest extends BaseTest { for ( final List split : splits ) { long splitSize = IntervalUtils.intervalSize(split); double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); - logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); counter++; sumOfSplitSizes += splitSize; Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); + + for ( final GenomeLoc loc : split ) { + Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs"); + notFoundSet.remove(loc); + } } - Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); + Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals"); + Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in the split set"); } -// @Test(expectedExceptions=UserException.class) -// public void testMergeListsBySetOperatorNoOverlap() { -// // a couple of lists we'll use for the testing -// List listEveryTwoFromOne = new ArrayList(); -// List listEveryTwoFromTwo = new ArrayList(); -// -// // create the two lists we'll use -// for (int x = 1; x < 101; x++) { -// if (x % 2 == 0) -// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// else -// listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// } -// -// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); -// Assert.assertEquals(ret.size(), 100); -// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); -// Assert.assertEquals(ret.size(), 0); -// } -// -// @Test -// public void testMergeListsBySetOperatorAllOverlap() { -// // a couple of lists we'll use for the testing -// List allSites = new ArrayList(); -// List listEveryTwoFromTwo = new ArrayList(); -// -// // create the two lists we'll use -// for (int x = 1; x < 101; x++) { -// if (x % 2 == 0) -// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// } -// -// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); -// Assert.assertEquals(ret.size(), 150); -// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); -// Assert.assertEquals(ret.size(), 50); -// } -// -// @Test -// public void testMergeListsBySetOperator() { -// // a couple of lists we'll use for the testing -// List allSites = new ArrayList(); -// List listEveryTwoFromTwo = new ArrayList(); -// -// // create the two lists we'll use -// for (int x = 1; x < 101; x++) { -// if (x % 5 == 0) { -// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); -// } -// } -// -// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); -// Assert.assertEquals(ret.size(), 40); -// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); -// Assert.assertEquals(ret.size(), 20); -// } -// -// @Test -// public void testGetContigLengths() { -// Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); -// Assert.assertEquals((long)lengths.get("chr1"), 247249719); -// Assert.assertEquals((long)lengths.get("chr2"), 242951149); -// Assert.assertEquals((long)lengths.get("chr3"), 199501827); -// Assert.assertEquals((long)lengths.get("chr20"), 62435964); -// Assert.assertEquals((long)lengths.get("chrX"), 154913754); -// } -// -// @Test -// public void testParseIntervalArguments() { -// Assert.assertEquals(getLocs().size(), 45); -// Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); -// Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); -// } -// -// @Test -// public void testIsIntervalFile() { -// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); -// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); -// -// List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); -// for (String extension: extensions) { -// Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); -// } -// } -// -// @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) -// public void testMissingIntervalFile() { -// IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); -// } -// -// @Test -// public void testFixedScatterIntervalsBasic() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// -// List files = testFiles("basic.", 3, ".intervals"); -// -// List locs = getLocs("chr1", "chr2", "chr3"); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterFixedIntervalsLessFiles() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); -// -// List files = testFiles("less.", 3, ".intervals"); -// -// List locs = getLocs("chr1", "chr2", "chr3", "chr4"); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// Assert.assertEquals(locs3.get(1), chr4); -// } -// -// @Test(expectedExceptions=UserException.BadArgumentValue.class) -// public void testSplitFixedIntervalsMoreFiles() { -// List files = testFiles("more.", 3, ".intervals"); -// List locs = getLocs("chr1", "chr2"); -// IntervalUtils.splitFixedIntervals(locs, files.size()); -// } -// -// @Test(expectedExceptions=UserException.BadArgumentValue.class) -// public void testScatterFixedIntervalsMoreFiles() { -// List files = testFiles("more.", 3, ".intervals"); -// List locs = getLocs("chr1", "chr2"); -// List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// } -// @Test -// public void testScatterFixedIntervalsStart() { -// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); -// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); -// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// List locs = getLocs(intervals); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1a); -// Assert.assertEquals(locs2.get(0), chr1b); -// Assert.assertEquals(locs3.get(0), chr2); -// Assert.assertEquals(locs3.get(1), chr3); -// } -// -// @Test -// public void testScatterFixedIntervalsMiddle() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); -// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// List locs = getLocs(intervals); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2a); -// Assert.assertEquals(locs3.get(0), chr2b); -// Assert.assertEquals(locs3.get(1), chr3); -// } -// -// @Test -// public void testScatterFixedIntervalsEnd() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); -// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); -// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// List locs = getLocs(intervals); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 2); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs1.get(1), chr2); -// Assert.assertEquals(locs2.get(0), chr3a); -// Assert.assertEquals(locs3.get(0), chr3b); -// } -// -// @Test -// public void testScatterFixedIntervalsFile() { -// List files = testFiles("sg.", 20, ".intervals"); -// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); -// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); -// -// int[] counts = { -// 125, 138, 287, 291, 312, 105, 155, 324, -// 295, 298, 141, 121, 285, 302, 282, 88, -// 116, 274, 282, 248 -//// 5169, 5573, 10017, 10567, 10551, -//// 5087, 4908, 10120, 10435, 10399, -//// 5391, 4735, 10621, 10352, 10654, -//// 5227, 5256, 10151, 9649, 9825 -// }; -// -// //String splitCounts = ""; -// for (int lastIndex = 0, i = 0; i < splits.size(); i++) { -// int splitIndex = splits.get(i); -// int splitCount = (splitIndex - lastIndex); -// //splitCounts += ", " + splitCount; -// lastIndex = splitIndex; -// Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); -// } -// //System.out.println(splitCounts.substring(2)); -// -// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); -// -// int locIndex = 0; -// for (int i = 0; i < files.size(); i++) { -// String file = files.get(i).toString(); -// List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); -// Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); -// for (GenomeLoc parsedLoc: parsedLocs) -// Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); -// } -// Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); -// } -// -// @Test -// public void testScatterFixedIntervalsMax() { -// List files = testFiles("sg.", 85, ".intervals"); -// List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); -// IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); -// -// for (int i = 0; i < files.size(); i++) { -// String file = files.get(i).toString(); -// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); -// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); -// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); -// } -// } -// -// @Test -// public void testScatterContigIntervalsOrder() { -// List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("split.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr2); -// Assert.assertEquals(locs2.get(0), chr1); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsBasic() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// -// List files = testFiles("contig_basic.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsLessFiles() { -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); -// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); -// -// List files = testFiles("contig_less.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// Assert.assertEquals(locs3.get(1), chr4); -// } -// -// @Test(expectedExceptions=UserException.BadArgumentValue.class) -// public void testScatterContigIntervalsMoreFiles() { -// List files = testFiles("contig_more.", 3, ".intervals"); -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); -// } -// -// @Test -// public void testScatterContigIntervalsStart() { -// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); -// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); -// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("contig_split_start.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 2); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1a); -// Assert.assertEquals(locs1.get(1), chr1b); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsMiddle() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); -// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); -// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); -// -// List files = testFiles("contig_split_middle.", 3, ".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 2); -// Assert.assertEquals(locs3.size(), 1); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2a); -// Assert.assertEquals(locs2.get(1), chr2b); -// Assert.assertEquals(locs3.get(0), chr3); -// } -// -// @Test -// public void testScatterContigIntervalsEnd() { -// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); -// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); -// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); -// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); -// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); -// -// List files = testFiles("contig_split_end.", 3 ,".intervals"); -// -// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); -// -// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); -// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); -// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); -// -// Assert.assertEquals(locs1.size(), 1); -// Assert.assertEquals(locs2.size(), 1); -// Assert.assertEquals(locs3.size(), 2); -// -// Assert.assertEquals(locs1.get(0), chr1); -// Assert.assertEquals(locs2.get(0), chr2); -// Assert.assertEquals(locs3.get(0), chr3a); -// Assert.assertEquals(locs3.get(1), chr3b); -// } -// -// @Test -// public void testScatterContigIntervalsMax() { -// List files = testFiles("sg.", 85, ".intervals"); -// IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); -// -// for (int i = 0; i < files.size(); i++) { -// String file = files.get(i).toString(); -// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); -// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); -// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); -// } -// } -// -// private List testFiles(String prefix, int count, String suffix) { -// ArrayList files = new ArrayList(); -// for (int i = 1; i <= count; i++) { -// files.add(createTempFile(prefix + i, suffix)); -// } -// return files; -// } -// -// @DataProvider(name="unmergedIntervals") -// public Object[][] getUnmergedIntervals() { -// return new Object[][] { -// new Object[] {"small_unmerged_picard_intervals.list"}, -// new Object[] {"small_unmerged_gatk_intervals.list"} -// }; -// } -// -// @Test(dataProvider="unmergedIntervals") -// public void testUnmergedIntervals(String unmergedIntervals) { -// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); -// Assert.assertEquals(locs.size(), 2); -// -// List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); -// Assert.assertEquals(merged.size(), 1); -// } + @Test(expectedExceptions=UserException.class) + public void testMergeListsBySetOperatorNoOverlap() { + // a couple of lists we'll use for the testing + List listEveryTwoFromOne = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + else + listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 100); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 0); + } + + @Test + public void testMergeListsBySetOperatorAllOverlap() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 150); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 50); + } + + @Test + public void testMergeListsBySetOperator() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 5 == 0) { + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + } + + List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 40); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 20); + } + + @Test + public void testGetContigLengths() { + Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); + Assert.assertEquals((long)lengths.get("chr1"), 247249719); + Assert.assertEquals((long)lengths.get("chr2"), 242951149); + Assert.assertEquals((long)lengths.get("chr3"), 199501827); + Assert.assertEquals((long)lengths.get("chr20"), 62435964); + Assert.assertEquals((long)lengths.get("chrX"), 154913754); + } + + @Test + public void testParseIntervalArguments() { + Assert.assertEquals(getLocs().size(), 45); + Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); + Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); + } + + @Test + public void testIsIntervalFile() { + Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); + Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); + + List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); + for (String extension: extensions) { + Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); + } + } + + @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) + public void testMissingIntervalFile() { + IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); + } + + @Test + public void testFixedScatterIntervalsBasic() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + + List files = testFiles("basic.", 3, ".intervals"); + + List locs = getLocs("chr1", "chr2", "chr3"); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterFixedIntervalsLessFiles() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); + + List files = testFiles("less.", 3, ".intervals"); + + List locs = getLocs("chr1", "chr2", "chr3", "chr4"); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3); + Assert.assertEquals(locs3.get(0), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testSplitFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.splitFixedIntervals(locs, files.size()); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files); + } + @Test + public void testScatterFixedIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs2.get(0), chr1b); + Assert.assertEquals(locs3.get(0), chr2); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs3.get(0), chr2b); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3a); + Assert.assertEquals(locs3.get(0), chr3b); + } + + @Test + public void testScatterFixedIntervalsFile() { + List files = testFiles("sg.", 20, ".intervals"); + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + + int[] counts = { + 125, 138, 287, 291, 312, 105, 155, 324, + 295, 298, 141, 121, 285, 302, 282, 88, + 116, 274, 282, 248 +// 5169, 5573, 10017, 10567, 10551, +// 5087, 4908, 10120, 10435, 10399, +// 5391, 4735, 10621, 10352, 10654, +// 5227, 5256, 10151, 9649, 9825 + }; + + //String splitCounts = ""; + for (int i = 0; i < splits.size(); i++) { + long splitCount = splits.get(i).size(); + Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + } + //System.out.println(splitCounts.substring(2)); + + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + int locIndex = 0; + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); + for (GenomeLoc parsedLoc: parsedLocs) + Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); + } + Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + } + + @Test + public void testScatterFixedIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + + @Test + public void testScatterContigIntervalsOrder() { + List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr2); + Assert.assertEquals(locs2.get(0), chr1); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsBasic() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + + List files = testFiles("contig_basic.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsLessFiles() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); + + List files = testFiles("contig_less.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterContigIntervalsMoreFiles() { + List files = testFiles("contig_more.", 3, ".intervals"); + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); + } + + @Test + public void testScatterContigIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("contig_split_start.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs1.get(1), chr1b); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("contig_split_middle.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 2); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs2.get(1), chr2b); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); + + List files = testFiles("contig_split_end.", 3 ,".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3a); + Assert.assertEquals(locs3.get(1), chr3b); + } + + @Test + public void testScatterContigIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + + private List testFiles(String prefix, int count, String suffix) { + ArrayList files = new ArrayList(); + for (int i = 1; i <= count; i++) { + files.add(createTempFile(prefix + i, suffix)); + } + return files; + } + + @DataProvider(name="unmergedIntervals") + public Object[][] getUnmergedIntervals() { + return new Object[][] { + new Object[] {"small_unmerged_picard_intervals.list"}, + new Object[] {"small_unmerged_gatk_intervals.list"} + }; + } + + @Test(dataProvider="unmergedIntervals") + public void testUnmergedIntervals(String unmergedIntervals) { + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); + Assert.assertEquals(locs.size(), 2); + + List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + Assert.assertEquals(merged.size(), 1); + } }