From 06cb20f2a5fd2681a95613ae0b8b8a53c6002f4b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 9 Sep 2011 12:56:45 -0400 Subject: [PATCH] Intermediate commit cleaning up scatter intervals -- Adding unit tests to ensure uniformity of intervals --- .../sting/utils/interval/IntervalUtils.java | 57 +- .../utils/interval/IntervalUtilsUnitTest.java | 1032 +++++++++-------- .../queue/extensions/gatk/GATKIntervals.scala | 130 +-- .../gatk/IntervalScatterFunction.scala | 4 +- .../gatk/GATKIntervalsUnitTest.scala | 10 +- 5 files changed, 658 insertions(+), 575 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f551e1368..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -334,24 +334,44 @@ public class IntervalUtils { } /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. + * Splits an interval list into multiple sublists. * @param locs The genome locs to split. * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. + * @return A list of lists of genome locs, split according to splits */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - int fileIndex = 0; + public static List> splitIntervalsToSubLists(List locs, List splits) { int locIndex = 1; int start = 0; + List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { - IntervalList intervalList = new IntervalList(fileHeader); + List curList = new ArrayList(); for (int i = start; i < stop; i++) - intervalList.add(toInterval(locs.get(i), locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); + curList.add(locs.get(i)); start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); } } @@ -361,17 +381,15 @@ public class IntervalUtils { * @param numParts Number of parts to split the locs into. * @return The stop points to split the genome locs. */ - public static List splitFixedIntervals(List locs, int numParts) { + public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - long locsSize = 0; - for (GenomeLoc loc: locs) - locsSize += loc.size(); - List splitPoints = new ArrayList(); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList(); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); Collections.sort(splitPoints); splitPoints.add(locs.size()); - return splitPoints; + return splitIntervalsToSubLists(locs, splitPoints); } private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { @@ -441,4 +459,11 @@ public class IntervalUtils { return merged; } } + + public static final long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bb892eec8..bd6bf9591 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest { private SAMFileHeader hg19Header; private GenomeLocParser hg19GenomeLocParser; private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return locs; + } @BeforeClass public void init() { @@ -54,511 +68,555 @@ public class IntervalUtilsUnitTest extends BaseTest { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); hg19GenomeLocParser = new GenomeLocParser(seq); hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false)); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(hg19Ref,ex); } } - @Test(expectedExceptions=UserException.class) - public void testMergeListsBySetOperatorNoOverlap() { - // a couple of lists we'll use for the testing - List listEveryTwoFromOne = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - else - listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; } - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 100); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 0); - } - - @Test - public void testMergeListsBySetOperatorAllOverlap() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 150); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 50); - } - - @Test - public void testMergeListsBySetOperator() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 5 == 0) { - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - } - - List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 40); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 20); - } - - @Test - public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); - Assert.assertEquals((long)lengths.get("chr1"), 247249719); - Assert.assertEquals((long)lengths.get("chr2"), 242951149); - Assert.assertEquals((long)lengths.get("chr3"), 199501827); - Assert.assertEquals((long)lengths.get("chr20"), 62435964); - Assert.assertEquals((long)lengths.get("chrX"), 154913754); - } - - private List getLocs(String... intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList(); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; - } - - @Test - public void testParseIntervalArguments() { - Assert.assertEquals(getLocs().size(), 45); - Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); - Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); - } - - @Test - public void testIsIntervalFile() { - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); - - List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); - for (String extension: extensions) { - Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); } } - @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) - public void testMissingIntervalFile() { - IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { +// new IntervalSlicingTest(1, 0); +// new IntervalSlicingTest(2, 0.1); + new IntervalSlicingTest(5, 0.1); +// new IntervalSlicingTest(10, 0.1); +// new IntervalSlicingTest(67, 0.1); +// new IntervalSlicingTest(100, 0.1); +// new IntervalSlicingTest(500, 0.1); +// new IntervalSlicingTest(1000, 0.1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); } - @Test - public void testFixedScatterIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + @Test(dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); - List files = testFiles("basic.", 3, ".intervals"); + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; - List locs = getLocs("chr1", "chr2", "chr3"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterFixedIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("less.", 3, ".intervals"); - - List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testSplitFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - IntervalUtils.splitFixedIntervals(locs, files.size()); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - } - @Test - public void testScatterFixedIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs2.get(0), chr1b); - Assert.assertEquals(locs3.get(0), chr2); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs3.get(0), chr2b); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3a); - Assert.assertEquals(locs3.get(0), chr3b); - } - - @Test - public void testScatterFixedIntervalsFile() { - List files = testFiles("sg.", 20, ".intervals"); - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - - int[] counts = { - 125, 138, 287, 291, 312, 105, 155, 324, - 295, 298, 141, 121, 285, 302, 282, 88, - 116, 274, 282, 248 -// 5169, 5573, 10017, 10567, 10551, -// 5087, 4908, 10120, 10435, 10399, -// 5391, 4735, 10621, 10352, 10654, -// 5227, 5256, 10151, 9649, 9825 - }; - - //String splitCounts = ""; - for (int lastIndex = 0, i = 0; i < splits.size(); i++) { - int splitIndex = splits.get(i); - int splitCount = (splitIndex - lastIndex); - //splitCounts += ", " + splitCount; - lastIndex = splitIndex; - Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); } - //System.out.println(splitCounts.substring(2)); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); - - int locIndex = 0; - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); - for (GenomeLoc parsedLoc: parsedLocs) - Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); - } - Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } - @Test - public void testScatterFixedIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - @Test - public void testScatterContigIntervalsOrder() { - List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr2); - Assert.assertEquals(locs2.get(0), chr1); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - - List files = testFiles("contig_basic.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("contig_less.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterContigIntervalsMoreFiles() { - List files = testFiles("contig_more.", 3, ".intervals"); - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); - } - - @Test - public void testScatterContigIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_start.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs1.get(1), chr1b); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_middle.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 2); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs2.get(1), chr2b); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("contig_split_end.", 3 ,".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3a); - Assert.assertEquals(locs3.get(1), chr3b); - } - - @Test - public void testScatterContigIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - private List testFiles(String prefix, int count, String suffix) { - ArrayList files = new ArrayList(); - for (int i = 1; i <= count; i++) { - files.add(createTempFile(prefix + i, suffix)); - } - return files; - } - - @DataProvider(name="unmergedIntervals") - public Object[][] getUnmergedIntervals() { - return new Object[][] { - new Object[] {"small_unmerged_picard_intervals.list"}, - new Object[] {"small_unmerged_gatk_intervals.list"} - }; - } - - @Test(dataProvider="unmergedIntervals") - public void testUnmergedIntervals(String unmergedIntervals) { - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); - Assert.assertEquals(locs.size(), 2); - - List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); - Assert.assertEquals(merged.size(), 1); - } +// @Test(expectedExceptions=UserException.class) +// public void testMergeListsBySetOperatorNoOverlap() { +// // a couple of lists we'll use for the testing +// List listEveryTwoFromOne = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 2 == 0) +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// else +// listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 100); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 0); +// } +// +// @Test +// public void testMergeListsBySetOperatorAllOverlap() { +// // a couple of lists we'll use for the testing +// List allSites = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 2 == 0) +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 150); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 50); +// } +// +// @Test +// public void testMergeListsBySetOperator() { +// // a couple of lists we'll use for the testing +// List allSites = new ArrayList(); +// List listEveryTwoFromTwo = new ArrayList(); +// +// // create the two lists we'll use +// for (int x = 1; x < 101; x++) { +// if (x % 5 == 0) { +// listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); +// } +// } +// +// List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); +// Assert.assertEquals(ret.size(), 40); +// ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); +// Assert.assertEquals(ret.size(), 20); +// } +// +// @Test +// public void testGetContigLengths() { +// Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); +// Assert.assertEquals((long)lengths.get("chr1"), 247249719); +// Assert.assertEquals((long)lengths.get("chr2"), 242951149); +// Assert.assertEquals((long)lengths.get("chr3"), 199501827); +// Assert.assertEquals((long)lengths.get("chr20"), 62435964); +// Assert.assertEquals((long)lengths.get("chrX"), 154913754); +// } +// +// @Test +// public void testParseIntervalArguments() { +// Assert.assertEquals(getLocs().size(), 45); +// Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); +// Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); +// } +// +// @Test +// public void testIsIntervalFile() { +// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list")); +// Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "empty_intervals.list", true)); +// +// List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); +// for (String extension: extensions) { +// Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); +// } +// } +// +// @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) +// public void testMissingIntervalFile() { +// IntervalUtils.isIntervalFile(BaseTest.validationDataLocation + "no_such_intervals.list"); +// } +// +// @Test +// public void testFixedScatterIntervalsBasic() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// +// List files = testFiles("basic.", 3, ".intervals"); +// +// List locs = getLocs("chr1", "chr2", "chr3"); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsLessFiles() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); +// +// List files = testFiles("less.", 3, ".intervals"); +// +// List locs = getLocs("chr1", "chr2", "chr3", "chr4"); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// Assert.assertEquals(locs3.get(1), chr4); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testSplitFixedIntervalsMoreFiles() { +// List files = testFiles("more.", 3, ".intervals"); +// List locs = getLocs("chr1", "chr2"); +// IntervalUtils.splitFixedIntervals(locs, files.size()); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testScatterFixedIntervalsMoreFiles() { +// List files = testFiles("more.", 3, ".intervals"); +// List locs = getLocs("chr1", "chr2"); +// List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// } +// @Test +// public void testScatterFixedIntervalsStart() { +// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); +// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); +// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1a); +// Assert.assertEquals(locs2.get(0), chr1b); +// Assert.assertEquals(locs3.get(0), chr2); +// Assert.assertEquals(locs3.get(1), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsMiddle() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); +// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2a); +// Assert.assertEquals(locs3.get(0), chr2b); +// Assert.assertEquals(locs3.get(1), chr3); +// } +// +// @Test +// public void testScatterFixedIntervalsEnd() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); +// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); +// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// List locs = getLocs(intervals); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 2); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs1.get(1), chr2); +// Assert.assertEquals(locs2.get(0), chr3a); +// Assert.assertEquals(locs3.get(0), chr3b); +// } +// +// @Test +// public void testScatterFixedIntervalsFile() { +// List files = testFiles("sg.", 20, ".intervals"); +// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); +// List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); +// +// int[] counts = { +// 125, 138, 287, 291, 312, 105, 155, 324, +// 295, 298, 141, 121, 285, 302, 282, 88, +// 116, 274, 282, 248 +//// 5169, 5573, 10017, 10567, 10551, +//// 5087, 4908, 10120, 10435, 10399, +//// 5391, 4735, 10621, 10352, 10654, +//// 5227, 5256, 10151, 9649, 9825 +// }; +// +// //String splitCounts = ""; +// for (int lastIndex = 0, i = 0; i < splits.size(); i++) { +// int splitIndex = splits.get(i); +// int splitCount = (splitIndex - lastIndex); +// //splitCounts += ", " + splitCount; +// lastIndex = splitIndex; +// Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); +// } +// //System.out.println(splitCounts.substring(2)); +// +// IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); +// +// int locIndex = 0; +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); +// for (GenomeLoc parsedLoc: parsedLocs) +// Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); +// } +// Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); +// } +// +// @Test +// public void testScatterFixedIntervalsMax() { +// List files = testFiles("sg.", 85, ".intervals"); +// List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); +// IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); +// +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); +// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); +// } +// } +// +// @Test +// public void testScatterContigIntervalsOrder() { +// List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("split.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr2); +// Assert.assertEquals(locs2.get(0), chr1); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsBasic() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// +// List files = testFiles("contig_basic.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsLessFiles() { +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); +// GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); +// +// List files = testFiles("contig_less.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// Assert.assertEquals(locs3.get(1), chr4); +// } +// +// @Test(expectedExceptions=UserException.BadArgumentValue.class) +// public void testScatterContigIntervalsMoreFiles() { +// List files = testFiles("contig_more.", 3, ".intervals"); +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); +// } +// +// @Test +// public void testScatterContigIntervalsStart() { +// List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); +// GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); +// GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("contig_split_start.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 2); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1a); +// Assert.assertEquals(locs1.get(1), chr1b); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsMiddle() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); +// GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); +// GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); +// +// List files = testFiles("contig_split_middle.", 3, ".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 2); +// Assert.assertEquals(locs3.size(), 1); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2a); +// Assert.assertEquals(locs2.get(1), chr2b); +// Assert.assertEquals(locs3.get(0), chr3); +// } +// +// @Test +// public void testScatterContigIntervalsEnd() { +// List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); +// GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); +// GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); +// GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); +// GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); +// +// List files = testFiles("contig_split_end.", 3 ,".intervals"); +// +// IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); +// +// List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); +// List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); +// List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); +// +// Assert.assertEquals(locs1.size(), 1); +// Assert.assertEquals(locs2.size(), 1); +// Assert.assertEquals(locs3.size(), 2); +// +// Assert.assertEquals(locs1.get(0), chr1); +// Assert.assertEquals(locs2.get(0), chr2); +// Assert.assertEquals(locs3.get(0), chr3a); +// Assert.assertEquals(locs3.get(1), chr3b); +// } +// +// @Test +// public void testScatterContigIntervalsMax() { +// List files = testFiles("sg.", 85, ".intervals"); +// IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); +// +// for (int i = 0; i < files.size(); i++) { +// String file = files.get(i).toString(); +// List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); +// Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); +// Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); +// } +// } +// +// private List testFiles(String prefix, int count, String suffix) { +// ArrayList files = new ArrayList(); +// for (int i = 1; i <= count; i++) { +// files.add(createTempFile(prefix + i, suffix)); +// } +// return files; +// } +// +// @DataProvider(name="unmergedIntervals") +// public Object[][] getUnmergedIntervals() { +// return new Object[][] { +// new Object[] {"small_unmerged_picard_intervals.list"}, +// new Object[] {"small_unmerged_gatk_intervals.list"} +// }; +// } +// +// @Test(dataProvider="unmergedIntervals") +// public void testUnmergedIntervals(String unmergedIntervals) { +// List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); +// Assert.assertEquals(locs.size(), 2); +// +// List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); +// Assert.assertEquals(merged.size(), 1); +// } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index aae5e438c..0fb997f43 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -1,65 +1,65 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.extensions.gatk - -import java.io.File -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource -import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} - -case class GATKIntervals(reference: File, intervals: List[String]) { - private lazy val referenceDataSource = new ReferenceDataSource(reference) - private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] - - lazy val samFileHeader = { - val header = new SAMFileHeader - header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) - header - } - - lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals, false) - Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) - } - - lazy val contigs = locs.map(_.getContig).distinct.toList - - def getSplits(size: Int) = { - splitsBySize.getOrElse(size, { - val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) - splitsBySize += size -> splits - splits - }) - } -} +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import java.io.File +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource +import net.sf.samtools.SAMFileHeader +import java.util.Collections +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} + +case class GATKIntervals(reference: File, intervals: List[String]) { + private lazy val referenceDataSource = new ReferenceDataSource(reference) +// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] + + lazy val samFileHeader = { + val header = new SAMFileHeader + header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) + header + } + + lazy val locs: java.util.List[GenomeLoc] = { + val parser = new GenomeLocParser(referenceDataSource.getReference) + val parsedLocs = + if (intervals.isEmpty) + GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList + else + IntervalUtils.parseIntervalArguments(parser, intervals, false) + Collections.sort(parsedLocs) + Collections.unmodifiableList(parsedLocs) + } + + lazy val contigs = locs.map(_.getContig).distinct.toList + +// def getSplits(size: Int) = { +// splitsBySize.getOrElse(size, { +// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) +// splitsBySize += size -> splits +// splits +// }) +// } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index d88d272b9..f65d5ab29 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, - gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) + val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b3a2d23ae..38abe24ef 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -53,8 +53,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) - Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) - Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) +// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) +// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } @Test(timeOut = 30000) @@ -65,7 +65,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. - Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -74,8 +74,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) - Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) - Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) +// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) +// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } @Test