package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * test out the interval utility methods */ public class IntervalUtilsUnitTest extends BaseTest { // used to seed the genome loc parser with a sequence dictionary private static File reference = new File(BaseTest.hg18Reference); private GenomeLocParser genomeLocParser; @BeforeClass public void init() { try { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference); genomeLocParser = new GenomeLocParser(seq); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(reference,ex); } } @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing List listEveryTwoFromOne = new ArrayList(); List listEveryTwoFromTwo = new ArrayList(); // create the two lists we'll use for (int x = 1; x < 101; x++) { if (x % 2 == 0) listEveryTwoFromTwo.add(genomeLocParser.createGenomeLoc("chr1",x,x)); else listEveryTwoFromOne.add(genomeLocParser.createGenomeLoc("chr1",x,x)); } List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); Assert.assertEquals(ret.size(), 100); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 0); } @Test public void testMergeListsBySetOperatorAllOverlap() { // a couple of lists we'll use for the testing List allSites = new ArrayList(); List listEveryTwoFromTwo = new ArrayList(); // create the two lists we'll use for (int x = 1; x < 101; x++) { if (x % 2 == 0) listEveryTwoFromTwo.add(genomeLocParser.createGenomeLoc("chr1",x,x)); allSites.add(genomeLocParser.createGenomeLoc("chr1",x,x)); } List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); Assert.assertEquals(ret.size(), 150); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 50); } @Test public void testMergeListsBySetOperator() { // a couple of lists we'll use for the testing List allSites = new ArrayList(); List listEveryTwoFromTwo = new ArrayList(); // create the two lists we'll use for (int x = 1; x < 101; x++) { if (x % 5 == 0) { listEveryTwoFromTwo.add(genomeLocParser.createGenomeLoc("chr1",x,x)); allSites.add(genomeLocParser.createGenomeLoc("chr1",x,x)); } } List ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); Assert.assertEquals(ret.size(), 40); ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); Assert.assertEquals(ret.size(), 20); } @Test public void testCountContigs() { List chrs = new ArrayList(); for (int i = 1; i <= 22; i++) chrs.add("chr" + i); chrs.add("chrX"); chrs.add("chrY"); List chrsNoRandom = Arrays.asList("chr12", "chr14", "chr20", "chrY"); List chrsWithRandom = new ArrayList(); chrsWithRandom.add("chrM"); chrsWithRandom.addAll(chrs); for (String chr: chrs) if(!chrsNoRandom.contains(chr)) chrsWithRandom.add(chr + "_random"); Assert.assertEquals(IntervalUtils.distinctContigs(reference), chrsWithRandom); Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList(BaseTest.validationDataLocation + "TCGA-06-0188.interval_list")), chrs); Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr1:1-1", "chr2:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3")); Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3")); } @Test public void testBasicScatter() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); List files = testFiles("basic.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, false); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3); } @Test public void testScatterLessFiles() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4"); List files = testFiles("less.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, false); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 2); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs1.get(1), chr2); Assert.assertEquals(locs2.get(0), chr3); Assert.assertEquals(locs3.get(0), chr4); } @Test(expectedExceptions=UserException.BadArgumentValue.class) public void testScatterMoreFiles() { List files = testFiles("more.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, false); } @Test public void testScatterIntervals() { List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); List files = testFiles("split.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 2); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1a); Assert.assertEquals(locs1.get(1), chr1b); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3); } @Test(enabled=false) // disabled, GenomeLoc.compareTo() returns 0 for two locs with the same start, causing an exception in GLSS.add(). public void testScatterIntervalsWithTheSameStart() { List files = testFiles("sg.", 20, ".intervals"); IntervalUtils.scatterIntervalArguments(new File(hg18Reference), Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), files, false); } @Test public void testScatterOrder() { List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); List files = testFiles("split.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3); } @Test public void testBasicScatterByContig() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); List files = testFiles("contig_basic.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3); } @Test public void testScatterByContigLessFiles() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4"); List files = testFiles("contig_less.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 2); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3); Assert.assertEquals(locs3.get(1), chr4); } @Test(expectedExceptions=UserException.BadArgumentValue.class) public void testScatterByContigMoreFiles() { List files = testFiles("contig_more.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, true); } @Test public void testScatterByContigIntervalsStart() { List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); List files = testFiles("contig_split_start.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 2); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1a); Assert.assertEquals(locs1.get(1), chr1b); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3); } @Test public void testScatterByContigIntervalsMiddle() { List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2"); GenomeLoc chr2b = genomeLocParser.parseGenomeInterval("chr2:4-5"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); List files = testFiles("contig_split_middle.", 3, ".intervals"); IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 2); Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs2.get(0), chr2a); Assert.assertEquals(locs2.get(1), chr2b); Assert.assertEquals(locs3.get(0), chr3); } @Test public void testScatterByContigIntervalsEnd() { List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2"); GenomeLoc chr3a = genomeLocParser.parseGenomeInterval("chr3:1-2"); GenomeLoc chr3b = genomeLocParser.parseGenomeInterval("chr3:4-5"); List files = testFiles("contig_split_end.", 3 ,".intervals"); IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); Assert.assertEquals(locs3.size(), 2); Assert.assertEquals(locs1.get(0), chr1); Assert.assertEquals(locs2.get(0), chr2); Assert.assertEquals(locs3.get(0), chr3a); Assert.assertEquals(locs3.get(1), chr3b); } private List testFiles(String prefix, int count, String suffix) { ArrayList files = new ArrayList(); for (int i = 1; i <= count; i++) { files.add(createTempFile(prefix + i, suffix)); } return files; } }