diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java new file mode 100644 index 000000000..76f5ac4f2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.interval; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; +import org.broadinstitute.sting.gatk.refdata.utils.StringToGenomeLocIteratorAdapter; + +import java.util.Iterator; +import java.io.File; +import java.io.FileNotFoundException; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Jun 11, 2010 + * Time: 2:56:29 PM + * To change this template use File | Settings | File Templates. + */ + +/** This iterator reads intervals from interval file (can be gatk-style + * interval list or a bed file) and merges them on the fly. Very much alike + * IntervalUtils.sortAndMergeIntervals() but the list is read sequentially + * from a file upon request instead of loading the whole list into memory. + * Intervals in the underlying file MUST be + * pre-sorted into the reference order (they can overlap though, as this + * iterator is a merging one). + */ +public class IntervalFileMergingIterator implements Iterator { + + private PushbackIterator it ; + private IntervalMergingRule myRule; + + public IntervalFileMergingIterator(File f, IntervalMergingRule rule) { + try { + XReadLines reader = new XReadLines(f); + + if (f.getName().toUpperCase().endsWith(".BED")) { + it = new PushbackIterator( new StringToGenomeLocIteratorAdapter( reader.iterator(), + StringToGenomeLocIteratorAdapter.FORMAT.BED ) ) ; + } else { + it = new PushbackIterator( new StringToGenomeLocIteratorAdapter( reader.iterator(), + StringToGenomeLocIteratorAdapter.FORMAT.GATK ) ) ; + } + } catch ( FileNotFoundException e ) { + throw new StingException("Can not open interval file "+f); + } + myRule = rule; + } + + public boolean hasNext() { + return it.hasNext(); + } + + /** Returns next merged interval from the underlying interval file. In other words, keeps reading intervals + * for as long as they overlap and returns a single merged interval encompassing the set of overlapping + * intervals read from the file. Non-overlapping intervals are returned as is. This method will throw an + * exception if it runs into an interval that is out of order. + * @return + */ + public GenomeLoc next() { + + GenomeLoc current = it.next(); + + while ( it.hasNext() ) { + GenomeLoc next = it.next(); + + if ( next.isBefore(current)) { + throw new StingException("Interval "+next+" in the interval file is out of order."); + } + + if (current.overlapsP(next)) { + current = current.merge(next); + } else if (current.contiguousP(next) && myRule == IntervalMergingRule.ALL) { + current = current.merge(next); + } else { + it.pushback(next); + break; + } + } + + return current; + } + + public void remove() { + throw new UnsupportedOperationException("method 'remove' is not supported by this iterator"); + } + +} diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java new file mode 100644 index 000000000..5631b2b5a --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.interval; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.Assert; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; + +import java.io.File; +import java.util.Iterator; +import java.util.List; +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Jun 14, 2010 + * Time: 10:15:52 AM + * To change this template use File | Settings | File Templates. + */ +public class IntervalFileMergingIteratorUnitTest extends BaseTest { + + private static File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); + private static String intervalFileNameGATK = validationDataLocation+"test.gatk.intervals"; + private static String intervalFileNameBED = validationDataLocation+"test.bed"; + private static List results1 = null; + private static List results2 = null; + + @BeforeClass + public static void init() { + GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile)); + + results1 = new ArrayList(); + results2 = new ArrayList(); + + results1.add(GenomeLocParser.createGenomeLoc("chr1",1554)); + results1.add(GenomeLocParser.createGenomeLoc("chr1",2538,2568)); + results1.add(GenomeLocParser.createGenomeLoc("chr1",18932,19000)); + results1.add(GenomeLocParser.createGenomeLoc("chr1",19001,25000)); + results1.add(GenomeLocParser.createGenomeLoc("chr5",7415,7600)); + + results2.add(GenomeLocParser.createGenomeLoc("chr1",1554)); + results2.add(GenomeLocParser.createGenomeLoc("chr1",2538,2568)); + results2.add(GenomeLocParser.createGenomeLoc("chr1",18932,25000)); + results2.add(GenomeLocParser.createGenomeLoc("chr5",7415,7600)); + + } + + @Test + public void testGATKIntervalFileIterator_Overlap() { + logger.warn("Executing testGATKIntervalFileIterator_Overlap"); + + Iterator it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY); + Iterator check_it = results1.iterator(); + while(it.hasNext()) { + GenomeLoc l = it.next(); + GenomeLoc l_expected = check_it.next(); + //System.out.println("int: "+l+" expected: "+l_expected) ; + Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected); + } + } + + @Test + public void testGATKIntervalFileIterator_OverlapWithException() { + logger.warn("Executing testGATKIntervalFileIterator_OverlapWithException"); + + Iterator it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY); + Iterator check_it = results1.iterator(); + try { + while(it.hasNext()) { + GenomeLoc l = it.next(); + GenomeLoc l_expected = check_it.next(); +// System.out.println("int: "+l+" expected: "+l_expected) ; + } + } catch ( StingException e) { + Assert.assertEquals( e.getMessage(), "Interval chr5:7414 in the interval file is out of order."); + } + } + + @Test + public void testGATKIntervalFileIterator_All() { + logger.warn("Executing testGATKIntervalFileIterator_All"); + + Iterator it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.ALL); + Iterator check_it = results2.iterator(); + while(it.hasNext()) { + GenomeLoc l = it.next(); + GenomeLoc l_expected = check_it.next(); +// System.out.println("int: "+l+" expected: "+l_expected) ; + Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected); + } + } + + @Test + public void testBEDIntervalFileIterator_Overlap() { + logger.warn("Executing testBEDIntervalFileIterator_Overlap"); + + Iterator it = new IntervalFileMergingIterator(new File(intervalFileNameBED),IntervalMergingRule.OVERLAPPING_ONLY); + Iterator check_it = results1.iterator(); + while(it.hasNext()) { + GenomeLoc l = it.next(); + GenomeLoc l_expected = check_it.next(); +// System.out.println("int: "+l+" expected: "+l_expected) ; + Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected); + } + } + +}