A simple utility class that implements a merging Iterator<GenomeLoc> built over an interval or bed file (this is NOT a rod, but rather a direct line-by-line file reader that converts strings to genome locs on the fly and merges overlapping intervals)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3546 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2010-06-14 15:54:37 +00:00
parent f137bf8f85
commit 671ac00748
2 changed files with 252 additions and 0 deletions

View File

@ -0,0 +1,114 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.interval;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.gatk.refdata.utils.StringToGenomeLocIteratorAdapter;
import java.util.Iterator;
import java.io.File;
import java.io.FileNotFoundException;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Jun 11, 2010
* Time: 2:56:29 PM
* To change this template use File | Settings | File Templates.
*/
/** This iterator reads intervals from interval file (can be gatk-style
* interval list or a bed file) and merges them on the fly. Very much alike
* IntervalUtils.sortAndMergeIntervals() but the list is read sequentially
* from a file upon request instead of loading the whole list into memory.
* Intervals in the underlying file MUST be
* pre-sorted into the reference order (they can overlap though, as this
* iterator is a merging one).
*/
public class IntervalFileMergingIterator implements Iterator<GenomeLoc> {
private PushbackIterator<GenomeLoc> it ;
private IntervalMergingRule myRule;
public IntervalFileMergingIterator(File f, IntervalMergingRule rule) {
try {
XReadLines reader = new XReadLines(f);
if (f.getName().toUpperCase().endsWith(".BED")) {
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( reader.iterator(),
StringToGenomeLocIteratorAdapter.FORMAT.BED ) ) ;
} else {
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( reader.iterator(),
StringToGenomeLocIteratorAdapter.FORMAT.GATK ) ) ;
}
} catch ( FileNotFoundException e ) {
throw new StingException("Can not open interval file "+f);
}
myRule = rule;
}
public boolean hasNext() {
return it.hasNext();
}
/** Returns next merged interval from the underlying interval file. In other words, keeps reading intervals
* for as long as they overlap and returns a single merged interval encompassing the set of overlapping
* intervals read from the file. Non-overlapping intervals are returned as is. This method will throw an
* exception if it runs into an interval that is out of order.
* @return
*/
public GenomeLoc next() {
GenomeLoc current = it.next();
while ( it.hasNext() ) {
GenomeLoc next = it.next();
if ( next.isBefore(current)) {
throw new StingException("Interval "+next+" in the interval file is out of order.");
}
if (current.overlapsP(next)) {
current = current.merge(next);
} else if (current.contiguousP(next) && myRule == IntervalMergingRule.ALL) {
current = current.merge(next);
} else {
it.pushback(next);
break;
}
}
return current;
}
public void remove() {
throw new UnsupportedOperationException("method 'remove' is not supported by this iterator");
}
}

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.interval;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Jun 14, 2010
* Time: 10:15:52 AM
* To change this template use File | Settings | File Templates.
*/
public class IntervalFileMergingIteratorUnitTest extends BaseTest {
private static File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta");
private static String intervalFileNameGATK = validationDataLocation+"test.gatk.intervals";
private static String intervalFileNameBED = validationDataLocation+"test.bed";
private static List<GenomeLoc> results1 = null;
private static List<GenomeLoc> results2 = null;
@BeforeClass
public static void init() {
GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile));
results1 = new ArrayList<GenomeLoc>();
results2 = new ArrayList<GenomeLoc>();
results1.add(GenomeLocParser.createGenomeLoc("chr1",1554));
results1.add(GenomeLocParser.createGenomeLoc("chr1",2538,2568));
results1.add(GenomeLocParser.createGenomeLoc("chr1",18932,19000));
results1.add(GenomeLocParser.createGenomeLoc("chr1",19001,25000));
results1.add(GenomeLocParser.createGenomeLoc("chr5",7415,7600));
results2.add(GenomeLocParser.createGenomeLoc("chr1",1554));
results2.add(GenomeLocParser.createGenomeLoc("chr1",2538,2568));
results2.add(GenomeLocParser.createGenomeLoc("chr1",18932,25000));
results2.add(GenomeLocParser.createGenomeLoc("chr5",7415,7600));
}
@Test
public void testGATKIntervalFileIterator_Overlap() {
logger.warn("Executing testGATKIntervalFileIterator_Overlap");
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY);
Iterator<GenomeLoc> check_it = results1.iterator();
while(it.hasNext()) {
GenomeLoc l = it.next();
GenomeLoc l_expected = check_it.next();
//System.out.println("int: "+l+" expected: "+l_expected) ;
Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected);
}
}
@Test
public void testGATKIntervalFileIterator_OverlapWithException() {
logger.warn("Executing testGATKIntervalFileIterator_OverlapWithException");
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY);
Iterator<GenomeLoc> check_it = results1.iterator();
try {
while(it.hasNext()) {
GenomeLoc l = it.next();
GenomeLoc l_expected = check_it.next();
// System.out.println("int: "+l+" expected: "+l_expected) ;
}
} catch ( StingException e) {
Assert.assertEquals( e.getMessage(), "Interval chr5:7414 in the interval file is out of order.");
}
}
@Test
public void testGATKIntervalFileIterator_All() {
logger.warn("Executing testGATKIntervalFileIterator_All");
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.ALL);
Iterator<GenomeLoc> check_it = results2.iterator();
while(it.hasNext()) {
GenomeLoc l = it.next();
GenomeLoc l_expected = check_it.next();
// System.out.println("int: "+l+" expected: "+l_expected) ;
Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected);
}
}
@Test
public void testBEDIntervalFileIterator_Overlap() {
logger.warn("Executing testBEDIntervalFileIterator_Overlap");
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameBED),IntervalMergingRule.OVERLAPPING_ONLY);
Iterator<GenomeLoc> check_it = results1.iterator();
while(it.hasNext()) {
GenomeLoc l = it.next();
GenomeLoc l_expected = check_it.next();
// System.out.println("int: "+l+" expected: "+l_expected) ;
Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected);
}
}
}