A simple utility class that implements a merging Iterator<GenomeLoc> built over an interval or bed file (this is NOT a rod, but rather a direct line-by-line file reader that converts strings to genome locs on the fly and merges overlapping intervals)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3546 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f137bf8f85
commit
671ac00748
|
|
@ -0,0 +1,114 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.interval;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.StringToGenomeLocIteratorAdapter;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Jun 11, 2010
|
||||
* Time: 2:56:29 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
/** This iterator reads intervals from interval file (can be gatk-style
|
||||
* interval list or a bed file) and merges them on the fly. Very much alike
|
||||
* IntervalUtils.sortAndMergeIntervals() but the list is read sequentially
|
||||
* from a file upon request instead of loading the whole list into memory.
|
||||
* Intervals in the underlying file MUST be
|
||||
* pre-sorted into the reference order (they can overlap though, as this
|
||||
* iterator is a merging one).
|
||||
*/
|
||||
public class IntervalFileMergingIterator implements Iterator<GenomeLoc> {
|
||||
|
||||
private PushbackIterator<GenomeLoc> it ;
|
||||
private IntervalMergingRule myRule;
|
||||
|
||||
public IntervalFileMergingIterator(File f, IntervalMergingRule rule) {
|
||||
try {
|
||||
XReadLines reader = new XReadLines(f);
|
||||
|
||||
if (f.getName().toUpperCase().endsWith(".BED")) {
|
||||
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( reader.iterator(),
|
||||
StringToGenomeLocIteratorAdapter.FORMAT.BED ) ) ;
|
||||
} else {
|
||||
it = new PushbackIterator<GenomeLoc>( new StringToGenomeLocIteratorAdapter( reader.iterator(),
|
||||
StringToGenomeLocIteratorAdapter.FORMAT.GATK ) ) ;
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Can not open interval file "+f);
|
||||
}
|
||||
myRule = rule;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return it.hasNext();
|
||||
}
|
||||
|
||||
/** Returns next merged interval from the underlying interval file. In other words, keeps reading intervals
|
||||
* for as long as they overlap and returns a single merged interval encompassing the set of overlapping
|
||||
* intervals read from the file. Non-overlapping intervals are returned as is. This method will throw an
|
||||
* exception if it runs into an interval that is out of order.
|
||||
* @return
|
||||
*/
|
||||
public GenomeLoc next() {
|
||||
|
||||
GenomeLoc current = it.next();
|
||||
|
||||
while ( it.hasNext() ) {
|
||||
GenomeLoc next = it.next();
|
||||
|
||||
if ( next.isBefore(current)) {
|
||||
throw new StingException("Interval "+next+" in the interval file is out of order.");
|
||||
}
|
||||
|
||||
if (current.overlapsP(next)) {
|
||||
current = current.merge(next);
|
||||
} else if (current.contiguousP(next) && myRule == IntervalMergingRule.ALL) {
|
||||
current = current.merge(next);
|
||||
} else {
|
||||
it.pushback(next);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return current;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("method 'remove' is not supported by this iterator");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.interval;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Jun 14, 2010
|
||||
* Time: 10:15:52 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class IntervalFileMergingIteratorUnitTest extends BaseTest {
|
||||
|
||||
private static File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta");
|
||||
private static String intervalFileNameGATK = validationDataLocation+"test.gatk.intervals";
|
||||
private static String intervalFileNameBED = validationDataLocation+"test.bed";
|
||||
private static List<GenomeLoc> results1 = null;
|
||||
private static List<GenomeLoc> results2 = null;
|
||||
|
||||
@BeforeClass
|
||||
public static void init() {
|
||||
GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile));
|
||||
|
||||
results1 = new ArrayList<GenomeLoc>();
|
||||
results2 = new ArrayList<GenomeLoc>();
|
||||
|
||||
results1.add(GenomeLocParser.createGenomeLoc("chr1",1554));
|
||||
results1.add(GenomeLocParser.createGenomeLoc("chr1",2538,2568));
|
||||
results1.add(GenomeLocParser.createGenomeLoc("chr1",18932,19000));
|
||||
results1.add(GenomeLocParser.createGenomeLoc("chr1",19001,25000));
|
||||
results1.add(GenomeLocParser.createGenomeLoc("chr5",7415,7600));
|
||||
|
||||
results2.add(GenomeLocParser.createGenomeLoc("chr1",1554));
|
||||
results2.add(GenomeLocParser.createGenomeLoc("chr1",2538,2568));
|
||||
results2.add(GenomeLocParser.createGenomeLoc("chr1",18932,25000));
|
||||
results2.add(GenomeLocParser.createGenomeLoc("chr5",7415,7600));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGATKIntervalFileIterator_Overlap() {
|
||||
logger.warn("Executing testGATKIntervalFileIterator_Overlap");
|
||||
|
||||
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
Iterator<GenomeLoc> check_it = results1.iterator();
|
||||
while(it.hasNext()) {
|
||||
GenomeLoc l = it.next();
|
||||
GenomeLoc l_expected = check_it.next();
|
||||
//System.out.println("int: "+l+" expected: "+l_expected) ;
|
||||
Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGATKIntervalFileIterator_OverlapWithException() {
|
||||
logger.warn("Executing testGATKIntervalFileIterator_OverlapWithException");
|
||||
|
||||
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
Iterator<GenomeLoc> check_it = results1.iterator();
|
||||
try {
|
||||
while(it.hasNext()) {
|
||||
GenomeLoc l = it.next();
|
||||
GenomeLoc l_expected = check_it.next();
|
||||
// System.out.println("int: "+l+" expected: "+l_expected) ;
|
||||
}
|
||||
} catch ( StingException e) {
|
||||
Assert.assertEquals( e.getMessage(), "Interval chr5:7414 in the interval file is out of order.");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGATKIntervalFileIterator_All() {
|
||||
logger.warn("Executing testGATKIntervalFileIterator_All");
|
||||
|
||||
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameGATK),IntervalMergingRule.ALL);
|
||||
Iterator<GenomeLoc> check_it = results2.iterator();
|
||||
while(it.hasNext()) {
|
||||
GenomeLoc l = it.next();
|
||||
GenomeLoc l_expected = check_it.next();
|
||||
// System.out.println("int: "+l+" expected: "+l_expected) ;
|
||||
Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBEDIntervalFileIterator_Overlap() {
|
||||
logger.warn("Executing testBEDIntervalFileIterator_Overlap");
|
||||
|
||||
Iterator<GenomeLoc> it = new IntervalFileMergingIterator(new File(intervalFileNameBED),IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
Iterator<GenomeLoc> check_it = results1.iterator();
|
||||
while(it.hasNext()) {
|
||||
GenomeLoc l = it.next();
|
||||
GenomeLoc l_expected = check_it.next();
|
||||
// System.out.println("int: "+l+" expected: "+l_expected) ;
|
||||
Assert.assertEquals("Unexpected location returned by the iterator: "+l,l,l_expected);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue