Added a collection for storing GenomeLocs, that also has functions for removing by genomic region (that may span multiple GenomeLoc's in the collection), and adding regions, which are then merged with any overlapping regions.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@809 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
34413362fd
commit
831d430025
|
|
@ -0,0 +1,220 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.util.AbstractSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 22, 2009
|
||||
* Time: 10:54:40 AM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date May 22, 2009
|
||||
* <p/>
|
||||
* Class GenomeLocCollection
|
||||
* <p/>
|
||||
* a set of genome locations. This collection is self sorting,
|
||||
* and will merge genome locations that are overlapping. The remove function
|
||||
* will also remove a region from the list, if the region to remove is a
|
||||
* partial interval of a region in the collection it will remove the region from
|
||||
* that element.
|
||||
*/
|
||||
public class GenomeLocSet extends AbstractSet<GenomeLoc> {
|
||||
// our private storage for the GenomeLoc's
|
||||
private final ArrayList<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
|
||||
|
||||
public GenomeLocSet() {}
|
||||
|
||||
/**
|
||||
* get an iterator over this collection
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Iterator<GenomeLoc> iterator() {
|
||||
return mArray.iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* return the size of the collection
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int size() {
|
||||
return mArray.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* determine if the collection is empty
|
||||
*
|
||||
* @return true if we have no elements
|
||||
*/
|
||||
public boolean isEmpty() {
|
||||
return mArray.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* add a genomeLoc to the collection, simply inserting in order into the set
|
||||
* @param e the GenomeLoc to add
|
||||
* @return true
|
||||
*/
|
||||
public boolean add(GenomeLoc e) {
|
||||
if (mArray.contains(e)) {
|
||||
throw new IllegalArgumentException("attempting to add a duplicate object to the set");
|
||||
}
|
||||
int index = 0;
|
||||
while (index < mArray.size()) {
|
||||
if (!e.isPast(mArray.get(index))) {
|
||||
mArray.add(index,e);
|
||||
return true;
|
||||
}
|
||||
++index;
|
||||
}
|
||||
this.mArray.add(e);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a GenomeLoc to the collection, merging it if it overlaps another region.
|
||||
* If it's not overlapping then we add it in sorted order.
|
||||
*
|
||||
* @param e the GenomeLoc to add to the collection
|
||||
* @return true, if the GenomeLoc could be added to the collection
|
||||
*/
|
||||
public boolean addRegion(GenomeLoc e) {
|
||||
if (e == null) {
|
||||
return false;
|
||||
}
|
||||
/**
|
||||
* check if the specified element overlaps any current locations, if so
|
||||
* we should merge the two.
|
||||
*/
|
||||
for (GenomeLoc g : mArray) {
|
||||
if (g.contiguousP(e)) {
|
||||
GenomeLoc c = g.merge(e);
|
||||
mArray.set(mArray.indexOf(g),c);
|
||||
return true;
|
||||
} else if ((g.getContigIndex() == e.getContigIndex()) &&
|
||||
(g.getStart() > e.getStart())) {
|
||||
mArray.add(mArray.indexOf(g), e);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
/** we're at the end and we haven't found locations that should fall after it,
|
||||
* so we'll put it at the end
|
||||
*/
|
||||
mArray.add(e);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* remove an element from the set. Given a specific genome location, this function will
|
||||
* remove all regions in the element set that overlap the specified region.
|
||||
* @param e the genomic range to remove
|
||||
* @return true if a removal action was performed, false if the collection was unchanged.
|
||||
*/
|
||||
public boolean removeRegion(GenomeLoc e) {
|
||||
if (e == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// sometimes we can't return right away, this holds the value for those cases
|
||||
boolean returnValue = false;
|
||||
/**
|
||||
* check if the specified element overlaps any current locations, subtract the removed
|
||||
* region and reinsert what is left.
|
||||
*/
|
||||
for (GenomeLoc g : mArray) {
|
||||
if (g.overlapsP(e)) {
|
||||
if (g.compareTo(e) == 0) {
|
||||
mArray.remove(mArray.indexOf(g));
|
||||
return true;
|
||||
} else if (g.containsP(e)) {
|
||||
/**
|
||||
* we have to create two new region, one for the before part, one for the after
|
||||
* The old region:
|
||||
* |----------------- old region (g) -------------|
|
||||
* |----- to delete (e) ------|
|
||||
*
|
||||
* product (two new regions):
|
||||
* |------| + |--------|
|
||||
*
|
||||
*/
|
||||
GenomeLoc before = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1);
|
||||
GenomeLoc after = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||
int index = mArray.indexOf(g);
|
||||
mArray.add(index, after);
|
||||
mArray.add(index, before);
|
||||
mArray.remove(mArray.indexOf(g));
|
||||
return true;
|
||||
} else if (e.containsP(g)) {
|
||||
/**
|
||||
* e completely contains g, delete g, but keep looking, there may be more regions
|
||||
* i.e.:
|
||||
* |--------------------- e --------------------|
|
||||
* |--- g ---| |---- others ----|
|
||||
*/
|
||||
mArray.remove(mArray.indexOf(g));
|
||||
returnValue = true;
|
||||
} else {
|
||||
|
||||
/**
|
||||
* otherwise e overlaps some part of g
|
||||
*/
|
||||
GenomeLoc l;
|
||||
|
||||
/**
|
||||
* figure out which region occurs first on the genome. I.e., is it:
|
||||
* |------------- g ----------|
|
||||
* |------------- e ----------|
|
||||
*
|
||||
* or:
|
||||
* |------------- g ----------|
|
||||
* |------------ e -----------|
|
||||
*
|
||||
*/
|
||||
|
||||
if (e.getStart() < g.getStart()) {
|
||||
l = new GenomeLoc(g.getContigIndex(), e.getStop()+1, g.getStop());
|
||||
} else {
|
||||
l = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1);
|
||||
}
|
||||
// replace g with the new region
|
||||
mArray.set(mArray.indexOf(g), l);
|
||||
returnValue = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a list of genomic locations, given a reference sequence
|
||||
* @param dict the sequence dictionary to create a collection from
|
||||
* @return the GenomeLocSet of all references sequences as GenomeLoc's
|
||||
*/
|
||||
public static GenomeLocSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) {
|
||||
GenomeLocSet returnSet = new GenomeLocSet();
|
||||
for (SAMSequenceRecord record : dict.getSequences()) {
|
||||
returnSet.add(new GenomeLoc(record.getSequenceIndex(),1,record.getSequenceLength()));
|
||||
}
|
||||
return returnSet;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,168 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSamUtils;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 22, 2009
|
||||
* Time: 2:14:07 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date May 22, 2009
|
||||
* <p/>
|
||||
* Class GenomeLocSetTest
|
||||
* <p/>
|
||||
* This tests the functions of the GenomeLocSet
|
||||
*/
|
||||
public class GenomeLocSetTest extends BaseTest {
|
||||
|
||||
private GenomeLocSet mSet = null;
|
||||
private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE);
|
||||
private static final int NUMBER_OF_CHROMOSOMES = 5;
|
||||
private static final int STARTING_CHROMOSOME = 1;
|
||||
private static final int CHROMOSOME_SIZE = 1000;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary());
|
||||
mSet = new GenomeLocSet();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAdd() {
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 0);
|
||||
assertTrue(mSet.size() == 0);
|
||||
mSet.add(g);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemove() {
|
||||
assertTrue(mSet.size() == 0);
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 0);
|
||||
mSet.add(g);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
mSet.remove(g);
|
||||
assertTrue(mSet.size() == 0);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testAddDupplicate() {
|
||||
assertTrue(mSet.size() == 0);
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 0);
|
||||
mSet.add(g);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
mSet.add(g);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergingOverlappingBelow() {
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 50);
|
||||
GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 49, 100);
|
||||
assertTrue(mSet.size() == 0);
|
||||
mSet.add(g);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
mSet.addRegion(e);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
Iterator<GenomeLoc> iter = mSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 0);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergingOverlappingAbove() {
|
||||
GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 0, 50);
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 49, 100);
|
||||
assertTrue(mSet.size() == 0);
|
||||
mSet.add(g);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
mSet.addRegion(e);
|
||||
assertTrue(mSet.size() == STARTING_CHROMOSOME);
|
||||
Iterator<GenomeLoc> iter = mSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 0);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void deleteSubRegion() {
|
||||
GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 0, 50);
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 49, 100);
|
||||
mSet.add(g);
|
||||
mSet.addRegion(e);
|
||||
|
||||
// now delete a region
|
||||
GenomeLoc d = new GenomeLoc(STARTING_CHROMOSOME, 25, 75);
|
||||
mSet.removeRegion(d);
|
||||
Iterator<GenomeLoc> iter = mSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 0);
|
||||
assertTrue(loc.getStop() == 24);
|
||||
assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME);
|
||||
|
||||
loc = iter.next();
|
||||
assertTrue(loc.getStart() == 76);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME);
|
||||
}
|
||||
@Test
|
||||
public void deleteSuperRegion() {
|
||||
GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 10, 20);
|
||||
GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 70, 100);
|
||||
mSet.add(g);
|
||||
mSet.addRegion(e);
|
||||
assertTrue(mSet.size() == 2);
|
||||
// now delete a region
|
||||
GenomeLoc d = new GenomeLoc(STARTING_CHROMOSOME, 15, 75);
|
||||
mSet.removeRegion(d);
|
||||
Iterator<GenomeLoc> iter = mSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 10);
|
||||
assertTrue(loc.getStop() == 14);
|
||||
assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME);
|
||||
|
||||
loc = iter.next();
|
||||
assertTrue(loc.getStart() == 76);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fromSequenceDictionary() {
|
||||
mSet = GenomeLocSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary());
|
||||
// we should have sequence
|
||||
assertTrue(mSet.size() == GenomeLocSetTest.NUMBER_OF_CHROMOSOMES);
|
||||
int seqNumber = 0;
|
||||
for (GenomeLoc loc : mSet) {
|
||||
assertTrue(loc.getStart() == 1);
|
||||
assertTrue(loc.getStop() == GenomeLocSetTest.CHROMOSOME_SIZE);
|
||||
assertTrue(loc.getContigIndex() == seqNumber);
|
||||
++seqNumber;
|
||||
}
|
||||
assertTrue(seqNumber == GenomeLocSetTest.NUMBER_OF_CHROMOSOMES);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue