diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSet.java new file mode 100755 index 000000000..a5706ac30 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSet.java @@ -0,0 +1,220 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; + +import java.util.AbstractSet; +import java.util.ArrayList; +import java.util.Iterator; + +/** + * + * User: aaron + * Date: May 22, 2009 + * Time: 10:54:40 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 22, 2009 + *

+ * Class GenomeLocCollection + *

+ * a set of genome locations. This collection is self sorting, + * and will merge genome locations that are overlapping. The remove function + * will also remove a region from the list, if the region to remove is a + * partial interval of a region in the collection it will remove the region from + * that element. + */ +public class GenomeLocSet extends AbstractSet { + // our private storage for the GenomeLoc's + private final ArrayList mArray = new ArrayList(); + + public GenomeLocSet() {} + + /** + * get an iterator over this collection + * + * @return + */ + public Iterator iterator() { + return mArray.iterator(); + } + + /** + * return the size of the collection + * + * @return + */ + public int size() { + return mArray.size(); + } + + /** + * determine if the collection is empty + * + * @return true if we have no elements + */ + public boolean isEmpty() { + return mArray.isEmpty(); + } + + /** + * add a genomeLoc to the collection, simply inserting in order into the set + * @param e the GenomeLoc to add + * @return true + */ + public boolean add(GenomeLoc e) { + if (mArray.contains(e)) { + throw new IllegalArgumentException("attempting to add a duplicate object to the set"); + } + int index = 0; + while (index < mArray.size()) { + if (!e.isPast(mArray.get(index))) { + mArray.add(index,e); + return true; + } + ++index; + } + this.mArray.add(e); + return true; + } + + /** + * Adds a GenomeLoc to the collection, merging it if it overlaps another region. + * If it's not overlapping then we add it in sorted order. + * + * @param e the GenomeLoc to add to the collection + * @return true, if the GenomeLoc could be added to the collection + */ + public boolean addRegion(GenomeLoc e) { + if (e == null) { + return false; + } + /** + * check if the specified element overlaps any current locations, if so + * we should merge the two. + */ + for (GenomeLoc g : mArray) { + if (g.contiguousP(e)) { + GenomeLoc c = g.merge(e); + mArray.set(mArray.indexOf(g),c); + return true; + } else if ((g.getContigIndex() == e.getContigIndex()) && + (g.getStart() > e.getStart())) { + mArray.add(mArray.indexOf(g), e); + return true; + } + } + /** we're at the end and we haven't found locations that should fall after it, + * so we'll put it at the end + */ + mArray.add(e); + return true; + } + + /** + * remove an element from the set. Given a specific genome location, this function will + * remove all regions in the element set that overlap the specified region. + * @param e the genomic range to remove + * @return true if a removal action was performed, false if the collection was unchanged. + */ + public boolean removeRegion(GenomeLoc e) { + if (e == null) { + return false; + } + + // sometimes we can't return right away, this holds the value for those cases + boolean returnValue = false; + /** + * check if the specified element overlaps any current locations, subtract the removed + * region and reinsert what is left. + */ + for (GenomeLoc g : mArray) { + if (g.overlapsP(e)) { + if (g.compareTo(e) == 0) { + mArray.remove(mArray.indexOf(g)); + return true; + } else if (g.containsP(e)) { + /** + * we have to create two new region, one for the before part, one for the after + * The old region: + * |----------------- old region (g) -------------| + * |----- to delete (e) ------| + * + * product (two new regions): + * |------| + |--------| + * + */ + GenomeLoc before = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1); + GenomeLoc after = new GenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); + int index = mArray.indexOf(g); + mArray.add(index, after); + mArray.add(index, before); + mArray.remove(mArray.indexOf(g)); + return true; + } else if (e.containsP(g)) { + /** + * e completely contains g, delete g, but keep looking, there may be more regions + * i.e.: + * |--------------------- e --------------------| + * |--- g ---| |---- others ----| + */ + mArray.remove(mArray.indexOf(g)); + returnValue = true; + } else { + + /** + * otherwise e overlaps some part of g + */ + GenomeLoc l; + + /** + * figure out which region occurs first on the genome. I.e., is it: + * |------------- g ----------| + * |------------- e ----------| + * + * or: + * |------------- g ----------| + * |------------ e -----------| + * + */ + + if (e.getStart() < g.getStart()) { + l = new GenomeLoc(g.getContigIndex(), e.getStop()+1, g.getStop()); + } else { + l = new GenomeLoc(g.getContigIndex(), g.getStart(), e.getStart()-1); + } + // replace g with the new region + mArray.set(mArray.indexOf(g), l); + returnValue = true; + } + } + } + return returnValue; + } + + /** + * create a list of genomic locations, given a reference sequence + * @param dict the sequence dictionary to create a collection from + * @return the GenomeLocSet of all references sequences as GenomeLoc's + */ + public static GenomeLocSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) { + GenomeLocSet returnSet = new GenomeLocSet(); + for (SAMSequenceRecord record : dict.getSequences()) { + returnSet.add(new GenomeLoc(record.getSequenceIndex(),1,record.getSequenceLength())); + } + return returnSet; + } +} diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocSetTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocSetTest.java new file mode 100755 index 000000000..44c86b5db --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocSetTest.java @@ -0,0 +1,168 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSamUtils; +import static org.junit.Assert.assertTrue; +import org.junit.Before; +import org.junit.Test; + +import java.util.Iterator; + +/** + * + * User: aaron + * Date: May 22, 2009 + * Time: 2:14:07 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 22, 2009 + *

+ * Class GenomeLocSetTest + *

+ * This tests the functions of the GenomeLocSet + */ +public class GenomeLocSetTest extends BaseTest { + + private GenomeLocSet mSet = null; + private SAMFileHeader header = ArtificialSamUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); + private static final int NUMBER_OF_CHROMOSOMES = 5; + private static final int STARTING_CHROMOSOME = 1; + private static final int CHROMOSOME_SIZE = 1000; + + @Before + public void setup() { + GenomeLoc.setupRefContigOrdering(header.getSequenceDictionary()); + mSet = new GenomeLocSet(); + } + + @Test + public void testAdd() { + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 0); + assertTrue(mSet.size() == 0); + mSet.add(g); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + } + + @Test + public void testRemove() { + assertTrue(mSet.size() == 0); + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 0); + mSet.add(g); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + mSet.remove(g); + assertTrue(mSet.size() == 0); + } + + @Test(expected = IllegalArgumentException.class) + public void testAddDupplicate() { + assertTrue(mSet.size() == 0); + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 0); + mSet.add(g); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + mSet.add(g); + } + + @Test + public void mergingOverlappingBelow() { + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 0, 50); + GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 49, 100); + assertTrue(mSet.size() == 0); + mSet.add(g); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + mSet.addRegion(e); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + Iterator iter = mSet.iterator(); + GenomeLoc loc = iter.next(); + assertTrue(loc.getStart() == 0); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME); + } + + @Test + public void mergingOverlappingAbove() { + GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 0, 50); + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 49, 100); + assertTrue(mSet.size() == 0); + mSet.add(g); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + mSet.addRegion(e); + assertTrue(mSet.size() == STARTING_CHROMOSOME); + Iterator iter = mSet.iterator(); + GenomeLoc loc = iter.next(); + assertTrue(loc.getStart() == 0); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME); + } + + @Test + public void deleteSubRegion() { + GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 0, 50); + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 49, 100); + mSet.add(g); + mSet.addRegion(e); + + // now delete a region + GenomeLoc d = new GenomeLoc(STARTING_CHROMOSOME, 25, 75); + mSet.removeRegion(d); + Iterator iter = mSet.iterator(); + GenomeLoc loc = iter.next(); + assertTrue(loc.getStart() == 0); + assertTrue(loc.getStop() == 24); + assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME); + + loc = iter.next(); + assertTrue(loc.getStart() == 76); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME); + } + @Test + public void deleteSuperRegion() { + GenomeLoc e = new GenomeLoc(STARTING_CHROMOSOME, 10, 20); + GenomeLoc g = new GenomeLoc(STARTING_CHROMOSOME, 70, 100); + mSet.add(g); + mSet.addRegion(e); + assertTrue(mSet.size() == 2); + // now delete a region + GenomeLoc d = new GenomeLoc(STARTING_CHROMOSOME, 15, 75); + mSet.removeRegion(d); + Iterator iter = mSet.iterator(); + GenomeLoc loc = iter.next(); + assertTrue(loc.getStart() == 10); + assertTrue(loc.getStop() == 14); + assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME); + + loc = iter.next(); + assertTrue(loc.getStart() == 76); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getContigIndex() == STARTING_CHROMOSOME); + } + + @Test + public void fromSequenceDictionary() { + mSet = GenomeLocSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary()); + // we should have sequence + assertTrue(mSet.size() == GenomeLocSetTest.NUMBER_OF_CHROMOSOMES); + int seqNumber = 0; + for (GenomeLoc loc : mSet) { + assertTrue(loc.getStart() == 1); + assertTrue(loc.getStop() == GenomeLocSetTest.CHROMOSOME_SIZE); + assertTrue(loc.getContigIndex() == seqNumber); + ++seqNumber; + } + assertTrue(seqNumber == GenomeLocSetTest.NUMBER_OF_CHROMOSOMES); + } +}