From ebd540412474e91ac4b153045bf08a2949e22fa2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 28 Feb 2013 13:46:49 -0500 Subject: [PATCH] Fixed the add functionality of GenomeLocSortedSet. * Fixed GenomeLocSortedSet.add() to ensure that overlapping intervals are detected and an exception is thrown. * Fixed GenomeLocSortedSet.addRegion() by merging it with the add() method; it now produces sorted inputs in all cases. * Cleaned up duplicated code throughout the engine to create a list of intervals over all contigs. * Added more unit tests for add functionality of GLSS. * Resolves GSA-775. --- .../sting/gatk/GenomeAnalysisEngine.java | 4 +- .../gatk/datasources/reads/BAMScheduler.java | 18 +-- .../datasources/reads/IntervalSharder.java | 5 +- .../gatk/datasources/reads/SAMDataSource.java | 8 +- .../reads/utilities/FindLargeShards.java | 13 +- .../sting/utils/GenomeLocSortedSet.java | 142 ++++++++++-------- .../reads/SAMDataSourceUnitTest.java | 2 +- .../utils/GenomeLocSortedSetUnitTest.java | 36 ++++- 8 files changed, 130 insertions(+), 98 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ba25ac957..85c94cc92 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -558,7 +558,7 @@ public class GenomeAnalysisEngine { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); } @@ -566,7 +566,7 @@ public class GenomeAnalysisEngine { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8d7cfbaa7..adb668ff9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; -import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -53,14 +52,15 @@ public class BAMScheduler implements Iterator { private PeekableIterator locusIterator; private GenomeLoc currentLocus; - public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { - BAMScheduler scheduler = new BAMScheduler(dataSource); - GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); - for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { - // Match only on sequence name; trust startup validation to make sure all the sequences match. - if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) - intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); - } + /* + * Creates BAMScheduler using contigs from the given BAM data source. + * + * @param dataSource BAM source + * @return non-null BAM scheduler + */ + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { + final BAMScheduler scheduler = new BAMScheduler(dataSource); + final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); scheduler.populateFilteredIntervalList(intervals); return scheduler; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index f7ca7593f..048ce17f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -51,8 +50,8 @@ public class IntervalSharder implements Iterator { return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); } - public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { - return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser); } public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d52e55d6d..1223dd2af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -1060,10 +1060,12 @@ public class SAMDataSource { /** * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any * read that has been assigned - * @return + * + * @param shardBalancer shard balancer object + * @return non-null initialized version of the shard balancer */ - public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); return shardBalancer; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 14bec213e..66463e576 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -26,12 +26,10 @@ package org.broadinstitute.sting.gatk.datasources.reads.utilities; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; @@ -98,14 +96,11 @@ public class FindLargeShards extends CommandLineProgram { SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); // intervals - GenomeLocSortedSet intervalSortedSet = null; - if(intervals != null) + final GenomeLocSortedSet intervalSortedSet; + if ( intervals != null ) intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); - else { - intervalSortedSet = new GenomeLocSortedSet(genomeLocParser); - for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences()) - intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength())); - } + else + intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 5adef5cdf..28cdaaf56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -266,80 +266,96 @@ public class GenomeLocSortedSet extends AbstractSet { } /** - * add a genomeLoc to the collection, simply inserting in order into the set + * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. + * Throws an exception if the loc overlaps another loc already in the set. * - * TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or - * TODO -- other locations already in the set. This code should check to see if - * TODO -- e is overlapping with its nearby elements and merge them or alternatively - * TODO -- throw an exception + * @param loc the GenomeLoc to add * - * @param e the GenomeLoc to add - * - * @return true + * @return true if the loc was added or false otherwise (if the loc was null) */ - public boolean add(GenomeLoc e) { - // assuming that the intervals coming arrive in order saves us a fair amount of time (and it's most likely true) - if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) { - mArray.add(e); - return true; - } else { - final int loc = Collections.binarySearch(mArray,e); - if (loc >= 0) { - throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); - } else { - mArray.add((loc+1) * -1,e); - return true; - } - } + public boolean add(final GenomeLoc loc) { + return add(loc, false); } /** * Adds a GenomeLoc to the collection, merging it if it overlaps another region. - * If it's not overlapping then we add it in sorted order. + * If it's not overlapping then we insert it at the correct sorted position into the set. * - * TODO TODO TODO -- this function is buggy and will not properly create a sorted - * TODO TODO TODO -- genome loc is addRegion is called sequentially where the second - * TODO TODO TODO -- loc added is actually before the first. So when creating - * TODO TODO TODO -- sets make sure to sort the input locations first! + * @param loc the GenomeLoc to add * - * @param e the GenomeLoc to add to the collection - * - * @return true, if the GenomeLoc could be added to the collection + * @return true if the loc was added or false otherwise (if the loc was null) */ - public boolean addRegion(GenomeLoc e) { - if (e == null) { - return false; - } - // have we added it to the collection? - boolean haveAdded = false; + public boolean addRegion(final GenomeLoc loc) { + return add(loc, true); + } - /** - * check if the specified element overlaps any current locations, if so - * we should merge the two. - */ - for (GenomeLoc g : mArray) { - if (g.contiguousP(e)) { - GenomeLoc c = g.merge(e); - mArray.set(mArray.indexOf(g), c); - haveAdded = true; - } else if ((g.getContigIndex() == e.getContigIndex()) && - (e.getStart() < g.getStart()) && !haveAdded) { - mArray.add(mArray.indexOf(g), e); - return true; - } else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) || - (g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) { - return true; - } + /** + * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. + * + * @param loc the GenomeLoc to add + * @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception + * + * @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate) + */ + public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) { + if ( loc == null ) + return false; + + // if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the + // case because locs are generally added in order) then be extra efficient and just add the loc to the end + if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) { + return mArray.add(loc); } - /** we're at the end and we haven't found locations that should fall after it, - * so we'll put it at the end - */ - if (!haveAdded) { - mArray.add(e); + + // find where in the list the new loc belongs + final int binarySearchIndex = Collections.binarySearch(mArray,loc); + + // if it already exists in the list, return or throw an exception as needed + if ( binarySearchIndex >= 0 ) { + if ( mergeIfIntervalOverlaps ) + return false; + throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc); } + + // if it overlaps a loc already in the list merge or throw an exception as needed + final int insertionIndex = -1 * (binarySearchIndex + 1); + if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) { + // it does not overlap any current intervals, so add it to the set + mArray.add(insertionIndex, loc); + } + return true; } + /* + * If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested) + * + * @param loc the GenomeLoc to add + * @param insertionIndex the index in the sorted set to add the new loc + * @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them + * + * @return true if the loc was added or false otherwise + */ + private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) { + // try merging with the previous index + if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) { + if ( throwExceptionIfOverlapping ) + throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString())); + mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc)); + return true; + } + + // try merging with the following index + if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) { + if ( throwExceptionIfOverlapping ) + throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString())); + mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc)); + return true; + } + + return false; + } + public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) { LinkedList good = new LinkedList(); Stack toProcess = new Stack(); @@ -401,11 +417,11 @@ public class GenomeLocSortedSet extends AbstractSet { * * @return the GenomeLocSet of all references sequences as GenomeLoc's */ - public static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) { - GenomeLocParser parser = new GenomeLocParser(dict); - GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); - for (SAMSequenceRecord record : dict.getSequences()) { - returnSortedSet.add(parser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength())); + public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) { + final GenomeLocParser parser = new GenomeLocParser(dict); + final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); + for ( final SAMSequenceRecord sequence : dict.getSequences() ) { + returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName())); } return returnSortedSet; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 23720e60d..8d33aa8b6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -111,7 +111,7 @@ public class SAMDataSourceUnitTest extends BaseTest { new ArrayList(), false); - Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); int count = 0; try { diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index df41dc642..443cf2771 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -117,11 +117,31 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); mSortedSet.addRegion(f); assertTrue(mSortedSet.size() == 1); - } + @Test + public void addRegionsOutOfOrder() { + final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName(); + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); + mSortedSet.addRegion(f); + assertTrue(mSortedSet.size() == 2); + assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName)); + assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName)); + } - @Test(expectedExceptions=ReviewedStingException.class) + @Test(expectedExceptions = IllegalArgumentException.class) + public void addThrowsException() { + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); + mSortedSet.add(f); + } + + @Test(expectedExceptions=IllegalArgumentException.class) public void testAddDuplicate() { assertTrue(mSortedSet.size() == 0); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0); @@ -141,9 +161,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.size() == 1); Iterator iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); - assertTrue(loc.getStart() == 0); - assertTrue(loc.getStop() == 100); - assertTrue(loc.getContigIndex() == 1); + assertEquals(loc.getStart(), 0); + assertEquals(loc.getStop(), 100); + assertEquals(loc.getContigIndex(), 1); } @Test @@ -192,9 +212,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.size() == 1); Iterator iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); - assertTrue(loc.getStart() == 0); - assertTrue(loc.getStop() == 100); - assertTrue(loc.getContigIndex() == 1); + assertEquals(loc.getStart(), 0); + assertEquals(loc.getStop(), 100); + assertEquals(loc.getContigIndex(), 1); } @Test