From f0f482c1fe5f66715b6da93e6c064bd6f5bd7ee6 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 6 Jun 2016 11:46:15 -0400 Subject: [PATCH] - added an option to merge GenomeLocs that are abutting (contiguous) rather than actually overlapping. (#1399) - this should make ValidateVariants much faster. - fixed NPE that occurs when there is no -L argument --- .../variantutils/ValidateVariants.java | 22 ++++- .../broadinstitute/gatk/utils/GenomeLoc.java | 11 +++ .../gatk/utils/GenomeLocSortedSet.java | 98 +++++++++++-------- .../utils/GenomeLocSortedSetUnitTest.java | 52 ++++++++-- 4 files changed, 132 insertions(+), 51 deletions(-) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index ce9e84c0f..07cfa5619 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -235,12 +235,20 @@ public class ValidateVariants extends RodWalker { lastVcEnd = vc.getEnd(); } + // if there were no variants do not add anything! + if (lastVcEnd == -1) + return null; + return GenomeLoc.setStop(ref.getLocus(), lastVcEnd); } @Override public GenomeLocSortedSet reduce(GenomeLoc value, GenomeLocSortedSet sum) { - sum.add(value, true); + // unless we are doing gvcf validation, there's no need + // to add all the genomelocs. Also, since we do not expect them to be contiguous, it will + // take a lot of memory. + if (VALIDATE_GVCF) + sum.add(value, GenomeLocSortedSet.MergeStrategy.MERGE_CONTIGUOUS); return sum; } @@ -252,8 +260,16 @@ public class ValidateVariants extends RodWalker { @Override public void onTraversalDone(GenomeLocSortedSet result) { if (VALIDATE_GVCF) { - final GenomeLocSortedSet uncoveredIntervals = getToolkit().getIntervals().subtractRegions(result); - if (uncoveredIntervals.coveredSize() > 0) { + final GenomeLocSortedSet toolkitIntervals = getToolkit().getIntervals(); + final GenomeLocSortedSet traversalIntervals; + if (toolkitIntervals == null) { + traversalIntervals = GenomeLocSortedSet.createSetFromSequenceDictionary(getToolkit().getMasterSequenceDictionary()); + } else { + traversalIntervals = toolkitIntervals; + } + + final GenomeLocSortedSet uncoveredIntervals = traversalIntervals.subtractRegions(result); + if (traversalIntervals.subtractRegions(result).coveredSize() > 0) { final UserException e = new UserException.FailsStrictValidation(file, "A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() + " loci with no VariantContext covering it. The first uncovered segment is:" + uncoveredIntervals.iterator().next()); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java index c64d5798f..31bae813e 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java @@ -375,6 +375,17 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == 1 || ( comparison == 0 && this.getStart() > that.getStop() )); } + /** + * Tests whether this contig is completely after contig 'that' and separate from it by at-least one base. + * @param that Contig to test against. + * @return true if this contig starts after 'that' ends + 1 ; false if this is completely before or + * is continguous with 'that', in the sensce of {@link GenomeLoc#contiguousP(GenomeLoc)} + */ + @Requires("that != null") + public boolean isBeyond( final GenomeLoc that ) { + return isPast(that) && minDistance(that) > 1; + } + /** * Return the minimum distance between any pair of bases in this and that GenomeLocs: */ diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java index dcb9476f8..e149627b1 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java @@ -26,13 +26,13 @@ package org.broadinstitute.gatk.utils; import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; import org.broadinstitute.gatk.utils.interval.IntervalUtils; import java.util.*; +import java.util.stream.Collectors; /** *

@@ -54,7 +54,7 @@ public class GenomeLocSortedSet extends AbstractSet { private GenomeLocParser genomeLocParser; // our private storage for the GenomeLoc's - private final List mArray = new ArrayList(); + private final List mArray = new ArrayList<>(); // cache this to make overlap checking much more efficient private int previousOverlapSearchIndex = -1; @@ -92,7 +92,7 @@ public class GenomeLocSortedSet extends AbstractSet { public GenomeLocSortedSet(final GenomeLocParser parser, final Collection l) { this(parser); - final ArrayList sorted = new ArrayList(l); + final ArrayList sorted = new ArrayList<>(l); Collections.sort(sorted); mArray.addAll(IntervalUtils.mergeIntervalLocations(sorted, IntervalMergingRule.OVERLAPPING_ONLY)); } @@ -228,7 +228,7 @@ public class GenomeLocSortedSet extends AbstractSet { final int start = Math.max(-(index + 1) - 1, 0); final int size = mArray.size(); - final List overlapping = new LinkedList(); + final List overlapping = new LinkedList<>(); for ( int i = start; i < size; i++ ) { final GenomeLoc myLoc = mArray.get(i); if ( loc.overlapsP(myLoc) ) @@ -254,13 +254,10 @@ public class GenomeLocSortedSet extends AbstractSet { * @return a non-null list of locations that overlap loc */ protected List getOverlappingFullSearch(final GenomeLoc loc) { - final List overlapping = new LinkedList(); + final List overlapping = new LinkedList<>(); // super slow, but definitely works - for ( final GenomeLoc myLoc : mArray ) { - if ( loc.overlapsP(myLoc) ) - overlapping.add(myLoc); - } + overlapping.addAll(mArray.stream().filter(loc::overlapsP).collect(Collectors.toList())); return overlapping; } @@ -273,8 +270,9 @@ public class GenomeLocSortedSet extends AbstractSet { * * @return true if the loc was added or false otherwise (if the loc was null) */ + @Override public boolean add(final GenomeLoc loc) { - return add(loc, false); + return add(loc, MergeStrategy.DO_NOT_MERGE); } /** @@ -286,40 +284,53 @@ public class GenomeLocSortedSet extends AbstractSet { * @return true if the loc was added or false otherwise (if the loc was null) */ public boolean addRegion(final GenomeLoc loc) { - return add(loc, true); + return add(loc, MergeStrategy.MERGE_OVERLAPS); + } + + /** + * An enum describing the different ways in which one may wish to merge GenomeLocs + */ + public enum MergeStrategy { + DO_NOT_MERGE, // nothing is merged + MERGE_OVERLAPS, // overlapping (sharing a locus) GenomeLocs will be merged + MERGE_CONTIGUOUS // overlapping _or_ just abutting GenomeLocs will be merged } /** * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. * - * @param loc the GenomeLoc to add - * @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception + * @param loc The GenomeLoc to add + * @param mergeStrategy A merge strategy informing whether to avoid merging at all, to merge overlapping + * intervals, or to merge contiguous (i.e. overlapping or abutting) intervals. + * If mergeStrategy == DO_NOT_MERGE and loc overlaps another in the set, an exception is thrown * * @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate) + * + * @throws IllegalArgumentException If loc overlaps any GenomeLoc in the set but mergeStrategy == DO_NOT_MERGE */ - public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) { + public boolean add(final GenomeLoc loc, final MergeStrategy mergeStrategy) throws IllegalArgumentException { if ( loc == null ) return false; // if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the // case because locs are generally added in order) then be extra efficient and just add the loc to the end - if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) { + if ( mArray.size() == 0 || loc.isBeyond(mArray.get(mArray.size() - 1))) { return mArray.add(loc); } // find where in the list the new loc belongs - final int binarySearchIndex = Collections.binarySearch(mArray,loc); + final int binarySearchIndex = Collections.binarySearch(mArray, loc); // if it already exists in the list, return or throw an exception as needed if ( binarySearchIndex >= 0 ) { - if ( mergeIfIntervalOverlaps ) + if ( mergeStrategy != MergeStrategy.DO_NOT_MERGE ) return false; throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc); } // if it overlaps a loc already in the list merge or throw an exception as needed final int insertionIndex = -1 * (binarySearchIndex + 1); - if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) { + if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, mergeStrategy) ) { // it does not overlap any current intervals, so add it to the set mArray.add(insertionIndex, loc); } @@ -327,39 +338,53 @@ public class GenomeLocSortedSet extends AbstractSet { return true; } - /* + + + /** * If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested) * - * @param loc the GenomeLoc to add - * @param insertionIndex the index in the sorted set to add the new loc - * @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them + * @param loc the GenomeLoc to add + * @param insertionIndex the index in the sorted set to add the new loc + * @param mergeStrategy a MergeStrategy informing how to deal with overlapping and abutting intervals * * @return true if the loc was added or false otherwise */ - private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) { + private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final MergeStrategy mergeStrategy) { // try merging with the previous index if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) { - if ( throwExceptionIfOverlapping ) + if ( mergeStrategy == MergeStrategy.DO_NOT_MERGE ) throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString())); mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc)); return true; } + // if asking to merge contiguous, and is contiguous, merge and add + if ( mergeStrategy == MergeStrategy.MERGE_CONTIGUOUS & insertionIndex != 0 && loc.contiguousP(mArray.get(insertionIndex - 1)) ) { + mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc)); + return true; + } + // try merging with the following index if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) { - if ( throwExceptionIfOverlapping ) + if ( mergeStrategy == MergeStrategy.DO_NOT_MERGE ) throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString())); mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc)); return true; } + // if asking to merge contiguous, and is contiguous, merge and add + if ( mergeStrategy == MergeStrategy.MERGE_CONTIGUOUS && insertionIndex < mArray.size() && loc.contiguousP(mArray.get(insertionIndex)) ) { + mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc)); + return true; + } + return false; } public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) { - LinkedList good = new LinkedList(); - Stack toProcess = new Stack(); - Stack toExclude = new Stack(); + LinkedList good = new LinkedList<>(); + Stack toProcess = new Stack<>(); + Stack toExclude = new Stack<>(); // initialize the stacks toProcess.addAll(mArray); @@ -379,8 +404,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( p.overlapsP(e) ) { toProcess.pop(); - for ( GenomeLoc newP : p.subtract(e) ) - toProcess.push(newP); + p.subtract(e).forEach(toProcess::push); } else if ( p.compareContigs(e) < 0 ) { good.add(toProcess.pop()); // p is now good } else if ( p.compareContigs(e) > 0 ) { @@ -400,7 +424,6 @@ public class GenomeLocSortedSet extends AbstractSet { return createSetFromList(genomeLocParser,good); } - /** * a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping) * @param location the GenomeLoc to remove @@ -420,9 +443,7 @@ public class GenomeLocSortedSet extends AbstractSet { public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) { final GenomeLocParser parser = new GenomeLocParser(dict); final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); - for ( final SAMSequenceRecord sequence : dict.getSequences() ) { - returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName())); - } + returnSortedSet.addAll(dict.getSequences().stream().map(sequence -> parser.createOverEntireContig(sequence.getSequenceName())).collect(Collectors.toList())); return returnSortedSet; } @@ -433,13 +454,12 @@ public class GenomeLocSortedSet extends AbstractSet { * * @return the sorted genome loc list */ - public static GenomeLocSortedSet createSetFromList(GenomeLocParser parser,List locs) { + public static GenomeLocSortedSet createSetFromList(final GenomeLocParser parser, final List locs) { GenomeLocSortedSet set = new GenomeLocSortedSet(parser); set.addAll(locs); return set; } - /** * return a deep copy of this collection. * @@ -447,10 +467,8 @@ public class GenomeLocSortedSet extends AbstractSet { */ public GenomeLocSortedSet clone() { GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser); - for (GenomeLoc loc : this.mArray) { - // ensure a deep copy - ret.mArray.add(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop())); - } + // ensure a deep copy + ret.mArray.addAll(this.mArray.stream().map(loc -> genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop())).collect(Collectors.toList())); return ret; } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java index 47c1f4d0f..dc0536daa 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java @@ -26,15 +26,8 @@ package org.broadinstitute.gatk.utils; import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; - -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; - import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; @@ -44,6 +37,8 @@ import org.testng.annotations.Test; import java.io.File; import java.util.*; +import static org.testng.Assert.*; + /** * * User: aaron @@ -132,6 +127,48 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName)); } + @DataProvider(name="secondContigStart") + public Object[][] secondContigStart(){ + return new Object[][]{ + new Object[]{49}, + new Object[]{50}, + new Object[]{51} + }; + } + + @Test(dataProvider = "secondContigStart") + public void addMergeContiguous(final int secondContigStart) { + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, secondContigStart, 80); + mSortedSet.add(f, GenomeLocSortedSet.MergeStrategy.MERGE_CONTIGUOUS); + assertTrue(mSortedSet.size() == 1); + } + + @Test(dataProvider = "secondContigStart") + public void addMergeContiguousReverse(final int secondContigStart) { + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, secondContigStart, 80); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 1, 50); + mSortedSet.add(f, GenomeLocSortedSet.MergeStrategy.MERGE_CONTIGUOUS); + assertTrue(mSortedSet.size() == 1); + } + + @Test(dataProvider = "secondContigStart") + public void addMergeContiguousOutOfOrder(final int secondContigStart) { + final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName(); + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, secondContigStart, 80); + mSortedSet.addRegion(f); + assertTrue(mSortedSet.size() == 2); + assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName)); + assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName)); + } + @Test(expectedExceptions = IllegalArgumentException.class) public void addThrowsException() { assertTrue(mSortedSet.size() == 0); @@ -329,7 +366,6 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { testSizeBeforeLocX(50, (int)mSortedSet.coveredSize()); } - @Test public void fromSequenceDictionary() { mSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary());