- added an option to merge GenomeLocs that are abutting (contiguous) rather than actually overlapping. (#1399)
- this should make ValidateVariants much faster. - fixed NPE that occurs when there is no -L argument
This commit is contained in:
parent
85dce75f3f
commit
f0f482c1fe
|
|
@ -235,12 +235,20 @@ public class ValidateVariants extends RodWalker<GenomeLoc, GenomeLocSortedSet> {
|
|||
lastVcEnd = vc.getEnd();
|
||||
}
|
||||
|
||||
// if there were no variants do not add anything!
|
||||
if (lastVcEnd == -1)
|
||||
return null;
|
||||
|
||||
return GenomeLoc.setStop(ref.getLocus(), lastVcEnd);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLocSortedSet reduce(GenomeLoc value, GenomeLocSortedSet sum) {
|
||||
sum.add(value, true);
|
||||
// unless we are doing gvcf validation, there's no need
|
||||
// to add all the genomelocs. Also, since we do not expect them to be contiguous, it will
|
||||
// take a lot of memory.
|
||||
if (VALIDATE_GVCF)
|
||||
sum.add(value, GenomeLocSortedSet.MergeStrategy.MERGE_CONTIGUOUS);
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
|
@ -252,8 +260,16 @@ public class ValidateVariants extends RodWalker<GenomeLoc, GenomeLocSortedSet> {
|
|||
@Override
|
||||
public void onTraversalDone(GenomeLocSortedSet result) {
|
||||
if (VALIDATE_GVCF) {
|
||||
final GenomeLocSortedSet uncoveredIntervals = getToolkit().getIntervals().subtractRegions(result);
|
||||
if (uncoveredIntervals.coveredSize() > 0) {
|
||||
final GenomeLocSortedSet toolkitIntervals = getToolkit().getIntervals();
|
||||
final GenomeLocSortedSet traversalIntervals;
|
||||
if (toolkitIntervals == null) {
|
||||
traversalIntervals = GenomeLocSortedSet.createSetFromSequenceDictionary(getToolkit().getMasterSequenceDictionary());
|
||||
} else {
|
||||
traversalIntervals = toolkitIntervals;
|
||||
}
|
||||
|
||||
final GenomeLocSortedSet uncoveredIntervals = traversalIntervals.subtractRegions(result);
|
||||
if (traversalIntervals.subtractRegions(result).coveredSize() > 0) {
|
||||
final UserException e = new UserException.FailsStrictValidation(file, "A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() +
|
||||
" loci with no VariantContext covering it. The first uncovered segment is:" +
|
||||
uncoveredIntervals.iterator().next());
|
||||
|
|
|
|||
|
|
@ -375,6 +375,17 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
return ( comparison == 1 || ( comparison == 0 && this.getStart() > that.getStop() ));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether this contig is completely after contig 'that' and separate from it by at-least one base.
|
||||
* @param that Contig to test against.
|
||||
* @return true if this contig starts after 'that' ends + 1 ; false if this is completely before or
|
||||
* is continguous with 'that', in the sensce of {@link GenomeLoc#contiguousP(GenomeLoc)}
|
||||
*/
|
||||
@Requires("that != null")
|
||||
public boolean isBeyond( final GenomeLoc that ) {
|
||||
return isPast(that) && minDistance(that) > 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the minimum distance between any pair of bases in this and that GenomeLocs:
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -26,13 +26,13 @@
|
|||
package org.broadinstitute.gatk.utils;
|
||||
|
||||
import htsjdk.samtools.SAMSequenceDictionary;
|
||||
import htsjdk.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
|
||||
import org.broadinstitute.gatk.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.gatk.utils.interval.IntervalUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* <p/>
|
||||
|
|
@ -54,7 +54,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
// our private storage for the GenomeLoc's
|
||||
private final List<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
|
||||
private final List<GenomeLoc> mArray = new ArrayList<>();
|
||||
|
||||
// cache this to make overlap checking much more efficient
|
||||
private int previousOverlapSearchIndex = -1;
|
||||
|
|
@ -92,7 +92,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
public GenomeLocSortedSet(final GenomeLocParser parser, final Collection<GenomeLoc> l) {
|
||||
this(parser);
|
||||
|
||||
final ArrayList<GenomeLoc> sorted = new ArrayList<GenomeLoc>(l);
|
||||
final ArrayList<GenomeLoc> sorted = new ArrayList<>(l);
|
||||
Collections.sort(sorted);
|
||||
mArray.addAll(IntervalUtils.mergeIntervalLocations(sorted, IntervalMergingRule.OVERLAPPING_ONLY));
|
||||
}
|
||||
|
|
@ -228,7 +228,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
final int start = Math.max(-(index + 1) - 1, 0);
|
||||
final int size = mArray.size();
|
||||
|
||||
final List<GenomeLoc> overlapping = new LinkedList<GenomeLoc>();
|
||||
final List<GenomeLoc> overlapping = new LinkedList<>();
|
||||
for ( int i = start; i < size; i++ ) {
|
||||
final GenomeLoc myLoc = mArray.get(i);
|
||||
if ( loc.overlapsP(myLoc) )
|
||||
|
|
@ -254,13 +254,10 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
* @return a non-null list of locations that overlap loc
|
||||
*/
|
||||
protected List<GenomeLoc> getOverlappingFullSearch(final GenomeLoc loc) {
|
||||
final List<GenomeLoc> overlapping = new LinkedList<GenomeLoc>();
|
||||
final List<GenomeLoc> overlapping = new LinkedList<>();
|
||||
|
||||
// super slow, but definitely works
|
||||
for ( final GenomeLoc myLoc : mArray ) {
|
||||
if ( loc.overlapsP(myLoc) )
|
||||
overlapping.add(myLoc);
|
||||
}
|
||||
overlapping.addAll(mArray.stream().filter(loc::overlapsP).collect(Collectors.toList()));
|
||||
|
||||
return overlapping;
|
||||
}
|
||||
|
|
@ -273,8 +270,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
*
|
||||
* @return true if the loc was added or false otherwise (if the loc was null)
|
||||
*/
|
||||
@Override
|
||||
public boolean add(final GenomeLoc loc) {
|
||||
return add(loc, false);
|
||||
return add(loc, MergeStrategy.DO_NOT_MERGE);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -286,40 +284,53 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
* @return true if the loc was added or false otherwise (if the loc was null)
|
||||
*/
|
||||
public boolean addRegion(final GenomeLoc loc) {
|
||||
return add(loc, true);
|
||||
return add(loc, MergeStrategy.MERGE_OVERLAPS);
|
||||
}
|
||||
|
||||
/**
|
||||
* An enum describing the different ways in which one may wish to merge GenomeLocs
|
||||
*/
|
||||
public enum MergeStrategy {
|
||||
DO_NOT_MERGE, // nothing is merged
|
||||
MERGE_OVERLAPS, // overlapping (sharing a locus) GenomeLocs will be merged
|
||||
MERGE_CONTIGUOUS // overlapping _or_ just abutting GenomeLocs will be merged
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
|
||||
*
|
||||
* @param loc the GenomeLoc to add
|
||||
* @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception
|
||||
* @param loc The GenomeLoc to add
|
||||
* @param mergeStrategy A merge strategy informing whether to avoid merging at all, to merge overlapping
|
||||
* intervals, or to merge contiguous (i.e. overlapping or abutting) intervals.
|
||||
* If mergeStrategy == DO_NOT_MERGE and loc overlaps another in the set, an exception is thrown
|
||||
*
|
||||
* @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate)
|
||||
*
|
||||
* @throws IllegalArgumentException If loc overlaps any GenomeLoc in the set but mergeStrategy == DO_NOT_MERGE
|
||||
*/
|
||||
public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) {
|
||||
public boolean add(final GenomeLoc loc, final MergeStrategy mergeStrategy) throws IllegalArgumentException {
|
||||
if ( loc == null )
|
||||
return false;
|
||||
|
||||
// if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the
|
||||
// case because locs are generally added in order) then be extra efficient and just add the loc to the end
|
||||
if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) {
|
||||
if ( mArray.size() == 0 || loc.isBeyond(mArray.get(mArray.size() - 1))) {
|
||||
return mArray.add(loc);
|
||||
}
|
||||
|
||||
// find where in the list the new loc belongs
|
||||
final int binarySearchIndex = Collections.binarySearch(mArray,loc);
|
||||
final int binarySearchIndex = Collections.binarySearch(mArray, loc);
|
||||
|
||||
// if it already exists in the list, return or throw an exception as needed
|
||||
if ( binarySearchIndex >= 0 ) {
|
||||
if ( mergeIfIntervalOverlaps )
|
||||
if ( mergeStrategy != MergeStrategy.DO_NOT_MERGE )
|
||||
return false;
|
||||
throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc);
|
||||
}
|
||||
|
||||
// if it overlaps a loc already in the list merge or throw an exception as needed
|
||||
final int insertionIndex = -1 * (binarySearchIndex + 1);
|
||||
if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) {
|
||||
if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, mergeStrategy) ) {
|
||||
// it does not overlap any current intervals, so add it to the set
|
||||
mArray.add(insertionIndex, loc);
|
||||
}
|
||||
|
|
@ -327,39 +338,53 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
|
||||
/**
|
||||
* If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested)
|
||||
*
|
||||
* @param loc the GenomeLoc to add
|
||||
* @param insertionIndex the index in the sorted set to add the new loc
|
||||
* @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them
|
||||
* @param loc the GenomeLoc to add
|
||||
* @param insertionIndex the index in the sorted set to add the new loc
|
||||
* @param mergeStrategy a MergeStrategy informing how to deal with overlapping and abutting intervals
|
||||
*
|
||||
* @return true if the loc was added or false otherwise
|
||||
*/
|
||||
private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) {
|
||||
private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final MergeStrategy mergeStrategy) {
|
||||
// try merging with the previous index
|
||||
if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) {
|
||||
if ( throwExceptionIfOverlapping )
|
||||
if ( mergeStrategy == MergeStrategy.DO_NOT_MERGE )
|
||||
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString()));
|
||||
mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc));
|
||||
return true;
|
||||
}
|
||||
|
||||
// if asking to merge contiguous, and is contiguous, merge and add
|
||||
if ( mergeStrategy == MergeStrategy.MERGE_CONTIGUOUS & insertionIndex != 0 && loc.contiguousP(mArray.get(insertionIndex - 1)) ) {
|
||||
mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc));
|
||||
return true;
|
||||
}
|
||||
|
||||
// try merging with the following index
|
||||
if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) {
|
||||
if ( throwExceptionIfOverlapping )
|
||||
if ( mergeStrategy == MergeStrategy.DO_NOT_MERGE )
|
||||
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString()));
|
||||
mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc));
|
||||
return true;
|
||||
}
|
||||
|
||||
// if asking to merge contiguous, and is contiguous, merge and add
|
||||
if ( mergeStrategy == MergeStrategy.MERGE_CONTIGUOUS && insertionIndex < mArray.size() && loc.contiguousP(mArray.get(insertionIndex)) ) {
|
||||
mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc));
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
|
||||
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
|
||||
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
|
||||
Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();
|
||||
LinkedList<GenomeLoc> good = new LinkedList<>();
|
||||
Stack<GenomeLoc> toProcess = new Stack<>();
|
||||
Stack<GenomeLoc> toExclude = new Stack<>();
|
||||
|
||||
// initialize the stacks
|
||||
toProcess.addAll(mArray);
|
||||
|
|
@ -379,8 +404,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
|
||||
if ( p.overlapsP(e) ) {
|
||||
toProcess.pop();
|
||||
for ( GenomeLoc newP : p.subtract(e) )
|
||||
toProcess.push(newP);
|
||||
p.subtract(e).forEach(toProcess::push);
|
||||
} else if ( p.compareContigs(e) < 0 ) {
|
||||
good.add(toProcess.pop()); // p is now good
|
||||
} else if ( p.compareContigs(e) > 0 ) {
|
||||
|
|
@ -400,7 +424,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
return createSetFromList(genomeLocParser,good);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
|
||||
* @param location the GenomeLoc to remove
|
||||
|
|
@ -420,9 +443,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) {
|
||||
final GenomeLocParser parser = new GenomeLocParser(dict);
|
||||
final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
|
||||
for ( final SAMSequenceRecord sequence : dict.getSequences() ) {
|
||||
returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName()));
|
||||
}
|
||||
returnSortedSet.addAll(dict.getSequences().stream().map(sequence -> parser.createOverEntireContig(sequence.getSequenceName())).collect(Collectors.toList()));
|
||||
return returnSortedSet;
|
||||
}
|
||||
|
||||
|
|
@ -433,13 +454,12 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
*
|
||||
* @return the sorted genome loc list
|
||||
*/
|
||||
public static GenomeLocSortedSet createSetFromList(GenomeLocParser parser,List<GenomeLoc> locs) {
|
||||
public static GenomeLocSortedSet createSetFromList(final GenomeLocParser parser, final List<GenomeLoc> locs) {
|
||||
GenomeLocSortedSet set = new GenomeLocSortedSet(parser);
|
||||
set.addAll(locs);
|
||||
return set;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* return a deep copy of this collection.
|
||||
*
|
||||
|
|
@ -447,10 +467,8 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
*/
|
||||
public GenomeLocSortedSet clone() {
|
||||
GenomeLocSortedSet ret = new GenomeLocSortedSet(genomeLocParser);
|
||||
for (GenomeLoc loc : this.mArray) {
|
||||
// ensure a deep copy
|
||||
ret.mArray.add(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop()));
|
||||
}
|
||||
// ensure a deep copy
|
||||
ret.mArray.addAll(this.mArray.stream().map(loc -> genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop())).collect(Collectors.toList()));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,15 +26,8 @@
|
|||
package org.broadinstitute.gatk.utils;
|
||||
|
||||
import htsjdk.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.gatk.utils.BaseTest;
|
||||
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
|
||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
|
||||
|
||||
import static org.testng.Assert.assertEquals;
|
||||
import static org.testng.Assert.assertFalse;
|
||||
import static org.testng.Assert.assertTrue;
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -44,6 +37,8 @@ import org.testng.annotations.Test;
|
|||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
import static org.testng.Assert.*;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
|
|
@ -132,6 +127,48 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
|
|||
assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName));
|
||||
}
|
||||
|
||||
@DataProvider(name="secondContigStart")
|
||||
public Object[][] secondContigStart(){
|
||||
return new Object[][]{
|
||||
new Object[]{49},
|
||||
new Object[]{50},
|
||||
new Object[]{51}
|
||||
};
|
||||
}
|
||||
|
||||
@Test(dataProvider = "secondContigStart")
|
||||
public void addMergeContiguous(final int secondContigStart) {
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50);
|
||||
mSortedSet.add(g);
|
||||
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, secondContigStart, 80);
|
||||
mSortedSet.add(f, GenomeLocSortedSet.MergeStrategy.MERGE_CONTIGUOUS);
|
||||
assertTrue(mSortedSet.size() == 1);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "secondContigStart")
|
||||
public void addMergeContiguousReverse(final int secondContigStart) {
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, secondContigStart, 80);
|
||||
mSortedSet.add(g);
|
||||
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 1, 50);
|
||||
mSortedSet.add(f, GenomeLocSortedSet.MergeStrategy.MERGE_CONTIGUOUS);
|
||||
assertTrue(mSortedSet.size() == 1);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "secondContigStart")
|
||||
public void addMergeContiguousOutOfOrder(final int secondContigStart) {
|
||||
final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName();
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50);
|
||||
mSortedSet.add(g);
|
||||
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, secondContigStart, 80);
|
||||
mSortedSet.addRegion(f);
|
||||
assertTrue(mSortedSet.size() == 2);
|
||||
assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName));
|
||||
assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName));
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void addThrowsException() {
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
|
|
@ -329,7 +366,6 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
|
|||
testSizeBeforeLocX(50, (int)mSortedSet.coveredSize());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void fromSequenceDictionary() {
|
||||
mSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(this.header.getSequenceDictionary());
|
||||
|
|
|
|||
Loading…
Reference in New Issue