IntervalUtils for completely balanced locus-based scatter/gather
-- scatterLocusIntervals master utility -- Moved around some general functionality from GenomeLocSortedSet to GenomeLoc -- Util function for reversing a list (List<T> -> List<T>, unlike Collections version) -- DoC is PartitionType.INTERVAL -- Significant unit tests on new functionality (all passing) -- Ready for real-world testing, as soon as I can get LocusScatterFunction.scala to actually work
This commit is contained in:
parent
5fc613f972
commit
c2b97030a4
|
|
@ -113,6 +113,7 @@ import java.util.*;
|
|||
// todo -- allow for user to set linear binning (default is logarithmic)
|
||||
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
|
||||
@By(DataSource.REFERENCE)
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partition,Map<String,int[]>>, CoveragePartitioner> implements TreeReducible<CoveragePartitioner> {
|
||||
@Output
|
||||
@Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"})
|
||||
|
|
|
|||
|
|
@ -5,6 +5,10 @@ import com.google.java.contract.Requires;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -174,6 +178,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) };
|
||||
}
|
||||
|
||||
public GenomeLoc union( GenomeLoc that ) { return merge(that); }
|
||||
|
||||
@Requires("that != null")
|
||||
@Ensures("result != null")
|
||||
public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
|
||||
|
|
@ -192,6 +198,79 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
Math.min( getStop(), that.getStop()) );
|
||||
}
|
||||
|
||||
@Requires("that != null")
|
||||
public final List<GenomeLoc> subtract( final GenomeLoc that ) {
|
||||
if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
|
||||
if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
|
||||
throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
|
||||
return Arrays.asList(UNMAPPED);
|
||||
}
|
||||
|
||||
if (!(this.overlapsP(that))) {
|
||||
throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap");
|
||||
}
|
||||
|
||||
if (equals(that)) {
|
||||
return Collections.emptyList();
|
||||
} else if (containsP(that)) {
|
||||
List<GenomeLoc> l = new ArrayList<GenomeLoc>(2);
|
||||
|
||||
/**
|
||||
* we have to create two new region, one for the before part, one for the after
|
||||
* The old region:
|
||||
* |----------------- old region (g) -------------|
|
||||
* |----- to delete (e) ------|
|
||||
*
|
||||
* product (two new regions):
|
||||
* |------| + |--------|
|
||||
*
|
||||
*/
|
||||
int afterStop = this.getStop(), afterStart = that.getStop() + 1;
|
||||
int beforeStop = that.getStart() - 1, beforeStart = this.getStart();
|
||||
if (afterStop - afterStart >= 0) {
|
||||
GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop);
|
||||
l.add(after);
|
||||
}
|
||||
if (beforeStop - beforeStart >= 0) {
|
||||
GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop);
|
||||
l.add(before);
|
||||
}
|
||||
|
||||
return l;
|
||||
} else if (that.containsP(this)) {
|
||||
/**
|
||||
* e completely contains g, delete g, but keep looking, there may be more regions
|
||||
* i.e.:
|
||||
* |--------------------- e --------------------|
|
||||
* |--- g ---| |---- others ----|
|
||||
*/
|
||||
return Collections.emptyList(); // don't need to do anything
|
||||
} else {
|
||||
/**
|
||||
* otherwise e overlaps some part of g
|
||||
*
|
||||
* figure out which region occurs first on the genome. I.e., is it:
|
||||
* |------------- g ----------|
|
||||
* |------------- e ----------|
|
||||
*
|
||||
* or:
|
||||
* |------------- g ----------|
|
||||
* |------------ e -----------|
|
||||
*
|
||||
*/
|
||||
|
||||
GenomeLoc n;
|
||||
if (that.getStart() < this.getStart()) {
|
||||
n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop());
|
||||
} else {
|
||||
n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1);
|
||||
}
|
||||
|
||||
// replace g with the new region
|
||||
return Arrays.asList(n);
|
||||
}
|
||||
}
|
||||
|
||||
@Requires("that != null")
|
||||
public final boolean containsP(GenomeLoc that) {
|
||||
return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop();
|
||||
|
|
@ -203,19 +282,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
}
|
||||
|
||||
@Requires("that != null")
|
||||
public final int minus( final GenomeLoc that ) {
|
||||
@Ensures("result >= 0")
|
||||
public final int distance( final GenomeLoc that ) {
|
||||
if ( this.onSameContig(that) )
|
||||
return this.getStart() - that.getStart();
|
||||
return Math.abs(this.getStart() - that.getStart());
|
||||
else
|
||||
return Integer.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Requires("that != null")
|
||||
@Ensures("result >= 0")
|
||||
public final int distance( final GenomeLoc that ) {
|
||||
return Math.abs(minus(that));
|
||||
}
|
||||
|
||||
@Requires({"left != null", "right != null"})
|
||||
public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) {
|
||||
return this.compareTo(left) > -1 && this.compareTo(right) < 1;
|
||||
|
|
|
|||
|
|
@ -215,7 +215,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
|
||||
if ( p.overlapsP(e) ) {
|
||||
toProcess.pop();
|
||||
for ( GenomeLoc newP : subtractRegion(p, e) )
|
||||
for ( GenomeLoc newP : p.subtract(e) )
|
||||
toProcess.push(newP);
|
||||
} else if ( p.compareContigs(e) < 0 ) {
|
||||
good.add(toProcess.pop()); // p is now good
|
||||
|
|
@ -236,69 +236,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
return createSetFromList(genomeLocParser,good);
|
||||
}
|
||||
|
||||
private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
|
||||
private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
|
||||
if (g.equals(e)) {
|
||||
return EMPTY_LIST;
|
||||
} else if (g.containsP(e)) {
|
||||
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
||||
|
||||
/**
|
||||
* we have to create two new region, one for the before part, one for the after
|
||||
* The old region:
|
||||
* |----------------- old region (g) -------------|
|
||||
* |----- to delete (e) ------|
|
||||
*
|
||||
* product (two new regions):
|
||||
* |------| + |--------|
|
||||
*
|
||||
*/
|
||||
int afterStop = g.getStop(), afterStart = e.getStop() + 1;
|
||||
int beforeStop = e.getStart() - 1, beforeStart = g.getStart();
|
||||
if (afterStop - afterStart >= 0) {
|
||||
GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop);
|
||||
l.add(after);
|
||||
}
|
||||
if (beforeStop - beforeStart >= 0) {
|
||||
GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop);
|
||||
l.add(before);
|
||||
}
|
||||
|
||||
return l;
|
||||
} else if (e.containsP(g)) {
|
||||
/**
|
||||
* e completely contains g, delete g, but keep looking, there may be more regions
|
||||
* i.e.:
|
||||
* |--------------------- e --------------------|
|
||||
* |--- g ---| |---- others ----|
|
||||
*/
|
||||
return EMPTY_LIST; // don't need to do anything
|
||||
} else {
|
||||
/**
|
||||
* otherwise e overlaps some part of g
|
||||
*
|
||||
* figure out which region occurs first on the genome. I.e., is it:
|
||||
* |------------- g ----------|
|
||||
* |------------- e ----------|
|
||||
*
|
||||
* or:
|
||||
* |------------- g ----------|
|
||||
* |------------ e -----------|
|
||||
*
|
||||
*/
|
||||
|
||||
GenomeLoc n;
|
||||
if (e.getStart() < g.getStart()) {
|
||||
n = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop());
|
||||
} else {
|
||||
n = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1);
|
||||
}
|
||||
|
||||
// replace g with the new region
|
||||
return Arrays.asList(n);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
|
||||
|
|
|
|||
|
|
@ -586,6 +586,12 @@ public class Utils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
static public final <T> List<T> reverse(final List<T> l) {
|
||||
final List<T> newL = new ArrayList<T>(l);
|
||||
Collections.reverse(newL);
|
||||
return newL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse an int array of bases
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.interval;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.picard.util.Interval;
|
||||
import net.sf.picard.util.IntervalList;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
|
@ -8,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -221,6 +224,44 @@ public class IntervalUtils {
|
|||
return GenomeLocSortedSet.createSetFromList(parser,intervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* computes whether the test interval list is equivalent to master. To be equivalent, test must
|
||||
* contain GenomeLocs covering every base in master, exactly once. Note that this algorithm
|
||||
* assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but
|
||||
* rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm
|
||||
* doesn't assume that test has discontinuous genomelocs.
|
||||
*
|
||||
* Returns a null string if there are no differences, otherwise returns a string describing the difference
|
||||
* (useful for UnitTests). Assumes both lists are sorted
|
||||
*/
|
||||
public static final String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
||||
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
|
||||
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
|
||||
|
||||
while ( ! master.isEmpty() ) { // there's still unchecked bases in master
|
||||
final GenomeLoc masterHead = master.pop();
|
||||
final GenomeLoc testHead = test.pop();
|
||||
|
||||
if ( testHead.overlapsP(masterHead) ) {
|
||||
// remove the parts of test that overlap master, and push the remaining
|
||||
// parts onto master for further comparison.
|
||||
for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) {
|
||||
master.push(masterPart);
|
||||
}
|
||||
} else {
|
||||
// testHead is incompatible with masterHead, so we must have extra bases in testHead
|
||||
// that aren't in master
|
||||
return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead;
|
||||
}
|
||||
}
|
||||
|
||||
if ( test.isEmpty() ) // everything is equal
|
||||
return null; // no differences
|
||||
else
|
||||
return "Remaining elements found in test: first=" + test.peek();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Check if string argument was intented as a file
|
||||
* Accepted file extensions: .bed .list, .picard, .interval_list, .intervals.
|
||||
|
|
@ -384,6 +425,92 @@ public class IntervalUtils {
|
|||
return splitIntervalsToSubLists(locs, splitPoints);
|
||||
}
|
||||
|
||||
@Requires({"locs != null", "numParts > 0"})
|
||||
@Ensures("result != null")
|
||||
public static List<List<GenomeLoc>> splitLocusIntervals(List<GenomeLoc> locs, int numParts) {
|
||||
// the ideal size of each split
|
||||
final long bp = IntervalUtils.intervalSize(locs);
|
||||
final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1);
|
||||
|
||||
// algorithm:
|
||||
// split = ()
|
||||
// set size = 0
|
||||
// pop the head H off locs.
|
||||
// If size + size(H) < splitSize:
|
||||
// add H to split, continue
|
||||
// If size + size(H) == splitSize:
|
||||
// done with split, put in splits, restart
|
||||
// if size + size(H) > splitSize:
|
||||
// cut H into two pieces, first of which has splitSize - size bp
|
||||
// push both pieces onto locs, continue
|
||||
// The last split is special -- when you have only one split left, it gets all of the remaining locs
|
||||
// to deal with rounding issues
|
||||
final List<List<GenomeLoc>> splits = new ArrayList<List<GenomeLoc>>(numParts);
|
||||
|
||||
LinkedList<GenomeLoc> locsLinkedList = new LinkedList<GenomeLoc>(locs);
|
||||
while ( ! locsLinkedList.isEmpty() ) {
|
||||
if ( splits.size() + 1 == numParts ) {
|
||||
// the last one gets all of the remaining parts
|
||||
splits.add(new ArrayList<GenomeLoc>(locsLinkedList));
|
||||
locsLinkedList.clear();
|
||||
} else {
|
||||
final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize);
|
||||
splits.add(one.split);
|
||||
locsLinkedList = one.remaining;
|
||||
}
|
||||
}
|
||||
|
||||
return splits;
|
||||
}
|
||||
|
||||
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
|
||||
@Ensures({"result != null"})
|
||||
final static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
||||
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
|
||||
long size = 0;
|
||||
|
||||
while ( ! remaining.isEmpty() ) {
|
||||
GenomeLoc head = remaining.pop();
|
||||
final long newSize = size + head.size();
|
||||
|
||||
if ( newSize == idealSplitSize ) {
|
||||
split.add(head);
|
||||
break; // we are done
|
||||
} else if ( newSize > idealSplitSize ) {
|
||||
final long remainingBp = idealSplitSize - size;
|
||||
final long cutPoint = head.getStart() + remainingBp;
|
||||
GenomeLoc[] parts = head.split((int)cutPoint);
|
||||
remaining.push(parts[1]);
|
||||
remaining.push(parts[0]);
|
||||
// when we go around, head.size' = idealSplitSize - size
|
||||
// so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize
|
||||
} else {
|
||||
split.add(head);
|
||||
size = newSize;
|
||||
}
|
||||
}
|
||||
|
||||
return new SplitLocusRecursive(split, remaining);
|
||||
}
|
||||
|
||||
private final static class SplitLocusRecursive {
|
||||
final List<GenomeLoc> split;
|
||||
final LinkedList<GenomeLoc> remaining;
|
||||
|
||||
@Requires({"split != null", "remaining != null"})
|
||||
private SplitLocusRecursive(final List<GenomeLoc> split, final LinkedList<GenomeLoc> remaining) {
|
||||
this.split = split;
|
||||
this.remaining = remaining;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<GenomeLoc> flattenSplitIntervals(List<List<GenomeLoc>> splits) {
|
||||
final List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
||||
for ( final List<GenomeLoc> split : splits )
|
||||
locs.addAll(split);
|
||||
return locs;
|
||||
}
|
||||
|
||||
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
||||
if (numParts < 2)
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.interval;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.util.IntervalUtil;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
|
|
@ -131,6 +132,56 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// tests to ensure the quality of the interval cuts of the interval cutting functions
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
private class IntervalRepartitionTest extends TestDataProvider {
|
||||
final List<GenomeLoc> originalIntervals;
|
||||
final public int parts;
|
||||
|
||||
private IntervalRepartitionTest(final String name, List<GenomeLoc> originalIntervals, final int parts) {
|
||||
super(IntervalRepartitionTest.class, name);
|
||||
this.parts = parts;
|
||||
this.originalIntervals = originalIntervals;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s parts=%d", super.toString(), parts);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "IntervalRepartitionTest")
|
||||
public Object[][] createIntervalRepartitionTest() {
|
||||
for ( int parts : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
new IntervalRepartitionTest("hg19RefLocs", hg19ReferenceLocs, parts);
|
||||
new IntervalRepartitionTest("hg19ExomeLocs", hg19exomeIntervals, parts);
|
||||
}
|
||||
|
||||
return IntervalSlicingTest.getTests(IntervalRepartitionTest.class);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "IntervalRepartitionTest")
|
||||
public void testIntervalRepartition(IntervalRepartitionTest test) {
|
||||
List<List<GenomeLoc>> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts);
|
||||
Assert.assertEquals(test.parts, splitByLocus.size(), "SplitLocusIntervals failed to generate correct number of intervals");
|
||||
List<GenomeLoc> flat = IntervalUtils.flattenSplitIntervals(splitByLocus);
|
||||
|
||||
// test sizes
|
||||
final long originalSize = IntervalUtils.intervalSize(test.originalIntervals);
|
||||
final long splitSize = IntervalUtils.intervalSize(flat);
|
||||
Assert.assertEquals(originalSize, splitSize, "SplitLocusIntervals locs cover an incorrect number of bases");
|
||||
|
||||
// test that every base in original is covered once by a base in split by locus intervals
|
||||
// todo implement test (complex)
|
||||
}
|
||||
|
||||
//
|
||||
// Misc. tests
|
||||
//
|
||||
|
||||
@Test(expectedExceptions=UserException.class)
|
||||
public void testMergeListsBySetOperatorNoOverlap() {
|
||||
// a couple of lists we'll use for the testing
|
||||
|
|
|
|||
|
|
@ -24,8 +24,20 @@
|
|||
|
||||
package org.broadinstitute.sting.queue.extensions.gatk
|
||||
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
|
||||
/**
|
||||
* For now returns an IntervalScatterFunction.
|
||||
* TODO: A scatter function that divides down to the locus level.
|
||||
* A scatter function that divides down to the locus level.
|
||||
*/
|
||||
class LocusScatterFunction extends IntervalScatterFunction {}
|
||||
class LocusScatterFunction extends IntervalScatterFunction {
|
||||
}
|
||||
//
|
||||
//class LocusScatterFunction extends GATKScatterFunction with InProcessFunction {
|
||||
// // todo -- max intervals is actually the original scatter count, not capped by interval size
|
||||
// def run() {
|
||||
// val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
|
||||
// val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size)
|
||||
// IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
|
||||
// }
|
||||
//}
|
||||
Loading…
Reference in New Issue