diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 3168d9a6b..cbbb3d43f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -113,6 +113,7 @@ import java.util.*; // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) +@PartitionBy(PartitionType.INTERVAL) public class DepthOfCoverageWalker extends LocusWalker>, CoveragePartitioner> implements TreeReducible { @Output @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b66198713..c1479bc69 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -5,6 +5,10 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -174,6 +178,8 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) }; } + public GenomeLoc union( GenomeLoc that ) { return merge(that); } + @Requires("that != null") @Ensures("result != null") public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException { @@ -192,6 +198,79 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome Math.min( getStop(), that.getStop()) ); } + @Requires("that != null") + public final List subtract( final GenomeLoc that ) { + if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { + if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) + throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); + return Arrays.asList(UNMAPPED); + } + + if (!(this.overlapsP(that))) { + throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap"); + } + + if (equals(that)) { + return Collections.emptyList(); + } else if (containsP(that)) { + List l = new ArrayList(2); + + /** + * we have to create two new region, one for the before part, one for the after + * The old region: + * |----------------- old region (g) -------------| + * |----- to delete (e) ------| + * + * product (two new regions): + * |------| + |--------| + * + */ + int afterStop = this.getStop(), afterStart = that.getStop() + 1; + int beforeStop = that.getStart() - 1, beforeStart = this.getStart(); + if (afterStop - afterStart >= 0) { + GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop); + l.add(after); + } + if (beforeStop - beforeStart >= 0) { + GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop); + l.add(before); + } + + return l; + } else if (that.containsP(this)) { + /** + * e completely contains g, delete g, but keep looking, there may be more regions + * i.e.: + * |--------------------- e --------------------| + * |--- g ---| |---- others ----| + */ + return Collections.emptyList(); // don't need to do anything + } else { + /** + * otherwise e overlaps some part of g + * + * figure out which region occurs first on the genome. I.e., is it: + * |------------- g ----------| + * |------------- e ----------| + * + * or: + * |------------- g ----------| + * |------------ e -----------| + * + */ + + GenomeLoc n; + if (that.getStart() < this.getStart()) { + n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop()); + } else { + n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1); + } + + // replace g with the new region + return Arrays.asList(n); + } + } + @Requires("that != null") public final boolean containsP(GenomeLoc that) { return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop(); @@ -203,19 +282,14 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } @Requires("that != null") - public final int minus( final GenomeLoc that ) { + @Ensures("result >= 0") + public final int distance( final GenomeLoc that ) { if ( this.onSameContig(that) ) - return this.getStart() - that.getStart(); + return Math.abs(this.getStart() - that.getStart()); else return Integer.MAX_VALUE; } - @Requires("that != null") - @Ensures("result >= 0") - public final int distance( final GenomeLoc that ) { - return Math.abs(minus(that)); - } - @Requires({"left != null", "right != null"}) public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) { return this.compareTo(left) > -1 && this.compareTo(right) < 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index fd7a79f48..26be0e59e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -215,7 +215,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( p.overlapsP(e) ) { toProcess.pop(); - for ( GenomeLoc newP : subtractRegion(p, e) ) + for ( GenomeLoc newP : p.subtract(e) ) toProcess.push(newP); } else if ( p.compareContigs(e) < 0 ) { good.add(toProcess.pop()); // p is now good @@ -236,69 +236,6 @@ public class GenomeLocSortedSet extends AbstractSet { return createSetFromList(genomeLocParser,good); } - private static final List EMPTY_LIST = new ArrayList(); - private List subtractRegion(GenomeLoc g, GenomeLoc e) { - if (g.equals(e)) { - return EMPTY_LIST; - } else if (g.containsP(e)) { - List l = new ArrayList(); - - /** - * we have to create two new region, one for the before part, one for the after - * The old region: - * |----------------- old region (g) -------------| - * |----- to delete (e) ------| - * - * product (two new regions): - * |------| + |--------| - * - */ - int afterStop = g.getStop(), afterStart = e.getStop() + 1; - int beforeStop = e.getStart() - 1, beforeStart = g.getStart(); - if (afterStop - afterStart >= 0) { - GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop); - l.add(after); - } - if (beforeStop - beforeStart >= 0) { - GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop); - l.add(before); - } - - return l; - } else if (e.containsP(g)) { - /** - * e completely contains g, delete g, but keep looking, there may be more regions - * i.e.: - * |--------------------- e --------------------| - * |--- g ---| |---- others ----| - */ - return EMPTY_LIST; // don't need to do anything - } else { - /** - * otherwise e overlaps some part of g - * - * figure out which region occurs first on the genome. I.e., is it: - * |------------- g ----------| - * |------------- e ----------| - * - * or: - * |------------- g ----------| - * |------------ e -----------| - * - */ - - GenomeLoc n; - if (e.getStart() < g.getStart()) { - n = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop()); - } else { - n = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1); - } - - // replace g with the new region - return Arrays.asList(n); - } - } - /** * a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping) diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 372efa350..f0eb5d399 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -586,6 +586,12 @@ public class Utils { return rcbases; } + static public final List reverse(final List l) { + final List newL = new ArrayList(l); + Collections.reverse(newL); + return newL; + } + /** * Reverse an int array of bases * diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 139cded37..f0e164c87 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.interval; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; @@ -8,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -221,6 +224,44 @@ public class IntervalUtils { return GenomeLocSortedSet.createSetFromList(parser,intervals); } + /** + * computes whether the test interval list is equivalent to master. To be equivalent, test must + * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm + * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but + * rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm + * doesn't assume that test has discontinuous genomelocs. + * + * Returns a null string if there are no differences, otherwise returns a string describing the difference + * (useful for UnitTests). Assumes both lists are sorted + */ + public static final String equateIntervals(List masterArg, List testArg) { + LinkedList master = new LinkedList(masterArg); + LinkedList test = new LinkedList(testArg); + + while ( ! master.isEmpty() ) { // there's still unchecked bases in master + final GenomeLoc masterHead = master.pop(); + final GenomeLoc testHead = test.pop(); + + if ( testHead.overlapsP(masterHead) ) { + // remove the parts of test that overlap master, and push the remaining + // parts onto master for further comparison. + for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) { + master.push(masterPart); + } + } else { + // testHead is incompatible with masterHead, so we must have extra bases in testHead + // that aren't in master + return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead; + } + } + + if ( test.isEmpty() ) // everything is equal + return null; // no differences + else + return "Remaining elements found in test: first=" + test.peek(); + } + + /** * Check if string argument was intented as a file * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. @@ -384,6 +425,92 @@ public class IntervalUtils { return splitIntervalsToSubLists(locs, splitPoints); } + @Requires({"locs != null", "numParts > 0"}) + @Ensures("result != null") + public static List> splitLocusIntervals(List locs, int numParts) { + // the ideal size of each split + final long bp = IntervalUtils.intervalSize(locs); + final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1); + + // algorithm: + // split = () + // set size = 0 + // pop the head H off locs. + // If size + size(H) < splitSize: + // add H to split, continue + // If size + size(H) == splitSize: + // done with split, put in splits, restart + // if size + size(H) > splitSize: + // cut H into two pieces, first of which has splitSize - size bp + // push both pieces onto locs, continue + // The last split is special -- when you have only one split left, it gets all of the remaining locs + // to deal with rounding issues + final List> splits = new ArrayList>(numParts); + + LinkedList locsLinkedList = new LinkedList(locs); + while ( ! locsLinkedList.isEmpty() ) { + if ( splits.size() + 1 == numParts ) { + // the last one gets all of the remaining parts + splits.add(new ArrayList(locsLinkedList)); + locsLinkedList.clear(); + } else { + final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize); + splits.add(one.split); + locsLinkedList = one.remaining; + } + } + + return splits; + } + + @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) + @Ensures({"result != null"}) + final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + final List split = new ArrayList(); + long size = 0; + + while ( ! remaining.isEmpty() ) { + GenomeLoc head = remaining.pop(); + final long newSize = size + head.size(); + + if ( newSize == idealSplitSize ) { + split.add(head); + break; // we are done + } else if ( newSize > idealSplitSize ) { + final long remainingBp = idealSplitSize - size; + final long cutPoint = head.getStart() + remainingBp; + GenomeLoc[] parts = head.split((int)cutPoint); + remaining.push(parts[1]); + remaining.push(parts[0]); + // when we go around, head.size' = idealSplitSize - size + // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize + } else { + split.add(head); + size = newSize; + } + } + + return new SplitLocusRecursive(split, remaining); + } + + private final static class SplitLocusRecursive { + final List split; + final LinkedList remaining; + + @Requires({"split != null", "remaining != null"}) + private SplitLocusRecursive(final List split, final LinkedList remaining) { + this.split = split; + this.remaining = remaining; + } + } + + public static List flattenSplitIntervals(List> splits) { + final List locs = new ArrayList(); + for ( final List split : splits ) + locs.addAll(split); + return locs; + } + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { if (numParts < 2) return; diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 067ec14a0..13b746dc9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -131,6 +132,56 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- + + private class IntervalRepartitionTest extends TestDataProvider { + final List originalIntervals; + final public int parts; + + private IntervalRepartitionTest(final String name, List originalIntervals, final int parts) { + super(IntervalRepartitionTest.class, name); + this.parts = parts; + this.originalIntervals = originalIntervals; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "IntervalRepartitionTest") + public Object[][] createIntervalRepartitionTest() { + for ( int parts : Arrays.asList(1, 10, 100, 1000, 10000) ) { + new IntervalRepartitionTest("hg19RefLocs", hg19ReferenceLocs, parts); + new IntervalRepartitionTest("hg19ExomeLocs", hg19exomeIntervals, parts); + } + + return IntervalSlicingTest.getTests(IntervalRepartitionTest.class); + } + + @Test(enabled = true, dataProvider = "IntervalRepartitionTest") + public void testIntervalRepartition(IntervalRepartitionTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); + Assert.assertEquals(test.parts, splitByLocus.size(), "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test sizes + final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long splitSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(originalSize, splitSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + // test that every base in original is covered once by a base in split by locus intervals + // todo implement test (complex) + } + + // + // Misc. tests + // + @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala index 50482033f..366d23488 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala @@ -24,8 +24,20 @@ package org.broadinstitute.sting.queue.extensions.gatk +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.queue.function.InProcessFunction + /** - * For now returns an IntervalScatterFunction. - * TODO: A scatter function that divides down to the locus level. + * A scatter function that divides down to the locus level. */ -class LocusScatterFunction extends IntervalScatterFunction {} +class LocusScatterFunction extends IntervalScatterFunction { +} +// +//class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { +// // todo -- max intervals is actually the original scatter count, not capped by interval size +// def run() { +// val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) +// val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) +// IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) +// } +//} \ No newline at end of file