From 5fc613f972b0a858d15428e6eb762083d53fb306 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 1 Nov 2011 19:47:10 -0400 Subject: [PATCH 1/8] Better default partition types for walkers -- Added PartitionType.READ, and associated ReadScatterFunction. ReadScatterFunction is literally just ContigScatterFunction until someone wants to implement something better -- LocusWalkers (and subclasses RodWalkers and RefWalkers) are by default PartitionType.LOCUS. --- .../sting/gatk/walkers/LocusWalker.java | 2 +- .../sting/gatk/walkers/PartitionType.java | 6 ++ .../sting/gatk/walkers/ReadWalker.java | 2 +- .../gatk/ContigScatterFunction.scala | 1 + .../extensions/gatk/ReadScatterFunction.scala | 56 +++++++++++++++++++ 5 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index 8152f74c2..e94d01d5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -17,7 +17,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; */ @By(DataSource.READS) @Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -@PartitionBy(PartitionType.INTERVAL) +@PartitionBy(PartitionType.LOCUS) @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) public abstract class LocusWalker extends Walker { // Do we actually want to operate on the context? diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java index 361e222c2..f0d92ef8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java @@ -34,6 +34,12 @@ public enum PartitionType { */ NONE, + /** + * The walker inputs can be chunked down to individual + * reads. + */ + READ, + /** * The walker inputs can be chunked down to the * per-locus level. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index db2038aa3..35f04abfc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -12,7 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * To change this template use File | Settings | File Templates. */ @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) -@PartitionBy(PartitionType.CONTIG) +@PartitionBy(PartitionType.READ) public abstract class ReadWalker extends Walker { public boolean requiresOrderedReads() { return false; } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index ee4cb9c4a..d2b3006a9 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -44,3 +44,4 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) } } + diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala new file mode 100644 index 000000000..f56c4aa02 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ReadScatterFunction.scala @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Currently ReadScatterFunction only does ContigScattering, but it + * could in principle do something more aggressive. + */ +class ReadScatterFunction extends ContigScatterFunction { } + From c2b97030a4fcd55d5540c20da5c78575ffdebe75 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Nov 2011 10:49:40 -0400 Subject: [PATCH 2/8] IntervalUtils for completely balanced locus-based scatter/gather -- scatterLocusIntervals master utility -- Moved around some general functionality from GenomeLocSortedSet to GenomeLoc -- Util function for reversing a list (List -> List, unlike Collections version) -- DoC is PartitionType.INTERVAL -- Significant unit tests on new functionality (all passing) -- Ready for real-world testing, as soon as I can get LocusScatterFunction.scala to actually work --- .../coverage/DepthOfCoverageWalker.java | 1 + .../broadinstitute/sting/utils/GenomeLoc.java | 90 +++++++++++-- .../sting/utils/GenomeLocSortedSet.java | 65 +-------- .../org/broadinstitute/sting/utils/Utils.java | 6 + .../sting/utils/interval/IntervalUtils.java | 127 ++++++++++++++++++ .../utils/interval/IntervalUtilsUnitTest.java | 51 +++++++ .../gatk/LocusScatterFunction.scala | 18 ++- 7 files changed, 283 insertions(+), 75 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 3168d9a6b..cbbb3d43f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -113,6 +113,7 @@ import java.util.*; // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) +@PartitionBy(PartitionType.INTERVAL) public class DepthOfCoverageWalker extends LocusWalker>, CoveragePartitioner> implements TreeReducible { @Output @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b66198713..c1479bc69 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -5,6 +5,10 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -174,6 +178,8 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) }; } + public GenomeLoc union( GenomeLoc that ) { return merge(that); } + @Requires("that != null") @Ensures("result != null") public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException { @@ -192,6 +198,79 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome Math.min( getStop(), that.getStop()) ); } + @Requires("that != null") + public final List subtract( final GenomeLoc that ) { + if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { + if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) + throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc"); + return Arrays.asList(UNMAPPED); + } + + if (!(this.overlapsP(that))) { + throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap"); + } + + if (equals(that)) { + return Collections.emptyList(); + } else if (containsP(that)) { + List l = new ArrayList(2); + + /** + * we have to create two new region, one for the before part, one for the after + * The old region: + * |----------------- old region (g) -------------| + * |----- to delete (e) ------| + * + * product (two new regions): + * |------| + |--------| + * + */ + int afterStop = this.getStop(), afterStart = that.getStop() + 1; + int beforeStop = that.getStart() - 1, beforeStart = this.getStart(); + if (afterStop - afterStart >= 0) { + GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop); + l.add(after); + } + if (beforeStop - beforeStart >= 0) { + GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop); + l.add(before); + } + + return l; + } else if (that.containsP(this)) { + /** + * e completely contains g, delete g, but keep looking, there may be more regions + * i.e.: + * |--------------------- e --------------------| + * |--- g ---| |---- others ----| + */ + return Collections.emptyList(); // don't need to do anything + } else { + /** + * otherwise e overlaps some part of g + * + * figure out which region occurs first on the genome. I.e., is it: + * |------------- g ----------| + * |------------- e ----------| + * + * or: + * |------------- g ----------| + * |------------ e -----------| + * + */ + + GenomeLoc n; + if (that.getStart() < this.getStart()) { + n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop()); + } else { + n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1); + } + + // replace g with the new region + return Arrays.asList(n); + } + } + @Requires("that != null") public final boolean containsP(GenomeLoc that) { return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop(); @@ -203,19 +282,14 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } @Requires("that != null") - public final int minus( final GenomeLoc that ) { + @Ensures("result >= 0") + public final int distance( final GenomeLoc that ) { if ( this.onSameContig(that) ) - return this.getStart() - that.getStart(); + return Math.abs(this.getStart() - that.getStart()); else return Integer.MAX_VALUE; } - @Requires("that != null") - @Ensures("result >= 0") - public final int distance( final GenomeLoc that ) { - return Math.abs(minus(that)); - } - @Requires({"left != null", "right != null"}) public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) { return this.compareTo(left) > -1 && this.compareTo(right) < 1; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index fd7a79f48..26be0e59e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -215,7 +215,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( p.overlapsP(e) ) { toProcess.pop(); - for ( GenomeLoc newP : subtractRegion(p, e) ) + for ( GenomeLoc newP : p.subtract(e) ) toProcess.push(newP); } else if ( p.compareContigs(e) < 0 ) { good.add(toProcess.pop()); // p is now good @@ -236,69 +236,6 @@ public class GenomeLocSortedSet extends AbstractSet { return createSetFromList(genomeLocParser,good); } - private static final List EMPTY_LIST = new ArrayList(); - private List subtractRegion(GenomeLoc g, GenomeLoc e) { - if (g.equals(e)) { - return EMPTY_LIST; - } else if (g.containsP(e)) { - List l = new ArrayList(); - - /** - * we have to create two new region, one for the before part, one for the after - * The old region: - * |----------------- old region (g) -------------| - * |----- to delete (e) ------| - * - * product (two new regions): - * |------| + |--------| - * - */ - int afterStop = g.getStop(), afterStart = e.getStop() + 1; - int beforeStop = e.getStart() - 1, beforeStart = g.getStart(); - if (afterStop - afterStart >= 0) { - GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop); - l.add(after); - } - if (beforeStop - beforeStart >= 0) { - GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop); - l.add(before); - } - - return l; - } else if (e.containsP(g)) { - /** - * e completely contains g, delete g, but keep looking, there may be more regions - * i.e.: - * |--------------------- e --------------------| - * |--- g ---| |---- others ----| - */ - return EMPTY_LIST; // don't need to do anything - } else { - /** - * otherwise e overlaps some part of g - * - * figure out which region occurs first on the genome. I.e., is it: - * |------------- g ----------| - * |------------- e ----------| - * - * or: - * |------------- g ----------| - * |------------ e -----------| - * - */ - - GenomeLoc n; - if (e.getStart() < g.getStart()) { - n = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop()); - } else { - n = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1); - } - - // replace g with the new region - return Arrays.asList(n); - } - } - /** * a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping) diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 372efa350..f0eb5d399 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -586,6 +586,12 @@ public class Utils { return rcbases; } + static public final List reverse(final List l) { + final List newL = new ArrayList(l); + Collections.reverse(newL); + return newL; + } + /** * Reverse an int array of bases * diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 139cded37..f0e164c87 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.interval; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; @@ -8,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -221,6 +224,44 @@ public class IntervalUtils { return GenomeLocSortedSet.createSetFromList(parser,intervals); } + /** + * computes whether the test interval list is equivalent to master. To be equivalent, test must + * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm + * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but + * rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm + * doesn't assume that test has discontinuous genomelocs. + * + * Returns a null string if there are no differences, otherwise returns a string describing the difference + * (useful for UnitTests). Assumes both lists are sorted + */ + public static final String equateIntervals(List masterArg, List testArg) { + LinkedList master = new LinkedList(masterArg); + LinkedList test = new LinkedList(testArg); + + while ( ! master.isEmpty() ) { // there's still unchecked bases in master + final GenomeLoc masterHead = master.pop(); + final GenomeLoc testHead = test.pop(); + + if ( testHead.overlapsP(masterHead) ) { + // remove the parts of test that overlap master, and push the remaining + // parts onto master for further comparison. + for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) { + master.push(masterPart); + } + } else { + // testHead is incompatible with masterHead, so we must have extra bases in testHead + // that aren't in master + return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead; + } + } + + if ( test.isEmpty() ) // everything is equal + return null; // no differences + else + return "Remaining elements found in test: first=" + test.peek(); + } + + /** * Check if string argument was intented as a file * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. @@ -384,6 +425,92 @@ public class IntervalUtils { return splitIntervalsToSubLists(locs, splitPoints); } + @Requires({"locs != null", "numParts > 0"}) + @Ensures("result != null") + public static List> splitLocusIntervals(List locs, int numParts) { + // the ideal size of each split + final long bp = IntervalUtils.intervalSize(locs); + final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1); + + // algorithm: + // split = () + // set size = 0 + // pop the head H off locs. + // If size + size(H) < splitSize: + // add H to split, continue + // If size + size(H) == splitSize: + // done with split, put in splits, restart + // if size + size(H) > splitSize: + // cut H into two pieces, first of which has splitSize - size bp + // push both pieces onto locs, continue + // The last split is special -- when you have only one split left, it gets all of the remaining locs + // to deal with rounding issues + final List> splits = new ArrayList>(numParts); + + LinkedList locsLinkedList = new LinkedList(locs); + while ( ! locsLinkedList.isEmpty() ) { + if ( splits.size() + 1 == numParts ) { + // the last one gets all of the remaining parts + splits.add(new ArrayList(locsLinkedList)); + locsLinkedList.clear(); + } else { + final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize); + splits.add(one.split); + locsLinkedList = one.remaining; + } + } + + return splits; + } + + @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) + @Ensures({"result != null"}) + final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + final List split = new ArrayList(); + long size = 0; + + while ( ! remaining.isEmpty() ) { + GenomeLoc head = remaining.pop(); + final long newSize = size + head.size(); + + if ( newSize == idealSplitSize ) { + split.add(head); + break; // we are done + } else if ( newSize > idealSplitSize ) { + final long remainingBp = idealSplitSize - size; + final long cutPoint = head.getStart() + remainingBp; + GenomeLoc[] parts = head.split((int)cutPoint); + remaining.push(parts[1]); + remaining.push(parts[0]); + // when we go around, head.size' = idealSplitSize - size + // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize + } else { + split.add(head); + size = newSize; + } + } + + return new SplitLocusRecursive(split, remaining); + } + + private final static class SplitLocusRecursive { + final List split; + final LinkedList remaining; + + @Requires({"split != null", "remaining != null"}) + private SplitLocusRecursive(final List split, final LinkedList remaining) { + this.split = split; + this.remaining = remaining; + } + } + + public static List flattenSplitIntervals(List> splits) { + final List locs = new ArrayList(); + for ( final List split : splits ) + locs.addAll(split); + return locs; + } + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { if (numParts < 2) return; diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 067ec14a0..13b746dc9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -131,6 +132,56 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); } + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- + + private class IntervalRepartitionTest extends TestDataProvider { + final List originalIntervals; + final public int parts; + + private IntervalRepartitionTest(final String name, List originalIntervals, final int parts) { + super(IntervalRepartitionTest.class, name); + this.parts = parts; + this.originalIntervals = originalIntervals; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "IntervalRepartitionTest") + public Object[][] createIntervalRepartitionTest() { + for ( int parts : Arrays.asList(1, 10, 100, 1000, 10000) ) { + new IntervalRepartitionTest("hg19RefLocs", hg19ReferenceLocs, parts); + new IntervalRepartitionTest("hg19ExomeLocs", hg19exomeIntervals, parts); + } + + return IntervalSlicingTest.getTests(IntervalRepartitionTest.class); + } + + @Test(enabled = true, dataProvider = "IntervalRepartitionTest") + public void testIntervalRepartition(IntervalRepartitionTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); + Assert.assertEquals(test.parts, splitByLocus.size(), "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test sizes + final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long splitSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(originalSize, splitSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + // test that every base in original is covered once by a base in split by locus intervals + // todo implement test (complex) + } + + // + // Misc. tests + // + @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala index 50482033f..366d23488 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala @@ -24,8 +24,20 @@ package org.broadinstitute.sting.queue.extensions.gatk +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.queue.function.InProcessFunction + /** - * For now returns an IntervalScatterFunction. - * TODO: A scatter function that divides down to the locus level. + * A scatter function that divides down to the locus level. */ -class LocusScatterFunction extends IntervalScatterFunction {} +class LocusScatterFunction extends IntervalScatterFunction { +} +// +//class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { +// // todo -- max intervals is actually the original scatter count, not capped by interval size +// def run() { +// val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) +// val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) +// IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) +// } +//} \ No newline at end of file From 392e0aeace02813f86d7741ada9fbb3dad7ebfa9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Nov 2011 10:52:00 -0400 Subject: [PATCH 3/8] Moved unit tests into master IntervalUtilsUnitTest --- .../utils/interval/IntervalUtilsUnitTest.java | 107 +++++++++++++++--- 1 file changed, 93 insertions(+), 14 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 13b746dc9..9c3b905c2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -134,16 +134,17 @@ public class IntervalUtilsUnitTest extends BaseTest { // ------------------------------------------------------------------------------------- // - // tests to ensure the quality of the interval cuts of the interval cutting functions + // splitLocusIntervals tests // // ------------------------------------------------------------------------------------- - private class IntervalRepartitionTest extends TestDataProvider { + /** large scale tests for many intervals */ + private class SplitLocusIntervalsTest extends TestDataProvider { final List originalIntervals; final public int parts; - private IntervalRepartitionTest(final String name, List originalIntervals, final int parts) { - super(IntervalRepartitionTest.class, name); + private SplitLocusIntervalsTest(final String name, List originalIntervals, final int parts) { + super(SplitLocusIntervalsTest.class, name); this.parts = parts; this.originalIntervals = originalIntervals; } @@ -155,27 +156,105 @@ public class IntervalUtilsUnitTest extends BaseTest { @DataProvider(name = "IntervalRepartitionTest") public Object[][] createIntervalRepartitionTest() { - for ( int parts : Arrays.asList(1, 10, 100, 1000, 10000) ) { - new IntervalRepartitionTest("hg19RefLocs", hg19ReferenceLocs, parts); - new IntervalRepartitionTest("hg19ExomeLocs", hg19exomeIntervals, parts); + for ( int parts : Arrays.asList(1, 2, 3, 10, 13, 100, 151, 1000, 10000) ) { + //for ( int parts : Arrays.asList(10) ) { + new SplitLocusIntervalsTest("hg19RefLocs", hg19ReferenceLocs, parts); + new SplitLocusIntervalsTest("hg19ExomeLocs", hg19exomeIntervals, parts); } - return IntervalSlicingTest.getTests(IntervalRepartitionTest.class); + return SplitLocusIntervalsTest.getTests(SplitLocusIntervalsTest.class); } @Test(enabled = true, dataProvider = "IntervalRepartitionTest") - public void testIntervalRepartition(IntervalRepartitionTest test) { + public void testIntervalRepartition(SplitLocusIntervalsTest test) { List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); - Assert.assertEquals(test.parts, splitByLocus.size(), "SplitLocusIntervals failed to generate correct number of intervals"); + Assert.assertEquals(splitByLocus.size(), test.parts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test overall size + final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long flatSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(flatSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + // test size of each split + final long ideal = (long)Math.floor(originalSize / (1.0 * test.parts)); + final long maxSize = ideal + (originalSize % test.parts) * test.parts; // no more than N * rounding error in size + for ( final List split : splitByLocus ) { + final long splitSize = IntervalUtils.intervalSize(split); + Assert.assertTrue(splitSize >= ideal && splitSize <= maxSize, + String.format("SplitLocusIntervals interval (start=%s) has size %d outside of bounds ideal=%d, max=%d", + split.get(0), splitSize, ideal, maxSize)); + } + + // test that every base in original is covered once by a base in split by locus intervals + String diff = IntervalUtils.equateIntervals(test.originalIntervals, flat); + Assert.assertNull(diff, diff); + } + + /** small scale tests where the expected cuts are enumerated upfront for testing */ + private class SplitLocusIntervalsSmallTest extends TestDataProvider { + final List original; + final public int parts; + final public int expectedParts; + final List expected; + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected) { + this(name, originalIntervals, parts, expected, parts); + } + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected, int expectedParts) { + super(SplitLocusIntervalsSmallTest.class, name); + this.parts = parts; + this.expectedParts = expectedParts; + this.original = originalIntervals; + this.expected = expected; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "SplitLocusIntervalsSmallTest") + public Object[][] createSplitLocusIntervalsSmallTest() { + GenomeLoc bp01_10 = hg19GenomeLocParser.createGenomeLoc("1", 1, 10); + + GenomeLoc bp1_5 = hg19GenomeLocParser.createGenomeLoc("1", 1, 5); + GenomeLoc bp6_10 = hg19GenomeLocParser.createGenomeLoc("1", 6, 10); + new SplitLocusIntervalsSmallTest("cut into two", Arrays.asList(bp01_10), 2, Arrays.asList(bp1_5, bp6_10)); + + GenomeLoc bp20_30 = hg19GenomeLocParser.createGenomeLoc("1", 20, 30); + new SplitLocusIntervalsSmallTest("two in two", Arrays.asList(bp01_10, bp20_30), 2, Arrays.asList(bp01_10, bp20_30)); + + GenomeLoc bp1_7 = hg19GenomeLocParser.createGenomeLoc("1", 1, 7); + GenomeLoc bp8_10 = hg19GenomeLocParser.createGenomeLoc("1", 8, 10); + GenomeLoc bp20_23 = hg19GenomeLocParser.createGenomeLoc("1", 20, 23); + GenomeLoc bp24_30 = hg19GenomeLocParser.createGenomeLoc("1", 24, 30); + new SplitLocusIntervalsSmallTest("two in three", Arrays.asList(bp01_10, bp20_30), 3, + Arrays.asList(bp1_7, bp8_10, bp20_23, bp24_30)); + + GenomeLoc bp1_2 = hg19GenomeLocParser.createGenomeLoc("1", 1, 2); + GenomeLoc bp1_1 = hg19GenomeLocParser.createGenomeLoc("1", 1, 1); + GenomeLoc bp2_2 = hg19GenomeLocParser.createGenomeLoc("1", 2, 2); + new SplitLocusIntervalsSmallTest("too many pieces", Arrays.asList(bp1_2), 5, Arrays.asList(bp1_1, bp2_2), 2); + + new SplitLocusIntervalsSmallTest("emptyList", Collections.emptyList(), 5, Collections.emptyList(), 0); + + return SplitLocusIntervalsSmallTest.getTests(SplitLocusIntervalsSmallTest.class); + } + + @Test(enabled = true, dataProvider = "SplitLocusIntervalsSmallTest") + public void splitLocusIntervalsSmallTest(SplitLocusIntervalsSmallTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.original, test.parts); + Assert.assertEquals(splitByLocus.size(), test.expectedParts, "SplitLocusIntervals failed to generate correct number of intervals"); List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); // test sizes - final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long originalSize = IntervalUtils.intervalSize(test.original); final long splitSize = IntervalUtils.intervalSize(flat); - Assert.assertEquals(originalSize, splitSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + Assert.assertEquals(splitSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); - // test that every base in original is covered once by a base in split by locus intervals - // todo implement test (complex) + Assert.assertEquals(flat, test.expected, "SplitLocusIntervals locs not expected intervals"); } // From c1da8cd5e7376b70d3d2d8b8735f2140b999727c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Nov 2011 11:26:34 -0400 Subject: [PATCH 4/8] Final version of bp-resolved locus scatter/gather -- Minor refactoring to allow LocusScatterFunction to have maxIntervals be the original scatter count, rather than capping this by the interval count as Contig and Interval do --- .../gatk/ContigScatterFunction.scala | 2 ++ .../gatk/IntervalScatterFunction.scala | 2 ++ .../gatk/LocusScatterFunction.scala | 23 ++++++++++--------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index d2b3006a9..2609c3607 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -35,6 +35,8 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { // Include unmapped reads by default. this.includeUnmapped = true + override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount + protected override def maxIntervals = { GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index f65d5ab29..40a6fc4b4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -35,6 +35,8 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction protected override def maxIntervals = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size + override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount + def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala index 366d23488..8f52b9b82 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/LocusScatterFunction.scala @@ -24,20 +24,21 @@ package org.broadinstitute.sting.queue.extensions.gatk +import collection.JavaConversions._ import org.broadinstitute.sting.utils.interval.IntervalUtils import org.broadinstitute.sting.queue.function.InProcessFunction /** * A scatter function that divides down to the locus level. */ -class LocusScatterFunction extends IntervalScatterFunction { -} -// -//class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { -// // todo -- max intervals is actually the original scatter count, not capped by interval size -// def run() { -// val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) -// val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) -// IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) -// } -//} \ No newline at end of file +//class LocusScatterFunction extends IntervalScatterFunction { } + +class LocusScatterFunction extends GATKScatterFunction with InProcessFunction { + protected override def maxIntervals = scatterCount + + def run() { + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) + } +} \ No newline at end of file From bd977c2d92ddc9e5670e4431e72c320937cdcb12 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Nov 2011 16:20:42 -0400 Subject: [PATCH 5/8] Bug fix to avoid infinite loop in GATKScatterFunction --- .../sting/queue/extensions/gatk/GATKScatterFunction.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index c89e63668..24c2831b6 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -53,9 +53,6 @@ trait GATKScatterFunction extends ScatterFunction { /** Whether the last scatter job should also include any unmapped reads. */ protected var includeUnmapped: Boolean = _ - /** The total number of clone jobs that will be created. */ - override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount - override def init() { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] this.referenceSequence = this.originalGATK.reference_sequence From 4d35272916c4461d25b17288523f4fca7400985a Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 2 Nov 2011 16:29:10 -0400 Subject: [PATCH 7/8] Bug fixes with Mauricio to functions in ReadUtils used by reduced reads and the haplotype caller. --- .../sting/utils/sam/ReadUtils.java | 153 +++++++----------- 1 file changed, 55 insertions(+), 98 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f8e4927ed..da047a3a0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -726,38 +726,6 @@ public class ReadUtils { return true; } - /** - * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before - * the alignment start of the read. - * - * @param read - * @param refCoord - * @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before) - */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord < read.getAlignmentStart()"}) - private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(SAMRecord read, int refCoord) { - if (getRefCoordSoftUnclippedStart(read) <= refCoord) - return refCoord - getRefCoordSoftUnclippedStart(read) + 1; - return -1; - } - - - /** - * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region after - * the alignment end of the read. - * - * @param read - * @param refCoord - * @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before) - */ - @Requires({"refCoord <= read.getUnclippedEnd()", "refCoord > read.getAlignmentEnd()"}) - private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(SAMRecord read, int refCoord) { - if (getRefCoordSoftUnclippedEnd(read) >= refCoord) - return refCoord - getRefCoordSoftUnclippedStart(read) + 1; - return -1; - } - - public enum ClippingTail { LEFT_TAIL, RIGHT_TAIL @@ -802,96 +770,85 @@ public class ReadUtils { * @param refCoord * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Requires({"refCoord >= getRefCoordSoftUnclippedStart(read)", "refCoord <= getRefCoordSoftUnclippedEnd(read)"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; - if (refCoord < read.getAlignmentStart()) { - readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord); - if (readBases < 0) - throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. No equivalent read coordinate."); - } - else if (refCoord > read.getAlignmentEnd()) { - readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(read, refCoord); - if (readBases < 0) - throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. No equivalent read coordinate."); - } - else { - int goal = refCoord - read.getAlignmentStart(); // The goal is to move this many reference bases - boolean goalReached = refBases == goal; + int goal = refCoord - getRefCoordSoftUnclippedStart(read); // The goal is to move this many reference bases + boolean goalReached = refBases == goal; - Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); - while (!goalReached && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - int shift = 0; + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; - if (cigarElement.getOperator().consumesReferenceBases()) { - if (refBases + cigarElement.getLength() < goal) - shift = cigarElement.getLength(); - else - shift = goal - refBases; + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; - refBases += shift; - } - goalReached = refBases == goal; + refBases += shift; + } + goalReached = refBases == goal; - if (!goalReached && cigarElement.getOperator().consumesReadBases()) - readBases += cigarElement.getLength(); + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); - if (goalReached) { - // Is this base's reference position within this cigar element? Or did we use it all? - boolean endsWithinCigar = shift < cigarElement.getLength(); + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + boolean endsWithinCigar = shift < cigarElement.getLength(); - // If it isn't, we need to check the next one. There should *ALWAYS* be a next one - // since we checked if the goal coordinate is within the read length, so this is just a sanity check. - if (!endsWithinCigar && !cigarElementIterator.hasNext()) - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. + if (!endsWithinCigar && !cigarElementIterator.hasNext()) + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); - CigarElement nextCigarElement; + CigarElement nextCigarElement; - // if we end inside the current cigar element, we just have to check if it is a deletion - if (endsWithinCigar) - fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + // if we end inside the current cigar element, we just have to check if it is a deletion + if (endsWithinCigar) + fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. - else { nextCigarElement = cigarElementIterator.next(); - - // if it's an insertion, we need to clip the whole insertion before looking at the next element - if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { - readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); - - nextCigarElement = cigarElementIterator.next(); - } - - // if it's a deletion, we will pass the information on to be handled downstream. - fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; } - // If we reached our goal outside a deletion, add the shift - if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) - readBases += shift; + // if it's a deletion, we will pass the information on to be handled downstream. + fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; + } - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar) - readBases += shift - 1; + // If we reached our goal outside a deletion, add the shift + if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion - else if (fallsInsideDeletion && endsWithinCigar) - readBases--; - } + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) + else if (fallsInsideDeletion && !endsWithinCigar) + readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; + } } if (!goalReached) throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - } + return new Pair(readBases, fallsInsideDeletion); } From e4a583a53f086ebf8c086909066b686ea02def2d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 2 Nov 2011 17:53:10 -0400 Subject: [PATCH 8/8] Fixing docs: No -I in this walker --- .../sting/gatk/walkers/coverage/GCContentByIntervalWalker.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java index 5c2a967b9..f01e8bb3c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java @@ -56,7 +56,6 @@ import java.util.List; * -R ref.fasta \ * -T GCContentByInterval \ * -o output.txt \ - * -I input.bam \ * -L input.intervals * *