Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2011-11-02 21:13:54 -04:00
commit e8bceb1eaa
15 changed files with 487 additions and 178 deletions

View File

@ -17,7 +17,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
*/
@By(DataSource.READS)
@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.INTERVAL)
@PartitionBy(PartitionType.LOCUS)
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?

View File

@ -34,6 +34,12 @@ public enum PartitionType {
*/
NONE,
/**
* The walker inputs can be chunked down to individual
* reads.
*/
READ,
/**
* The walker inputs can be chunked down to the
* per-locus level.

View File

@ -12,7 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.CONTIG)
@PartitionBy(PartitionType.READ)
public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
public boolean requiresOrderedReads() { return false; }

View File

@ -113,6 +113,7 @@ import java.util.*;
// todo -- allow for user to set linear binning (default is logarithmic)
// todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now
@By(DataSource.REFERENCE)
@PartitionBy(PartitionType.INTERVAL)
public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partition,Map<String,int[]>>, CoveragePartitioner> implements TreeReducible<CoveragePartitioner> {
@Output
@Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"})

View File

@ -5,6 +5,10 @@ import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -174,6 +178,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) };
}
public GenomeLoc union( GenomeLoc that ) { return merge(that); }
@Requires("that != null")
@Ensures("result != null")
public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
@ -192,6 +198,79 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
Math.min( getStop(), that.getStop()) );
}
@Requires("that != null")
public final List<GenomeLoc> subtract( final GenomeLoc that ) {
if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
return Arrays.asList(UNMAPPED);
}
if (!(this.overlapsP(that))) {
throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap");
}
if (equals(that)) {
return Collections.emptyList();
} else if (containsP(that)) {
List<GenomeLoc> l = new ArrayList<GenomeLoc>(2);
/**
* we have to create two new region, one for the before part, one for the after
* The old region:
* |----------------- old region (g) -------------|
* |----- to delete (e) ------|
*
* product (two new regions):
* |------| + |--------|
*
*/
int afterStop = this.getStop(), afterStart = that.getStop() + 1;
int beforeStop = that.getStart() - 1, beforeStart = this.getStart();
if (afterStop - afterStart >= 0) {
GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop);
l.add(after);
}
if (beforeStop - beforeStart >= 0) {
GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop);
l.add(before);
}
return l;
} else if (that.containsP(this)) {
/**
* e completely contains g, delete g, but keep looking, there may be more regions
* i.e.:
* |--------------------- e --------------------|
* |--- g ---| |---- others ----|
*/
return Collections.emptyList(); // don't need to do anything
} else {
/**
* otherwise e overlaps some part of g
*
* figure out which region occurs first on the genome. I.e., is it:
* |------------- g ----------|
* |------------- e ----------|
*
* or:
* |------------- g ----------|
* |------------ e -----------|
*
*/
GenomeLoc n;
if (that.getStart() < this.getStart()) {
n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop());
} else {
n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1);
}
// replace g with the new region
return Arrays.asList(n);
}
}
@Requires("that != null")
public final boolean containsP(GenomeLoc that) {
return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop();
@ -203,19 +282,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
}
@Requires("that != null")
public final int minus( final GenomeLoc that ) {
@Ensures("result >= 0")
public final int distance( final GenomeLoc that ) {
if ( this.onSameContig(that) )
return this.getStart() - that.getStart();
return Math.abs(this.getStart() - that.getStart());
else
return Integer.MAX_VALUE;
}
@Requires("that != null")
@Ensures("result >= 0")
public final int distance( final GenomeLoc that ) {
return Math.abs(minus(that));
}
@Requires({"left != null", "right != null"})
public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) {
return this.compareTo(left) > -1 && this.compareTo(right) < 1;

View File

@ -215,7 +215,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
if ( p.overlapsP(e) ) {
toProcess.pop();
for ( GenomeLoc newP : subtractRegion(p, e) )
for ( GenomeLoc newP : p.subtract(e) )
toProcess.push(newP);
} else if ( p.compareContigs(e) < 0 ) {
good.add(toProcess.pop()); // p is now good
@ -236,69 +236,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return createSetFromList(genomeLocParser,good);
}
private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
if (g.equals(e)) {
return EMPTY_LIST;
} else if (g.containsP(e)) {
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
/**
* we have to create two new region, one for the before part, one for the after
* The old region:
* |----------------- old region (g) -------------|
* |----- to delete (e) ------|
*
* product (two new regions):
* |------| + |--------|
*
*/
int afterStop = g.getStop(), afterStart = e.getStop() + 1;
int beforeStop = e.getStart() - 1, beforeStart = g.getStart();
if (afterStop - afterStart >= 0) {
GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop);
l.add(after);
}
if (beforeStop - beforeStart >= 0) {
GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop);
l.add(before);
}
return l;
} else if (e.containsP(g)) {
/**
* e completely contains g, delete g, but keep looking, there may be more regions
* i.e.:
* |--------------------- e --------------------|
* |--- g ---| |---- others ----|
*/
return EMPTY_LIST; // don't need to do anything
} else {
/**
* otherwise e overlaps some part of g
*
* figure out which region occurs first on the genome. I.e., is it:
* |------------- g ----------|
* |------------- e ----------|
*
* or:
* |------------- g ----------|
* |------------ e -----------|
*
*/
GenomeLoc n;
if (e.getStart() < g.getStart()) {
n = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop());
} else {
n = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1);
}
// replace g with the new region
return Arrays.asList(n);
}
}
/**
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)

View File

@ -586,6 +586,12 @@ public class Utils {
return rcbases;
}
static public final <T> List<T> reverse(final List<T> l) {
final List<T> newL = new ArrayList<T>(l);
Collections.reverse(newL);
return newL;
}
/**
* Reverse an int array of bases
*

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.utils.interval;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.Interval;
import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMFileHeader;
@ -8,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -221,6 +224,44 @@ public class IntervalUtils {
return GenomeLocSortedSet.createSetFromList(parser,intervals);
}
/**
* computes whether the test interval list is equivalent to master. To be equivalent, test must
* contain GenomeLocs covering every base in master, exactly once. Note that this algorithm
* assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but
* rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm
* doesn't assume that test has discontinuous genomelocs.
*
* Returns a null string if there are no differences, otherwise returns a string describing the difference
* (useful for UnitTests). Assumes both lists are sorted
*/
public static final String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
while ( ! master.isEmpty() ) { // there's still unchecked bases in master
final GenomeLoc masterHead = master.pop();
final GenomeLoc testHead = test.pop();
if ( testHead.overlapsP(masterHead) ) {
// remove the parts of test that overlap master, and push the remaining
// parts onto master for further comparison.
for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) {
master.push(masterPart);
}
} else {
// testHead is incompatible with masterHead, so we must have extra bases in testHead
// that aren't in master
return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead;
}
}
if ( test.isEmpty() ) // everything is equal
return null; // no differences
else
return "Remaining elements found in test: first=" + test.peek();
}
/**
* Check if string argument was intented as a file
* Accepted file extensions: .bed .list, .picard, .interval_list, .intervals.
@ -384,6 +425,92 @@ public class IntervalUtils {
return splitIntervalsToSubLists(locs, splitPoints);
}
@Requires({"locs != null", "numParts > 0"})
@Ensures("result != null")
public static List<List<GenomeLoc>> splitLocusIntervals(List<GenomeLoc> locs, int numParts) {
// the ideal size of each split
final long bp = IntervalUtils.intervalSize(locs);
final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1);
// algorithm:
// split = ()
// set size = 0
// pop the head H off locs.
// If size + size(H) < splitSize:
// add H to split, continue
// If size + size(H) == splitSize:
// done with split, put in splits, restart
// if size + size(H) > splitSize:
// cut H into two pieces, first of which has splitSize - size bp
// push both pieces onto locs, continue
// The last split is special -- when you have only one split left, it gets all of the remaining locs
// to deal with rounding issues
final List<List<GenomeLoc>> splits = new ArrayList<List<GenomeLoc>>(numParts);
LinkedList<GenomeLoc> locsLinkedList = new LinkedList<GenomeLoc>(locs);
while ( ! locsLinkedList.isEmpty() ) {
if ( splits.size() + 1 == numParts ) {
// the last one gets all of the remaining parts
splits.add(new ArrayList<GenomeLoc>(locsLinkedList));
locsLinkedList.clear();
} else {
final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize);
splits.add(one.split);
locsLinkedList = one.remaining;
}
}
return splits;
}
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
@Ensures({"result != null"})
final static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
long size = 0;
while ( ! remaining.isEmpty() ) {
GenomeLoc head = remaining.pop();
final long newSize = size + head.size();
if ( newSize == idealSplitSize ) {
split.add(head);
break; // we are done
} else if ( newSize > idealSplitSize ) {
final long remainingBp = idealSplitSize - size;
final long cutPoint = head.getStart() + remainingBp;
GenomeLoc[] parts = head.split((int)cutPoint);
remaining.push(parts[1]);
remaining.push(parts[0]);
// when we go around, head.size' = idealSplitSize - size
// so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize
} else {
split.add(head);
size = newSize;
}
}
return new SplitLocusRecursive(split, remaining);
}
private final static class SplitLocusRecursive {
final List<GenomeLoc> split;
final LinkedList<GenomeLoc> remaining;
@Requires({"split != null", "remaining != null"})
private SplitLocusRecursive(final List<GenomeLoc> split, final LinkedList<GenomeLoc> remaining) {
this.split = split;
this.remaining = remaining;
}
}
public static List<GenomeLoc> flattenSplitIntervals(List<List<GenomeLoc>> splits) {
final List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
for ( final List<GenomeLoc> split : splits )
locs.addAll(split);
return locs;
}
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
if (numParts < 2)
return;

View File

@ -724,38 +724,6 @@ public class ReadUtils {
return true;
}
/**
* Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before
* the alignment start of the read.
*
* @param read
* @param refCoord
* @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before)
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord < read.getAlignmentStart()"})
private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(SAMRecord read, int refCoord) {
if (getRefCoordSoftUnclippedStart(read) <= refCoord)
return refCoord - getRefCoordSoftUnclippedStart(read) + 1;
return -1;
}
/**
* Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region after
* the alignment end of the read.
*
* @param read
* @param refCoord
* @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before)
*/
@Requires({"refCoord <= read.getUnclippedEnd()", "refCoord > read.getAlignmentEnd()"})
private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(SAMRecord read, int refCoord) {
if (getRefCoordSoftUnclippedEnd(read) >= refCoord)
return refCoord - getRefCoordSoftUnclippedStart(read) + 1;
return -1;
}
public enum ClippingTail {
LEFT_TAIL,
RIGHT_TAIL
@ -800,96 +768,85 @@ public class ReadUtils {
* @param refCoord
* @return the read coordinate corresponding to the requested reference coordinate. (see warning!)
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Requires({"refCoord >= getRefCoordSoftUnclippedStart(read)", "refCoord <= getRefCoordSoftUnclippedEnd(read)"})
@Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"})
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
int readBases = 0;
int refBases = 0;
boolean fallsInsideDeletion = false;
if (refCoord < read.getAlignmentStart()) {
readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord);
if (readBases < 0)
throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. No equivalent read coordinate.");
}
else if (refCoord > read.getAlignmentEnd()) {
readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(read, refCoord);
if (readBases < 0)
throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. No equivalent read coordinate.");
}
else {
int goal = refCoord - read.getAlignmentStart(); // The goal is to move this many reference bases
boolean goalReached = refBases == goal;
int goal = refCoord - getRefCoordSoftUnclippedStart(read); // The goal is to move this many reference bases
boolean goalReached = refBases == goal;
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
while (!goalReached && cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
int shift = 0;
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
while (!goalReached && cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
int shift = 0;
if (cigarElement.getOperator().consumesReferenceBases()) {
if (refBases + cigarElement.getLength() < goal)
shift = cigarElement.getLength();
else
shift = goal - refBases;
if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
if (refBases + cigarElement.getLength() < goal)
shift = cigarElement.getLength();
else
shift = goal - refBases;
refBases += shift;
}
goalReached = refBases == goal;
refBases += shift;
}
goalReached = refBases == goal;
if (!goalReached && cigarElement.getOperator().consumesReadBases())
readBases += cigarElement.getLength();
if (!goalReached && cigarElement.getOperator().consumesReadBases())
readBases += cigarElement.getLength();
if (goalReached) {
// Is this base's reference position within this cigar element? Or did we use it all?
boolean endsWithinCigar = shift < cigarElement.getLength();
if (goalReached) {
// Is this base's reference position within this cigar element? Or did we use it all?
boolean endsWithinCigar = shift < cigarElement.getLength();
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
// since we checked if the goal coordinate is within the read length, so this is just a sanity check.
if (!endsWithinCigar && !cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
// since we checked if the goal coordinate is within the read length, so this is just a sanity check.
if (!endsWithinCigar && !cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
CigarElement nextCigarElement;
CigarElement nextCigarElement;
// if we end inside the current cigar element, we just have to check if it is a deletion
if (endsWithinCigar)
fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;
// if we end inside the current cigar element, we just have to check if it is a deletion
if (endsWithinCigar)
fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;
// if we end outside the current cigar element, we need to check if the next element is an insertion or deletion.
else {
nextCigarElement = cigarElementIterator.next();
// if it's an insertion, we need to clip the whole insertion before looking at the next element
if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
readBases += nextCigarElement.getLength();
if (!cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
// if we end outside the current cigar element, we need to check if the next element is an insertion or deletion.
else {
nextCigarElement = cigarElementIterator.next();
// if it's an insertion, we need to clip the whole insertion before looking at the next element
if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
readBases += nextCigarElement.getLength();
if (!cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
nextCigarElement = cigarElementIterator.next();
}
// if it's a deletion, we will pass the information on to be handled downstream.
fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
}
// If we reached our goal outside a deletion, add the shift
if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
readBases += shift;
// if it's a deletion, we will pass the information on to be handled downstream.
fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
}
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to it's last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar)
readBases += shift - 1;
// If we reached our goal outside a deletion, add the shift
if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
readBases += shift;
// If we reached our goal inside a deletion then we must backtrack to the last base before the deletion
else if (fallsInsideDeletion && endsWithinCigar)
readBases--;
}
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to it's last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar)
readBases += shift - 1;
// If we reached our goal inside a deletion then we must backtrack to the last base before the deletion
else if (fallsInsideDeletion && endsWithinCigar)
readBases--;
}
}
if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
}
return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.util.IntervalUtil;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
@ -131,6 +132,135 @@ public class IntervalUtilsUnitTest extends BaseTest {
Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
}
// -------------------------------------------------------------------------------------
//
// splitLocusIntervals tests
//
// -------------------------------------------------------------------------------------
/** large scale tests for many intervals */
private class SplitLocusIntervalsTest extends TestDataProvider {
final List<GenomeLoc> originalIntervals;
final public int parts;
private SplitLocusIntervalsTest(final String name, List<GenomeLoc> originalIntervals, final int parts) {
super(SplitLocusIntervalsTest.class, name);
this.parts = parts;
this.originalIntervals = originalIntervals;
}
public String toString() {
return String.format("%s parts=%d", super.toString(), parts);
}
}
@DataProvider(name = "IntervalRepartitionTest")
public Object[][] createIntervalRepartitionTest() {
for ( int parts : Arrays.asList(1, 2, 3, 10, 13, 100, 151, 1000, 10000) ) {
//for ( int parts : Arrays.asList(10) ) {
new SplitLocusIntervalsTest("hg19RefLocs", hg19ReferenceLocs, parts);
new SplitLocusIntervalsTest("hg19ExomeLocs", hg19exomeIntervals, parts);
}
return SplitLocusIntervalsTest.getTests(SplitLocusIntervalsTest.class);
}
@Test(enabled = true, dataProvider = "IntervalRepartitionTest")
public void testIntervalRepartition(SplitLocusIntervalsTest test) {
List<List<GenomeLoc>> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts);
Assert.assertEquals(splitByLocus.size(), test.parts, "SplitLocusIntervals failed to generate correct number of intervals");
List<GenomeLoc> flat = IntervalUtils.flattenSplitIntervals(splitByLocus);
// test overall size
final long originalSize = IntervalUtils.intervalSize(test.originalIntervals);
final long flatSize = IntervalUtils.intervalSize(flat);
Assert.assertEquals(flatSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases");
// test size of each split
final long ideal = (long)Math.floor(originalSize / (1.0 * test.parts));
final long maxSize = ideal + (originalSize % test.parts) * test.parts; // no more than N * rounding error in size
for ( final List<GenomeLoc> split : splitByLocus ) {
final long splitSize = IntervalUtils.intervalSize(split);
Assert.assertTrue(splitSize >= ideal && splitSize <= maxSize,
String.format("SplitLocusIntervals interval (start=%s) has size %d outside of bounds ideal=%d, max=%d",
split.get(0), splitSize, ideal, maxSize));
}
// test that every base in original is covered once by a base in split by locus intervals
String diff = IntervalUtils.equateIntervals(test.originalIntervals, flat);
Assert.assertNull(diff, diff);
}
/** small scale tests where the expected cuts are enumerated upfront for testing */
private class SplitLocusIntervalsSmallTest extends TestDataProvider {
final List<GenomeLoc> original;
final public int parts;
final public int expectedParts;
final List<GenomeLoc> expected;
private SplitLocusIntervalsSmallTest(final String name, List<GenomeLoc> originalIntervals, final int parts, List<GenomeLoc> expected) {
this(name, originalIntervals, parts, expected, parts);
}
private SplitLocusIntervalsSmallTest(final String name, List<GenomeLoc> originalIntervals, final int parts, List<GenomeLoc> expected, int expectedParts) {
super(SplitLocusIntervalsSmallTest.class, name);
this.parts = parts;
this.expectedParts = expectedParts;
this.original = originalIntervals;
this.expected = expected;
}
public String toString() {
return String.format("%s parts=%d", super.toString(), parts);
}
}
@DataProvider(name = "SplitLocusIntervalsSmallTest")
public Object[][] createSplitLocusIntervalsSmallTest() {
GenomeLoc bp01_10 = hg19GenomeLocParser.createGenomeLoc("1", 1, 10);
GenomeLoc bp1_5 = hg19GenomeLocParser.createGenomeLoc("1", 1, 5);
GenomeLoc bp6_10 = hg19GenomeLocParser.createGenomeLoc("1", 6, 10);
new SplitLocusIntervalsSmallTest("cut into two", Arrays.asList(bp01_10), 2, Arrays.asList(bp1_5, bp6_10));
GenomeLoc bp20_30 = hg19GenomeLocParser.createGenomeLoc("1", 20, 30);
new SplitLocusIntervalsSmallTest("two in two", Arrays.asList(bp01_10, bp20_30), 2, Arrays.asList(bp01_10, bp20_30));
GenomeLoc bp1_7 = hg19GenomeLocParser.createGenomeLoc("1", 1, 7);
GenomeLoc bp8_10 = hg19GenomeLocParser.createGenomeLoc("1", 8, 10);
GenomeLoc bp20_23 = hg19GenomeLocParser.createGenomeLoc("1", 20, 23);
GenomeLoc bp24_30 = hg19GenomeLocParser.createGenomeLoc("1", 24, 30);
new SplitLocusIntervalsSmallTest("two in three", Arrays.asList(bp01_10, bp20_30), 3,
Arrays.asList(bp1_7, bp8_10, bp20_23, bp24_30));
GenomeLoc bp1_2 = hg19GenomeLocParser.createGenomeLoc("1", 1, 2);
GenomeLoc bp1_1 = hg19GenomeLocParser.createGenomeLoc("1", 1, 1);
GenomeLoc bp2_2 = hg19GenomeLocParser.createGenomeLoc("1", 2, 2);
new SplitLocusIntervalsSmallTest("too many pieces", Arrays.asList(bp1_2), 5, Arrays.asList(bp1_1, bp2_2), 2);
new SplitLocusIntervalsSmallTest("emptyList", Collections.<GenomeLoc>emptyList(), 5, Collections.<GenomeLoc>emptyList(), 0);
return SplitLocusIntervalsSmallTest.getTests(SplitLocusIntervalsSmallTest.class);
}
@Test(enabled = true, dataProvider = "SplitLocusIntervalsSmallTest")
public void splitLocusIntervalsSmallTest(SplitLocusIntervalsSmallTest test) {
List<List<GenomeLoc>> splitByLocus = IntervalUtils.splitLocusIntervals(test.original, test.parts);
Assert.assertEquals(splitByLocus.size(), test.expectedParts, "SplitLocusIntervals failed to generate correct number of intervals");
List<GenomeLoc> flat = IntervalUtils.flattenSplitIntervals(splitByLocus);
// test sizes
final long originalSize = IntervalUtils.intervalSize(test.original);
final long splitSize = IntervalUtils.intervalSize(flat);
Assert.assertEquals(splitSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases");
Assert.assertEquals(flat, test.expected, "SplitLocusIntervals locs not expected intervals");
}
//
// Misc. tests
//
@Test(expectedExceptions=UserException.class)
public void testMergeListsBySetOperatorNoOverlap() {
// a couple of lists we'll use for the testing

View File

@ -35,6 +35,8 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction {
// Include unmapped reads by default.
this.includeUnmapped = true
override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount
protected override def maxIntervals = {
GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).contigs.size
}
@ -44,3 +46,4 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction {
IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles)
}
}

View File

@ -53,9 +53,6 @@ trait GATKScatterFunction extends ScatterFunction {
/** Whether the last scatter job should also include any unmapped reads. */
protected var includeUnmapped: Boolean = _
/** The total number of clone jobs that will be created. */
override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount
override def init() {
this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]
this.referenceSequence = this.originalGATK.reference_sequence

View File

@ -35,6 +35,8 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction
protected override def maxIntervals =
GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size
override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount
def run() {
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size)

View File

@ -24,8 +24,21 @@
package org.broadinstitute.sting.queue.extensions.gatk
import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
import org.broadinstitute.sting.queue.function.InProcessFunction
/**
* For now returns an IntervalScatterFunction.
* TODO: A scatter function that divides down to the locus level.
* A scatter function that divides down to the locus level.
*/
class LocusScatterFunction extends IntervalScatterFunction {}
//class LocusScatterFunction extends IntervalScatterFunction { }
class LocusScatterFunction extends GATKScatterFunction with InProcessFunction {
protected override def maxIntervals = scatterCount
def run() {
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
val splits = IntervalUtils.splitLocusIntervals(gi.locs, this.scatterOutputFiles.size)
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
}
}

View File

@ -0,0 +1,56 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.extensions.gatk
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Currently ReadScatterFunction only does ContigScattering, but it
* could in principle do something more aggressive.
*/
class ReadScatterFunction extends ContigScatterFunction { }