Final intervals cleanup
-- No functional changes (my algorithm wouldn't work) -- Major structural cleanup (returning more basic data structures that allow us to develop a new algorithm) -- Unit tests for the efficiency of interval partitioning
This commit is contained in:
parent
6ea57bf036
commit
4ad330008d
|
|
@ -852,8 +852,8 @@
|
||||||
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
|
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
|
||||||
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
|
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
|
||||||
<jvmarg line="${cofoja.jvm.args}"/>
|
<jvmarg line="${cofoja.jvm.args}"/>
|
||||||
<jvmarg value="-Xdebug"/>
|
<!-- <jvmarg value="-Xdebug"/> -->
|
||||||
<jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/>
|
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
|
||||||
<classpath>
|
<classpath>
|
||||||
<path refid="external.dependencies" />
|
<path refid="external.dependencies" />
|
||||||
<pathelement location="${java.classes}" />
|
<pathelement location="${java.classes}" />
|
||||||
|
|
|
||||||
|
|
@ -333,6 +333,28 @@ public class IntervalUtils {
|
||||||
throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size()));
|
throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Splits an interval list into multiple sublists.
|
||||||
|
* @param locs The genome locs to split.
|
||||||
|
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
|
||||||
|
* @return A list of lists of genome locs, split according to splits
|
||||||
|
*/
|
||||||
|
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||||
|
int locIndex = 1;
|
||||||
|
int start = 0;
|
||||||
|
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||||
|
for (Integer stop: splits) {
|
||||||
|
List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
|
||||||
|
for (int i = start; i < stop; i++)
|
||||||
|
curList.add(locs.get(i));
|
||||||
|
start = stop;
|
||||||
|
sublists.add(curList);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sublists;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits an interval list into multiple files.
|
* Splits an interval list into multiple files.
|
||||||
* @param fileHeader The sam file header.
|
* @param fileHeader The sam file header.
|
||||||
|
|
@ -362,27 +384,39 @@ public class IntervalUtils {
|
||||||
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
||||||
if (locs.size() < numParts)
|
if (locs.size() < numParts)
|
||||||
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
||||||
|
|
||||||
final long locsSize = intervalSize(locs);
|
final long locsSize = intervalSize(locs);
|
||||||
final double idealSplitSize = locsSize / numParts;
|
final List<Integer> splitPoints = new ArrayList<Integer>();
|
||||||
final List<List<GenomeLoc>> splits = new ArrayList<List<GenomeLoc>>(numParts);
|
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
|
||||||
final LinkedList<GenomeLoc> remainingLocs = new LinkedList<GenomeLoc>(locs);
|
Collections.sort(splitPoints);
|
||||||
|
splitPoints.add(locs.size());
|
||||||
|
return splitIntervalsToSubLists(locs, splitPoints);
|
||||||
|
}
|
||||||
|
|
||||||
for ( int i = 0; i < numParts; i++ ) {
|
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
||||||
long splitSize = 0;
|
if (numParts < 2)
|
||||||
List<GenomeLoc> split = new ArrayList<GenomeLoc>();
|
return;
|
||||||
while ( ! remainingLocs.isEmpty() ) {
|
int halfParts = (numParts + 1) / 2;
|
||||||
final GenomeLoc toAdd = remainingLocs.pop();
|
Pair<Integer, Long> splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts);
|
||||||
splitSize += toAdd.size();
|
int splitIndex = splitPoint.first;
|
||||||
split.add(toAdd);
|
long splitSize = splitPoint.second;
|
||||||
final long nextEltSize = remainingLocs.isEmpty() ? 0 : remainingLocs.peek().size();
|
splitPoints.add(splitIndex);
|
||||||
if ( splitSize + (i % 2 == 0 ? 0 : nextEltSize) > idealSplitSize )
|
addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts);
|
||||||
break;
|
addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts);
|
||||||
}
|
}
|
||||||
splits.add(split);
|
|
||||||
|
private static Pair<Integer, Long> getFixedSplit(List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) {
|
||||||
|
int splitIndex = startIndex;
|
||||||
|
long splitSize = 0;
|
||||||
|
for (int i = 0; i < minLocs; i++) {
|
||||||
|
splitSize += locs.get(splitIndex).size();
|
||||||
|
splitIndex++;
|
||||||
}
|
}
|
||||||
|
long halfSize = locsSize / 2;
|
||||||
return splits;
|
while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) {
|
||||||
|
splitSize += locs.get(splitIndex).size();
|
||||||
|
splitIndex++;
|
||||||
|
}
|
||||||
|
return new Pair<Integer, Long>(splitIndex, splitSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
package org.broadinstitute.sting.utils.interval;
|
package org.broadinstitute.sting.utils.interval;
|
||||||
|
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
import net.sf.picard.util.IntervalUtil;
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||||
|
|
@ -101,25 +100,18 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
@DataProvider(name = "intervalslicingdata")
|
@DataProvider(name = "intervalslicingdata")
|
||||||
public Object[][] createTrees() {
|
public Object[][] createTrees() {
|
||||||
new IntervalSlicingTest(1, 0);
|
new IntervalSlicingTest(1, 0);
|
||||||
new IntervalSlicingTest(2, 0.1);
|
new IntervalSlicingTest(2, 1);
|
||||||
new IntervalSlicingTest(3, 0.1);
|
new IntervalSlicingTest(5, 1);
|
||||||
new IntervalSlicingTest(7, 0.1);
|
new IntervalSlicingTest(10, 1);
|
||||||
new IntervalSlicingTest(10, 0.1);
|
new IntervalSlicingTest(67, 1);
|
||||||
new IntervalSlicingTest(31, 0.1);
|
new IntervalSlicingTest(100, 1);
|
||||||
new IntervalSlicingTest(67, 0.1);
|
new IntervalSlicingTest(500, 1);
|
||||||
new IntervalSlicingTest(100, 0.1);
|
|
||||||
new IntervalSlicingTest(127, 0.1);
|
|
||||||
// starts to become a bit less efficient with larger cuts
|
|
||||||
new IntervalSlicingTest(500, 0.5);
|
|
||||||
new IntervalSlicingTest(1000, 1);
|
new IntervalSlicingTest(1000, 1);
|
||||||
new IntervalSlicingTest(10000, 10);
|
|
||||||
return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
|
return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(dataProvider = "intervalslicingdata")
|
@Test(enabled = true, dataProvider = "intervalslicingdata")
|
||||||
public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
|
public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
|
||||||
Set<GenomeLoc> locsSet = new HashSet<GenomeLoc>(hg19exomeIntervals);
|
|
||||||
Set<GenomeLoc> notFoundSet = new HashSet<GenomeLoc>(hg19exomeIntervals);
|
|
||||||
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
|
||||||
|
|
||||||
long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
|
long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
|
||||||
|
|
@ -134,15 +126,9 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
counter++;
|
counter++;
|
||||||
sumOfSplitSizes += splitSize;
|
sumOfSplitSizes += splitSize;
|
||||||
Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
|
Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
|
||||||
|
|
||||||
for ( final GenomeLoc loc : split ) {
|
|
||||||
Assert.assertTrue(locsSet.contains(loc), "Split location " + loc + " not found in set of input locs");
|
|
||||||
notFoundSet.remove(loc);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Assert.assertEquals(sumOfSplitSizes, totalSize, "Split intervals don't contain the exact number of bases in the original intervals");
|
Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
|
||||||
Assert.assertTrue(notFoundSet.isEmpty(), "Not all intervals were present in the split set");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expectedExceptions=UserException.class)
|
@Test(expectedExceptions=UserException.class)
|
||||||
|
|
@ -246,7 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("basic.", 3, ".intervals");
|
List<File> files = testFiles("basic.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
|
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -271,20 +258,21 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("less.", 3, ".intervals");
|
List<File> files = testFiles("less.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
|
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
|
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false);
|
||||||
|
|
||||||
Assert.assertEquals(locs1.size(), 2);
|
Assert.assertEquals(locs1.size(), 1);
|
||||||
Assert.assertEquals(locs2.size(), 1);
|
Assert.assertEquals(locs2.size(), 1);
|
||||||
Assert.assertEquals(locs3.size(), 1);
|
Assert.assertEquals(locs3.size(), 2);
|
||||||
|
|
||||||
Assert.assertEquals(locs1.get(0), chr1);
|
Assert.assertEquals(locs1.get(0), chr1);
|
||||||
Assert.assertEquals(locs1.get(1), chr2);
|
Assert.assertEquals(locs2.get(0), chr2);
|
||||||
Assert.assertEquals(locs2.get(0), chr3);
|
Assert.assertEquals(locs3.get(0), chr3);
|
||||||
Assert.assertEquals(locs3.get(0), chr4);
|
Assert.assertEquals(locs3.get(1), chr4);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expectedExceptions=UserException.BadArgumentValue.class)
|
@Test(expectedExceptions=UserException.BadArgumentValue.class)
|
||||||
|
|
@ -298,7 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
public void testScatterFixedIntervalsMoreFiles() {
|
public void testScatterFixedIntervalsMoreFiles() {
|
||||||
List<File> files = testFiles("more.", 3, ".intervals");
|
List<File> files = testFiles("more.", 3, ".intervals");
|
||||||
List<GenomeLoc> locs = getLocs("chr1", "chr2");
|
List<GenomeLoc> locs = getLocs("chr1", "chr2");
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, locs.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
}
|
}
|
||||||
@Test
|
@Test
|
||||||
public void testScatterFixedIntervalsStart() {
|
public void testScatterFixedIntervalsStart() {
|
||||||
|
|
@ -311,7 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("split.", 3, ".intervals");
|
List<File> files = testFiles("split.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs(intervals);
|
List<GenomeLoc> locs = getLocs(intervals);
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -338,7 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("split.", 3, ".intervals");
|
List<File> files = testFiles("split.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs(intervals);
|
List<GenomeLoc> locs = getLocs(intervals);
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -365,7 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("split.", 3, ".intervals");
|
List<File> files = testFiles("split.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs(intervals);
|
List<GenomeLoc> locs = getLocs(intervals);
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, IntervalUtils.splitFixedIntervals(locs, files.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -399,7 +391,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
|
|
||||||
//String splitCounts = "";
|
//String splitCounts = "";
|
||||||
for (int i = 0; i < splits.size(); i++) {
|
for (int i = 0; i < splits.size(); i++) {
|
||||||
long splitCount = splits.get(i).size();
|
int splitCount = splits.get(i).size();
|
||||||
Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
|
Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
|
||||||
}
|
}
|
||||||
//System.out.println(splitCounts.substring(2));
|
//System.out.println(splitCounts.substring(2));
|
||||||
|
|
@ -420,7 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testScatterFixedIntervalsMax() {
|
public void testScatterFixedIntervalsMax() {
|
||||||
List<File> files = testFiles("sg.", 85, ".intervals");
|
List<File> files = testFiles("sg.", 85, ".intervals");
|
||||||
IntervalUtils.scatterFixedIntervals(hg19Header, IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()), files);
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
|
||||||
|
IntervalUtils.scatterFixedIntervals(hg19Header, splits, files);
|
||||||
|
|
||||||
for (int i = 0; i < files.size(); i++) {
|
for (int i = 0; i < files.size(); i++) {
|
||||||
String file = files.get(i).toString();
|
String file = files.get(i).toString();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue