Merge branch 'sgintervals'
This commit is contained in:
commit
cfde0e674b
|
|
@ -12,14 +12,14 @@ if ( onCMDLine ) {
|
||||||
inputFileName = args[1]
|
inputFileName = args[1]
|
||||||
outputPDF = args[2]
|
outputPDF = args[2]
|
||||||
} else {
|
} else {
|
||||||
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
|
inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
|
||||||
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
||||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
|
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
|
||||||
outputPDF = NA
|
outputPDF = NA
|
||||||
}
|
}
|
||||||
|
|
||||||
RUNTIME_UNITS = "(sec)"
|
RUNTIME_UNITS = "(hours)"
|
||||||
ORIGINAL_UNITS_TO_SECONDS = 1/1000
|
ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
|
||||||
|
|
||||||
#
|
#
|
||||||
# Helper function to aggregate all of the jobs in the report across all tables
|
# Helper function to aggregate all of the jobs in the report across all tables
|
||||||
|
|
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
|
||||||
#
|
#
|
||||||
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
|
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
|
||||||
#
|
#
|
||||||
plotJobsGantt <- function(gatkReport, sortOverall) {
|
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
|
||||||
allJobs = allJobsFromReport(gatkReport)
|
allJobs = allJobsFromReport(gatkReport)
|
||||||
if ( sortOverall ) {
|
if ( sortOverall ) {
|
||||||
title = "All jobs, by analysis, by start time"
|
title = "All jobs, by analysis, by start time"
|
||||||
|
|
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
|
||||||
}
|
}
|
||||||
allJobs$index = 1:nrow(allJobs)
|
allJobs$index = 1:nrow(allJobs)
|
||||||
minTime = min(allJobs$startTime)
|
minTime = min(allJobs$startTime)
|
||||||
allJobs$relStartTime = allJobs$startTime - minTime
|
allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
|
||||||
allJobs$relDoneTime = allJobs$doneTime - minTime
|
allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
|
||||||
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
|
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
|
||||||
maxRelTime = max(allJobs$relDoneTime)
|
maxRelTime = max(allJobs$relDoneTime)
|
||||||
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
|
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
|
||||||
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
|
p <- p + theme_bw()
|
||||||
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
|
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
|
||||||
|
if ( includeText )
|
||||||
|
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
|
||||||
p <- p + xlim(0, maxRelTime * 1.1)
|
p <- p + xlim(0, maxRelTime * 1.1)
|
||||||
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
|
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
|
||||||
p <- p + ylab("Job")
|
p <- p + ylab("Job number")
|
||||||
p <- p + opts(title=title)
|
p <- p + opts(title=title)
|
||||||
print(p)
|
print(p)
|
||||||
}
|
}
|
||||||
|
|
@ -157,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
|
||||||
pdf(outputPDF, height=8.5, width=11)
|
pdf(outputPDF, height=8.5, width=11)
|
||||||
}
|
}
|
||||||
|
|
||||||
plotJobsGantt(gatkReportData, T)
|
plotJobsGantt(gatkReportData, T, F)
|
||||||
plotJobsGantt(gatkReportData, F)
|
plotJobsGantt(gatkReportData, F, F)
|
||||||
plotProgressByTime(gatkReportData)
|
plotProgressByTime(gatkReportData)
|
||||||
for ( group in gatkReportData ) {
|
for ( group in gatkReportData ) {
|
||||||
plotGroup(group)
|
plotGroup(group)
|
||||||
|
|
|
||||||
|
|
@ -293,15 +293,16 @@ public class GATKRunReport {
|
||||||
* That is, postReport() is guaranteed not to fail for any reason.
|
* That is, postReport() is guaranteed not to fail for any reason.
|
||||||
*/
|
*/
|
||||||
private File postReportToLocalDisk(File rootDir) {
|
private File postReportToLocalDisk(File rootDir) {
|
||||||
|
String filename = getID() + ".report.xml.gz";
|
||||||
|
File file = new File(rootDir, filename);
|
||||||
try {
|
try {
|
||||||
String filename = getID() + ".report.xml.gz";
|
|
||||||
File file = new File(rootDir, filename);
|
|
||||||
postReportToFile(file);
|
postReportToFile(file);
|
||||||
logger.debug("Wrote report to " + file);
|
logger.debug("Wrote report to " + file);
|
||||||
return file;
|
return file;
|
||||||
} catch ( Exception e ) {
|
} catch ( Exception e ) {
|
||||||
// we catch everything, and no matter what eat the error
|
// we catch everything, and no matter what eat the error
|
||||||
exceptDuringRunReport("Couldn't read report file", e);
|
exceptDuringRunReport("Couldn't read report file", e);
|
||||||
|
file.delete();
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -312,6 +313,7 @@ public class GATKRunReport {
|
||||||
File localFile = postReportToLocalDisk(new File("./"));
|
File localFile = postReportToLocalDisk(new File("./"));
|
||||||
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
|
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
|
||||||
if ( localFile != null ) { // we succeeded in creating the local file
|
if ( localFile != null ) { // we succeeded in creating the local file
|
||||||
|
localFile.deleteOnExit();
|
||||||
try {
|
try {
|
||||||
// stop us from printing the annoying, and meaningless, mime types warning
|
// stop us from printing the annoying, and meaningless, mime types warning
|
||||||
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
||||||
|
|
@ -336,14 +338,13 @@ public class GATKRunReport {
|
||||||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||||
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||||
logger.debug("Uploaded to AWS: " + s3Object);
|
logger.debug("Uploaded to AWS: " + s3Object);
|
||||||
|
logger.info("Uploaded run statistics report to AWS S3");
|
||||||
} catch ( S3ServiceException e ) {
|
} catch ( S3ServiceException e ) {
|
||||||
exceptDuringRunReport("S3 exception occurred", e);
|
exceptDuringRunReport("S3 exception occurred", e);
|
||||||
} catch ( NoSuchAlgorithmException e ) {
|
} catch ( NoSuchAlgorithmException e ) {
|
||||||
exceptDuringRunReport("Couldn't calculate MD5", e);
|
exceptDuringRunReport("Couldn't calculate MD5", e);
|
||||||
} catch ( IOException e ) {
|
} catch ( IOException e ) {
|
||||||
exceptDuringRunReport("Couldn't read report file", e);
|
exceptDuringRunReport("Couldn't read report file", e);
|
||||||
} finally {
|
|
||||||
localFile.delete();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return (int)( start << 16 + stop << 4 + contigIndex );
|
return start << 16 | stop << 4 | contigIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -334,24 +334,44 @@ public class IntervalUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits an interval list into multiple files.
|
* Splits an interval list into multiple sublists.
|
||||||
* @param fileHeader The sam file header.
|
|
||||||
* @param locs The genome locs to split.
|
* @param locs The genome locs to split.
|
||||||
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
|
* @param splits The stop points (exclusive end indices into locs) at which each sublist ends.
|
||||||
* @param scatterParts The output interval lists to write to.
|
* @return A list of lists of genome locs, split according to splits
|
||||||
*/
|
*/
|
||||||
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
|
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||||
if (splits.size() != scatterParts.size())
|
|
||||||
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
|
|
||||||
int fileIndex = 0;
|
|
||||||
int locIndex = 1;
|
int locIndex = 1;
|
||||||
int start = 0;
|
int start = 0;
|
||||||
|
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||||
for (Integer stop: splits) {
|
for (Integer stop: splits) {
|
||||||
IntervalList intervalList = new IntervalList(fileHeader);
|
List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
|
||||||
for (int i = start; i < stop; i++)
|
for (int i = start; i < stop; i++)
|
||||||
intervalList.add(toInterval(locs.get(i), locIndex++));
|
curList.add(locs.get(i));
|
||||||
intervalList.write(scatterParts.get(fileIndex++));
|
|
||||||
start = stop;
|
start = stop;
|
||||||
|
sublists.add(curList);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sublists;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Splits an interval list into multiple files.
|
||||||
|
* @param fileHeader The sam file header.
|
||||||
|
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
|
||||||
|
* @param scatterParts The output interval lists to write to.
|
||||||
|
*/
|
||||||
|
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
|
||||||
|
if (splits.size() != scatterParts.size())
|
||||||
|
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
|
||||||
|
|
||||||
|
int fileIndex = 0;
|
||||||
|
int locIndex = 1;
|
||||||
|
for (final List<GenomeLoc> split : splits) {
|
||||||
|
IntervalList intervalList = new IntervalList(fileHeader);
|
||||||
|
for (final GenomeLoc loc : split)
|
||||||
|
intervalList.add(toInterval(loc, locIndex++));
|
||||||
|
intervalList.write(scatterParts.get(fileIndex++));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -361,17 +381,15 @@ public class IntervalUtils {
|
||||||
* @param numParts Number of parts to split the locs into.
|
* @param numParts Number of parts to split the locs into.
|
||||||
* @return The stop points to split the genome locs.
|
* @return A list of lists of genome locs, one sublist per part.
|
||||||
*/
|
*/
|
||||||
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
|
||||||
if (locs.size() < numParts)
|
if (locs.size() < numParts)
|
||||||
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
|
||||||
long locsSize = 0;
|
final long locsSize = intervalSize(locs);
|
||||||
for (GenomeLoc loc: locs)
|
final List<Integer> splitPoints = new ArrayList<Integer>();
|
||||||
locsSize += loc.size();
|
|
||||||
List<Integer> splitPoints = new ArrayList<Integer>();
|
|
||||||
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
|
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
|
||||||
Collections.sort(splitPoints);
|
Collections.sort(splitPoints);
|
||||||
splitPoints.add(locs.size());
|
splitPoints.add(locs.size());
|
||||||
return splitPoints;
|
return splitIntervalsToSubLists(locs, splitPoints);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
|
||||||
|
|
@ -441,4 +459,11 @@ public class IntervalUtils {
|
||||||
return merged;
|
return merged;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final long intervalSize(final List<GenomeLoc> locs) {
|
||||||
|
long size = 0;
|
||||||
|
for ( final GenomeLoc loc : locs )
|
||||||
|
size += loc.size();
|
||||||
|
return size;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
private SAMFileHeader hg19Header;
|
private SAMFileHeader hg19Header;
|
||||||
private GenomeLocParser hg19GenomeLocParser;
|
private GenomeLocParser hg19GenomeLocParser;
|
||||||
private List<GenomeLoc> hg19ReferenceLocs;
|
private List<GenomeLoc> hg19ReferenceLocs;
|
||||||
|
private List<GenomeLoc> hg19exomeIntervals;
|
||||||
|
|
||||||
|
private List<GenomeLoc> getLocs(String... intervals) {
|
||||||
|
return getLocs(Arrays.asList(intervals));
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<GenomeLoc> getLocs(List<String> intervals) {
|
||||||
|
if (intervals.size() == 0)
|
||||||
|
return hg18ReferenceLocs;
|
||||||
|
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
||||||
|
for (String interval: intervals)
|
||||||
|
locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
|
||||||
|
return locs;
|
||||||
|
}
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public void init() {
|
public void init() {
|
||||||
|
|
@ -54,12 +68,69 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref);
|
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref);
|
||||||
hg19GenomeLocParser = new GenomeLocParser(seq);
|
hg19GenomeLocParser = new GenomeLocParser(seq);
|
||||||
hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
|
hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
|
||||||
|
|
||||||
|
hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false));
|
||||||
}
|
}
|
||||||
catch(FileNotFoundException ex) {
|
catch(FileNotFoundException ex) {
|
||||||
throw new UserException.CouldNotReadInputFile(hg19Ref,ex);
|
throw new UserException.CouldNotReadInputFile(hg19Ref,ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// tests to ensure the quality of the interval cuts of the interval cutting functions
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private class IntervalSlicingTest extends TestDataProvider {
|
||||||
|
public int parts;
|
||||||
|
public double maxAllowableVariance;
|
||||||
|
|
||||||
|
private IntervalSlicingTest(final int parts, final double maxAllowableVariance) {
|
||||||
|
super(IntervalSlicingTest.class);
|
||||||
|
this.parts = parts;
|
||||||
|
this.maxAllowableVariance = maxAllowableVariance;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "intervalslicingdata")
|
||||||
|
public Object[][] createTrees() {
|
||||||
|
new IntervalSlicingTest(1, 0);
|
||||||
|
new IntervalSlicingTest(2, 1);
|
||||||
|
new IntervalSlicingTest(5, 1);
|
||||||
|
new IntervalSlicingTest(10, 1);
|
||||||
|
new IntervalSlicingTest(67, 1);
|
||||||
|
new IntervalSlicingTest(100, 1);
|
||||||
|
new IntervalSlicingTest(500, 1);
|
||||||
|
new IntervalSlicingTest(1000, 1);
|
||||||
|
return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true, dataProvider = "intervalslicingdata")
|
||||||
|
public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
|
||||||
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
|
||||||
|
|
||||||
|
long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
|
||||||
|
long idealSplitSize = totalSize / test.parts;
|
||||||
|
|
||||||
|
long sumOfSplitSizes = 0;
|
||||||
|
int counter = 0;
|
||||||
|
for ( final List<GenomeLoc> split : splits ) {
|
||||||
|
long splitSize = IntervalUtils.intervalSize(split);
|
||||||
|
double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize);
|
||||||
|
//logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma));
|
||||||
|
counter++;
|
||||||
|
sumOfSplitSizes += splitSize;
|
||||||
|
Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
|
||||||
|
}
|
||||||
|
|
||||||
@Test(expectedExceptions=UserException.class)
|
@Test(expectedExceptions=UserException.class)
|
||||||
public void testMergeListsBySetOperatorNoOverlap() {
|
public void testMergeListsBySetOperatorNoOverlap() {
|
||||||
// a couple of lists we'll use for the testing
|
// a couple of lists we'll use for the testing
|
||||||
|
|
@ -129,19 +200,6 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
Assert.assertEquals((long)lengths.get("chrX"), 154913754);
|
Assert.assertEquals((long)lengths.get("chrX"), 154913754);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<GenomeLoc> getLocs(String... intervals) {
|
|
||||||
return getLocs(Arrays.asList(intervals));
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<GenomeLoc> getLocs(List<String> intervals) {
|
|
||||||
if (intervals.size() == 0)
|
|
||||||
return hg18ReferenceLocs;
|
|
||||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
|
||||||
for (String interval: intervals)
|
|
||||||
locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
|
|
||||||
return locs;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testParseIntervalArguments() {
|
public void testParseIntervalArguments() {
|
||||||
Assert.assertEquals(getLocs().size(), 45);
|
Assert.assertEquals(getLocs().size(), 45);
|
||||||
|
|
@ -174,8 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("basic.", 3, ".intervals");
|
List<File> files = testFiles("basic.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
|
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -200,8 +258,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("less.", 3, ".intervals");
|
List<File> files = testFiles("less.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
|
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -228,8 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
public void testScatterFixedIntervalsMoreFiles() {
|
public void testScatterFixedIntervalsMoreFiles() {
|
||||||
List<File> files = testFiles("more.", 3, ".intervals");
|
List<File> files = testFiles("more.", 3, ".intervals");
|
||||||
List<GenomeLoc> locs = getLocs("chr1", "chr2");
|
List<GenomeLoc> locs = getLocs("chr1", "chr2");
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
}
|
}
|
||||||
@Test
|
@Test
|
||||||
public void testScatterFixedIntervalsStart() {
|
public void testScatterFixedIntervalsStart() {
|
||||||
|
|
@ -242,8 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("split.", 3, ".intervals");
|
List<File> files = testFiles("split.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs(intervals);
|
List<GenomeLoc> locs = getLocs(intervals);
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -270,8 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("split.", 3, ".intervals");
|
List<File> files = testFiles("split.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs(intervals);
|
List<GenomeLoc> locs = getLocs(intervals);
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -298,8 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<File> files = testFiles("split.", 3, ".intervals");
|
List<File> files = testFiles("split.", 3, ".intervals");
|
||||||
|
|
||||||
List<GenomeLoc> locs = getLocs(intervals);
|
List<GenomeLoc> locs = getLocs(intervals);
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
|
||||||
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
|
||||||
|
|
@ -319,7 +377,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
public void testScatterFixedIntervalsFile() {
|
public void testScatterFixedIntervalsFile() {
|
||||||
List<File> files = testFiles("sg.", 20, ".intervals");
|
List<File> files = testFiles("sg.", 20, ".intervals");
|
||||||
List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
|
List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
|
||||||
|
|
||||||
int[] counts = {
|
int[] counts = {
|
||||||
125, 138, 287, 291, 312, 105, 155, 324,
|
125, 138, 287, 291, 312, 105, 155, 324,
|
||||||
|
|
@ -332,16 +390,13 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
};
|
};
|
||||||
|
|
||||||
//String splitCounts = "";
|
//String splitCounts = "";
|
||||||
for (int lastIndex = 0, i = 0; i < splits.size(); i++) {
|
for (int i = 0; i < splits.size(); i++) {
|
||||||
int splitIndex = splits.get(i);
|
int splitCount = splits.get(i).size();
|
||||||
int splitCount = (splitIndex - lastIndex);
|
|
||||||
//splitCounts += ", " + splitCount;
|
|
||||||
lastIndex = splitIndex;
|
|
||||||
Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
|
Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
|
||||||
}
|
}
|
||||||
//System.out.println(splitCounts.substring(2));
|
//System.out.println(splitCounts.substring(2));
|
||||||
|
|
||||||
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
|
||||||
|
|
||||||
int locIndex = 0;
|
int locIndex = 0;
|
||||||
for (int i = 0; i < files.size(); i++) {
|
for (int i = 0; i < files.size(); i++) {
|
||||||
|
|
@ -357,8 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
@Test
|
@Test
|
||||||
public void testScatterFixedIntervalsMax() {
|
public void testScatterFixedIntervalsMax() {
|
||||||
List<File> files = testFiles("sg.", 85, ".intervals");
|
List<File> files = testFiles("sg.", 85, ".intervals");
|
||||||
List<Integer> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
|
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
|
||||||
IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files);
|
IntervalUtils.scatterFixedIntervals(hg19Header, splits, files);
|
||||||
|
|
||||||
for (int i = 0; i < files.size(); i++) {
|
for (int i = 0; i < files.size(); i++) {
|
||||||
String file = files.get(i).toString();
|
String file = files.get(i).toString();
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocP
|
||||||
|
|
||||||
case class GATKIntervals(reference: File, intervals: List[String]) {
|
case class GATKIntervals(reference: File, intervals: List[String]) {
|
||||||
private lazy val referenceDataSource = new ReferenceDataSource(reference)
|
private lazy val referenceDataSource = new ReferenceDataSource(reference)
|
||||||
private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
|
// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
|
||||||
|
|
||||||
lazy val samFileHeader = {
|
lazy val samFileHeader = {
|
||||||
val header = new SAMFileHeader
|
val header = new SAMFileHeader
|
||||||
|
|
@ -55,11 +55,11 @@ case class GATKIntervals(reference: File, intervals: List[String]) {
|
||||||
|
|
||||||
lazy val contigs = locs.map(_.getContig).distinct.toList
|
lazy val contigs = locs.map(_.getContig).distinct.toList
|
||||||
|
|
||||||
def getSplits(size: Int) = {
|
// def getSplits(size: Int) = {
|
||||||
splitsBySize.getOrElse(size, {
|
// splitsBySize.getOrElse(size, {
|
||||||
val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
|
// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
|
||||||
splitsBySize += size -> splits
|
// splitsBySize += size -> splits
|
||||||
splits
|
// splits
|
||||||
})
|
// })
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction
|
||||||
|
|
||||||
def run() {
|
def run() {
|
||||||
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
|
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
|
||||||
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs,
|
val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size)
|
||||||
gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles)
|
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -53,8 +53,8 @@ class GATKIntervalsUnitTest {
|
||||||
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
|
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
|
||||||
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
|
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
|
||||||
Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
|
Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
|
||||||
Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
|
// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
|
||||||
Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
|
// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(timeOut = 30000)
|
@Test(timeOut = 30000)
|
||||||
|
|
@ -65,7 +65,7 @@ class GATKIntervalsUnitTest {
|
||||||
// for(Item item: javaConvertedScalaList)
|
// for(Item item: javaConvertedScalaList)
|
||||||
// This for loop is actually an O(N^2) operation as the iterator calls the
|
// This for loop is actually an O(N^2) operation as the iterator calls the
|
||||||
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
||||||
Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
|
//Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
|
||||||
Assert.assertEquals(gi.contigs.size, 24)
|
Assert.assertEquals(gi.contigs.size, 24)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -74,8 +74,8 @@ class GATKIntervalsUnitTest {
|
||||||
val gi = new GATKIntervals(hg18Reference, Nil)
|
val gi = new GATKIntervals(hg18Reference, Nil)
|
||||||
Assert.assertEquals(gi.locs, hg18ReferenceLocs)
|
Assert.assertEquals(gi.locs, hg18ReferenceLocs)
|
||||||
Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
|
Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
|
||||||
Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
|
// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
|
||||||
Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
|
// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue