diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R index 31916361e..de6aa045e 100644 --- a/public/R/queueJobReport.R +++ b/public/R/queueJobReport.R @@ -12,14 +12,14 @@ if ( onCMDLine ) { inputFileName = args[1] outputPDF = args[2] } else { - #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" - inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" + inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt" + #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt" outputPDF = NA } -RUNTIME_UNITS = "(sec)" -ORIGINAL_UNITS_TO_SECONDS = 1/1000 +RUNTIME_UNITS = "(hours)" +ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60 # # Helper function to aggregate all of the jobs in the report across all tables @@ -33,7 +33,7 @@ allJobsFromReport <- function(report) { # # Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job # -plotJobsGantt <- function(gatkReport, sortOverall) { +plotJobsGantt <- function(gatkReport, sortOverall, includeText) { allJobs = allJobsFromReport(gatkReport) if ( sortOverall ) { title = "All jobs, by analysis, by start time" @@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) { } allJobs$index = 1:nrow(allJobs) minTime = min(allJobs$startTime) - allJobs$relStartTime = allJobs$startTime - minTime - allJobs$relDoneTime = allJobs$doneTime - minTime + allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS + allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts) maxRelTime = max(allJobs$relDoneTime) p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName)) - p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm"))) - p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) + p <- p + theme_bw() + p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm"))) + if ( includeText ) + p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) p <- p + xlim(0, maxRelTime * 1.1) p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS)) - p <- p + ylab("Job") + p <- p + ylab("Job number") p <- p + opts(title=title) print(p) } @@ -157,8 +159,8 @@ if ( ! is.na(outputPDF) ) { pdf(outputPDF, height=8.5, width=11) } -plotJobsGantt(gatkReportData, T) -plotJobsGantt(gatkReportData, F) +plotJobsGantt(gatkReportData, T, F) +plotJobsGantt(gatkReportData, F, F) plotProgressByTime(gatkReportData) for ( group in gatkReportData ) { plotGroup(group) diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 4d94130a8..5a7658031 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -293,15 +293,16 @@ public class GATKRunReport { * That is, postReport() is guarenteed not to fail for any reason. */ private File postReportToLocalDisk(File rootDir) { + String filename = getID() + ".report.xml.gz"; + File file = new File(rootDir, filename); try { - String filename = getID() + ".report.xml.gz"; - File file = new File(rootDir, filename); postReportToFile(file); logger.debug("Wrote report to " + file); return file; } catch ( Exception e ) { // we catch everything, and no matter what eat the error exceptDuringRunReport("Couldn't read report file", e); + file.delete(); return null; } } @@ -312,6 +313,7 @@ public class GATKRunReport { File localFile = postReportToLocalDisk(new File("./")); logger.debug("Generating GATK report to AWS S3 based on local file " + localFile); if ( localFile != null ) { // we succeeded in creating the local file + localFile.deleteOnExit(); try { // stop us from printing the annoying, and meaningless, mime types warning Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); @@ -336,14 +338,13 @@ public class GATKRunReport { //logger.info("Uploading " + localFile + " to AWS bucket"); S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); logger.debug("Uploaded to AWS: " + s3Object); + logger.info("Uploaded run statistics report to AWS S3"); } catch ( S3ServiceException e ) { exceptDuringRunReport("S3 exception occurred", e); } catch ( NoSuchAlgorithmException e ) { exceptDuringRunReport("Couldn't calculate MD5", e); } catch ( IOException e ) { exceptDuringRunReport("Couldn't read report file", e); - } finally { - localFile.delete(); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 664c319ab..86f97a36c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -63,12 +63,9 @@ import java.util.*; *
* One or more bam files (with proper headers) to be analyzed for coverage statistics + * (Optional) A REFSEQ Rod to aggregate coverage to the gene level *
- *- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level - *
- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) - *
+ * ** Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -96,7 +93,7 @@ import java.util.*; *
* java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ - * -T DepthOfCoverage \ + * -T VariantEval \ * -o file_name_base \ * -I input_bams.list * [-geneList refSeq.sorted.txt] \ diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..bf9ff35de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker{ getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } }); getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index b96923589..b66198713 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable , Serializable, HasGenome @Override public int hashCode() { - return (int)( start << 16 + stop << 4 + contigIndex ); + return start << 16 | stop << 4 | contigIndex; } diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index f9997bfd8..1b4703e4a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -190,11 +190,21 @@ public class SampleUtils { } - public static List getSamplesFromCommandLineInput(Collection sampleArgs) { + /** + * Returns a new set of samples, containing a final list of samples expanded from sampleArgs + * + * Each element E of sampleArgs can either be a literal sample name or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique sample names. + * + * @param sampleArgs + * @return + */ + public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our // sample list set, and treat the entries as if they had been specified on the command line. - List samplesFromFiles = new ArrayList (); + Set samplesFromFiles = new HashSet (); for (String SAMPLE_EXPRESSION : sampleArgs) { File sampleFile = new File(SAMPLE_EXPRESSION); @@ -203,7 +213,7 @@ public class SampleUtils { List lines = reader.readLines(); for (String line : lines) { - samplesFromFiles.add(line); + samplesFromFiles.add(line.trim()); } } catch (FileNotFoundException e) { samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample @@ -212,7 +222,8 @@ public class SampleUtils { return samplesFromFiles; } - return new ArrayList (); + + return new HashSet (); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index f142fa5aa..d94d9ff84 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -12,35 +12,19 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.ArrayList; /** - * Allows for reading in RefSeq information + * TODO FOR CHRIS HARTL * * - * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, - * strandedness of transcription. + * Codec Description *
* *- * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here - * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq + * See also: link to file specification *
- *Usage
- * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example - *- * -refSeqBinding:REFSEQ /path/to/refSeq.txt - *- * - * You will need to consult individual walkers for the binding name ("refSeqBinding", above) * *File format example
- * If you want to define your own file for use, the format is (tab delimited): - * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) - * and exon frames, for example: - *- * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0, - *- * for more information see here *- * + * A BAM file containing exactly one sample. *
* * @author Mark DePristo diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f551e1368..41cbbe59f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -334,24 +334,44 @@ public class IntervalUtils { } /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. + * Splits an interval list into multiple sublists. * @param locs The genome locs to split. * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. + * @return A list of lists of genome locs, split according to splits */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, Listlocs, List splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - int fileIndex = 0; + public static List > splitIntervalsToSubLists(List
locs, List splits) { int locIndex = 1; int start = 0; + List > sublists = new ArrayList
>(splits.size()); for (Integer stop: splits) { - IntervalList intervalList = new IntervalList(fileHeader); + List
curList = new ArrayList (); for (int i = start; i < stop; i++) - intervalList.add(toInterval(locs.get(i), locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); + curList.add(locs.get(i)); start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List > splits, List
scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); } } @@ -361,17 +381,15 @@ public class IntervalUtils { * @param numParts Number of parts to split the locs into. * @return The stop points to split the genome locs. */ - public static List splitFixedIntervals(List locs, int numParts) { + public static List > splitFixedIntervals(List
locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - long locsSize = 0; - for (GenomeLoc loc: locs) - locsSize += loc.size(); - List splitPoints = new ArrayList (); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList (); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); Collections.sort(splitPoints); splitPoints.add(locs.size()); - return splitPoints; + return splitIntervalsToSubLists(locs, splitPoints); } private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { @@ -441,4 +459,11 @@ public class IntervalUtils { return merged; } } + + public static final long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bb892eec8..98b878d23 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest { private SAMFileHeader hg19Header; private GenomeLocParser hg19GenomeLocParser; private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList (); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return locs; + } @BeforeClass public void init() { @@ -54,12 +68,69 @@ public class IntervalUtilsUnitTest extends BaseTest { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); hg19GenomeLocParser = new GenomeLocParser(seq); hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false)); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(hg19Ref,ex); } } + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- + + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; + } + + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); + } + } + + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { + new IntervalSlicingTest(1, 0); + new IntervalSlicingTest(2, 1); + new IntervalSlicingTest(5, 1); + new IntervalSlicingTest(10, 1); + new IntervalSlicingTest(67, 1); + new IntervalSlicingTest(100, 1); + new IntervalSlicingTest(500, 1); + new IntervalSlicingTest(1000, 1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); + } + + @Test(enabled = true, dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List > splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); + + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; + + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List
split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); + } + + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); + } + @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing @@ -129,19 +200,6 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals((long)lengths.get("chrX"), 154913754); } - private List getLocs(String... intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList (); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; - } - @Test public void testParseIntervalArguments() { Assert.assertEquals(getLocs().size(), 45); @@ -174,8 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("basic.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List
locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -200,8 +258,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List
locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -228,8 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsMoreFiles() { List files = testFiles("more.", 3, ".intervals"); List locs = getLocs("chr1", "chr2"); - List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); } @Test public void testScatterFixedIntervalsStart() { @@ -242,8 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List
files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List
locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -270,8 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List
locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -298,8 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); List
locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -319,7 +377,7 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsFile() { List files = testFiles("sg.", 20, ".intervals"); List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + List > splits = IntervalUtils.splitFixedIntervals(locs, files.size()); int[] counts = { 125, 138, 287, 291, 312, 105, 155, 324, @@ -332,16 +390,13 @@ public class IntervalUtilsUnitTest extends BaseTest { }; //String splitCounts = ""; - for (int lastIndex = 0, i = 0; i < splits.size(); i++) { - int splitIndex = splits.get(i); - int splitCount = (splitIndex - lastIndex); - //splitCounts += ", " + splitCount; - lastIndex = splitIndex; + for (int i = 0; i < splits.size(); i++) { + int splitCount = splits.get(i).size(); Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); } //System.out.println(splitCounts.substring(2)); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); int locIndex = 0; for (int i = 0; i < files.size(); i++) { @@ -357,8 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsMax() { List
files = testFiles("sg.", 85, ".intervals"); - List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); + List > splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); + IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString(); diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index aae5e438c..0fb997f43 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -1,65 +1,65 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.queue.extensions.gatk - -import java.io.File -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.interval.IntervalUtils -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource -import net.sf.samtools.SAMFileHeader -import java.util.Collections -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} - -case class GATKIntervals(reference: File, intervals: List[String]) { - private lazy val referenceDataSource = new ReferenceDataSource(reference) - private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] - - lazy val samFileHeader = { - val header = new SAMFileHeader - header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) - header - } - - lazy val locs: java.util.List[GenomeLoc] = { - val parser = new GenomeLocParser(referenceDataSource.getReference) - val parsedLocs = - if (intervals.isEmpty) - GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList - else - IntervalUtils.parseIntervalArguments(parser, intervals, false) - Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) - } - - lazy val contigs = locs.map(_.getContig).distinct.toList - - def getSplits(size: Int) = { - splitsBySize.getOrElse(size, { - val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) - splitsBySize += size -> splits - splits - }) - } -} +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import java.io.File +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource +import net.sf.samtools.SAMFileHeader +import java.util.Collections +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} + +case class GATKIntervals(reference: File, intervals: List[String]) { + private lazy val referenceDataSource = new ReferenceDataSource(reference) +// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]] + + lazy val samFileHeader = { + val header = new SAMFileHeader + header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary) + header + } + + lazy val locs: java.util.List[GenomeLoc] = { + val parser = new GenomeLocParser(referenceDataSource.getReference) + val parsedLocs = + if (intervals.isEmpty) + GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList + else + IntervalUtils.parseIntervalArguments(parser, intervals, false) + Collections.sort(parsedLocs) + Collections.unmodifiableList(parsedLocs) + } + + lazy val contigs = locs.map(_.getContig).distinct.toList + +// def getSplits(size: Int) = { +// splitsBySize.getOrElse(size, { +// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size) +// splitsBySize += size -> splits +// splits +// }) +// } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index d88d272b9..f65d5ab29 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction def run() { val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) - IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, - gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) + val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index b3a2d23ae..38abe24ef 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -53,8 +53,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3")) - Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) - Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) +// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3)) +// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3)) } @Test(timeOut = 30000) @@ -65,7 +65,7 @@ class GATKIntervalsUnitTest { // for(Item item: javaConvertedScalaList) // This for loop is actually an O(N^2) operation as the iterator calls the // O(N) javaConvertedScalaList.size() for each iteration of the loop. - Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) + //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894) Assert.assertEquals(gi.contigs.size, 24) } @@ -74,8 +74,8 @@ class GATKIntervalsUnitTest { val gi = new GATKIntervals(hg18Reference, Nil) Assert.assertEquals(gi.locs, hg18ReferenceLocs) Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size) - Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) - Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) +// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45)) +// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45)) } @Test