From f619dd3ca7f1a616f62389aa36d9597a7d92598a Mon Sep 17 00:00:00 2001 From: kshakir Date: Tue, 26 Apr 2011 00:06:00 +0000 Subject: [PATCH] Refactored IntervalUtils used to parse and scatter intervals for Queue. Scattering non-contig interval lists by number of loci in the intervals instead of just number of intervals. Queue caches the list of locs and how to split them up instead of reloading them from disk repeatedly. TODO: general purpose function to divide data evenly. Skip over comments when parsing picard analysis files. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5687 348d0f76-0448-11de-a6fe-93d51630548a --- .../utils/broad/PicardAnalysisFiles.java | 8 +- .../sting/utils/interval/IntervalUtils.java | 226 +++++----- .../broad/PicardAnalysisFilesUnitTest.java | 8 + .../utils/broad/PicardPipelineUnitTest.java | 6 + .../utils/interval/IntervalUtilsUnitTest.java | 422 +++++++++++------- .../gatk/ContigScatterFunction.scala | 9 +- .../extensions/gatk/GATKScatterFunction.scala | 16 +- .../gatk/IntervalScatterFunction.scala | 6 +- 8 files changed, 409 insertions(+), 292 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java b/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java index ef01e7e63..02381916e 100755 --- a/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java +++ b/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.broad; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; @@ -56,6 +57,8 @@ public class PicardAnalysisFiles { this.path = path; HashMap headerIndexes = null; for (String line: new XReadLines(new File(path))) { + if (line.startsWith("#")) + continue; String[] values = line.split("\t"); if (headerIndexes == null) { headerIndexes = new HashMap(); @@ -65,7 +68,10 @@ public class PicardAnalysisFiles { } } else { for (String header: ANALYSIS_HEADERS) { - String value = values[headerIndexes.get(header)]; + int index = headerIndexes.get(header); + if (values.length <= index) + throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line)); + String value = values[index]; headerValues.get(header).add(value); } } diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 41f8b5006..68243d066 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.*; @@ -189,162 +189,136 @@ public class IntervalUtils { } /** - * Returns the list of GenomeLocs from the list of intervals. - * @param referenceSource The reference for the intervals. - * @param intervals The interval as strings or file paths. - * @return The list of GenomeLocs. - */ - private static List parseIntervalArguments(ReferenceDataSource referenceSource, List intervals) { - GenomeLocParser parser = new GenomeLocParser(referenceSource.getReference()); - GenomeLocSortedSet locs; - // TODO: Abstract genome analysis engine has richer logic for parsing. We need to use it! - if (intervals.size() == 0) { - locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()); - } else { - locs = new GenomeLocSortedSet(parser, IntervalUtils.parseIntervalArguments(parser, intervals, false)); - } - if (locs == null || locs.size() == 0) - throw new UserException.MalformedFile("Intervals are empty: " + Utils.join(", ", intervals)); - return locs.toList(); - } - - /** - * Returns the list of contigs from the list of intervals. + * Returns a map of contig names with their sizes. * @param reference The reference for the intervals. - * @return The list of contig names. + * @return A map of contig names with their sizes. */ - public static List distinctContigs(File reference) { - return distinctContigs(reference, Collections.emptyList()); - } - - /** - * Returns the list of contigs from the list of intervals. - * @param reference The reference for the intervals. - * @param intervals The interval as strings or file paths. - * @return The list of contig names. - */ - public static List distinctContigs(File reference, List intervals) { + public static Map getContigSizes(File reference) { ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, intervals); - String contig = null; - List contigs = new ArrayList(); - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - contig = loc.getContig(); - contigs.add(contig); - } - } - return contigs; - } - - /** - * Returns a map of contig names with their lengths from the reference. - * @param reference The reference for the intervals. - * @return A map of contig names with their lengths. - */ - public static Map getContigLengths(File reference) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, Collections.emptyList()); - Map lengths = new LinkedHashMap(); + List locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList(); + Map lengths = new LinkedHashMap(); for (GenomeLoc loc: locs) - lengths.put(loc.getContig(), loc.getStop()); + lengths.put(loc.getContig(), loc.size()); return lengths; } /** * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param reference The reference for the intervals. - * @param intervals The interval as strings or file paths. - * @param splitByContig If true then one contig will not be written to multiple files. + * @param locs The genome locs. * @return The maximum number of parts the intervals can be split into. */ - public static int countIntervalArguments(File reference, List intervals, boolean splitByContig) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, intervals); + public static int countContigIntervals(List locs) { int maxFiles = 0; - if (splitByContig) { - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } + String contig = null; + for (GenomeLoc loc: locs) { + if (contig == null || !contig.equals(loc.getContig())) { + maxFiles++; + contig = loc.getContig(); } - } else { - maxFiles = locs.size(); } return maxFiles; } /** * Splits an interval list into multiple files. - * @param reference The reference for the intervals. - * @param intervals The interval as strings or file paths. + * @param fileHeader The sam file header. + * @param locs The genome locs to split. * @param scatterParts The output interval lists to write to. - * @param splitByContig If true then one contig will not be written to multiple files. */ - public static void scatterIntervalArguments(File reference, List intervals, List scatterParts, boolean splitByContig) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, intervals); - SAMFileHeader fileHeader = new SAMFileHeader(); - fileHeader.setSequenceDictionary(referenceSource.getReference().getSequenceDictionary()); - + public static void scatterContigIntervals(SAMFileHeader fileHeader, List locs, List scatterParts) { IntervalList intervalList = null; int fileIndex = -1; int locIndex = 0; - - if (splitByContig) { - String contig = null; - for (GenomeLoc loc: locs) { - // If there are still more files to write and the contig doesn't match... - if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) { - // Then close the current file and start a new one. - if (intervalList != null) { - intervalList.write(scatterParts.get(fileIndex)); - intervalList = null; - } - fileIndex++; - contig = loc.getContig(); + String contig = null; + for (GenomeLoc loc: locs) { + // If there are still more files to write and the contig doesn't match... + if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) { + // Then close the current file and start a new one. + if (intervalList != null) { + intervalList.write(scatterParts.get(fileIndex)); + intervalList = null; } - if (intervalList == null) - intervalList = new IntervalList(fileHeader); - intervalList.add(toInterval(loc, ++locIndex)); + fileIndex++; + contig = loc.getContig(); } - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); - } else { - int locsPerFile = locs.size() / scatterParts.size(); - int locRemainder = locs.size() % scatterParts.size(); - - // At the start, put an extra loc per file - locsPerFile++; - int locsLeftFile = 0; - - for (GenomeLoc loc: locs) { - if (locsLeftFile == 0) { - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); - - fileIndex++; - intervalList = new IntervalList(fileHeader); - - // When we have put enough locs into each file, - // reduce the number of locs per file back - // to the original calculated value. - if (fileIndex == locRemainder) - locsPerFile -= 1; - locsLeftFile = locsPerFile; - } - locsLeftFile -= 1; - intervalList.add(toInterval(loc, ++locIndex)); - } - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); + if (intervalList == null) + intervalList = new IntervalList(fileHeader); + intervalList.add(toInterval(loc, ++locIndex)); } + if (intervalList != null) + intervalList.write(scatterParts.get(fileIndex)); if ((fileIndex + 1) != scatterParts.size()) throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); } + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param locs The genome locs to split. + * @param splits The stop points for the genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + int fileIndex = 0; + int locIndex = 1; + int start = 0; + for (Integer stop: splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (int i = start; i < stop; i++) + intervalList.add(toInterval(locs.get(i), locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); + start = stop; + } + } + + /** + * Splits the genome locs up by size. + * @param locs Genome locs to split. + * @param numParts Number of parts to split the locs into. + * @return The stop points to split the genome locs. + */ + public static List splitFixedIntervals(List locs, int numParts) { + if (locs.size() < numParts) + throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); + long locsSize = 0; + for (GenomeLoc loc: locs) + locsSize += loc.size(); + List splitPoints = new ArrayList(); + addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); + Collections.sort(splitPoints); + splitPoints.add(locs.size()); + return splitPoints; + } + + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { + if (numParts < 2) + return; + int halfParts = (numParts + 1) / 2; + Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts); + int splitIndex = splitPoint.first; + long splitSize = splitPoint.second; + splitPoints.add(splitIndex); + addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); + addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); + } + + private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs) { + int splitIndex = startIndex; + long splitSize = 0; + for (int i = 0; i < minLocs; i++) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + long halfSize = locsSize / 2; + while (splitIndex < stopIndex && splitSize < halfSize) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + return new Pair(splitIndex, splitSize); + } + /** * Converts a GenomeLoc to a picard interval. * @param loc The GenomeLoc. diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java b/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java index f1e44000c..de58bf983 100755 --- a/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java @@ -25,6 +25,14 @@ public class PicardAnalysisFilesUnitTest extends BaseTest { Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list"); } + @Test + public void testParseValidWithComments() throws Exception { + PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt"); + Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"); + Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"); + Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list"); + } + @Test(expectedExceptions = FileNotFoundException.class) public void testParseBadPath() throws Exception { new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt"); diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java b/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java index a17074b01..a5701d671 100755 --- a/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java @@ -22,6 +22,12 @@ public class PicardPipelineUnitTest { validatePipeline(pipeline, FilenameUtils.getBaseName(tsv.getPath())); } + @Test + public void testParseTsvWithPicardComments() throws Exception { + File tsv = writeTsv("C460", "HG01359"); + PicardPipeline.parse(tsv); + } + @Test public void testParseYaml() throws IOException { File yaml = writeYaml("project_name", PROJECT, SAMPLE); diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 6c27e4f0b..e79580e21 100644 --- a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,7 +1,10 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.testng.Assert; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.GenomeLoc; @@ -21,14 +24,20 @@ import java.util.*; */ public class IntervalUtilsUnitTest extends BaseTest { // used to seed the genome loc parser with a sequence dictionary - private static File reference = new File(BaseTest.hg18Reference); + private SAMFileHeader header; private GenomeLocParser genomeLocParser; + private List referenceLocs; @BeforeClass public void init() { + File reference = new File(BaseTest.hg18Reference); try { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + header = new SAMFileHeader(); + header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference); genomeLocParser = new GenomeLocParser(seq); + referenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(reference,ex); @@ -94,46 +103,41 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(ret.size(), 20); } - @Test - public void testCountContigs() { - List chrs = new ArrayList(); - for (int i = 1; i <= 22; i++) - chrs.add("chr" + i); - chrs.add("chrX"); - chrs.add("chrY"); - - List chrsNoRandom = Arrays.asList("chr12", "chr14", "chr20", "chrY"); - List chrsWithRandom = new ArrayList(); - chrsWithRandom.add("chrM"); - chrsWithRandom.addAll(chrs); - for (String chr: chrs) - if(!chrsNoRandom.contains(chr)) - chrsWithRandom.add(chr + "_random"); - - Assert.assertEquals(IntervalUtils.distinctContigs(reference), chrsWithRandom); - Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList(BaseTest.validationDataLocation + "TCGA-06-0188.interval_list")), chrs); - Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr1:1-1", "chr2:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3")); - Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3")); - } - @Test public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigLengths(reference); - Assert.assertEquals((int)lengths.get("chr1"), 247249719); - Assert.assertEquals((int)lengths.get("chr2"), 242951149); - Assert.assertEquals((int)lengths.get("chr3"), 199501827); - Assert.assertEquals((int)lengths.get("chr20"), 62435964); - Assert.assertEquals((int)lengths.get("chrX"), 154913754); + Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); + Assert.assertEquals((long)lengths.get("chr1"), 247249719); + Assert.assertEquals((long)lengths.get("chr2"), 242951149); + Assert.assertEquals((long)lengths.get("chr3"), 199501827); + Assert.assertEquals((long)lengths.get("chr20"), 62435964); + Assert.assertEquals((long)lengths.get("chrX"), 154913754); + } + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return referenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(genomeLocParser.parseGenomeInterval(interval)); + return locs; } @Test - public void testCountIntervals() { - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Collections.emptyList(), false), 45); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Collections.emptyList(), true), 45); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), false), 3); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), true), 3); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"), false), 4); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"), true), 3); + public void testParseIntervalArguments() { + Assert.assertEquals(getLocs().size(), 45); + Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); + Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); + } + + @Test(dependsOnMethods = "testParseIntervalArguments") + public void testCountIntervalsByContig() { + Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs()), 45); + Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1", "chr2", "chr3")), 3); + Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")), 3); } @Test @@ -153,14 +157,16 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testBasicScatter() { + public void testFixedScatterIntervalsBasic() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); List files = testFiles("basic.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, false); + List locs = getLocs("chr1", "chr2", "chr3"); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -176,7 +182,7 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testScatterLessFiles() { + public void testScatterFixedIntervalsLessFiles() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); @@ -184,117 +190,9 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, false); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3); - Assert.assertEquals(locs3.get(0), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, false); - } - - @Test - public void testScatterIntervals() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); - GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs1.get(1), chr1b); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test(enabled=false) // disabled, GenomeLoc.compareTo() returns 0 for two locs with the same start, causing an exception in GLSS.add(). - public void testScatterIntervalsWithTheSameStart() { - List files = testFiles("sg.", 20, ".intervals"); - IntervalUtils.scatterIntervalArguments(new File(hg18Reference), Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), files, false); - } - - @Test - public void testScatterOrder() { - List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testBasicScatterByContig() { - GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); - - List files = testFiles("contig_basic.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, true); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterByContigLessFiles() { - GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); - GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4"); - - List files = testFiles("contig_less.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, true); + List locs = getLocs("chr1", "chr2", "chr3", "chr4"); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -311,13 +209,219 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterByContigMoreFiles() { - List files = testFiles("contig_more.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, true); + public void testSplitFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.splitFixedIntervals(locs, files.size()); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + } + @Test + public void testScatterFixedIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); + GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs2.get(0), chr1b); + Assert.assertEquals(locs3.get(0), chr2); + Assert.assertEquals(locs3.get(1), chr3); } @Test - public void testScatterByContigIntervalsStart() { + public void testScatterFixedIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); + GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2"); + GenomeLoc chr2b = genomeLocParser.parseGenomeInterval("chr2:4-5"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs3.get(0), chr2b); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2"); + GenomeLoc chr3a = genomeLocParser.parseGenomeInterval("chr3:1-2"); + GenomeLoc chr3b = genomeLocParser.parseGenomeInterval("chr3:4-5"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3a); + Assert.assertEquals(locs3.get(0), chr3b); + } + + @Test + public void testScatterFixedIntervalsFile() { + List files = testFiles("sg.", 20, ".intervals"); + List locs = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + + int[] counts = { + 5169, 5573, 10017, 10567, 10551, + 5087, 4908, 10120, 10435, 10399, + 5391, 4735, 10621, 10352, 10654, + 5227, 5256, 10151, 9649, 9825 + }; + + //String splitCounts = ""; + for (int lastIndex = 0, i = 0; i < splits.size(); i++) { + int splitIndex = splits.get(i); + int splitCount = (splitIndex - lastIndex); + //splitCounts += ", " + splitCount; + lastIndex = splitIndex; + Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + } + //System.out.println(splitCounts.substring(2)); + + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + int locIndex = 0; + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); + for (GenomeLoc parsedLoc: parsedLocs) + Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); + } + Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + } + + @Test + public void testScatterContigIntervalsOrder() { + List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr2); + Assert.assertEquals(locs2.get(0), chr1); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsBasic() { + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); + + List files = testFiles("contig_basic.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2", "chr3"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsLessFiles() { + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); + GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4"); + + List files = testFiles("contig_less.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2", "chr3", "chr4"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterContigIntervalsMoreFiles() { + List files = testFiles("contig_more.", 3, ".intervals"); + IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2"), files); + } + + @Test + public void testScatterContigIntervalsStart() { List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); @@ -326,7 +430,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("contig_split_start.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -343,7 +447,7 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testScatterByContigIntervalsMiddle() { + public void testScatterContigIntervalsMiddle() { List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2"); @@ -352,7 +456,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("contig_split_middle.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -369,7 +473,7 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testScatterByContigIntervalsEnd() { + public void testScatterContigIntervalsEnd() { List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2"); @@ -378,7 +482,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("contig_split_end.", 3 ,".intervals"); - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index f1fd4057b..1aa88187e 100755 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -35,10 +35,13 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { // Include unmapped reads by default. this.includeUnmapped = true - protected override def maxIntervals = - IntervalUtils.countIntervalArguments(this.referenceSequence, this.intervals, true) + protected override def maxIntervals = { + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + IntervalUtils.countContigIntervals(gi.locs) + } def run() { - IntervalUtils.scatterIntervalArguments(this.referenceSequence, this.intervals, this.scatterOutputFiles, true) + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) } } diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index 4dbd7c880..32e58ca6a 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -60,7 +60,7 @@ trait GATKScatterFunction extends ScatterFunction { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] this.referenceSequence = this.originalGATK.reference_sequence if (this.originalGATK.intervals.isEmpty && this.originalGATK.intervalsString.isEmpty) { - this.intervals ++= IntervalUtils.distinctContigs(this.referenceSequence).toList + this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, List.empty[String]).contigs } else { this.intervals ++= this.originalGATK.intervals.map(_.toString) this.intervals ++= this.originalGATK.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval)) @@ -103,3 +103,17 @@ trait GATKScatterFunction extends ScatterFunction { */ protected def maxIntervals: Int } + +object GATKScatterFunction { + var gatkIntervals = List.empty[GATKIntervals] + + def getGATKIntervals(reference: File, intervals: List[String]) = { + gatkIntervals.find(gi => gi.reference == reference && gi.intervals == intervals) match { + case Some(gi) => gi + case None => + val gi = new GATKIntervals(reference, intervals) + gatkIntervals :+= gi + gi + } + } +} diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index a651b3c12..d88d272b9 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -33,9 +33,11 @@ import org.broadinstitute.sting.queue.function.InProcessFunction */ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction { protected override def maxIntervals = - IntervalUtils.countIntervalArguments(this.referenceSequence, this.intervals, false) + GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size def run() { - IntervalUtils.scatterIntervalArguments(this.referenceSequence, this.intervals, this.scatterOutputFiles, false) + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, + gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) } }