From a62dc84795e61452802ffc9ff9f24e9ac985eec3 Mon Sep 17 00:00:00 2001 From: Chris Smowton Date: Tue, 7 Oct 2014 15:35:45 +0100 Subject: [PATCH 1/2] Improved scatter contigs algorithm to be fairer when splitting a large number of contigs into a small number of parts. See also: http://gatkforums.broadinstitute.org/discussion/comment/16010 Signed-off-by: Khalid Shakir --- .../gatk/utils/interval/IntervalUtils.java | 112 ++++++++++++++---- 1 file changed, 89 insertions(+), 23 deletions(-) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java index 53c8d55cc..faeebbf30 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java @@ -362,29 +362,95 @@ public class IntervalUtils { * @param scatterParts The output interval lists to write to. */ public static void scatterContigIntervals(SAMFileHeader fileHeader, List locs, List scatterParts) { - IntervalList intervalList = null; - int fileIndex = -1; - int locIndex = 0; - String contig = null; - for (GenomeLoc loc: locs) { - // If there are still more files to write and the contig doesn't match... - if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) { - // Then close the current file and start a new one. - if (intervalList != null) { - intervalList.write(scatterParts.get(fileIndex)); - intervalList = null; - } - fileIndex++; - contig = loc.getContig(); - } - if (intervalList == null) - intervalList = new IntervalList(fileHeader); - intervalList.add(toInterval(loc, ++locIndex)); - } - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); - if ((fileIndex + 1) != scatterParts.size()) - throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); + + // Contract: must divide locs up so that each of scatterParts gets a sublist such that: + // (a) all locs concerning a particular contig go to the same part + // (b) locs are not split or combined, and remain in the same order (so scatterParts[0] + ... + scatterParts[n] == locs) + + // Locs are already sorted. + + long totalBases = 0; + for(GenomeLoc loc : locs) + totalBases += loc.getStop() - loc.getStart(); + + long idealBasesPerPart = totalBases / scatterParts.size(); + if(idealBasesPerPart == 0) + throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size())); + + // Find the indices in locs where we switch from one contig to the next. + ArrayList contigStartLocs = new ArrayList(); + String prevContig = null; + + for(int i = 0; i < locs.size(); ++i) { + + GenomeLoc loc = locs.get(i); + if(prevContig == null || !loc.getContig().equals(prevContig)) + contigStartLocs.add(i); + prevContig = loc.getContig(); + + } + + if(contigStartLocs.size() < scatterParts.size()) + throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size())); + + long thisPartBases = 0; + int partIdx = 0; + IntervalList outList = new IntervalList(fileHeader); + + for(int i = 0; i < locs.size(); ++i) { + + GenomeLoc loc = locs.get(i); + thisPartBases += loc.getStop() - loc.getStart(); + + outList.add(toInterval(loc, i)); + + boolean partMustStop = false; + + if(partIdx < (scatterParts.size() - 1)) { + + // If there are n contigs and n parts remaining then we must split here, + // otherwise we will run out of contigs. + + int nextPart = partIdx + 1; + int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size())); + if(i + 1 == nextPartMustStartBy) + partMustStop = true; + + } + else if(i == locs.size() - 1) { + + // We're done! Write the last scatter file. + partMustStop = true; + + } + + if(partMustStop || thisPartBases > idealBasesPerPart) { + + // Ideally we would split here. However, we must make sure to do so + // on a contig boundary. Test always passes with partMustStop == true + // since that indicates we're at a contig boundary. + + GenomeLoc nextLoc = null; + if((i + 1) < locs.size()) + nextLoc = locs.get(i+1); + + if(nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) { + + // Write out this part: + outList.write(scatterParts.get(partIdx)); + + // Reset. If this part ran long, leave the excess in thisPartBases + // and the next will be a little shorter to compensate. + outList = new IntervalList(fileHeader); + thisPartBases -= idealBasesPerPart; + ++partIdx; + + } + + } + + } + } /** From 26ba4c11aaceab82cf539716869427f1e28b5cc2 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 22 Oct 2014 17:37:37 +0800 Subject: [PATCH 2/2] Minor fixups for previous commit once tests (only runnable at Broad) were run. Fixed off by one error in size calculation IntervalUtils.scatterContigIntervals(). In test for fewer files than intervals, adjusted expected intervals. In test for more files than intervals, adjusted expected exception. --- .../gatk/utils/interval/IntervalUtils.java | 2 +- .../gatk/utils/interval/IntervalUtilsUnitTest.java | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java index faeebbf30..7fffb12e2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java @@ -371,7 +371,7 @@ public class IntervalUtils { long totalBases = 0; for(GenomeLoc loc : locs) - totalBases += loc.getStop() - loc.getStart(); + totalBases += loc.size(); long idealBasesPerPart = totalBases / scatterParts.size(); if(idealBasesPerPart == 0) diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java index d21424bc3..e9846da21 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java @@ -671,17 +671,17 @@ public class IntervalUtilsUnitTest extends BaseTest { List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs1.size(), 2); Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); + Assert.assertEquals(locs3.size(), 1); Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3); + Assert.assertEquals(locs3.get(0), chr4); } - @Test(expectedExceptions=UserException.BadArgumentValue.class) + @Test(expectedExceptions=UserException.BadInput.class) public void testScatterContigIntervalsMoreFiles() { List files = testFiles("contig_more.", 3, ".intervals"); IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files);