From c4dfc1fb8bf9b69bd620bcb1b1b2490a31751140 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Oct 2011 13:41:36 -0400 Subject: [PATCH 1/6] Temporary commit of parallelization support for RealignerTargetCreator. Tim begged us for this and I got assurances from Khalid/Matt that this would also be extremely helpful for the whole genome calling pipeline, so I spent a while working on this. Needs to be fixed up though because apparently only the leaves in the hierarchical reduce get their output aggregated. Worked out a better solution with Matt. --- .../indels/RealignerTargetCreator.java | 119 +++++++++++++++--- 1 file changed, 99 insertions(+), 20 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index bede50a0b..e83667e8c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -103,7 +103,7 @@ import java.util.List; @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -public class RealignerTargetCreator extends RodWalker { +public class RealignerTargetCreator extends RodWalker implements TreeReducible { /** * The target intervals for realignment. @@ -251,38 +251,117 @@ public class RealignerTargetCreator extends RodWalker 1 ) + sum.right = value; + else { + if ( sum.left.isReportableEvent() ) + out.println(sum.left.toString()); + sum.left = value; + } + } else { + if ( canBeMerged(sum.right, value) ) + sum.right = mergeEvents(sum.right, value); + else { + if ( sum.right.isReportableEvent() ) + out.println(sum.right.toString()); + sum.right = value; + } } - // otherwise, merge the two events - sum.merge(value); return sum; } + static private boolean canBeMerged(Event left, Event right) { + return left.loc.getContigIndex() == right.loc.getContigIndex() && left.furthestStopPos >= right.loc.getStart(); + } + + @com.google.java.contract.Requires({"left != null", "right != null"}) + static private Event mergeEvents(Event left, Event right) { + left.merge(right); + return left; + } + private enum EVENT_TYPE { POINT_EVENT, INDEL_EVENT, BOTH } + class EventPair { + public Event left, right; + + public EventPair(Event left, Event right) { + this.left = left; + this.right = right; + } + } + class Event { public int furthestStopPos; From 6eb87bf58a60e65abf89f7096ecbe06bef790ee4 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Oct 2011 15:57:49 -0400 Subject: [PATCH 2/6] RTC now caches all intervals as GenomeLocs (which is expected to take < 1Gb whole genome based on back of the envelope calculations with Matt) so that 1) we don't have to worry about emitting outside of the leaves in the hierarchical reductions and 2) we can emit the intervals in sorted order which is a big performance plus for the realigner. Integration tests change only because intervals whose start=stop are now printed as chr:start instead of chr:start-stop. --- .../indels/RealignerTargetCreator.java | 60 ++++++++++--------- ...RealignerTargetCreatorIntegrationTest.java | 4 +- 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index e83667e8c..e1333b7f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -50,6 +50,7 @@ import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.TreeSet; /** * Emits intervals for the Local Indel Realigner to target for realignment. @@ -253,9 +254,12 @@ public class RealignerTargetCreator extends RodWalker 1 ) + else sum.right = value; - else { - if ( sum.left.isReportableEvent() ) - out.println(sum.left.toString()); - sum.left = value; - } } else { if ( canBeMerged(sum.right, value) ) sum.right = mergeEvents(sum.right, value); else { if ( sum.right.isReportableEvent() ) - out.println(sum.right.toString()); + sum.intervals.add(sum.right.getLoc()); sum.right = value; } } @@ -355,18 +351,26 @@ public class RealignerTargetCreator extends RodWalker intervals = new TreeSet(); public EventPair(Event left, Event right) { this.left = left; this.right = right; } + + public EventPair(Event left, Event right, TreeSet set1, TreeSet set2) { + this.left = left; + this.right = right; + intervals.addAll(set1); + intervals.addAll(set2); + } } class Event { public int furthestStopPos; - public GenomeLoc loc; - public int eventStartPos; + private GenomeLoc loc; + private int eventStartPos; private int eventStopPos; private EVENT_TYPE type; private ArrayList pointEvents = new ArrayList(); @@ -421,8 +425,8 @@ public class RealignerTargetCreator extends RodWalker= 0 && eventStopPos - eventStartPos < maxIntervalSize; } - public String toString() { - return String.format("%s:%d-%d", loc.getContig(), eventStartPos, eventStopPos); + public GenomeLoc getLoc() { + return getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), eventStartPos, eventStopPos); } } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index 1873ccbe2..fdb310428 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -13,13 +13,13 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam --mismatchFraction 0.15 -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("e7accfa58415d6da80383953b1a3a986")); + Arrays.asList("3f0b63a393104d0c4158c7d1538153b8")); executeTest("test standard", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator --known " + b36dbSNP129 + " -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("0367d39a122c8ac0899fb868a82ef728")); + Arrays.asList("5085054c78e256432dc75c85a9ac631c")); executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( From c61804a450e9f5e7672b4f0a5e0e29ab8d1771da Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Oct 2011 16:14:04 -0400 Subject: [PATCH 3/6] Rename the long version of the argument name to more accurately reflect its purpose. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 36e4db1c5..41fa755b8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -138,7 +138,7 @@ public class IndelRealigner extends ReadWalker { * Any number of VCF files representing known indels to be used for constructing alternate consenses. * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. */ - @Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false) + @Input(fullName="knownAlleles", shortName = "known", doc="Input VCF file(s) with known indels", required=false) public List> known = Collections.emptyList(); /** From f91b015e0e79f7a5aeb839efce42f629a4d498f2 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 6 Oct 2011 22:33:21 -0400 Subject: [PATCH 5/6] Made the BaseTest.testDir absolute --- public/java/test/org/broadinstitute/sting/BaseTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 35a81770d..8e11add33 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -80,7 +80,8 @@ public abstract class BaseTest { public static final String networkTempDir = "/broad/shptmp/"; public static final File networkTempDirFile = new File(networkTempDir); - public static final String testDir = "public/testdata/"; + public static final File testDirFile = new File("public/testdata/"); + public static final String testDir = testDirFile.getAbsolutePath() + "/"; /** before the class starts up */ static { From ca9cd9b688cfd23804e341daea0fe409f4278d7f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Oct 2011 22:38:44 -0400 Subject: [PATCH 6/6] Minor fix for merging intervals which hadn't been necessary when only merging from the left to right. Added integration tests to cover the parallelization of RTC. --- .../indels/RealignerTargetCreator.java | 4 +++ ...RealignerTargetCreatorIntegrationTest.java | 33 +++++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index e1333b7f2..56ce60449 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -415,6 +415,10 @@ public class RealignerTargetCreator extends RodWalker