From 47607a7effc2be91bc44ccc453639c544a1e45bb Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Tue, 6 Sep 2011 14:25:57 -0400 Subject: [PATCH 1/6] Fixed bug where deletions messed up interval clipping - Instead of using readLength, the ReadUtil function are used to get a proper read coordinate - Added debug info in interval clipping ( with -dl) NOTE: method might not be safe for production and checks need to be added to the ClippingOp code --- .../org/broadinstitute/sting/utils/clipreads/ReadClipper.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index be045309a..26c25850a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -63,6 +63,10 @@ public class ReadClipper { if (start < 0 || stop > read.getReadLength() - 1) throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); + // TODO add requires statement/check in the Hardclip function + if ( start > stop ) + stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read)); + //System.out.println("Clipping start/stop: " + start + "/" + stop); this.addOp(new ClippingOp(start, stop)); SAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); From a1920397e8c4328f8c09006e5982afc2381ac71c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:18:11 -0400 Subject: [PATCH 2/6] Major bugfix for per sample VariantEval -- per sample stratification was not being calculated correctly. The alt allele was always remaining, even if the genotype of the sample was hom-ref. Although conceptually fine, this breaks the assumptions of all of the eval modules, so per sample stratifications actually included all variants for everything. Eric is going to fix the system in general, so this commit may break the build. --- .../sting/gatk/walkers/varianteval/util/VariantEvalUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 92e7c6554..3cc039141 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -277,7 +277,7 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { - VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles()); + VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values()); HashMap newAts = new HashMap(vcsub.getAttributes()); From 6ff432e1f24860e8821e9f55fe71a0e470dce202 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:50:17 -0400 Subject: [PATCH 3/6] BugFix for TF argument to VariantEval, actually making it work properly --- .../varianteval/VariantEvalWalker.java | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index fe4729bdc..65e3d3e5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.util.*; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; @@ -24,6 +25,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -224,12 +226,6 @@ public class VariantEvalWalker extends RodWalker implements Tr } sampleNamesForStratification.add(ALL_SAMPLE_NAME); - // Initialize select expressions - for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { - SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); - jexlExpressions.add(sjexl); - } - // Add select expressions for anything in the tranches file if ( TRANCHE_FILENAME != null ) { // we are going to build a few select names automatically from the tranches file @@ -240,16 +236,27 @@ public class VariantEvalWalker extends RodWalker implements Tr } } + // Initialize select expressions + for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) { + SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp); + jexlExpressions.add(sjexl); + } + // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); + boolean usingJEXL = false; for ( VariantStratifier vs : getStratificationObjects() ) { if ( vs.getClass().getSimpleName().equals("Filter") ) byFilterIsEnabled = true; else if ( vs.getClass().getSimpleName().equals("Sample") ) perSampleIsEnabled = true; + usingJEXL = usingJEXL || vs.getClass().equals(JexlExpression.class); } + if ( TRANCHE_FILENAME != null && ! usingJEXL ) + throw new UserException.BadArgumentValue("tf", "Requires the JexlExpression ST to enabled"); + // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); From d23d62049439870ea33fb4d1759c2349f2ad154d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:52:33 -0400 Subject: [PATCH 4/6] Pushing traversal engine timer start to as close to actual start as possible -- Should make initial timings more accurate --- .../gatk/executive/LinearMicroScheduler.java | 2 +- .../sting/gatk/executive/ShardTraverser.java | 1 + .../sting/gatk/traversals/TraversalEngine.java | 15 ++++++++++----- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 65ff27497..09ab4bd44 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -44,7 +44,6 @@ public class LinearMicroScheduler extends MicroScheduler { * @param shardStrategy A strategy for sharding the data. */ public Object execute(Walker walker, ShardStrategy shardStrategy) { - traversalEngine.startTimers(); walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); @@ -54,6 +53,7 @@ public class LinearMicroScheduler extends MicroScheduler { if ( done || shard == null ) // we ran out of shards that aren't owned break; + traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 6136bd68d..2b6488ada 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -57,6 +57,7 @@ public class ShardTraverser implements Callable { public Object call() { try { + traversalEngine.startTimersIfNecessary(); long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 89a179d0e..dc6ab240e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -115,7 +115,7 @@ public abstract class TraversalEngine,Provide LinkedList history = new LinkedList(); /** We use the SimpleTimer to time our run */ - private SimpleTimer timer = new SimpleTimer("Traversal"); + private SimpleTimer timer = null; // How long can we go without printing some progress info? private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; @@ -209,11 +209,16 @@ public abstract class TraversalEngine,Provide } } /** - * Should be called to indicate that we're going to process records and the timer should start ticking + * Should be called to indicate that we're going to process records and the timer should start ticking. This + * function should be called right before any traversal work is done, to avoid counting setup costs in the + * processing costs and inflating the estimated runtime. */ - public void startTimers() { - timer.start(); - lastProgressPrintTime = timer.currentTime(); + public void startTimersIfNecessary() { + if ( timer == null ) { + timer = new SimpleTimer("Traversal"); + timer.start(); + lastProgressPrintTime = timer.currentTime(); + } } /** From 7e9e20fed0ad5d9f2491f782de3c3850ad341a57 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 12:54:52 -0400 Subject: [PATCH 5/6] Forgot to delete previous call --- .../sting/gatk/executive/HierarchicalMicroScheduler.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 59fb4aa9e..3b9e35311 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -97,7 +97,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); - traversalEngine.startTimers(); ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); From 430da2344609582d8de24edf52ed34cd2a426607 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Sep 2011 13:13:07 -0400 Subject: [PATCH 6/6] At least 2 minutes must pass before a status message is printed, further stabilizing time estimates --- .../broadinstitute/sting/gatk/traversals/TraversalEngine.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index dc6ab240e..27fd173cb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -121,6 +121,7 @@ public abstract class TraversalEngine,Provide private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? + private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; @@ -229,7 +230,8 @@ public abstract class TraversalEngine,Provide * @return true if the maximum interval (in millisecs) has passed since the last printing */ private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { - return (curTime - lastPrintTime) > printFreq; + long elapsed = curTime - lastPrintTime; + return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; } /**