From b4841548f12086e8928d4c749bf66fa29f2eaf4f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 3 Aug 2012 17:53:46 -0400 Subject: [PATCH 1/4] Bug fixes and misc. improvements to running the adaptive context tools -- Better output file name defaults -- Fixed nasty bug where I included non-existant quals in the contexts to process because they showed up in the Cycle covariate -- Data is processed in qual order now, so it's easier to see progress -- Logger messages explaining where we are in the process -- When in UPDATE mode we still write out the information for an equivalent prune by depth for post analysis --- .../sting/utils/recalibration/RecalDatumNode.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 1409af7d0..e6ec6a520 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -324,7 +324,8 @@ public class RecalDatumNode { while ( root.size() > maxElements ) { // remove the lowest penalty element, and continue root = root.removeLowestPenaltyNode(); - logger.debug("pruneByPenalty root size is now " + root.size() + " of max " + maxElements); + if ( logger.isDebugEnabled() ) + logger.debug("pruneByPenalty root size is now " + root.size() + " of max " + maxElements); } // our size is below the target, so we are good, return From 7bf5ca51eee6c8157e1b8e8a76aad88eafeee0c4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 4 Aug 2012 11:28:17 -0400 Subject: [PATCH 2/4] Major bugfix for adaptive contexts -- Basically I was treating the context history in the wrong direction, effectively predicting the further bases in the context based on the closer one. Totally backward. Updated the code to build the tree in the right direction. -- Added a few more useful outputs for analysis (minPenalty and maxPenalty) -- Misc. cleanup of the code -- Overall I'm not 100% certain this is even the right way to think about the problem. Clearly this is producing a reasonable output but the sum of chi2 values over the entire tree is just enormous. Perhaps a MCMC convergence / sampling criterion would be a better way to think about this problem? --- .../utils/recalibration/AdaptiveContext.java | 37 +++++++++++++++---- .../utils/recalibration/RecalDatumNode.java | 22 +++++++++++ 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java index 083b8af64..b8a2eaf05 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java @@ -21,22 +21,42 @@ public class AdaptiveContext { * observation and error counts of their parent. * * This algorithm produces data consistent with the standard output in a BQSR recal - * file for the Context covariate + * file for the Context covariate. + * + * Suppose you have input tree + * + * - x + * - A + * - C + * - T + * - C + * - T + * + * This will produce the following contexts with fill depth of 2 + * + * AA <- gets A info + * CA <- gets CA info + * GA <- gets A info + * TA <- get TA info + * .. for all 16 contexts * * @param root * @param fillDepth * @return */ - public static RecalDatumNode fillToDepth(final RecalDatumNode root, final int fillDepth) { + public static RecalDatumNode fillToDepth(final RecalDatumNode root, + final int fillDepth, + final boolean debugWriteN) { if ( root == null ) throw new IllegalArgumentException("root is null"); if ( fillDepth < 0 ) throw new IllegalArgumentException("fillDepth is < 0"); - return fillToDepthRec(root, fillDepth, 0); + return fillToDepthRec(root, fillDepth, 0, debugWriteN); } private static RecalDatumNode fillToDepthRec(final RecalDatumNode parent, final int fillDepth, - final int currentDepth) { + final int currentDepth, + final boolean debugWriteN) { // three cases: // We are in the tree and so just recursively build // We have reached our depth goal, so just return the parent since we are done @@ -47,24 +67,25 @@ public class AdaptiveContext { final RecalDatumNode newParent = new RecalDatumNode(parent.getRecalDatum()); for ( final String base : Arrays.asList("A", "C", "G", "T")) { + final String subContextBases = base + parent.getRecalDatum().context; ContextDatum subContext; Set> subContexts; - final RecalDatumNode subNode = findSubcontext(parent.getRecalDatum().context + base, parent); + final RecalDatumNode subNode = findSubcontext(subContextBases, parent); if ( subNode != null ) { // we have a subnode corresponding to the expected one, just copy and recurse subContext = subNode.getRecalDatum(); subContexts = subNode.getSubnodes(); } else { // have to create a new one - subContext = new ContextDatum(parent.getRecalDatum().context + base, + subContext = new ContextDatum(debugWriteN ? ("N" + parent.getRecalDatum().context ) : subContextBases, parent.getRecalDatum().getNumObservations(), parent.getRecalDatum().getNumMismatches()); subContexts = Collections.emptySet(); } newParent.addSubnode( fillToDepthRec(new RecalDatumNode(subContext, subContexts), - fillDepth, currentDepth + 1)); + fillDepth, currentDepth + 1, debugWriteN)); } return newParent; } else { @@ -119,7 +140,7 @@ public class AdaptiveContext { parent = new RecalDatumNode(new ContextDatum(parentContext, 0, 0)); contextToNodes.put(parentContext, parent); - if ( parentContext != ContextDatum.ROOT_CONTEXT ) + if ( ! parent.getRecalDatum().isRootContext() ) remaining.add(parent); else root = parent; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index e6ec6a520..e792a808d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -165,6 +165,28 @@ public class RecalDatumNode { } } + /** + * The maximum penalty among all nodes + * @return + */ + public double maxPenalty() { + double max = getPenalty(); + for ( final RecalDatumNode sub : subnodes ) + max = Math.max(max, sub.maxPenalty()); + return max; + } + + /** + * The minimum penalty among all nodes + * @return + */ + public double minPenalty() { + double min = getPenalty(); + for ( final RecalDatumNode sub : subnodes ) + min = Math.min(min, sub.minPenalty()); + return min; + } + /** * What's the longest branch from this node to any leaf? * @return From 2f004665fb1f51728cdb586479ae7202d1c2e7cc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 6 Aug 2012 11:42:12 -0400 Subject: [PATCH 3/4] Fixing public -> private dep --- .../utils/recalibration/AdaptiveContext.java | 175 ------------------ 1 file changed, 175 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java deleted file mode 100644 index b8a2eaf05..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java +++ /dev/null @@ -1,175 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import java.util.*; - -/** - * Functions for working with AdaptiveContexts - * - * User: depristo - * Date: 8/3/12 - * Time: 12:21 PM - * To change this template use File | Settings | File Templates. - */ -public class AdaptiveContext { - private AdaptiveContext() {} - - /** - * Return a freshly allocated tree filled in completely to fillDepth with - * all combinations of {A,C,G,T}^filldepth contexts. For nodes - * in the tree, they are simply copied. When the algorithm needs to - * generate new nodes (because they are missing) the subnodes inherit the - * observation and error counts of their parent. - * - * This algorithm produces data consistent with the standard output in a BQSR recal - * file for the Context covariate. - * - * Suppose you have input tree - * - * - x - * - A - * - C - * - T - * - C - * - T - * - * This will produce the following contexts with fill depth of 2 - * - * AA <- gets A info - * CA <- gets CA info - * GA <- gets A info - * TA <- get TA info - * .. for all 16 contexts - * - * @param root - * @param fillDepth - * @return - */ - public static RecalDatumNode fillToDepth(final RecalDatumNode root, - final int fillDepth, - final boolean debugWriteN) { - if ( root == null ) throw new IllegalArgumentException("root is null"); - if ( fillDepth < 0 ) throw new IllegalArgumentException("fillDepth is < 0"); - - return fillToDepthRec(root, fillDepth, 0, debugWriteN); - } - - private static RecalDatumNode fillToDepthRec(final RecalDatumNode parent, - final int fillDepth, - final int currentDepth, - final boolean debugWriteN) { - // three cases: - // We are in the tree and so just recursively build - // We have reached our depth goal, so just return the parent since we are done - // We are outside of the tree, in which case we need to pointer to our parent node so we can - // we info (N, M) and we need a running context - if ( currentDepth < fillDepth ) { - // we need to create subnodes for each base, and propogate N and M down - final RecalDatumNode newParent = new RecalDatumNode(parent.getRecalDatum()); - - for ( final String base : Arrays.asList("A", "C", "G", "T")) { - final String subContextBases = base + parent.getRecalDatum().context; - ContextDatum subContext; - Set> subContexts; - - final RecalDatumNode subNode = findSubcontext(subContextBases, parent); - if ( subNode != null ) { - // we have a subnode corresponding to the expected one, just copy and recurse - subContext = subNode.getRecalDatum(); - subContexts = subNode.getSubnodes(); - } else { - // have to create a new one - subContext = new ContextDatum(debugWriteN ? ("N" + parent.getRecalDatum().context ) : subContextBases, - parent.getRecalDatum().getNumObservations(), parent.getRecalDatum().getNumMismatches()); - subContexts = Collections.emptySet(); - } - - newParent.addSubnode( - fillToDepthRec(new RecalDatumNode(subContext, subContexts), - fillDepth, currentDepth + 1, debugWriteN)); - } - return newParent; - } else { - return parent; - } - } - - /** - * Go from a flat list of contexts to the tree implied by the contexts - * - * Implicit nodes are created as needed, and their observation and error counts are the sum of the - * all of their subnodes. - * - * Note this does not guarentee the tree is complete, as some contexts (e.g., AAT) may be missing - * from the tree because they are absent from the input list of contexts. - * - * For input AAG, AAT, AC, G would produce the following tree: - * - * - x [root] - * - A - * - A - * - T - * - G - * - C - * - G - * - * sets the fixed penalties in the resulting tree as well - * - * @param flatContexts list of flat contexts - * @return - */ - public static RecalDatumNode createTreeFromFlatContexts(final List flatContexts) { - if ( flatContexts == null || flatContexts.isEmpty() ) - throw new IllegalArgumentException("flatContexts cannot be empty or null"); - - final Queue> remaining = new LinkedList>(); - final Map> contextToNodes = new HashMap>(); - RecalDatumNode root = null; - - // initialize -- start with all of the contexts - for ( final ContextDatum cd : flatContexts ) - remaining.add(new RecalDatumNode(cd)); - - while ( remaining.peek() != null ) { - final RecalDatumNode add = remaining.poll(); - final ContextDatum cd = add.getRecalDatum(); - - final String parentContext = cd.getParentContext(); - RecalDatumNode parent = contextToNodes.get(parentContext); - if ( parent == null ) { - // haven't yet found parent, so make one, and enqueue it for processing - parent = new RecalDatumNode(new ContextDatum(parentContext, 0, 0)); - contextToNodes.put(parentContext, parent); - - if ( ! parent.getRecalDatum().isRootContext() ) - remaining.add(parent); - else - root = parent; - } - - parent.getRecalDatum().incrementNumObservations(cd.getNumObservations()); - parent.getRecalDatum().incrementNumMismatches(cd.getNumMismatches()); - parent.addSubnode(add); - } - - if ( root == null ) - throw new RuntimeException("root is unexpectedly null"); - - // set the fixed penalty everywhere in the tree, so that future modifications don't change the penalties - root.calcAndSetFixedPenalty(true); - - return root; - } - - /** - * Finds immediate subnode with contextToFind, or null if none exists - * - * @param tree whose subnodes should be searched - * @return - */ - public static RecalDatumNode findSubcontext(final String contextToFind, final RecalDatumNode tree) { - for ( final RecalDatumNode sub : tree.getSubnodes() ) - if ( sub.getRecalDatum().context.equals(contextToFind) ) - return sub; - return null; - } -} From 44f160f29fcec834507ae67b7afbfb4aa77c8b3e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 6 Aug 2012 11:42:36 -0400 Subject: [PATCH 4/4] indelGOP and indelGCP are now advanced, not hidden arguments --- .../gatk/walkers/genotyper/UnifiedArgumentCollection.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 69f1176cc..a885d8a58 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -146,12 +146,12 @@ public class UnifiedArgumentCollection { @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; - @Hidden - @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) + @Advanced + @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty, as Phred-scaled probability. I.e., 30 => 10^-30/10", required = false) public byte INDEL_GAP_CONTINUATION_PENALTY = 10; - @Hidden - @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) + @Advanced + @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty, as Phred-scaled probability. I.e., 30 => 10^-30/10", required = false) public byte INDEL_GAP_OPEN_PENALTY = 45; @Hidden