From 7bf5ca51eee6c8157e1b8e8a76aad88eafeee0c4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 4 Aug 2012 11:28:17 -0400 Subject: [PATCH] Major bugfix for adaptive contexts -- Basically I was treating the context history in the wrong direction, effectively predicting the further bases in the context based on the closer one. Totally backward. Updated the code to build the tree in the right direction. -- Added a few more useful outputs for analysis (minPenalty and maxPenalty) -- Misc. cleanup of the code -- Overall I'm not 100% certain this is even the right way to think about the problem. Clearly this is producing a reasonable output but the sum of chi2 values over the entire tree is just enormous. Perhaps a MCMC convergence / sampling criterion would be a better way to think about this problem? --- .../utils/recalibration/AdaptiveContext.java | 37 +++++++++++++++---- .../utils/recalibration/RecalDatumNode.java | 22 +++++++++++ 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java index 083b8af64..b8a2eaf05 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/AdaptiveContext.java @@ -21,22 +21,42 @@ public class AdaptiveContext { * observation and error counts of their parent. * * This algorithm produces data consistent with the standard output in a BQSR recal - * file for the Context covariate + * file for the Context covariate. + * + * Suppose you have input tree + * + * - x + * - A + * - C + * - T + * - C + * - T + * + * This will produce the following contexts with fill depth of 2 + * + * AA <- gets A info + * CA <- gets CA info + * GA <- gets A info + * TA <- get TA info + * .. for all 16 contexts * * @param root * @param fillDepth * @return */ - public static RecalDatumNode fillToDepth(final RecalDatumNode root, final int fillDepth) { + public static RecalDatumNode fillToDepth(final RecalDatumNode root, + final int fillDepth, + final boolean debugWriteN) { if ( root == null ) throw new IllegalArgumentException("root is null"); if ( fillDepth < 0 ) throw new IllegalArgumentException("fillDepth is < 0"); - return fillToDepthRec(root, fillDepth, 0); + return fillToDepthRec(root, fillDepth, 0, debugWriteN); } private static RecalDatumNode fillToDepthRec(final RecalDatumNode parent, final int fillDepth, - final int currentDepth) { + final int currentDepth, + final boolean debugWriteN) { // three cases: // We are in the tree and so just recursively build // We have reached our depth goal, so just return the parent since we are done @@ -47,24 +67,25 @@ public class AdaptiveContext { final RecalDatumNode newParent = new RecalDatumNode(parent.getRecalDatum()); for ( final String base : Arrays.asList("A", "C", "G", "T")) { + final String subContextBases = base + parent.getRecalDatum().context; ContextDatum subContext; Set> subContexts; - final RecalDatumNode subNode = findSubcontext(parent.getRecalDatum().context + base, parent); + final RecalDatumNode subNode = findSubcontext(subContextBases, parent); if ( subNode != null ) { // we have a subnode corresponding to the expected one, just copy and recurse subContext = subNode.getRecalDatum(); subContexts = subNode.getSubnodes(); } else { // have to create a new one - subContext = new ContextDatum(parent.getRecalDatum().context + base, + subContext = new ContextDatum(debugWriteN ? ("N" + parent.getRecalDatum().context ) : subContextBases, parent.getRecalDatum().getNumObservations(), parent.getRecalDatum().getNumMismatches()); subContexts = Collections.emptySet(); } newParent.addSubnode( fillToDepthRec(new RecalDatumNode(subContext, subContexts), - fillDepth, currentDepth + 1)); + fillDepth, currentDepth + 1, debugWriteN)); } return newParent; } else { @@ -119,7 +140,7 @@ public class AdaptiveContext { parent = new RecalDatumNode(new ContextDatum(parentContext, 0, 0)); contextToNodes.put(parentContext, parent); - if ( parentContext != ContextDatum.ROOT_CONTEXT ) + if ( ! parent.getRecalDatum().isRootContext() ) remaining.add(parent); else root = parent; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index e6ec6a520..e792a808d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -165,6 +165,28 @@ public class RecalDatumNode { } } + /** + * The maximum penalty among all nodes + * @return + */ + public double maxPenalty() { + double max = getPenalty(); + for ( final RecalDatumNode sub : subnodes ) + max = Math.max(max, sub.maxPenalty()); + return max; + } + + /** + * The minimum penalty among all nodes + * @return + */ + public double minPenalty() { + double min = getPenalty(); + for ( final RecalDatumNode sub : subnodes ) + min = Math.min(min, sub.minPenalty()); + return min; + } + /** * What's the longest branch from this node to any leaf? * @return