Preliminary version of adaptive context covariate algorithm

-- Works according to visual inspection of output tree
2012-07-30 08:31:38 -04:00 · 2012-07-30 08:31:38 -04:00 · 93640b382e
parent 315d25409f
commit 93640b382e
2 changed files with 213 additions and 76 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java
@ -0,0 +1,213 @@
 package org.broadinstitute.sting.utils.recalibration;
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.collections.Pair;
 import java.util.HashSet;
 import java.util.Set;
 /**
 * A tree of recal datum, where each node holds one datum plus a set of subnodes representing sub-states of it
 *
 * The penalty of a node is the cost of approximating all leaves below it with the node's own empirical
 * error rate (see calcPenalty).  The pruning methods never modify the tree they are called on; they build
 * and return new nodes that reference the same recal datum objects.
 *
 * @author Mark DePristo
 * @since 07/27/12
 */
 public class RecalDatumNode<T extends RecalDatum> {
    protected static Logger logger = Logger.getLogger(RecalDatumNode.class);

    /** Sentinel stored in fixedPenalty while no penalty has been cached for the node */
    private final static double UNINITIALIZED = -1.0;

    private final T recalDatum;
    private double fixedPenalty = UNINITIALIZED;
    private final Set<RecalDatumNode<T>> subnodes;

    /**
     * Create a leaf node without a cached penalty
     *
     * @param recalDatum the datum held by this node
     */
    public RecalDatumNode(final T recalDatum) {
        this(recalDatum, new HashSet<RecalDatumNode<T>>());
    }

    /**
     * Create a node with the given subnodes and without a cached penalty
     *
     * @param recalDatum the datum held by this node
     * @param subnodes   the initial subnodes; the set is copied
     */
    public RecalDatumNode(final T recalDatum, final Set<RecalDatumNode<T>> subnodes) {
        this(recalDatum, UNINITIALIZED, subnodes);
    }

    /**
     * Create a leaf node with an already known penalty
     *
     * @param recalDatum   the datum held by this node
     * @param fixedPenalty the penalty to cache, or UNINITIALIZED for none
     */
    protected RecalDatumNode(final T recalDatum, final double fixedPenalty) {
        this(recalDatum, fixedPenalty, new HashSet<RecalDatumNode<T>>());
    }

    /**
     * Create a node with the given subnodes and an already known penalty
     *
     * @param recalDatum   the datum held by this node
     * @param fixedPenalty the penalty to cache, or UNINITIALIZED for none
     * @param subnodes     the initial subnodes; the set is copied
     */
    protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set<RecalDatumNode<T>> subnodes) {
        this.recalDatum = recalDatum;
        this.fixedPenalty = fixedPenalty;
        this.subnodes = new HashSet<RecalDatumNode<T>>(subnodes);
    }

    @Override
    public String toString() {
        return recalDatum.toString();
    }

    /**
     * @return the datum held by this node
     */
    public T getRecalDatum() {
        return recalDatum;
    }

    /**
     * @return the live internal set of subnodes (not a copy), so changes to it change this node
     */
    public Set<RecalDatumNode<T>> getSubnodes() {
        return subnodes;
    }

    /**
     * Get the penalty of this node
     *
     * @return the cached penalty if one has been set, otherwise a freshly calculated one (which is not cached)
     */
    public double getPenalty() {
        if ( fixedPenalty != UNINITIALIZED )
            return fixedPenalty;
        else
            return calcPenalty(recalDatum.getEmpiricalErrorRate());
    }

    /**
     * Calculate the penalty of this node and cache it, so that getPenalty() stops recalculating it
     *
     * @param doEntireTree if true, do the same for every node below this one
     * @return the penalty that was cached for this node
     */
    public double calcAndSetFixedPenalty(final boolean doEntireTree) {
        fixedPenalty = calcPenalty(recalDatum.getEmpiricalErrorRate());
        if ( doEntireTree )
            for ( final RecalDatumNode<T> sub : subnodes )
                sub.calcAndSetFixedPenalty(doEntireTree);
        return fixedPenalty;
    }

    /**
     * Add sub as a direct subnode of this node
     *
     * @param sub the node to add
     */
    public void addSubnode(final RecalDatumNode<T> sub) {
        subnodes.add(sub);
    }

    /**
     * @return true if this node has no subnodes
     */
    public boolean isLeaf() {
        return subnodes.isEmpty();
    }

    /**
     * @return the number of direct subnodes of this node
     */
    public int getNumBranches() {
        return subnodes.size();
    }

    /**
     * Find the smallest penalty among the non-leaf nodes of the tree rooted here (this node included)
     *
     * @return the smallest such penalty, or Double.MAX_VALUE if this node is a leaf
     */
    public double getMinNodePenalty() {
        if ( isLeaf() )
            return Double.MAX_VALUE;
        else {
            double minPenalty = getPenalty();
            for ( final RecalDatumNode<T> sub : subnodes )
                minPenalty = Math.min(minPenalty, sub.getMinNodePenalty());
            return minPenalty;
        }
    }

    /**
     * @return the number of nodes on the longest path from this node down to a leaf; 1 for a leaf
     */
    public int maxDepth() {
        int subMax = 0;
        for ( final RecalDatumNode<T> sub : subnodes )
            subMax = Math.max(subMax, sub.maxDepth());
        return subMax + 1;
    }

    /**
     * @return the total number of nodes in the tree rooted here, counting this node
     */
    public int size() {
        int size = 1;
        for ( final RecalDatumNode<T> sub : subnodes )
            size += sub.size();
        return size;
    }

    /**
     * Calculate the penalty of this interval, given the overall error rate for the interval
     *
     * If the globalErrorRate is e, this value is:
     *
     * sum_i |log10(e_i) - log10(e)| * nObservations_i
     *
     * where the index i runs over all leaves of the tree accessible from this interval
     * (found recursively from subnodes as necessary)
     *
     * @param globalErrorRate overall error rate in real space against which we calculate the penalty
     * @return the cost of approximating the bins in this interval with the globalErrorRate
     */
    @Requires("globalErrorRate >= 0.0")
    @Ensures("result >= 0.0")
    private double calcPenalty(final double globalErrorRate) {
        if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
            return 0.0;

        if ( isLeaf() ) {
            // A leaf without observations has zero weight in the sum.  Returning here also avoids
            // evaluating infinity * 0 (which is NaN) should such a leaf report an error rate of 0.0.
            if ( recalDatum.getNumObservations() == 0 )
                return 0.0;

            // TODO -- how can we generalize this calculation?  One idea is to make it free to merge
            // TODO -- quality scores below the smallest interesting one
            return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * recalDatum.getNumObservations();
        } else {
            double sum = 0;
            for ( final RecalDatumNode<T> hrd : subnodes)
                sum += hrd.calcPenalty(globalErrorRate);
            return sum;
        }
    }

    /**
     * Create a copy of this tree that is cut off below the given depth
     *
     * @param maxDepth the maximum depth of the result, where 1 keeps only this node
     * @return a new tree whose nodes reuse the recal datum and cached penalty of the originals
     * @throws IllegalArgumentException if maxDepth < 1
     */
    public RecalDatumNode<T> pruneToDepth(final int maxDepth) {
        if ( maxDepth < 1 )
            throw new IllegalArgumentException("maxDepth < 1");
        else {
            final Set<RecalDatumNode<T>> subPruned = new HashSet<RecalDatumNode<T>>(getNumBranches());
            if ( maxDepth > 1 )
                for ( final RecalDatumNode<T> sub : subnodes )
                    subPruned.add(sub.pruneToDepth(maxDepth - 1));
            return new RecalDatumNode<T>(getRecalDatum(), fixedPenalty, subPruned);
        }
    }

    /**
     * Repeatedly collapse the lowest penalty non-leaf node until the tree has at most maxElements nodes
     *
     * @param maxElements the maximum number of nodes allowed in the result, at least 1
     * @return this node if it is already small enough, otherwise a new pruned tree
     * @throws IllegalArgumentException if maxElements < 1, as a tree always has at least one node
     * @throws IllegalStateException if the tree is too big but no node to collapse can be found
     */
    public RecalDatumNode<T> pruneByPenalty(final int maxElements) {
        if ( maxElements < 1 )
            throw new IllegalArgumentException("maxElements < 1");

        RecalDatumNode<T> root = this;
        while ( root.size() > maxElements ) {
            // remove the lowest penalty element, and continue
            root = root.removeLowestPenaltyNode();
        }

        // our size is below the target, so we are good, return
        return root;
    }

    /**
     * Find the lowest penalty non-leaf node in the tree, and return a tree where it has become a leaf
     *
     * @return a new tree that is smaller than this one
     * @throws IllegalStateException if no node could be collapsed, which would otherwise make
     *                               pruneByPenalty loop forever
     */
    private RecalDatumNode<T> removeLowestPenaltyNode() {
        final Pair<RecalDatumNode<T>, Boolean> result = removeFirstNodeWithPenalty(getMinNodePenalty());
        if ( ! result.getSecond() )
            throw new IllegalStateException("Couldn't find a node to remove from the tree rooted at " + this);
        return result.getFirst();
    }

    /**
     * Collapse the first non-leaf node found whose penalty equals penaltyToRemove
     *
     * Leaves are never candidates, even when their penalty happens to equal penaltyToRemove,
     * since collapsing a leaf cannot make the tree any smaller.
     *
     * @param penaltyToRemove the penalty of the node to collapse
     * @return a copy of the tree rooted here with the matching node turned into a leaf, paired with
     *         true if a node was actually collapsed
     */
    private Pair<RecalDatumNode<T>, Boolean> removeFirstNodeWithPenalty(final double penaltyToRemove) {
        if ( ! isLeaf() && getPenalty() == penaltyToRemove ) {
            logger.info("Removing " + this + " with penalty " + penaltyToRemove);

            // node is the thing we are going to remove, but without any subnodes
            final RecalDatumNode<T> node = new RecalDatumNode<T>(getRecalDatum(), fixedPenalty);
            return new Pair<RecalDatumNode<T>, Boolean>(node, true);
        } else {
            // did we remove something in a sub branch?
            boolean removedSomething = false;

            // our sub nodes with the penalty node removed
            final Set<RecalDatumNode<T>> sub = new HashSet<RecalDatumNode<T>>(getNumBranches());

            for ( final RecalDatumNode<T> sub1 : subnodes ) {
                if ( removedSomething ) {
                    // already removed something, just add sub1 back to sub
                    sub.add(sub1);
                } else {
                    // haven't removed anything yet, so try
                    final Pair<RecalDatumNode<T>, Boolean> maybeRemoved = sub1.removeFirstNodeWithPenalty(penaltyToRemove);
                    removedSomething = maybeRemoved.getSecond();
                    sub.add(maybeRemoved.getFirst());
                }
            }

            final RecalDatumNode<T> node = new RecalDatumNode<T>(getRecalDatum(), fixedPenalty, sub);
            return new Pair<RecalDatumNode<T>, Boolean>(node, removedSomething);
        }
    }
 }
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java
@ -1,76 +0,0 @@
 package org.broadinstitute.sting.utils.recalibration;
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 /**
 * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one
 *
 * @author Mark DePristo
 * @since 07/27/12
 */
 public class RecalDatumTree extends RecalDatum {
    /** The direct sub-states of this datum; empty for a leaf */
    final Set<RecalDatumTree> subnodes;

    /**
     * Create a leaf
     *
     * @param nObservations passed on to the RecalDatum constructor
     * @param nErrors       passed on to the RecalDatum constructor
     * @param reportedQual  passed on to the RecalDatum constructor
     */
    protected RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual) {
        this(nObservations, nErrors, reportedQual, new HashSet<RecalDatumTree>());
    }

    /**
     * Create a node with the given subnodes
     *
     * @param nObservations passed on to the RecalDatum constructor
     * @param nErrors       passed on to the RecalDatum constructor
     * @param reportedQual  passed on to the RecalDatum constructor
     * @param subnodes      the initial subnodes; the set is copied
     */
    public RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual, final Set<RecalDatumTree> subnodes) {
        super(nObservations, nErrors, reportedQual);
        this.subnodes = new HashSet<RecalDatumTree>(subnodes);
    }

    /**
     * @return the penalty of this node measured against its own empirical error rate; calculated on every call
     */
    public double getPenalty() {
        return calcPenalty(getEmpiricalErrorRate());
    }

    /**
     * Add sub as a direct subnode of this node
     *
     * @param sub the node to add
     */
    public void addSubnode(final RecalDatumTree sub) {
        subnodes.add(sub);
    }

    /**
     * @return true if this node has no subnodes
     */
    public boolean isLeaf() {
        return subnodes.isEmpty();
    }

    /**
     * Calculate the penalty of this interval, given the overall error rate for the interval
     *
     * If the globalErrorRate is e, this value is:
     *
     * sum_i |log10(e_i) - log10(e)| * nObservations_i
     *
     * where the index i runs over all leaves of the tree accessible from this interval
     * (found recursively from subnodes as necessary)
     *
     * @param globalErrorRate overall error rate in real space against which we calculate the penalty
     * @return the cost of approximating the bins in this interval with the globalErrorRate
     */
    @Requires("globalErrorRate >= 0.0")
    @Ensures("result >= 0.0")
    private double calcPenalty(final double globalErrorRate) {
        if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
            return 0.0;

        if ( isLeaf() ) {
            // A leaf without observations has zero weight in the sum.  Returning here also avoids
            // evaluating infinity * 0 (which is NaN) should such a leaf report an error rate of 0.0.
            if ( getNumObservations() == 0 )
                return 0.0;

            // TODO -- how can we generalize this calculation?  One idea is to make it free to merge
            // TODO -- quality scores below the smallest interesting one
            return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
        } else {
            double sum = 0;
            for ( final RecalDatumTree hrd : subnodes)
                sum += hrd.calcPenalty(globalErrorRate);
            return sum;
        }
    }
 }