Improvement to RecalDatum and VisualizeContextTree

-- Reorganize functions in RecalDatum so that error rate can be computed indepentently. Added unit tests. Removed equals() method, which is a buggy without it's associated implementation for hashcode -- New class RecalDatumTree based on QualIntervals that inherits from RecalDatum but includes the concept of sub data -- VisualizeContextTree now uses RecalDatumTree and can trivially compute the penalty function for merging nodes, which it displays in the graph
2012-07-29 13:13:13 -04:00 · 2012-07-29 13:13:13 -04:00 · 315d25409f
parent 57b45bfb1e
commit 315d25409f
2 changed files with 133 additions and 28 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
@ -136,6 +136,14 @@ public class RecalDatum {
        this.estimatedQReported = estimatedQReported;
    }

+    public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
+        final Random random = new Random();
+        final int nObservations = random.nextInt(maxObservations);
+        final int nErrors = random.nextInt(maxErrors);
+        final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE);
+        return new RecalDatum(nObservations, nErrors, (byte)qual);
+    }
+
    public final double getEstimatedQReported() {
        return estimatedQReported;
    }
@ -143,6 +151,29 @@ public class RecalDatum {
        return (byte)(int)(Math.round(getEstimatedQReported()));
    }

+    //---------------------------------------------------------------------------------------------------------------
+    //
+    // Empirical quality score -- derived from the num mismatches and observations
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
+    /**
+     * Returns the error rate (in real space) of this interval, or 0 if there are no obserations
+     * @return the empirical error rate ~= N errors / N obs
+     */
+    @Ensures("result >= 0.0")
+    public double getEmpiricalErrorRate() {
+        if ( numObservations == 0 )
+            return 0.0;
+        else {
+            // cache the value so we don't call log over and over again
+            final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT);
+            // smoothing is one error and one non-error observation, for example
+            final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT);
+            return doubleMismatches / doubleObservations;
+        }
+    }
+
    public synchronized void setEmpiricalQuality(final double empiricalQuality) {
        if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0");
        if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite");
@ -157,6 +188,16 @@ public class RecalDatum {
        return empiricalQuality;
    }

+    public final byte getEmpiricalQualityAsByte() {
+        return (byte)(Math.round(getEmpiricalQuality()));
+    }
+
+        //---------------------------------------------------------------------------------------------------------------
+    //
+    // increment methods
+    //
+    //---------------------------------------------------------------------------------------------------------------
+
    @Override
    public String toString() {
        return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality()));
@ -166,29 +207,21 @@ public class RecalDatum {
        return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported());
    }

-    public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
-        final Random random = new Random();
-        final int nObservations = random.nextInt(maxObservations);
-        final int nErrors = random.nextInt(maxErrors);
-        final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE);
-        return new RecalDatum(nObservations, nErrors, (byte)qual);
-    }
-
-    /**
-     * We don't compare the estimated quality reported because it may be different when read from
-     * report tables.
-     *
-     * @param o the other recal datum
-     * @return true if the two recal datums have the same number of observations, errors and empirical quality.
-     */
-    @Override
-    public boolean equals(Object o) {
-        if (!(o instanceof RecalDatum))
-            return false;
-        RecalDatum other = (RecalDatum) o;
-        return super.equals(o) &&
-               MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
-    }
+//    /**
+//     * We don't compare the estimated quality reported because it may be different when read from
+//     * report tables.
+//     *
+//     * @param o the other recal datum
+//     * @return true if the two recal datums have the same number of observations, errors and empirical quality.
+//     */
+//    @Override
+//    public boolean equals(Object o) {
+//        if (!(o instanceof RecalDatum))
+//            return false;
+//        RecalDatum other = (RecalDatum) o;
+//        return super.equals(o) &&
+//               MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
+//    }

    //---------------------------------------------------------------------------------------------------------------
    //
@ -255,11 +288,7 @@ public class RecalDatum {
    @Requires("empiricalQuality == UNINITIALIZED")
    @Ensures("empiricalQuality != UNINITIALIZED")
    private synchronized final void calcEmpiricalQuality() {
-        // cache the value so we don't call log over and over again
-        final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT);
-        // smoothing is one error and one non-error observation, for example
-        final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT);
-        final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
+        final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
        empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
    }

--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java
@ -0,0 +1,76 @@
+package org.broadinstitute.sting.utils.recalibration;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one
+ *
+ * @author Mark DePristo
+ * @since 07/27/12
+ */
+public class RecalDatumTree extends RecalDatum {
+    final Set<RecalDatumTree> subnodes;
+
+    protected RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual) {
+        this(nObservations, nErrors, reportedQual, new HashSet<RecalDatumTree>());
+    }
+
+    public RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual, final Set<RecalDatumTree> subnodes) {
+        super(nObservations, nErrors, reportedQual);
+        this.subnodes = new HashSet<RecalDatumTree>(subnodes);
+    }
+
+    public double getPenalty() {
+        return calcPenalty(getEmpiricalErrorRate());
+    }
+
+    public void addSubnode(final RecalDatumTree sub) {
+        subnodes.add(sub);
+    }
+
+    public boolean isLeaf() {
+        return subnodes.isEmpty();
+    }
+
+    /**
+     * Calculate the penalty of this interval, given the overall error rate for the interval
+     *
+     * If the globalErrorRate is e, this value is:
+     *
+     * sum_i |log10(e_i) - log10(e)| * nObservations_i
+     *
+     * each the index i applies to all leaves of the tree accessible from this interval
+     * (found recursively from subnodes as necessary)
+     *
+     * @param globalErrorRate overall error rate in real space against which we calculate the penalty
+     * @return the cost of approximating the bins in this interval with the globalErrorRate
+     */
+    @Requires("globalErrorRate >= 0.0")
+    @Ensures("result >= 0.0")
+    private double calcPenalty(final double globalErrorRate) {
+        if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
+            return 0.0;
+
+        if ( isLeaf() ) {
+            // this is leave node
+            return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
+            // TODO -- how we can generalize this calculation?
+//            if ( this.qEnd <= minInterestingQual )
+//                // It's free to merge up quality scores below the smallest interesting one
+//                return 0;
+//            else {
+//                return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
+//            }
+        } else {
+            double sum = 0;
+            for ( final RecalDatumTree hrd : subnodes)
+                sum += hrd.calcPenalty(globalErrorRate);
+            return sum;
+        }
+    }
+}