diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 6ee5df68b..249422c17 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -136,6 +136,14 @@ public class RecalDatum { this.estimatedQReported = estimatedQReported; } + public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = random.nextInt(maxErrors); + final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); + return new RecalDatum(nObservations, nErrors, (byte)qual); + } + public final double getEstimatedQReported() { return estimatedQReported; } @@ -143,6 +151,29 @@ public class RecalDatum { return (byte)(int)(Math.round(getEstimatedQReported())); } + //--------------------------------------------------------------------------------------------------------------- + // + // Empirical quality score -- derived from the num mismatches and observations + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * @return the empirical error rate ~= N errors / N obs + */ + @Ensures("result >= 0.0") + public double getEmpiricalErrorRate() { + if ( numObservations == 0 ) + return 0.0; + else { + // cache the value so we don't call log over and over again + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + // smoothing is one error and one non-error observation, for example + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); + return doubleMismatches / doubleObservations; + } + } + public synchronized void setEmpiricalQuality(final double empiricalQuality) { if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); @@ -157,6 +188,16 @@ public class RecalDatum { return empiricalQuality; } + public final byte getEmpiricalQualityAsByte() { + return (byte)(Math.round(getEmpiricalQuality())); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + @Override public String toString() { return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality())); @@ -166,29 +207,21 @@ public class RecalDatum { return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); } - public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); - final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); - return new RecalDatum(nObservations, nErrors, (byte)qual); - } - - /** - * We don't compare the estimated quality reported because it may be different when read from - * report tables. - * - * @param o the other recal datum - * @return true if the two recal datums have the same number of observations, errors and empirical quality. - */ - @Override - public boolean equals(Object o) { - if (!(o instanceof RecalDatum)) - return false; - RecalDatum other = (RecalDatum) o; - return super.equals(o) && - MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; - } +// /** +// * We don't compare the estimated quality reported because it may be different when read from +// * report tables. +// * +// * @param o the other recal datum +// * @return true if the two recal datums have the same number of observations, errors and empirical quality. +// */ +// @Override +// public boolean equals(Object o) { +// if (!(o instanceof RecalDatum)) +// return false; +// RecalDatum other = (RecalDatum) o; +// return super.equals(o) && +// MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; +// } //--------------------------------------------------------------------------------------------------------------- // @@ -255,11 +288,7 @@ public class RecalDatum { @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") private synchronized final void calcEmpiricalQuality() { - // cache the value so we don't call log over and over again - final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); - // smoothing is one error and one non-error observation, for example - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); - final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java new file mode 100644 index 000000000..210ea53bf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java @@ -0,0 +1,76 @@ +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one + * + * @author Mark DePristo + * @since 07/27/12 + */ +public class RecalDatumTree extends RecalDatum { + final Set subnodes; + + protected RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual) { + this(nObservations, nErrors, reportedQual, new HashSet()); + } + + public RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual, final Set subnodes) { + super(nObservations, nErrors, reportedQual); + this.subnodes = new HashSet(subnodes); + } + + public double getPenalty() { + return calcPenalty(getEmpiricalErrorRate()); + } + + public void addSubnode(final RecalDatumTree sub) { + subnodes.add(sub); + } + + public boolean isLeaf() { + return subnodes.isEmpty(); + } + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subnodes as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenalty(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( isLeaf() ) { + // this is leave node + return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); + // TODO -- how we can generalize this calculation? +// if ( this.qEnd <= minInterestingQual ) +// // It's free to merge up quality scores below the smallest interesting one +// return 0; +// else { +// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); +// } + } else { + double sum = 0; + for ( final RecalDatumTree hrd : subnodes) + sum += hrd.calcPenalty(globalErrorRate); + return sum; + } + } +}