Improvement to RecalDatum and VisualizeContextTree
-- Reorganize functions in RecalDatum so that error rate can be computed indepentently. Added unit tests. Removed equals() method, which is a buggy without it's associated implementation for hashcode -- New class RecalDatumTree based on QualIntervals that inherits from RecalDatum but includes the concept of sub data -- VisualizeContextTree now uses RecalDatumTree and can trivially compute the penalty function for merging nodes, which it displays in the graph
This commit is contained in:
parent
57b45bfb1e
commit
315d25409f
|
|
@ -136,6 +136,14 @@ public class RecalDatum {
|
|||
this.estimatedQReported = estimatedQReported;
|
||||
}
|
||||
|
||||
public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
|
||||
final Random random = new Random();
|
||||
final int nObservations = random.nextInt(maxObservations);
|
||||
final int nErrors = random.nextInt(maxErrors);
|
||||
final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE);
|
||||
return new RecalDatum(nObservations, nErrors, (byte)qual);
|
||||
}
|
||||
|
||||
public final double getEstimatedQReported() {
|
||||
return estimatedQReported;
|
||||
}
|
||||
|
|
@ -143,6 +151,29 @@ public class RecalDatum {
|
|||
return (byte)(int)(Math.round(getEstimatedQReported()));
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Empirical quality score -- derived from the num mismatches and observations
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the error rate (in real space) of this interval, or 0 if there are no obserations
|
||||
* @return the empirical error rate ~= N errors / N obs
|
||||
*/
|
||||
@Ensures("result >= 0.0")
|
||||
public double getEmpiricalErrorRate() {
|
||||
if ( numObservations == 0 )
|
||||
return 0.0;
|
||||
else {
|
||||
// cache the value so we don't call log over and over again
|
||||
final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT);
|
||||
// smoothing is one error and one non-error observation, for example
|
||||
final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT);
|
||||
return doubleMismatches / doubleObservations;
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void setEmpiricalQuality(final double empiricalQuality) {
|
||||
if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0");
|
||||
if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite");
|
||||
|
|
@ -157,6 +188,16 @@ public class RecalDatum {
|
|||
return empiricalQuality;
|
||||
}
|
||||
|
||||
public final byte getEmpiricalQualityAsByte() {
|
||||
return (byte)(Math.round(getEmpiricalQuality()));
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// increment methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality()));
|
||||
|
|
@ -166,29 +207,21 @@ public class RecalDatum {
|
|||
return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported());
|
||||
}
|
||||
|
||||
public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
|
||||
final Random random = new Random();
|
||||
final int nObservations = random.nextInt(maxObservations);
|
||||
final int nErrors = random.nextInt(maxErrors);
|
||||
final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE);
|
||||
return new RecalDatum(nObservations, nErrors, (byte)qual);
|
||||
}
|
||||
|
||||
/**
|
||||
* We don't compare the estimated quality reported because it may be different when read from
|
||||
* report tables.
|
||||
*
|
||||
* @param o the other recal datum
|
||||
* @return true if the two recal datums have the same number of observations, errors and empirical quality.
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof RecalDatum))
|
||||
return false;
|
||||
RecalDatum other = (RecalDatum) o;
|
||||
return super.equals(o) &&
|
||||
MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
|
||||
}
|
||||
// /**
|
||||
// * We don't compare the estimated quality reported because it may be different when read from
|
||||
// * report tables.
|
||||
// *
|
||||
// * @param o the other recal datum
|
||||
// * @return true if the two recal datums have the same number of observations, errors and empirical quality.
|
||||
// */
|
||||
// @Override
|
||||
// public boolean equals(Object o) {
|
||||
// if (!(o instanceof RecalDatum))
|
||||
// return false;
|
||||
// RecalDatum other = (RecalDatum) o;
|
||||
// return super.equals(o) &&
|
||||
// MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
|
||||
// }
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -255,11 +288,7 @@ public class RecalDatum {
|
|||
@Requires("empiricalQuality == UNINITIALIZED")
|
||||
@Ensures("empiricalQuality != UNINITIALIZED")
|
||||
private synchronized final void calcEmpiricalQuality() {
|
||||
// cache the value so we don't call log over and over again
|
||||
final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT);
|
||||
// smoothing is one error and one non-error observation, for example
|
||||
final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT);
|
||||
final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||
final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
|
||||
empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 07/27/12
|
||||
*/
|
||||
public class RecalDatumTree extends RecalDatum {
|
||||
final Set<RecalDatumTree> subnodes;
|
||||
|
||||
protected RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual) {
|
||||
this(nObservations, nErrors, reportedQual, new HashSet<RecalDatumTree>());
|
||||
}
|
||||
|
||||
public RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual, final Set<RecalDatumTree> subnodes) {
|
||||
super(nObservations, nErrors, reportedQual);
|
||||
this.subnodes = new HashSet<RecalDatumTree>(subnodes);
|
||||
}
|
||||
|
||||
public double getPenalty() {
|
||||
return calcPenalty(getEmpiricalErrorRate());
|
||||
}
|
||||
|
||||
public void addSubnode(final RecalDatumTree sub) {
|
||||
subnodes.add(sub);
|
||||
}
|
||||
|
||||
public boolean isLeaf() {
|
||||
return subnodes.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the penalty of this interval, given the overall error rate for the interval
|
||||
*
|
||||
* If the globalErrorRate is e, this value is:
|
||||
*
|
||||
* sum_i |log10(e_i) - log10(e)| * nObservations_i
|
||||
*
|
||||
* each the index i applies to all leaves of the tree accessible from this interval
|
||||
* (found recursively from subnodes as necessary)
|
||||
*
|
||||
* @param globalErrorRate overall error rate in real space against which we calculate the penalty
|
||||
* @return the cost of approximating the bins in this interval with the globalErrorRate
|
||||
*/
|
||||
@Requires("globalErrorRate >= 0.0")
|
||||
@Ensures("result >= 0.0")
|
||||
private double calcPenalty(final double globalErrorRate) {
|
||||
if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
|
||||
return 0.0;
|
||||
|
||||
if ( isLeaf() ) {
|
||||
// this is leave node
|
||||
return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
|
||||
// TODO -- how we can generalize this calculation?
|
||||
// if ( this.qEnd <= minInterestingQual )
|
||||
// // It's free to merge up quality scores below the smallest interesting one
|
||||
// return 0;
|
||||
// else {
|
||||
// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
|
||||
// }
|
||||
} else {
|
||||
double sum = 0;
|
||||
for ( final RecalDatumTree hrd : subnodes)
|
||||
sum += hrd.calcPenalty(globalErrorRate);
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue