Ready for full-scale evaluation adaptive BQSR contexts
-- VisualizeContextTree now can write out an equivalent BQSR table determined after adaptive context merging of all RG x QUAL x CONTEXT trees -- Docs, algorithm descriptions, etc so that it makes sense what's going on -- VisualizeContextTree should really be simplified when into a single tool that just visualize the trees when / if we decide to make adaptive contexts standard part of BQSR -- Misc. cleaning, organization of the code (recalibation tests were in private but corresponding actual files were public)
This commit is contained in:
parent
d2e8eb7b23
commit
e1bba91836
|
|
@ -208,11 +208,23 @@ public class GATKReportTable {
|
|||
}
|
||||
|
||||
/**
|
||||
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
|
||||
*
|
||||
* @param name the name of the table or column
|
||||
* @return true if the name is valid, false if otherwise
|
||||
* Create a new GATKReportTable with the same structure
|
||||
* @param tableToCopy
|
||||
*/
|
||||
public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) {
|
||||
this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortByRowID);
|
||||
for ( final GATKReportColumn column : tableToCopy.getColumnInfo() )
|
||||
addColumn(column.getColumnName(), column.getFormat());
|
||||
if ( copyData )
|
||||
throw new IllegalArgumentException("sorry, copying data in GATKReportTable isn't supported");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
|
||||
*
|
||||
* @param name the name of the table or column
|
||||
* @return true if the name is valid, false if otherwise
|
||||
*/
|
||||
private boolean isValidName(String name) {
|
||||
Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX);
|
||||
Matcher m = p.matcher(name);
|
||||
|
|
@ -490,6 +502,17 @@ public class GATKReportTable {
|
|||
return get(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a value from the given position in the table
|
||||
*
|
||||
* @param rowIndex the row ID
|
||||
* @param columnName the name of the column
|
||||
* @return the value stored at the specified position in the table
|
||||
*/
|
||||
public Object get(final int rowIndex, final String columnName) {
|
||||
return get(rowIndex, columnNameToIndex.get(columnName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a value from the given position in the table
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,154 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Functions for working with AdaptiveContexts
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 8/3/12
|
||||
* Time: 12:21 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class AdaptiveContext {
|
||||
private AdaptiveContext() {}
|
||||
|
||||
/**
|
||||
* Return a freshly allocated tree filled in completely to fillDepth with
|
||||
* all combinations of {A,C,G,T}^filldepth contexts. For nodes
|
||||
* in the tree, they are simply copied. When the algorithm needs to
|
||||
* generate new nodes (because they are missing) the subnodes inherit the
|
||||
* observation and error counts of their parent.
|
||||
*
|
||||
* This algorithm produces data consistent with the standard output in a BQSR recal
|
||||
* file for the Context covariate
|
||||
*
|
||||
* @param root
|
||||
* @param fillDepth
|
||||
* @return
|
||||
*/
|
||||
public static RecalDatumNode<ContextDatum> fillToDepth(final RecalDatumNode<ContextDatum> root, final int fillDepth) {
|
||||
if ( root == null ) throw new IllegalArgumentException("root is null");
|
||||
if ( fillDepth < 0 ) throw new IllegalArgumentException("fillDepth is < 0");
|
||||
|
||||
return fillToDepthRec(root, fillDepth, 0);
|
||||
}
|
||||
|
||||
private static RecalDatumNode<ContextDatum> fillToDepthRec(final RecalDatumNode<ContextDatum> parent,
|
||||
final int fillDepth,
|
||||
final int currentDepth) {
|
||||
// three cases:
|
||||
// We are in the tree and so just recursively build
|
||||
// We have reached our depth goal, so just return the parent since we are done
|
||||
// We are outside of the tree, in which case we need to pointer to our parent node so we can
|
||||
// we info (N, M) and we need a running context
|
||||
if ( currentDepth < fillDepth ) {
|
||||
// we need to create subnodes for each base, and propogate N and M down
|
||||
final RecalDatumNode<ContextDatum> newParent = new RecalDatumNode<ContextDatum>(parent.getRecalDatum());
|
||||
|
||||
for ( final String base : Arrays.asList("A", "C", "G", "T")) {
|
||||
ContextDatum subContext;
|
||||
Set<RecalDatumNode<ContextDatum>> subContexts;
|
||||
|
||||
final RecalDatumNode<ContextDatum> subNode = findSubcontext(parent.getRecalDatum().context + base, parent);
|
||||
if ( subNode != null ) {
|
||||
// we have a subnode corresponding to the expected one, just copy and recurse
|
||||
subContext = subNode.getRecalDatum();
|
||||
subContexts = subNode.getSubnodes();
|
||||
} else {
|
||||
// have to create a new one
|
||||
subContext = new ContextDatum(parent.getRecalDatum().context + base,
|
||||
parent.getRecalDatum().getNumObservations(), parent.getRecalDatum().getNumMismatches());
|
||||
subContexts = Collections.emptySet();
|
||||
}
|
||||
|
||||
newParent.addSubnode(
|
||||
fillToDepthRec(new RecalDatumNode<ContextDatum>(subContext, subContexts),
|
||||
fillDepth, currentDepth + 1));
|
||||
}
|
||||
return newParent;
|
||||
} else {
|
||||
return parent;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Go from a flat list of contexts to the tree implied by the contexts
|
||||
*
|
||||
* Implicit nodes are created as needed, and their observation and error counts are the sum of the
|
||||
* all of their subnodes.
|
||||
*
|
||||
* Note this does not guarentee the tree is complete, as some contexts (e.g., AAT) may be missing
|
||||
* from the tree because they are absent from the input list of contexts.
|
||||
*
|
||||
* For input AAG, AAT, AC, G would produce the following tree:
|
||||
*
|
||||
* - x [root]
|
||||
* - A
|
||||
* - A
|
||||
* - T
|
||||
* - G
|
||||
* - C
|
||||
* - G
|
||||
*
|
||||
* sets the fixed penalties in the resulting tree as well
|
||||
*
|
||||
* @param flatContexts list of flat contexts
|
||||
* @return
|
||||
*/
|
||||
public static RecalDatumNode<ContextDatum> createTreeFromFlatContexts(final List<ContextDatum> flatContexts) {
|
||||
if ( flatContexts == null || flatContexts.isEmpty() )
|
||||
throw new IllegalArgumentException("flatContexts cannot be empty or null");
|
||||
|
||||
final Queue<RecalDatumNode<ContextDatum>> remaining = new LinkedList<RecalDatumNode<ContextDatum>>();
|
||||
final Map<String, RecalDatumNode<ContextDatum>> contextToNodes = new HashMap<String, RecalDatumNode<ContextDatum>>();
|
||||
RecalDatumNode<ContextDatum> root = null;
|
||||
|
||||
// initialize -- start with all of the contexts
|
||||
for ( final ContextDatum cd : flatContexts )
|
||||
remaining.add(new RecalDatumNode<ContextDatum>(cd));
|
||||
|
||||
while ( remaining.peek() != null ) {
|
||||
final RecalDatumNode<ContextDatum> add = remaining.poll();
|
||||
final ContextDatum cd = add.getRecalDatum();
|
||||
|
||||
final String parentContext = cd.getParentContext();
|
||||
RecalDatumNode<ContextDatum> parent = contextToNodes.get(parentContext);
|
||||
if ( parent == null ) {
|
||||
// haven't yet found parent, so make one, and enqueue it for processing
|
||||
parent = new RecalDatumNode<ContextDatum>(new ContextDatum(parentContext, 0, 0));
|
||||
contextToNodes.put(parentContext, parent);
|
||||
|
||||
if ( parentContext != ContextDatum.ROOT_CONTEXT )
|
||||
remaining.add(parent);
|
||||
else
|
||||
root = parent;
|
||||
}
|
||||
|
||||
parent.getRecalDatum().incrementNumObservations(cd.getNumObservations());
|
||||
parent.getRecalDatum().incrementNumMismatches(cd.getNumMismatches());
|
||||
parent.addSubnode(add);
|
||||
}
|
||||
|
||||
if ( root == null )
|
||||
throw new RuntimeException("root is unexpectedly null");
|
||||
|
||||
// set the fixed penalty everywhere in the tree, so that future modifications don't change the penalties
|
||||
root.calcAndSetFixedPenalty(true);
|
||||
|
||||
return root;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds immediate subnode with contextToFind, or null if none exists
|
||||
*
|
||||
* @param tree whose subnodes should be searched
|
||||
* @return
|
||||
*/
|
||||
public static RecalDatumNode<ContextDatum> findSubcontext(final String contextToFind, final RecalDatumNode<ContextDatum> tree) {
|
||||
for ( final RecalDatumNode<ContextDatum> sub : tree.getSubnodes() )
|
||||
if ( sub.getRecalDatum().context.equals(contextToFind) )
|
||||
return sub;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -4,10 +4,12 @@ import com.google.java.contract.Ensures;
|
|||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.math.stat.inference.ChiSquareTestImpl;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
|
|
@ -17,13 +19,18 @@ import java.util.Set;
|
|||
* @since 07/27/12
|
||||
*/
|
||||
public class RecalDatumNode<T extends RecalDatum> {
|
||||
private final static boolean USE_CHI2 = true;
|
||||
protected static Logger logger = Logger.getLogger(RecalDatumNode.class);
|
||||
|
||||
/**
|
||||
* fixedPenalty is this value if it's considered fixed
|
||||
*/
|
||||
private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
private final T recalDatum;
|
||||
private double fixedPenalty = UNINITIALIZED;
|
||||
private final Set<RecalDatumNode<T>> subnodes;
|
||||
|
||||
@Requires({"recalDatum != null"})
|
||||
public RecalDatumNode(final T recalDatum) {
|
||||
this(recalDatum, new HashSet<RecalDatumNode<T>>());
|
||||
}
|
||||
|
|
@ -33,28 +40,45 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return recalDatum.toString();
|
||||
}
|
||||
|
||||
@Requires({"recalDatum != null", "subnodes != null"})
|
||||
public RecalDatumNode(final T recalDatum, final Set<RecalDatumNode<T>> subnodes) {
|
||||
this(recalDatum, UNINITIALIZED, subnodes);
|
||||
}
|
||||
|
||||
@Requires({"recalDatum != null"})
|
||||
protected RecalDatumNode(final T recalDatum, final double fixedPenalty) {
|
||||
this(recalDatum, fixedPenalty, new HashSet<RecalDatumNode<T>>());
|
||||
}
|
||||
|
||||
@Requires({"recalDatum != null", "subnodes != null"})
|
||||
protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set<RecalDatumNode<T>> subnodes) {
|
||||
this.recalDatum = recalDatum;
|
||||
this.fixedPenalty = fixedPenalty;
|
||||
this.subnodes = new HashSet<RecalDatumNode<T>>(subnodes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the recal data associated with this node
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public T getRecalDatum() {
|
||||
return recalDatum;
|
||||
}
|
||||
|
||||
/**
|
||||
* The set of all subnodes of this tree. May be modified.
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Set<RecalDatumNode<T>> getSubnodes() {
|
||||
return subnodes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the fixed penalty, if set, or else the the calculated penalty for this node
|
||||
* @return
|
||||
*/
|
||||
public double getPenalty() {
|
||||
if ( fixedPenalty != UNINITIALIZED )
|
||||
return fixedPenalty;
|
||||
|
|
@ -62,6 +86,17 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return calcPenalty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the fixed penalty for this node to a fresh calculation from calcPenalty
|
||||
*
|
||||
* This is important in the case where you want to compute the penalty from a full
|
||||
* tree and then chop the tree up afterwards while considering the previous penalties.
|
||||
* If you don't call this function then manipulating the tree may result in the
|
||||
* penalty functions changing with changes in the tree.
|
||||
*
|
||||
* @param doEntireTree recurse into all subnodes?
|
||||
* @return the fixed penalty for this node
|
||||
*/
|
||||
public double calcAndSetFixedPenalty(final boolean doEntireTree) {
|
||||
fixedPenalty = calcPenalty();
|
||||
if ( doEntireTree )
|
||||
|
|
@ -70,15 +105,41 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return fixedPenalty;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add node to the set of subnodes of this node
|
||||
* @param sub
|
||||
*/
|
||||
@Requires("sub != null")
|
||||
public void addSubnode(final RecalDatumNode<T> sub) {
|
||||
subnodes.add(sub);
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this a leaf node (i.e., has no subnodes)?
|
||||
* @return
|
||||
*/
|
||||
public boolean isLeaf() {
|
||||
return subnodes.isEmpty();
|
||||
}
|
||||
|
||||
public int getNumBranches() {
|
||||
/**
|
||||
* Is this node immediately above only leaf nodes?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean isAboveOnlyLeaves() {
|
||||
for ( final RecalDatumNode<T> sub : subnodes )
|
||||
if ( ! sub.isLeaf() )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* What's the immediate number of subnodes from this node?
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getNumSubnodes() {
|
||||
return subnodes.size();
|
||||
}
|
||||
|
||||
|
|
@ -89,6 +150,8 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
* definition have 0 penalty unless they represent a pruned tree with underlying -- but now
|
||||
* pruned -- subtrees
|
||||
*
|
||||
* TODO -- can we really just add together the chi2 values?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double totalPenalty() {
|
||||
|
|
@ -102,6 +165,10 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* What's the longest branch from this node to any leaf?
|
||||
* @return
|
||||
*/
|
||||
public int maxDepth() {
|
||||
int subMax = 0;
|
||||
for ( final RecalDatumNode<T> sub : subnodes )
|
||||
|
|
@ -109,6 +176,11 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return subMax + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* What's the shortest branch from this node to any leaf? Includes this node
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result > 0")
|
||||
public int minDepth() {
|
||||
if ( isLeaf() )
|
||||
return 1;
|
||||
|
|
@ -120,6 +192,11 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of nodes, including this one, reachable from this node
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result > 0")
|
||||
public int size() {
|
||||
int size = 1;
|
||||
for ( final RecalDatumNode<T> sub : subnodes )
|
||||
|
|
@ -127,6 +204,12 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count the number of leaf nodes reachable from this node
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int numLeaves() {
|
||||
if ( isLeaf() )
|
||||
return 1;
|
||||
|
|
@ -138,44 +221,37 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the chi^2 penalty among subnodes of this node. The chi^2 value
|
||||
* indicates the degree of independence of the implied error rates among the
|
||||
* immediate subnodes
|
||||
*
|
||||
* @return the chi2 penalty, or 0.0 if it cannot be calculated
|
||||
*/
|
||||
private double calcPenalty() {
|
||||
if ( USE_CHI2 )
|
||||
return calcPenaltyChi2();
|
||||
else
|
||||
return calcPenaltyLog10(getRecalDatum().getEmpiricalErrorRate());
|
||||
}
|
||||
|
||||
private double calcPenaltyChi2() {
|
||||
if ( isLeaf() )
|
||||
return 0.0;
|
||||
else if ( subnodes.size() == 1 )
|
||||
// only one value, so its free to merge away
|
||||
return 0.0;
|
||||
else {
|
||||
final long[][] counts = new long[subnodes.size()][2];
|
||||
|
||||
int i = 0;
|
||||
for ( RecalDatumNode<T> subnode : subnodes ) {
|
||||
counts[i][0] = subnode.getRecalDatum().getNumMismatches();
|
||||
counts[i][1] = subnode.getRecalDatum().getNumObservations();
|
||||
for ( final RecalDatumNode<T> subnode : subnodes ) {
|
||||
// use the yates correction to help avoid all zeros => NaN
|
||||
counts[i][0] = subnode.getRecalDatum().getNumMismatches() + 1;
|
||||
counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2;
|
||||
i++;
|
||||
}
|
||||
|
||||
final double chi2 = new ChiSquareTestImpl().chiSquare(counts);
|
||||
|
||||
// StringBuilder x = new StringBuilder();
|
||||
// StringBuilder y = new StringBuilder();
|
||||
// for ( int k = 0; k < counts.length; k++) {
|
||||
// if ( k != 0 ) {
|
||||
// x.append(", ");
|
||||
// y.append(", ");
|
||||
// }
|
||||
// x.append(counts[k][0]);
|
||||
// y.append(counts[k][1]);
|
||||
// }
|
||||
// logger.info("x = c(" + x.toString() + ")");
|
||||
// logger.info("y = c(" + y.toString() + ")");
|
||||
// logger.info("chi2 = " + chi2);
|
||||
// make sure things are reasonable and fail early if not
|
||||
if (Double.isInfinite(chi2) || Double.isNaN(chi2))
|
||||
throw new ReviewedStingException("chi2 value is " + chi2 + " at " + getRecalDatum());
|
||||
|
||||
return chi2;
|
||||
//return Math.log10(chi2);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -216,11 +292,17 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf
|
||||
*
|
||||
* @param maxDepth
|
||||
* @return
|
||||
*/
|
||||
public RecalDatumNode<T> pruneToDepth(final int maxDepth) {
|
||||
if ( maxDepth < 1 )
|
||||
throw new IllegalArgumentException("maxDepth < 1");
|
||||
else {
|
||||
final Set<RecalDatumNode<T>> subPruned = new HashSet<RecalDatumNode<T>>(getNumBranches());
|
||||
final Set<RecalDatumNode<T>> subPruned = new HashSet<RecalDatumNode<T>>(getNumSubnodes());
|
||||
if ( maxDepth > 1 )
|
||||
for ( final RecalDatumNode<T> sub : subnodes )
|
||||
subPruned.add(sub.pruneToDepth(maxDepth - 1));
|
||||
|
|
@ -228,12 +310,21 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a freshly allocated tree with to no more than maxElements in order of penalty
|
||||
*
|
||||
* Note that nodes must have fixed penalties to this algorithm will fail.
|
||||
*
|
||||
* @param maxElements
|
||||
* @return
|
||||
*/
|
||||
public RecalDatumNode<T> pruneByPenalty(final int maxElements) {
|
||||
RecalDatumNode<T> root = this;
|
||||
|
||||
while ( root.size() > maxElements ) {
|
||||
// remove the lowest penalty element, and continue
|
||||
root = root.removeLowestPenaltyNode();
|
||||
logger.debug("pruneByPenalty root size is now " + root.size() + " of max " + maxElements);
|
||||
}
|
||||
|
||||
// our size is below the target, so we are good, return
|
||||
|
|
@ -241,15 +332,15 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Find the lowest penalty node in the tree, and return a tree without it
|
||||
* Find the lowest penalty above leaf node in the tree, and return a tree without it
|
||||
*
|
||||
* Note this excludes the current (root) node
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private RecalDatumNode<T> removeLowestPenaltyNode() {
|
||||
final Pair<RecalDatumNode<T>, Double> nodeToRemove = getMinPenaltyNode();
|
||||
logger.info("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
|
||||
final Pair<RecalDatumNode<T>, Double> nodeToRemove = getMinPenaltyAboveLeafNode();
|
||||
//logger.info("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
|
||||
|
||||
final Pair<RecalDatumNode<T>, Boolean> result = removeNode(nodeToRemove.getFirst());
|
||||
|
||||
|
|
@ -262,20 +353,37 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return oneRemoved;
|
||||
}
|
||||
|
||||
private Pair<RecalDatumNode<T>, Double> getMinPenaltyNode() {
|
||||
final double myValue = isLeaf() ? Double.MAX_VALUE : getPenalty();
|
||||
Pair<RecalDatumNode<T>, Double> maxNode = new Pair<RecalDatumNode<T>, Double>(this, myValue);
|
||||
|
||||
for ( final RecalDatumNode<T> sub : subnodes ) {
|
||||
final Pair<RecalDatumNode<T>, Double> subFind = sub.getMinPenaltyNode();
|
||||
if ( subFind.getSecond() < maxNode.getSecond() ) {
|
||||
maxNode = subFind;
|
||||
/**
|
||||
* Finds in the tree the node with the lowest penalty whose subnodes are all leaves
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private Pair<RecalDatumNode<T>, Double> getMinPenaltyAboveLeafNode() {
|
||||
if ( isLeaf() )
|
||||
// not allowed to remove leafs directly
|
||||
return null;
|
||||
if ( isAboveOnlyLeaves() )
|
||||
// we only consider removing nodes above all leaves
|
||||
return new Pair<RecalDatumNode<T>, Double>(this, getPenalty());
|
||||
else {
|
||||
// just recurse, taking the result with the min penalty of all subnodes
|
||||
Pair<RecalDatumNode<T>, Double> minNode = null;
|
||||
for ( final RecalDatumNode<T> sub : subnodes ) {
|
||||
final Pair<RecalDatumNode<T>, Double> subFind = sub.getMinPenaltyAboveLeafNode();
|
||||
if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) {
|
||||
minNode = subFind;
|
||||
}
|
||||
}
|
||||
return minNode;
|
||||
}
|
||||
|
||||
return maxNode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a freshly allocated tree without the node nodeToRemove
|
||||
*
|
||||
* @param nodeToRemove
|
||||
* @return
|
||||
*/
|
||||
private Pair<RecalDatumNode<T>, Boolean> removeNode(final RecalDatumNode<T> nodeToRemove) {
|
||||
if ( this == nodeToRemove ) {
|
||||
if ( isLeaf() )
|
||||
|
|
@ -288,7 +396,7 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
boolean removedSomething = false;
|
||||
|
||||
// our sub nodes with the penalty node removed
|
||||
final Set<RecalDatumNode<T>> sub = new HashSet<RecalDatumNode<T>>(getNumBranches());
|
||||
final Set<RecalDatumNode<T>> sub = new HashSet<RecalDatumNode<T>>(getNumSubnodes());
|
||||
|
||||
for ( final RecalDatumNode<T> sub1 : subnodes ) {
|
||||
if ( removedSomething ) {
|
||||
|
|
@ -306,4 +414,29 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
return new Pair<RecalDatumNode<T>, Boolean>(node, removedSomething);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a collection of all of the data in the leaf nodes of this tree
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Collection<T> getAllLeaves() {
|
||||
final LinkedList<T> list = new LinkedList<T>();
|
||||
getAllLeavesRec(list);
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helpful recursive function for getAllLeaves()
|
||||
*
|
||||
* @param list the destination for the list of leaves
|
||||
*/
|
||||
private void getAllLeavesRec(final LinkedList<T> list) {
|
||||
if ( isLeaf() )
|
||||
list.add(getRecalDatum());
|
||||
else {
|
||||
for ( final RecalDatumNode<T> sub : subnodes )
|
||||
sub.getAllLeavesRec(list);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,64 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* User: depristo
|
||||
* Date: 8/3/12
|
||||
* Time: 12:26 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class AdaptiveContextUnitTest {
|
||||
// TODO
|
||||
// TODO actually need unit tests when we have validated the value of this approach
|
||||
// TODO particularly before we attempt to optimize the algorithm
|
||||
// TODO
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class AdaptiveContextTestProvider extends BaseTest.TestDataProvider {
|
||||
final RecalDatumNode<ContextDatum> pruned;
|
||||
final RecalDatumNode<ContextDatum> full;
|
||||
|
||||
private AdaptiveContextTestProvider(Class c, RecalDatumNode<ContextDatum> pruned, RecalDatumNode<ContextDatum> full) {
|
||||
super(AdaptiveContextTestProvider.class);
|
||||
this.pruned = pruned;
|
||||
this.full = full;
|
||||
}
|
||||
}
|
||||
|
||||
private RecalDatumNode<ContextDatum> makeTree(final String context, final int N, final int M,
|
||||
final RecalDatumNode<ContextDatum> ... sub) {
|
||||
final ContextDatum contextDatum = new ContextDatum(context, N, M);
|
||||
final RecalDatumNode<ContextDatum> node = new RecalDatumNode<ContextDatum>(contextDatum);
|
||||
for ( final RecalDatumNode<ContextDatum> sub1 : sub ) {
|
||||
node.addSubnode(sub1);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
@DataProvider(name = "AdaptiveContextTestProvider")
|
||||
public Object[][] makeRecalDatumTestProvider() {
|
||||
// final RecalDatumNode<ContextDatum> prune1 =
|
||||
// makeTree("A", 10, 1,
|
||||
// makeTree("AA", 11, 2),
|
||||
// makeTree("AC", 12, 3),
|
||||
// makeTree("AG", 13, 4),
|
||||
// makeTree("AT", 14, 5));
|
||||
//
|
||||
// new AdaptiveContextTestProvider(pruned, full);
|
||||
|
||||
return AdaptiveContextTestProvider.getTests(AdaptiveContextTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "AdaptiveContextTestProvider")
|
||||
public void testAdaptiveContextFill(AdaptiveContextTestProvider cfg) {
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class QualQuantizerUnitTest extends BaseTest {
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// merge case Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class QualIntervalTestProvider extends TestDataProvider {
|
||||
final QualQuantizer.QualInterval left, right;
|
||||
int exError, exTotal, exQual;
|
||||
double exErrorRate;
|
||||
|
||||
private QualIntervalTestProvider(int leftE, int leftN, int rightE, int rightN, int exError, int exTotal) {
|
||||
super(QualIntervalTestProvider.class);
|
||||
|
||||
QualQuantizer qq = new QualQuantizer(0);
|
||||
left = qq.new QualInterval(10, 10, leftN, leftE, 0);
|
||||
right = qq.new QualInterval(11, 11, rightN, rightE, 0);
|
||||
|
||||
this.exError = exError;
|
||||
this.exTotal = exTotal;
|
||||
this.exErrorRate = (leftE + rightE + 1) / (1.0 * (leftN + rightN + 1));
|
||||
this.exQual = QualityUtils.probToQual(1-this.exErrorRate, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "QualIntervalTestProvider")
|
||||
public Object[][] makeQualIntervalTestProvider() {
|
||||
new QualIntervalTestProvider(10, 100, 10, 1000, 20, 1100);
|
||||
new QualIntervalTestProvider(0, 100, 10, 900, 10, 1000);
|
||||
new QualIntervalTestProvider(10, 900, 0, 100, 10, 1000);
|
||||
new QualIntervalTestProvider(0, 0, 10, 100, 10, 100);
|
||||
new QualIntervalTestProvider(1, 10, 9, 90, 10, 100);
|
||||
new QualIntervalTestProvider(1, 10, 9, 100000, 10, 100010);
|
||||
new QualIntervalTestProvider(1, 10, 9, 1000000, 10,1000010);
|
||||
|
||||
return QualIntervalTestProvider.getTests(QualIntervalTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "QualIntervalTestProvider")
|
||||
public void testQualInterval(QualIntervalTestProvider cfg) {
|
||||
QualQuantizer.QualInterval merged = cfg.left.merge(cfg.right);
|
||||
Assert.assertEquals(merged.nErrors, cfg.exError);
|
||||
Assert.assertEquals(merged.nObservations, cfg.exTotal);
|
||||
Assert.assertEquals(merged.getErrorRate(), cfg.exErrorRate);
|
||||
Assert.assertEquals(merged.getQual(), cfg.exQual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMinInterestingQual() {
|
||||
for ( int q = 0; q < 15; q++ ) {
|
||||
for ( int minQual = 0; minQual <= 10; minQual ++ ) {
|
||||
QualQuantizer qq = new QualQuantizer(minQual);
|
||||
QualQuantizer.QualInterval left = qq.new QualInterval(q, q, 100, 10, 0);
|
||||
QualQuantizer.QualInterval right = qq.new QualInterval(q+1, q+1, 1000, 100, 0);
|
||||
|
||||
QualQuantizer.QualInterval merged = left.merge(right);
|
||||
boolean shouldBeFree = q+1 <= minQual;
|
||||
if ( shouldBeFree )
|
||||
Assert.assertEquals(merged.getPenalty(), 0.0);
|
||||
else
|
||||
Assert.assertTrue(merged.getPenalty() > 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// High-level case Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class QuantizerTestProvider extends TestDataProvider {
|
||||
final List<Long> nObservationsPerQual = new ArrayList<Long>();
|
||||
final int nLevels;
|
||||
final List<Integer> expectedMap;
|
||||
|
||||
private QuantizerTestProvider(final List<Integer> nObservationsPerQual, final int nLevels, final List<Integer> expectedMap) {
|
||||
super(QuantizerTestProvider.class);
|
||||
|
||||
for ( int x : nObservationsPerQual )
|
||||
this.nObservationsPerQual.add((long)x);
|
||||
this.nLevels = nLevels;
|
||||
this.expectedMap = expectedMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("QQTest nLevels=%d nObs=[%s] map=[%s]",
|
||||
nLevels, Utils.join(",", nObservationsPerQual), Utils.join(",", expectedMap));
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "QuantizerTestProvider")
|
||||
public Object[][] makeQuantizerTestProvider() {
|
||||
List<Integer> allQ2 = Arrays.asList(0, 0, 1000, 0, 0);
|
||||
|
||||
new QuantizerTestProvider(allQ2, 5, Arrays.asList(0, 1, 2, 3, 4));
|
||||
new QuantizerTestProvider(allQ2, 1, Arrays.asList(2, 2, 2, 2, 2));
|
||||
|
||||
new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 0, 1000), 2, Arrays.asList(2, 2, 2, 2, 4));
|
||||
new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 1, 1000), 2, Arrays.asList(2, 2, 2, 4, 4));
|
||||
new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 10, 1000), 2, Arrays.asList(2, 2, 2, 2, 4));
|
||||
|
||||
return QuantizerTestProvider.getTests(QuantizerTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "QuantizerTestProvider", enabled = true)
|
||||
public void testQuantizer(QuantizerTestProvider cfg) {
|
||||
QualQuantizer qq = new QualQuantizer(cfg.nObservationsPerQual, cfg.nLevels, 0);
|
||||
logger.warn("cfg: " + cfg);
|
||||
for ( int i = 0; i < cfg.expectedMap.size(); i++) {
|
||||
int expected = cfg.expectedMap.get(i);
|
||||
int observed = qq.originalToQuantizedMap.get(i);
|
||||
//logger.warn(String.format(" qq map: %s : %d => %d", i, expected, observed));
|
||||
Assert.assertEquals(observed, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class RecalDatumUnitTest extends BaseTest {
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// merge case Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class RecalDatumTestProvider extends TestDataProvider {
|
||||
int exError, exTotal, reportedQual;
|
||||
|
||||
private RecalDatumTestProvider(int E, int N, int reportedQual) {
|
||||
super(RecalDatumTestProvider.class);
|
||||
|
||||
this.exError = E;
|
||||
this.exTotal = N;
|
||||
this.reportedQual = reportedQual;
|
||||
}
|
||||
|
||||
public double getErrorRate() {
|
||||
return (exError + 1) / (1.0 * (exTotal + 2));
|
||||
}
|
||||
|
||||
public double getErrorRatePhredScaled() {
|
||||
return QualityUtils.phredScaleErrorRate(getErrorRate());
|
||||
}
|
||||
|
||||
public int getReportedQual() {
|
||||
return reportedQual;
|
||||
}
|
||||
|
||||
public RecalDatum makeRecalDatum() {
|
||||
return new RecalDatum(exTotal, exError, (byte)getReportedQual());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("exError=%d, exTotal=%d, reportedQual=%d", exError, exTotal, reportedQual);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "RecalDatumTestProvider")
|
||||
public Object[][] makeRecalDatumTestProvider() {
|
||||
for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) )
|
||||
for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) )
|
||||
for ( int reportedQual : Arrays.asList(10, 20) )
|
||||
if ( E <= N )
|
||||
new RecalDatumTestProvider(E, N, reportedQual);
|
||||
return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "RecalDatumTestProvider")
|
||||
public void testRecalDatumBasics(RecalDatumTestProvider cfg) {
|
||||
final RecalDatum datum = cfg.makeRecalDatum();
|
||||
assertBasicFeaturesOfRecalDatum(datum, cfg);
|
||||
}
|
||||
|
||||
private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) {
|
||||
Assert.assertEquals(datum.getNumMismatches(), cfg.exError);
|
||||
Assert.assertEquals(datum.getNumObservations(), cfg.exTotal);
|
||||
if ( cfg.getReportedQual() != -1 )
|
||||
Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual());
|
||||
BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled());
|
||||
BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate());
|
||||
}
|
||||
|
||||
@Test(dataProvider = "RecalDatumTestProvider")
|
||||
public void testRecalDatumCopyAndCombine(RecalDatumTestProvider cfg) {
|
||||
final RecalDatum datum = cfg.makeRecalDatum();
|
||||
final RecalDatum copy = new RecalDatum(datum);
|
||||
assertBasicFeaturesOfRecalDatum(copy, cfg);
|
||||
|
||||
RecalDatumTestProvider combinedCfg = new RecalDatumTestProvider(cfg.exError * 2, cfg.exTotal * 2, cfg.reportedQual);
|
||||
copy.combine(datum);
|
||||
assertBasicFeaturesOfRecalDatum(copy, combinedCfg);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "RecalDatumTestProvider")
|
||||
public void testRecalDatumModification(RecalDatumTestProvider cfg) {
|
||||
RecalDatum datum = cfg.makeRecalDatum();
|
||||
datum.setEmpiricalQuality(10.1);
|
||||
Assert.assertEquals(datum.getEmpiricalQuality(), 10.1);
|
||||
|
||||
datum.setEstimatedQReported(10.1);
|
||||
Assert.assertEquals(datum.getEstimatedQReported(), 10.1);
|
||||
Assert.assertEquals(datum.getEstimatedQReportedAsByte(), 10);
|
||||
|
||||
datum = cfg.makeRecalDatum();
|
||||
cfg.exTotal = 100000;
|
||||
datum.setNumObservations(cfg.exTotal);
|
||||
assertBasicFeaturesOfRecalDatum(datum, cfg);
|
||||
|
||||
datum = cfg.makeRecalDatum();
|
||||
cfg.exError = 1000;
|
||||
datum.setNumMismatches(cfg.exError);
|
||||
assertBasicFeaturesOfRecalDatum(datum, cfg);
|
||||
|
||||
datum = cfg.makeRecalDatum();
|
||||
datum.increment(true);
|
||||
cfg.exError++;
|
||||
cfg.exTotal++;
|
||||
assertBasicFeaturesOfRecalDatum(datum, cfg);
|
||||
|
||||
datum = cfg.makeRecalDatum();
|
||||
datum.increment(10, 5);
|
||||
cfg.exError += 5;
|
||||
cfg.exTotal += 10;
|
||||
assertBasicFeaturesOfRecalDatum(datum, cfg);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue