Ready for full-scale evaluation adaptive BQSR contexts

-- VisualizeContextTree now can write out an equivalent BQSR table determined after adaptive context merging of all RG x QUAL x CONTEXT trees
-- Docs, algorithm descriptions, etc so that it makes sense what's going on
-- VisualizeContextTree should really be simplified when into a single tool that just visualize the trees when / if we decide to make adaptive contexts standard part of BQSR
 -- Misc. cleaning, organization of the code (recalibation tests were in private but corresponding actual files were public)
This commit is contained in:
Mark DePristo 2012-08-03 16:01:27 -04:00
parent d2e8eb7b23
commit e1bba91836
6 changed files with 743 additions and 46 deletions

View File

@ -208,11 +208,23 @@ public class GATKReportTable {
}
/**
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
*
* @param name the name of the table or column
* @return true if the name is valid, false if otherwise
* Create a new GATKReportTable with the same structure
* @param tableToCopy
*/
public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) {
this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortByRowID);
for ( final GATKReportColumn column : tableToCopy.getColumnInfo() )
addColumn(column.getColumnName(), column.getFormat());
if ( copyData )
throw new IllegalArgumentException("sorry, copying data in GATKReportTable isn't supported");
}
/**
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
*
* @param name the name of the table or column
* @return true if the name is valid, false if otherwise
*/
private boolean isValidName(String name) {
Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX);
Matcher m = p.matcher(name);
@ -490,6 +502,17 @@ public class GATKReportTable {
return get(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName));
}
/**
* Get a value from the given position in the table
*
* @param rowIndex the row ID
* @param columnName the name of the column
* @return the value stored at the specified position in the table
*/
public Object get(final int rowIndex, final String columnName) {
return get(rowIndex, columnNameToIndex.get(columnName));
}
/**
* Get a value from the given position in the table
*

View File

@ -0,0 +1,154 @@
package org.broadinstitute.sting.utils.recalibration;
import java.util.*;
/**
* Functions for working with AdaptiveContexts
*
* User: depristo
* Date: 8/3/12
* Time: 12:21 PM
* To change this template use File | Settings | File Templates.
*/
public class AdaptiveContext {
private AdaptiveContext() {}
/**
* Return a freshly allocated tree filled in completely to fillDepth with
* all combinations of {A,C,G,T}^filldepth contexts. For nodes
* in the tree, they are simply copied. When the algorithm needs to
* generate new nodes (because they are missing) the subnodes inherit the
* observation and error counts of their parent.
*
* This algorithm produces data consistent with the standard output in a BQSR recal
* file for the Context covariate
*
* @param root
* @param fillDepth
* @return
*/
public static RecalDatumNode<ContextDatum> fillToDepth(final RecalDatumNode<ContextDatum> root, final int fillDepth) {
if ( root == null ) throw new IllegalArgumentException("root is null");
if ( fillDepth < 0 ) throw new IllegalArgumentException("fillDepth is < 0");
return fillToDepthRec(root, fillDepth, 0);
}
private static RecalDatumNode<ContextDatum> fillToDepthRec(final RecalDatumNode<ContextDatum> parent,
final int fillDepth,
final int currentDepth) {
// three cases:
// We are in the tree and so just recursively build
// We have reached our depth goal, so just return the parent since we are done
// We are outside of the tree, in which case we need to pointer to our parent node so we can
// we info (N, M) and we need a running context
if ( currentDepth < fillDepth ) {
// we need to create subnodes for each base, and propogate N and M down
final RecalDatumNode<ContextDatum> newParent = new RecalDatumNode<ContextDatum>(parent.getRecalDatum());
for ( final String base : Arrays.asList("A", "C", "G", "T")) {
ContextDatum subContext;
Set<RecalDatumNode<ContextDatum>> subContexts;
final RecalDatumNode<ContextDatum> subNode = findSubcontext(parent.getRecalDatum().context + base, parent);
if ( subNode != null ) {
// we have a subnode corresponding to the expected one, just copy and recurse
subContext = subNode.getRecalDatum();
subContexts = subNode.getSubnodes();
} else {
// have to create a new one
subContext = new ContextDatum(parent.getRecalDatum().context + base,
parent.getRecalDatum().getNumObservations(), parent.getRecalDatum().getNumMismatches());
subContexts = Collections.emptySet();
}
newParent.addSubnode(
fillToDepthRec(new RecalDatumNode<ContextDatum>(subContext, subContexts),
fillDepth, currentDepth + 1));
}
return newParent;
} else {
return parent;
}
}
/**
* Go from a flat list of contexts to the tree implied by the contexts
*
* Implicit nodes are created as needed, and their observation and error counts are the sum of the
* all of their subnodes.
*
* Note this does not guarentee the tree is complete, as some contexts (e.g., AAT) may be missing
* from the tree because they are absent from the input list of contexts.
*
* For input AAG, AAT, AC, G would produce the following tree:
*
* - x [root]
* - A
* - A
* - T
* - G
* - C
* - G
*
* sets the fixed penalties in the resulting tree as well
*
* @param flatContexts list of flat contexts
* @return
*/
public static RecalDatumNode<ContextDatum> createTreeFromFlatContexts(final List<ContextDatum> flatContexts) {
if ( flatContexts == null || flatContexts.isEmpty() )
throw new IllegalArgumentException("flatContexts cannot be empty or null");
final Queue<RecalDatumNode<ContextDatum>> remaining = new LinkedList<RecalDatumNode<ContextDatum>>();
final Map<String, RecalDatumNode<ContextDatum>> contextToNodes = new HashMap<String, RecalDatumNode<ContextDatum>>();
RecalDatumNode<ContextDatum> root = null;
// initialize -- start with all of the contexts
for ( final ContextDatum cd : flatContexts )
remaining.add(new RecalDatumNode<ContextDatum>(cd));
while ( remaining.peek() != null ) {
final RecalDatumNode<ContextDatum> add = remaining.poll();
final ContextDatum cd = add.getRecalDatum();
final String parentContext = cd.getParentContext();
RecalDatumNode<ContextDatum> parent = contextToNodes.get(parentContext);
if ( parent == null ) {
// haven't yet found parent, so make one, and enqueue it for processing
parent = new RecalDatumNode<ContextDatum>(new ContextDatum(parentContext, 0, 0));
contextToNodes.put(parentContext, parent);
if ( parentContext != ContextDatum.ROOT_CONTEXT )
remaining.add(parent);
else
root = parent;
}
parent.getRecalDatum().incrementNumObservations(cd.getNumObservations());
parent.getRecalDatum().incrementNumMismatches(cd.getNumMismatches());
parent.addSubnode(add);
}
if ( root == null )
throw new RuntimeException("root is unexpectedly null");
// set the fixed penalty everywhere in the tree, so that future modifications don't change the penalties
root.calcAndSetFixedPenalty(true);
return root;
}
/**
* Finds immediate subnode with contextToFind, or null if none exists
*
* @param tree whose subnodes should be searched
* @return
*/
public static RecalDatumNode<ContextDatum> findSubcontext(final String contextToFind, final RecalDatumNode<ContextDatum> tree) {
for ( final RecalDatumNode<ContextDatum> sub : tree.getSubnodes() )
if ( sub.getRecalDatum().context.equals(contextToFind) )
return sub;
return null;
}
}

View File

@ -4,10 +4,12 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.math.stat.inference.ChiSquareTestImpl;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
/**
@ -17,13 +19,18 @@ import java.util.Set;
* @since 07/27/12
*/
public class RecalDatumNode<T extends RecalDatum> {
private final static boolean USE_CHI2 = true;
protected static Logger logger = Logger.getLogger(RecalDatumNode.class);
/**
* fixedPenalty is this value if it's considered fixed
*/
private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY;
private final T recalDatum;
private double fixedPenalty = UNINITIALIZED;
private final Set<RecalDatumNode<T>> subnodes;
@Requires({"recalDatum != null"})
public RecalDatumNode(final T recalDatum) {
this(recalDatum, new HashSet<RecalDatumNode<T>>());
}
@ -33,28 +40,45 @@ public class RecalDatumNode<T extends RecalDatum> {
return recalDatum.toString();
}
@Requires({"recalDatum != null", "subnodes != null"})
public RecalDatumNode(final T recalDatum, final Set<RecalDatumNode<T>> subnodes) {
this(recalDatum, UNINITIALIZED, subnodes);
}
@Requires({"recalDatum != null"})
protected RecalDatumNode(final T recalDatum, final double fixedPenalty) {
this(recalDatum, fixedPenalty, new HashSet<RecalDatumNode<T>>());
}
@Requires({"recalDatum != null", "subnodes != null"})
protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set<RecalDatumNode<T>> subnodes) {
this.recalDatum = recalDatum;
this.fixedPenalty = fixedPenalty;
this.subnodes = new HashSet<RecalDatumNode<T>>(subnodes);
}
/**
* Get the recal data associated with this node
* @return
*/
@Ensures("result != null")
public T getRecalDatum() {
return recalDatum;
}
/**
* The set of all subnodes of this tree. May be modified.
* @return
*/
@Ensures("result != null")
public Set<RecalDatumNode<T>> getSubnodes() {
return subnodes;
}
/**
* Return the fixed penalty, if set, or else the the calculated penalty for this node
* @return
*/
public double getPenalty() {
if ( fixedPenalty != UNINITIALIZED )
return fixedPenalty;
@ -62,6 +86,17 @@ public class RecalDatumNode<T extends RecalDatum> {
return calcPenalty();
}
/**
* Set the fixed penalty for this node to a fresh calculation from calcPenalty
*
* This is important in the case where you want to compute the penalty from a full
* tree and then chop the tree up afterwards while considering the previous penalties.
* If you don't call this function then manipulating the tree may result in the
* penalty functions changing with changes in the tree.
*
* @param doEntireTree recurse into all subnodes?
* @return the fixed penalty for this node
*/
public double calcAndSetFixedPenalty(final boolean doEntireTree) {
fixedPenalty = calcPenalty();
if ( doEntireTree )
@ -70,15 +105,41 @@ public class RecalDatumNode<T extends RecalDatum> {
return fixedPenalty;
}
/**
* Add node to the set of subnodes of this node
* @param sub
*/
@Requires("sub != null")
public void addSubnode(final RecalDatumNode<T> sub) {
subnodes.add(sub);
}
/**
* Is this a leaf node (i.e., has no subnodes)?
* @return
*/
public boolean isLeaf() {
return subnodes.isEmpty();
}
public int getNumBranches() {
/**
* Is this node immediately above only leaf nodes?
*
* @return
*/
public boolean isAboveOnlyLeaves() {
for ( final RecalDatumNode<T> sub : subnodes )
if ( ! sub.isLeaf() )
return false;
return true;
}
/**
* What's the immediate number of subnodes from this node?
* @return
*/
@Ensures("result >= 0")
public int getNumSubnodes() {
return subnodes.size();
}
@ -89,6 +150,8 @@ public class RecalDatumNode<T extends RecalDatum> {
* definition have 0 penalty unless they represent a pruned tree with underlying -- but now
* pruned -- subtrees
*
* TODO -- can we really just add together the chi2 values?
*
* @return
*/
public double totalPenalty() {
@ -102,6 +165,10 @@ public class RecalDatumNode<T extends RecalDatum> {
}
}
/**
* What's the longest branch from this node to any leaf?
* @return
*/
public int maxDepth() {
int subMax = 0;
for ( final RecalDatumNode<T> sub : subnodes )
@ -109,6 +176,11 @@ public class RecalDatumNode<T extends RecalDatum> {
return subMax + 1;
}
/**
* What's the shortest branch from this node to any leaf? Includes this node
* @return
*/
@Ensures("result > 0")
public int minDepth() {
if ( isLeaf() )
return 1;
@ -120,6 +192,11 @@ public class RecalDatumNode<T extends RecalDatum> {
}
}
/**
* Return the number of nodes, including this one, reachable from this node
* @return
*/
@Ensures("result > 0")
public int size() {
int size = 1;
for ( final RecalDatumNode<T> sub : subnodes )
@ -127,6 +204,12 @@ public class RecalDatumNode<T extends RecalDatum> {
return size;
}
/**
* Count the number of leaf nodes reachable from this node
*
* @return
*/
@Ensures("result >= 0")
public int numLeaves() {
if ( isLeaf() )
return 1;
@ -138,44 +221,37 @@ public class RecalDatumNode<T extends RecalDatum> {
}
}
/**
* Calculate the chi^2 penalty among subnodes of this node. The chi^2 value
* indicates the degree of independence of the implied error rates among the
* immediate subnodes
*
* @return the chi2 penalty, or 0.0 if it cannot be calculated
*/
private double calcPenalty() {
if ( USE_CHI2 )
return calcPenaltyChi2();
else
return calcPenaltyLog10(getRecalDatum().getEmpiricalErrorRate());
}
private double calcPenaltyChi2() {
if ( isLeaf() )
return 0.0;
else if ( subnodes.size() == 1 )
// only one value, so its free to merge away
return 0.0;
else {
final long[][] counts = new long[subnodes.size()][2];
int i = 0;
for ( RecalDatumNode<T> subnode : subnodes ) {
counts[i][0] = subnode.getRecalDatum().getNumMismatches();
counts[i][1] = subnode.getRecalDatum().getNumObservations();
for ( final RecalDatumNode<T> subnode : subnodes ) {
// use the yates correction to help avoid all zeros => NaN
counts[i][0] = subnode.getRecalDatum().getNumMismatches() + 1;
counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2;
i++;
}
final double chi2 = new ChiSquareTestImpl().chiSquare(counts);
// StringBuilder x = new StringBuilder();
// StringBuilder y = new StringBuilder();
// for ( int k = 0; k < counts.length; k++) {
// if ( k != 0 ) {
// x.append(", ");
// y.append(", ");
// }
// x.append(counts[k][0]);
// y.append(counts[k][1]);
// }
// logger.info("x = c(" + x.toString() + ")");
// logger.info("y = c(" + y.toString() + ")");
// logger.info("chi2 = " + chi2);
// make sure things are reasonable and fail early if not
if (Double.isInfinite(chi2) || Double.isNaN(chi2))
throw new ReviewedStingException("chi2 value is " + chi2 + " at " + getRecalDatum());
return chi2;
//return Math.log10(chi2);
}
}
@ -216,11 +292,17 @@ public class RecalDatumNode<T extends RecalDatum> {
}
}
/**
* Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf
*
* @param maxDepth
* @return
*/
public RecalDatumNode<T> pruneToDepth(final int maxDepth) {
if ( maxDepth < 1 )
throw new IllegalArgumentException("maxDepth < 1");
else {
final Set<RecalDatumNode<T>> subPruned = new HashSet<RecalDatumNode<T>>(getNumBranches());
final Set<RecalDatumNode<T>> subPruned = new HashSet<RecalDatumNode<T>>(getNumSubnodes());
if ( maxDepth > 1 )
for ( final RecalDatumNode<T> sub : subnodes )
subPruned.add(sub.pruneToDepth(maxDepth - 1));
@ -228,12 +310,21 @@ public class RecalDatumNode<T extends RecalDatum> {
}
}
/**
* Return a freshly allocated tree with to no more than maxElements in order of penalty
*
* Note that nodes must have fixed penalties to this algorithm will fail.
*
* @param maxElements
* @return
*/
public RecalDatumNode<T> pruneByPenalty(final int maxElements) {
RecalDatumNode<T> root = this;
while ( root.size() > maxElements ) {
// remove the lowest penalty element, and continue
root = root.removeLowestPenaltyNode();
logger.debug("pruneByPenalty root size is now " + root.size() + " of max " + maxElements);
}
// our size is below the target, so we are good, return
@ -241,15 +332,15 @@ public class RecalDatumNode<T extends RecalDatum> {
}
/**
* Find the lowest penalty node in the tree, and return a tree without it
* Find the lowest penalty above leaf node in the tree, and return a tree without it
*
* Note this excludes the current (root) node
*
* @return
*/
private RecalDatumNode<T> removeLowestPenaltyNode() {
final Pair<RecalDatumNode<T>, Double> nodeToRemove = getMinPenaltyNode();
logger.info("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
final Pair<RecalDatumNode<T>, Double> nodeToRemove = getMinPenaltyAboveLeafNode();
//logger.info("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
final Pair<RecalDatumNode<T>, Boolean> result = removeNode(nodeToRemove.getFirst());
@ -262,20 +353,37 @@ public class RecalDatumNode<T extends RecalDatum> {
return oneRemoved;
}
private Pair<RecalDatumNode<T>, Double> getMinPenaltyNode() {
final double myValue = isLeaf() ? Double.MAX_VALUE : getPenalty();
Pair<RecalDatumNode<T>, Double> maxNode = new Pair<RecalDatumNode<T>, Double>(this, myValue);
for ( final RecalDatumNode<T> sub : subnodes ) {
final Pair<RecalDatumNode<T>, Double> subFind = sub.getMinPenaltyNode();
if ( subFind.getSecond() < maxNode.getSecond() ) {
maxNode = subFind;
/**
* Finds in the tree the node with the lowest penalty whose subnodes are all leaves
*
* @return
*/
private Pair<RecalDatumNode<T>, Double> getMinPenaltyAboveLeafNode() {
if ( isLeaf() )
// not allowed to remove leafs directly
return null;
if ( isAboveOnlyLeaves() )
// we only consider removing nodes above all leaves
return new Pair<RecalDatumNode<T>, Double>(this, getPenalty());
else {
// just recurse, taking the result with the min penalty of all subnodes
Pair<RecalDatumNode<T>, Double> minNode = null;
for ( final RecalDatumNode<T> sub : subnodes ) {
final Pair<RecalDatumNode<T>, Double> subFind = sub.getMinPenaltyAboveLeafNode();
if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) {
minNode = subFind;
}
}
return minNode;
}
return maxNode;
}
/**
* Return a freshly allocated tree without the node nodeToRemove
*
* @param nodeToRemove
* @return
*/
private Pair<RecalDatumNode<T>, Boolean> removeNode(final RecalDatumNode<T> nodeToRemove) {
if ( this == nodeToRemove ) {
if ( isLeaf() )
@ -288,7 +396,7 @@ public class RecalDatumNode<T extends RecalDatum> {
boolean removedSomething = false;
// our sub nodes with the penalty node removed
final Set<RecalDatumNode<T>> sub = new HashSet<RecalDatumNode<T>>(getNumBranches());
final Set<RecalDatumNode<T>> sub = new HashSet<RecalDatumNode<T>>(getNumSubnodes());
for ( final RecalDatumNode<T> sub1 : subnodes ) {
if ( removedSomething ) {
@ -306,4 +414,29 @@ public class RecalDatumNode<T extends RecalDatum> {
return new Pair<RecalDatumNode<T>, Boolean>(node, removedSomething);
}
}
/**
* Return a collection of all of the data in the leaf nodes of this tree
*
* @return
*/
public Collection<T> getAllLeaves() {
final LinkedList<T> list = new LinkedList<T>();
getAllLeavesRec(list);
return list;
}
/**
* Helpful recursive function for getAllLeaves()
*
* @param list the destination for the list of leaves
*/
private void getAllLeavesRec(final LinkedList<T> list) {
if ( isLeaf() )
list.add(getRecalDatum());
else {
for ( final RecalDatumNode<T> sub : subnodes )
sub.getAllLeavesRec(list);
}
}
}

View File

@ -0,0 +1,64 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.BaseTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
* User: depristo
* Date: 8/3/12
* Time: 12:26 PM
* To change this template use File | Settings | File Templates.
*/
public class AdaptiveContextUnitTest {
// TODO
// TODO actually need unit tests when we have validated the value of this approach
// TODO particularly before we attempt to optimize the algorithm
// TODO
// --------------------------------------------------------------------------------
//
// Provider
//
// --------------------------------------------------------------------------------
private class AdaptiveContextTestProvider extends BaseTest.TestDataProvider {
final RecalDatumNode<ContextDatum> pruned;
final RecalDatumNode<ContextDatum> full;
private AdaptiveContextTestProvider(Class c, RecalDatumNode<ContextDatum> pruned, RecalDatumNode<ContextDatum> full) {
super(AdaptiveContextTestProvider.class);
this.pruned = pruned;
this.full = full;
}
}
private RecalDatumNode<ContextDatum> makeTree(final String context, final int N, final int M,
final RecalDatumNode<ContextDatum> ... sub) {
final ContextDatum contextDatum = new ContextDatum(context, N, M);
final RecalDatumNode<ContextDatum> node = new RecalDatumNode<ContextDatum>(contextDatum);
for ( final RecalDatumNode<ContextDatum> sub1 : sub ) {
node.addSubnode(sub1);
}
return node;
}
@DataProvider(name = "AdaptiveContextTestProvider")
public Object[][] makeRecalDatumTestProvider() {
// final RecalDatumNode<ContextDatum> prune1 =
// makeTree("A", 10, 1,
// makeTree("AA", 11, 2),
// makeTree("AC", 12, 3),
// makeTree("AG", 13, 4),
// makeTree("AT", 14, 5));
//
// new AdaptiveContextTestProvider(pruned, full);
return AdaptiveContextTestProvider.getTests(AdaptiveContextTestProvider.class);
}
@Test(dataProvider = "AdaptiveContextTestProvider")
public void testAdaptiveContextFill(AdaptiveContextTestProvider cfg) {
}
}

View File

@ -0,0 +1,169 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// our package
package org.broadinstitute.sting.utils.recalibration;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class QualQuantizerUnitTest extends BaseTest {
@BeforeSuite
public void before() {
}
// --------------------------------------------------------------------------------
//
// merge case Provider
//
// --------------------------------------------------------------------------------
private class QualIntervalTestProvider extends TestDataProvider {
final QualQuantizer.QualInterval left, right;
int exError, exTotal, exQual;
double exErrorRate;
private QualIntervalTestProvider(int leftE, int leftN, int rightE, int rightN, int exError, int exTotal) {
super(QualIntervalTestProvider.class);
QualQuantizer qq = new QualQuantizer(0);
left = qq.new QualInterval(10, 10, leftN, leftE, 0);
right = qq.new QualInterval(11, 11, rightN, rightE, 0);
this.exError = exError;
this.exTotal = exTotal;
this.exErrorRate = (leftE + rightE + 1) / (1.0 * (leftN + rightN + 1));
this.exQual = QualityUtils.probToQual(1-this.exErrorRate, 0);
}
}
@DataProvider(name = "QualIntervalTestProvider")
public Object[][] makeQualIntervalTestProvider() {
new QualIntervalTestProvider(10, 100, 10, 1000, 20, 1100);
new QualIntervalTestProvider(0, 100, 10, 900, 10, 1000);
new QualIntervalTestProvider(10, 900, 0, 100, 10, 1000);
new QualIntervalTestProvider(0, 0, 10, 100, 10, 100);
new QualIntervalTestProvider(1, 10, 9, 90, 10, 100);
new QualIntervalTestProvider(1, 10, 9, 100000, 10, 100010);
new QualIntervalTestProvider(1, 10, 9, 1000000, 10,1000010);
return QualIntervalTestProvider.getTests(QualIntervalTestProvider.class);
}
@Test(dataProvider = "QualIntervalTestProvider")
public void testQualInterval(QualIntervalTestProvider cfg) {
QualQuantizer.QualInterval merged = cfg.left.merge(cfg.right);
Assert.assertEquals(merged.nErrors, cfg.exError);
Assert.assertEquals(merged.nObservations, cfg.exTotal);
Assert.assertEquals(merged.getErrorRate(), cfg.exErrorRate);
Assert.assertEquals(merged.getQual(), cfg.exQual);
}
@Test
public void testMinInterestingQual() {
for ( int q = 0; q < 15; q++ ) {
for ( int minQual = 0; minQual <= 10; minQual ++ ) {
QualQuantizer qq = new QualQuantizer(minQual);
QualQuantizer.QualInterval left = qq.new QualInterval(q, q, 100, 10, 0);
QualQuantizer.QualInterval right = qq.new QualInterval(q+1, q+1, 1000, 100, 0);
QualQuantizer.QualInterval merged = left.merge(right);
boolean shouldBeFree = q+1 <= minQual;
if ( shouldBeFree )
Assert.assertEquals(merged.getPenalty(), 0.0);
else
Assert.assertTrue(merged.getPenalty() > 0.0);
}
}
}
// --------------------------------------------------------------------------------
//
// High-level case Provider
//
// --------------------------------------------------------------------------------
private class QuantizerTestProvider extends TestDataProvider {
final List<Long> nObservationsPerQual = new ArrayList<Long>();
final int nLevels;
final List<Integer> expectedMap;
private QuantizerTestProvider(final List<Integer> nObservationsPerQual, final int nLevels, final List<Integer> expectedMap) {
super(QuantizerTestProvider.class);
for ( int x : nObservationsPerQual )
this.nObservationsPerQual.add((long)x);
this.nLevels = nLevels;
this.expectedMap = expectedMap;
}
@Override
public String toString() {
return String.format("QQTest nLevels=%d nObs=[%s] map=[%s]",
nLevels, Utils.join(",", nObservationsPerQual), Utils.join(",", expectedMap));
}
}
@DataProvider(name = "QuantizerTestProvider")
public Object[][] makeQuantizerTestProvider() {
List<Integer> allQ2 = Arrays.asList(0, 0, 1000, 0, 0);
new QuantizerTestProvider(allQ2, 5, Arrays.asList(0, 1, 2, 3, 4));
new QuantizerTestProvider(allQ2, 1, Arrays.asList(2, 2, 2, 2, 2));
new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 0, 1000), 2, Arrays.asList(2, 2, 2, 2, 4));
new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 1, 1000), 2, Arrays.asList(2, 2, 2, 4, 4));
new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 10, 1000), 2, Arrays.asList(2, 2, 2, 2, 4));
return QuantizerTestProvider.getTests(QuantizerTestProvider.class);
}
@Test(dataProvider = "QuantizerTestProvider", enabled = true)
public void testQuantizer(QuantizerTestProvider cfg) {
QualQuantizer qq = new QualQuantizer(cfg.nObservationsPerQual, cfg.nLevels, 0);
logger.warn("cfg: " + cfg);
for ( int i = 0; i < cfg.expectedMap.size(); i++) {
int expected = cfg.expectedMap.get(i);
int observed = qq.originalToQuantizedMap.get(i);
//logger.warn(String.format(" qq map: %s : %d => %d", i, expected, observed));
Assert.assertEquals(observed, expected);
}
}
}

View File

@ -0,0 +1,154 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// our package
package org.broadinstitute.sting.utils.recalibration;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class RecalDatumUnitTest extends BaseTest {
// --------------------------------------------------------------------------------
//
// merge case Provider
//
// --------------------------------------------------------------------------------
private class RecalDatumTestProvider extends TestDataProvider {
int exError, exTotal, reportedQual;
private RecalDatumTestProvider(int E, int N, int reportedQual) {
super(RecalDatumTestProvider.class);
this.exError = E;
this.exTotal = N;
this.reportedQual = reportedQual;
}
public double getErrorRate() {
return (exError + 1) / (1.0 * (exTotal + 2));
}
public double getErrorRatePhredScaled() {
return QualityUtils.phredScaleErrorRate(getErrorRate());
}
public int getReportedQual() {
return reportedQual;
}
public RecalDatum makeRecalDatum() {
return new RecalDatum(exTotal, exError, (byte)getReportedQual());
}
@Override
public String toString() {
return String.format("exError=%d, exTotal=%d, reportedQual=%d", exError, exTotal, reportedQual);
}
}
@DataProvider(name = "RecalDatumTestProvider")
public Object[][] makeRecalDatumTestProvider() {
for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) )
for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) )
for ( int reportedQual : Arrays.asList(10, 20) )
if ( E <= N )
new RecalDatumTestProvider(E, N, reportedQual);
return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class);
}
@Test(dataProvider = "RecalDatumTestProvider")
public void testRecalDatumBasics(RecalDatumTestProvider cfg) {
final RecalDatum datum = cfg.makeRecalDatum();
assertBasicFeaturesOfRecalDatum(datum, cfg);
}
private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) {
Assert.assertEquals(datum.getNumMismatches(), cfg.exError);
Assert.assertEquals(datum.getNumObservations(), cfg.exTotal);
if ( cfg.getReportedQual() != -1 )
Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual());
BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled());
BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate());
}
@Test(dataProvider = "RecalDatumTestProvider")
public void testRecalDatumCopyAndCombine(RecalDatumTestProvider cfg) {
final RecalDatum datum = cfg.makeRecalDatum();
final RecalDatum copy = new RecalDatum(datum);
assertBasicFeaturesOfRecalDatum(copy, cfg);
RecalDatumTestProvider combinedCfg = new RecalDatumTestProvider(cfg.exError * 2, cfg.exTotal * 2, cfg.reportedQual);
copy.combine(datum);
assertBasicFeaturesOfRecalDatum(copy, combinedCfg);
}
@Test(dataProvider = "RecalDatumTestProvider")
public void testRecalDatumModification(RecalDatumTestProvider cfg) {
RecalDatum datum = cfg.makeRecalDatum();
datum.setEmpiricalQuality(10.1);
Assert.assertEquals(datum.getEmpiricalQuality(), 10.1);
datum.setEstimatedQReported(10.1);
Assert.assertEquals(datum.getEstimatedQReported(), 10.1);
Assert.assertEquals(datum.getEstimatedQReportedAsByte(), 10);
datum = cfg.makeRecalDatum();
cfg.exTotal = 100000;
datum.setNumObservations(cfg.exTotal);
assertBasicFeaturesOfRecalDatum(datum, cfg);
datum = cfg.makeRecalDatum();
cfg.exError = 1000;
datum.setNumMismatches(cfg.exError);
assertBasicFeaturesOfRecalDatum(datum, cfg);
datum = cfg.makeRecalDatum();
datum.increment(true);
cfg.exError++;
cfg.exTotal++;
assertBasicFeaturesOfRecalDatum(datum, cfg);
datum = cfg.makeRecalDatum();
datum.increment(10, 5);
cfg.exError += 5;
cfg.exTotal += 10;
assertBasicFeaturesOfRecalDatum(datum, cfg);
}
}