JavaDocs!

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@290 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-04-03 19:19:17 +00:00
parent 42eb356782
commit ef06924f73
5 changed files with 159 additions and 6 deletions

View File

@ -8,6 +8,16 @@ import cern.colt.matrix.linalg.Algebra;
import org.broadinstitute.sting.utils.QualityUtils;
/**
* BasecallingBaseModel is a class that represents the statistical
* model for all bases at a given cycle. It allows for easy, one
* pass training via the addTrainingPoint() method. Once the model
* is trained, computeLikelihoods will return the probability matrix
* over previous cycle's base hypotheses and current cycle base
* hypotheses (contextual prior is included in these likelihoods).
*
* @author Kiran Garimella
*/
public class BasecallingBaseModel {
private double[][] counts;
@ -19,8 +29,11 @@ public class BasecallingBaseModel {
private DoubleMatrix2D[][] inverseCovariances;
private double[][] norms;
Algebra alg;
private Algebra alg;
/**
* Constructor for BasecallingBaseModel
*/
public BasecallingBaseModel() {
counts = new double[4][4];
@ -44,6 +57,13 @@ public class BasecallingBaseModel {
alg = new Algebra();
}
/**
* Add a single training point to the model.
* @param basePrev the previous cycle's base call (A, C, G, T, or * for the first cycle)
* @param baseCur the current cycle's base call (A, C, G, T)
* @param qualCur the quality score for the current cycle's base call
* @param fourintensity the four intensities for the current cycle's base call
*/
public void addTrainingPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
int basePrevIndex = baseToBaseIndex(basePrev);
int baseCurIndex = baseToBaseIndex(baseCur);
@ -67,6 +87,9 @@ public class BasecallingBaseModel {
readyToCall = false;
}
/**
* Precompute all the matrix inversions and determinants we'll need for computing the likelihood distributions.
*/
public void prepareToCallBases() {
for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) {
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
@ -88,6 +111,16 @@ public class BasecallingBaseModel {
readyToCall = true;
}
/**
* Compute the likelihood matrix for a base (contextual priors included).
*
* @param cycle the cycle we're calling right now
* @param basePrev the previous cycle's base
* @param qualPrev the previous cycle's quality score
* @param fourintensity the four intensities of the current cycle's base
* @return a 4x4 matrix of likelihoods, where the row is the previous cycle base hypothesis and
* the column is the current cycle base hypothesis
*/
public double[][] computeLikelihoods(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
if (!readyToCall) {
prepareToCallBases();
@ -115,6 +148,11 @@ public class BasecallingBaseModel {
return probdist;
}
/**
* Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3);
* @param base
* @return 0, 1, 2, 3, or -1 if the base can't be understood.
*/
private int baseToBaseIndex(char base) {
switch (base) {
case 'A':

View File

@ -1,10 +1,23 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import org.broadinstitute.sting.utils.Utils;
/**
* BasecallingReadModel represents the statistical models for
* all bases in all cycles. It allows for easy, one-pass
* training via the addTrainingPoint() method, and for the
* computation of the 4x4 likelihood matrix or the 1x4
* probability vector (with contextual components marginalized
* out of the likelihood matrix).
*
* @author Kiran Garimella
*/
public class BasecallingReadModel {
private BasecallingBaseModel[] basemodels = null;
/**
* Constructor for BasecallingReadModel.
*
* @param readLength the length of the read that this model will support
*/
public BasecallingReadModel(int readLength) {
basemodels = new BasecallingBaseModel[readLength];
@ -13,14 +26,42 @@ public class BasecallingReadModel {
}
}
/**
* Add a single training point to the model.
* @param cycle the cycle for which this point should be added
* @param basePrev the previous base
* @param baseCur the current base
* @param qualCur the current base's quality
* @param fourintensity the four intensities of the current base
*/
public void addTrainingPoint(int cycle, char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
basemodels[cycle].addTrainingPoint(basePrev, baseCur, qualCur, fourintensity);
}
/**
* Compute the likelihood matrix for a given cycle.
*
* @param cycle the cycle number for the current base
* @param basePrev the previous cycle's base
* @param qualPrev the quality score for the previous cycle's base
* @param fourintensity the four intensities for the current cycle's base
* @return 4x4 matrix of likelihoods
*/
public double[][] computeLikelihoods(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
return basemodels[cycle].computeLikelihoods(cycle, basePrev, qualPrev, fourintensity);
}
/**
* Compute the probability distribution for the base at a given cycle.
* Contextual components of the likelihood matrix are marginalized out.
*
* @param cycle the cycle number for the current base
* @param basePrev the previous cycle's base
* @param qualPrev the quality score for the previous cycle's base
* @param fourintensity the four intensities for the current cycle's base
* @return an instance of FourProb, which encodes a base hypothesis, its probability,
* and the ranking among the other hypotheses
*/
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
double[][] likes = computeLikelihoods(cycle, basePrev, qualPrev, fourintensity);

View File

@ -85,8 +85,6 @@ public class FourBaseRecaller extends CommandLineProgram {
byte[] nextbestqual = new byte[bases.length()];
for (int cycle = 0; cycle < bases.length(); cycle++) {
//char basePrev = (cycle == 0) ? '*' : bases.charAt(cycle - 1);
//byte qualPrev = (cycle == 0) ? 0 : quals[cycle - 1];
char basePrev = (cycle == 0) ? '*' : (char) asciiseq[cycle - 1];
byte qualPrev = (cycle == 0) ? 0 : bestqual[cycle - 1];
double[] fourintensity = intensities[cycle + cycle_offset];

View File

@ -3,10 +3,21 @@ package org.broadinstitute.sting.playground.fourbasecaller;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.QualityUtils;
/**
* FourProb represents four base hypotheses, their probabilities, and their ranking among one another.
*
* @author Kiran Garimella
*/
public class FourProb {
private double[] baseProbs;
private int[] baseIndices;
/**
* Constructor for FourProb.
*
* @param baseIndices the unsorted base indices (A:0, C:1, G:2, T:3). Now that I think about it, this is stupid.
* @param baseProbs the unsorted base hypothesis probabilities.
*/
public FourProb(int[] baseIndices, double[] baseProbs) {
Integer[] perm = Utils.SortPermutation(baseProbs);
double[] ascendingBaseProbs = Utils.PermuteArray(baseProbs, perm);
@ -21,11 +32,40 @@ public class FourProb {
}
}
/**
* Returns the index of the base at the specified rank.
*
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the index (0, 1, 2, 3).
*/
public int indexAtRank(int rank) { return baseIndices[rank]; }
/**
* Returns the base label of the base at the specified rank.
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the base label (A, C, G, T).
*/
public char baseAtRank(int rank) { return baseIndexToBase(indexAtRank(rank)); }
/**
* Returns the probability of the base at the specified rank.
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the probability of the base (0.0-1.0)
*/
public double probAtRank(int rank) { return baseProbs[rank]; }
/**
* Returns the quality score of the base at the specified rank.
* @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned
* @return the quality score of the base (0-40)
*/
public byte qualAtRank(int rank) { return QualityUtils.probToQual(probAtRank(rank)); }
/**
* A utility method to convert a base index into a base label.
* @param baseIndex the index of the base (0, 1, 2, 3).
* @return A, C, G, T, or '.' if the base index can't be understood.
*/
private char baseIndexToBase(int baseIndex) {
switch (baseIndex) {
case 0: return 'A';
@ -36,6 +76,11 @@ public class FourProb {
}
}
/**
* Prettily formats the FourProb info.
*
* @return a prettily formatted Sting containing the base and quality score in rank order.
*/
public String toString() {
return (
"[" + baseAtRank(0) + ":" + qualAtRank(0) + " "

View File

@ -1,16 +1,47 @@
package org.broadinstitute.sting.utils;
/**
* QualityUtils is a static class (no instantiation allowed!) with some utility methods for manipulating
* quality scores.
*
* @author Kiran Garimella
*/
public class QualityUtils {
/**
* Private constructor. No instantiating this class!
*/
private QualityUtils() {}
/**
* Convert a quality score to a probability. This is the Phred-style
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
*
* @param qual a quality score (0-40)
* @return a probability (0.0-1.0)
*/
static public double qualToProb(byte qual) {
return 1.0 - Math.pow(10.0, ((double) qual)/-10.0);
}
/**
* Convert a probability to a quality score. Note, this is capped at Q40.
*
* @param prob a probability (0.0-1.0)
* @return a quality score (0-40)
*/
static public byte probToQual(double prob) {
return (byte) Math.round(-10.0*Math.log10(1.0 - prob + 0.0001));
}
/**
* Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field.
* Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis.
* Another note: the probability encoded here gets rounded to the nearest 1%.
*
* @param baseIndex the base index
* @param prob the base probability
* @return a byte containing the index and the probability
*/
static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) {
byte compressedQual = (byte) baseIndex;
byte cprob = (byte) (100.0*prob);