diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java index a817d9c7a..3300bd732 100755 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java @@ -8,6 +8,16 @@ import cern.colt.matrix.linalg.Algebra; import org.broadinstitute.sting.utils.QualityUtils; +/** + * BasecallingBaseModel is a class that represents the statistical + * model for all bases at a given cycle. It allows for easy, one + * pass training via the addTrainingPoint() method. Once the model + * is trained, computeLikelihoods will return the probability matrix + * over previous cycle's base hypotheses and current cycle base + * hypotheses (contextual prior is included in these likelihoods). + * + * @author Kiran Garimella + */ public class BasecallingBaseModel { private double[][] counts; @@ -19,8 +29,11 @@ public class BasecallingBaseModel { private DoubleMatrix2D[][] inverseCovariances; private double[][] norms; - Algebra alg; + private Algebra alg; + /** + * Constructor for BasecallingBaseModel + */ public BasecallingBaseModel() { counts = new double[4][4]; @@ -44,6 +57,13 @@ public class BasecallingBaseModel { alg = new Algebra(); } + /** + * Add a single training point to the model. + * @param basePrev the previous cycle's base call (A, C, G, T, or * for the first cycle) + * @param baseCur the current cycle's base call (A, C, G, T) + * @param qualCur the quality score for the current cycle's base call + * @param fourintensity the four intensities for the current cycle's base call + */ public void addTrainingPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) { int basePrevIndex = baseToBaseIndex(basePrev); int baseCurIndex = baseToBaseIndex(baseCur); @@ -67,6 +87,9 @@ public class BasecallingBaseModel { readyToCall = false; } + /** + * Precompute all the matrix inversions and determinants we'll need for computing the likelihood distributions. + */ public void prepareToCallBases() { for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) { for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { @@ -88,6 +111,16 @@ public class BasecallingBaseModel { readyToCall = true; } + /** + * Compute the likelihood matrix for a base (contextual priors included). + * + * @param cycle the cycle we're calling right now + * @param basePrev the previous cycle's base + * @param qualPrev the previous cycle's quality score + * @param fourintensity the four intensities of the current cycle's base + * @return a 4x4 matrix of likelihoods, where the row is the previous cycle base hypothesis and + * the column is the current cycle base hypothesis + */ public double[][] computeLikelihoods(int cycle, char basePrev, byte qualPrev, double[] fourintensity) { if (!readyToCall) { prepareToCallBases(); @@ -115,6 +148,11 @@ public class BasecallingBaseModel { return probdist; } + /** + * Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3); + * @param base + * @return 0, 1, 2, 3, or -1 if the base can't be understood. + */ private int baseToBaseIndex(char base) { switch (base) { case 'A': diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java index 590e7bb25..87abe494f 100644 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java @@ -1,10 +1,23 @@ package org.broadinstitute.sting.playground.fourbasecaller; -import org.broadinstitute.sting.utils.Utils; - +/** + * BasecallingReadModel represents the statistical models for + * all bases in all cycles. It allows for easy, one-pass + * training via the addTrainingPoint() method, and for the + * computation of the 4x4 likelihood matrix or the 1x4 + * probability vector (with contextual components marginalized + * out of the likelihood matrix). + * + * @author Kiran Garimella + */ public class BasecallingReadModel { private BasecallingBaseModel[] basemodels = null; + /** + * Constructor for BasecallingReadModel. + * + * @param readLength the length of the read that this model will support + */ public BasecallingReadModel(int readLength) { basemodels = new BasecallingBaseModel[readLength]; @@ -13,14 +26,42 @@ public class BasecallingReadModel { } } + /** + * Add a single training point to the model. + * @param cycle the cycle for which this point should be added + * @param basePrev the previous base + * @param baseCur the current base + * @param qualCur the current base's quality + * @param fourintensity the four intensities of the current base + */ public void addTrainingPoint(int cycle, char basePrev, char baseCur, byte qualCur, double[] fourintensity) { basemodels[cycle].addTrainingPoint(basePrev, baseCur, qualCur, fourintensity); } + /** + * Compute the likelihood matrix for a given cycle. + * + * @param cycle the cycle number for the current base + * @param basePrev the previous cycle's base + * @param qualPrev the quality score for the previous cycle's base + * @param fourintensity the four intensities for the current cycle's base + * @return 4x4 matrix of likelihoods + */ public double[][] computeLikelihoods(int cycle, char basePrev, byte qualPrev, double[] fourintensity) { return basemodels[cycle].computeLikelihoods(cycle, basePrev, qualPrev, fourintensity); } + /** + * Compute the probability distribution for the base at a given cycle. + * Contextual components of the likelihood matrix are marginalized out. + * + * @param cycle the cycle number for the current base + * @param basePrev the previous cycle's base + * @param qualPrev the quality score for the previous cycle's base + * @param fourintensity the four intensities for the current cycle's base + * @return an instance of FourProb, which encodes a base hypothesis, its probability, + * and the ranking among the other hypotheses + */ public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) { double[][] likes = computeLikelihoods(cycle, basePrev, qualPrev, fourintensity); diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java index c64bccbba..5ebba600d 100644 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java @@ -85,8 +85,6 @@ public class FourBaseRecaller extends CommandLineProgram { byte[] nextbestqual = new byte[bases.length()]; for (int cycle = 0; cycle < bases.length(); cycle++) { - //char basePrev = (cycle == 0) ? '*' : bases.charAt(cycle - 1); - //byte qualPrev = (cycle == 0) ? 0 : quals[cycle - 1]; char basePrev = (cycle == 0) ? '*' : (char) asciiseq[cycle - 1]; byte qualPrev = (cycle == 0) ? 0 : bestqual[cycle - 1]; double[] fourintensity = intensities[cycle + cycle_offset]; diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourProb.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourProb.java index 6397ad911..065abefe4 100755 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourProb.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourProb.java @@ -3,10 +3,21 @@ package org.broadinstitute.sting.playground.fourbasecaller; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.QualityUtils; +/** + * FourProb represents four base hypotheses, their probabilities, and their ranking among one another. + * + * @author Kiran Garimella + */ public class FourProb { private double[] baseProbs; private int[] baseIndices; + /** + * Constructor for FourProb. + * + * @param baseIndices the unsorted base indices (A:0, C:1, G:2, T:3). Now that I think about it, this is stupid. + * @param baseProbs the unsorted base hypothesis probabilities. + */ public FourProb(int[] baseIndices, double[] baseProbs) { Integer[] perm = Utils.SortPermutation(baseProbs); double[] ascendingBaseProbs = Utils.PermuteArray(baseProbs, perm); @@ -21,11 +32,40 @@ public class FourProb { } } + /** + * Returns the index of the base at the specified rank. + * + * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned + * @return the index (0, 1, 2, 3). + */ public int indexAtRank(int rank) { return baseIndices[rank]; } + + /** + * Returns the base label of the base at the specified rank. + * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned + * @return the base label (A, C, G, T). + */ public char baseAtRank(int rank) { return baseIndexToBase(indexAtRank(rank)); } + + /** + * Returns the probability of the base at the specified rank. + * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned + * @return the probability of the base (0.0-1.0) + */ public double probAtRank(int rank) { return baseProbs[rank]; } + + /** + * Returns the quality score of the base at the specified rank. + * @param rank (0 = best, 3 = worst) the rank of the base whose index should be returned + * @return the quality score of the base (0-40) + */ public byte qualAtRank(int rank) { return QualityUtils.probToQual(probAtRank(rank)); } + /** + * A utility method to convert a base index into a base label. + * @param baseIndex the index of the base (0, 1, 2, 3). + * @return A, C, G, T, or '.' if the base index can't be understood. + */ private char baseIndexToBase(int baseIndex) { switch (baseIndex) { case 0: return 'A'; @@ -36,6 +76,11 @@ public class FourProb { } } + /** + * Prettily formats the FourProb info. + * + * @return a prettily formatted Sting containing the base and quality score in rank order. + */ public String toString() { return ( "[" + baseAtRank(0) + ":" + qualAtRank(0) + " " diff --git a/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 6bc2ac70f..3c0dafac3 100755 --- a/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -1,16 +1,47 @@ package org.broadinstitute.sting.utils; +/** + * QualityUtils is a static class (no instantiation allowed!) with some utility methods for manipulating + * quality scores. + * + * @author Kiran Garimella + */ public class QualityUtils { + /** + * Private constructor. No instantiating this class! + */ private QualityUtils() {} - + + /** + * Convert a quality score to a probability. This is the Phred-style + * conversion, *not* the Illumina-style conversion (though asymptotically, they're the same). + * + * @param qual a quality score (0-40) + * @return a probability (0.0-1.0) + */ static public double qualToProb(byte qual) { return 1.0 - Math.pow(10.0, ((double) qual)/-10.0); } + /** + * Convert a probability to a quality score. Note, this is capped at Q40. + * + * @param prob a probability (0.0-1.0) + * @return a quality score (0-40) + */ static public byte probToQual(double prob) { return (byte) Math.round(-10.0*Math.log10(1.0 - prob + 0.0001)); } + /** + * Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field. + * Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis. + * Another note: the probability encoded here gets rounded to the nearest 1%. + * + * @param baseIndex the base index + * @param prob the base probability + * @return a byte containing the index and the probability + */ static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) { byte compressedQual = (byte) baseIndex; byte cprob = (byte) (100.0*prob);