2009-05-13 04:24:18 +08:00
|
|
|
package org.broadinstitute.sting.secondarybase;
|
2009-04-03 06:08:10 +08:00
|
|
|
|
2009-04-15 12:18:07 +08:00
|
|
|
import org.broadinstitute.sting.utils.BaseUtils;
|
|
|
|
|
import org.broadinstitute.sting.utils.QualityUtils;
|
|
|
|
|
|
2009-04-07 06:00:58 +08:00
|
|
|
import java.io.File;
|
2009-05-15 00:57:00 +08:00
|
|
|
import java.util.ArrayList;
|
2009-04-07 06:00:58 +08:00
|
|
|
|
2009-04-04 03:19:17 +08:00
|
|
|
/**
|
|
|
|
|
* BasecallingReadModel represents the statistical models for
|
2009-05-13 03:47:41 +08:00
|
|
|
* all bases in all cycles. It allows for easy training via
|
|
|
|
|
* the addTrainingPoint() method, and for the computation of
|
|
|
|
|
* the 4x4 likelihood matrix or the 1x4 probability vector
|
|
|
|
|
* (with contextual components marginalized out of the
|
|
|
|
|
* likelihood matrix).
|
2009-04-04 03:19:17 +08:00
|
|
|
*
|
|
|
|
|
* @author Kiran Garimella
|
|
|
|
|
*/
|
2009-04-03 06:08:10 +08:00
|
|
|
public class BasecallingReadModel {
|
|
|
|
|
private BasecallingBaseModel[] basemodels = null;
|
2009-05-20 08:09:20 +08:00
|
|
|
private boolean correctForContext = true;
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Constructs a BasecallingReadModel with space for a given read length.
|
|
|
|
|
*
|
|
|
|
|
* @param readLength the length of the reads to which this model will apply.
|
|
|
|
|
*/
|
2009-05-20 08:09:20 +08:00
|
|
|
public BasecallingReadModel(int readLength) {
|
|
|
|
|
initialize(readLength);
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Constructs a BasecallingReadModel and trains it using the specified training data.
|
|
|
|
|
*
|
|
|
|
|
* @param trainingData a set of RawReads from which the model will be trained.
|
|
|
|
|
*/
|
2009-05-20 08:09:20 +08:00
|
|
|
public BasecallingReadModel(ArrayList<RawRead> trainingData) {
|
|
|
|
|
initialize(trainingData.get(0).getReadLength());
|
|
|
|
|
|
|
|
|
|
train(trainingData);
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Initialize the model and set default parameters for each cycle appropriately.
|
|
|
|
|
*
|
|
|
|
|
* @param readLength the length of the reads to which this model will apply.
|
|
|
|
|
*/
|
2009-05-20 08:09:20 +08:00
|
|
|
public void initialize(int readLength) {
|
2009-04-03 06:08:10 +08:00
|
|
|
basemodels = new BasecallingBaseModel[readLength];
|
|
|
|
|
|
2009-04-15 12:18:07 +08:00
|
|
|
for (int cycle = 0; cycle < readLength; cycle++) {
|
2009-05-20 08:09:20 +08:00
|
|
|
basemodels[cycle] = new BasecallingBaseModel(cycle != 0 && correctForContext);
|
2009-04-03 06:08:10 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Train the model using the specified training data.
|
|
|
|
|
*
|
|
|
|
|
* @param trainingData a set of RawReads from which the model will be trained.
|
|
|
|
|
*/
|
2009-05-20 08:09:20 +08:00
|
|
|
public void train(ArrayList<RawRead> trainingData) {
|
|
|
|
|
for ( RawRead read : trainingData ) {
|
2009-05-15 00:57:00 +08:00
|
|
|
addMeanPoints(read);
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-20 08:09:20 +08:00
|
|
|
for ( RawRead read : trainingData ) {
|
2009-05-15 00:57:00 +08:00
|
|
|
addCovariancePoints(read);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Add a training point for the mean intensity values per base and per cycle.
|
|
|
|
|
*
|
|
|
|
|
* @param cycle the cycle number (0-based)
|
|
|
|
|
* @param probMatrix the probability matrix for the base
|
|
|
|
|
* @param fourintensity the four raw intensities for the base
|
|
|
|
|
*/
|
2009-05-13 03:47:41 +08:00
|
|
|
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
|
|
|
|
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
|
2009-04-07 09:20:15 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Add a training point for the mean intensity values per base in all cycles.
|
|
|
|
|
*
|
|
|
|
|
* @param read the raw read
|
|
|
|
|
*/
|
2009-05-15 00:57:00 +08:00
|
|
|
public void addMeanPoints(RawRead read) {
|
|
|
|
|
byte[] seqs = read.getSequence();
|
|
|
|
|
byte[] quals = read.getQuals();
|
|
|
|
|
short[][] ints = read.getIntensities();
|
|
|
|
|
|
|
|
|
|
for (int cycle = 0; cycle < seqs.length; cycle++) {
|
|
|
|
|
char basePrev = (char) ((cycle == 0) ? '.' : seqs[cycle - 1]);
|
|
|
|
|
char baseCur = (char) seqs[cycle];
|
|
|
|
|
double probCur = QualityUtils.qualToProb(quals[cycle]);
|
|
|
|
|
|
|
|
|
|
double[][] probMatrix = getBaseProbabilityMatrix(cycle, basePrev, baseCur, probCur);
|
|
|
|
|
|
|
|
|
|
double[] fourIntensity = new double[4];
|
|
|
|
|
for (int channel = 0; channel < 4; channel++) {
|
2009-06-24 23:41:06 +08:00
|
|
|
//fourIntensity[channel] = (double) ints[cycle][channel];
|
|
|
|
|
fourIntensity[channel] = (double) ints[channel][cycle];
|
2009-05-15 00:57:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
basemodels[cycle].addMeanPoint(probMatrix, fourIntensity);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Add a training point for the intensity covariance matrix per base and per cycle.
|
|
|
|
|
*
|
|
|
|
|
* @param cycle the cycle number (0-based)
|
|
|
|
|
* @param probMatrix the probability matrix for the base
|
|
|
|
|
* @param fourintensity the four raw intensities for the base
|
|
|
|
|
*/
|
2009-05-13 03:47:41 +08:00
|
|
|
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
|
|
|
|
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
|
2009-04-03 06:08:10 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Add a training point for the intensity covariance matrix per base in all cycles.
|
|
|
|
|
*
|
|
|
|
|
* @param read the raw read
|
|
|
|
|
*/
|
2009-05-15 00:57:00 +08:00
|
|
|
public void addCovariancePoints(RawRead read) {
|
|
|
|
|
byte[] seqs = read.getSequence();
|
|
|
|
|
byte[] quals = read.getQuals();
|
|
|
|
|
short[][] ints = read.getIntensities();
|
|
|
|
|
|
|
|
|
|
for (int cycle = 0; cycle < seqs.length; cycle++) {
|
|
|
|
|
char basePrev = (char) ((cycle == 0) ? '.' : seqs[cycle - 1]);
|
|
|
|
|
char baseCur = (char) seqs[cycle];
|
|
|
|
|
double probCur = QualityUtils.qualToProb(quals[cycle]);
|
|
|
|
|
|
|
|
|
|
double[][] probMatrix = getBaseProbabilityMatrix(cycle, basePrev, baseCur, probCur);
|
|
|
|
|
|
|
|
|
|
double[] fourIntensity = new double[4];
|
|
|
|
|
for (int channel = 0; channel < 4; channel++) {
|
2009-06-24 23:41:06 +08:00
|
|
|
//fourIntensity[channel] = (double) ints[cycle][channel];
|
|
|
|
|
fourIntensity[channel] = (double) ints[channel][cycle];
|
2009-05-15 00:57:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
basemodels[cycle].addCovariancePoint(probMatrix, fourIntensity);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Compute the likelihoods that a given set of intensities yields each possible base.
|
|
|
|
|
*
|
|
|
|
|
* @param cycle the cycle number (0-based)
|
|
|
|
|
* @param fourintensity the four raw intensities for the base
|
|
|
|
|
* @return the matrix of likelihoods
|
|
|
|
|
*/
|
2009-04-15 12:18:07 +08:00
|
|
|
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
2009-04-07 10:18:13 +08:00
|
|
|
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
2009-04-03 06:08:10 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Compute the probabilities that a given set of intensities yields each possible base.
|
|
|
|
|
*
|
|
|
|
|
* @param cycle the cycle number (0-based)
|
|
|
|
|
* @param basePrev the previous base
|
|
|
|
|
* @param qualPrev the previous base's quality score
|
|
|
|
|
* @param fourintensity the four raw intensities for the base
|
|
|
|
|
* @return the probability distribution over the four base possibilities
|
|
|
|
|
*/
|
2009-04-15 12:18:07 +08:00
|
|
|
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
|
|
|
|
|
double[][] likes = computeLikelihoods(cycle, fourintensity);
|
2009-04-03 06:08:10 +08:00
|
|
|
|
|
|
|
|
double total = 0;
|
|
|
|
|
|
2009-04-15 12:18:07 +08:00
|
|
|
for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) {
|
|
|
|
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
|
|
|
|
double prior = 1.0;
|
|
|
|
|
if (correctForContext) {
|
|
|
|
|
double prob = QualityUtils.qualToProb(qualPrev);
|
|
|
|
|
if (basePrevIndex == BaseUtils.simpleBaseToBaseIndex(basePrev)) {
|
|
|
|
|
prior = prob;
|
|
|
|
|
} else {
|
|
|
|
|
prior = (1.0 - prob)/((double) (4*likes.length - 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
likes[basePrevIndex][baseCurIndex] = prior*likes[basePrevIndex][baseCurIndex];
|
|
|
|
|
total += likes[basePrevIndex][baseCurIndex];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) {
|
|
|
|
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
|
|
|
|
likes[basePrevIndex][baseCurIndex] /= total;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-20 08:09:20 +08:00
|
|
|
return new FourProb(likes);
|
2009-05-13 03:47:41 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Call the bases in the given RawRead.
|
|
|
|
|
*
|
|
|
|
|
* @param read the RawRead
|
|
|
|
|
* @return the basecalled read
|
|
|
|
|
*/
|
2009-05-15 00:57:00 +08:00
|
|
|
public FourProbRead call(RawRead read) {
|
|
|
|
|
FourProbRead fpr = new FourProbRead(read.getReadLength());
|
|
|
|
|
|
|
|
|
|
for (int cycle = 0; cycle < read.getReadLength(); cycle++) {
|
|
|
|
|
char basePrev = (char) ((cycle == 0) ? '.' : read.getSequence()[cycle - 1]);
|
|
|
|
|
byte qualPrev = ((cycle == 0) ? 0 : read.getQuals()[cycle - 1]);
|
|
|
|
|
|
|
|
|
|
double[] fourIntensity = new double[4];
|
|
|
|
|
for (int channel = 0; channel < 4; channel++) {
|
2009-06-24 23:41:06 +08:00
|
|
|
//fourIntensity[channel] = (double) read.getIntensities()[cycle][channel];
|
|
|
|
|
fourIntensity[channel] = (double) read.getIntensities()[channel][cycle];
|
2009-05-15 00:57:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return fpr;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Return the probability matrix given the previous cycle's base, the current cycle's base, and the current base's probability.
|
|
|
|
|
*
|
|
|
|
|
* @param cycle the cycle number (0-based)
|
|
|
|
|
* @param basePrev the previous base
|
|
|
|
|
* @param baseCur the current base
|
|
|
|
|
* @param probCur the probability of the current base
|
|
|
|
|
* @return the probability matrix of the base
|
|
|
|
|
*/
|
2009-05-13 03:47:41 +08:00
|
|
|
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
|
|
|
|
|
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
|
|
|
|
|
|
|
|
|
|
int actualBasePrevIndex = (correctForContext && cycle > 0) ? BaseUtils.simpleBaseToBaseIndex(basePrev) : 0;
|
|
|
|
|
int actualBaseCurIndex = BaseUtils.simpleBaseToBaseIndex(baseCur);
|
|
|
|
|
|
2009-06-09 09:01:13 +08:00
|
|
|
if (actualBasePrevIndex == -1) { actualBasePrevIndex = BaseUtils.getRandomBaseIndex(); }
|
|
|
|
|
if (actualBaseCurIndex == -1) { actualBaseCurIndex = BaseUtils.getRandomBaseIndex(); }
|
|
|
|
|
|
2009-05-13 03:47:41 +08:00
|
|
|
double residualTheories = (double) (dist.length*dist[0].length - 1);
|
|
|
|
|
|
|
|
|
|
for (int basePrevIndex = 0; basePrevIndex < dist.length; basePrevIndex++) {
|
|
|
|
|
for (int baseCurIndex = 0; baseCurIndex < dist[basePrevIndex].length; baseCurIndex++) {
|
|
|
|
|
dist[basePrevIndex][baseCurIndex] = (basePrevIndex == actualBasePrevIndex && baseCurIndex == actualBaseCurIndex) ? probCur : ((1.0 - probCur)/residualTheories);
|
2009-04-15 12:18:07 +08:00
|
|
|
}
|
|
|
|
|
}
|
2009-04-03 06:08:10 +08:00
|
|
|
|
2009-05-13 03:47:41 +08:00
|
|
|
return dist;
|
2009-04-03 06:08:10 +08:00
|
|
|
}
|
2009-04-07 06:00:58 +08:00
|
|
|
|
2009-05-22 03:40:47 +08:00
|
|
|
/**
|
|
|
|
|
* Write model parameters to disk.
|
|
|
|
|
*
|
|
|
|
|
* @param dir the directory in which model parameters should be stored.
|
|
|
|
|
*/
|
2009-04-07 06:00:58 +08:00
|
|
|
public void write(File dir) {
|
|
|
|
|
for (int cycle = 0; cycle < basemodels.length; cycle++) {
|
2009-04-13 03:45:33 +08:00
|
|
|
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");
|
2009-04-07 06:00:58 +08:00
|
|
|
basemodels[cycle].write(outparam);
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-04-03 06:08:10 +08:00
|
|
|
}
|