Intermediate commit. Refactored some simple base manipulation stuff into BaseUtils.java. Generalized some likelihood computation logic to make future possible EM-ing easier.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@424 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d0b8d311e6
commit
7949e377e4
|
|
@ -7,6 +7,7 @@ import cern.colt.matrix.DoubleFactory2D;
|
||||||
import cern.colt.matrix.linalg.Algebra;
|
import cern.colt.matrix.linalg.Algebra;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
||||||
|
|
@ -21,38 +22,46 @@ import java.io.*;
|
||||||
* @author Kiran Garimella
|
* @author Kiran Garimella
|
||||||
*/
|
*/
|
||||||
public class BasecallingBaseModel {
|
public class BasecallingBaseModel {
|
||||||
private double[] counts;
|
private double[][] counts;
|
||||||
private DoubleMatrix1D[] sums;
|
private DoubleMatrix1D[][] sums;
|
||||||
private DoubleMatrix2D[] unscaledCovarianceSums;
|
private DoubleMatrix2D[][] unscaledCovarianceSums;
|
||||||
|
|
||||||
private DoubleMatrix1D[] means;
|
private DoubleMatrix1D[][] means;
|
||||||
private DoubleMatrix2D[] inverseCovariances;
|
private DoubleMatrix2D[][] inverseCovariances;
|
||||||
private double[] norms;
|
private double[][] norms;
|
||||||
|
|
||||||
private cern.jet.math.Functions F = cern.jet.math.Functions.functions;
|
private cern.jet.math.Functions F = cern.jet.math.Functions.functions;
|
||||||
private Algebra alg;
|
private Algebra alg;
|
||||||
|
|
||||||
|
private boolean correctForContext = false;
|
||||||
|
private int numTheories = 1;
|
||||||
|
|
||||||
private boolean readyToCall = false;
|
private boolean readyToCall = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor for BasecallingBaseModel
|
* Constructor for BasecallingBaseModel
|
||||||
*/
|
*/
|
||||||
public BasecallingBaseModel() {
|
public BasecallingBaseModel(boolean correctForContext) {
|
||||||
counts = new double[4];
|
this.correctForContext = correctForContext;
|
||||||
|
this.numTheories = (correctForContext) ? 4 : 1;
|
||||||
|
|
||||||
|
counts = new double[this.numTheories][4];
|
||||||
|
|
||||||
sums = new DoubleMatrix1D[4];
|
sums = new DoubleMatrix1D[this.numTheories][4];
|
||||||
unscaledCovarianceSums = new DoubleMatrix2D[4];
|
unscaledCovarianceSums = new DoubleMatrix2D[this.numTheories][4];
|
||||||
|
|
||||||
means = new DoubleMatrix1D[4];
|
means = new DoubleMatrix1D[this.numTheories][4];
|
||||||
inverseCovariances = new DoubleMatrix2D[4];
|
inverseCovariances = new DoubleMatrix2D[this.numTheories][4];
|
||||||
norms = new double[4];
|
norms = new double[this.numTheories][4];
|
||||||
|
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
for (int basePrevIndex = 0; basePrevIndex < this.numTheories; basePrevIndex++) {
|
||||||
sums[baseCurIndex] = (DoubleFactory1D.dense).make(4);
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
unscaledCovarianceSums[baseCurIndex] = (DoubleFactory2D.dense).make(4, 4);
|
sums[basePrevIndex][baseCurIndex] = (DoubleFactory1D.dense).make(4);
|
||||||
|
unscaledCovarianceSums[basePrevIndex][baseCurIndex] = (DoubleFactory2D.dense).make(4, 4);
|
||||||
|
|
||||||
means[baseCurIndex] = (DoubleFactory1D.dense).make(4);
|
means[basePrevIndex][baseCurIndex] = (DoubleFactory1D.dense).make(4);
|
||||||
inverseCovariances[baseCurIndex] = (DoubleFactory2D.dense).make(4, 4);
|
inverseCovariances[basePrevIndex][baseCurIndex] = (DoubleFactory2D.dense).make(4, 4);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
alg = new Algebra();
|
alg = new Algebra();
|
||||||
|
|
@ -61,23 +70,25 @@ public class BasecallingBaseModel {
|
||||||
/**
|
/**
|
||||||
* Add a single training point to the model to estimate the means.
|
* Add a single training point to the model to estimate the means.
|
||||||
*
|
*
|
||||||
|
* @param basePrev the previous cycle's base call (A, C, G, T)
|
||||||
* @param baseCur the current cycle's base call (A, C, G, T)
|
* @param baseCur the current cycle's base call (A, C, G, T)
|
||||||
* @param qualCur the quality score for the current cycle's base call
|
|
||||||
* @param fourintensity the four intensities for the current cycle's base call
|
|
||||||
*/
|
*/
|
||||||
public void addMeanPoint(char baseCur, byte qualCur, double[] fourintensity) {
|
public void addMeanPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
|
||||||
int actualBaseCurIndex = baseToBaseIndex(baseCur);
|
int actualBasePrevIndex = BaseUtils.simpleBaseToBaseIndex(basePrev);
|
||||||
|
int actualBaseCurIndex = BaseUtils.simpleBaseToBaseIndex(baseCur);
|
||||||
double actualWeight = QualityUtils.qualToProb(qualCur);
|
double actualWeight = QualityUtils.qualToProb(qualCur);
|
||||||
|
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||||
// We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses.
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
double weight = (baseCurIndex == actualBaseCurIndex) ? actualWeight : ((1.0 - actualWeight)/3.0);
|
// We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses.
|
||||||
|
double weight = (baseCurIndex == actualBaseCurIndex && (!correctForContext || basePrevIndex == actualBasePrevIndex)) ? actualWeight : ((1.0 - actualWeight)/((double) (4*numTheories - 1)));
|
||||||
|
|
||||||
DoubleMatrix1D weightedChannelIntensities = (DoubleFactory1D.dense).make(fourintensity);
|
DoubleMatrix1D weightedChannelIntensities = (DoubleFactory1D.dense).make(fourintensity);
|
||||||
weightedChannelIntensities.assign(F.mult(weight));
|
weightedChannelIntensities.assign(F.mult(weight));
|
||||||
|
|
||||||
sums[baseCurIndex].assign(weightedChannelIntensities, F.plus);
|
sums[basePrevIndex][baseCurIndex].assign(weightedChannelIntensities, F.plus);
|
||||||
counts[baseCurIndex] += weight;
|
counts[basePrevIndex][baseCurIndex] += weight;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
readyToCall = false;
|
readyToCall = false;
|
||||||
|
|
@ -85,30 +96,34 @@ public class BasecallingBaseModel {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a single training point to the model to estimate the covariances.
|
* Add a single training point to the model to estimate the covariances.
|
||||||
*
|
*
|
||||||
|
* @param basePrev the previous cycle's base call (A, C, G, T)
|
||||||
* @param baseCur the current cycle's base call (A, C, G, T)
|
* @param baseCur the current cycle's base call (A, C, G, T)
|
||||||
* @param qualCur the quality score for the current cycle's base call
|
* @param qualCur the quality score for the current cycle's base call
|
||||||
* @param fourintensity the four intensities for the current cycle's base call
|
* @param fourintensity the four intensities for the current cycle's base call
|
||||||
*/
|
*/
|
||||||
public void addCovariancePoint(char baseCur, byte qualCur, double[] fourintensity) {
|
public void addCovariancePoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
|
||||||
int actualBaseCurIndex = baseToBaseIndex(baseCur);
|
int actualBasePrevIndex = BaseUtils.simpleBaseToBaseIndex(basePrev);
|
||||||
|
int actualBaseCurIndex = BaseUtils.simpleBaseToBaseIndex(baseCur);
|
||||||
double actualWeight = QualityUtils.qualToProb(qualCur);
|
double actualWeight = QualityUtils.qualToProb(qualCur);
|
||||||
|
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||||
// We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses.
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
double weight = (baseCurIndex == actualBaseCurIndex) ? actualWeight : ((1.0 - actualWeight)/3.0);
|
// We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses.
|
||||||
|
double weight = (baseCurIndex == actualBaseCurIndex && (!correctForContext || basePrevIndex == actualBasePrevIndex)) ? actualWeight : ((1.0 - actualWeight)/((double) (4*numTheories - 1)));
|
||||||
|
|
||||||
DoubleMatrix1D mean = sums[baseCurIndex].copy();
|
DoubleMatrix1D mean = sums[basePrevIndex][baseCurIndex].copy();
|
||||||
mean.assign(F.div(counts[baseCurIndex]));
|
mean.assign(F.div(counts[basePrevIndex][baseCurIndex]));
|
||||||
|
|
||||||
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(fourintensity);
|
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(fourintensity);
|
||||||
sub.assign(mean, F.minus);
|
sub.assign(mean, F.minus);
|
||||||
|
|
||||||
DoubleMatrix2D cov = (DoubleFactory2D.dense).make(4, 4);
|
DoubleMatrix2D cov = (DoubleFactory2D.dense).make(4, 4);
|
||||||
alg.multOuter(sub, sub, cov);
|
alg.multOuter(sub, sub, cov);
|
||||||
|
|
||||||
cov.assign(F.mult(weight));
|
cov.assign(F.mult(weight));
|
||||||
unscaledCovarianceSums[baseCurIndex].assign(cov, F.plus);
|
unscaledCovarianceSums[basePrevIndex][baseCurIndex].assign(cov, F.plus);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
readyToCall = false;
|
readyToCall = false;
|
||||||
|
|
@ -118,16 +133,18 @@ public class BasecallingBaseModel {
|
||||||
* Precompute all the matrix inversions and determinants we'll need for computing the likelihood distributions.
|
* Precompute all the matrix inversions and determinants we'll need for computing the likelihood distributions.
|
||||||
*/
|
*/
|
||||||
public void prepareToCallBases() {
|
public void prepareToCallBases() {
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||||
means[baseCurIndex] = sums[baseCurIndex].copy();
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
means[baseCurIndex].assign(F.div(counts[baseCurIndex]));
|
means[basePrevIndex][baseCurIndex] = sums[basePrevIndex][baseCurIndex].copy();
|
||||||
|
means[basePrevIndex][baseCurIndex].assign(F.div(counts[basePrevIndex][baseCurIndex]));
|
||||||
|
|
||||||
inverseCovariances[baseCurIndex] = unscaledCovarianceSums[baseCurIndex].copy();
|
inverseCovariances[basePrevIndex][baseCurIndex] = unscaledCovarianceSums[basePrevIndex][baseCurIndex].copy();
|
||||||
inverseCovariances[baseCurIndex].assign(F.div(counts[baseCurIndex]));
|
inverseCovariances[basePrevIndex][baseCurIndex].assign(F.div(counts[basePrevIndex][baseCurIndex]));
|
||||||
DoubleMatrix2D invcov = alg.inverse(inverseCovariances[baseCurIndex]);
|
DoubleMatrix2D invcov = alg.inverse(inverseCovariances[basePrevIndex][baseCurIndex]);
|
||||||
inverseCovariances[baseCurIndex] = invcov;
|
inverseCovariances[basePrevIndex][baseCurIndex] = invcov;
|
||||||
|
|
||||||
norms[baseCurIndex] = Math.pow(alg.det(invcov), 0.5)/Math.pow(2.0*Math.PI, 2.0);
|
norms[basePrevIndex][baseCurIndex] = Math.pow(alg.det(invcov), 0.5)/Math.pow(2.0*Math.PI, 2.0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
readyToCall = true;
|
readyToCall = true;
|
||||||
|
|
@ -141,22 +158,24 @@ public class BasecallingBaseModel {
|
||||||
* @return a 4x4 matrix of likelihoods, where the row is the previous cycle base hypothesis and
|
* @return a 4x4 matrix of likelihoods, where the row is the previous cycle base hypothesis and
|
||||||
* the column is the current cycle base hypothesis
|
* the column is the current cycle base hypothesis
|
||||||
*/
|
*/
|
||||||
public double[] computeLikelihoods(int cycle, double[] fourintensity) {
|
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
||||||
if (!readyToCall) {
|
if (!readyToCall) {
|
||||||
prepareToCallBases();
|
prepareToCallBases();
|
||||||
}
|
}
|
||||||
|
|
||||||
double[] likedist = new double[4];
|
double[][] likedist = new double[numTheories][4];
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||||
double norm = norms[baseCurIndex];
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
double norm = norms[basePrevIndex][baseCurIndex];
|
||||||
|
|
||||||
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(fourintensity);
|
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(fourintensity);
|
||||||
sub.assign(means[baseCurIndex], F.minus);
|
sub.assign(means[basePrevIndex][baseCurIndex], F.minus);
|
||||||
|
|
||||||
DoubleMatrix1D Ax = alg.mult(inverseCovariances[baseCurIndex], sub);
|
DoubleMatrix1D Ax = alg.mult(inverseCovariances[basePrevIndex][baseCurIndex], sub);
|
||||||
double exparg = -0.5*alg.mult(sub, Ax);
|
double exparg = -0.5*alg.mult(sub, Ax);
|
||||||
|
|
||||||
likedist[baseCurIndex] = norm*Math.exp(exparg);
|
likedist[basePrevIndex][baseCurIndex] = norm*Math.exp(exparg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return likedist;
|
return likedist;
|
||||||
|
|
@ -166,66 +185,33 @@ public class BasecallingBaseModel {
|
||||||
try {
|
try {
|
||||||
PrintWriter writer = new PrintWriter(outparam);
|
PrintWriter writer = new PrintWriter(outparam);
|
||||||
|
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
for (int basePrevIndex = 0; basePrevIndex < numTheories; basePrevIndex++) {
|
||||||
writer.print("mean_" + baseIndexToBase(baseCurIndex) + " = c(");
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
for (int channel = 0; channel < 4; channel++) {
|
writer.print("mean_" + BaseUtils.baseIndexToSimpleBase(baseCurIndex) + " = c(");
|
||||||
writer.print(sums[baseCurIndex].getQuick(channel)/counts[baseCurIndex]);
|
for (int channel = 0; channel < 4; channel++) {
|
||||||
|
writer.print(sums[basePrevIndex][baseCurIndex].getQuick(channel)/counts[basePrevIndex][baseCurIndex]);
|
||||||
|
|
||||||
if (channel < 3) {
|
if (channel < 3) {
|
||||||
writer.print(", ");
|
writer.print(", ");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
writer.println(");");
|
||||||
writer.println(");");
|
|
||||||
|
|
||||||
DoubleMatrix2D cov = unscaledCovarianceSums[baseCurIndex].copy();
|
DoubleMatrix2D cov = unscaledCovarianceSums[basePrevIndex][baseCurIndex].copy();
|
||||||
cov.assign(F.div(counts[baseCurIndex]));
|
cov.assign(F.div(counts[basePrevIndex][baseCurIndex]));
|
||||||
|
|
||||||
writer.print("cov_" + baseIndexToBase(baseCurIndex) + " = matrix(c(");
|
writer.print("cov_" + BaseUtils.baseIndexToSimpleBase(baseCurIndex) + " = matrix(c(");
|
||||||
for (int channel1 = 0; channel1 < 4; channel1++) {
|
for (int channel1 = 0; channel1 < 4; channel1++) {
|
||||||
for (int channel2 = 0; channel2 < 4; channel2++) {
|
for (int channel2 = 0; channel2 < 4; channel2++) {
|
||||||
writer.print(cov.get(channel2, channel1) + (channel1 == 3 && channel2 == 3 ? "" : ","));
|
writer.print(cov.get(channel2, channel1) + (channel1 == 3 && channel2 == 3 ? "" : ","));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
writer.println("), nr=4, nc=4);\n");
|
||||||
}
|
}
|
||||||
writer.println("), nr=4, nc=4);\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
writer.close();
|
writer.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3);
|
|
||||||
*
|
|
||||||
* @param base
|
|
||||||
* @return 0, 1, 2, 3, or -1 if the base can't be understood.
|
|
||||||
*/
|
|
||||||
private int baseToBaseIndex(char base) {
|
|
||||||
switch (base) {
|
|
||||||
case 'A':
|
|
||||||
case 'a':
|
|
||||||
case '*': return 0;
|
|
||||||
|
|
||||||
case 'C':
|
|
||||||
case 'c': return 1;
|
|
||||||
|
|
||||||
case 'G':
|
|
||||||
case 'g': return 2;
|
|
||||||
|
|
||||||
case 'T':
|
|
||||||
case 't': return 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
private char baseIndexToBase(int baseIndex) {
|
|
||||||
switch (baseIndex) {
|
|
||||||
case 0: return 'A';
|
|
||||||
case 1: return 'C';
|
|
||||||
case 2: return 'G';
|
|
||||||
case 3: return 'T';
|
|
||||||
default: return '.';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
package org.broadinstitute.sting.playground.fourbasecaller;
|
package org.broadinstitute.sting.playground.fourbasecaller;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -14,34 +17,47 @@ import java.io.File;
|
||||||
*/
|
*/
|
||||||
public class BasecallingReadModel {
|
public class BasecallingReadModel {
|
||||||
private BasecallingBaseModel[] basemodels = null;
|
private BasecallingBaseModel[] basemodels = null;
|
||||||
|
private boolean correctForContext = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor for BasecallingReadModel.
|
* Constructor for BasecallingReadModel.
|
||||||
*
|
*
|
||||||
* @param readLength the length of the read that this model will support
|
* @param readLength the length of the read that this model will support
|
||||||
*/
|
*/
|
||||||
public BasecallingReadModel(int readLength) {
|
public BasecallingReadModel(int readLength, boolean correctForContext) {
|
||||||
|
this.correctForContext = correctForContext;
|
||||||
|
|
||||||
basemodels = new BasecallingBaseModel[readLength];
|
basemodels = new BasecallingBaseModel[readLength];
|
||||||
|
|
||||||
for (int i = 0; i < readLength; i++) {
|
for (int cycle = 0; cycle < readLength; cycle++) {
|
||||||
basemodels[i] = new BasecallingBaseModel();
|
basemodels[cycle] = new BasecallingBaseModel(cycle == 0 ? false : correctForContext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a single training point to the model.
|
* Add a single training point to the model means.
|
||||||
*
|
*
|
||||||
* @param cycle the cycle for which this point should be added
|
* @param cycle the cycle for which this point should be added
|
||||||
|
* @param basePrev the previous base
|
||||||
* @param baseCur the current base
|
* @param baseCur the current base
|
||||||
* @param qualCur the current base's quality
|
* @param qualCur the current base's quality
|
||||||
* @param fourintensity the four intensities of the current base
|
* @param fourintensity the four intensities of the current base
|
||||||
*/
|
*/
|
||||||
public void addMeanPoint(int cycle, char baseCur, byte qualCur, double[] fourintensity) {
|
public void addMeanPoint(int cycle, char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
|
||||||
basemodels[cycle].addMeanPoint(baseCur, qualCur, fourintensity);
|
basemodels[cycle].addMeanPoint(basePrev, baseCur, qualCur, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addCovariancePoint(int cycle, char baseCur, byte qualCur, double[] fourintensity) {
|
/**
|
||||||
basemodels[cycle].addCovariancePoint(baseCur, qualCur, fourintensity);
|
* Add a single training point to the model covariances.
|
||||||
|
*
|
||||||
|
* @param cycle the cycle for which this point should be added
|
||||||
|
* @param basePrev the previous base
|
||||||
|
* @param baseCur the current base
|
||||||
|
* @param qualCur the current base's quality
|
||||||
|
* @param fourintensity the four intensities of the current base
|
||||||
|
*/
|
||||||
|
public void addCovariancePoint(int cycle, char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
|
||||||
|
basemodels[cycle].addCovariancePoint(basePrev, baseCur, qualCur, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -51,7 +67,7 @@ public class BasecallingReadModel {
|
||||||
* @param fourintensity the four intensities for the current cycle's base
|
* @param fourintensity the four intensities for the current cycle's base
|
||||||
* @return 4x4 matrix of likelihoods
|
* @return 4x4 matrix of likelihoods
|
||||||
*/
|
*/
|
||||||
public double[] computeLikelihoods(int cycle, double[] fourintensity) {
|
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
||||||
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -63,15 +79,48 @@ public class BasecallingReadModel {
|
||||||
* @return an instance of FourProb, which encodes a base hypothesis, its probability,
|
* @return an instance of FourProb, which encodes a base hypothesis, its probability,
|
||||||
* and the ranking among the other hypotheses
|
* and the ranking among the other hypotheses
|
||||||
*/
|
*/
|
||||||
public FourProb computeProbabilities(int cycle, double[] fourintensity) {
|
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
|
||||||
double[] likes = computeLikelihoods(cycle, fourintensity);
|
double[][] likes = computeLikelihoods(cycle, fourintensity);
|
||||||
|
|
||||||
double total = 0;
|
double total = 0;
|
||||||
|
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { total += likes[baseCurIndex]; }
|
for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) {
|
||||||
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { likes[baseCurIndex] /= total; }
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
double prior = 1.0;
|
||||||
|
if (correctForContext) {
|
||||||
|
double prob = QualityUtils.qualToProb(qualPrev);
|
||||||
|
if (basePrevIndex == BaseUtils.simpleBaseToBaseIndex(basePrev)) {
|
||||||
|
prior = prob;
|
||||||
|
} else {
|
||||||
|
prior = (1.0 - prob)/((double) (4*likes.length - 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
likes[basePrevIndex][baseCurIndex] = prior*likes[basePrevIndex][baseCurIndex];
|
||||||
|
total += likes[basePrevIndex][baseCurIndex];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return new FourProb(likes);
|
for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) {
|
||||||
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
likes[basePrevIndex][baseCurIndex] /= total;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
FourProb fp = new FourProb(likes);
|
||||||
|
|
||||||
|
/*
|
||||||
|
System.out.println(fp.baseAtRank(0) + ":");
|
||||||
|
for (int basePrevIndex = 0; basePrevIndex < likes.length; basePrevIndex++) {
|
||||||
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
System.out.print(likes[basePrevIndex][baseCurIndex] + " ");
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
*/
|
||||||
|
|
||||||
|
//return new FourProb(likes);
|
||||||
|
return fp;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
public int CALLING_LIMIT = 1000000000;
|
public int CALLING_LIMIT = 1000000000;
|
||||||
public Boolean RAW = false;
|
public Boolean RAW = false;
|
||||||
public Boolean OLD = false;
|
public Boolean OLD = false;
|
||||||
|
public Boolean CONTEXT = false;
|
||||||
|
|
||||||
public static void main(String[] argv) {
|
public static void main(String[] argv) {
|
||||||
Instance = new FourBaseRecaller();
|
Instance = new FourBaseRecaller();
|
||||||
|
|
@ -46,6 +47,7 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
m_parser.addOptionalArg("clim", "C", "Number of reads to basecall", "CALLING_LIMIT");
|
m_parser.addOptionalArg("clim", "C", "Number of reads to basecall", "CALLING_LIMIT");
|
||||||
m_parser.addOptionalFlag("raw", "R", "Use raw intensities?", "RAW");
|
m_parser.addOptionalFlag("raw", "R", "Use raw intensities?", "RAW");
|
||||||
m_parser.addOptionalFlag("old", "1", "Old Bustard 1.1 mode?", "OLD");
|
m_parser.addOptionalFlag("old", "1", "Old Bustard 1.1 mode?", "OLD");
|
||||||
|
m_parser.addOptionalFlag("context", "X", "Correct for context?", "CONTEXT");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected int execute() {
|
protected int execute() {
|
||||||
|
|
@ -72,7 +74,7 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
fread = ffp.next();
|
fread = ffp.next();
|
||||||
|
|
||||||
int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
|
int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
|
||||||
BasecallingReadModel model = new BasecallingReadModel(bread.getFirstReadSequence().length());
|
BasecallingReadModel model = new BasecallingReadModel(bread.getFirstReadSequence().length(), CONTEXT);
|
||||||
int queryid;
|
int queryid;
|
||||||
|
|
||||||
// learn mean parameters
|
// learn mean parameters
|
||||||
|
|
@ -90,9 +92,10 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
||||||
char baseCur = bases.charAt(cycle);
|
char baseCur = bases.charAt(cycle);
|
||||||
byte qualCur = quals[cycle];
|
byte qualCur = quals[cycle];
|
||||||
|
//double[] fourprob = getBaseProbabilityDistribution(bases.charAt(cycle), quals[cycle]);
|
||||||
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
model.addMeanPoint(cycle, baseCur, qualCur, fourintensity);
|
model.addMeanPoint(cycle, cycle == 0 ? '.' : bases.charAt(cycle - 1), baseCur, qualCur, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
queryid++;
|
queryid++;
|
||||||
|
|
@ -118,9 +121,10 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
||||||
char baseCur = bases.charAt(cycle);
|
char baseCur = bases.charAt(cycle);
|
||||||
byte qualCur = quals[cycle];
|
byte qualCur = quals[cycle];
|
||||||
|
//double[] fourprob = getBaseProbabilityDistribution(bases.charAt(cycle), quals[cycle]);
|
||||||
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
model.addCovariancePoint(cycle, baseCur, qualCur, fourintensity);
|
model.addCovariancePoint(cycle, cycle == 0 ? '.' : bases.charAt(cycle - 1), baseCur, qualCur, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
queryid++;
|
queryid++;
|
||||||
|
|
@ -143,7 +147,7 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
int[][] base_counts = new int[4][bread.getFirstReadSequence().length()];
|
int[][] base_counts = new int[4][bread.getFirstReadSequence().length()];
|
||||||
int bases_incorrect = 0, bases_total = 0;
|
int bases_incorrect = 0, bases_total = 0;
|
||||||
|
|
||||||
if (debugout != null) { debugout.println("cycle int_a int_c int_g int_t bustard_base kiran_base bustard_prob kiran_prob"); }
|
if (debugout != null) { debugout.println("cycle int_a int_c int_g int_t bustard_base kiran_base bustard_prob kiran_prob kiran_base_prev"); }
|
||||||
|
|
||||||
queryid = 0;
|
queryid = 0;
|
||||||
do {
|
do {
|
||||||
|
|
@ -161,14 +165,17 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
||||||
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
FourProb fp = model.computeProbabilities(cycle, fourintensity);
|
char basePrev = (cycle == 0 ? '.' : (char) asciiseq[cycle - 1]);
|
||||||
|
byte qualPrev = (cycle == 0 ? 40 : bestqual[cycle - 1]);
|
||||||
|
|
||||||
|
FourProb fp = model.computeProbabilities(cycle, basePrev, qualPrev, fourintensity);
|
||||||
|
|
||||||
asciiseq[cycle] = (byte) fp.baseAtRank(0);
|
asciiseq[cycle] = (byte) fp.baseAtRank(0);
|
||||||
bestqual[cycle] = fp.qualAtRank(0);
|
bestqual[cycle] = fp.qualAtRank(0);
|
||||||
nextbestqual[cycle] = QualityUtils.baseAndProbToCompressedQuality(fp.indexAtRank(1), fp.probAtRank(1));
|
nextbestqual[cycle] = QualityUtils.baseAndProbToCompressedQuality(fp.indexAtRank(1), fp.probAtRank(1));
|
||||||
|
|
||||||
if (debugout != null && bases.charAt(cycle) != '.' && base_counts[BaseUtils.simpleBaseToBaseIndex(bases.charAt(cycle))][cycle] < 1000) {
|
if (debugout != null && bases.charAt(cycle) != '.' && base_counts[BaseUtils.simpleBaseToBaseIndex(bases.charAt(cycle))][cycle] < 1000) {
|
||||||
debugout.println(cycle + " " + fourintensity[0] + " " + fourintensity[1] + " " + fourintensity[2] + " " + fourintensity[3] + " " + (bases.charAt(cycle)) + " " + ((char) asciiseq[cycle]) + " " + bestqual[cycle] + " " + quals[cycle]);
|
debugout.println(cycle + " " + fourintensity[0] + " " + fourintensity[1] + " " + fourintensity[2] + " " + fourintensity[3] + " " + (bases.charAt(cycle)) + " " + ((char) asciiseq[cycle]) + " " + bestqual[cycle] + " " + quals[cycle] + " " + basePrev);
|
||||||
|
|
||||||
base_counts[BaseUtils.simpleBaseToBaseIndex(bases.charAt(cycle))][cycle]++;
|
base_counts[BaseUtils.simpleBaseToBaseIndex(bases.charAt(cycle))][cycle]++;
|
||||||
}
|
}
|
||||||
|
|
@ -188,11 +195,27 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
}
|
}
|
||||||
sfw.close();
|
sfw.close();
|
||||||
|
|
||||||
System.out.println("Nonmatch rate: " + ((double) bases_incorrect)/((double) bases_total));
|
System.out.println("Disagreement rate: " + ((double) bases_incorrect)/((double) bases_total));
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private double[] getBaseProbabilityDistribution(char base, byte qual) {
|
||||||
|
double[] dist = new double[4];
|
||||||
|
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
dist[baseIndex] = QualityUtils.qualToProb(qual);
|
||||||
|
double residualprob = (1.0 - dist[baseIndex])/3.0;
|
||||||
|
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
if (i != baseIndex) {
|
||||||
|
dist[i] = residualprob;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return dist;
|
||||||
|
}
|
||||||
|
|
||||||
private SAMRecord constructSAMRecord(String readNamePrefix, String bases, byte[] bestqual, byte[] nextbestqual, BustardReadData bread, SAMFileHeader sfh) {
|
private SAMRecord constructSAMRecord(String readNamePrefix, String bases, byte[] bestqual, byte[] nextbestqual, BustardReadData bread, SAMFileHeader sfh) {
|
||||||
SAMRecord sr = new SAMRecord(sfh);
|
SAMRecord sr = new SAMRecord(sfh);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,9 +15,16 @@ public class FourProb {
|
||||||
/**
|
/**
|
||||||
* Constructor for FourProb.
|
* Constructor for FourProb.
|
||||||
*
|
*
|
||||||
* @param baseProbs the unsorted base hypothesis probabilities (in ACGT order).
|
* @param baseLikes the unsorted base hypothesis probabilities (in ACGT order).
|
||||||
*/
|
*/
|
||||||
public FourProb(double[] baseProbs) {
|
public FourProb(double[][] baseLikes) {
|
||||||
|
double[] baseProbs = new double[4];
|
||||||
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
for (int basePrevIndex = 0; basePrevIndex < baseLikes.length; basePrevIndex++) {
|
||||||
|
baseProbs[baseCurIndex] += baseLikes[basePrevIndex][baseCurIndex];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int[] baseIndices = {0, 1, 2, 3};
|
int[] baseIndices = {0, 1, 2, 3};
|
||||||
|
|
||||||
Integer[] perm = Utils.SortPermutation(baseProbs);
|
Integer[] perm = Utils.SortPermutation(baseProbs);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue