Added some debugging stuff (writes model parameters to one file per cycle).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@304 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0fc8a90553
commit
5a5c6d1276
|
|
@ -8,6 +8,8 @@ import cern.colt.matrix.linalg.Algebra;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* BasecallingBaseModel is a class that represents the statistical
|
* BasecallingBaseModel is a class that represents the statistical
|
||||||
* model for all bases at a given cycle. It allows for easy, one
|
* model for all bases at a given cycle. It allows for easy, one
|
||||||
|
|
@ -59,15 +61,34 @@ public class BasecallingBaseModel {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a single training point to the model.
|
* Add a single training point to the model.
|
||||||
|
*
|
||||||
* @param basePrev the previous cycle's base call (A, C, G, T, or * for the first cycle)
|
* @param basePrev the previous cycle's base call (A, C, G, T, or * for the first cycle)
|
||||||
* @param baseCur the current cycle's base call (A, C, G, T)
|
* @param baseCur the current cycle's base call (A, C, G, T)
|
||||||
* @param qualCur the quality score for the current cycle's base call
|
* @param qualCur the quality score for the current cycle's base call
|
||||||
* @param fourintensity the four intensities for the current cycle's base call
|
* @param fourintensity the four intensities for the current cycle's base call
|
||||||
*/
|
*/
|
||||||
public void addTrainingPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
|
public void addTrainingPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
|
||||||
int basePrevIndex = baseToBaseIndex(basePrev);
|
int actualBasePrevIndex = baseToBaseIndex(basePrev);
|
||||||
int baseCurIndex = baseToBaseIndex(baseCur);
|
int actualBaseCurIndex = baseToBaseIndex(baseCur);
|
||||||
|
double actualWeight = QualityUtils.qualToProb(qualCur);
|
||||||
|
double otherTheories = (basePrev == '*') ? 3.0 : 15.0;
|
||||||
|
|
||||||
|
cern.jet.math.Functions F = cern.jet.math.Functions.functions;
|
||||||
|
|
||||||
|
for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) {
|
||||||
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
// We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses.
|
||||||
|
double weight = (basePrevIndex == actualBasePrevIndex && baseCurIndex == actualBaseCurIndex) ? actualWeight : ((1.0 - actualWeight)/otherTheories);
|
||||||
|
|
||||||
|
DoubleMatrix1D weightedChannelIntensities = (DoubleFactory1D.dense).make(fourintensity);
|
||||||
|
weightedChannelIntensities.assign(F.mult(weight));
|
||||||
|
|
||||||
|
runningChannelSums[basePrevIndex][baseCurIndex].assign(weightedChannelIntensities, F.plus);
|
||||||
|
counts[basePrevIndex][baseCurIndex] += weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
if (basePrevIndex >= 0 && baseCurIndex >= 0) {
|
if (basePrevIndex >= 0 && baseCurIndex >= 0) {
|
||||||
for (int channel = 0; channel < 4; channel++) {
|
for (int channel = 0; channel < 4; channel++) {
|
||||||
double weight = QualityUtils.qualToProb(qualCur);
|
double weight = QualityUtils.qualToProb(qualCur);
|
||||||
|
|
@ -83,6 +104,7 @@ public class BasecallingBaseModel {
|
||||||
|
|
||||||
counts[basePrevIndex][baseCurIndex]++;
|
counts[basePrevIndex][baseCurIndex]++;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
readyToCall = false;
|
readyToCall = false;
|
||||||
}
|
}
|
||||||
|
|
@ -148,6 +170,27 @@ public class BasecallingBaseModel {
|
||||||
return probdist;
|
return probdist;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void write(File outparam) {
|
||||||
|
try {
|
||||||
|
PrintWriter writer = new PrintWriter(outparam);
|
||||||
|
|
||||||
|
for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) {
|
||||||
|
for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
|
||||||
|
writer.print("mean_" + baseIndexToBase(basePrevIndex) + "" + baseIndexToBase(baseCurIndex) + " : [ ");
|
||||||
|
for (int channel = 0; channel < 4; channel++) {
|
||||||
|
writer.print(runningChannelSums[basePrevIndex][baseCurIndex].getQuick(channel)/counts[basePrevIndex][baseCurIndex]);
|
||||||
|
writer.print(" ");
|
||||||
|
}
|
||||||
|
writer.print("]\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3);
|
* Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3);
|
||||||
* @param base
|
* @param base
|
||||||
|
|
@ -171,4 +214,14 @@ public class BasecallingBaseModel {
|
||||||
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private char baseIndexToBase(int baseIndex) {
|
||||||
|
switch (baseIndex) {
|
||||||
|
case 0: return 'A';
|
||||||
|
case 1: return 'C';
|
||||||
|
case 2: return 'G';
|
||||||
|
case 3: return 'T';
|
||||||
|
default: return '.';
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
package org.broadinstitute.sting.playground.fourbasecaller;
|
package org.broadinstitute.sting.playground.fourbasecaller;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* BasecallingReadModel represents the statistical models for
|
* BasecallingReadModel represents the statistical models for
|
||||||
* all bases in all cycles. It allows for easy, one-pass
|
* all bases in all cycles. It allows for easy, one-pass
|
||||||
|
|
@ -28,6 +30,7 @@ public class BasecallingReadModel {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a single training point to the model.
|
* Add a single training point to the model.
|
||||||
|
*
|
||||||
* @param cycle the cycle for which this point should be added
|
* @param cycle the cycle for which this point should be added
|
||||||
* @param basePrev the previous base
|
* @param basePrev the previous base
|
||||||
* @param baseCur the current base
|
* @param baseCur the current base
|
||||||
|
|
@ -82,4 +85,16 @@ public class BasecallingReadModel {
|
||||||
|
|
||||||
return new FourProb(baseindices, probs);
|
return new FourProb(baseindices, probs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes model parameters to a file per cycle.
|
||||||
|
*
|
||||||
|
* @param dir the directory where the parameters should be written
|
||||||
|
*/
|
||||||
|
public void write(File dir) {
|
||||||
|
for (int cycle = 0; cycle < basemodels.length; cycle++) {
|
||||||
|
File outparam = new File(dir.getPath() + "/param." + cycle);
|
||||||
|
basemodels[cycle].write(outparam);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
bread = bfp.next();
|
bread = bfp.next();
|
||||||
|
|
||||||
int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
|
int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
|
||||||
BasecallingReadModel p = new BasecallingReadModel(bread.getFirstReadSequence().length());
|
BasecallingReadModel model = new BasecallingReadModel(bread.getFirstReadSequence().length());
|
||||||
int queryid;
|
int queryid;
|
||||||
|
|
||||||
// learn initial parameters
|
// learn initial parameters
|
||||||
|
|
@ -61,12 +61,16 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
byte qualCur = quals[cycle];
|
byte qualCur = quals[cycle];
|
||||||
double[] fourintensity = intensities[cycle + cycle_offset];
|
double[] fourintensity = intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
p.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity);
|
model.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
queryid++;
|
queryid++;
|
||||||
} while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);
|
} while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);
|
||||||
|
|
||||||
|
File debugout = new File(OUT.getParentFile().getPath() + "/model/");
|
||||||
|
debugout.mkdir();
|
||||||
|
model.write(debugout);
|
||||||
|
|
||||||
// call bases
|
// call bases
|
||||||
SAMFileHeader sfh = new SAMFileHeader();
|
SAMFileHeader sfh = new SAMFileHeader();
|
||||||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, OUT);
|
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, OUT);
|
||||||
|
|
@ -89,7 +93,7 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
byte qualPrev = (cycle == 0) ? 0 : bestqual[cycle - 1];
|
byte qualPrev = (cycle == 0) ? 0 : bestqual[cycle - 1];
|
||||||
double[] fourintensity = intensities[cycle + cycle_offset];
|
double[] fourintensity = intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
FourProb fp = p.computeProbabilities(cycle, basePrev, qualPrev, fourintensity);
|
FourProb fp = model.computeProbabilities(cycle, basePrev, qualPrev, fourintensity);
|
||||||
|
|
||||||
asciiseq[cycle] = (byte) fp.baseAtRank(0);
|
asciiseq[cycle] = (byte) fp.baseAtRank(0);
|
||||||
bestqual[cycle] = fp.qualAtRank(0);
|
bestqual[cycle] = fp.qualAtRank(0);
|
||||||
|
|
@ -102,6 +106,8 @@ public class FourBaseRecaller extends CommandLineProgram {
|
||||||
queryid++;
|
queryid++;
|
||||||
} while (queryid < CALLING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);
|
} while (queryid < CALLING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);
|
||||||
|
|
||||||
|
sfw.close();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue