Now basecalls an entire read (both ends of the pair, barcode... everything) at once. After, RawRead and FourProbRead can be asked to return a specified subset (corresponding to the ranges specified for each end of the read.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@754 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e293d65ede
commit
6c5fbb988b
|
|
@ -18,43 +18,37 @@ import java.util.ArrayList;
|
||||||
*/
|
*/
|
||||||
public class BasecallingReadModel {
|
public class BasecallingReadModel {
|
||||||
private BasecallingBaseModel[] basemodels = null;
|
private BasecallingBaseModel[] basemodels = null;
|
||||||
private boolean correctForContext = false;
|
private boolean correctForContext = true;
|
||||||
|
|
||||||
/**
|
public BasecallingReadModel(int readLength) {
|
||||||
* Constructor for BasecallingReadModel.
|
initialize(readLength);
|
||||||
*
|
}
|
||||||
* @param readLength the length of the read that this model will support
|
|
||||||
*/
|
|
||||||
public BasecallingReadModel(int readLength, boolean correctForContext) {
|
|
||||||
this.correctForContext = correctForContext;
|
|
||||||
|
|
||||||
|
public BasecallingReadModel(ArrayList<RawRead> trainingData) {
|
||||||
|
initialize(trainingData.get(0).getReadLength());
|
||||||
|
|
||||||
|
train(trainingData);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void initialize(int readLength) {
|
||||||
basemodels = new BasecallingBaseModel[readLength];
|
basemodels = new BasecallingBaseModel[readLength];
|
||||||
|
|
||||||
for (int cycle = 0; cycle < readLength; cycle++) {
|
for (int cycle = 0; cycle < readLength; cycle++) {
|
||||||
basemodels[cycle] = new BasecallingBaseModel(cycle == 0 ? false : correctForContext);
|
basemodels[cycle] = new BasecallingBaseModel(cycle != 0 && correctForContext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void train(BasecallingTrainingSet trainingSet) {
|
public void train(ArrayList<RawRead> trainingData) {
|
||||||
ArrayList<RawRead> trainingData = trainingSet.getTrainingData();
|
//for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
||||||
|
for ( RawRead read : trainingData ) {
|
||||||
for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
|
||||||
RawRead read = trainingData.get(readIndex);
|
|
||||||
addMeanPoints(read);
|
addMeanPoints(read);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
for ( RawRead read : trainingData ) {
|
||||||
RawRead read = trainingData.get(readIndex);
|
|
||||||
addCovariancePoints(read);
|
addCovariancePoints(read);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Add a single training point to the model means.
|
|
||||||
*
|
|
||||||
* @param cycle the cycle for which this point should be added
|
|
||||||
* @param fourintensity the four intensities of the current base
|
|
||||||
*/
|
|
||||||
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
||||||
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
|
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
@ -80,12 +74,6 @@ public class BasecallingReadModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Add a single training point to the model covariances.
|
|
||||||
*
|
|
||||||
* @param cycle the cycle for which this point should be added
|
|
||||||
* @param fourintensity the four intensities of the current base
|
|
||||||
*/
|
|
||||||
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
||||||
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
|
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
@ -111,25 +99,10 @@ public class BasecallingReadModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the likelihood matrix for a given cycle.
|
|
||||||
*
|
|
||||||
* @param cycle the cycle number for the current base
|
|
||||||
* @param fourintensity the four intensities for the current cycle's base
|
|
||||||
* @return 4x4 matrix of likelihoods
|
|
||||||
*/
|
|
||||||
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
||||||
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the probability distribution for the base at a given cycle.
|
|
||||||
*
|
|
||||||
* @param cycle the cycle number for the current base
|
|
||||||
* @param fourintensity the four intensities for the current cycle's base
|
|
||||||
* @return an instance of FourProb, which encodes a base hypothesis, its probability,
|
|
||||||
* and the ranking among the other hypotheses
|
|
||||||
*/
|
|
||||||
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
|
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
|
||||||
double[][] likes = computeLikelihoods(cycle, fourintensity);
|
double[][] likes = computeLikelihoods(cycle, fourintensity);
|
||||||
|
|
||||||
|
|
@ -157,9 +130,7 @@ public class BasecallingReadModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FourProb fp = new FourProb(likes);
|
return new FourProb(likes);
|
||||||
|
|
||||||
return fp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public FourProbRead call(RawRead read) {
|
public FourProbRead call(RawRead read) {
|
||||||
|
|
@ -174,7 +145,6 @@ public class BasecallingReadModel {
|
||||||
fourIntensity[channel] = (double) read.getIntensities()[cycle][channel];
|
fourIntensity[channel] = (double) read.getIntensities()[cycle][channel];
|
||||||
}
|
}
|
||||||
|
|
||||||
//fps[cycle] = computeProbabilities(cycle, basePrev, qualPrev, fourIntensity);
|
|
||||||
fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity));
|
fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -182,15 +152,6 @@ public class BasecallingReadModel {
|
||||||
return fpr;
|
return fpr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the base probability matrix
|
|
||||||
*
|
|
||||||
* @param cycle the cycle of the base
|
|
||||||
* @param basePrev the previous base
|
|
||||||
* @param baseCur the current base
|
|
||||||
* @param probCur the probability of the current base
|
|
||||||
* @return the base probability matrix (1x4 if no contextual correction, 4x4 otherwise)
|
|
||||||
*/
|
|
||||||
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
|
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
|
||||||
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
|
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
|
||||||
|
|
||||||
|
|
@ -208,11 +169,6 @@ public class BasecallingReadModel {
|
||||||
return dist;
|
return dist;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Writes model parameters to a file per cycle.
|
|
||||||
*
|
|
||||||
* @param dir the directory where the parameters should be written
|
|
||||||
*/
|
|
||||||
public void write(File dir) {
|
public void write(File dir) {
|
||||||
for (int cycle = 0; cycle < basemodels.length; cycle++) {
|
for (int cycle = 0; cycle < basemodels.length; cycle++) {
|
||||||
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");
|
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,23 @@ public class FourProbRead extends ArrayList<FourProb> {
|
||||||
super(initialCapacity);
|
super(initialCapacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a subset of the FourProbRead.
|
||||||
|
*
|
||||||
|
* @param cycleStart the starting cycle (0-based, inclusive)
|
||||||
|
* @param cycleStop the ending cycle (0-based, inclusive)
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public FourProbRead getSubset(int cycleStart, int cycleStop) {
|
||||||
|
FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1);
|
||||||
|
|
||||||
|
for (int cycle = cycleStart, offset = 0; cycle <= cycleStop; cycle++, offset++) {
|
||||||
|
subset.add(offset, this.get(cycle));
|
||||||
|
}
|
||||||
|
|
||||||
|
return subset;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the read sequence at a specified rank
|
* Get the read sequence at a specified rank
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -13,12 +13,10 @@ import java.util.Iterator;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
|
public class IlluminaParser implements Closeable {
|
||||||
private File bustardDir;
|
private File bustardDir;
|
||||||
private File firecrestDir;
|
private File firecrestDir;
|
||||||
private int lane;
|
private int lane;
|
||||||
private int cycleBegin;
|
|
||||||
private int cycleEnd;
|
|
||||||
|
|
||||||
private File[] intfiles;
|
private File[] intfiles;
|
||||||
private File[] seqfiles;
|
private File[] seqfiles;
|
||||||
|
|
@ -26,25 +24,20 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
|
||||||
|
|
||||||
private int currentTileIndex;
|
private int currentTileIndex;
|
||||||
private PasteParser currentTileParser;
|
private PasteParser currentTileParser;
|
||||||
|
private String[][] currentParseResult;
|
||||||
|
|
||||||
private boolean iterating = false;
|
public IlluminaParser(File bustardDir, int lane) {
|
||||||
|
|
||||||
public IlluminaParser(File bustardDir, int lane, int cycleBegin, int cycleEnd) {
|
|
||||||
this.bustardDir = bustardDir;
|
this.bustardDir = bustardDir;
|
||||||
this.firecrestDir = bustardDir.getParentFile();
|
this.firecrestDir = bustardDir.getParentFile();
|
||||||
this.lane = lane;
|
this.lane = lane;
|
||||||
this.cycleBegin = cycleBegin;
|
|
||||||
this.cycleEnd = cycleEnd;
|
|
||||||
|
|
||||||
initializeParser();
|
initializeParser();
|
||||||
}
|
}
|
||||||
|
|
||||||
public IlluminaParser(File bustardDir, File firecrestDir, int lane, int cycleBegin, int cycleEnd) {
|
public IlluminaParser(File bustardDir, File firecrestDir, int lane) {
|
||||||
this.bustardDir = bustardDir;
|
this.bustardDir = bustardDir;
|
||||||
this.firecrestDir = firecrestDir;
|
this.firecrestDir = firecrestDir;
|
||||||
this.lane = lane;
|
this.lane = lane;
|
||||||
this.cycleBegin = cycleBegin;
|
|
||||||
this.cycleEnd = cycleEnd;
|
|
||||||
|
|
||||||
initializeParser();
|
initializeParser();
|
||||||
}
|
}
|
||||||
|
|
@ -67,10 +60,9 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
|
||||||
Arrays.sort(seqfiles, getTileSortingComparator());
|
Arrays.sort(seqfiles, getTileSortingComparator());
|
||||||
Arrays.sort(prbfiles, getTileSortingComparator());
|
Arrays.sort(prbfiles, getTileSortingComparator());
|
||||||
|
|
||||||
iterator();
|
seekToTile(1);
|
||||||
|
|
||||||
// Todo: put some more consistency checks here
|
// Todo: put some more consistency checks here
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private FilenameFilter getFilenameFilter(final String suffix) {
|
private FilenameFilter getFilenameFilter(final String suffix) {
|
||||||
|
|
@ -129,30 +121,33 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
|
||||||
return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
|
return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
public RawRead next() {
|
public boolean next() {
|
||||||
if (hasNext()) {
|
if (hasNext()) {
|
||||||
return new RawRead(currentTileParser.next(), cycleBegin, cycleEnd);
|
currentParseResult = currentTileParser.next();
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String[][] getCurrentParseResult() {
|
||||||
|
return currentParseResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void remove() {
|
public void remove() {
|
||||||
throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
|
throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
|
||||||
}
|
}
|
||||||
|
|
||||||
public Iterator<RawRead> iterator() {
|
|
||||||
if (iterating) {
|
|
||||||
throw new IllegalStateException("IlluminaParser.iterator() method can only be called once, before the first call to IlluminaParser.hasNext()");
|
|
||||||
}
|
|
||||||
|
|
||||||
seekToTile(1);
|
|
||||||
iterating = true;
|
|
||||||
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() {
|
public void close() {
|
||||||
currentTileParser.close();
|
currentTileParser.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public RawRead getRawRead() {
|
||||||
|
return getSubset(0, currentParseResult[1][4].length() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public RawRead getSubset(int cycleStart, int cycleStop) {
|
||||||
|
return new RawRead(currentParseResult, cycleStart, cycleStop);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,12 +29,12 @@ public class RawRead {
|
||||||
x = Short.valueOf(pastedReadString[0][2]);
|
x = Short.valueOf(pastedReadString[0][2]);
|
||||||
y = Short.valueOf(pastedReadString[0][3]);
|
y = Short.valueOf(pastedReadString[0][3]);
|
||||||
|
|
||||||
sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd).getBytes();
|
sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd + 1).getBytes();
|
||||||
|
|
||||||
quals = new byte[sequence.length];
|
quals = new byte[sequence.length];
|
||||||
intensities = new short[sequence.length][4];
|
intensities = new short[sequence.length][4];
|
||||||
|
|
||||||
for (int cycle = 0; cycle < sequence.length; cycle++) {
|
for (int cycle = cycleBegin, offset = 0; cycle <= cycleEnd; cycle++, offset++) {
|
||||||
byte maxQual = -50;
|
byte maxQual = -50;
|
||||||
|
|
||||||
for (int fullReadIndex = 4*cycle; fullReadIndex < 4*cycle + 4; fullReadIndex++) {
|
for (int fullReadIndex = 4*cycle; fullReadIndex < 4*cycle + 4; fullReadIndex++) {
|
||||||
|
|
@ -43,13 +43,13 @@ public class RawRead {
|
||||||
if (qual > maxQual) { maxQual = qual; }
|
if (qual > maxQual) { maxQual = qual; }
|
||||||
}
|
}
|
||||||
|
|
||||||
quals[cycle] = maxQual >= 0 ? maxQual : 0;
|
quals[offset] = maxQual >= 0 ? maxQual : 0;
|
||||||
|
|
||||||
for (int fullReadIndex = 4*cycle + 4, channel = 0; fullReadIndex < 4*cycle + 8; fullReadIndex++, channel++) {
|
for (int fullReadIndex = 4*cycle + 4, channel = 0; fullReadIndex < 4*cycle + 8; fullReadIndex++, channel++) {
|
||||||
double doubleChannelIntensity = Double.valueOf(pastedReadString[0][fullReadIndex]);
|
double doubleChannelIntensity = Double.valueOf(pastedReadString[0][fullReadIndex]);
|
||||||
short shortChannelIntensity = (short) doubleChannelIntensity;
|
short shortChannelIntensity = (short) doubleChannelIntensity;
|
||||||
|
|
||||||
intensities[cycle][channel] = shortChannelIntensity;
|
intensities[offset][channel] = shortChannelIntensity;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue