Now basecalls an entire read (both ends of the pair, barcode... everything) at once. After, RawRead and FourProbRead can be asked to return a specified subset (corresponding to the ranges specified for each end of the read.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@754 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-05-20 00:09:20 +00:00
parent e293d65ede
commit 6c5fbb988b
4 changed files with 62 additions and 94 deletions

View File

@ -18,43 +18,37 @@ import java.util.ArrayList;
*/
public class BasecallingReadModel {
private BasecallingBaseModel[] basemodels = null;
private boolean correctForContext = false;
private boolean correctForContext = true;
/**
* Constructor for BasecallingReadModel.
*
* @param readLength the length of the read that this model will support
*/
public BasecallingReadModel(int readLength, boolean correctForContext) {
this.correctForContext = correctForContext;
public BasecallingReadModel(int readLength) {
initialize(readLength);
}
public BasecallingReadModel(ArrayList<RawRead> trainingData) {
initialize(trainingData.get(0).getReadLength());
train(trainingData);
}
public void initialize(int readLength) {
basemodels = new BasecallingBaseModel[readLength];
for (int cycle = 0; cycle < readLength; cycle++) {
basemodels[cycle] = new BasecallingBaseModel(cycle == 0 ? false : correctForContext);
basemodels[cycle] = new BasecallingBaseModel(cycle != 0 && correctForContext);
}
}
public void train(BasecallingTrainingSet trainingSet) {
ArrayList<RawRead> trainingData = trainingSet.getTrainingData();
for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
RawRead read = trainingData.get(readIndex);
public void train(ArrayList<RawRead> trainingData) {
//for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
for ( RawRead read : trainingData ) {
addMeanPoints(read);
}
for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
RawRead read = trainingData.get(readIndex);
for ( RawRead read : trainingData ) {
addCovariancePoints(read);
}
}
/**
* Add a single training point to the model means.
*
* @param cycle the cycle for which this point should be added
* @param fourintensity the four intensities of the current base
*/
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
}
@ -80,12 +74,6 @@ public class BasecallingReadModel {
}
}
/**
* Add a single training point to the model covariances.
*
* @param cycle the cycle for which this point should be added
* @param fourintensity the four intensities of the current base
*/
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
}
@ -111,25 +99,10 @@ public class BasecallingReadModel {
}
}
/**
* Compute the likelihood matrix for a given cycle.
*
* @param cycle the cycle number for the current base
* @param fourintensity the four intensities for the current cycle's base
* @return 4x4 matrix of likelihoods
*/
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
}
/**
* Compute the probability distribution for the base at a given cycle.
*
* @param cycle the cycle number for the current base
* @param fourintensity the four intensities for the current cycle's base
* @return an instance of FourProb, which encodes a base hypothesis, its probability,
* and the ranking among the other hypotheses
*/
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
double[][] likes = computeLikelihoods(cycle, fourintensity);
@ -157,9 +130,7 @@ public class BasecallingReadModel {
}
}
FourProb fp = new FourProb(likes);
return fp;
return new FourProb(likes);
}
public FourProbRead call(RawRead read) {
@ -174,7 +145,6 @@ public class BasecallingReadModel {
fourIntensity[channel] = (double) read.getIntensities()[cycle][channel];
}
//fps[cycle] = computeProbabilities(cycle, basePrev, qualPrev, fourIntensity);
fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity));
}
@ -182,15 +152,6 @@ public class BasecallingReadModel {
return fpr;
}
/**
* Returns the base probability matrix
*
* @param cycle the cycle of the base
* @param basePrev the previous base
* @param baseCur the current base
* @param probCur the probability of the current base
* @return the base probability matrix (1x4 if no contextual correction, 4x4 otherwise)
*/
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
@ -208,11 +169,6 @@ public class BasecallingReadModel {
return dist;
}
/**
* Writes model parameters to a file per cycle.
*
* @param dir the directory where the parameters should be written
*/
public void write(File dir) {
for (int cycle = 0; cycle < basemodels.length; cycle++) {
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");

View File

@ -18,6 +18,23 @@ public class FourProbRead extends ArrayList<FourProb> {
super(initialCapacity);
}
/**
* Returns a subset of the FourProbRead.
*
* @param cycleStart the starting cycle (0-based, inclusive)
* @param cycleStop the ending cycle (0-based, inclusive)
* @return
*/
public FourProbRead getSubset(int cycleStart, int cycleStop) {
FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1);
for (int cycle = cycleStart, offset = 0; cycle <= cycleStop; cycle++, offset++) {
subset.add(offset, this.get(cycle));
}
return subset;
}
/**
* Get the read sequence at a specified rank
*

View File

@ -13,12 +13,10 @@ import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
public class IlluminaParser implements Closeable {
private File bustardDir;
private File firecrestDir;
private int lane;
private int cycleBegin;
private int cycleEnd;
private File[] intfiles;
private File[] seqfiles;
@ -26,26 +24,21 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
private int currentTileIndex;
private PasteParser currentTileParser;
private String[][] currentParseResult;
private boolean iterating = false;
public IlluminaParser(File bustardDir, int lane, int cycleBegin, int cycleEnd) {
public IlluminaParser(File bustardDir, int lane) {
this.bustardDir = bustardDir;
this.firecrestDir = bustardDir.getParentFile();
this.lane = lane;
this.cycleBegin = cycleBegin;
this.cycleEnd = cycleEnd;
initializeParser();
}
public IlluminaParser(File bustardDir, File firecrestDir, int lane, int cycleBegin, int cycleEnd) {
public IlluminaParser(File bustardDir, File firecrestDir, int lane) {
this.bustardDir = bustardDir;
this.firecrestDir = firecrestDir;
this.lane = lane;
this.cycleBegin = cycleBegin;
this.cycleEnd = cycleEnd;
initializeParser();
}
@ -67,10 +60,9 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
Arrays.sort(seqfiles, getTileSortingComparator());
Arrays.sort(prbfiles, getTileSortingComparator());
iterator();
seekToTile(1);
// Todo: put some more consistency checks here
}
private FilenameFilter getFilenameFilter(final String suffix) {
@ -129,30 +121,33 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
}
public RawRead next() {
public boolean next() {
if (hasNext()) {
return new RawRead(currentTileParser.next(), cycleBegin, cycleEnd);
currentParseResult = currentTileParser.next();
return true;
}
return null;
return false;
}
public String[][] getCurrentParseResult() {
return currentParseResult;
}
public void remove() {
throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
}
public Iterator<RawRead> iterator() {
if (iterating) {
throw new IllegalStateException("IlluminaParser.iterator() method can only be called once, before the first call to IlluminaParser.hasNext()");
}
seekToTile(1);
iterating = true;
return this;
}
public void close() {
currentTileParser.close();
}
public RawRead getRawRead() {
return getSubset(0, currentParseResult[1][4].length() - 1);
}
public RawRead getSubset(int cycleStart, int cycleStop) {
return new RawRead(currentParseResult, cycleStart, cycleStop);
}
}

View File

@ -29,12 +29,12 @@ public class RawRead {
x = Short.valueOf(pastedReadString[0][2]);
y = Short.valueOf(pastedReadString[0][3]);
sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd).getBytes();
sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd + 1).getBytes();
quals = new byte[sequence.length];
intensities = new short[sequence.length][4];
for (int cycle = 0; cycle < sequence.length; cycle++) {
for (int cycle = cycleBegin, offset = 0; cycle <= cycleEnd; cycle++, offset++) {
byte maxQual = -50;
for (int fullReadIndex = 4*cycle; fullReadIndex < 4*cycle + 4; fullReadIndex++) {
@ -43,13 +43,13 @@ public class RawRead {
if (qual > maxQual) { maxQual = qual; }
}
quals[cycle] = maxQual >= 0 ? maxQual : 0;
quals[offset] = maxQual >= 0 ? maxQual : 0;
for (int fullReadIndex = 4*cycle + 4, channel = 0; fullReadIndex < 4*cycle + 8; fullReadIndex++, channel++) {
double doubleChannelIntensity = Double.valueOf(pastedReadString[0][fullReadIndex]);
short shortChannelIntensity = (short) doubleChannelIntensity;
intensities[cycle][channel] = shortChannelIntensity;
intensities[offset][channel] = shortChannelIntensity;
}
}
}