Now basecalls an entire read (both ends of the pair, barcode... everything) at once. After, RawRead and FourProbRead can be asked to return a specified subset (corresponding to the ranges specified for each end of the read.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@754 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e293d65ede
commit
6c5fbb988b
|
|
@ -18,43 +18,37 @@ import java.util.ArrayList;
|
|||
*/
|
||||
public class BasecallingReadModel {
|
||||
private BasecallingBaseModel[] basemodels = null;
|
||||
private boolean correctForContext = false;
|
||||
private boolean correctForContext = true;
|
||||
|
||||
/**
|
||||
* Constructor for BasecallingReadModel.
|
||||
*
|
||||
* @param readLength the length of the read that this model will support
|
||||
*/
|
||||
public BasecallingReadModel(int readLength, boolean correctForContext) {
|
||||
this.correctForContext = correctForContext;
|
||||
|
||||
public BasecallingReadModel(int readLength) {
|
||||
initialize(readLength);
|
||||
}
|
||||
|
||||
public BasecallingReadModel(ArrayList<RawRead> trainingData) {
|
||||
initialize(trainingData.get(0).getReadLength());
|
||||
|
||||
train(trainingData);
|
||||
}
|
||||
|
||||
public void initialize(int readLength) {
|
||||
basemodels = new BasecallingBaseModel[readLength];
|
||||
|
||||
for (int cycle = 0; cycle < readLength; cycle++) {
|
||||
basemodels[cycle] = new BasecallingBaseModel(cycle == 0 ? false : correctForContext);
|
||||
basemodels[cycle] = new BasecallingBaseModel(cycle != 0 && correctForContext);
|
||||
}
|
||||
}
|
||||
|
||||
public void train(BasecallingTrainingSet trainingSet) {
|
||||
ArrayList<RawRead> trainingData = trainingSet.getTrainingData();
|
||||
|
||||
for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
||||
RawRead read = trainingData.get(readIndex);
|
||||
public void train(ArrayList<RawRead> trainingData) {
|
||||
//for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
||||
for ( RawRead read : trainingData ) {
|
||||
addMeanPoints(read);
|
||||
}
|
||||
|
||||
for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) {
|
||||
RawRead read = trainingData.get(readIndex);
|
||||
for ( RawRead read : trainingData ) {
|
||||
addCovariancePoints(read);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a single training point to the model means.
|
||||
*
|
||||
* @param cycle the cycle for which this point should be added
|
||||
* @param fourintensity the four intensities of the current base
|
||||
*/
|
||||
public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
||||
basemodels[cycle].addMeanPoint(probMatrix, fourintensity);
|
||||
}
|
||||
|
|
@ -80,12 +74,6 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a single training point to the model covariances.
|
||||
*
|
||||
* @param cycle the cycle for which this point should be added
|
||||
* @param fourintensity the four intensities of the current base
|
||||
*/
|
||||
public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) {
|
||||
basemodels[cycle].addCovariancePoint(probMatrix, fourintensity);
|
||||
}
|
||||
|
|
@ -111,25 +99,10 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the likelihood matrix for a given cycle.
|
||||
*
|
||||
* @param cycle the cycle number for the current base
|
||||
* @param fourintensity the four intensities for the current cycle's base
|
||||
* @return 4x4 matrix of likelihoods
|
||||
*/
|
||||
public double[][] computeLikelihoods(int cycle, double[] fourintensity) {
|
||||
return basemodels[cycle].computeLikelihoods(cycle, fourintensity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the probability distribution for the base at a given cycle.
|
||||
*
|
||||
* @param cycle the cycle number for the current base
|
||||
* @param fourintensity the four intensities for the current cycle's base
|
||||
* @return an instance of FourProb, which encodes a base hypothesis, its probability,
|
||||
* and the ranking among the other hypotheses
|
||||
*/
|
||||
public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) {
|
||||
double[][] likes = computeLikelihoods(cycle, fourintensity);
|
||||
|
||||
|
|
@ -157,9 +130,7 @@ public class BasecallingReadModel {
|
|||
}
|
||||
}
|
||||
|
||||
FourProb fp = new FourProb(likes);
|
||||
|
||||
return fp;
|
||||
return new FourProb(likes);
|
||||
}
|
||||
|
||||
public FourProbRead call(RawRead read) {
|
||||
|
|
@ -174,7 +145,6 @@ public class BasecallingReadModel {
|
|||
fourIntensity[channel] = (double) read.getIntensities()[cycle][channel];
|
||||
}
|
||||
|
||||
//fps[cycle] = computeProbabilities(cycle, basePrev, qualPrev, fourIntensity);
|
||||
fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity));
|
||||
|
||||
}
|
||||
|
|
@ -182,15 +152,6 @@ public class BasecallingReadModel {
|
|||
return fpr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the base probability matrix
|
||||
*
|
||||
* @param cycle the cycle of the base
|
||||
* @param basePrev the previous base
|
||||
* @param baseCur the current base
|
||||
* @param probCur the probability of the current base
|
||||
* @return the base probability matrix (1x4 if no contextual correction, 4x4 otherwise)
|
||||
*/
|
||||
public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) {
|
||||
double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4];
|
||||
|
||||
|
|
@ -208,11 +169,6 @@ public class BasecallingReadModel {
|
|||
return dist;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes model parameters to a file per cycle.
|
||||
*
|
||||
* @param dir the directory where the parameters should be written
|
||||
*/
|
||||
public void write(File dir) {
|
||||
for (int cycle = 0; cycle < basemodels.length; cycle++) {
|
||||
File outparam = new File(dir.getPath() + "/param." + cycle + ".r");
|
||||
|
|
|
|||
|
|
@ -18,6 +18,23 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
super(initialCapacity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a subset of the FourProbRead.
|
||||
*
|
||||
* @param cycleStart the starting cycle (0-based, inclusive)
|
||||
* @param cycleStop the ending cycle (0-based, inclusive)
|
||||
* @return
|
||||
*/
|
||||
public FourProbRead getSubset(int cycleStart, int cycleStop) {
|
||||
FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1);
|
||||
|
||||
for (int cycle = cycleStart, offset = 0; cycle <= cycleStop; cycle++, offset++) {
|
||||
subset.add(offset, this.get(cycle));
|
||||
}
|
||||
|
||||
return subset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the read sequence at a specified rank
|
||||
*
|
||||
|
|
|
|||
|
|
@ -13,12 +13,10 @@ import java.util.Iterator;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
|
||||
public class IlluminaParser implements Closeable {
|
||||
private File bustardDir;
|
||||
private File firecrestDir;
|
||||
private int lane;
|
||||
private int cycleBegin;
|
||||
private int cycleEnd;
|
||||
|
||||
private File[] intfiles;
|
||||
private File[] seqfiles;
|
||||
|
|
@ -26,26 +24,21 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
|
|||
|
||||
private int currentTileIndex;
|
||||
private PasteParser currentTileParser;
|
||||
private String[][] currentParseResult;
|
||||
|
||||
private boolean iterating = false;
|
||||
|
||||
public IlluminaParser(File bustardDir, int lane, int cycleBegin, int cycleEnd) {
|
||||
public IlluminaParser(File bustardDir, int lane) {
|
||||
this.bustardDir = bustardDir;
|
||||
this.firecrestDir = bustardDir.getParentFile();
|
||||
this.lane = lane;
|
||||
this.cycleBegin = cycleBegin;
|
||||
this.cycleEnd = cycleEnd;
|
||||
|
||||
initializeParser();
|
||||
}
|
||||
|
||||
public IlluminaParser(File bustardDir, File firecrestDir, int lane, int cycleBegin, int cycleEnd) {
|
||||
public IlluminaParser(File bustardDir, File firecrestDir, int lane) {
|
||||
this.bustardDir = bustardDir;
|
||||
this.firecrestDir = firecrestDir;
|
||||
this.lane = lane;
|
||||
this.cycleBegin = cycleBegin;
|
||||
this.cycleEnd = cycleEnd;
|
||||
|
||||
|
||||
initializeParser();
|
||||
}
|
||||
|
||||
|
|
@ -67,10 +60,9 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
|
|||
Arrays.sort(seqfiles, getTileSortingComparator());
|
||||
Arrays.sort(prbfiles, getTileSortingComparator());
|
||||
|
||||
iterator();
|
||||
seekToTile(1);
|
||||
|
||||
// Todo: put some more consistency checks here
|
||||
|
||||
}
|
||||
|
||||
private FilenameFilter getFilenameFilter(final String suffix) {
|
||||
|
|
@ -129,30 +121,33 @@ public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Clo
|
|||
return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1));
|
||||
}
|
||||
|
||||
public RawRead next() {
|
||||
public boolean next() {
|
||||
if (hasNext()) {
|
||||
return new RawRead(currentTileParser.next(), cycleBegin, cycleEnd);
|
||||
currentParseResult = currentTileParser.next();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return null;
|
||||
return false;
|
||||
}
|
||||
|
||||
public String[][] getCurrentParseResult() {
|
||||
return currentParseResult;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
|
||||
}
|
||||
|
||||
public Iterator<RawRead> iterator() {
|
||||
if (iterating) {
|
||||
throw new IllegalStateException("IlluminaParser.iterator() method can only be called once, before the first call to IlluminaParser.hasNext()");
|
||||
}
|
||||
|
||||
seekToTile(1);
|
||||
iterating = true;
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
currentTileParser.close();
|
||||
}
|
||||
|
||||
public RawRead getRawRead() {
|
||||
return getSubset(0, currentParseResult[1][4].length() - 1);
|
||||
}
|
||||
|
||||
public RawRead getSubset(int cycleStart, int cycleStop) {
|
||||
return new RawRead(currentParseResult, cycleStart, cycleStop);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,12 +29,12 @@ public class RawRead {
|
|||
x = Short.valueOf(pastedReadString[0][2]);
|
||||
y = Short.valueOf(pastedReadString[0][3]);
|
||||
|
||||
sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd).getBytes();
|
||||
sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd + 1).getBytes();
|
||||
|
||||
quals = new byte[sequence.length];
|
||||
intensities = new short[sequence.length][4];
|
||||
|
||||
for (int cycle = 0; cycle < sequence.length; cycle++) {
|
||||
for (int cycle = cycleBegin, offset = 0; cycle <= cycleEnd; cycle++, offset++) {
|
||||
byte maxQual = -50;
|
||||
|
||||
for (int fullReadIndex = 4*cycle; fullReadIndex < 4*cycle + 4; fullReadIndex++) {
|
||||
|
|
@ -43,13 +43,13 @@ public class RawRead {
|
|||
if (qual > maxQual) { maxQual = qual; }
|
||||
}
|
||||
|
||||
quals[cycle] = maxQual >= 0 ? maxQual : 0;
|
||||
quals[offset] = maxQual >= 0 ? maxQual : 0;
|
||||
|
||||
for (int fullReadIndex = 4*cycle + 4, channel = 0; fullReadIndex < 4*cycle + 8; fullReadIndex++, channel++) {
|
||||
double doubleChannelIntensity = Double.valueOf(pastedReadString[0][fullReadIndex]);
|
||||
short shortChannelIntensity = (short) doubleChannelIntensity;
|
||||
|
||||
intensities[cycle][channel] = shortChannelIntensity;
|
||||
intensities[offset][channel] = shortChannelIntensity;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue