From 6c5fbb988b288539dbf194eb09a06aeb2fca7d8f Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 20 May 2009 00:09:20 +0000 Subject: [PATCH] Now basecalls an entire read (both ends of the pair, barcode... everything) at once. After, RawRead and FourProbRead can be asked to return a specified subset (corresponding to the ranges specified for each end of the read. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@754 348d0f76-0448-11de-a6fe-93d51630548a --- .../secondarybase/BasecallingReadModel.java | 80 +++++-------------- .../sting/secondarybase/FourProbRead.java | 17 ++++ .../sting/secondarybase/IlluminaParser.java | 51 ++++++------ .../sting/secondarybase/RawRead.java | 8 +- 4 files changed, 62 insertions(+), 94 deletions(-) diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java index 2e52ae94b..26788fd07 100644 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingReadModel.java @@ -18,43 +18,37 @@ import java.util.ArrayList; */ public class BasecallingReadModel { private BasecallingBaseModel[] basemodels = null; - private boolean correctForContext = false; + private boolean correctForContext = true; - /** - * Constructor for BasecallingReadModel. - * - * @param readLength the length of the read that this model will support - */ - public BasecallingReadModel(int readLength, boolean correctForContext) { - this.correctForContext = correctForContext; - + public BasecallingReadModel(int readLength) { + initialize(readLength); + } + + public BasecallingReadModel(ArrayList trainingData) { + initialize(trainingData.get(0).getReadLength()); + + train(trainingData); + } + + public void initialize(int readLength) { basemodels = new BasecallingBaseModel[readLength]; for (int cycle = 0; cycle < readLength; cycle++) { - basemodels[cycle] = new BasecallingBaseModel(cycle == 0 ? false : correctForContext); + basemodels[cycle] = new BasecallingBaseModel(cycle != 0 && correctForContext); } } - public void train(BasecallingTrainingSet trainingSet) { - ArrayList trainingData = trainingSet.getTrainingData(); - - for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) { - RawRead read = trainingData.get(readIndex); + public void train(ArrayList trainingData) { + //for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) { + for ( RawRead read : trainingData ) { addMeanPoints(read); } - for (int readIndex = 0; readIndex < trainingData.size(); readIndex++) { - RawRead read = trainingData.get(readIndex); + for ( RawRead read : trainingData ) { addCovariancePoints(read); } } - /** - * Add a single training point to the model means. - * - * @param cycle the cycle for which this point should be added - * @param fourintensity the four intensities of the current base - */ public void addMeanPoint(int cycle, double[][] probMatrix, double[] fourintensity) { basemodels[cycle].addMeanPoint(probMatrix, fourintensity); } @@ -80,12 +74,6 @@ public class BasecallingReadModel { } } - /** - * Add a single training point to the model covariances. - * - * @param cycle the cycle for which this point should be added - * @param fourintensity the four intensities of the current base - */ public void addCovariancePoint(int cycle, double[][] probMatrix, double[] fourintensity) { basemodels[cycle].addCovariancePoint(probMatrix, fourintensity); } @@ -111,25 +99,10 @@ public class BasecallingReadModel { } } - /** - * Compute the likelihood matrix for a given cycle. - * - * @param cycle the cycle number for the current base - * @param fourintensity the four intensities for the current cycle's base - * @return 4x4 matrix of likelihoods - */ public double[][] computeLikelihoods(int cycle, double[] fourintensity) { return basemodels[cycle].computeLikelihoods(cycle, fourintensity); } - /** - * Compute the probability distribution for the base at a given cycle. - * - * @param cycle the cycle number for the current base - * @param fourintensity the four intensities for the current cycle's base - * @return an instance of FourProb, which encodes a base hypothesis, its probability, - * and the ranking among the other hypotheses - */ public FourProb computeProbabilities(int cycle, char basePrev, byte qualPrev, double[] fourintensity) { double[][] likes = computeLikelihoods(cycle, fourintensity); @@ -157,9 +130,7 @@ public class BasecallingReadModel { } } - FourProb fp = new FourProb(likes); - - return fp; + return new FourProb(likes); } public FourProbRead call(RawRead read) { @@ -174,7 +145,6 @@ public class BasecallingReadModel { fourIntensity[channel] = (double) read.getIntensities()[cycle][channel]; } - //fps[cycle] = computeProbabilities(cycle, basePrev, qualPrev, fourIntensity); fpr.add(cycle, computeProbabilities(cycle, basePrev, qualPrev, fourIntensity)); } @@ -182,15 +152,6 @@ public class BasecallingReadModel { return fpr; } - /** - * Returns the base probability matrix - * - * @param cycle the cycle of the base - * @param basePrev the previous base - * @param baseCur the current base - * @param probCur the probability of the current base - * @return the base probability matrix (1x4 if no contextual correction, 4x4 otherwise) - */ public double[][] getBaseProbabilityMatrix(int cycle, char basePrev, char baseCur, double probCur) { double[][] dist = new double[(correctForContext && cycle > 0) ? 4 : 1][4]; @@ -208,11 +169,6 @@ public class BasecallingReadModel { return dist; } - /** - * Writes model parameters to a file per cycle. - * - * @param dir the directory where the parameters should be written - */ public void write(File dir) { for (int cycle = 0; cycle < basemodels.length; cycle++) { File outparam = new File(dir.getPath() + "/param." + cycle + ".r"); diff --git a/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java b/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java index 40ae72c67..c2f1ae402 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java +++ b/java/src/org/broadinstitute/sting/secondarybase/FourProbRead.java @@ -18,6 +18,23 @@ public class FourProbRead extends ArrayList { super(initialCapacity); } + /** + * Returns a subset of the FourProbRead. + * + * @param cycleStart the starting cycle (0-based, inclusive) + * @param cycleStop the ending cycle (0-based, inclusive) + * @return + */ + public FourProbRead getSubset(int cycleStart, int cycleStop) { + FourProbRead subset = new FourProbRead(cycleStop - cycleStart + 1); + + for (int cycle = cycleStart, offset = 0; cycle <= cycleStop; cycle++, offset++) { + subset.add(offset, this.get(cycle)); + } + + return subset; + } + /** * Get the read sequence at a specified rank * diff --git a/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java b/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java index 1fefd4656..611646736 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java +++ b/java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java @@ -13,12 +13,10 @@ import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class IlluminaParser implements Iterator, Iterable, Closeable { +public class IlluminaParser implements Closeable { private File bustardDir; private File firecrestDir; private int lane; - private int cycleBegin; - private int cycleEnd; private File[] intfiles; private File[] seqfiles; @@ -26,26 +24,21 @@ public class IlluminaParser implements Iterator, Iterable, Clo private int currentTileIndex; private PasteParser currentTileParser; + private String[][] currentParseResult; - private boolean iterating = false; - - public IlluminaParser(File bustardDir, int lane, int cycleBegin, int cycleEnd) { + public IlluminaParser(File bustardDir, int lane) { this.bustardDir = bustardDir; this.firecrestDir = bustardDir.getParentFile(); this.lane = lane; - this.cycleBegin = cycleBegin; - this.cycleEnd = cycleEnd; initializeParser(); } - public IlluminaParser(File bustardDir, File firecrestDir, int lane, int cycleBegin, int cycleEnd) { + public IlluminaParser(File bustardDir, File firecrestDir, int lane) { this.bustardDir = bustardDir; this.firecrestDir = firecrestDir; this.lane = lane; - this.cycleBegin = cycleBegin; - this.cycleEnd = cycleEnd; - + initializeParser(); } @@ -67,10 +60,9 @@ public class IlluminaParser implements Iterator, Iterable, Clo Arrays.sort(seqfiles, getTileSortingComparator()); Arrays.sort(prbfiles, getTileSortingComparator()); - iterator(); + seekToTile(1); // Todo: put some more consistency checks here - } private FilenameFilter getFilenameFilter(final String suffix) { @@ -129,30 +121,33 @@ public class IlluminaParser implements Iterator, Iterable, Clo return (currentTileParser.hasNext() || seekToTile(currentTileIndex + 1)); } - public RawRead next() { + public boolean next() { if (hasNext()) { - return new RawRead(currentTileParser.next(), cycleBegin, cycleEnd); + currentParseResult = currentTileParser.next(); + + return true; } - return null; + return false; + } + + public String[][] getCurrentParseResult() { + return currentParseResult; } public void remove() { throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported."); } - public Iterator iterator() { - if (iterating) { - throw new IllegalStateException("IlluminaParser.iterator() method can only be called once, before the first call to IlluminaParser.hasNext()"); - } - - seekToTile(1); - iterating = true; - - return this; - } - public void close() { currentTileParser.close(); } + + public RawRead getRawRead() { + return getSubset(0, currentParseResult[1][4].length() - 1); + } + + public RawRead getSubset(int cycleStart, int cycleStop) { + return new RawRead(currentParseResult, cycleStart, cycleStop); + } } diff --git a/java/src/org/broadinstitute/sting/secondarybase/RawRead.java b/java/src/org/broadinstitute/sting/secondarybase/RawRead.java index 9df4dea79..36c716303 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/RawRead.java +++ b/java/src/org/broadinstitute/sting/secondarybase/RawRead.java @@ -29,12 +29,12 @@ public class RawRead { x = Short.valueOf(pastedReadString[0][2]); y = Short.valueOf(pastedReadString[0][3]); - sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd).getBytes(); + sequence = pastedReadString[1][4].substring(cycleBegin, cycleEnd + 1).getBytes(); quals = new byte[sequence.length]; intensities = new short[sequence.length][4]; - for (int cycle = 0; cycle < sequence.length; cycle++) { + for (int cycle = cycleBegin, offset = 0; cycle <= cycleEnd; cycle++, offset++) { byte maxQual = -50; for (int fullReadIndex = 4*cycle; fullReadIndex < 4*cycle + 4; fullReadIndex++) { @@ -43,13 +43,13 @@ public class RawRead { if (qual > maxQual) { maxQual = qual; } } - quals[cycle] = maxQual >= 0 ? maxQual : 0; + quals[offset] = maxQual >= 0 ? maxQual : 0; for (int fullReadIndex = 4*cycle + 4, channel = 0; fullReadIndex < 4*cycle + 8; fullReadIndex++, channel++) { double doubleChannelIntensity = Double.valueOf(pastedReadString[0][fullReadIndex]); short shortChannelIntensity = (short) doubleChannelIntensity; - intensities[cycle][channel] = shortChannelIntensity; + intensities[offset][channel] = shortChannelIntensity; } } }