From 5a5c6d1276def33f39ccce191ed6613cdeadde5d Mon Sep 17 00:00:00 2001 From: kiran Date: Mon, 6 Apr 2009 22:00:58 +0000 Subject: [PATCH] Added some debugging stuff (writes model parameters to one file per cycle). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@304 348d0f76-0448-11de-a6fe-93d51630548a --- .../fourbasecaller/BasecallingBaseModel.java | 57 ++++++++++++++++++- .../fourbasecaller/BasecallingReadModel.java | 15 +++++ .../fourbasecaller/FourBaseRecaller.java | 12 +++- 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java index 3300bd732..41cee7de4 100755 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java @@ -8,6 +8,8 @@ import cern.colt.matrix.linalg.Algebra; import org.broadinstitute.sting.utils.QualityUtils; +import java.io.*; + /** * BasecallingBaseModel is a class that represents the statistical * model for all bases at a given cycle. It allows for easy, one @@ -59,15 +61,34 @@ public class BasecallingBaseModel { /** * Add a single training point to the model. + * * @param basePrev the previous cycle's base call (A, C, G, T, or * for the first cycle) * @param baseCur the current cycle's base call (A, C, G, T) * @param qualCur the quality score for the current cycle's base call * @param fourintensity the four intensities for the current cycle's base call */ public void addTrainingPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) { - int basePrevIndex = baseToBaseIndex(basePrev); - int baseCurIndex = baseToBaseIndex(baseCur); + int actualBasePrevIndex = baseToBaseIndex(basePrev); + int actualBaseCurIndex = baseToBaseIndex(baseCur); + double actualWeight = QualityUtils.qualToProb(qualCur); + double otherTheories = (basePrev == '*') ? 3.0 : 15.0; + cern.jet.math.Functions F = cern.jet.math.Functions.functions; + + for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) { + for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { + // We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses. + double weight = (basePrevIndex == actualBasePrevIndex && baseCurIndex == actualBaseCurIndex) ? actualWeight : ((1.0 - actualWeight)/otherTheories); + + DoubleMatrix1D weightedChannelIntensities = (DoubleFactory1D.dense).make(fourintensity); + weightedChannelIntensities.assign(F.mult(weight)); + + runningChannelSums[basePrevIndex][baseCurIndex].assign(weightedChannelIntensities, F.plus); + counts[basePrevIndex][baseCurIndex] += weight; + } + } + + /* if (basePrevIndex >= 0 && baseCurIndex >= 0) { for (int channel = 0; channel < 4; channel++) { double weight = QualityUtils.qualToProb(qualCur); @@ -83,6 +104,7 @@ public class BasecallingBaseModel { counts[basePrevIndex][baseCurIndex]++; } + */ readyToCall = false; } @@ -148,6 +170,27 @@ public class BasecallingBaseModel { return probdist; } + public void write(File outparam) { + try { + PrintWriter writer = new PrintWriter(outparam); + + for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) { + for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) { + writer.print("mean_" + baseIndexToBase(basePrevIndex) + "" + baseIndexToBase(baseCurIndex) + " : [ "); + for (int channel = 0; channel < 4; channel++) { + writer.print(runningChannelSums[basePrevIndex][baseCurIndex].getQuick(channel)/counts[basePrevIndex][baseCurIndex]); + writer.print(" "); + } + writer.print("]\n"); + } + } + + writer.close(); + } catch (IOException e) { + + } + } + /** * Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3); * @param base @@ -171,4 +214,14 @@ public class BasecallingBaseModel { return -1; } + + private char baseIndexToBase(int baseIndex) { + switch (baseIndex) { + case 0: return 'A'; + case 1: return 'C'; + case 2: return 'G'; + case 3: return 'T'; + default: return '.'; + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java index 87abe494f..59e2c6506 100644 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.playground.fourbasecaller; +import java.io.File; + /** * BasecallingReadModel represents the statistical models for * all bases in all cycles. It allows for easy, one-pass @@ -28,6 +30,7 @@ public class BasecallingReadModel { /** * Add a single training point to the model. + * * @param cycle the cycle for which this point should be added * @param basePrev the previous base * @param baseCur the current base @@ -82,4 +85,16 @@ public class BasecallingReadModel { return new FourProb(baseindices, probs); } + + /** + * Writes model parameters to a file per cycle. + * + * @param dir the directory where the parameters should be written + */ + public void write(File dir) { + for (int cycle = 0; cycle < basemodels.length; cycle++) { + File outparam = new File(dir.getPath() + "/param." + cycle); + basemodels[cycle].write(outparam); + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java index 93cdf868d..7169c78e4 100644 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java @@ -45,7 +45,7 @@ public class FourBaseRecaller extends CommandLineProgram { bread = bfp.next(); int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2; - BasecallingReadModel p = new BasecallingReadModel(bread.getFirstReadSequence().length()); + BasecallingReadModel model = new BasecallingReadModel(bread.getFirstReadSequence().length()); int queryid; // learn initial parameters @@ -61,12 +61,16 @@ public class FourBaseRecaller extends CommandLineProgram { byte qualCur = quals[cycle]; double[] fourintensity = intensities[cycle + cycle_offset]; - p.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity); + model.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity); } queryid++; } while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null); + File debugout = new File(OUT.getParentFile().getPath() + "/model/"); + debugout.mkdir(); + model.write(debugout); + // call bases SAMFileHeader sfh = new SAMFileHeader(); SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, OUT); @@ -89,7 +93,7 @@ public class FourBaseRecaller extends CommandLineProgram { byte qualPrev = (cycle == 0) ? 0 : bestqual[cycle - 1]; double[] fourintensity = intensities[cycle + cycle_offset]; - FourProb fp = p.computeProbabilities(cycle, basePrev, qualPrev, fourintensity); + FourProb fp = model.computeProbabilities(cycle, basePrev, qualPrev, fourintensity); asciiseq[cycle] = (byte) fp.baseAtRank(0); bestqual[cycle] = fp.qualAtRank(0); @@ -102,6 +106,8 @@ public class FourBaseRecaller extends CommandLineProgram { queryid++; } while (queryid < CALLING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null); + sfw.close(); + return 0; }