Added some debugging stuff (writes model parameters to one file per cycle).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@304 348d0f76-0448-11de-a6fe-93d51630548a
2009-04-06 22:00:58 +00:00 · 2009-04-06 22:00:58 +00:00 · 5a5c6d1276
parent 0fc8a90553
commit 5a5c6d1276
3 changed files with 79 additions and 5 deletions
--- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java
+++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingBaseModel.java
@ -8,6 +8,8 @@ import cern.colt.matrix.linalg.Algebra;

 import org.broadinstitute.sting.utils.QualityUtils;

+import java.io.*;
+
 /**
 * BasecallingBaseModel is a class that represents the statistical
 * model for all bases at a given cycle.  It allows for easy, one
@ -59,15 +61,34 @@ public class BasecallingBaseModel {

    /**
     * Add a single training point to the model.
+     *
     * @param basePrev       the previous cycle's base call (A, C, G, T, or * for the first cycle)
     * @param baseCur        the current cycle's base call (A, C, G, T)
     * @param qualCur        the quality score for the current cycle's base call
     * @param fourintensity  the four intensities for the current cycle's base call
     */
    public void addTrainingPoint(char basePrev, char baseCur, byte qualCur, double[] fourintensity) {
-        int basePrevIndex = baseToBaseIndex(basePrev);
-        int baseCurIndex = baseToBaseIndex(baseCur);
+        int actualBasePrevIndex = baseToBaseIndex(basePrev);
+        int actualBaseCurIndex = baseToBaseIndex(baseCur);
+        double actualWeight = QualityUtils.qualToProb(qualCur);
+        double otherTheories = (basePrev == '*') ? 3.0 : 15.0;

+        cern.jet.math.Functions F = cern.jet.math.Functions.functions;
+
+        for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) {
+            for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
+                // We want to upweight the correct theory as much as we can and spread the remainder out evenly between all other hypotheses.
+                double weight = (basePrevIndex == actualBasePrevIndex && baseCurIndex == actualBaseCurIndex) ? actualWeight : ((1.0 - actualWeight)/otherTheories);
+
+                DoubleMatrix1D weightedChannelIntensities = (DoubleFactory1D.dense).make(fourintensity);
+                weightedChannelIntensities.assign(F.mult(weight));
+
+                runningChannelSums[basePrevIndex][baseCurIndex].assign(weightedChannelIntensities, F.plus);
+                counts[basePrevIndex][baseCurIndex] += weight;
+            }
+        }
+
+        /*
        if (basePrevIndex >= 0 && baseCurIndex >= 0) {
            for (int channel = 0; channel < 4; channel++) {
                double weight = QualityUtils.qualToProb(qualCur);
@ -83,6 +104,7 @@ public class BasecallingBaseModel {

            counts[basePrevIndex][baseCurIndex]++;
        }
+        */

        readyToCall = false;
    }
@ -148,6 +170,27 @@ public class BasecallingBaseModel {
        return probdist;
    }

+    public void write(File outparam) {
+        try {
+            PrintWriter writer = new PrintWriter(outparam);
+
+            for (int basePrevIndex = 0; basePrevIndex < 4; basePrevIndex++) {
+                for (int baseCurIndex = 0; baseCurIndex < 4; baseCurIndex++) {
+                    writer.print("mean_" + baseIndexToBase(basePrevIndex) + "" + baseIndexToBase(baseCurIndex) + " : [ ");
+                    for (int channel = 0; channel < 4; channel++) {
+                        writer.print(runningChannelSums[basePrevIndex][baseCurIndex].getQuick(channel)/counts[basePrevIndex][baseCurIndex]);
+                        writer.print(" ");
+                    }
+                    writer.print("]\n");
+                }
+            }
+
+            writer.close();
+        } catch (IOException e) {
+
+        }
+    }
+
    /**
     * Utility method for converting a base ([Aa*], [Cc], [Gg], [Tt]) to an index (0, 1, 2, 3);
     * @param base
@ -171,4 +214,14 @@ public class BasecallingBaseModel {

        return -1;
    }
+
+    private char baseIndexToBase(int baseIndex) {
+        switch (baseIndex) {
+            case 0: return 'A';
+            case 1: return 'C';
+            case 2: return 'G';
+            case 3: return 'T';
+            default: return '.';
+        }
+    }
 }
--- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java
+++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/BasecallingReadModel.java
@ -1,5 +1,7 @@
 package org.broadinstitute.sting.playground.fourbasecaller;

+import java.io.File;
+
 /**
 * BasecallingReadModel represents the statistical models for
 * all bases in all cycles.  It allows for easy, one-pass
@ -28,6 +30,7 @@ public class BasecallingReadModel {

    /**
     * Add a single training point to the model.
+     *
     * @param cycle         the cycle for which this point should be added
     * @param basePrev      the previous base
     * @param baseCur       the current base
@ -82,4 +85,16 @@ public class BasecallingReadModel {

        return new FourProb(baseindices, probs);
    }
+
+    /**
+     * Writes model parameters to a file per cycle.
+     *
+     * @param dir  the directory where the parameters should be written
+     */
+    public void write(File dir) {
+        for (int cycle = 0; cycle < basemodels.length; cycle++) {
+            File outparam = new File(dir.getPath() + "/param." + cycle);
+            basemodels[cycle].write(outparam);
+        }
+    }
 }
--- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java
+++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseRecaller.java
@ -45,7 +45,7 @@ public class FourBaseRecaller extends CommandLineProgram {
        bread = bfp.next();

        int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
-        BasecallingReadModel p = new BasecallingReadModel(bread.getFirstReadSequence().length());
+        BasecallingReadModel model = new BasecallingReadModel(bread.getFirstReadSequence().length());
        int queryid;

        // learn initial parameters
@ -61,12 +61,16 @@ public class FourBaseRecaller extends CommandLineProgram {
                byte qualCur  = quals[cycle];
                double[] fourintensity = intensities[cycle + cycle_offset];

-                p.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity);
+                model.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity);
            }

            queryid++;
        } while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);

+        File debugout = new File(OUT.getParentFile().getPath() + "/model/");
+        debugout.mkdir();
+        model.write(debugout);
+
        // call bases
        SAMFileHeader sfh = new SAMFileHeader();
        SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, OUT);
@ -89,7 +93,7 @@ public class FourBaseRecaller extends CommandLineProgram {
                byte qualPrev = (cycle == 0) ? 0 : bestqual[cycle - 1];
                double[] fourintensity = intensities[cycle + cycle_offset];

-                FourProb fp = p.computeProbabilities(cycle, basePrev, qualPrev, fourintensity);
+                FourProb fp = model.computeProbabilities(cycle, basePrev, qualPrev, fourintensity);

                asciiseq[cycle] = (byte) fp.baseAtRank(0);
                bestqual[cycle] = fp.qualAtRank(0);
@ -102,6 +106,8 @@ public class FourBaseRecaller extends CommandLineProgram {
            queryid++;
        } while (queryid < CALLING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);

+        sfw.close();
+
        return 0;
    }