Java version of basecaller that estimates probability distribution over four-base hypothesis space via an internal-control-initialized Gaussian mixture model over base channel intensities.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@127 348d0f76-0448-11de-a6fe-93d51630548a
2009-03-21 17:58:50 +00:00 · 2009-03-21 17:58:50 +00:00 · 6e1fa7d61a
parent 3e350006e0
commit 6e1fa7d61a
6 changed files with 473 additions and 0 deletions
--- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java
+++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java
@ -0,0 +1,221 @@
+package org.broadinstitute.sting.projects.fourbasecaller;
+
+import java.io.File;
+import java.util.Vector;
+import java.lang.Math;
+
+import org.broadinstitute.sting.illumina.FirecrestFileParser;
+import org.broadinstitute.sting.illumina.FourIntensity;
+import cern.colt.matrix.linalg.Algebra;
+import cern.colt.matrix.DoubleMatrix1D;
+import cern.colt.matrix.DoubleFactory1D;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileWriter;
+import net.sf.samtools.SAMFileWriterFactory;
+import net.sf.samtools.SAMRecord;
+
+public class FourBaseCaller {
+    public static void main(String[] args) {
+        // Parse args
+        File firecrestDirectory = new File(args[0]);
+        int lane = Integer.valueOf(args[1]);
+        String icrefpath = args[2];
+
+        // Load ic reference
+        ManyNucleotideSequences controls = new ManyNucleotideSequences(icrefpath);
+        controls.reverseComplement();
+
+        // Loop through firecrest data, find ICs, and compute signal means
+        Vector<FourIntensity[]> recoveredICs = new Vector<FourIntensity[]>();
+        Vector<Integer> icids = new Vector<Integer>();
+
+        FirecrestFileParser ffp = new FirecrestFileParser(firecrestDirectory, lane);
+
+        NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()];
+        for (int i = 0; i < cmeans.length; i++) {
+            cmeans[i] = new NucleotideChannelMeans();
+        }
+
+        int numReads = 0;
+        while (ffp.hasNext() && numReads < 100000) {
+            if (numReads % 10000 == 0) {
+                System.out.println("Processed " + numReads + " reads.");
+            }
+            FourIntensity[] intensities = ffp.next().getIntensities();
+            NucleotideSequence rawread = toNucleotideSequence(intensities);
+
+            int[] editDistances = new int[controls.size()];
+
+            for (int icTemplateIndex = 0; icTemplateIndex < controls.size(); icTemplateIndex++) {
+                editDistances[icTemplateIndex] = rawread.editDistance(controls.get(icTemplateIndex), 36);
+
+                if (editDistances[icTemplateIndex] <= 8) {
+                    recoveredICs.add(intensities);
+                    icids.add(icTemplateIndex);
+
+                    for (int cycle = 0; cycle < ((controls.get(0).size() < intensities.length) ? controls.get(0).size() : intensities.length); cycle++) {
+                        Nucleotide nuc = controls.get(icTemplateIndex).get(cycle);
+                        FourIntensity sig = intensities[cycle];
+
+                        cmeans[cycle].add(nuc, sig);
+                    }
+
+                    break;
+                }
+            }
+
+            numReads++;
+        }
+
+        // Go through the data again and compute signal covariances
+        NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[controls.get(0).size()];
+        for (int i = 0; i < cmeans.length; i++) {
+            ccov[i] = new NucleotideChannelCovariances();
+        }
+        
+        for (int index = 0; index < recoveredICs.size(); index++) {
+            NucleotideSequence control = controls.get(icids.get(index).intValue());
+            FourIntensity[] intensities = recoveredICs.get(index);
+            int seqlength = (control.size() < intensities.length) ? control.size() : intensities.length;
+
+            for (int cycle = 0; cycle < seqlength; cycle++) {
+                Nucleotide nuc = control.get(cycle);
+                FourIntensity sig = intensities[cycle];
+                NucleotideChannelMeans mus = cmeans[cycle];
+
+                ccov[cycle].add(nuc, sig, mus);
+            }
+        }
+
+        // Now compute probabilities for the bases
+        Algebra alg = new Algebra();
+
+        for (int cycle = 0; cycle < recoveredICs.get(0).length; cycle++) {
+            ccov[cycle].invert();
+        }
+
+        FirecrestFileParser ffp2 = new FirecrestFileParser(firecrestDirectory, lane);
+
+        numReads = 0;
+
+        SAMFileHeader sfh = new SAMFileHeader();
+        SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, new File("/wga/scr1/GSA/kiran/basecalling/simulation/test.sam"));
+
+        while (ffp2.hasNext() && numReads < 100000) {
+            FourIntensity[] intensities = ffp2.next().getIntensities();
+
+            byte[] asciiseq = new byte[intensities.length];
+            byte[] bestqual = new byte[intensities.length];
+            byte[] nextbestqual = new byte[intensities.length];
+
+            for (int cycle = 0; cycle < intensities.length; cycle++) {
+                FourIntensity fi = intensities[cycle];
+
+                double[] likes = new double[4];
+                double total = 0.0;
+
+                for (Nucleotide nuc : Nucleotide.values()) {
+                    double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0);
+
+                    DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc));
+                    DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub);
+
+                    double exparg = -0.5*alg.mult(sub, Ax);
+                    likes[nuc.ordinal()] = norm*Math.exp(exparg);
+                    total += likes[nuc.ordinal()];
+                }
+
+                Nucleotide call1 = Nucleotide.A;
+                double prob1 = likes[0]/total;
+                for (int i = 1; i < 4; i++) {
+                    if (likes[i]/total > prob1) {
+                        prob1 = likes[i]/total;
+
+                        switch (i) {
+                            case 1: call1 = Nucleotide.C; break;
+                            case 2: call1 = Nucleotide.G; break;
+                            case 3: call1 = Nucleotide.T; break;
+                        }
+                    }
+                }
+
+                Nucleotide call2 = Nucleotide.A;
+                double prob2 = 0.0;
+                for (int i = 0; i < 4; i++) {
+                    if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) {
+                        prob2 = likes[i]/total;
+
+                        switch (i) {
+                            case 0: call2 = Nucleotide.A; break;
+                            case 1: call2 = Nucleotide.C; break;
+                            case 2: call2 = Nucleotide.G; break;
+                            case 3: call2 = Nucleotide.T; break;
+                        }
+                    }
+                }
+
+                asciiseq[cycle] = (byte) call1.asChar();
+                bestqual[cycle] = toPhredScore(prob1);
+                nextbestqual[cycle] = toCompressedQuality(call2, prob2);
+
+                //System.out.print(call.asChar());
+            }
+
+            SAMRecord sr = new SAMRecord(sfh);
+            sr.setReadName(Integer.toString(numReads));
+            sr.setReadUmappedFlag(true);
+            sr.setReadBases(asciiseq);
+            sr.setBaseQualities(bestqual);
+            sr.setAttribute("SQ", nextbestqual);
+            sfw.addAlignment(sr);
+
+            numReads++;
+        }
+
+        sfw.close();
+    }
+
+    private static byte toPhredScore(double prob) {
+        byte qual = (prob - 1.0 < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob));
+        return (qual > 40) ? 40 : qual;
+    }
+
+    private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) {
+        DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4);
+
+        for (int i = 0; i < 4; i++) {
+            sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i));
+        }
+
+        return sub;
+    }
+
+    private static byte toCompressedQuality(Nucleotide base, double prob) {
+        byte compressedQual = (byte) base.ordinal();
+        byte cprob = (byte) (100.0*prob);
+        byte qualmask = (byte) 252;
+        compressedQual += ((cprob << 2) & qualmask);
+
+        return compressedQual;
+    }
+
+    private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) {
+        NucleotideSequence ns = new NucleotideSequence(intensities.length);
+
+        for (int cycle = 0; cycle < intensities.length; cycle++) {
+            int brightestChannel = intensities[cycle].brightestChannel();
+
+            Nucleotide nt = Nucleotide.A;
+            switch (brightestChannel) {
+                case 0: nt = Nucleotide.A; break;
+                case 1: nt = Nucleotide.C; break;
+                case 2: nt = Nucleotide.G; break;
+                case 3: nt = Nucleotide.T; break;
+            }
+
+            ns.set(cycle, nt);
+        }
+
+        return ns;
+    }
+}
--- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/ManyNucleotideSequences.java
+++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/ManyNucleotideSequences.java
@ -0,0 +1,51 @@
+package org.broadinstitute.sting.projects.fourbasecaller;
+
+import java.io.File;
+import java.util.Vector;
+
+import org.broadinstitute.sting.projects.fourbasecaller.NucleotideSequence;
+
+import edu.mit.broad.picard.reference.ReferenceSequence;
+import edu.mit.broad.picard.reference.ReferenceSequenceFile;
+import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
+
+class ManyNucleotideSequences extends java.util.Vector<NucleotideSequence> {
+    public ManyNucleotideSequences() {}
+
+    public ManyNucleotideSequences(int initialCapacity) {
+        this.setSize(initialCapacity);
+    }
+
+    public ManyNucleotideSequences(String seqpath) {
+        ReferenceSequenceFile seqfile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(seqpath));
+
+        ReferenceSequence rs;
+        while ((rs = seqfile.nextSequence()) != null) {
+            byte[] bases = rs.getBases();
+
+            NucleotideSequence ns = new NucleotideSequence(bases.length);
+            for (int locus = 0; locus < bases.length; locus++) {
+                Nucleotide nt;
+                switch(bases[locus]) {
+                    case 'A': nt = Nucleotide.A; break;
+                    case 'C': nt = Nucleotide.C; break;
+                    case 'G': nt = Nucleotide.G; break;
+                    case 'T': nt = Nucleotide.T; break;
+                    default: nt = Nucleotide.T; break;
+                }
+
+                ns.set(locus, nt);
+            }
+
+            this.add(ns);
+        }
+    }
+
+    public ManyNucleotideSequences reverseComplement() {
+        for (int contig = 0; contig < this.size(); contig++) {
+            this.get(contig).reverseComplement();
+        }
+
+        return this;
+    }
+}
--- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/Nucleotide.java
+++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/Nucleotide.java
@ -0,0 +1,36 @@
+package org.broadinstitute.sting.projects.fourbasecaller;
+
+import java.lang.String;
+
+public enum Nucleotide {
+    A, C, G, T;
+
+    Nucleotide complement() {
+        switch (this) {
+            case A: return T;
+            case C: return G;
+            case G: return C;
+            case T: return A;
+        }
+
+        throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
+    }
+
+    public char asChar() {
+        switch(this) {
+            case A: return 'A';
+            case C: return 'C';
+            case G: return 'G';
+            case T: return 'T';
+        }
+
+        throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
+    }
+
+    public String toString() {
+        char[] value = new char[1];
+        value[0] = this.asChar();
+
+        return new String(value);
+    }
+}
--- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelCovariances.java
+++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelCovariances.java
@ -0,0 +1,64 @@
+package org.broadinstitute.sting.projects.fourbasecaller;
+
+import cern.colt.matrix.DoubleFactory2D;
+import cern.colt.matrix.DoubleMatrix2D;
+import cern.colt.matrix.linalg.Algebra;
+import org.broadinstitute.sting.illumina.FourIntensity;
+
+public class NucleotideChannelCovariances {
+    private DoubleMatrix2D[] covs;
+    private int[] counts;
+
+    public NucleotideChannelCovariances() {
+        counts = new int[4];
+
+        covs = new DoubleMatrix2D[4];
+        for (int i = 0; i < 4; i++) {
+            covs[i] = (DoubleFactory2D.dense).make(4, 4);
+        }
+    }
+
+    public void add(Nucleotide nuc, FourIntensity sig, NucleotideChannelMeans mus) {
+        FourIntensity f = new FourIntensity(sig);
+        f.subtract(mus.channelMeans(nuc));
+
+        DoubleMatrix2D cov = covs[nuc.ordinal()];
+
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                cov.set(i, j, cov.get(i, j) + f.getChannelIntensity(i)*f.getChannelIntensity(j));
+            }
+        }
+        
+        counts[nuc.ordinal()]++;
+    }
+
+    public DoubleMatrix2D channelCovariances(Nucleotide base) {
+        DoubleMatrix2D newcov = covs[base.ordinal()].copy();
+
+        /*
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                newcov.set(i, j, newcov.get(i, j)/((double) counts[base.ordinal()]));
+            }
+        }
+        */
+
+        return newcov;
+    }
+
+    public void invert() {
+        Algebra alg = new Algebra();
+
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                for (int k = 0; k < 4; k++) {
+                    covs[i].set(j, k, covs[i].get(j, k)/((double) counts[i]));
+                }
+            }
+            
+            DoubleMatrix2D covinv = alg.inverse(covs[i]);
+            covs[i] = covinv;
+        }
+    }
+}
--- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelMeans.java
+++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelMeans.java
@ -0,0 +1,42 @@
+package org.broadinstitute.sting.projects.fourbasecaller;
+
+import org.broadinstitute.sting.illumina.FourIntensity;
+
+public class NucleotideChannelMeans {
+    private FourIntensity[] sigs;
+    private int[] counts;
+
+    public NucleotideChannelMeans() {
+        counts = new int[4];
+        sigs = new FourIntensity[4];
+
+        for (int base = 0; base < 4; base++) {
+            sigs[base] = new FourIntensity();
+        }
+    }
+
+    public void add(Nucleotide base, FourIntensity intensity) {
+        sigs[base.ordinal()].add(intensity);
+        counts[base.ordinal()]++;
+    }
+
+    public FourIntensity channelMeans(Nucleotide base) {
+        FourIntensity meansig = new FourIntensity(sigs[base.ordinal()]);
+        meansig.divide(counts[base.ordinal()]);
+
+        return meansig;
+    }
+
+    public float channelMean(Nucleotide base, int channel) {
+        FourIntensity meansig = channelMeans(base);
+
+        return meansig.getChannelIntensity(channel);
+    }
+
+    public String toString() {
+        return "a:{" + channelMean(Nucleotide.A, 0) + " "  + channelMean(Nucleotide.A, 1) + " " + channelMean(Nucleotide.A, 2) + " " + channelMean(Nucleotide.A, 3) + "} " +
+               "c:{" + channelMean(Nucleotide.C, 0) + " "  + channelMean(Nucleotide.C, 1) + " " + channelMean(Nucleotide.C, 2) + " " + channelMean(Nucleotide.C, 3) + "} " +
+               "g:{" + channelMean(Nucleotide.G, 0) + " "  + channelMean(Nucleotide.G, 1) + " " + channelMean(Nucleotide.G, 2) + " " + channelMean(Nucleotide.G, 3) + "} " +
+               "t:{" + channelMean(Nucleotide.T, 0) + " "  + channelMean(Nucleotide.T, 1) + " " + channelMean(Nucleotide.T, 2) + " " + channelMean(Nucleotide.T, 3) + "}";
+    }
+}
--- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideSequence.java
+++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideSequence.java
@ -0,0 +1,59 @@
+package org.broadinstitute.sting.projects.fourbasecaller;
+
+import org.broadinstitute.sting.projects.fourbasecaller.Nucleotide;
+
+import java.lang.String;
+import java.util.Vector;
+
+class NucleotideSequence extends java.util.Vector<Nucleotide> {
+    public NucleotideSequence(int initialCapacity) {
+        this.setSize(initialCapacity);
+    }
+
+    public NucleotideSequence(NucleotideSequence ns) {
+        this.setSize(ns.size());
+
+        for (int locus = 0; locus < ns.size(); locus++) {
+            switch (ns.get(locus)) {
+                case A: this.set(locus, Nucleotide.A); break;
+                case C: this.set(locus, Nucleotide.C); break;
+                case G: this.set(locus, Nucleotide.G); break;
+                case T: this.set(locus, Nucleotide.T); break;
+            }
+        }
+    }
+
+    public NucleotideSequence reverseComplement() {
+        if (this.size() % 2 != 0) {
+            this.get(this.size()/2 + 1).complement();
+        }
+
+        for (int locus = 0; locus < this.size()/2; locus++) {
+            Nucleotide temp = this.get(locus).complement();
+            this.set(locus, this.get(this.size() - locus - 1).complement());
+            this.set(this.size() - locus - 1, temp);
+        }
+
+        return this;
+    }
+
+    public int editDistance(NucleotideSequence ns) { return editDistance(ns, this.size() < ns.size() ? this.size() : ns.size()); }
+
+    public int editDistance(NucleotideSequence ns, int searchLength) {
+        int editDistance = 0;
+        for (int locus = 0; locus < searchLength; locus++) {
+            editDistance += (this.get(locus) != ns.get(locus)) ? 1 : 0;
+        }
+
+        return editDistance;
+    }
+
+    public String toString() {
+        char[] charbuffer = new char[this.size()];
+        for (int locus = 0; locus < this.size(); locus++) {
+            charbuffer[locus] = this.get(locus).asChar();
+        }
+
+        return new String(charbuffer);
+    }
+}