From 6e1fa7d61ae9a6b4b72e94e190359fc2fd952ff9 Mon Sep 17 00:00:00 2001 From: kiran Date: Sat, 21 Mar 2009 17:58:50 +0000 Subject: [PATCH] Java version of basecaller that estimates probability distribution over four-base hypothesis space via an internal-control-initialized Gaussian mixture model over base channel intensities. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@127 348d0f76-0448-11de-a6fe-93d51630548a --- .../fourbasecaller/FourBaseCaller.java | 221 ++++++++++++++++++ .../ManyNucleotideSequences.java | 51 ++++ .../projects/fourbasecaller/Nucleotide.java | 36 +++ .../NucleotideChannelCovariances.java | 64 +++++ .../NucleotideChannelMeans.java | 42 ++++ .../fourbasecaller/NucleotideSequence.java | 59 +++++ 6 files changed, 473 insertions(+) create mode 100644 playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java create mode 100644 playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/ManyNucleotideSequences.java create mode 100644 playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/Nucleotide.java create mode 100755 playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelCovariances.java create mode 100755 playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelMeans.java create mode 100644 playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideSequence.java diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java new file mode 100644 index 000000000..6daad50c1 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java @@ -0,0 +1,221 @@ +package org.broadinstitute.sting.projects.fourbasecaller; + +import java.io.File; +import java.util.Vector; +import java.lang.Math; + +import org.broadinstitute.sting.illumina.FirecrestFileParser; +import org.broadinstitute.sting.illumina.FourIntensity; +import cern.colt.matrix.linalg.Algebra; +import cern.colt.matrix.DoubleMatrix1D; +import cern.colt.matrix.DoubleFactory1D; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileWriterFactory; +import net.sf.samtools.SAMRecord; + +public class FourBaseCaller { + public static void main(String[] args) { + // Parse args + File firecrestDirectory = new File(args[0]); + int lane = Integer.valueOf(args[1]); + String icrefpath = args[2]; + + // Load ic reference + ManyNucleotideSequences controls = new ManyNucleotideSequences(icrefpath); + controls.reverseComplement(); + + // Loop through firecrest data, find ICs, and compute signal means + Vector recoveredICs = new Vector(); + Vector icids = new Vector(); + + FirecrestFileParser ffp = new FirecrestFileParser(firecrestDirectory, lane); + + NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()]; + for (int i = 0; i < cmeans.length; i++) { + cmeans[i] = new NucleotideChannelMeans(); + } + + int numReads = 0; + while (ffp.hasNext() && numReads < 100000) { + if (numReads % 10000 == 0) { + System.out.println("Processed " + numReads + " reads."); + } + FourIntensity[] intensities = ffp.next().getIntensities(); + NucleotideSequence rawread = toNucleotideSequence(intensities); + + int[] editDistances = new int[controls.size()]; + + for (int icTemplateIndex = 0; icTemplateIndex < controls.size(); icTemplateIndex++) { + editDistances[icTemplateIndex] = rawread.editDistance(controls.get(icTemplateIndex), 36); + + if (editDistances[icTemplateIndex] <= 8) { + recoveredICs.add(intensities); + icids.add(icTemplateIndex); + + for (int cycle = 0; cycle < ((controls.get(0).size() < intensities.length) ? controls.get(0).size() : intensities.length); cycle++) { + Nucleotide nuc = controls.get(icTemplateIndex).get(cycle); + FourIntensity sig = intensities[cycle]; + + cmeans[cycle].add(nuc, sig); + } + + break; + } + } + + numReads++; + } + + // Go through the data again and compute signal covariances + NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[controls.get(0).size()]; + for (int i = 0; i < cmeans.length; i++) { + ccov[i] = new NucleotideChannelCovariances(); + } + + for (int index = 0; index < recoveredICs.size(); index++) { + NucleotideSequence control = controls.get(icids.get(index).intValue()); + FourIntensity[] intensities = recoveredICs.get(index); + int seqlength = (control.size() < intensities.length) ? control.size() : intensities.length; + + for (int cycle = 0; cycle < seqlength; cycle++) { + Nucleotide nuc = control.get(cycle); + FourIntensity sig = intensities[cycle]; + NucleotideChannelMeans mus = cmeans[cycle]; + + ccov[cycle].add(nuc, sig, mus); + } + } + + // Now compute probabilities for the bases + Algebra alg = new Algebra(); + + for (int cycle = 0; cycle < recoveredICs.get(0).length; cycle++) { + ccov[cycle].invert(); + } + + FirecrestFileParser ffp2 = new FirecrestFileParser(firecrestDirectory, lane); + + numReads = 0; + + SAMFileHeader sfh = new SAMFileHeader(); + SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, new File("/wga/scr1/GSA/kiran/basecalling/simulation/test.sam")); + + while (ffp2.hasNext() && numReads < 100000) { + FourIntensity[] intensities = ffp2.next().getIntensities(); + + byte[] asciiseq = new byte[intensities.length]; + byte[] bestqual = new byte[intensities.length]; + byte[] nextbestqual = new byte[intensities.length]; + + for (int cycle = 0; cycle < intensities.length; cycle++) { + FourIntensity fi = intensities[cycle]; + + double[] likes = new double[4]; + double total = 0.0; + + for (Nucleotide nuc : Nucleotide.values()) { + double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0); + + DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc)); + DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub); + + double exparg = -0.5*alg.mult(sub, Ax); + likes[nuc.ordinal()] = norm*Math.exp(exparg); + total += likes[nuc.ordinal()]; + } + + Nucleotide call1 = Nucleotide.A; + double prob1 = likes[0]/total; + for (int i = 1; i < 4; i++) { + if (likes[i]/total > prob1) { + prob1 = likes[i]/total; + + switch (i) { + case 1: call1 = Nucleotide.C; break; + case 2: call1 = Nucleotide.G; break; + case 3: call1 = Nucleotide.T; break; + } + } + } + + Nucleotide call2 = Nucleotide.A; + double prob2 = 0.0; + for (int i = 0; i < 4; i++) { + if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) { + prob2 = likes[i]/total; + + switch (i) { + case 0: call2 = Nucleotide.A; break; + case 1: call2 = Nucleotide.C; break; + case 2: call2 = Nucleotide.G; break; + case 3: call2 = Nucleotide.T; break; + } + } + } + + asciiseq[cycle] = (byte) call1.asChar(); + bestqual[cycle] = toPhredScore(prob1); + nextbestqual[cycle] = toCompressedQuality(call2, prob2); + + //System.out.print(call.asChar()); + } + + SAMRecord sr = new SAMRecord(sfh); + sr.setReadName(Integer.toString(numReads)); + sr.setReadUmappedFlag(true); + sr.setReadBases(asciiseq); + sr.setBaseQualities(bestqual); + sr.setAttribute("SQ", nextbestqual); + sfw.addAlignment(sr); + + numReads++; + } + + sfw.close(); + } + + private static byte toPhredScore(double prob) { + byte qual = (prob - 1.0 < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob)); + return (qual > 40) ? 40 : qual; + } + + private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) { + DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4); + + for (int i = 0; i < 4; i++) { + sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i)); + } + + return sub; + } + + private static byte toCompressedQuality(Nucleotide base, double prob) { + byte compressedQual = (byte) base.ordinal(); + byte cprob = (byte) (100.0*prob); + byte qualmask = (byte) 252; + compressedQual += ((cprob << 2) & qualmask); + + return compressedQual; + } + + private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) { + NucleotideSequence ns = new NucleotideSequence(intensities.length); + + for (int cycle = 0; cycle < intensities.length; cycle++) { + int brightestChannel = intensities[cycle].brightestChannel(); + + Nucleotide nt = Nucleotide.A; + switch (brightestChannel) { + case 0: nt = Nucleotide.A; break; + case 1: nt = Nucleotide.C; break; + case 2: nt = Nucleotide.G; break; + case 3: nt = Nucleotide.T; break; + } + + ns.set(cycle, nt); + } + + return ns; + } +} diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/ManyNucleotideSequences.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/ManyNucleotideSequences.java new file mode 100644 index 000000000..ca2ffe28f --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/ManyNucleotideSequences.java @@ -0,0 +1,51 @@ +package org.broadinstitute.sting.projects.fourbasecaller; + +import java.io.File; +import java.util.Vector; + +import org.broadinstitute.sting.projects.fourbasecaller.NucleotideSequence; + +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; + +class ManyNucleotideSequences extends java.util.Vector { + public ManyNucleotideSequences() {} + + public ManyNucleotideSequences(int initialCapacity) { + this.setSize(initialCapacity); + } + + public ManyNucleotideSequences(String seqpath) { + ReferenceSequenceFile seqfile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(seqpath)); + + ReferenceSequence rs; + while ((rs = seqfile.nextSequence()) != null) { + byte[] bases = rs.getBases(); + + NucleotideSequence ns = new NucleotideSequence(bases.length); + for (int locus = 0; locus < bases.length; locus++) { + Nucleotide nt; + switch(bases[locus]) { + case 'A': nt = Nucleotide.A; break; + case 'C': nt = Nucleotide.C; break; + case 'G': nt = Nucleotide.G; break; + case 'T': nt = Nucleotide.T; break; + default: nt = Nucleotide.T; break; + } + + ns.set(locus, nt); + } + + this.add(ns); + } + } + + public ManyNucleotideSequences reverseComplement() { + for (int contig = 0; contig < this.size(); contig++) { + this.get(contig).reverseComplement(); + } + + return this; + } +} diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/Nucleotide.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/Nucleotide.java new file mode 100644 index 000000000..4921554f4 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/Nucleotide.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.projects.fourbasecaller; + +import java.lang.String; + +public enum Nucleotide { + A, C, G, T; + + Nucleotide complement() { + switch (this) { + case A: return T; + case C: return G; + case G: return C; + case T: return A; + } + + throw new IllegalArgumentException("Nucleotide must be A, C, G, or T."); + } + + public char asChar() { + switch(this) { + case A: return 'A'; + case C: return 'C'; + case G: return 'G'; + case T: return 'T'; + } + + throw new IllegalArgumentException("Nucleotide must be A, C, G, or T."); + } + + public String toString() { + char[] value = new char[1]; + value[0] = this.asChar(); + + return new String(value); + } +} diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelCovariances.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelCovariances.java new file mode 100755 index 000000000..3b1f85ecc --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelCovariances.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.projects.fourbasecaller; + +import cern.colt.matrix.DoubleFactory2D; +import cern.colt.matrix.DoubleMatrix2D; +import cern.colt.matrix.linalg.Algebra; +import org.broadinstitute.sting.illumina.FourIntensity; + +public class NucleotideChannelCovariances { + private DoubleMatrix2D[] covs; + private int[] counts; + + public NucleotideChannelCovariances() { + counts = new int[4]; + + covs = new DoubleMatrix2D[4]; + for (int i = 0; i < 4; i++) { + covs[i] = (DoubleFactory2D.dense).make(4, 4); + } + } + + public void add(Nucleotide nuc, FourIntensity sig, NucleotideChannelMeans mus) { + FourIntensity f = new FourIntensity(sig); + f.subtract(mus.channelMeans(nuc)); + + DoubleMatrix2D cov = covs[nuc.ordinal()]; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + cov.set(i, j, cov.get(i, j) + f.getChannelIntensity(i)*f.getChannelIntensity(j)); + } + } + + counts[nuc.ordinal()]++; + } + + public DoubleMatrix2D channelCovariances(Nucleotide base) { + DoubleMatrix2D newcov = covs[base.ordinal()].copy(); + + /* + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + newcov.set(i, j, newcov.get(i, j)/((double) counts[base.ordinal()])); + } + } + */ + + return newcov; + } + + public void invert() { + Algebra alg = new Algebra(); + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 4; k++) { + covs[i].set(j, k, covs[i].get(j, k)/((double) counts[i])); + } + } + + DoubleMatrix2D covinv = alg.inverse(covs[i]); + covs[i] = covinv; + } + } +} diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelMeans.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelMeans.java new file mode 100755 index 000000000..511a6c67e --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideChannelMeans.java @@ -0,0 +1,42 @@ +package org.broadinstitute.sting.projects.fourbasecaller; + +import org.broadinstitute.sting.illumina.FourIntensity; + +public class NucleotideChannelMeans { + private FourIntensity[] sigs; + private int[] counts; + + public NucleotideChannelMeans() { + counts = new int[4]; + sigs = new FourIntensity[4]; + + for (int base = 0; base < 4; base++) { + sigs[base] = new FourIntensity(); + } + } + + public void add(Nucleotide base, FourIntensity intensity) { + sigs[base.ordinal()].add(intensity); + counts[base.ordinal()]++; + } + + public FourIntensity channelMeans(Nucleotide base) { + FourIntensity meansig = new FourIntensity(sigs[base.ordinal()]); + meansig.divide(counts[base.ordinal()]); + + return meansig; + } + + public float channelMean(Nucleotide base, int channel) { + FourIntensity meansig = channelMeans(base); + + return meansig.getChannelIntensity(channel); + } + + public String toString() { + return "a:{" + channelMean(Nucleotide.A, 0) + " " + channelMean(Nucleotide.A, 1) + " " + channelMean(Nucleotide.A, 2) + " " + channelMean(Nucleotide.A, 3) + "} " + + "c:{" + channelMean(Nucleotide.C, 0) + " " + channelMean(Nucleotide.C, 1) + " " + channelMean(Nucleotide.C, 2) + " " + channelMean(Nucleotide.C, 3) + "} " + + "g:{" + channelMean(Nucleotide.G, 0) + " " + channelMean(Nucleotide.G, 1) + " " + channelMean(Nucleotide.G, 2) + " " + channelMean(Nucleotide.G, 3) + "} " + + "t:{" + channelMean(Nucleotide.T, 0) + " " + channelMean(Nucleotide.T, 1) + " " + channelMean(Nucleotide.T, 2) + " " + channelMean(Nucleotide.T, 3) + "}"; + } +} diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideSequence.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideSequence.java new file mode 100644 index 000000000..d4ef5c10b --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/NucleotideSequence.java @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.projects.fourbasecaller; + +import org.broadinstitute.sting.projects.fourbasecaller.Nucleotide; + +import java.lang.String; +import java.util.Vector; + +class NucleotideSequence extends java.util.Vector { + public NucleotideSequence(int initialCapacity) { + this.setSize(initialCapacity); + } + + public NucleotideSequence(NucleotideSequence ns) { + this.setSize(ns.size()); + + for (int locus = 0; locus < ns.size(); locus++) { + switch (ns.get(locus)) { + case A: this.set(locus, Nucleotide.A); break; + case C: this.set(locus, Nucleotide.C); break; + case G: this.set(locus, Nucleotide.G); break; + case T: this.set(locus, Nucleotide.T); break; + } + } + } + + public NucleotideSequence reverseComplement() { + if (this.size() % 2 != 0) { + this.get(this.size()/2 + 1).complement(); + } + + for (int locus = 0; locus < this.size()/2; locus++) { + Nucleotide temp = this.get(locus).complement(); + this.set(locus, this.get(this.size() - locus - 1).complement()); + this.set(this.size() - locus - 1, temp); + } + + return this; + } + + public int editDistance(NucleotideSequence ns) { return editDistance(ns, this.size() < ns.size() ? this.size() : ns.size()); } + + public int editDistance(NucleotideSequence ns, int searchLength) { + int editDistance = 0; + for (int locus = 0; locus < searchLength; locus++) { + editDistance += (this.get(locus) != ns.get(locus)) ? 1 : 0; + } + + return editDistance; + } + + public String toString() { + char[] charbuffer = new char[this.size()]; + for (int locus = 0; locus < this.size(); locus++) { + charbuffer[locus] = this.get(locus).asChar(); + } + + return new String(charbuffer); + } +}