Java version of basecaller that estimates probability distribution over four-base hypothesis space via an internal-control-initialized Gaussian mixture model over base channel intensities.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@127 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-03-21 17:58:50 +00:00
parent 3e350006e0
commit 6e1fa7d61a
6 changed files with 473 additions and 0 deletions

View File

@ -0,0 +1,221 @@
package org.broadinstitute.sting.projects.fourbasecaller;
import java.io.File;
import java.util.Vector;
import java.lang.Math;
import org.broadinstitute.sting.illumina.FirecrestFileParser;
import org.broadinstitute.sting.illumina.FourIntensity;
import cern.colt.matrix.linalg.Algebra;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleFactory1D;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
public class FourBaseCaller {
public static void main(String[] args) {
// Parse args
File firecrestDirectory = new File(args[0]);
int lane = Integer.valueOf(args[1]);
String icrefpath = args[2];
// Load ic reference
ManyNucleotideSequences controls = new ManyNucleotideSequences(icrefpath);
controls.reverseComplement();
// Loop through firecrest data, find ICs, and compute signal means
Vector<FourIntensity[]> recoveredICs = new Vector<FourIntensity[]>();
Vector<Integer> icids = new Vector<Integer>();
FirecrestFileParser ffp = new FirecrestFileParser(firecrestDirectory, lane);
NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()];
for (int i = 0; i < cmeans.length; i++) {
cmeans[i] = new NucleotideChannelMeans();
}
int numReads = 0;
while (ffp.hasNext() && numReads < 100000) {
if (numReads % 10000 == 0) {
System.out.println("Processed " + numReads + " reads.");
}
FourIntensity[] intensities = ffp.next().getIntensities();
NucleotideSequence rawread = toNucleotideSequence(intensities);
int[] editDistances = new int[controls.size()];
for (int icTemplateIndex = 0; icTemplateIndex < controls.size(); icTemplateIndex++) {
editDistances[icTemplateIndex] = rawread.editDistance(controls.get(icTemplateIndex), 36);
if (editDistances[icTemplateIndex] <= 8) {
recoveredICs.add(intensities);
icids.add(icTemplateIndex);
for (int cycle = 0; cycle < ((controls.get(0).size() < intensities.length) ? controls.get(0).size() : intensities.length); cycle++) {
Nucleotide nuc = controls.get(icTemplateIndex).get(cycle);
FourIntensity sig = intensities[cycle];
cmeans[cycle].add(nuc, sig);
}
break;
}
}
numReads++;
}
// Go through the data again and compute signal covariances
NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[controls.get(0).size()];
for (int i = 0; i < cmeans.length; i++) {
ccov[i] = new NucleotideChannelCovariances();
}
for (int index = 0; index < recoveredICs.size(); index++) {
NucleotideSequence control = controls.get(icids.get(index).intValue());
FourIntensity[] intensities = recoveredICs.get(index);
int seqlength = (control.size() < intensities.length) ? control.size() : intensities.length;
for (int cycle = 0; cycle < seqlength; cycle++) {
Nucleotide nuc = control.get(cycle);
FourIntensity sig = intensities[cycle];
NucleotideChannelMeans mus = cmeans[cycle];
ccov[cycle].add(nuc, sig, mus);
}
}
// Now compute probabilities for the bases
Algebra alg = new Algebra();
for (int cycle = 0; cycle < recoveredICs.get(0).length; cycle++) {
ccov[cycle].invert();
}
FirecrestFileParser ffp2 = new FirecrestFileParser(firecrestDirectory, lane);
numReads = 0;
SAMFileHeader sfh = new SAMFileHeader();
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, new File("/wga/scr1/GSA/kiran/basecalling/simulation/test.sam"));
while (ffp2.hasNext() && numReads < 100000) {
FourIntensity[] intensities = ffp2.next().getIntensities();
byte[] asciiseq = new byte[intensities.length];
byte[] bestqual = new byte[intensities.length];
byte[] nextbestqual = new byte[intensities.length];
for (int cycle = 0; cycle < intensities.length; cycle++) {
FourIntensity fi = intensities[cycle];
double[] likes = new double[4];
double total = 0.0;
for (Nucleotide nuc : Nucleotide.values()) {
double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0);
DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc));
DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub);
double exparg = -0.5*alg.mult(sub, Ax);
likes[nuc.ordinal()] = norm*Math.exp(exparg);
total += likes[nuc.ordinal()];
}
Nucleotide call1 = Nucleotide.A;
double prob1 = likes[0]/total;
for (int i = 1; i < 4; i++) {
if (likes[i]/total > prob1) {
prob1 = likes[i]/total;
switch (i) {
case 1: call1 = Nucleotide.C; break;
case 2: call1 = Nucleotide.G; break;
case 3: call1 = Nucleotide.T; break;
}
}
}
Nucleotide call2 = Nucleotide.A;
double prob2 = 0.0;
for (int i = 0; i < 4; i++) {
if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) {
prob2 = likes[i]/total;
switch (i) {
case 0: call2 = Nucleotide.A; break;
case 1: call2 = Nucleotide.C; break;
case 2: call2 = Nucleotide.G; break;
case 3: call2 = Nucleotide.T; break;
}
}
}
asciiseq[cycle] = (byte) call1.asChar();
bestqual[cycle] = toPhredScore(prob1);
nextbestqual[cycle] = toCompressedQuality(call2, prob2);
//System.out.print(call.asChar());
}
SAMRecord sr = new SAMRecord(sfh);
sr.setReadName(Integer.toString(numReads));
sr.setReadUmappedFlag(true);
sr.setReadBases(asciiseq);
sr.setBaseQualities(bestqual);
sr.setAttribute("SQ", nextbestqual);
sfw.addAlignment(sr);
numReads++;
}
sfw.close();
}
private static byte toPhredScore(double prob) {
byte qual = (prob - 1.0 < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob));
return (qual > 40) ? 40 : qual;
}
private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) {
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4);
for (int i = 0; i < 4; i++) {
sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i));
}
return sub;
}
private static byte toCompressedQuality(Nucleotide base, double prob) {
byte compressedQual = (byte) base.ordinal();
byte cprob = (byte) (100.0*prob);
byte qualmask = (byte) 252;
compressedQual += ((cprob << 2) & qualmask);
return compressedQual;
}
private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) {
NucleotideSequence ns = new NucleotideSequence(intensities.length);
for (int cycle = 0; cycle < intensities.length; cycle++) {
int brightestChannel = intensities[cycle].brightestChannel();
Nucleotide nt = Nucleotide.A;
switch (brightestChannel) {
case 0: nt = Nucleotide.A; break;
case 1: nt = Nucleotide.C; break;
case 2: nt = Nucleotide.G; break;
case 3: nt = Nucleotide.T; break;
}
ns.set(cycle, nt);
}
return ns;
}
}

View File

@ -0,0 +1,51 @@
package org.broadinstitute.sting.projects.fourbasecaller;
import java.io.File;
import java.util.Vector;
import org.broadinstitute.sting.projects.fourbasecaller.NucleotideSequence;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
class ManyNucleotideSequences extends java.util.Vector<NucleotideSequence> {
public ManyNucleotideSequences() {}
public ManyNucleotideSequences(int initialCapacity) {
this.setSize(initialCapacity);
}
public ManyNucleotideSequences(String seqpath) {
ReferenceSequenceFile seqfile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(seqpath));
ReferenceSequence rs;
while ((rs = seqfile.nextSequence()) != null) {
byte[] bases = rs.getBases();
NucleotideSequence ns = new NucleotideSequence(bases.length);
for (int locus = 0; locus < bases.length; locus++) {
Nucleotide nt;
switch(bases[locus]) {
case 'A': nt = Nucleotide.A; break;
case 'C': nt = Nucleotide.C; break;
case 'G': nt = Nucleotide.G; break;
case 'T': nt = Nucleotide.T; break;
default: nt = Nucleotide.T; break;
}
ns.set(locus, nt);
}
this.add(ns);
}
}
public ManyNucleotideSequences reverseComplement() {
for (int contig = 0; contig < this.size(); contig++) {
this.get(contig).reverseComplement();
}
return this;
}
}

View File

@ -0,0 +1,36 @@
package org.broadinstitute.sting.projects.fourbasecaller;
import java.lang.String;
public enum Nucleotide {
A, C, G, T;
Nucleotide complement() {
switch (this) {
case A: return T;
case C: return G;
case G: return C;
case T: return A;
}
throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
}
public char asChar() {
switch(this) {
case A: return 'A';
case C: return 'C';
case G: return 'G';
case T: return 'T';
}
throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
}
public String toString() {
char[] value = new char[1];
value[0] = this.asChar();
return new String(value);
}
}

View File

@ -0,0 +1,64 @@
package org.broadinstitute.sting.projects.fourbasecaller;
import cern.colt.matrix.DoubleFactory2D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.linalg.Algebra;
import org.broadinstitute.sting.illumina.FourIntensity;
public class NucleotideChannelCovariances {
private DoubleMatrix2D[] covs;
private int[] counts;
public NucleotideChannelCovariances() {
counts = new int[4];
covs = new DoubleMatrix2D[4];
for (int i = 0; i < 4; i++) {
covs[i] = (DoubleFactory2D.dense).make(4, 4);
}
}
public void add(Nucleotide nuc, FourIntensity sig, NucleotideChannelMeans mus) {
FourIntensity f = new FourIntensity(sig);
f.subtract(mus.channelMeans(nuc));
DoubleMatrix2D cov = covs[nuc.ordinal()];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
cov.set(i, j, cov.get(i, j) + f.getChannelIntensity(i)*f.getChannelIntensity(j));
}
}
counts[nuc.ordinal()]++;
}
public DoubleMatrix2D channelCovariances(Nucleotide base) {
DoubleMatrix2D newcov = covs[base.ordinal()].copy();
/*
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
newcov.set(i, j, newcov.get(i, j)/((double) counts[base.ordinal()]));
}
}
*/
return newcov;
}
public void invert() {
Algebra alg = new Algebra();
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
for (int k = 0; k < 4; k++) {
covs[i].set(j, k, covs[i].get(j, k)/((double) counts[i]));
}
}
DoubleMatrix2D covinv = alg.inverse(covs[i]);
covs[i] = covinv;
}
}
}

View File

@ -0,0 +1,42 @@
package org.broadinstitute.sting.projects.fourbasecaller;
import org.broadinstitute.sting.illumina.FourIntensity;
public class NucleotideChannelMeans {
private FourIntensity[] sigs;
private int[] counts;
public NucleotideChannelMeans() {
counts = new int[4];
sigs = new FourIntensity[4];
for (int base = 0; base < 4; base++) {
sigs[base] = new FourIntensity();
}
}
public void add(Nucleotide base, FourIntensity intensity) {
sigs[base.ordinal()].add(intensity);
counts[base.ordinal()]++;
}
public FourIntensity channelMeans(Nucleotide base) {
FourIntensity meansig = new FourIntensity(sigs[base.ordinal()]);
meansig.divide(counts[base.ordinal()]);
return meansig;
}
public float channelMean(Nucleotide base, int channel) {
FourIntensity meansig = channelMeans(base);
return meansig.getChannelIntensity(channel);
}
public String toString() {
return "a:{" + channelMean(Nucleotide.A, 0) + " " + channelMean(Nucleotide.A, 1) + " " + channelMean(Nucleotide.A, 2) + " " + channelMean(Nucleotide.A, 3) + "} " +
"c:{" + channelMean(Nucleotide.C, 0) + " " + channelMean(Nucleotide.C, 1) + " " + channelMean(Nucleotide.C, 2) + " " + channelMean(Nucleotide.C, 3) + "} " +
"g:{" + channelMean(Nucleotide.G, 0) + " " + channelMean(Nucleotide.G, 1) + " " + channelMean(Nucleotide.G, 2) + " " + channelMean(Nucleotide.G, 3) + "} " +
"t:{" + channelMean(Nucleotide.T, 0) + " " + channelMean(Nucleotide.T, 1) + " " + channelMean(Nucleotide.T, 2) + " " + channelMean(Nucleotide.T, 3) + "}";
}
}

View File

@ -0,0 +1,59 @@
package org.broadinstitute.sting.projects.fourbasecaller;
import org.broadinstitute.sting.projects.fourbasecaller.Nucleotide;
import java.lang.String;
import java.util.Vector;
class NucleotideSequence extends java.util.Vector<Nucleotide> {
public NucleotideSequence(int initialCapacity) {
this.setSize(initialCapacity);
}
public NucleotideSequence(NucleotideSequence ns) {
this.setSize(ns.size());
for (int locus = 0; locus < ns.size(); locus++) {
switch (ns.get(locus)) {
case A: this.set(locus, Nucleotide.A); break;
case C: this.set(locus, Nucleotide.C); break;
case G: this.set(locus, Nucleotide.G); break;
case T: this.set(locus, Nucleotide.T); break;
}
}
}
public NucleotideSequence reverseComplement() {
if (this.size() % 2 != 0) {
this.get(this.size()/2 + 1).complement();
}
for (int locus = 0; locus < this.size()/2; locus++) {
Nucleotide temp = this.get(locus).complement();
this.set(locus, this.get(this.size() - locus - 1).complement());
this.set(this.size() - locus - 1, temp);
}
return this;
}
public int editDistance(NucleotideSequence ns) { return editDistance(ns, this.size() < ns.size() ? this.size() : ns.size()); }
public int editDistance(NucleotideSequence ns, int searchLength) {
int editDistance = 0;
for (int locus = 0; locus < searchLength; locus++) {
editDistance += (this.get(locus) != ns.get(locus)) ? 1 : 0;
}
return editDistance;
}
public String toString() {
char[] charbuffer = new char[this.size()];
for (int locus = 0; locus < this.size(); locus++) {
charbuffer[locus] = this.get(locus).asChar();
}
return new String(charbuffer);
}
}