Java version of basecaller that estimates probability distribution over four-base hypothesis space via an internal-control-initialized Gaussian mixture model over base channel intensities.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@127 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3e350006e0
commit
6e1fa7d61a
|
|
@ -0,0 +1,221 @@
|
|||
package org.broadinstitute.sting.projects.fourbasecaller;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Vector;
|
||||
import java.lang.Math;
|
||||
|
||||
import org.broadinstitute.sting.illumina.FirecrestFileParser;
|
||||
import org.broadinstitute.sting.illumina.FourIntensity;
|
||||
import cern.colt.matrix.linalg.Algebra;
|
||||
import cern.colt.matrix.DoubleMatrix1D;
|
||||
import cern.colt.matrix.DoubleFactory1D;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class FourBaseCaller {
|
||||
public static void main(String[] args) {
|
||||
// Parse args
|
||||
File firecrestDirectory = new File(args[0]);
|
||||
int lane = Integer.valueOf(args[1]);
|
||||
String icrefpath = args[2];
|
||||
|
||||
// Load ic reference
|
||||
ManyNucleotideSequences controls = new ManyNucleotideSequences(icrefpath);
|
||||
controls.reverseComplement();
|
||||
|
||||
// Loop through firecrest data, find ICs, and compute signal means
|
||||
Vector<FourIntensity[]> recoveredICs = new Vector<FourIntensity[]>();
|
||||
Vector<Integer> icids = new Vector<Integer>();
|
||||
|
||||
FirecrestFileParser ffp = new FirecrestFileParser(firecrestDirectory, lane);
|
||||
|
||||
NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()];
|
||||
for (int i = 0; i < cmeans.length; i++) {
|
||||
cmeans[i] = new NucleotideChannelMeans();
|
||||
}
|
||||
|
||||
int numReads = 0;
|
||||
while (ffp.hasNext() && numReads < 100000) {
|
||||
if (numReads % 10000 == 0) {
|
||||
System.out.println("Processed " + numReads + " reads.");
|
||||
}
|
||||
FourIntensity[] intensities = ffp.next().getIntensities();
|
||||
NucleotideSequence rawread = toNucleotideSequence(intensities);
|
||||
|
||||
int[] editDistances = new int[controls.size()];
|
||||
|
||||
for (int icTemplateIndex = 0; icTemplateIndex < controls.size(); icTemplateIndex++) {
|
||||
editDistances[icTemplateIndex] = rawread.editDistance(controls.get(icTemplateIndex), 36);
|
||||
|
||||
if (editDistances[icTemplateIndex] <= 8) {
|
||||
recoveredICs.add(intensities);
|
||||
icids.add(icTemplateIndex);
|
||||
|
||||
for (int cycle = 0; cycle < ((controls.get(0).size() < intensities.length) ? controls.get(0).size() : intensities.length); cycle++) {
|
||||
Nucleotide nuc = controls.get(icTemplateIndex).get(cycle);
|
||||
FourIntensity sig = intensities[cycle];
|
||||
|
||||
cmeans[cycle].add(nuc, sig);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numReads++;
|
||||
}
|
||||
|
||||
// Go through the data again and compute signal covariances
|
||||
NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[controls.get(0).size()];
|
||||
for (int i = 0; i < cmeans.length; i++) {
|
||||
ccov[i] = new NucleotideChannelCovariances();
|
||||
}
|
||||
|
||||
for (int index = 0; index < recoveredICs.size(); index++) {
|
||||
NucleotideSequence control = controls.get(icids.get(index).intValue());
|
||||
FourIntensity[] intensities = recoveredICs.get(index);
|
||||
int seqlength = (control.size() < intensities.length) ? control.size() : intensities.length;
|
||||
|
||||
for (int cycle = 0; cycle < seqlength; cycle++) {
|
||||
Nucleotide nuc = control.get(cycle);
|
||||
FourIntensity sig = intensities[cycle];
|
||||
NucleotideChannelMeans mus = cmeans[cycle];
|
||||
|
||||
ccov[cycle].add(nuc, sig, mus);
|
||||
}
|
||||
}
|
||||
|
||||
// Now compute probabilities for the bases
|
||||
Algebra alg = new Algebra();
|
||||
|
||||
for (int cycle = 0; cycle < recoveredICs.get(0).length; cycle++) {
|
||||
ccov[cycle].invert();
|
||||
}
|
||||
|
||||
FirecrestFileParser ffp2 = new FirecrestFileParser(firecrestDirectory, lane);
|
||||
|
||||
numReads = 0;
|
||||
|
||||
SAMFileHeader sfh = new SAMFileHeader();
|
||||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, new File("/wga/scr1/GSA/kiran/basecalling/simulation/test.sam"));
|
||||
|
||||
while (ffp2.hasNext() && numReads < 100000) {
|
||||
FourIntensity[] intensities = ffp2.next().getIntensities();
|
||||
|
||||
byte[] asciiseq = new byte[intensities.length];
|
||||
byte[] bestqual = new byte[intensities.length];
|
||||
byte[] nextbestqual = new byte[intensities.length];
|
||||
|
||||
for (int cycle = 0; cycle < intensities.length; cycle++) {
|
||||
FourIntensity fi = intensities[cycle];
|
||||
|
||||
double[] likes = new double[4];
|
||||
double total = 0.0;
|
||||
|
||||
for (Nucleotide nuc : Nucleotide.values()) {
|
||||
double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0);
|
||||
|
||||
DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc));
|
||||
DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub);
|
||||
|
||||
double exparg = -0.5*alg.mult(sub, Ax);
|
||||
likes[nuc.ordinal()] = norm*Math.exp(exparg);
|
||||
total += likes[nuc.ordinal()];
|
||||
}
|
||||
|
||||
Nucleotide call1 = Nucleotide.A;
|
||||
double prob1 = likes[0]/total;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
if (likes[i]/total > prob1) {
|
||||
prob1 = likes[i]/total;
|
||||
|
||||
switch (i) {
|
||||
case 1: call1 = Nucleotide.C; break;
|
||||
case 2: call1 = Nucleotide.G; break;
|
||||
case 3: call1 = Nucleotide.T; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Nucleotide call2 = Nucleotide.A;
|
||||
double prob2 = 0.0;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) {
|
||||
prob2 = likes[i]/total;
|
||||
|
||||
switch (i) {
|
||||
case 0: call2 = Nucleotide.A; break;
|
||||
case 1: call2 = Nucleotide.C; break;
|
||||
case 2: call2 = Nucleotide.G; break;
|
||||
case 3: call2 = Nucleotide.T; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
asciiseq[cycle] = (byte) call1.asChar();
|
||||
bestqual[cycle] = toPhredScore(prob1);
|
||||
nextbestqual[cycle] = toCompressedQuality(call2, prob2);
|
||||
|
||||
//System.out.print(call.asChar());
|
||||
}
|
||||
|
||||
SAMRecord sr = new SAMRecord(sfh);
|
||||
sr.setReadName(Integer.toString(numReads));
|
||||
sr.setReadUmappedFlag(true);
|
||||
sr.setReadBases(asciiseq);
|
||||
sr.setBaseQualities(bestqual);
|
||||
sr.setAttribute("SQ", nextbestqual);
|
||||
sfw.addAlignment(sr);
|
||||
|
||||
numReads++;
|
||||
}
|
||||
|
||||
sfw.close();
|
||||
}
|
||||
|
||||
private static byte toPhredScore(double prob) {
|
||||
byte qual = (prob - 1.0 < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob));
|
||||
return (qual > 40) ? 40 : qual;
|
||||
}
|
||||
|
||||
private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) {
|
||||
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4);
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i));
|
||||
}
|
||||
|
||||
return sub;
|
||||
}
|
||||
|
||||
private static byte toCompressedQuality(Nucleotide base, double prob) {
|
||||
byte compressedQual = (byte) base.ordinal();
|
||||
byte cprob = (byte) (100.0*prob);
|
||||
byte qualmask = (byte) 252;
|
||||
compressedQual += ((cprob << 2) & qualmask);
|
||||
|
||||
return compressedQual;
|
||||
}
|
||||
|
||||
private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) {
|
||||
NucleotideSequence ns = new NucleotideSequence(intensities.length);
|
||||
|
||||
for (int cycle = 0; cycle < intensities.length; cycle++) {
|
||||
int brightestChannel = intensities[cycle].brightestChannel();
|
||||
|
||||
Nucleotide nt = Nucleotide.A;
|
||||
switch (brightestChannel) {
|
||||
case 0: nt = Nucleotide.A; break;
|
||||
case 1: nt = Nucleotide.C; break;
|
||||
case 2: nt = Nucleotide.G; break;
|
||||
case 3: nt = Nucleotide.T; break;
|
||||
}
|
||||
|
||||
ns.set(cycle, nt);
|
||||
}
|
||||
|
||||
return ns;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
package org.broadinstitute.sting.projects.fourbasecaller;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.broadinstitute.sting.projects.fourbasecaller.NucleotideSequence;
|
||||
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
|
||||
class ManyNucleotideSequences extends java.util.Vector<NucleotideSequence> {
|
||||
public ManyNucleotideSequences() {}
|
||||
|
||||
public ManyNucleotideSequences(int initialCapacity) {
|
||||
this.setSize(initialCapacity);
|
||||
}
|
||||
|
||||
public ManyNucleotideSequences(String seqpath) {
|
||||
ReferenceSequenceFile seqfile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(seqpath));
|
||||
|
||||
ReferenceSequence rs;
|
||||
while ((rs = seqfile.nextSequence()) != null) {
|
||||
byte[] bases = rs.getBases();
|
||||
|
||||
NucleotideSequence ns = new NucleotideSequence(bases.length);
|
||||
for (int locus = 0; locus < bases.length; locus++) {
|
||||
Nucleotide nt;
|
||||
switch(bases[locus]) {
|
||||
case 'A': nt = Nucleotide.A; break;
|
||||
case 'C': nt = Nucleotide.C; break;
|
||||
case 'G': nt = Nucleotide.G; break;
|
||||
case 'T': nt = Nucleotide.T; break;
|
||||
default: nt = Nucleotide.T; break;
|
||||
}
|
||||
|
||||
ns.set(locus, nt);
|
||||
}
|
||||
|
||||
this.add(ns);
|
||||
}
|
||||
}
|
||||
|
||||
public ManyNucleotideSequences reverseComplement() {
|
||||
for (int contig = 0; contig < this.size(); contig++) {
|
||||
this.get(contig).reverseComplement();
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
package org.broadinstitute.sting.projects.fourbasecaller;
|
||||
|
||||
import java.lang.String;
|
||||
|
||||
public enum Nucleotide {
|
||||
A, C, G, T;
|
||||
|
||||
Nucleotide complement() {
|
||||
switch (this) {
|
||||
case A: return T;
|
||||
case C: return G;
|
||||
case G: return C;
|
||||
case T: return A;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
|
||||
}
|
||||
|
||||
public char asChar() {
|
||||
switch(this) {
|
||||
case A: return 'A';
|
||||
case C: return 'C';
|
||||
case G: return 'G';
|
||||
case T: return 'T';
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
char[] value = new char[1];
|
||||
value[0] = this.asChar();
|
||||
|
||||
return new String(value);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
package org.broadinstitute.sting.projects.fourbasecaller;
|
||||
|
||||
import cern.colt.matrix.DoubleFactory2D;
|
||||
import cern.colt.matrix.DoubleMatrix2D;
|
||||
import cern.colt.matrix.linalg.Algebra;
|
||||
import org.broadinstitute.sting.illumina.FourIntensity;
|
||||
|
||||
public class NucleotideChannelCovariances {
|
||||
private DoubleMatrix2D[] covs;
|
||||
private int[] counts;
|
||||
|
||||
public NucleotideChannelCovariances() {
|
||||
counts = new int[4];
|
||||
|
||||
covs = new DoubleMatrix2D[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
covs[i] = (DoubleFactory2D.dense).make(4, 4);
|
||||
}
|
||||
}
|
||||
|
||||
public void add(Nucleotide nuc, FourIntensity sig, NucleotideChannelMeans mus) {
|
||||
FourIntensity f = new FourIntensity(sig);
|
||||
f.subtract(mus.channelMeans(nuc));
|
||||
|
||||
DoubleMatrix2D cov = covs[nuc.ordinal()];
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
cov.set(i, j, cov.get(i, j) + f.getChannelIntensity(i)*f.getChannelIntensity(j));
|
||||
}
|
||||
}
|
||||
|
||||
counts[nuc.ordinal()]++;
|
||||
}
|
||||
|
||||
public DoubleMatrix2D channelCovariances(Nucleotide base) {
|
||||
DoubleMatrix2D newcov = covs[base.ordinal()].copy();
|
||||
|
||||
/*
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
newcov.set(i, j, newcov.get(i, j)/((double) counts[base.ordinal()]));
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
return newcov;
|
||||
}
|
||||
|
||||
public void invert() {
|
||||
Algebra alg = new Algebra();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
for (int k = 0; k < 4; k++) {
|
||||
covs[i].set(j, k, covs[i].get(j, k)/((double) counts[i]));
|
||||
}
|
||||
}
|
||||
|
||||
DoubleMatrix2D covinv = alg.inverse(covs[i]);
|
||||
covs[i] = covinv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
package org.broadinstitute.sting.projects.fourbasecaller;
|
||||
|
||||
import org.broadinstitute.sting.illumina.FourIntensity;
|
||||
|
||||
public class NucleotideChannelMeans {
|
||||
private FourIntensity[] sigs;
|
||||
private int[] counts;
|
||||
|
||||
public NucleotideChannelMeans() {
|
||||
counts = new int[4];
|
||||
sigs = new FourIntensity[4];
|
||||
|
||||
for (int base = 0; base < 4; base++) {
|
||||
sigs[base] = new FourIntensity();
|
||||
}
|
||||
}
|
||||
|
||||
public void add(Nucleotide base, FourIntensity intensity) {
|
||||
sigs[base.ordinal()].add(intensity);
|
||||
counts[base.ordinal()]++;
|
||||
}
|
||||
|
||||
public FourIntensity channelMeans(Nucleotide base) {
|
||||
FourIntensity meansig = new FourIntensity(sigs[base.ordinal()]);
|
||||
meansig.divide(counts[base.ordinal()]);
|
||||
|
||||
return meansig;
|
||||
}
|
||||
|
||||
public float channelMean(Nucleotide base, int channel) {
|
||||
FourIntensity meansig = channelMeans(base);
|
||||
|
||||
return meansig.getChannelIntensity(channel);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "a:{" + channelMean(Nucleotide.A, 0) + " " + channelMean(Nucleotide.A, 1) + " " + channelMean(Nucleotide.A, 2) + " " + channelMean(Nucleotide.A, 3) + "} " +
|
||||
"c:{" + channelMean(Nucleotide.C, 0) + " " + channelMean(Nucleotide.C, 1) + " " + channelMean(Nucleotide.C, 2) + " " + channelMean(Nucleotide.C, 3) + "} " +
|
||||
"g:{" + channelMean(Nucleotide.G, 0) + " " + channelMean(Nucleotide.G, 1) + " " + channelMean(Nucleotide.G, 2) + " " + channelMean(Nucleotide.G, 3) + "} " +
|
||||
"t:{" + channelMean(Nucleotide.T, 0) + " " + channelMean(Nucleotide.T, 1) + " " + channelMean(Nucleotide.T, 2) + " " + channelMean(Nucleotide.T, 3) + "}";
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
package org.broadinstitute.sting.projects.fourbasecaller;
|
||||
|
||||
import org.broadinstitute.sting.projects.fourbasecaller.Nucleotide;
|
||||
|
||||
import java.lang.String;
|
||||
import java.util.Vector;
|
||||
|
||||
class NucleotideSequence extends java.util.Vector<Nucleotide> {
|
||||
public NucleotideSequence(int initialCapacity) {
|
||||
this.setSize(initialCapacity);
|
||||
}
|
||||
|
||||
public NucleotideSequence(NucleotideSequence ns) {
|
||||
this.setSize(ns.size());
|
||||
|
||||
for (int locus = 0; locus < ns.size(); locus++) {
|
||||
switch (ns.get(locus)) {
|
||||
case A: this.set(locus, Nucleotide.A); break;
|
||||
case C: this.set(locus, Nucleotide.C); break;
|
||||
case G: this.set(locus, Nucleotide.G); break;
|
||||
case T: this.set(locus, Nucleotide.T); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public NucleotideSequence reverseComplement() {
|
||||
if (this.size() % 2 != 0) {
|
||||
this.get(this.size()/2 + 1).complement();
|
||||
}
|
||||
|
||||
for (int locus = 0; locus < this.size()/2; locus++) {
|
||||
Nucleotide temp = this.get(locus).complement();
|
||||
this.set(locus, this.get(this.size() - locus - 1).complement());
|
||||
this.set(this.size() - locus - 1, temp);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public int editDistance(NucleotideSequence ns) { return editDistance(ns, this.size() < ns.size() ? this.size() : ns.size()); }
|
||||
|
||||
public int editDistance(NucleotideSequence ns, int searchLength) {
|
||||
int editDistance = 0;
|
||||
for (int locus = 0; locus < searchLength; locus++) {
|
||||
editDistance += (this.get(locus) != ns.get(locus)) ? 1 : 0;
|
||||
}
|
||||
|
||||
return editDistance;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
char[] charbuffer = new char[this.size()];
|
||||
for (int locus = 0; locus < this.size(); locus++) {
|
||||
charbuffer[locus] = this.get(locus).asChar();
|
||||
}
|
||||
|
||||
return new String(charbuffer);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue