Refactored into oblivon.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@276 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-04-02 22:12:15 +00:00
parent dffc879240
commit 7d889c0661
6 changed files with 0 additions and 495 deletions

View File

@ -1,243 +0,0 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import java.io.File;
import java.util.Vector;
import java.lang.Math;
import org.broadinstitute.sting.playground.illumina.FirecrestFileParser;
import org.broadinstitute.sting.playground.illumina.FourIntensity;
import cern.colt.matrix.linalg.Algebra;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleFactory1D;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
public class FourBaseCaller {
public static void main(String[] argv) {
// Parse args
File FIRECREST_DIR = new File(argv[0]);
int LANE = Integer.valueOf(argv[1]);
File SAM_OUT = new File(argv[2]);
int CYCLE_START = Integer.valueOf(argv[3]);
int CYCLE_STOP = Integer.valueOf(argv[4]);
String IC_REF_PATH = "/seq/references/Synthetic_internal_controls_set1/v0/Synthetic_internal_controls_set1.fasta";
// Load ic reference in an ugly way
ManyNucleotideSequences fw_controls = new ManyNucleotideSequences(IC_REF_PATH);
ManyNucleotideSequences rc_controls = new ManyNucleotideSequences(IC_REF_PATH);
rc_controls.reverseComplement();
ManyNucleotideSequences controls = new ManyNucleotideSequences();
controls.addAll(fw_controls);
controls.addAll(rc_controls);
// Loop through firecrest data, find ICs, and compute signal means
Vector<FourIntensity[]> recoveredICs = new Vector<FourIntensity[]>();
Vector<Integer> icids = new Vector<Integer>();
FirecrestFileParser ffp = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP);
NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()];
for (int i = 0; i < cmeans.length; i++) {
cmeans[i] = new NucleotideChannelMeans();
}
int numReads = 0;
while (ffp.hasNext()) {
if (numReads % 1000000 == 0) {
System.err.println("Processed " + numReads + " reads for parameters.");
}
FourIntensity[] intensities = ffp.next().getIntensities();
NucleotideSequence rawread = toNucleotideSequence(intensities);
int[] editDistances = new int[controls.size()];
for (int icTemplateIndex = 0; icTemplateIndex < controls.size(); icTemplateIndex++) {
editDistances[icTemplateIndex] = rawread.editDistance(controls.get(icTemplateIndex), 36);
if (editDistances[icTemplateIndex] <= 8) {
recoveredICs.add(intensities);
icids.add(icTemplateIndex);
for (int cycle = 0; cycle < ((controls.get(0).size() < intensities.length) ? controls.get(0).size() : intensities.length); cycle++) {
Nucleotide nuc = controls.get(icTemplateIndex).get(cycle);
FourIntensity sig = intensities[cycle];
cmeans[cycle].add(nuc, sig);
}
System.err.print("Found " + icids.size() + " ICs in " + numReads + " reads " + (((double) icids.size())/((double) numReads)) + ".\r");
break;
}
}
numReads++;
}
// Go through the data again and compute signal covariances
NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[controls.get(0).size()];
for (int i = 0; i < cmeans.length; i++) {
ccov[i] = new NucleotideChannelCovariances();
}
for (int index = 0; index < recoveredICs.size(); index++) {
NucleotideSequence control = controls.get(icids.get(index).intValue());
FourIntensity[] intensities = recoveredICs.get(index);
int seqlength = (control.size() < intensities.length) ? control.size() : intensities.length;
for (int cycle = 0; cycle < seqlength; cycle++) {
Nucleotide nuc = control.get(cycle);
FourIntensity sig = intensities[cycle];
NucleotideChannelMeans mus = cmeans[cycle];
ccov[cycle].add(nuc, sig, mus);
}
}
System.out.println("Found " + recoveredICs.size() + " in " + numReads + " reads.");
// Extend IC solutions to end of read length (if ICs are shorter than read)
System.err.println("Extending cycle " + (controls.get(0).size() - 1) + " parameters from cycle " + controls.get(0).size() + " to cycle " + recoveredICs.get(0).length);
for (int cycle = controls.get(0).size(); cycle < recoveredICs.get(0).length; cycle++) {
cmeans[cycle] = cmeans[controls.get(0).size() - 1];
ccov[cycle] = ccov[controls.get(0).size() - 1];
}
// Now compute probabilities for the bases
Algebra alg = new Algebra();
for (int cycle = 0; cycle < recoveredICs.get(0).length; cycle++) {
ccov[cycle].invert();
}
FirecrestFileParser ffp2 = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP);
SAMFileHeader sfh = new SAMFileHeader();
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
numReads = 0;
while (ffp2.hasNext()) {
if (numReads % 1000000 == 0) {
System.err.println("Basecalled " + numReads + " reads.");
}
FourIntensity[] intensities = ffp2.next().getIntensities();
byte[] asciiseq = new byte[intensities.length];
byte[] bestqual = new byte[intensities.length];
byte[] nextbestqual = new byte[intensities.length];
for (int cycle = 0; cycle < intensities.length; cycle++) {
FourIntensity fi = intensities[cycle];
double[] likes = new double[4];
double total = 0.0;
for (Nucleotide nuc : Nucleotide.values()) {
double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0);
DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc));
DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub);
double exparg = -0.5*alg.mult(sub, Ax);
likes[nuc.ordinal()] = norm*Math.exp(exparg);
total += likes[nuc.ordinal()];
}
Nucleotide call1 = Nucleotide.A;
double prob1 = likes[0]/total;
for (int i = 1; i < 4; i++) {
if (likes[i]/total > prob1) {
prob1 = likes[i]/total;
switch (i) {
case 1: call1 = Nucleotide.C; break;
case 2: call1 = Nucleotide.G; break;
case 3: call1 = Nucleotide.T; break;
}
}
}
Nucleotide call2 = Nucleotide.A;
double prob2 = 0.0;
for (int i = 0; i < 4; i++) {
if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) {
prob2 = likes[i]/total;
switch (i) {
case 0: call2 = Nucleotide.A; break;
case 1: call2 = Nucleotide.C; break;
case 2: call2 = Nucleotide.G; break;
case 3: call2 = Nucleotide.T; break;
}
}
}
asciiseq[cycle] = (byte) call1.asChar();
bestqual[cycle] = toPhredScore(prob1);
nextbestqual[cycle] = toCompressedQuality(call2, prob2);
}
SAMRecord sr = new SAMRecord(sfh);
sr.setReadName(Integer.toString(numReads));
sr.setReadUmappedFlag(true);
sr.setReadBases(asciiseq);
sr.setBaseQualities(bestqual);
sr.setAttribute("SQ", nextbestqual);
sfw.addAlignment(sr);
numReads++;
}
sfw.close();
System.out.println("Done.");
}
private static byte toPhredScore(double prob) {
byte qual = (1.0 - prob < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob));
return (qual > 40) ? 40 : qual;
}
private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) {
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4);
for (int i = 0; i < 4; i++) {
sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i));
}
return sub;
}
private static byte toCompressedQuality(Nucleotide base, double prob) {
byte compressedQual = (byte) base.ordinal();
byte cprob = (byte) (100.0*prob);
byte qualmask = (byte) 252;
compressedQual += ((cprob << 2) & qualmask);
return compressedQual;
}
private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) {
NucleotideSequence ns = new NucleotideSequence(intensities.length);
for (int cycle = 0; cycle < intensities.length; cycle++) {
int brightestChannel = intensities[cycle].brightestChannel();
Nucleotide nt = Nucleotide.A;
switch (brightestChannel) {
case 0: nt = Nucleotide.A; break;
case 1: nt = Nucleotide.C; break;
case 2: nt = Nucleotide.G; break;
case 3: nt = Nucleotide.T; break;
}
ns.set(cycle, nt);
}
return ns;
}
}

View File

@ -1,51 +0,0 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import java.io.File;
import java.util.Vector;
import org.broadinstitute.sting.playground.fourbasecaller.NucleotideSequence;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
class ManyNucleotideSequences extends java.util.Vector<NucleotideSequence> {
public ManyNucleotideSequences() {}
public ManyNucleotideSequences(int initialCapacity) {
this.setSize(initialCapacity);
}
public ManyNucleotideSequences(String seqpath) {
ReferenceSequenceFile seqfile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(seqpath));
ReferenceSequence rs;
while ((rs = seqfile.nextSequence()) != null) {
byte[] bases = rs.getBases();
NucleotideSequence ns = new NucleotideSequence(bases.length);
for (int locus = 0; locus < bases.length; locus++) {
Nucleotide nt;
switch(bases[locus]) {
case 'A': nt = Nucleotide.A; break;
case 'C': nt = Nucleotide.C; break;
case 'G': nt = Nucleotide.G; break;
case 'T': nt = Nucleotide.T; break;
default: nt = Nucleotide.T; break;
}
ns.set(locus, nt);
}
this.add(ns);
}
}
public ManyNucleotideSequences reverseComplement() {
for (int contig = 0; contig < this.size(); contig++) {
this.get(contig).reverseComplement();
}
return this;
}
}

View File

@ -1,36 +0,0 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import java.lang.String;
public enum Nucleotide {
A, C, G, T;
Nucleotide complement() {
switch (this) {
case A: return T;
case C: return G;
case G: return C;
case T: return A;
}
throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
}
public char asChar() {
switch(this) {
case A: return 'A';
case C: return 'C';
case G: return 'G';
case T: return 'T';
}
throw new IllegalArgumentException("Nucleotide must be A, C, G, or T.");
}
public String toString() {
char[] value = new char[1];
value[0] = this.asChar();
return new String(value);
}
}

View File

@ -1,64 +0,0 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import cern.colt.matrix.DoubleFactory2D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.linalg.Algebra;
import org.broadinstitute.sting.playground.illumina.FourIntensity;
public class NucleotideChannelCovariances {
private DoubleMatrix2D[] covs;
private int[] counts;
public NucleotideChannelCovariances() {
counts = new int[4];
covs = new DoubleMatrix2D[4];
for (int i = 0; i < 4; i++) {
covs[i] = (DoubleFactory2D.dense).make(4, 4);
}
}
public void add(Nucleotide nuc, FourIntensity sig, NucleotideChannelMeans mus) {
FourIntensity f = new FourIntensity(sig);
f.subtract(mus.channelMeans(nuc));
DoubleMatrix2D cov = covs[nuc.ordinal()];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
cov.set(i, j, cov.get(i, j) + f.getChannelIntensity(i)*f.getChannelIntensity(j));
}
}
counts[nuc.ordinal()]++;
}
public DoubleMatrix2D channelCovariances(Nucleotide base) {
DoubleMatrix2D newcov = covs[base.ordinal()].copy();
/*
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
newcov.set(i, j, newcov.get(i, j)/((double) counts[base.ordinal()]));
}
}
*/
return newcov;
}
public void invert() {
Algebra alg = new Algebra();
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
for (int k = 0; k < 4; k++) {
covs[i].set(j, k, covs[i].get(j, k)/((double) counts[i]));
}
}
DoubleMatrix2D covinv = alg.inverse(covs[i]);
covs[i] = covinv;
}
}
}

View File

@ -1,42 +0,0 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import org.broadinstitute.sting.playground.illumina.FourIntensity;
public class NucleotideChannelMeans {
private FourIntensity[] sigs;
private int[] counts;
public NucleotideChannelMeans() {
counts = new int[4];
sigs = new FourIntensity[4];
for (int base = 0; base < 4; base++) {
sigs[base] = new FourIntensity();
}
}
public void add(Nucleotide base, FourIntensity intensity) {
sigs[base.ordinal()].add(intensity);
counts[base.ordinal()]++;
}
public FourIntensity channelMeans(Nucleotide base) {
FourIntensity meansig = new FourIntensity(sigs[base.ordinal()]);
meansig.divide(counts[base.ordinal()]);
return meansig;
}
public float channelMean(Nucleotide base, int channel) {
FourIntensity meansig = channelMeans(base);
return meansig.getChannelIntensity(channel);
}
public String toString() {
return "a:{" + channelMean(Nucleotide.A, 0) + " " + channelMean(Nucleotide.A, 1) + " " + channelMean(Nucleotide.A, 2) + " " + channelMean(Nucleotide.A, 3) + "} " +
"c:{" + channelMean(Nucleotide.C, 0) + " " + channelMean(Nucleotide.C, 1) + " " + channelMean(Nucleotide.C, 2) + " " + channelMean(Nucleotide.C, 3) + "} " +
"g:{" + channelMean(Nucleotide.G, 0) + " " + channelMean(Nucleotide.G, 1) + " " + channelMean(Nucleotide.G, 2) + " " + channelMean(Nucleotide.G, 3) + "} " +
"t:{" + channelMean(Nucleotide.T, 0) + " " + channelMean(Nucleotide.T, 1) + " " + channelMean(Nucleotide.T, 2) + " " + channelMean(Nucleotide.T, 3) + "}";
}
}

View File

@ -1,59 +0,0 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import org.broadinstitute.sting.playground.fourbasecaller.Nucleotide;
import java.lang.String;
import java.util.Vector;
class NucleotideSequence extends java.util.Vector<Nucleotide> {
public NucleotideSequence(int initialCapacity) {
this.setSize(initialCapacity);
}
public NucleotideSequence(NucleotideSequence ns) {
this.setSize(ns.size());
for (int locus = 0; locus < ns.size(); locus++) {
switch (ns.get(locus)) {
case A: this.set(locus, Nucleotide.A); break;
case C: this.set(locus, Nucleotide.C); break;
case G: this.set(locus, Nucleotide.G); break;
case T: this.set(locus, Nucleotide.T); break;
}
}
}
public NucleotideSequence reverseComplement() {
if (this.size() % 2 != 0) {
this.get(this.size()/2 + 1).complement();
}
for (int locus = 0; locus < this.size()/2; locus++) {
Nucleotide temp = this.get(locus).complement();
this.set(locus, this.get(this.size() - locus - 1).complement());
this.set(this.size() - locus - 1, temp);
}
return this;
}
public int editDistance(NucleotideSequence ns) { return editDistance(ns, this.size() < ns.size() ? this.size() : ns.size()); }
public int editDistance(NucleotideSequence ns, int searchLength) {
int editDistance = 0;
for (int locus = 0; locus < searchLength; locus++) {
editDistance += (this.get(locus) != ns.get(locus)) ? 1 : 0;
}
return editDistance;
}
public String toString() {
char[] charbuffer = new char[this.size()];
for (int locus = 0; locus < this.size(); locus++) {
charbuffer[locus] = this.get(locus).asChar();
}
return new String(charbuffer);
}
}