diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseCaller.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseCaller.java deleted file mode 100644 index ebc749cc0..000000000 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/FourBaseCaller.java +++ /dev/null @@ -1,243 +0,0 @@ -package org.broadinstitute.sting.playground.fourbasecaller; - -import java.io.File; -import java.util.Vector; -import java.lang.Math; - -import org.broadinstitute.sting.playground.illumina.FirecrestFileParser; -import org.broadinstitute.sting.playground.illumina.FourIntensity; -import cern.colt.matrix.linalg.Algebra; -import cern.colt.matrix.DoubleMatrix1D; -import cern.colt.matrix.DoubleFactory1D; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; - -public class FourBaseCaller { - public static void main(String[] argv) { - // Parse args - File FIRECREST_DIR = new File(argv[0]); - int LANE = Integer.valueOf(argv[1]); - File SAM_OUT = new File(argv[2]); - int CYCLE_START = Integer.valueOf(argv[3]); - int CYCLE_STOP = Integer.valueOf(argv[4]); - - String IC_REF_PATH = "/seq/references/Synthetic_internal_controls_set1/v0/Synthetic_internal_controls_set1.fasta"; - - // Load ic reference in an ugly way - ManyNucleotideSequences fw_controls = new ManyNucleotideSequences(IC_REF_PATH); - ManyNucleotideSequences rc_controls = new ManyNucleotideSequences(IC_REF_PATH); - rc_controls.reverseComplement(); - ManyNucleotideSequences controls = new ManyNucleotideSequences(); - controls.addAll(fw_controls); - controls.addAll(rc_controls); - - // Loop through firecrest data, find ICs, and compute signal means - Vector recoveredICs = new Vector(); - Vector icids = new Vector(); - - FirecrestFileParser ffp = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP); - - NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()]; - for (int i = 0; i < cmeans.length; i++) { - cmeans[i] = new NucleotideChannelMeans(); - } - - int numReads = 0; - while (ffp.hasNext()) { - if (numReads % 1000000 == 0) { - System.err.println("Processed " + numReads + " reads for parameters."); - } - FourIntensity[] intensities = ffp.next().getIntensities(); - NucleotideSequence rawread = toNucleotideSequence(intensities); - - int[] editDistances = new int[controls.size()]; - - for (int icTemplateIndex = 0; icTemplateIndex < controls.size(); icTemplateIndex++) { - editDistances[icTemplateIndex] = rawread.editDistance(controls.get(icTemplateIndex), 36); - - if (editDistances[icTemplateIndex] <= 8) { - recoveredICs.add(intensities); - icids.add(icTemplateIndex); - - for (int cycle = 0; cycle < ((controls.get(0).size() < intensities.length) ? controls.get(0).size() : intensities.length); cycle++) { - Nucleotide nuc = controls.get(icTemplateIndex).get(cycle); - FourIntensity sig = intensities[cycle]; - - cmeans[cycle].add(nuc, sig); - } - - System.err.print("Found " + icids.size() + " ICs in " + numReads + " reads " + (((double) icids.size())/((double) numReads)) + ".\r"); - - break; - } - } - - numReads++; - } - - // Go through the data again and compute signal covariances - NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[controls.get(0).size()]; - for (int i = 0; i < cmeans.length; i++) { - ccov[i] = new NucleotideChannelCovariances(); - } - - for (int index = 0; index < recoveredICs.size(); index++) { - NucleotideSequence control = controls.get(icids.get(index).intValue()); - FourIntensity[] intensities = recoveredICs.get(index); - int seqlength = (control.size() < intensities.length) ? control.size() : intensities.length; - - for (int cycle = 0; cycle < seqlength; cycle++) { - Nucleotide nuc = control.get(cycle); - FourIntensity sig = intensities[cycle]; - NucleotideChannelMeans mus = cmeans[cycle]; - - ccov[cycle].add(nuc, sig, mus); - } - } - - System.out.println("Found " + recoveredICs.size() + " in " + numReads + " reads."); - - // Extend IC solutions to end of read length (if ICs are shorter than read) - System.err.println("Extending cycle " + (controls.get(0).size() - 1) + " parameters from cycle " + controls.get(0).size() + " to cycle " + recoveredICs.get(0).length); - for (int cycle = controls.get(0).size(); cycle < recoveredICs.get(0).length; cycle++) { - cmeans[cycle] = cmeans[controls.get(0).size() - 1]; - ccov[cycle] = ccov[controls.get(0).size() - 1]; - } - - // Now compute probabilities for the bases - Algebra alg = new Algebra(); - - for (int cycle = 0; cycle < recoveredICs.get(0).length; cycle++) { - ccov[cycle].invert(); - } - - FirecrestFileParser ffp2 = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP); - - SAMFileHeader sfh = new SAMFileHeader(); - SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT); - - numReads = 0; - while (ffp2.hasNext()) { - if (numReads % 1000000 == 0) { - System.err.println("Basecalled " + numReads + " reads."); - } - - FourIntensity[] intensities = ffp2.next().getIntensities(); - - byte[] asciiseq = new byte[intensities.length]; - byte[] bestqual = new byte[intensities.length]; - byte[] nextbestqual = new byte[intensities.length]; - - for (int cycle = 0; cycle < intensities.length; cycle++) { - FourIntensity fi = intensities[cycle]; - - double[] likes = new double[4]; - double total = 0.0; - - for (Nucleotide nuc : Nucleotide.values()) { - double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0); - - DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc)); - DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub); - - double exparg = -0.5*alg.mult(sub, Ax); - likes[nuc.ordinal()] = norm*Math.exp(exparg); - total += likes[nuc.ordinal()]; - } - - Nucleotide call1 = Nucleotide.A; - double prob1 = likes[0]/total; - for (int i = 1; i < 4; i++) { - if (likes[i]/total > prob1) { - prob1 = likes[i]/total; - - switch (i) { - case 1: call1 = Nucleotide.C; break; - case 2: call1 = Nucleotide.G; break; - case 3: call1 = Nucleotide.T; break; - } - } - } - - Nucleotide call2 = Nucleotide.A; - double prob2 = 0.0; - for (int i = 0; i < 4; i++) { - if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) { - prob2 = likes[i]/total; - - switch (i) { - case 0: call2 = Nucleotide.A; break; - case 1: call2 = Nucleotide.C; break; - case 2: call2 = Nucleotide.G; break; - case 3: call2 = Nucleotide.T; break; - } - } - } - - asciiseq[cycle] = (byte) call1.asChar(); - bestqual[cycle] = toPhredScore(prob1); - nextbestqual[cycle] = toCompressedQuality(call2, prob2); - } - - SAMRecord sr = new SAMRecord(sfh); - sr.setReadName(Integer.toString(numReads)); - sr.setReadUmappedFlag(true); - sr.setReadBases(asciiseq); - sr.setBaseQualities(bestqual); - sr.setAttribute("SQ", nextbestqual); - sfw.addAlignment(sr); - - numReads++; - } - - sfw.close(); - - System.out.println("Done."); - } - - private static byte toPhredScore(double prob) { - byte qual = (1.0 - prob < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob)); - return (qual > 40) ? 40 : qual; - } - - private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) { - DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4); - - for (int i = 0; i < 4; i++) { - sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i)); - } - - return sub; - } - - private static byte toCompressedQuality(Nucleotide base, double prob) { - byte compressedQual = (byte) base.ordinal(); - byte cprob = (byte) (100.0*prob); - byte qualmask = (byte) 252; - compressedQual += ((cprob << 2) & qualmask); - - return compressedQual; - } - - private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) { - NucleotideSequence ns = new NucleotideSequence(intensities.length); - - for (int cycle = 0; cycle < intensities.length; cycle++) { - int brightestChannel = intensities[cycle].brightestChannel(); - - Nucleotide nt = Nucleotide.A; - switch (brightestChannel) { - case 0: nt = Nucleotide.A; break; - case 1: nt = Nucleotide.C; break; - case 2: nt = Nucleotide.G; break; - case 3: nt = Nucleotide.T; break; - } - - ns.set(cycle, nt); - } - - return ns; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/ManyNucleotideSequences.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/ManyNucleotideSequences.java deleted file mode 100644 index 52238fbe2..000000000 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/ManyNucleotideSequences.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.broadinstitute.sting.playground.fourbasecaller; - -import java.io.File; -import java.util.Vector; - -import org.broadinstitute.sting.playground.fourbasecaller.NucleotideSequence; - -import edu.mit.broad.picard.reference.ReferenceSequence; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; - -class ManyNucleotideSequences extends java.util.Vector { - public ManyNucleotideSequences() {} - - public ManyNucleotideSequences(int initialCapacity) { - this.setSize(initialCapacity); - } - - public ManyNucleotideSequences(String seqpath) { - ReferenceSequenceFile seqfile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(seqpath)); - - ReferenceSequence rs; - while ((rs = seqfile.nextSequence()) != null) { - byte[] bases = rs.getBases(); - - NucleotideSequence ns = new NucleotideSequence(bases.length); - for (int locus = 0; locus < bases.length; locus++) { - Nucleotide nt; - switch(bases[locus]) { - case 'A': nt = Nucleotide.A; break; - case 'C': nt = Nucleotide.C; break; - case 'G': nt = Nucleotide.G; break; - case 'T': nt = Nucleotide.T; break; - default: nt = Nucleotide.T; break; - } - - ns.set(locus, nt); - } - - this.add(ns); - } - } - - public ManyNucleotideSequences reverseComplement() { - for (int contig = 0; contig < this.size(); contig++) { - this.get(contig).reverseComplement(); - } - - return this; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/Nucleotide.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/Nucleotide.java deleted file mode 100644 index 407a412f6..000000000 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/Nucleotide.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.broadinstitute.sting.playground.fourbasecaller; - -import java.lang.String; - -public enum Nucleotide { - A, C, G, T; - - Nucleotide complement() { - switch (this) { - case A: return T; - case C: return G; - case G: return C; - case T: return A; - } - - throw new IllegalArgumentException("Nucleotide must be A, C, G, or T."); - } - - public char asChar() { - switch(this) { - case A: return 'A'; - case C: return 'C'; - case G: return 'G'; - case T: return 'T'; - } - - throw new IllegalArgumentException("Nucleotide must be A, C, G, or T."); - } - - public String toString() { - char[] value = new char[1]; - value[0] = this.asChar(); - - return new String(value); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideChannelCovariances.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideChannelCovariances.java deleted file mode 100755 index 669188148..000000000 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideChannelCovariances.java +++ /dev/null @@ -1,64 +0,0 @@ -package org.broadinstitute.sting.playground.fourbasecaller; - -import cern.colt.matrix.DoubleFactory2D; -import cern.colt.matrix.DoubleMatrix2D; -import cern.colt.matrix.linalg.Algebra; -import org.broadinstitute.sting.playground.illumina.FourIntensity; - -public class NucleotideChannelCovariances { - private DoubleMatrix2D[] covs; - private int[] counts; - - public NucleotideChannelCovariances() { - counts = new int[4]; - - covs = new DoubleMatrix2D[4]; - for (int i = 0; i < 4; i++) { - covs[i] = (DoubleFactory2D.dense).make(4, 4); - } - } - - public void add(Nucleotide nuc, FourIntensity sig, NucleotideChannelMeans mus) { - FourIntensity f = new FourIntensity(sig); - f.subtract(mus.channelMeans(nuc)); - - DoubleMatrix2D cov = covs[nuc.ordinal()]; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - cov.set(i, j, cov.get(i, j) + f.getChannelIntensity(i)*f.getChannelIntensity(j)); - } - } - - counts[nuc.ordinal()]++; - } - - public DoubleMatrix2D channelCovariances(Nucleotide base) { - DoubleMatrix2D newcov = covs[base.ordinal()].copy(); - - /* - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - newcov.set(i, j, newcov.get(i, j)/((double) counts[base.ordinal()])); - } - } - */ - - return newcov; - } - - public void invert() { - Algebra alg = new Algebra(); - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - for (int k = 0; k < 4; k++) { - covs[i].set(j, k, covs[i].get(j, k)/((double) counts[i])); - } - } - - DoubleMatrix2D covinv = alg.inverse(covs[i]); - covs[i] = covinv; - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideChannelMeans.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideChannelMeans.java deleted file mode 100755 index f07b7e7dc..000000000 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideChannelMeans.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.broadinstitute.sting.playground.fourbasecaller; - -import org.broadinstitute.sting.playground.illumina.FourIntensity; - -public class NucleotideChannelMeans { - private FourIntensity[] sigs; - private int[] counts; - - public NucleotideChannelMeans() { - counts = new int[4]; - sigs = new FourIntensity[4]; - - for (int base = 0; base < 4; base++) { - sigs[base] = new FourIntensity(); - } - } - - public void add(Nucleotide base, FourIntensity intensity) { - sigs[base.ordinal()].add(intensity); - counts[base.ordinal()]++; - } - - public FourIntensity channelMeans(Nucleotide base) { - FourIntensity meansig = new FourIntensity(sigs[base.ordinal()]); - meansig.divide(counts[base.ordinal()]); - - return meansig; - } - - public float channelMean(Nucleotide base, int channel) { - FourIntensity meansig = channelMeans(base); - - return meansig.getChannelIntensity(channel); - } - - public String toString() { - return "a:{" + channelMean(Nucleotide.A, 0) + " " + channelMean(Nucleotide.A, 1) + " " + channelMean(Nucleotide.A, 2) + " " + channelMean(Nucleotide.A, 3) + "} " + - "c:{" + channelMean(Nucleotide.C, 0) + " " + channelMean(Nucleotide.C, 1) + " " + channelMean(Nucleotide.C, 2) + " " + channelMean(Nucleotide.C, 3) + "} " + - "g:{" + channelMean(Nucleotide.G, 0) + " " + channelMean(Nucleotide.G, 1) + " " + channelMean(Nucleotide.G, 2) + " " + channelMean(Nucleotide.G, 3) + "} " + - "t:{" + channelMean(Nucleotide.T, 0) + " " + channelMean(Nucleotide.T, 1) + " " + channelMean(Nucleotide.T, 2) + " " + channelMean(Nucleotide.T, 3) + "}"; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideSequence.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideSequence.java deleted file mode 100644 index 1b6d7d587..000000000 --- a/java/src/org/broadinstitute/sting/playground/fourbasecaller/NucleotideSequence.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.playground.fourbasecaller; - -import org.broadinstitute.sting.playground.fourbasecaller.Nucleotide; - -import java.lang.String; -import java.util.Vector; - -class NucleotideSequence extends java.util.Vector { - public NucleotideSequence(int initialCapacity) { - this.setSize(initialCapacity); - } - - public NucleotideSequence(NucleotideSequence ns) { - this.setSize(ns.size()); - - for (int locus = 0; locus < ns.size(); locus++) { - switch (ns.get(locus)) { - case A: this.set(locus, Nucleotide.A); break; - case C: this.set(locus, Nucleotide.C); break; - case G: this.set(locus, Nucleotide.G); break; - case T: this.set(locus, Nucleotide.T); break; - } - } - } - - public NucleotideSequence reverseComplement() { - if (this.size() % 2 != 0) { - this.get(this.size()/2 + 1).complement(); - } - - for (int locus = 0; locus < this.size()/2; locus++) { - Nucleotide temp = this.get(locus).complement(); - this.set(locus, this.get(this.size() - locus - 1).complement()); - this.set(this.size() - locus - 1, temp); - } - - return this; - } - - public int editDistance(NucleotideSequence ns) { return editDistance(ns, this.size() < ns.size() ? this.size() : ns.size()); } - - public int editDistance(NucleotideSequence ns, int searchLength) { - int editDistance = 0; - for (int locus = 0; locus < searchLength; locus++) { - editDistance += (this.get(locus) != ns.get(locus)) ? 1 : 0; - } - - return editDistance; - } - - public String toString() { - char[] charbuffer = new char[this.size()]; - for (int locus = 0; locus < this.size(); locus++) { - charbuffer[locus] = this.get(locus).asChar(); - } - - return new String(charbuffer); - } -}