Should now be appropriately using Bustard data to call bases (there are some mathematical subtleties that arise when no longer using ICs as initialization data. Also writes some more relevant fields in the SAM records. WAAAAAY simpler than old version. Like, super way.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@275 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
59334b0270
commit
dffc879240
|
|
@ -1,231 +1,120 @@
|
||||||
package org.broadinstitute.sting.playground.fourbasecaller;
|
package org.broadinstitute.sting.playground.fourbasecaller;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FilenameFilter;
|
|
||||||
import java.io.FileFilter;
|
|
||||||
import java.util.Vector;
|
|
||||||
import java.lang.Math;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.playground.illumina.FirecrestFileParser;
|
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||||
import org.broadinstitute.sting.playground.illumina.FourIntensity;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import cern.colt.matrix.linalg.Algebra;
|
|
||||||
import cern.colt.matrix.DoubleMatrix1D;
|
|
||||||
import cern.colt.matrix.DoubleFactory1D;
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMFileWriter;
|
import net.sf.samtools.SAMFileWriter;
|
||||||
import net.sf.samtools.SAMFileWriterFactory;
|
import net.sf.samtools.SAMFileWriterFactory;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import edu.mit.broad.picard.illumina.BustardFileParser;
|
import edu.mit.broad.picard.illumina.BustardFileParser;
|
||||||
import edu.mit.broad.picard.illumina.BustardReadData;
|
import edu.mit.broad.picard.illumina.BustardReadData;
|
||||||
import edu.mit.broad.picard.illumina.BustardFileParser_1_1;
|
|
||||||
|
|
||||||
public class FourBaseRecaller {
|
public class FourBaseRecaller extends CommandLineProgram {
|
||||||
|
public static FourBaseRecaller Instance = null;
|
||||||
|
|
||||||
|
public File DIR;
|
||||||
|
public int LANE;
|
||||||
|
public File OUT;
|
||||||
|
public int END = 0;
|
||||||
|
public int TRAINING_LIMIT = 1000000000;
|
||||||
|
public int CALLING_LIMIT = 1000000000;
|
||||||
|
|
||||||
public static void main(String[] argv) {
|
public static void main(String[] argv) {
|
||||||
// Parse args
|
Instance = new FourBaseRecaller();
|
||||||
File FIRECREST_DIR = new File(argv[0]);
|
start(Instance, argv);
|
||||||
int LANE = Integer.valueOf(argv[1]);
|
|
||||||
File SAM_OUT = new File(argv[2]);
|
|
||||||
int CYCLE_START = Integer.valueOf(argv[3]);
|
|
||||||
int CYCLE_STOP = Integer.valueOf(argv[4]);
|
|
||||||
boolean isPaired = Boolean.valueOf(argv[5]);
|
|
||||||
|
|
||||||
int readLength = (CYCLE_STOP - CYCLE_START);
|
|
||||||
File BUSTARD_DIR = getBustardDirectory(FIRECREST_DIR);
|
|
||||||
int limit = 1000000;
|
|
||||||
|
|
||||||
NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[readLength];
|
|
||||||
NucleotideChannelCovariances[] ccov = new NucleotideChannelCovariances[readLength];
|
|
||||||
for (int i = 0; i < readLength; i++) {
|
|
||||||
cmeans[i] = new NucleotideChannelMeans();
|
|
||||||
ccov[i] = new NucleotideChannelCovariances();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loop through bustard data and compute signal means
|
protected void setupArgs() {
|
||||||
FirecrestFileParser ffp1 = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP);
|
m_parser.addRequiredArg("dir", "D", "Illumina Bustard directory", "DIR");
|
||||||
BustardFileParser_1_1 bfp1 = new BustardFileParser_1_1(BUSTARD_DIR, LANE, isPaired, "BS");
|
m_parser.addRequiredArg("lane", "L", "Illumina flowcell lane", "LANE");
|
||||||
|
m_parser.addRequiredArg("out", "O", "Output path for sam file", "OUT");
|
||||||
for (int queryid = 0; queryid < limit && ffp1.hasNext(); queryid++) {
|
m_parser.addOptionalArg("end", "E", "End of read to process (0 = whole read, i.e. unpaired; 1 = first end; 2 = second end)", "END");
|
||||||
if (queryid % (limit/10) == 0) {
|
m_parser.addOptionalArg("tlim", "T", "Number of reads to use for parameter initialization", "TRAINING_LIMIT");
|
||||||
System.err.println("Processed " + queryid + " reads for means.");
|
m_parser.addOptionalArg("clim", "C", "Number of reads to basecall", "CALLING_LIMIT");
|
||||||
}
|
|
||||||
FourIntensity[] intensities = ffp1.next().getIntensities();
|
|
||||||
String rsq = (CYCLE_START == 0) ? bfp1.next().getFirstReadSequence() : bfp1.next().getSecondReadSequence();
|
|
||||||
|
|
||||||
for (int cycle = 0; cycle < readLength; cycle++) {
|
|
||||||
FourIntensity sig = intensities[cycle];
|
|
||||||
|
|
||||||
if (rsq.charAt(cycle) == 'A') { cmeans[cycle].add(Nucleotide.A, sig); }
|
|
||||||
else if (rsq.charAt(cycle) == 'C') { cmeans[cycle].add(Nucleotide.C, sig); }
|
|
||||||
else if (rsq.charAt(cycle) == 'G') { cmeans[cycle].add(Nucleotide.G, sig); }
|
|
||||||
else if (rsq.charAt(cycle) == 'T') { cmeans[cycle].add(Nucleotide.T, sig); }
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Go through the data again and compute signal covariances
|
protected int execute() {
|
||||||
FirecrestFileParser ffp2 = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP);
|
boolean isPaired = (END > 0);
|
||||||
BustardFileParser_1_1 bfp2 = new BustardFileParser_1_1(BUSTARD_DIR, LANE, isPaired, "BS");
|
|
||||||
|
|
||||||
for (int queryid = 0; queryid < limit && ffp2.hasNext(); queryid++) {
|
BustardFileParser bfp = new BustardFileParser(DIR, LANE, isPaired, "BS");
|
||||||
if (queryid % (limit/10) == 0) {
|
BustardReadData bread = bfp.next();
|
||||||
System.err.println("Processed " + queryid + " reads for covariances.");
|
|
||||||
|
int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
|
||||||
|
BasecallingReadModel p = new BasecallingReadModel(bread.getFirstReadSequence().length());
|
||||||
|
int queryid;
|
||||||
|
|
||||||
|
// learn parameters
|
||||||
|
queryid = 0;
|
||||||
|
do {
|
||||||
|
String bases = (END <= 1) ? bread.getFirstReadSequence() : bread.getSecondReadSequence();
|
||||||
|
byte[] quals = (END <= 1) ? bread.getFirstReadPhredBinaryQualities() : bread.getSecondReadPhredBinaryQualities();
|
||||||
|
double[][] intensities = bread.getIntensities();
|
||||||
|
|
||||||
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
||||||
|
char basePrev = (cycle == 0) ? '*' : bases.charAt(cycle - 1);
|
||||||
|
char baseCur = bases.charAt(cycle);
|
||||||
|
byte qualCur = quals[cycle];
|
||||||
|
double[] fourintensity = intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
|
p.addTrainingPoint(cycle, basePrev, baseCur, qualCur, fourintensity);
|
||||||
}
|
}
|
||||||
|
|
||||||
FourIntensity[] intensities = ffp2.next().getIntensities();
|
queryid++;
|
||||||
String rsq = (CYCLE_START == 0) ? bfp2.next().getFirstReadSequence() : bfp2.next().getSecondReadSequence();
|
} while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);
|
||||||
|
|
||||||
for (int cycle = 0; cycle < readLength; cycle++) {
|
|
||||||
FourIntensity sig = intensities[cycle];
|
|
||||||
NucleotideChannelMeans mus = cmeans[cycle];
|
|
||||||
|
|
||||||
if (rsq.charAt(cycle) == 'A') { ccov[cycle].add(Nucleotide.A, sig, mus); }
|
|
||||||
else if (rsq.charAt(cycle) == 'C') { ccov[cycle].add(Nucleotide.C, sig, mus); }
|
|
||||||
else if (rsq.charAt(cycle) == 'G') { ccov[cycle].add(Nucleotide.G, sig, mus); }
|
|
||||||
else if (rsq.charAt(cycle) == 'T') { ccov[cycle].add(Nucleotide.T, sig, mus); }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now compute probabilities for the bases
|
|
||||||
Algebra alg = new Algebra();
|
|
||||||
|
|
||||||
for (int cycle = 0; cycle < readLength; cycle++) {
|
|
||||||
ccov[cycle].invert();
|
|
||||||
}
|
|
||||||
|
|
||||||
FirecrestFileParser ffp3 = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP);
|
|
||||||
|
|
||||||
|
// call bases
|
||||||
SAMFileHeader sfh = new SAMFileHeader();
|
SAMFileHeader sfh = new SAMFileHeader();
|
||||||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
|
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, OUT);
|
||||||
|
|
||||||
for (int queryid = 0; ffp3.hasNext(); queryid++) {
|
queryid = 0;
|
||||||
if (queryid % limit == 0) {
|
do {
|
||||||
System.err.println("Basecalled " + queryid + " reads.");
|
String bases = (END <= 1) ? bread.getFirstReadSequence() : bread.getSecondReadSequence();
|
||||||
}
|
byte[] quals = (END <= 1) ? bread.getFirstReadPhredBinaryQualities() : bread.getSecondReadPhredBinaryQualities();
|
||||||
|
double[][] intensities = bread.getIntensities();
|
||||||
|
|
||||||
FourIntensity[] intensities = ffp3.next().getIntensities();
|
byte[] asciiseq = new byte[bases.length()];
|
||||||
|
byte[] bestqual = new byte[bases.length()];
|
||||||
|
byte[] nextbestqual = new byte[bases.length()];
|
||||||
|
|
||||||
byte[] asciiseq = new byte[readLength];
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
||||||
byte[] bestqual = new byte[readLength];
|
char basePrev = (cycle == 0) ? '*' : bases.charAt(cycle - 1);
|
||||||
byte[] nextbestqual = new byte[readLength];
|
byte qualPrev = (cycle == 0) ? 0 : quals[cycle - 1];
|
||||||
|
double[] fourintensity = intensities[cycle + cycle_offset];
|
||||||
|
|
||||||
for (int cycle = 0; cycle < readLength; cycle++) {
|
FourProb fp = p.computeProbabilities(cycle, basePrev, qualPrev, fourintensity);
|
||||||
FourIntensity fi = intensities[cycle];
|
|
||||||
|
|
||||||
double[] likes = new double[4];
|
asciiseq[cycle] = (byte) fp.baseAtRank(0);
|
||||||
double total = 0.0;
|
bestqual[cycle] = fp.qualAtRank(0);
|
||||||
|
nextbestqual[cycle] = QualityUtils.qualAndProbToCompressedQuality(fp.indexAtRank(1), fp.probAtRank(1));
|
||||||
for (Nucleotide nuc : Nucleotide.values()) {
|
|
||||||
double norm = Math.sqrt(alg.det(ccov[cycle].channelCovariances(nuc)))/Math.pow(2.0*Math.PI, 2.0);
|
|
||||||
|
|
||||||
DoubleMatrix1D sub = subtract(fi, cmeans[cycle].channelMeans(nuc));
|
|
||||||
DoubleMatrix1D Ax = alg.mult(ccov[cycle].channelCovariances(nuc), sub);
|
|
||||||
|
|
||||||
double exparg = -0.5*alg.mult(sub, Ax);
|
|
||||||
likes[nuc.ordinal()] = norm*Math.exp(exparg);
|
|
||||||
total += likes[nuc.ordinal()];
|
|
||||||
}
|
|
||||||
|
|
||||||
Nucleotide call1 = Nucleotide.A;
|
|
||||||
double prob1 = likes[0]/total;
|
|
||||||
for (int i = 1; i < 4; i++) {
|
|
||||||
if (likes[i]/total > prob1) {
|
|
||||||
prob1 = likes[i]/total;
|
|
||||||
|
|
||||||
switch (i) {
|
|
||||||
case 1: call1 = Nucleotide.C; break;
|
|
||||||
case 2: call1 = Nucleotide.G; break;
|
|
||||||
case 3: call1 = Nucleotide.T; break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Nucleotide call2 = Nucleotide.A;
|
|
||||||
double prob2 = 0.0;
|
|
||||||
for (int i = 0; i < 4; i++) {
|
|
||||||
if (i != call1.ordinal() && likes[i]/total > prob2 && likes[i]/total < prob1) {
|
|
||||||
prob2 = likes[i]/total;
|
|
||||||
|
|
||||||
switch (i) {
|
|
||||||
case 0: call2 = Nucleotide.A; break;
|
|
||||||
case 1: call2 = Nucleotide.C; break;
|
|
||||||
case 2: call2 = Nucleotide.G; break;
|
|
||||||
case 3: call2 = Nucleotide.T; break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
asciiseq[cycle] = (byte) call1.asChar();
|
|
||||||
bestqual[cycle] = toPhredScore(prob1);
|
|
||||||
nextbestqual[cycle] = toCompressedQuality(call2, prob2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SAMRecord sr = new SAMRecord(sfh);
|
SAMRecord sr = new SAMRecord(sfh);
|
||||||
sr.setReadName(Integer.toString(queryid));
|
sr.setReadName(bread.getReadName());
|
||||||
sr.setReadUmappedFlag(true);
|
sr.setReadUmappedFlag(true);
|
||||||
sr.setReadBases(asciiseq);
|
sr.setReadBases(asciiseq);
|
||||||
sr.setBaseQualities(bestqual);
|
sr.setBaseQualities(bestqual);
|
||||||
sr.setAttribute("SQ", nextbestqual);
|
sr.setAttribute("SQ", nextbestqual);
|
||||||
|
sr.setReadFailsVendorQualityCheckFlag(!bread.isPf());
|
||||||
|
sr.setReadPairedFlag(isPaired);
|
||||||
|
if (isPaired) {
|
||||||
|
sr.setMateUnmappedFlag(true);
|
||||||
|
sr.setFirstOfPairFlag(END <= 1);
|
||||||
|
sr.setSecondOfPairFlag(END > 1);
|
||||||
|
}
|
||||||
sfw.addAlignment(sr);
|
sfw.addAlignment(sr);
|
||||||
}
|
|
||||||
|
|
||||||
sfw.close();
|
/*
|
||||||
|
System.out.println(sr.format());
|
||||||
|
System.out.println(sr.getReadString());
|
||||||
|
System.out.println(bases);
|
||||||
|
System.out.println("\n");
|
||||||
|
*/
|
||||||
|
|
||||||
System.err.println("Done.");
|
queryid++;
|
||||||
}
|
} while (queryid < CALLING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null);
|
||||||
|
|
||||||
private static byte toPhredScore(double prob) {
|
return 0;
|
||||||
byte qual = (1.0 - prob < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob));
|
|
||||||
//System.out.println("prob=" + prob + " qual=" + qual);
|
|
||||||
return (qual > 40) ? 40 : qual;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static DoubleMatrix1D subtract(FourIntensity a, FourIntensity b) {
|
|
||||||
DoubleMatrix1D sub = (DoubleFactory1D.dense).make(4);
|
|
||||||
|
|
||||||
for (int i = 0; i < 4; i++) {
|
|
||||||
sub.set(i, a.getChannelIntensity(i) - b.getChannelIntensity(i));
|
|
||||||
}
|
|
||||||
|
|
||||||
return sub;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static byte toCompressedQuality(Nucleotide base, double prob) {
|
|
||||||
byte compressedQual = (byte) base.ordinal();
|
|
||||||
byte cprob = (byte) (100.0*prob);
|
|
||||||
byte qualmask = (byte) 252;
|
|
||||||
compressedQual += ((cprob << 2) & qualmask);
|
|
||||||
|
|
||||||
return compressedQual;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static NucleotideSequence toNucleotideSequence(FourIntensity[] intensities) {
|
|
||||||
NucleotideSequence ns = new NucleotideSequence(intensities.length);
|
|
||||||
|
|
||||||
for (int cycle = 0; cycle < intensities.length; cycle++) {
|
|
||||||
int brightestChannel = intensities[cycle].brightestChannel();
|
|
||||||
|
|
||||||
Nucleotide nt = Nucleotide.A;
|
|
||||||
switch (brightestChannel) {
|
|
||||||
case 0: nt = Nucleotide.A; break;
|
|
||||||
case 1: nt = Nucleotide.C; break;
|
|
||||||
case 2: nt = Nucleotide.G; break;
|
|
||||||
case 3: nt = Nucleotide.T; break;
|
|
||||||
}
|
|
||||||
|
|
||||||
ns.set(cycle, nt);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ns;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static File getBustardDirectory(File firecrestDir) {
|
|
||||||
FileFilter filter = new FileFilter() {
|
|
||||||
public boolean accept(File file) {
|
|
||||||
return (file.isDirectory() && file.getName().contains("Bustard"));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
File[] bustardDirs = firecrestDir.listFiles(filter);
|
|
||||||
|
|
||||||
return bustardDirs[0];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue