2009-03-25 04:11:39 +08:00
|
|
|
package org.broadinstitute.sting.playground.fourbasecaller;
|
|
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
import edu.mit.broad.picard.illumina.BustardFileParser;
|
|
|
|
|
import edu.mit.broad.picard.illumina.BustardReadData;
|
|
|
|
|
import edu.mit.broad.picard.illumina.AbstractBustardFileParser;
|
|
|
|
|
import edu.mit.broad.picard.illumina.BustardFileParser_1_1;
|
2009-03-25 04:11:39 +08:00
|
|
|
import net.sf.samtools.SAMFileHeader;
|
|
|
|
|
import net.sf.samtools.SAMFileWriter;
|
|
|
|
|
import net.sf.samtools.SAMFileWriterFactory;
|
|
|
|
|
import net.sf.samtools.SAMRecord;
|
2009-04-13 03:44:07 +08:00
|
|
|
import org.broadinstitute.sting.playground.illumina.FirecrestFileParser;
|
|
|
|
|
import org.broadinstitute.sting.playground.illumina.FirecrestReadData;
|
|
|
|
|
import org.broadinstitute.sting.utils.QualityUtils;
|
|
|
|
|
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.io.PrintWriter;
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
public class FourBaseRecaller extends CommandLineProgram {
|
|
|
|
|
public static FourBaseRecaller Instance = null;
|
|
|
|
|
|
|
|
|
|
public File DIR;
|
|
|
|
|
public int LANE;
|
2009-04-13 03:44:07 +08:00
|
|
|
public String RUN_BARCODE;
|
2009-04-03 06:10:13 +08:00
|
|
|
public File OUT;
|
|
|
|
|
public int END = 0;
|
|
|
|
|
public int TRAINING_LIMIT = 1000000000;
|
|
|
|
|
public int CALLING_LIMIT = 1000000000;
|
2009-04-10 04:50:11 +08:00
|
|
|
public Boolean RAW = false;
|
2009-04-13 03:44:07 +08:00
|
|
|
public Boolean OLD = false;
|
2009-04-03 06:10:13 +08:00
|
|
|
|
2009-03-25 04:11:39 +08:00
|
|
|
public static void main(String[] argv) {
|
2009-04-03 06:10:13 +08:00
|
|
|
Instance = new FourBaseRecaller();
|
|
|
|
|
start(Instance, argv);
|
|
|
|
|
}
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
protected void setupArgs() {
|
2009-04-13 03:44:07 +08:00
|
|
|
m_parser.addRequiredArg("dir", "D", "Illumina Bustard directory", "DIR");
|
|
|
|
|
m_parser.addRequiredArg("lane", "L", "Illumina flowcell lane", "LANE");
|
|
|
|
|
m_parser.addOptionalArg("end", "E", "End of read to process (0 = whole read, i.e. unpaired; 1 = first end; 2 = second end)", "END");
|
|
|
|
|
m_parser.addRequiredArg("run_barcode", "B", "Illumina Run Barcode (e.g. 305PJAAXX080716)", "RUN_BARCODE");
|
|
|
|
|
m_parser.addRequiredArg("out", "O", "Output path for sam file", "OUT");
|
|
|
|
|
m_parser.addOptionalArg("tlim", "T", "Number of reads to use for parameter initialization", "TRAINING_LIMIT");
|
|
|
|
|
m_parser.addOptionalArg("clim", "C", "Number of reads to basecall", "CALLING_LIMIT");
|
|
|
|
|
m_parser.addOptionalFlag("raw", "R", "Use raw intensities?", "RAW");
|
|
|
|
|
m_parser.addOptionalFlag("old", "1", "Old Bustard 1.1 mode?", "OLD");
|
2009-04-03 06:10:13 +08:00
|
|
|
}
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
protected int execute() {
|
|
|
|
|
boolean isPaired = (END > 0);
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-07 10:18:13 +08:00
|
|
|
// Set up debugging paths
|
|
|
|
|
File debugdir = new File(OUT.getPath() + ".debug/");
|
|
|
|
|
debugdir.mkdir();
|
|
|
|
|
PrintWriter debugout = null;
|
|
|
|
|
try {
|
|
|
|
|
debugout = new PrintWriter(debugdir.getPath() + "/debug.out");
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
AbstractBustardFileParser bfp;
|
2009-04-03 23:48:51 +08:00
|
|
|
BustardReadData bread;
|
|
|
|
|
|
2009-04-10 04:50:11 +08:00
|
|
|
FirecrestFileParser ffp;
|
|
|
|
|
FirecrestReadData fread;
|
|
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
bfp = OLD ? new BustardFileParser_1_1(DIR, LANE, isPaired, RUN_BARCODE) : new BustardFileParser(DIR, LANE, isPaired, RUN_BARCODE);
|
2009-04-03 23:48:51 +08:00
|
|
|
bread = bfp.next();
|
2009-04-10 04:50:11 +08:00
|
|
|
ffp = new FirecrestFileParser(DIR.getParentFile(), LANE);
|
|
|
|
|
fread = ffp.next();
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
int cycle_offset = (END <= 1) ? 0 : bread.getIntensities().length/2;
|
2009-04-07 06:00:58 +08:00
|
|
|
BasecallingReadModel model = new BasecallingReadModel(bread.getFirstReadSequence().length());
|
2009-04-03 06:10:13 +08:00
|
|
|
int queryid;
|
|
|
|
|
|
2009-04-07 09:20:15 +08:00
|
|
|
// learn mean parameters
|
2009-04-13 03:44:07 +08:00
|
|
|
System.err.println("Computing mean parameters...");
|
2009-04-07 09:20:15 +08:00
|
|
|
|
|
|
|
|
queryid = 0;
|
|
|
|
|
do {
|
2009-04-13 03:44:07 +08:00
|
|
|
assert(bread.getXCoordinate() == fread.getXCoordinate() && bread.getYCoordinate() == fread.getYCoordinate());
|
|
|
|
|
|
2009-04-07 09:20:15 +08:00
|
|
|
String bases = (END <= 1) ? bread.getFirstReadSequence() : bread.getSecondReadSequence();
|
|
|
|
|
byte[] quals = (END <= 1) ? bread.getFirstReadPhredBinaryQualities() : bread.getSecondReadPhredBinaryQualities();
|
|
|
|
|
double[][] intensities = bread.getIntensities();
|
2009-04-10 04:50:11 +08:00
|
|
|
double[][] rawintensities = fread.getIntensities();
|
2009-04-07 09:20:15 +08:00
|
|
|
|
|
|
|
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
|
|
|
|
char baseCur = bases.charAt(cycle);
|
|
|
|
|
byte qualCur = quals[cycle];
|
2009-04-13 03:44:07 +08:00
|
|
|
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
2009-04-07 09:20:15 +08:00
|
|
|
|
|
|
|
|
model.addMeanPoint(cycle, baseCur, qualCur, fourintensity);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
queryid++;
|
2009-04-10 04:50:11 +08:00
|
|
|
} while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null && (fread = ffp.next()) != null);
|
|
|
|
|
|
2009-04-07 09:20:15 +08:00
|
|
|
// learn covariance parameters
|
2009-04-13 03:44:07 +08:00
|
|
|
System.err.println("Computing covariance parameters...");
|
|
|
|
|
|
|
|
|
|
bfp = OLD ? new BustardFileParser_1_1(DIR, LANE, isPaired, RUN_BARCODE) : new BustardFileParser(DIR, LANE, isPaired, RUN_BARCODE);
|
2009-04-07 09:20:15 +08:00
|
|
|
bread = bfp.next();
|
2009-04-10 04:50:11 +08:00
|
|
|
ffp = new FirecrestFileParser(DIR.getParentFile(), LANE);
|
|
|
|
|
fread = ffp.next();
|
2009-04-07 09:20:15 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
queryid = 0;
|
|
|
|
|
do {
|
2009-04-13 03:44:07 +08:00
|
|
|
assert(bread.getXCoordinate() == fread.getXCoordinate() && bread.getYCoordinate() == fread.getYCoordinate());
|
|
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
String bases = (END <= 1) ? bread.getFirstReadSequence() : bread.getSecondReadSequence();
|
|
|
|
|
byte[] quals = (END <= 1) ? bread.getFirstReadPhredBinaryQualities() : bread.getSecondReadPhredBinaryQualities();
|
|
|
|
|
double[][] intensities = bread.getIntensities();
|
2009-04-10 04:50:11 +08:00
|
|
|
double[][] rawintensities = fread.getIntensities();
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
|
|
|
|
char baseCur = bases.charAt(cycle);
|
|
|
|
|
byte qualCur = quals[cycle];
|
2009-04-13 03:44:07 +08:00
|
|
|
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-07 09:20:15 +08:00
|
|
|
model.addCovariancePoint(cycle, baseCur, qualCur, fourintensity);
|
2009-04-03 06:10:13 +08:00
|
|
|
}
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
queryid++;
|
2009-04-10 04:50:11 +08:00
|
|
|
} while (queryid < TRAINING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null && (fread = ffp.next()) != null);
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
// write model parameters
|
2009-04-07 10:18:13 +08:00
|
|
|
model.write(debugdir);
|
2009-04-07 06:00:58 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
// call bases
|
2009-04-13 03:44:07 +08:00
|
|
|
System.err.println("Calling bases...");
|
|
|
|
|
|
2009-03-25 04:11:39 +08:00
|
|
|
SAMFileHeader sfh = new SAMFileHeader();
|
2009-04-03 06:10:13 +08:00
|
|
|
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, OUT);
|
2009-04-03 23:48:51 +08:00
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
bfp = OLD ? new BustardFileParser_1_1(DIR, LANE, isPaired, RUN_BARCODE) : new BustardFileParser(DIR, LANE, isPaired, RUN_BARCODE);
|
2009-04-03 23:48:51 +08:00
|
|
|
bread = bfp.next();
|
2009-04-10 04:50:11 +08:00
|
|
|
ffp = new FirecrestFileParser(DIR.getParentFile(), LANE);
|
|
|
|
|
fread = ffp.next();
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
if (debugout != null) { debugout.println("cycle int_a int_c int_g int_t bustard_base kiran_base bustard_prob kiran_prob"); }
|
|
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
queryid = 0;
|
|
|
|
|
do {
|
2009-04-13 03:44:07 +08:00
|
|
|
assert(bread.getXCoordinate() == fread.getXCoordinate() && bread.getYCoordinate() == fread.getYCoordinate());
|
|
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
String bases = (END <= 1) ? bread.getFirstReadSequence() : bread.getSecondReadSequence();
|
|
|
|
|
byte[] quals = (END <= 1) ? bread.getFirstReadPhredBinaryQualities() : bread.getSecondReadPhredBinaryQualities();
|
|
|
|
|
double[][] intensities = bread.getIntensities();
|
2009-04-10 04:50:11 +08:00
|
|
|
double[][] rawintensities = fread.getIntensities();
|
2009-04-03 06:10:13 +08:00
|
|
|
|
|
|
|
|
byte[] asciiseq = new byte[bases.length()];
|
|
|
|
|
byte[] bestqual = new byte[bases.length()];
|
|
|
|
|
byte[] nextbestqual = new byte[bases.length()];
|
|
|
|
|
|
|
|
|
|
for (int cycle = 0; cycle < bases.length(); cycle++) {
|
2009-04-13 03:44:07 +08:00
|
|
|
double[] fourintensity = (RAW || OLD) ? rawintensities[cycle + cycle_offset] : intensities[cycle + cycle_offset];
|
2009-04-03 06:10:13 +08:00
|
|
|
|
2009-04-07 10:18:13 +08:00
|
|
|
FourProb fp = model.computeProbabilities(cycle, fourintensity);
|
|
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
asciiseq[cycle] = (byte) fp.baseAtRank(0);
|
|
|
|
|
bestqual[cycle] = fp.qualAtRank(0);
|
2009-04-03 23:48:51 +08:00
|
|
|
nextbestqual[cycle] = QualityUtils.baseAndProbToCompressedQuality(fp.indexAtRank(1), fp.probAtRank(1));
|
2009-04-13 03:44:07 +08:00
|
|
|
|
|
|
|
|
if (debugout != null && queryid < 5000 && bases.charAt(cycle) != '.') {
|
|
|
|
|
debugout.println(cycle + " " + fourintensity[0] + " " + fourintensity[1] + " " + fourintensity[2] + " " + fourintensity[3] + " " + (bases.charAt(cycle)) + " " + ((char) asciiseq[cycle]) + " " + bestqual[cycle] + " " + quals[cycle]);
|
|
|
|
|
//System.out.println(cycle + " " + fourintensity[0] + " " + fourintensity[1] + " " + fourintensity[2] + " " + fourintensity[3] + " " + (bases.charAt(cycle)) + " " + ((char) asciiseq[cycle]) + " " + bestqual[cycle] + " " + quals[cycle]);
|
|
|
|
|
}
|
2009-03-25 04:11:39 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
sfw.addAlignment(constructSAMRecord("KIR_", new String(asciiseq), bestqual, nextbestqual, bread, sfh));
|
|
|
|
|
sfw.addAlignment(constructSAMRecord("BUS_", bases, quals, null, bread, sfh));
|
2009-04-05 12:48:31 +08:00
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
queryid++;
|
2009-04-13 03:44:07 +08:00
|
|
|
} while (queryid < CALLING_LIMIT && bfp.hasNext() && (bread = bfp.next()) != null && (fread = ffp.next()) != null);
|
2009-03-25 04:11:39 +08:00
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
if (debugout != null) {
|
|
|
|
|
debugout.close();
|
|
|
|
|
}
|
2009-04-07 06:00:58 +08:00
|
|
|
sfw.close();
|
|
|
|
|
|
2009-04-03 06:10:13 +08:00
|
|
|
return 0;
|
2009-03-25 04:11:39 +08:00
|
|
|
}
|
2009-04-05 12:48:31 +08:00
|
|
|
|
2009-04-13 03:44:07 +08:00
|
|
|
private SAMRecord constructSAMRecord(String readNamePrefix, String bases, byte[] bestqual, byte[] nextbestqual, BustardReadData bread, SAMFileHeader sfh) {
|
2009-04-05 12:48:31 +08:00
|
|
|
SAMRecord sr = new SAMRecord(sfh);
|
|
|
|
|
|
|
|
|
|
sr.setReadName(readNamePrefix + bread.getReadName());
|
|
|
|
|
sr.setReadUmappedFlag(true);
|
|
|
|
|
sr.setReadString(bases);
|
|
|
|
|
sr.setBaseQualities(bestqual);
|
|
|
|
|
if (nextbestqual != null) { sr.setAttribute("SQ", nextbestqual); }
|
|
|
|
|
sr.setReadFailsVendorQualityCheckFlag(!bread.isPf());
|
|
|
|
|
|
|
|
|
|
return sr;
|
|
|
|
|
}
|
2009-03-25 04:11:39 +08:00
|
|
|
}
|