2009-05-13 04:24:18 +08:00
|
|
|
package org.broadinstitute.sting.secondarybase;
|
2009-05-13 03:46:34 +08:00
|
|
|
|
2009-05-19 01:42:08 +08:00
|
|
|
import net.sf.samtools.*;
|
2009-05-15 02:58:43 +08:00
|
|
|
import org.broadinstitute.sting.utils.BaseUtils;
|
2009-05-19 01:42:08 +08:00
|
|
|
import org.broadinstitute.sting.utils.QualityUtils;
|
2009-05-15 02:58:43 +08:00
|
|
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
|
|
|
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
2009-05-19 01:42:08 +08:00
|
|
|
import java.util.HashMap;
|
2009-05-13 03:46:34 +08:00
|
|
|
|
2009-05-16 05:22:24 +08:00
|
|
|
/**
|
|
|
|
|
* AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
|
|
|
|
|
* First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
|
|
|
|
|
* generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities
|
|
|
|
|
* is evaluated against this model to determine the base probability distribution of a given
|
|
|
|
|
* base observation.
|
|
|
|
|
*
|
|
|
|
|
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
|
|
|
|
|
* agree on the identity of the best base. In these cases, we simply annotate the
|
|
|
|
|
* second-best base. In cases where this method and Bustard disagree, we annotate the
|
|
|
|
|
* secondary base as this method's primary base.
|
|
|
|
|
*
|
|
|
|
|
* @author Kiran Garimella
|
|
|
|
|
*/
|
2009-05-13 03:46:34 +08:00
|
|
|
public class AnnotateSecondaryBase extends CommandLineProgram {
|
|
|
|
|
public static AnnotateSecondaryBase Instance = null;
|
|
|
|
|
|
2009-05-15 00:58:22 +08:00
|
|
|
@Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
|
2009-05-13 03:46:34 +08:00
|
|
|
@Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
|
2009-05-15 00:58:22 +08:00
|
|
|
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
|
2009-05-13 03:46:34 +08:00
|
|
|
@Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
|
|
|
|
|
@Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
|
|
|
|
|
@Argument(fullName="cycle_begin", shortName="CB", doc="On what cycle does the read begin? (0-based inclusive)") public int CYCLE_BEGIN;
|
|
|
|
|
@Argument(fullName="cycle_end", shortName="CE", doc="On what cycle does the read end? (0-based inclusive)") public int CYCLE_END;
|
2009-05-15 00:58:22 +08:00
|
|
|
@Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 250000;
|
2009-05-13 03:46:34 +08:00
|
|
|
@Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
|
2009-05-15 00:58:22 +08:00
|
|
|
@Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name") public String RUN_BARCODE;
|
2009-05-13 03:46:34 +08:00
|
|
|
|
|
|
|
|
public static void main(String[] argv) {
|
|
|
|
|
Instance = new AnnotateSecondaryBase();
|
|
|
|
|
start(Instance, argv);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected int execute() {
|
2009-05-15 00:58:22 +08:00
|
|
|
BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT);
|
|
|
|
|
|
2009-05-16 05:22:24 +08:00
|
|
|
// Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
|
2009-05-17 12:09:23 +08:00
|
|
|
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
|
2009-05-16 05:22:24 +08:00
|
|
|
trainingSet.loadFirstNUnambiguousReadsTrainingSet();
|
2009-05-13 03:46:34 +08:00
|
|
|
|
2009-05-15 00:58:22 +08:00
|
|
|
// Iterate through the stored training data and add the info to the BasecallingReadModel
|
2009-05-15 02:01:41 +08:00
|
|
|
System.out.println("Applying training set...");
|
2009-05-17 12:09:23 +08:00
|
|
|
BasecallingReadModel model = new BasecallingReadModel(CYCLE_END - CYCLE_BEGIN + 1, true);
|
2009-05-15 00:58:22 +08:00
|
|
|
model.train(trainingSet);
|
|
|
|
|
|
|
|
|
|
// Call bases and write results
|
|
|
|
|
SAMFileHeader sfh = new SAMFileHeader();
|
|
|
|
|
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
|
|
|
|
|
|
2009-05-17 11:37:57 +08:00
|
|
|
BasecallingStats bstats = new BasecallingStats();
|
2009-05-15 00:58:22 +08:00
|
|
|
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END);
|
2009-05-19 01:42:08 +08:00
|
|
|
|
|
|
|
|
HashMap<String, byte[]> sqhash = null;
|
|
|
|
|
if (canAnnotate(SAM_IN)) {
|
|
|
|
|
System.out.println("Loading read names from aligned SAM file...");
|
|
|
|
|
|
|
|
|
|
sqhash = new HashMap<String, byte[]>(10000000);
|
|
|
|
|
|
|
|
|
|
SAMFileReader samIn = new SAMFileReader(SAM_IN);
|
|
|
|
|
|
|
|
|
|
for (SAMRecord sr : samIn) {
|
|
|
|
|
sqhash.put(sr.getReadName(), null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
samIn.close();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
System.out.println("Calling bases...");
|
2009-05-15 02:58:43 +08:00
|
|
|
RawRead rr;
|
2009-05-17 11:37:57 +08:00
|
|
|
while (bstats.getReadsTotal() < CALLING_LIMIT && (rr = iparser.next()) != null) {
|
2009-05-19 01:42:08 +08:00
|
|
|
if (canAnnotate(SAM_IN)) {
|
|
|
|
|
String readname = String.format("%s:%s#0", RUN_BARCODE, rr.getReadKey());
|
2009-05-15 00:58:22 +08:00
|
|
|
|
2009-05-19 01:42:08 +08:00
|
|
|
if (sqhash.containsKey(readname)) {
|
|
|
|
|
FourProbRead fpr = model.call(rr);
|
2009-05-17 12:09:23 +08:00
|
|
|
|
2009-05-19 01:42:08 +08:00
|
|
|
byte[] sqtag = fpr.getSQTag(rr);
|
|
|
|
|
|
|
|
|
|
sqhash.put(readname, sqtag);
|
|
|
|
|
|
|
|
|
|
bstats.update(rr, fpr);
|
|
|
|
|
bstats.notifyOnInterval(10000);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
FourProbRead fpr = model.call(rr);
|
|
|
|
|
|
|
|
|
|
sfw.addAlignment(constructSAMRecord(rr, fpr, sfh, RUN_BARCODE));
|
|
|
|
|
|
|
|
|
|
bstats.update(rr, fpr);
|
|
|
|
|
bstats.notifyOnInterval(10000);
|
|
|
|
|
}
|
2009-05-13 03:46:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-15 00:58:22 +08:00
|
|
|
iparser.close();
|
2009-05-13 03:46:34 +08:00
|
|
|
|
2009-05-17 12:09:23 +08:00
|
|
|
bstats.notifyNow();
|
2009-05-19 01:42:08 +08:00
|
|
|
|
|
|
|
|
if (canAnnotate(SAM_IN)) {
|
|
|
|
|
// Correlate SQ tags with aligned SAM records:
|
|
|
|
|
System.out.println("Merge unaligned and aligned SAM files...");
|
|
|
|
|
|
|
|
|
|
SAMFileReader samIn = new SAMFileReader(SAM_IN);
|
|
|
|
|
|
|
|
|
|
for (SAMRecord sr : samIn) {
|
|
|
|
|
if (sqhash.containsKey(sr.getReadName())) {
|
|
|
|
|
byte[] sqtag = sqhash.get(sr.getReadName());
|
|
|
|
|
|
|
|
|
|
if (sqtag != null) {
|
|
|
|
|
if (sr.getReadNegativeStrandFlag()) {
|
|
|
|
|
QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sr.setAttribute("SQ", sqtag);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sfw.addAlignment(sr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
samIn.close();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sfw.close();
|
|
|
|
|
|
2009-05-15 08:07:57 +08:00
|
|
|
System.out.println("Done.");
|
2009-05-15 02:01:41 +08:00
|
|
|
|
|
|
|
|
return 0;
|
2009-05-13 03:46:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-19 01:42:08 +08:00
|
|
|
/**
|
|
|
|
|
* Simple test to determine whether we're in aligned bam annotation mode or not.
|
|
|
|
|
*
|
|
|
|
|
* @param samfile the aligned sam file
|
|
|
|
|
* @return true if the file exists, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
private boolean canAnnotate(File samfile) {
|
|
|
|
|
return (samfile != null && samfile.exists());
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-16 05:22:24 +08:00
|
|
|
/**
|
|
|
|
|
* Construct a SAMRecord object with the specified information. The secondary bases
|
|
|
|
|
* will be annotated suchthat they will not conflict with the primary base.
|
|
|
|
|
*
|
|
|
|
|
* @param rr the raw Illumina read
|
|
|
|
|
* @param fpr the four-base distributions for every base in the read
|
|
|
|
|
* @param sfh the SAM header
|
|
|
|
|
* @param runBarcode the run barcode of the lane (used to prefix the reads)
|
|
|
|
|
*
|
|
|
|
|
* @return a fully-constructed SAM record
|
|
|
|
|
*/
|
2009-05-15 00:58:22 +08:00
|
|
|
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) {
|
|
|
|
|
SAMRecord sr = new SAMRecord(sfh);
|
|
|
|
|
|
|
|
|
|
sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
|
|
|
|
|
sr.setReadUmappedFlag(true);
|
|
|
|
|
sr.setReadString(rr.getSequenceAsString());
|
|
|
|
|
sr.setBaseQualities(rr.getQuals());
|
|
|
|
|
sr.setAttribute("SQ", fpr.getSQTag(rr));
|
2009-05-13 03:46:34 +08:00
|
|
|
|
2009-05-15 00:58:22 +08:00
|
|
|
return sr;
|
2009-05-13 03:46:34 +08:00
|
|
|
}
|
|
|
|
|
}
|