package org.broadinstitute.sting.secondarybase; import net.sf.samtools.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; import java.io.File; import java.util.HashMap; /** * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane. * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those * generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities * is evaluated against this model to determine the base probability distribution of a given * base observation. * * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard", * agree on the identity of the best base. In these cases, we simply annotate the * second-best base. In cases where this method and Bustard disagree, we annotate the * secondary base as this method's primary base. * * @author Kiran Garimella */ public class AnnotateSecondaryBase extends CommandLineProgram { public static AnnotateSecondaryBase Instance = null; @Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR; @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE; @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN; @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT; @Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE; @Argument(fullName="cycle_begin", shortName="CB", doc="On what cycle does the read begin? (0-based inclusive)") public int CYCLE_BEGIN; @Argument(fullName="cycle_end", shortName="CE", doc="On what cycle does the read end? (0-based inclusive)") public int CYCLE_END; @Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 250000; @Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; @Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name") public String RUN_BARCODE; public static void main(String[] argv) { Instance = new AnnotateSecondaryBase(); start(Instance, argv); } protected int execute() { BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT); // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data..."); trainingSet.loadFirstNUnambiguousReadsTrainingSet(); // Iterate through the stored training data and add the info to the BasecallingReadModel System.out.println("Applying training set..."); BasecallingReadModel model = new BasecallingReadModel(CYCLE_END - CYCLE_BEGIN + 1, true); model.train(trainingSet); // Call bases and write results SAMFileHeader sfh = new SAMFileHeader(); SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT); BasecallingStats bstats = new BasecallingStats(); IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END); HashMap sqhash = null; if (canAnnotate(SAM_IN)) { System.out.println("Loading read names from aligned SAM file..."); sqhash = new HashMap(10000000); SAMFileReader samIn = new SAMFileReader(SAM_IN); for (SAMRecord sr : samIn) { sqhash.put(sr.getReadName(), null); } samIn.close(); } System.out.println("Calling bases..."); RawRead rr; while (bstats.getReadsTotal() < CALLING_LIMIT && (rr = iparser.next()) != null) { if (canAnnotate(SAM_IN)) { String readname = String.format("%s:%s#0", RUN_BARCODE, rr.getReadKey()); if (sqhash.containsKey(readname)) { FourProbRead fpr = model.call(rr); byte[] sqtag = fpr.getSQTag(rr); sqhash.put(readname, sqtag); bstats.update(rr, fpr); bstats.notifyOnInterval(10000); } } else { FourProbRead fpr = model.call(rr); sfw.addAlignment(constructSAMRecord(rr, fpr, sfh, RUN_BARCODE)); bstats.update(rr, fpr); bstats.notifyOnInterval(10000); } } iparser.close(); bstats.notifyNow(); if (canAnnotate(SAM_IN)) { // Correlate SQ tags with aligned SAM records: System.out.println("Merge unaligned and aligned SAM files..."); SAMFileReader samIn = new SAMFileReader(SAM_IN); for (SAMRecord sr : samIn) { if (sqhash.containsKey(sr.getReadName())) { byte[] sqtag = sqhash.get(sr.getReadName()); if (sqtag != null) { if (sr.getReadNegativeStrandFlag()) { QualityUtils.reverseComplementCompressedQualityArray(sqtag); } sr.setAttribute("SQ", sqtag); } } sfw.addAlignment(sr); } samIn.close(); } sfw.close(); System.out.println("Done."); return 0; } /** * Simple test to determine whether we're in aligned bam annotation mode or not. * * @param samfile the aligned sam file * @return true if the file exists, false otherwise */ private boolean canAnnotate(File samfile) { return (samfile != null && samfile.exists()); } /** * Construct a SAMRecord object with the specified information. The secondary bases * will be annotated suchthat they will not conflict with the primary base. * * @param rr the raw Illumina read * @param fpr the four-base distributions for every base in the read * @param sfh the SAM header * @param runBarcode the run barcode of the lane (used to prefix the reads) * * @return a fully-constructed SAM record */ private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) { SAMRecord sr = new SAMRecord(sfh); sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0"); sr.setReadUmappedFlag(true); sr.setReadString(rr.getSequenceAsString()); sr.setBaseQualities(rr.getQuals()); sr.setAttribute("SQ", fpr.getSQTag(rr)); return sr; } }