package org.broadinstitute.sting.secondarybase; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMFileWriterFactory; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; import java.io.File; /** * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane. * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those * generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities * is evaluated against this model to determine the base probability distribution of a given * base observation. * * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard", * agree on the identity of the best base. In these cases, we simply annotate the * second-best base. In cases where this method and Bustard disagree, we annotate the * secondary base as this method's primary base. * * @author Kiran Garimella */ public class AnnotateSecondaryBase extends CommandLineProgram { public static AnnotateSecondaryBase Instance = null; @Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR; @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE; @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN; @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT; @Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE; @Argument(fullName="cycle_begin", shortName="CB", doc="On what cycle does the read begin? (0-based inclusive)") public int CYCLE_BEGIN; @Argument(fullName="cycle_end", shortName="CE", doc="On what cycle does the read end? (0-based inclusive)") public int CYCLE_END; @Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 250000; @Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; @Argument(fullName="context", shortName="X", doc="Attempt to correct for context?", required=false) public Boolean CONTEXT = false; @Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name") public String RUN_BARCODE; public static void main(String[] argv) { Instance = new AnnotateSecondaryBase(); start(Instance, argv); } protected int execute() { BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT); /* // This doesn't work right now... // Find alignments with zero mismatches and store them until we've picked up TRAINING_LIMIT alignments System.out.println("Loading training set from the first " + TRAINING_LIMIT + " perfect reads in the aligned data..."); trainingSet.loadPreAlignedTrainingSet(SAM_IN, REFERENCE); */ // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reads in the raw data..."); trainingSet.loadFirstNUnambiguousReadsTrainingSet(); // Iterate through the stored training data and add the info to the BasecallingReadModel System.out.println("Applying training set..."); BasecallingReadModel model = new BasecallingReadModel(CYCLE_END - CYCLE_BEGIN + 1, CONTEXT); model.train(trainingSet); // Call bases and write results System.out.println("Calling bases and writing SAM file..."); SAMFileHeader sfh = new SAMFileHeader(); SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT); IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END); RawRead rr; int basesConsistent = 0, basesTotal = 0, readsTotal = 0; while (readsTotal < CALLING_LIMIT && (rr = iparser.next()) != null) { FourProbRead fpr = model.call(rr); SAMRecord sr = constructSAMRecord(rr, fpr, sfh, RUN_BARCODE); sfw.addAlignment(sr); for (int cycle = 0; cycle < fpr.size(); cycle++) { int rawBaseIndex = BaseUtils.simpleBaseToBaseIndex((char) rr.getSequence()[cycle]); int fpBaseIndex = fpr.get(cycle).indexAtRank(0); if (rawBaseIndex >= 0 && fpBaseIndex >= 0) { basesTotal++; if (rawBaseIndex == fpBaseIndex) { basesConsistent++; } } } if (basesTotal % 10000 == 0 && basesTotal > 0) { System.out.printf("%% bases consistent: %d/%d (%4.4f)\r", basesConsistent, basesTotal, ((double) basesConsistent)/((double) basesTotal)); } readsTotal++; } iparser.close(); sfw.close(); System.out.printf("%% bases consistent: %d/%d (%4.4f)\n", basesConsistent, basesTotal, ((double) basesConsistent)/((double) basesTotal)); System.out.println("Done."); return 0; } /** * Construct a SAMRecord object with the specified information. The secondary bases * will be annotated suchthat they will not conflict with the primary base. * * @param rr the raw Illumina read * @param fpr the four-base distributions for every base in the read * @param sfh the SAM header * @param runBarcode the run barcode of the lane (used to prefix the reads) * * @return a fully-constructed SAM record */ private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) { SAMRecord sr = new SAMRecord(sfh); sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0"); sr.setReadUmappedFlag(true); sr.setReadString(rr.getSequenceAsString()); sr.setBaseQualities(rr.getQuals()); sr.setAttribute("SQ", fpr.getSQTag(rr)); return sr; } }