package org.broadinstitute.sting.secondarybase; import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane. * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those * generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities * is evaluated against this model to determine the base probability distribution of a given * base observation. * * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard", * agree on the identity of the best base. In these cases, we simply annotate our estimate * of the second-best base. In cases where this method and Bustard disagree, we annotate * the secondary base as this method's primary base. * * @author Kiran Garimella */ /* An example invocation: java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \ -D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \ -L 5 \ -B 30R47AAXX090217 -CR '0-75,76-151' \ -SO ~/test.sam \ -SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \ */ public class AnnotateSecondaryBase extends CommandLineProgram { public static AnnotateSecondaryBase Instance = null; @Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR; @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE; @Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE; @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES; @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT; @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN; @Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000; @Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; public static void main(String[] argv) { Instance = new AnnotateSecondaryBase(); start(Instance, argv); } protected int execute() { ArrayList> cycleRanges = getCycleRanges(CYCLE_RANGES); BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT); // Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data..."); trainer.loadFirstNUnambiguousReadsTrainingSet(); // Iterate through the stored training data and add the info to the BasecallingReadModel System.out.println("Applying training set..."); BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData()); // Call bases and write results System.out.println("Calling bases..."); SAMFileHeader sfh = new SAMFileHeader(); sfh.setSortOrder(SAMFileHeader.SortOrder.queryname); File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT; SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam); IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE); BasecallingStats bstats = new BasecallingStats(); while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) { RawRead rr = iparser.getRawRead(); FourProbRead fpr = model.call(rr); for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) { Pair cycleRange = cycleRanges.get(cycleRangeIndex); RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond()); FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond()); sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1)); if (cycleRangeIndex == 0) { bstats.update(rrEnd, fprEnd); bstats.notifyOnInterval(5000); } } } bstats.notifyNow(); iparser.close(); sfw.close(); if (canAnnotate(SAM_IN)) { // If we're in annotation mode, annotate the aligned BAM file with the SQ tag System.out.println("Annotating aligned SAM file..."); System.out.println(" sorting aligned SAM file by read name..."); File alignedSam = getTempSAMFile("aligned"); sortBAMByReadName(SAM_IN, alignedSam); System.out.println(" merging unaligned and aligned SAM files..."); File mergedSam = SAM_OUT; mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam); } System.out.println("Done."); return 0; } /** * Return a tempfile. This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions. * * @param prefix the prefix for the temp file * @return the temp file */ private File getTempSAMFile(String prefix) { try { File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile()); tempFile.deleteOnExit(); return tempFile; } catch (IOException e) { throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent()); } } /** * Parse the cycle_ranges string that defines the cycle where a read starts and stops. * Comma-separated ranges are interpreted to be the first and second end of a pair. * * @param cycleRangesString the 0-based, inclusive, comma-separated ranges (i.e. '0-50,51-100') * @return an ArrayList of cycle ranges */ private ArrayList< Pair > getCycleRanges(String cycleRangesString) { ArrayList< Pair > cycleRanges = new ArrayList< Pair >(); String[] pieces = cycleRangesString.split(","); Pattern p = Pattern.compile("(\\d+)-(\\d+)"); for (String piece : pieces) { Matcher m = p.matcher(piece); if (m.find()) { Integer cycleStart = new Integer(m.group(1)); Integer cycleStop = new Integer(m.group(2)); cycleRanges.add(new Pair(cycleStart, cycleStop)); } } if (cycleRanges.size() == 0) { throw new StingException("At least one cycle range must be specified."); } if (cycleRanges.size() > 2) { throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2."); } return cycleRanges; } /** * Simple test to determine whether we're in aligned bam annotation mode or not. * * @param samfile the aligned sam file * @return true if the file exists, false otherwise */ private boolean canAnnotate(File samfile) { return (samfile != null && samfile.exists()); } /** * Construct a SAMRecord object with the specified information. The secondary bases * will be annotated suchthat they will not conflict with the primary base. * * @param rr the raw Illumina read * @param fpr the four-base distributions for every base in the read * @param sfh the SAM header * @param runBarcode the run barcode of the lane (used to prefix the reads) * @param isPaired is this a paired-end lane? * @param isSecondEndOfPair is this the second end of the pair? * * @return a fully-constructed SAM record */ private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) { SAMRecord sr = new SAMRecord(sfh); sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0"); sr.setReadUmappedFlag(true); sr.setReadString(rr.getSequenceAsString()); sr.setBaseQualities(rr.getQuals()); sr.setReadPairedFlag(isPaired); if (isPaired) { sr.setMateUnmappedFlag(true); sr.setFirstOfPairFlag(!isSecondEndOfPair); } sr.setAttribute("SQ", fpr.getSQTag(rr)); return sr; } /** * Resorts a SAM file to queryname order. * * @param samFile the input SAM file * @param sortedSamFile the sorted SAM output file */ private void sortBAMByReadName(File samFile, File sortedSamFile) { SAMFileReader samIn = new SAMFileReader(samFile); SAMFileHeader sfh = samIn.getFileHeader(); sfh.setSortOrder(SAMFileHeader.SortOrder.queryname); SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, sortedSamFile); for (SAMRecord sr : samIn) { samOut.addAlignment(sr); } samIn.close(); samOut.close(); } /** * Merges two SAM files that have been sorted in queryname order * * @param queryNameSortedUnalignedSam the sorted unaligned SAM file * @param queryNameSortedAlignedSam the sorted aligned SAM file * @param mergedSam the output file where the merged results should be stored */ private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) { SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam); SAMFileReader asam = new SAMFileReader(queryNameSortedAlignedSam); SAMFileHeader sfh = asam.getFileHeader(); sfh.setSortOrder(SAMFileHeader.SortOrder.coordinate); SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, mergedSam); CloseableIterator usamIt = usam.iterator(); CloseableIterator asamIt = asam.iterator(); SAMRecord usr = usamIt.next(); SAMRecord asr = asamIt.next(); do { // Pull a record from the unaligned file and advance the aligned file until we find the matching record. We // don't have to advance the unaligned file until we find our record because we assume every record we generate // will be in the aligned file (which also contains unaligned reads). // // If Picard ever stops storing the unaligned reads, this logic will need to be rewritten. if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) { byte[] sqtag = (byte[]) usr.getAttribute("SQ"); String usrread = usr.getReadString(); String asrread = asr.getReadString(); if (asr.getReadNegativeStrandFlag()) { sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag); asrread = BaseUtils.simpleReverseComplement(asrread); } if (usrread != null && asrread != null && !usrread.equals(asrread)) { throw new StingException( String.format("Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'", usr.getReadName(), usr.getFirstOfPairFlag(), usrread, asr.getReadName(), asr.getFirstOfPairFlag(), asrread)); } asr.setAttribute("SQ", sqtag); usr = usamIt.next(); } else { asr = asamIt.next(); } samOut.addAlignment(asr); } while (usamIt.hasNext() && asamIt.hasNext()); usam.close(); asam.close(); samOut.close(); } /** * For debugging purposes. Spits out relevant information for two SAMRecords. * * @param sra first SAMRecord * @param srb second SAMRecord */ private void printRecords(SAMRecord sra, SAMRecord srb) { System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag()); System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag()); System.out.println(); } }