package org.broadinstitute.sting.secondarybase;

import net.sf.samtools.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;

import java.io.File;
import java.util.HashMap;

/**
 * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
 * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
 * generated by Illumina's "Firecrest" package).  Then, every read's set of raw intensities
 * is evaluated against this model to determine the base probability distribution of a given
 * base observation.
 *
 * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
 * agree on the identity of the best base.  In these cases, we simply annotate the
 * second-best base.  In cases where this method and Bustard disagree, we annotate the
 * secondary base as this method's primary base.
 *
 * @author Kiran Garimella
 */
public class AnnotateSecondaryBase extends CommandLineProgram {
    public static AnnotateSecondaryBase Instance = null;

    @Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
    @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
    @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
    @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
    @Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
    @Argument(fullName="cycle_begin", shortName="CB", doc="On what cycle does the read begin? (0-based inclusive)") public int CYCLE_BEGIN;
    @Argument(fullName="cycle_end", shortName="CE", doc="On what cycle does the read end? (0-based inclusive)") public int CYCLE_END;
    @Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 250000;
    @Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
    @Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name") public String RUN_BARCODE;

    public static void main(String[] argv) {
        Instance = new AnnotateSecondaryBase();
        start(Instance, argv);
    }

    protected int execute() {
        BasecallingTrainingSet trainingSet = new BasecallingTrainingSet(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END, TRAINING_LIMIT);

        // Iterate through raw Firecrest data and store the first N reads up to TRAINING_LIMIT
        System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
        trainingSet.loadFirstNUnambiguousReadsTrainingSet();

        // Iterate through the stored training data and add the info to the BasecallingReadModel
        System.out.println("Applying training set...");
        BasecallingReadModel model = new BasecallingReadModel(CYCLE_END - CYCLE_BEGIN + 1, true);
        model.train(trainingSet);

        // Call bases and write results
        SAMFileHeader sfh = new SAMFileHeader();
        SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);

        BasecallingStats bstats = new BasecallingStats();
        IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END);

        HashMap<String, byte[]> sqhash = null;
        if (canAnnotate(SAM_IN)) {
            System.out.println("Loading read names from aligned SAM file...");
            
            sqhash = new HashMap<String, byte[]>(10000000);

            SAMFileReader samIn = new SAMFileReader(SAM_IN);

            for (SAMRecord sr : samIn) {
                sqhash.put(sr.getReadName(), null);
            }

            samIn.close();
        }

        System.out.println("Calling bases...");
        RawRead rr;
        while (bstats.getReadsTotal() < CALLING_LIMIT && (rr = iparser.next()) != null) {
            if (canAnnotate(SAM_IN)) {
                String readname = String.format("%s:%s#0", RUN_BARCODE, rr.getReadKey());

                if (sqhash.containsKey(readname)) {
                    FourProbRead fpr = model.call(rr);

                    byte[] sqtag = fpr.getSQTag(rr);

                    sqhash.put(readname, sqtag);

                    bstats.update(rr, fpr);
                    bstats.notifyOnInterval(10000);
                }
            } else {
                FourProbRead fpr = model.call(rr);

                sfw.addAlignment(constructSAMRecord(rr, fpr, sfh, RUN_BARCODE));

                bstats.update(rr, fpr);
                bstats.notifyOnInterval(10000);
            }
        }

        iparser.close();

        bstats.notifyNow();

        if (canAnnotate(SAM_IN)) {
            // Correlate SQ tags with aligned SAM records:
            System.out.println("Merge unaligned and aligned SAM files...");

            SAMFileReader samIn = new SAMFileReader(SAM_IN);

            for (SAMRecord sr : samIn) {
                if (sqhash.containsKey(sr.getReadName())) {
                    byte[] sqtag = sqhash.get(sr.getReadName());

                    if (sqtag != null) {
                        if (sr.getReadNegativeStrandFlag()) {
                            QualityUtils.reverseComplementCompressedQualityArray(sqtag);
                        }

                        sr.setAttribute("SQ", sqtag);
                    }
                }
                
                sfw.addAlignment(sr);
            }

            samIn.close();
        }
        
        sfw.close();

        System.out.println("Done.");

        return 0;
    }

    /**
     * Simple test to determine whether we're in aligned bam annotation mode or not.
     *
     * @param samfile  the aligned sam file
     * @return true if the file exists, false otherwise
     */
    private boolean canAnnotate(File samfile) {
        return (samfile != null && samfile.exists());
    }

    /**
     * Construct a SAMRecord object with the specified information.  The secondary bases
     * will be annotated suchthat they will not conflict with the primary base.
     *
     * @param rr          the raw Illumina read
     * @param fpr         the four-base distributions for every base in the read
     * @param sfh         the SAM header
     * @param runBarcode  the run barcode of the lane (used to prefix the reads)
     *
     * @return a fully-constructed SAM record
     */
    private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode) {
        SAMRecord sr = new SAMRecord(sfh);

        sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
        sr.setReadUmappedFlag(true);
        sr.setReadString(rr.getSequenceAsString());
        sr.setBaseQualities(rr.getQuals());
        sr.setAttribute("SQ", fpr.getSQTag(rr));

        return sr;
    }
}