gatk-3.8/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java

package org.broadinstitute.sting.secondarybase;

import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
 * First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
 * generated by Illumina's "Firecrest" package).  Then, every read's set of raw intensities
 * is evaluated against this model to determine the base probability distribution of a given
 * base observation.
 *
 * Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
 * agree on the identity of the best base.  In these cases, we simply annotate our estimate
 * of the second-best base.  In cases where this method and Bustard disagree, we annotate
 * the secondary base as this method's primary base.
 *
 * @author Kiran Garimella
 */

/*
 An example invocation:
   java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \
    -D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \
    -L 5 \
    -B 30R47AAXX090217
    -CR '0-75,76-151' \
    -SO ~/test.sam \
    -SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \
 */
    
public class AnnotateSecondaryBase extends CommandLineProgram {
    public static AnnotateSecondaryBase Instance = null;

    @Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
    @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
    @Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE;
    @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES;
    @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
    @Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
    @Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
    @Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;

    public static void main(String[] argv) {
        Instance = new AnnotateSecondaryBase();
        start(Instance, argv);
    }

    protected int execute() {
        ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);

        BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);

        // Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT
        System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
        trainer.loadFirstNUnambiguousReadsTrainingSet();

        // Iterate through the stored training data and add the info to the BasecallingReadModel
        System.out.println("Applying training set...");
        BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData());

        // Call bases and write results
        System.out.println("Calling bases...");

        SAMFileHeader sfh = new SAMFileHeader();
        sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
        
        File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
        SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);

        IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);

        BasecallingStats bstats = new BasecallingStats();

        while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) {
            RawRead rr = iparser.getRawRead();
            FourProbRead fpr = model.call(rr);

            for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) {
                Pair<Integer, Integer> cycleRange = cycleRanges.get(cycleRangeIndex);

                RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
                FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond());

                sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1));

                if (cycleRangeIndex == 0) {
                    bstats.update(rrEnd, fprEnd);
                    bstats.notifyOnInterval(5000);
                }
            }
        }

        bstats.notifyNow();

        iparser.close();
        sfw.close();

        if (canAnnotate(SAM_IN)) {
            // If we're in annotation mode, annotate the aligned BAM file with the SQ tag
            System.out.println("Annotating aligned SAM file...");
            
            System.out.println("  sorting aligned SAM file by read name...");
            File alignedSam = getTempSAMFile("aligned");
            sortBAMByReadName(SAM_IN, alignedSam);

            System.out.println("  merging unaligned and aligned SAM files...");
            File mergedSam = SAM_OUT;
            mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam);
        }

        System.out.println("Done.");

        return 0;
    }

    /**
     * Return a tempfile.  This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions.
     *
     * @param prefix  the prefix for the temp file
     * @return  the temp file
     */
    private File getTempSAMFile(String prefix) {
        try {
            File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile());
            //tempFile.deleteOnExit();

            // Ensure that the volumes we're about to use are ready.
            PathUtils.refreshVolume(tempFile);
            PathUtils.refreshVolume(new File(System.getProperty("java.io.tmpdir")));

            return tempFile;
        } catch (IOException e) {
            throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent());
        }
    }

    /**
     * Parse the cycle_ranges string that defines the cycle where a read starts and stops.
     * Comma-separated ranges are interpreted to be the first and second end of a pair.
     *
     * @param cycleRangesString  the 0-based, inclusive, comma-separated ranges (i.e. '0-50,51-100')
     * @return an ArrayList of cycle ranges
     */
    private ArrayList< Pair<Integer, Integer> > getCycleRanges(String cycleRangesString) {
        ArrayList< Pair<Integer, Integer> > cycleRanges = new ArrayList< Pair<Integer, Integer> >();

        String[] pieces = cycleRangesString.split(",");
        
        Pattern p = Pattern.compile("(\\d+)-(\\d+)");

        for (String piece : pieces) {
            Matcher m = p.matcher(piece);

            if (m.find()) {
                Integer cycleStart = new Integer(m.group(1));
                Integer cycleStop = new Integer(m.group(2));

                cycleRanges.add(new Pair<Integer, Integer>(cycleStart, cycleStop));
            }
        }

        if (cycleRanges.size() == 0) {
            throw new StingException("At least one cycle range must be specified.");
        }

        if (cycleRanges.size() > 2) {
            throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2.");
        }

        return cycleRanges;
    }

    /**
     * Simple test to determine whether we're in aligned bam annotation mode or not.
     *
     * @param samfile  the aligned sam file
     * @return true if the file exists, false otherwise
     */
    private boolean canAnnotate(File samfile) {
        return (samfile != null && samfile.exists());
    }

    /**
     * Construct a SAMRecord object with the specified information.  The secondary bases
     * will be annotated suchthat they will not conflict with the primary base.
     *
     * @param rr                 the raw Illumina read
     * @param fpr                the four-base distributions for every base in the read
     * @param sfh                the SAM header
     * @param runBarcode         the run barcode of the lane (used to prefix the reads)
     * @param isPaired           is this a paired-end lane?
     * @param isSecondEndOfPair  is this the second end of the pair?
     *
     * @return a fully-constructed SAM record
     */
    private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) {
        SAMRecord sr = new SAMRecord(sfh);

        sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
        sr.setReadUmappedFlag(true);
        sr.setReadString(rr.getSequenceAsString());
        sr.setBaseQualities(rr.getQuals());

        sr.setReadPairedFlag(isPaired);
        if (isPaired) {
            sr.setMateUnmappedFlag(true);
            sr.setFirstOfPairFlag(!isSecondEndOfPair);
        }
        
        sr.setAttribute("SQ", fpr.getSQTag(rr));

        return sr;
    }

    /**
     * Resorts a SAM file to queryname order.
     *
     * @param samFile        the input SAM file
     * @param sortedSamFile  the sorted SAM output file
     */
    private void sortBAMByReadName(File samFile, File sortedSamFile) {
        SAMFileReader samIn = new SAMFileReader(samFile);

        SAMFileHeader sfh = samIn.getFileHeader();
        sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);

        SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, sortedSamFile);

        for (SAMRecord sr : samIn) {
            samOut.addAlignment(sr);
        }

        samIn.close();
        samOut.close();
    }

    /**
     * Merges two SAM files that have been sorted in queryname order
     * 
     * @param queryNameSortedUnalignedSam  the sorted unaligned SAM file
     * @param queryNameSortedAlignedSam    the sorted aligned SAM file
     * @param mergedSam the output file where the merged results should be stored
     */
    private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) {
        SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam);
        SAMFileReader asam = new SAMFileReader(queryNameSortedAlignedSam);

        SAMFileHeader sfh = asam.getFileHeader();
        sfh.setSortOrder(SAMFileHeader.SortOrder.coordinate);

        SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, mergedSam);

        CloseableIterator<SAMRecord> usamIt = usam.iterator();
        CloseableIterator<SAMRecord> asamIt = asam.iterator();

        SAMRecord usr = usamIt.next();
        SAMRecord asr = asamIt.next();

        do {
            // Pull a record from the unaligned file and advance the aligned file until we find the matching record.  We
            // don't have to advance the unaligned file until we find our record because we assume every record we generate
            // will be in the aligned file (which also contains unaligned reads).
            //
            // If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
            
            if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
                byte[] sqtag = (byte[]) usr.getAttribute("SQ");
                String usrread = usr.getReadString();
                String asrread = asr.getReadString();

                if (asr.getReadNegativeStrandFlag()) {
                    sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
                    asrread = BaseUtils.simpleReverseComplement(asrread);
                }

                if (usrread != null && asrread != null && !usrread.equals(asrread)) {
                    throw new StingException(
                        String.format("Purportedly identical unaligned and aligned reads have different read sequences.  Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'",
                                      usr.getReadName(), usr.getFirstOfPairFlag(), usrread,
                                      asr.getReadName(), asr.getFirstOfPairFlag(), asrread));
                }

                asr.setAttribute("SQ", sqtag);

                usr = usamIt.next();
            } else {
                asr = asamIt.next();
            }

            samOut.addAlignment(asr);
        } while (usamIt.hasNext() && asamIt.hasNext());

        usam.close();
        asam.close();
        samOut.close();
    }

    /**
     * For debugging purposes.  Spits out relevant information for two SAMRecords.
     * 
     * @param sra  first SAMRecord
     * @param srb  second SAMRecord
     */
    private void printRecords(SAMRecord sra, SAMRecord srb) {
        System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag());
        System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag());
        System.out.println();

    }
}