gatk-3.8/java/src/org/broadinstitute/sting/secondarybase/AnnotateSecondaryBase.java

276 lines
11 KiB
Java
Raw Normal View History

package org.broadinstitute.sting.secondarybase;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
* First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
* generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities
* is evaluated against this model to determine the base probability distribution of a given
* base observation.
*
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
* agree on the identity of the best base. In these cases, we simply annotate the
* second-best base. In cases where this method and Bustard disagree, we annotate the
* secondary base as this method's primary base.
*
* @author Kiran Garimella
*/
public class AnnotateSecondaryBase extends CommandLineProgram {
public static AnnotateSecondaryBase Instance = null;
@Argument(fullName="dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
@Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
@Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
@Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based inclusive)") public String CYCLE_RANGES;
@Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
@Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
@Argument(fullName="runbarcode", shortName="B", doc="Run barcode (embedded as part of the read name)") public String RUN_BARCODE;
public static void main(String[] argv) {
Instance = new AnnotateSecondaryBase();
start(Instance, argv);
}
protected int execute() {
ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);
BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);
// Iterate through raw Firecrest data and the first N reads up to TRAINING_LIMIT
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
trainer.loadFirstNUnambiguousReadsTrainingSet();
// Iterate through the stored training data and add the info to the BasecallingReadModel
System.out.println("Applying training set...");
BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData());
// Call bases and write results
System.out.println("Calling bases...");
SAMFileHeader sfh = new SAMFileHeader();
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);
BasecallingStats bstats = new BasecallingStats();
while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) {
RawRead rr = iparser.getRawRead();
FourProbRead fpr = model.call(rr);
for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) {
Pair<Integer, Integer> cycleRange = cycleRanges.get(cycleRangeIndex);
RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1));
if (cycleRangeIndex == 0) {
bstats.update(rrEnd, fprEnd);
bstats.notifyOnInterval(5000);
}
}
}
bstats.notifyNow();
iparser.close();
sfw.close();
if (canAnnotate(SAM_IN)) {
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
System.out.println("Annotating aligned BAM file...");
try {
File sortedSam = File.createTempFile("sorted", ".sam", SAM_OUT.getParentFile());
//System.out.println(" sorted file: " + sortedSam.getAbsolutePath());
sortBAMByReadName(SAM_IN, sortedSam);
File mergedSam = File.createTempFile("merged", ".sam", SAM_OUT.getParentFile());
//System.out.println(" merged file: " + mergedSam.getAbsolutePath());
mergeUnalignedAndAlignedBams(SAM_OUT, sortedSam, mergedSam);
} catch (IOException e) {
throw new StingException("There was a problem in trying to merge the unaligned and aligned BAM files.");
}
}
System.out.println("Done.");
return 0;
}
/**
* Parse the cycle_ranges string that defines the cycle where a read starts and stops.
* Comma-separated ranges are interpreted to be the first and second end of a pair.
*
* @param cycleRangesString the 0-based, inclusive, comma-separated ranges (i.e. '0-50,51-100')
* @return an ArrayList of cycle ranges
*/
private ArrayList< Pair<Integer, Integer> > getCycleRanges(String cycleRangesString) {
ArrayList< Pair<Integer, Integer> > cycleRanges = new ArrayList< Pair<Integer, Integer> >();
String[] pieces = cycleRangesString.split(",");
Pattern p = Pattern.compile("(\\d+)-(\\d+)");
for (int pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) {
Matcher m = p.matcher(pieces[pieceIndex]);
if (m.find()) {
Integer cycleStart = new Integer(m.group(1));
Integer cycleStop = new Integer(m.group(2));
cycleRanges.add(new Pair<Integer, Integer>(cycleStart, cycleStop));
}
}
if (cycleRanges.size() == 0) {
throw new StingException("At least one cycle range must be specified.");
}
if (cycleRanges.size() > 2) {
throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2.");
}
return cycleRanges;
}
/**
* Simple test to determine whether we're in aligned bam annotation mode or not.
*
* @param samfile the aligned sam file
* @return true if the file exists, false otherwise
*/
private boolean canAnnotate(File samfile) {
return (samfile != null && samfile.exists());
}
/**
* Construct a SAMRecord object with the specified information. The secondary bases
* will be annotated suchthat they will not conflict with the primary base.
*
* @param rr the raw Illumina read
* @param fpr the four-base distributions for every base in the read
* @param sfh the SAM header
* @param runBarcode the run barcode of the lane (used to prefix the reads)
*
* @return a fully-constructed SAM record
*/
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean secondEndOfPair) {
SAMRecord sr = new SAMRecord(sfh);
sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
sr.setMateUnmappedFlag(true);
sr.setReadUmappedFlag(true);
sr.setReadString(rr.getSequenceAsString());
sr.setBaseQualities(rr.getQuals());
sr.setReadPairedFlag(isPaired);
if (isPaired) {
sr.setFirstOfPairFlag(!secondEndOfPair);
}
sr.setAttribute("SQ", fpr.getSQTag(rr));
return sr;
}
/**
* Resorts a SAM file to queryname order.
*
* @param samFile the input SAM file
* @param sortedSamFile the sorted SAM output file
*/
private void sortBAMByReadName(File samFile, File sortedSamFile) {
SAMFileReader samIn = new SAMFileReader(samFile);
SAMFileHeader sfh = samIn.getFileHeader();
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, sortedSamFile);
for (SAMRecord sr : samIn) {
samOut.addAlignment(sr);
}
samIn.close();
samOut.close();
}
/**
* Merges two SAM files that have been sorted in queryname order
*
* @param queryNameSortedUnalignedSam
* @param queryNameSortedAlignedSam
* @param mergedSam
*/
private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) {
SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam);
SAMFileReader asam = new SAMFileReader(queryNameSortedAlignedSam);
SAMFileHeader sfh = asam.getFileHeader();
sfh.setSortOrder(SAMFileHeader.SortOrder.coordinate);
SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, mergedSam);
CloseableIterator<SAMRecord> usamIt = usam.iterator();
CloseableIterator<SAMRecord> asamIt = asam.iterator();
SAMRecord usr = usamIt.next();
SAMRecord asr = asamIt.next();
do {
while (usamIt.hasNext() && asamIt.hasNext() && !usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
int comp = usr.getReadName().compareTo(asr.getReadName());
if (comp < 0) {
usr = usamIt.next();
} else if (comp > 0) {
asr = asamIt.next();
}
}
if (usr.getReadName().matches(asr.getReadName()) && usr.getReadNegativeStrandFlag() == asr.getReadNegativeStrandFlag()) {
byte[] sqtag = (byte[]) usr.getAttribute("SQ");
if (sqtag != null) {
if (asr.getReadNegativeStrandFlag()) {
QualityUtils.reverseComplementCompressedQualityArray(sqtag);
asr.setAttribute("SQ", sqtag);
}
}
samOut.addAlignment(asr);
usr = usamIt.next();
asr = asamIt.next();
}
} while (usamIt.hasNext() && asamIt.hasNext());
usam.close();
asam.close();
samOut.close();
}
}