345 lines
14 KiB
Java
Executable File
345 lines
14 KiB
Java
Executable File
package org.broadinstitute.sting.secondarybase;
|
|
|
|
import net.sf.samtools.*;
|
|
import net.sf.samtools.util.CloseableIterator;
|
|
import org.broadinstitute.sting.utils.*;
|
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
/**
|
|
* AnnotateSecondaryBase computes the second best base for every base in an Illumina lane.
|
|
* First, a statistical model is fit to a subset of the raw Illumina intensities (i.e. those
|
|
* generated by Illumina's "Firecrest" package). Then, every read's set of raw intensities
|
|
* is evaluated against this model to determine the base probability distribution of a given
|
|
* base observation.
|
|
*
|
|
* Approximately 95% of the time, this method and Illumina's basecalling package, "Bustard",
|
|
* agree on the identity of the best base. In these cases, we simply annotate our estimate
|
|
* of the second-best base. In cases where this method and Bustard disagree, we annotate
|
|
* the secondary base as this method's primary base.
|
|
*
|
|
* @author Kiran Garimella
|
|
*/
|
|
|
|
/*
|
|
An example invocation:
|
|
java -Xmx2048m -Djava.io.tmpdir=/broad/hptmp/ -jar /path/to/AnnotateSecondaryBase.jar \
|
|
-D /seq/solexaproc2/SL-XAX/analyzed/090217_SL-XAX_0003_FC30R47AAXX/Data/C1-152_Firecrest1.3.2_25-02-2009_prodinfo/Bustard1.3.2_25-02-2009_prodinfo/ \
|
|
-L 5 \
|
|
-B 30R47AAXX090217
|
|
-CR '0-75,76-151' \
|
|
-SO ~/test.sam \
|
|
-SI /seq/picard/30R47AAXX/C1-152_2009-02-17_2009-04-02/5/Solexa-10265/30R47AAXX.5.aligned.bam \
|
|
*/
|
|
|
|
public class AnnotateSecondaryBase extends CommandLineProgram {
|
|
public static AnnotateSecondaryBase Instance = null;
|
|
|
|
@Argument(fullName="bustard_dir", shortName="D", doc="Illumina Bustard directory") public File BUSTARD_DIR;
|
|
@Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
|
|
@Argument(fullName="run_barcode", shortName="B", doc="Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)") public String RUN_BARCODE;
|
|
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)") public String CYCLE_RANGES;
|
|
@Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
|
|
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
|
|
@Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
|
|
@Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
|
|
@Argument(fullName="unaligned_sam", shortName="US", doc="Unaligned sam file, so we can skip making it", required=false) public File USAM;
|
|
@Argument(fullName="aligned_sam", shortName="AS", doc="Aligned, queryname-sorted sam file, so we can skip resorting it", required=false) public File ASAM;
|
|
|
|
public static void main(String[] argv) {
|
|
Instance = new AnnotateSecondaryBase();
|
|
start(Instance, argv);
|
|
}
|
|
|
|
protected int execute() {
|
|
ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);
|
|
File unalignedSam;
|
|
|
|
if (USAM == null || !USAM.exists()) {
|
|
BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);
|
|
|
|
// Iterate through raw Firecrest data and store the first N reasonable reads up to TRAINING_LIMIT
|
|
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reasonable reads in the raw data...");
|
|
trainer.loadFirstNReasonableReadsTrainingSet();
|
|
|
|
// Iterate through the stored training data and add the info to the BasecallingReadModel
|
|
System.out.println("Applying training set...");
|
|
BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData());
|
|
|
|
// Call bases and write results
|
|
System.out.println("Calling bases...");
|
|
|
|
SAMFileHeader sfh = new SAMFileHeader();
|
|
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
|
|
|
|
unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
|
|
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);
|
|
|
|
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);
|
|
|
|
BasecallingStats bstats = new BasecallingStats();
|
|
|
|
while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) {
|
|
RawRead rr = iparser.getRawRead();
|
|
FourProbRead fpr = model.call(rr);
|
|
|
|
for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) {
|
|
Pair<Integer, Integer> cycleRange = cycleRanges.get(cycleRangeIndex);
|
|
|
|
RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
|
|
FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
|
|
|
|
sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1));
|
|
|
|
if (cycleRangeIndex == 0) {
|
|
bstats.update(rrEnd, fprEnd);
|
|
bstats.notifyOnInterval(5000);
|
|
}
|
|
}
|
|
}
|
|
|
|
bstats.notifyNow();
|
|
|
|
iparser.close();
|
|
sfw.close();
|
|
} else {
|
|
unalignedSam = USAM;
|
|
}
|
|
|
|
if (canAnnotate(SAM_IN)) {
|
|
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
|
|
System.out.println("Annotating aligned SAM file...");
|
|
|
|
File alignedSam;
|
|
if (ASAM == null || !ASAM.exists()) {
|
|
System.out.println(" sorting aligned SAM file by read name...");
|
|
alignedSam = getTempSAMFile("aligned");
|
|
sortBAMByReadName(SAM_IN, alignedSam);
|
|
} else {
|
|
alignedSam = ASAM;
|
|
}
|
|
|
|
System.out.println(" merging unaligned and aligned SAM files...");
|
|
File mergedSam = SAM_OUT;
|
|
mergeUnalignedAndAlignedBams(unalignedSam, alignedSam, mergedSam);
|
|
}
|
|
|
|
System.out.println("Done.");
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Return a tempfile. This is a laziness method so that I don't have to litter my code with try/catch blocks for IOExceptions.
|
|
*
|
|
* @param prefix the prefix for the temp file
|
|
* @return the temp file
|
|
*/
|
|
private File getTempSAMFile(String prefix) {
|
|
try {
|
|
File tempFile = File.createTempFile(prefix, ".sam", SAM_OUT.getParentFile());
|
|
//tempFile.deleteOnExit();
|
|
|
|
// Ensure that the volumes we're about to use are ready.
|
|
PathUtils.refreshVolume(tempFile);
|
|
PathUtils.refreshVolume(new File(System.getProperty("java.io.tmpdir")));
|
|
|
|
return tempFile;
|
|
} catch (IOException e) {
|
|
throw new StingException("Unable to create tempfile in directory " + SAM_OUT.getParent());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parse the cycle_ranges string that defines the cycle where a read starts and stops.
|
|
* Comma-separated ranges are interpreted to be the first and second end of a pair.
|
|
*
|
|
* @param cycleRangesString the 0-based, inclusive, comma-separated ranges (i.e. '0-50,51-100')
|
|
* @return an ArrayList of cycle ranges
|
|
*/
|
|
private ArrayList< Pair<Integer, Integer> > getCycleRanges(String cycleRangesString) {
|
|
ArrayList< Pair<Integer, Integer> > cycleRanges = new ArrayList< Pair<Integer, Integer> >();
|
|
|
|
String[] pieces = cycleRangesString.split(",");
|
|
|
|
Pattern p = Pattern.compile("(\\d+)-(\\d+)");
|
|
|
|
for (String piece : pieces) {
|
|
Matcher m = p.matcher(piece);
|
|
|
|
if (m.find()) {
|
|
Integer cycleStart = new Integer(m.group(1));
|
|
Integer cycleStop = new Integer(m.group(2));
|
|
|
|
cycleRanges.add(new Pair<Integer, Integer>(cycleStart, cycleStop));
|
|
}
|
|
}
|
|
|
|
if (cycleRanges.size() == 0) {
|
|
throw new StingException("At least one cycle range must be specified.");
|
|
}
|
|
|
|
if (cycleRanges.size() > 2) {
|
|
throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2.");
|
|
}
|
|
|
|
return cycleRanges;
|
|
}
|
|
|
|
/**
|
|
* Simple test to determine whether we're in aligned bam annotation mode or not.
|
|
*
|
|
* @param samfile the aligned sam file
|
|
* @return true if the file exists, false otherwise
|
|
*/
|
|
private boolean canAnnotate(File samfile) {
|
|
return (samfile != null && samfile.exists());
|
|
}
|
|
|
|
/**
|
|
* Construct a SAMRecord object with the specified information. The secondary bases
|
|
* will be annotated suchthat they will not conflict with the primary base.
|
|
*
|
|
* @param rr the raw Illumina read
|
|
* @param fpr the four-base distributions for every base in the read
|
|
* @param sfh the SAM header
|
|
* @param runBarcode the run barcode of the lane (used to prefix the reads)
|
|
* @param isPaired is this a paired-end lane?
|
|
* @param isSecondEndOfPair is this the second end of the pair?
|
|
*
|
|
* @return a fully-constructed SAM record
|
|
*/
|
|
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, String runBarcode, boolean isPaired, boolean isSecondEndOfPair) {
|
|
SAMRecord sr = new SAMRecord(sfh);
|
|
|
|
sr.setReadName(runBarcode + ":" + rr.getReadKey() + "#0");
|
|
sr.setReadUmappedFlag(true);
|
|
sr.setReadString(rr.getSequenceAsString());
|
|
sr.setBaseQualities(rr.getQuals());
|
|
|
|
sr.setReadPairedFlag(isPaired);
|
|
if (isPaired) {
|
|
sr.setMateUnmappedFlag(true);
|
|
sr.setFirstOfPairFlag(!isSecondEndOfPair);
|
|
}
|
|
|
|
sr.setAttribute("SQ", fpr.getSQTag(rr));
|
|
|
|
return sr;
|
|
}
|
|
|
|
/**
|
|
* Resorts a SAM file to queryname order.
|
|
*
|
|
* @param samFile the input SAM file
|
|
* @param sortedSamFile the sorted SAM output file
|
|
*/
|
|
private void sortBAMByReadName(File samFile, File sortedSamFile) {
|
|
SAMFileReader samIn = new SAMFileReader(samFile);
|
|
|
|
SAMFileHeader sfh = samIn.getFileHeader();
|
|
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
|
|
|
|
SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, sortedSamFile);
|
|
|
|
for (SAMRecord sr : samIn) {
|
|
samOut.addAlignment(sr);
|
|
}
|
|
|
|
samIn.close();
|
|
samOut.close();
|
|
}
|
|
|
|
/**
|
|
* Merges two SAM files that have been sorted in queryname order
|
|
*
|
|
* @param queryNameSortedUnalignedSam the sorted unaligned SAM file
|
|
* @param queryNameSortedAlignedSam the sorted aligned SAM file
|
|
* @param mergedSam the output file where the merged results should be stored
|
|
*/
|
|
private void mergeUnalignedAndAlignedBams(File queryNameSortedUnalignedSam, File queryNameSortedAlignedSam, File mergedSam) {
|
|
SAMFileReader usam = new SAMFileReader(queryNameSortedUnalignedSam);
|
|
SAMFileReader asam = new SAMFileReader(queryNameSortedAlignedSam);
|
|
|
|
SAMFileHeader sfh = asam.getFileHeader();
|
|
sfh.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
|
|
|
SAMFileWriter samOut = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, mergedSam);
|
|
|
|
CloseableIterator<SAMRecord> usamIt = usam.iterator();
|
|
CloseableIterator<SAMRecord> asamIt = asam.iterator();
|
|
|
|
SAMRecord usr = usamIt.next();
|
|
SAMRecord asr = asamIt.next();
|
|
|
|
int annotatedRecords = 0;
|
|
|
|
do {
|
|
// Pull a record from the unaligned file and advance the aligned file until we find the matching record. We
|
|
// don't have to advance the unaligned file until we find our record because we assume every record we generate
|
|
// will be in the aligned file (which also contains unaligned reads).
|
|
//
|
|
// If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
|
|
System.out.println(asr.getReadString());
|
|
System.out.println(BaseUtils.simpleReverseComplement(asr.getReadString()));
|
|
System.out.println();
|
|
|
|
if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
|
|
byte[] sqtag = (byte[]) usr.getAttribute("SQ");
|
|
String usrread = usr.getReadString();
|
|
String asrread = asr.getReadString();
|
|
|
|
System.out.println(asrread);
|
|
|
|
if (asr.getReadNegativeStrandFlag()) {
|
|
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
|
asrread = BaseUtils.simpleReverseComplement(asrread);
|
|
|
|
System.out.println(asrread);
|
|
}
|
|
|
|
if (usrread != null && asrread != null && !usrread.equals(asrread)) {
|
|
throw new StingException(
|
|
String.format("Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'",
|
|
usr.getReadName(), usr.getFirstOfPairFlag(), usrread,
|
|
asr.getReadName(), asr.getFirstOfPairFlag(), asrread));
|
|
}
|
|
|
|
asr.setAttribute("SQ", sqtag);
|
|
annotatedRecords++;
|
|
|
|
System.out.println("Annotated " + annotatedRecords + " records.");
|
|
|
|
usr = usamIt.next();
|
|
} else {
|
|
asr = asamIt.next();
|
|
}
|
|
|
|
samOut.addAlignment(asr);
|
|
} while (usamIt.hasNext() && asamIt.hasNext());
|
|
|
|
usam.close();
|
|
asam.close();
|
|
samOut.close();
|
|
}
|
|
|
|
/**
|
|
* For debugging purposes. Spits out relevant information for two SAMRecords.
|
|
*
|
|
* @param sra first SAMRecord
|
|
* @param srb second SAMRecord
|
|
*/
|
|
private void printRecords(SAMRecord sra, SAMRecord srb) {
|
|
System.out.println("a: " + sra.getReadName() + " " + sra.getFirstOfPairFlag());
|
|
System.out.println("b: " + srb.getReadName() + " " + srb.getFirstOfPairFlag());
|
|
System.out.println();
|
|
|
|
}
|
|
}
|