package org.broadinstitute.sting.alignment.bwa;

import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.alignment.reference.bwt.BWT;
import org.broadinstitute.sting.alignment.reference.bwt.BWTWriter;
import org.broadinstitute.sting.alignment.reference.bwt.SuffixArray;
import org.broadinstitute.sting.alignment.reference.bwt.SuffixArrayWriter;
import org.broadinstitute.sting.alignment.reference.bwt.ANNWriter;
import org.broadinstitute.sting.alignment.reference.bwt.AMBWriter;

import java.io.File;
import java.io.IOException;

import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.StringUtil;

/**
 * Support files for BWT.  Groups the eight files that together describe a reference
 * sequence (dictionary, holes, packed sequence, BWT and suffix array, the last three in
 * both forward and reverse orientation).  The files are either located by a shared
 * prefix or generated on the fly into temporary files.
 *
 * @author mhanna
 * @version 0.1
 */
public class BWTFiles {
    /**
     * ANN (alternate dictionary) file.
     */
    public final File annFile;

    /**
     * AMB (holes) file.
     */
    public final File ambFile;

    /**
     * Packed reference sequence file.
     */
    public final File pacFile;

    /**
     * Reverse of packed reference sequence file.
     */
    public final File rpacFile;

    /**
     * Forward BWT file.
     */
    public final File forwardBWTFile;

    /**
     * Forward suffix array file.
     */
    public final File forwardSAFile;

    /**
     * Reverse BWT file.
     */
    public final File reverseBWTFile;

    /**
     * Reverse suffix array file.
     */
    public final File reverseSAFile;

    /**
     * Were these files autogenerated on the fly?  When true, {@link #close()} deletes them.
     */
    public final boolean autogenerated;

    /**
     * Create a new BWA configuration file using the given prefix.  Each file name is
     * the prefix followed by a fixed extension (.ann, .amb, .pac, .rpac, .bwt, .sa,
     * .rbwt, .rsa).  The files are not touched by this constructor.
     * @param prefix Prefix to use when creating the configuration.  Must not be null.
     * @throws StingException if the prefix is null.
     */
    public BWTFiles(String prefix) {
        if(prefix == null)
            throw new StingException("Prefix must not be null.");
        annFile = new File(prefix + ".ann");
        ambFile = new File(prefix + ".amb");
        pacFile = new File(prefix + ".pac");
        rpacFile = new File(prefix + ".rpac");
        forwardBWTFile = new File(prefix + ".bwt");
        forwardSAFile = new File(prefix + ".sa");
        reverseBWTFile = new File(prefix + ".rbwt");
        reverseSAFile = new File(prefix + ".rsa");
        autogenerated = false;
    }

    /**
     * Hand-create a new BWTFiles object, specifying a unique file object for each type.
     * Objects built this way are flagged as autogenerated, so {@link #close()} will
     * delete every file passed in here.
     * @param annFile ANN (alternate dictionary) file.
     * @param ambFile AMB (holes) files.
     * @param pacFile Packed representation of the forward reference sequence.
     * @param forwardBWTFile BWT representation of the forward reference sequence.
     * @param forwardSAFile SA representation of the forward reference sequence.
     * @param rpacFile Packed representation of the reversed reference sequence.
     * @param reverseBWTFile BWT representation of the reversed reference sequence.
     * @param reverseSAFile SA representation of the reversed reference sequence.
     */
    private BWTFiles(File annFile,
                     File ambFile,
                     File pacFile,
                     File forwardBWTFile,
                     File forwardSAFile,
                     File rpacFile,
                     File reverseBWTFile,
                     File reverseSAFile) {
        this.annFile = annFile;
        this.ambFile = ambFile;
        this.pacFile = pacFile;
        this.forwardBWTFile = forwardBWTFile;
        this.forwardSAFile = forwardSAFile;
        this.rpacFile = rpacFile;
        this.reverseBWTFile = reverseBWTFile;
        this.reverseSAFile = reverseSAFile;
        autogenerated = true;
    }

    /**
     * Close out this files object, in the process deleting any temporary files
     * that were created.  Files located by prefix are left untouched.  Every file is
     * attempted even if an earlier deletion fails, and a file that no longer exists
     * counts as cleaned up, so calling this method more than once is harmless.
     * @throws StingException if at least one autogenerated file could not be removed.
     */
    public void close() {
        if(!autogenerated)
            return;
        boolean success = deleteAll(annFile,ambFile,pacFile,forwardBWTFile,forwardSAFile,rpacFile,reverseBWTFile,reverseSAFile);
        if(!success)
            throw new StingException("Unable to clean up autogenerated representation");
    }

    /**
     * Create a new set of BWT files from the given reference sequence.  The input array
     * is copied before normalization, so the caller's data is never modified.  All
     * output goes to temporary files which are registered for deletion at JVM exit as
     * soon as they are created; if generation fails part-way, the files created so far
     * are deleted immediately.
     * @param referenceSequence Sequence from which to build metadata.  Must not be null.
     * @return A new object representing encoded representations of each sequence.
     * @throws StingException if the sequence is null, contains a base other than
     *         A, C, G or T (case-insensitive), or the temporary files cannot be written.
     */
    public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
        if(referenceSequence == null)
            throw new StingException("Reference sequence must not be null.");

        // Normalization works in place; operate on a private copy.
        byte[] normalizedReferenceSequence = referenceSequence.clone();
        normalizeReferenceSequence(normalizedReferenceSequence);

        // Initialized to null so that the cleanup path can tell which files actually exist.
        File annFile = null, ambFile = null, pacFile = null, bwtFile = null, saFile = null;
        File rpacFile = null, rbwtFile = null, rsaFile = null;

        boolean created = false;
        try {
            // Write the ann and amb for this reference sequence.
            annFile = createTempFile(".ann");
            ambFile = createTempFile(".amb");

            SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
            dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));

            ANNWriter annWriter = new ANNWriter(annFile);
            try {
                annWriter.write(dictionary);
            }
            finally {
                annWriter.close();
            }

            AMBWriter ambWriter = new AMBWriter(ambFile);
            try {
                ambWriter.writeEmpty(dictionary);
            }
            finally {
                ambWriter.close();
            }

            // Write the encoded files for the forward version of this reference sequence.
            pacFile = createTempFile(".pac");
            bwtFile = createTempFile(".bwt");
            saFile = createTempFile(".sa");

            writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);

            // Write the encoded files for the reverse version of this reference sequence.
            byte[] reverseReferenceSequence = BaseUtils.reverse(normalizedReferenceSequence);

            rpacFile = createTempFile(".rpac");
            rbwtFile = createTempFile(".rbwt");
            rsaFile = createTempFile(".rsa");

            writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);

            BWTFiles files = new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
            created = true;
            return files;
        }
        catch(IOException ex) {
            // Keep the underlying I/O failure as the cause so that it can be diagnosed.
            throw new StingException("Unable to write autogenerated reference sequence to temporary files",ex);
        }
        finally {
            // On any failure (checked or unchecked) nobody will ever own these files; remove
            // them now rather than leaving them around until the JVM exits.
            if(!created)
                deleteAll(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
        }
    }

    /**
     * Create a temporary file with the "bwt" prefix and the given suffix, registering it
     * for deletion at JVM exit right away so that it cannot outlive the process even if a
     * later step fails.
     * @param suffix File name suffix, including the leading dot.
     * @return The newly created temporary file.
     * @throws IOException if the file cannot be created.
     */
    private static File createTempFile(String suffix) throws IOException {
        File file = File.createTempFile("bwt",suffix);
        file.deleteOnExit();
        return file;
    }

    /**
     * Delete each of the given files, attempting all of them regardless of individual failures.
     * @param files Files to delete; null entries are skipped.
     * @return True if, afterwards, none of the non-null files exists any longer.
     */
    private static boolean deleteAll(File... files) {
        boolean success = true;
        for(File file: files) {
            if(file == null)
                continue;
            // A file that is already gone counts as successfully cleaned up.
            success &= file.delete() || !file.exists();
        }
        return success;
    }

    /**
     * Write the encoded form of the reference sequence.  In the case of BWA, the encoded reference
     * sequence is the reference itself in PAC format, the BWT, and the suffix array.
     * @param referenceSequence The reference sequence to encode.
     * @param pacFile Target for the PAC-encoded reference.
     * @param bwtFile Target for the BWT representation of the reference.
     * @param suffixArrayFile Target for the suffix array encoding of the reference.
     * @throws java.io.IOException In case of issues writing to the file.
     */
    private static void writeEncodedReferenceSequence(byte[] referenceSequence,
                                                      File pacFile,
                                                      File bwtFile,
                                                      File suffixArrayFile) throws IOException {
        PackUtils.writeReferenceSequence(pacFile,referenceSequence);

        BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
        BWTWriter bwtWriter = new BWTWriter(bwtFile);
        try {
            bwtWriter.write(bwt);
        }
        finally {
            // Release the writer even when the write fails.
            bwtWriter.close();
        }

        SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
        SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
        try {
            suffixArrayWriter.write(suffixArray);
        }
        finally {
            suffixArrayWriter.close();
        }
    }

    /**
     * Convert the given reference sequence into a form suitable for building into
     * on-the-fly sequences.  The array is upper-cased in place and then checked to
     * contain nothing but the bases A, C, G and T.
     * @param referenceSequence The reference sequence to normalize.  Modified in place.
     * @throws StingException if normalized sequence cannot be generated.
     */
    private static void normalizeReferenceSequence(byte[] referenceSequence) {
        StringUtil.toUpperCase(referenceSequence);
        for(byte base: referenceSequence) {
            if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
                throw new StingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
        }
    }
}