2009-11-21 01:08:09 +08:00
|
|
|
package org.broadinstitute.sting.alignment.bwa;
|
|
|
|
|
|
2010-09-12 22:47:19 +08:00
|
|
|
import org.broadinstitute.sting.utils.exceptions.GATKException;
|
2010-05-20 22:05:13 +08:00
|
|
|
import org.broadinstitute.sting.utils.Utils;
|
2010-01-03 04:19:14 +08:00
|
|
|
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
|
|
|
|
import org.broadinstitute.sting.alignment.reference.bwt.BWT;
|
|
|
|
|
import org.broadinstitute.sting.alignment.reference.bwt.BWTWriter;
|
|
|
|
|
import org.broadinstitute.sting.alignment.reference.bwt.SuffixArray;
|
|
|
|
|
import org.broadinstitute.sting.alignment.reference.bwt.SuffixArrayWriter;
|
|
|
|
|
import org.broadinstitute.sting.alignment.reference.bwt.ANNWriter;
|
|
|
|
|
import org.broadinstitute.sting.alignment.reference.bwt.AMBWriter;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
|
|
|
|
|
import net.sf.samtools.SAMSequenceDictionary;
|
|
|
|
|
import net.sf.samtools.SAMSequenceRecord;
|
2010-01-04 08:54:57 +08:00
|
|
|
import net.sf.samtools.util.StringUtil;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Support files for BWT.
|
|
|
|
|
*
|
|
|
|
|
* @author mhanna
|
|
|
|
|
* @version 0.1
|
|
|
|
|
*/
|
|
|
|
|
public class BWTFiles {
|
|
|
|
|
/**
|
|
|
|
|
* ANN (?) file name.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File annFile;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* AMB (?) file name.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File ambFile;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Packed reference sequence file.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File pacFile;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
2010-01-05 22:48:19 +08:00
|
|
|
/**
|
|
|
|
|
* Reverse of packed reference sequence file.
|
|
|
|
|
*/
|
|
|
|
|
public final File rpacFile;
|
|
|
|
|
|
2009-11-21 01:08:09 +08:00
|
|
|
/**
|
|
|
|
|
* Forward BWT file.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File forwardBWTFile;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Forward suffix array file.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File forwardSAFile;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Reverse BWT file.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File reverseBWTFile;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Reverse suffix array file.
|
|
|
|
|
*/
|
2010-01-03 04:19:14 +08:00
|
|
|
public final File reverseSAFile;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Where these files autogenerated on the fly?
|
|
|
|
|
*/
|
2010-01-05 22:48:19 +08:00
|
|
|
public final boolean autogenerated;
|
2009-11-21 01:08:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Create a new BWA configuration file using the given prefix.
|
|
|
|
|
* @param prefix Prefix to use when creating the configuration. Must not be null.
|
|
|
|
|
*/
|
|
|
|
|
public BWTFiles(String prefix) {
|
|
|
|
|
if(prefix == null)
|
2010-09-10 07:21:17 +08:00
|
|
|
throw new GATKException("Prefix must not be null.");
|
2010-01-03 04:19:14 +08:00
|
|
|
annFile = new File(prefix + ".ann");
|
|
|
|
|
ambFile = new File(prefix + ".amb");
|
|
|
|
|
pacFile = new File(prefix + ".pac");
|
2010-01-05 22:48:19 +08:00
|
|
|
rpacFile = new File(prefix + ".rpac");
|
2010-01-03 04:19:14 +08:00
|
|
|
forwardBWTFile = new File(prefix + ".bwt");
|
|
|
|
|
forwardSAFile = new File(prefix + ".sa");
|
|
|
|
|
reverseBWTFile = new File(prefix + ".rbwt");
|
|
|
|
|
reverseSAFile = new File(prefix + ".rsa");
|
|
|
|
|
autogenerated = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Hand-create a new BWTFiles object, specifying a unique file object for each type.
|
|
|
|
|
* @param annFile ANN (alternate dictionary) file.
|
|
|
|
|
* @param ambFile AMB (holes) files.
|
|
|
|
|
* @param pacFile Packed representation of the forward reference sequence.
|
|
|
|
|
* @param forwardBWTFile BWT representation of the forward reference sequence.
|
|
|
|
|
* @param forwardSAFile SA representation of the forward reference sequence.
|
2010-01-05 22:48:19 +08:00
|
|
|
* @param rpacFile Packed representation of the reversed reference sequence.
|
2010-01-03 04:19:14 +08:00
|
|
|
* @param reverseBWTFile BWT representation of the reversed reference sequence.
|
|
|
|
|
* @param reverseSAFile SA representation of the reversed reference sequence.
|
|
|
|
|
*/
|
|
|
|
|
private BWTFiles(File annFile,
|
|
|
|
|
File ambFile,
|
|
|
|
|
File pacFile,
|
|
|
|
|
File forwardBWTFile,
|
|
|
|
|
File forwardSAFile,
|
2010-01-05 22:48:19 +08:00
|
|
|
File rpacFile,
|
2010-01-03 04:19:14 +08:00
|
|
|
File reverseBWTFile,
|
|
|
|
|
File reverseSAFile) {
|
|
|
|
|
this.annFile = annFile;
|
|
|
|
|
this.ambFile = ambFile;
|
|
|
|
|
this.pacFile = pacFile;
|
|
|
|
|
this.forwardBWTFile = forwardBWTFile;
|
|
|
|
|
this.forwardSAFile = forwardSAFile;
|
2010-01-05 22:48:19 +08:00
|
|
|
this.rpacFile = rpacFile;
|
2010-01-03 04:19:14 +08:00
|
|
|
this.reverseBWTFile = reverseBWTFile;
|
|
|
|
|
this.reverseSAFile = reverseSAFile;
|
|
|
|
|
autogenerated = true;
|
2009-11-21 01:08:09 +08:00
|
|
|
}
|
2010-01-03 04:19:14 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Close out this files object, in the process deleting any temporary filse
|
|
|
|
|
* that were created.
|
|
|
|
|
*/
|
|
|
|
|
public void close() {
|
|
|
|
|
if(autogenerated) {
|
|
|
|
|
boolean success = true;
|
|
|
|
|
success = annFile.delete();
|
|
|
|
|
success &= ambFile.delete();
|
|
|
|
|
success &= pacFile.delete();
|
|
|
|
|
success &= forwardBWTFile.delete();
|
|
|
|
|
success &= forwardSAFile.delete();
|
2010-01-05 22:48:19 +08:00
|
|
|
success &= rpacFile.delete();
|
2010-01-03 04:19:14 +08:00
|
|
|
success &= reverseBWTFile.delete();
|
|
|
|
|
success &= reverseSAFile.delete();
|
|
|
|
|
|
|
|
|
|
if(!success)
|
2010-09-10 07:21:17 +08:00
|
|
|
throw new GATKException("Unable to clean up autogenerated representation");
|
2010-01-03 04:19:14 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Create a new set of BWT files from the given reference sequence.
|
|
|
|
|
* @param referenceSequence Sequence from which to build metadata.
|
|
|
|
|
* @return A new object representing encoded representations of each sequence.
|
|
|
|
|
*/
|
|
|
|
|
public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
|
2010-01-04 08:54:57 +08:00
|
|
|
byte[] normalizedReferenceSequence = new byte[referenceSequence.length];
|
|
|
|
|
System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length);
|
|
|
|
|
normalizeReferenceSequence(normalizedReferenceSequence);
|
|
|
|
|
|
2010-01-03 04:19:14 +08:00
|
|
|
File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile;
|
|
|
|
|
try {
|
|
|
|
|
// Write the ann and amb for this reference sequence.
|
2010-01-04 08:54:57 +08:00
|
|
|
annFile = File.createTempFile("bwt",".ann");
|
|
|
|
|
ambFile = File.createTempFile("bwt",".amb");
|
2010-01-03 04:19:14 +08:00
|
|
|
|
|
|
|
|
SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
|
2010-01-04 08:54:57 +08:00
|
|
|
dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));
|
2010-01-03 04:19:14 +08:00
|
|
|
|
|
|
|
|
ANNWriter annWriter = new ANNWriter(annFile);
|
|
|
|
|
annWriter.write(dictionary);
|
|
|
|
|
annWriter.close();
|
|
|
|
|
|
|
|
|
|
AMBWriter ambWriter = new AMBWriter(ambFile);
|
|
|
|
|
ambWriter.writeEmpty(dictionary);
|
|
|
|
|
ambWriter.close();
|
|
|
|
|
|
|
|
|
|
// Write the encoded files for the forward version of this reference sequence.
|
2010-01-04 08:54:57 +08:00
|
|
|
pacFile = File.createTempFile("bwt",".pac");
|
|
|
|
|
bwtFile = File.createTempFile("bwt",".bwt");
|
|
|
|
|
saFile = File.createTempFile("bwt",".sa");
|
2010-01-03 04:19:14 +08:00
|
|
|
|
2010-01-04 08:54:57 +08:00
|
|
|
writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);
|
2010-01-03 04:19:14 +08:00
|
|
|
|
|
|
|
|
// Write the encoded files for the reverse version of this reference sequence.
|
2010-05-20 22:05:13 +08:00
|
|
|
byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence);
|
2010-01-03 04:19:14 +08:00
|
|
|
|
2010-01-04 08:54:57 +08:00
|
|
|
rpacFile = File.createTempFile("bwt",".rpac");
|
|
|
|
|
rbwtFile = File.createTempFile("bwt",".rbwt");
|
|
|
|
|
rsaFile = File.createTempFile("bwt",".rsa");
|
2010-01-03 04:19:14 +08:00
|
|
|
|
|
|
|
|
writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);
|
|
|
|
|
}
|
|
|
|
|
catch(IOException ex) {
|
2010-09-10 07:21:17 +08:00
|
|
|
throw new GATKException("Unable to write autogenerated reference sequence to temporary files");
|
2010-01-03 04:19:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Make sure that, at the very least, all temporary files are deleted on exit.
|
|
|
|
|
annFile.deleteOnExit();
|
|
|
|
|
ambFile.deleteOnExit();
|
|
|
|
|
pacFile.deleteOnExit();
|
|
|
|
|
bwtFile.deleteOnExit();
|
|
|
|
|
saFile.deleteOnExit();
|
|
|
|
|
rpacFile.deleteOnExit();
|
|
|
|
|
rbwtFile.deleteOnExit();
|
|
|
|
|
rsaFile.deleteOnExit();
|
|
|
|
|
|
2010-01-05 22:48:19 +08:00
|
|
|
return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
|
2010-01-03 04:19:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Write the encoded form of the reference sequence. In the case of BWA, the encoded reference
|
|
|
|
|
* sequence is the reference itself in PAC format, the BWT, and the suffix array.
|
|
|
|
|
* @param referenceSequence The reference sequence to encode.
|
|
|
|
|
* @param pacFile Target for the PAC-encoded reference.
|
|
|
|
|
* @param bwtFile Target for the BWT representation of the reference.
|
|
|
|
|
* @param suffixArrayFile Target for the suffix array encoding of the reference.
|
|
|
|
|
* @throws java.io.IOException In case of issues writing to the file.
|
|
|
|
|
*/
|
|
|
|
|
private static void writeEncodedReferenceSequence(byte[] referenceSequence,
|
|
|
|
|
File pacFile,
|
|
|
|
|
File bwtFile,
|
|
|
|
|
File suffixArrayFile) throws IOException {
|
|
|
|
|
PackUtils.writeReferenceSequence(pacFile,referenceSequence);
|
|
|
|
|
|
|
|
|
|
BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
|
|
|
|
|
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
|
|
|
|
bwtWriter.write(bwt);
|
|
|
|
|
bwtWriter.close();
|
|
|
|
|
|
|
|
|
|
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
|
|
|
|
|
SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
|
|
|
|
|
suffixArrayWriter.write(suffixArray);
|
|
|
|
|
suffixArrayWriter.close();
|
2010-01-04 08:54:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Convert the given reference sequence into a form suitable for building into
|
|
|
|
|
* on-the-fly sequences.
|
|
|
|
|
* @param referenceSequence The reference sequence to normalize.
|
2010-09-10 23:25:30 +08:00
|
|
|
* @throws GATKException if normalized sequence cannot be generated.
|
2010-01-04 08:54:57 +08:00
|
|
|
*/
|
|
|
|
|
private static void normalizeReferenceSequence(byte[] referenceSequence) {
|
|
|
|
|
StringUtil.toUpperCase(referenceSequence);
|
|
|
|
|
for(byte base: referenceSequence) {
|
|
|
|
|
if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
|
2010-09-10 07:21:17 +08:00
|
|
|
throw new GATKException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
|
2010-01-04 08:54:57 +08:00
|
|
|
}
|
|
|
|
|
}
|
2009-11-21 01:08:09 +08:00
|
|
|
}
|