From b6ecc9e15194e2e89cad90f0a6576069b8b97f90 Mon Sep 17 00:00:00 2001 From: hanna Date: Sat, 2 Jan 2010 20:19:14 +0000 Subject: [PATCH] Support for ad-hoc reference sequences. Also reenabled BWA/Java integration test, which was commented out and the data backing it up deleted without my knowledge. Unfortunately, since the data was deleted, I had to regenerate the data and a new md5. Hopefully the aligner output is still correct. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2493 348d0f76-0448-11de-a6fe-93d51630548a --- ...tute_sting_alignment_bwa_c_BWACAligner.cpp | 53 ++++-- .../sting/alignment/bwa/BWAAligner.java | 26 ++- .../sting/alignment/bwa/BWTFiles.java | 177 ++++++++++++++++-- .../sting/alignment/bwa/c/BWACAligner.java | 26 ++- .../alignment/bwa/java/BWAJavaAligner.java | 22 ++- .../sting/alignment/reference/bwt/BWT.java | 19 ++ .../alignment/reference/bwt/BWTWriter.java | 8 +- .../reference/bwt/CreateBWTFromReference.java | 95 ++-------- .../alignment/reference/bwt/SuffixArray.java | 75 ++++++++ .../reference/bwt/SuffixArrayWriter.java | 6 +- .../packing/CreatePACFromReference.java | 15 +- .../reference/packing/PackUtils.java | 24 +++ .../alignment/AlignerIntegrationTest.java | 4 +- 13 files changed, 391 insertions(+), 159 deletions(-) diff --git a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp index 14a1c04c4..1ccbef0d4 100644 --- a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp +++ b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp @@ -12,27 +12,27 @@ typedef void (BWA::*int_setter)(int value); typedef void (BWA::*float_setter)(float value); static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment); -static jstring get_configuration_string(JNIEnv* env, jobject configuration, const char* field_name); +static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name); static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter); static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter); static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message); JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration) { - jstring java_ann = get_configuration_string(env,bwtFiles,"annFileName"); - if(env->ExceptionCheck()) return 0L; - jstring java_amb = get_configuration_string(env,bwtFiles,"ambFileName"); - if(env->ExceptionCheck()) return 0L; - jstring java_pac = get_configuration_string(env,bwtFiles,"pacFileName"); - if(env->ExceptionCheck()) return 0L; - jstring java_forward_bwt = get_configuration_string(env,bwtFiles,"forwardBWTFileName"); - if(env->ExceptionCheck()) return 0L; - jstring java_forward_sa = get_configuration_string(env,bwtFiles,"forwardSAFileName"); - if(env->ExceptionCheck()) return 0L; - jstring java_reverse_bwt = get_configuration_string(env,bwtFiles,"reverseBWTFileName"); - if(env->ExceptionCheck()) return 0L; - jstring java_reverse_sa = get_configuration_string(env,bwtFiles,"reverseSAFileName"); - if(env->ExceptionCheck()) return 0L; + jstring java_ann = get_configuration_file(env,bwtFiles,"annFile"); + if(java_ann == NULL) return 0L; + jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile"); + if(java_amb == NULL) return 0L; + jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile"); + if(java_pac == NULL) return 0L; + jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile"); + if(java_forward_bwt == NULL) return 0L; + jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile"); + if(java_forward_sa == NULL) return 0L; + jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile"); + if(java_reverse_bwt == NULL) return 0L; + jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile"); + if(java_reverse_sa == NULL) return 0L; const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE); if(env->ExceptionCheck()) return 0L; @@ -332,16 +332,29 @@ static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, c return java_alignment; } -static jstring get_configuration_string(JNIEnv* env, jobject configuration, const char* field_name) { +static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) { jclass configuration_class = env->GetObjectClass(configuration); if(configuration_class == NULL) return NULL; - jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/String;"); + jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;"); if(configuration_field == NULL) return NULL; - jstring result = (jstring)env->GetObjectField(configuration,configuration_field); - env->DeleteLocalRef(configuration_class); - return result; + jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field); + + jclass file_class = env->FindClass("java/io/File"); + if(file_class == NULL) return NULL; + + jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;"); + if(path_extractor == NULL) return NULL; + + jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor); + if(path == NULL) return NULL; + + env->DeleteLocalRef(configuration_class); + env->DeleteLocalRef(file_class); + env->DeleteLocalRef(configuration_file); + + return path; } static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) { diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java index 75cde973e..0fe97b55d 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java @@ -1,11 +1,6 @@ package org.broadinstitute.sting.alignment.bwa; import org.broadinstitute.sting.alignment.Aligner; -import org.broadinstitute.sting.alignment.Alignment; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; - -import java.util.Iterator; /** * Align reads using BWA. @@ -14,6 +9,15 @@ import java.util.Iterator; * @version 0.1 */ public abstract class BWAAligner implements Aligner { + /** + * The supporting files used by BWA. + */ + protected BWTFiles bwtFiles; + + /** + * The current configuration for the BWA aligner. + */ + protected BWAConfiguration configuration; /** * Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct @@ -21,7 +25,17 @@ public abstract class BWAAligner implements Aligner { * @param bwtFiles The many files representing BWTs persisted to disk. * @param configuration Configuration parameters for the alignment. */ - public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) { } + public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) { + this.bwtFiles = bwtFiles; + this.configuration = configuration; + } + + /** + * Close the existing aligner, freeing resources it consumed. + */ + public void close() { + bwtFiles.close(); + } /** * Update the configuration passed to the BWA aligner. diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java index b745528b6..d156c015b 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java @@ -1,6 +1,19 @@ package org.broadinstitute.sting.alignment.bwa; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.alignment.reference.packing.PackUtils; +import org.broadinstitute.sting.alignment.reference.bwt.BWT; +import org.broadinstitute.sting.alignment.reference.bwt.BWTWriter; +import org.broadinstitute.sting.alignment.reference.bwt.SuffixArray; +import org.broadinstitute.sting.alignment.reference.bwt.SuffixArrayWriter; +import org.broadinstitute.sting.alignment.reference.bwt.ANNWriter; +import org.broadinstitute.sting.alignment.reference.bwt.AMBWriter; + +import java.io.File; +import java.io.IOException; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; /** * Support files for BWT. @@ -12,37 +25,42 @@ public class BWTFiles { /** * ANN (?) file name. */ - public final String annFileName; + public final File annFile; /** * AMB (?) file name. */ - public final String ambFileName; + public final File ambFile; /** * Packed reference sequence file. */ - public final String pacFileName; + public final File pacFile; /** * Forward BWT file. */ - public final String forwardBWTFileName; + public final File forwardBWTFile; /** * Forward suffix array file. */ - public final String forwardSAFileName; + public final File forwardSAFile; /** * Reverse BWT file. */ - public final String reverseBWTFileName; + public final File reverseBWTFile; /** * Reverse suffix array file. */ - public final String reverseSAFileName; + public final File reverseSAFile; + + /** + * Where these files autogenerated on the fly? + */ + private final boolean autogenerated; /** * Create a new BWA configuration file using the given prefix. @@ -51,12 +69,143 @@ public class BWTFiles { public BWTFiles(String prefix) { if(prefix == null) throw new StingException("Prefix must not be null."); - annFileName = prefix + ".ann"; - ambFileName = prefix + ".amb"; - pacFileName = prefix + ".pac"; - forwardBWTFileName = prefix + ".bwt"; - forwardSAFileName = prefix + ".sa"; - reverseBWTFileName = prefix + ".rbwt"; - reverseSAFileName = prefix + ".rsa"; + annFile = new File(prefix + ".ann"); + ambFile = new File(prefix + ".amb"); + pacFile = new File(prefix + ".pac"); + forwardBWTFile = new File(prefix + ".bwt"); + forwardSAFile = new File(prefix + ".sa"); + reverseBWTFile = new File(prefix + ".rbwt"); + reverseSAFile = new File(prefix + ".rsa"); + autogenerated = false; } + + /** + * Hand-create a new BWTFiles object, specifying a unique file object for each type. + * @param annFile ANN (alternate dictionary) file. + * @param ambFile AMB (holes) files. + * @param pacFile Packed representation of the forward reference sequence. + * @param forwardBWTFile BWT representation of the forward reference sequence. + * @param forwardSAFile SA representation of the forward reference sequence. + * @param reverseBWTFile BWT representation of the reversed reference sequence. + * @param reverseSAFile SA representation of the reversed reference sequence. + */ + private BWTFiles(File annFile, + File ambFile, + File pacFile, + File forwardBWTFile, + File forwardSAFile, + File reverseBWTFile, + File reverseSAFile) { + this.annFile = annFile; + this.ambFile = ambFile; + this.pacFile = pacFile; + this.forwardBWTFile = forwardBWTFile; + this.forwardSAFile = forwardSAFile; + this.reverseBWTFile = reverseBWTFile; + this.reverseSAFile = reverseSAFile; + autogenerated = true; + } + + /** + * Close out this files object, in the process deleting any temporary filse + * that were created. + */ + public void close() { + if(autogenerated) { + boolean success = true; + success = annFile.delete(); + success &= ambFile.delete(); + success &= pacFile.delete(); + success &= forwardBWTFile.delete(); + success &= forwardSAFile.delete(); + success &= reverseBWTFile.delete(); + success &= reverseSAFile.delete(); + + if(!success) + throw new StingException("Unable to clean up autogenerated representation"); + } + } + + /** + * Create a new set of BWT files from the given reference sequence. + * @param referenceSequence Sequence from which to build metadata. + * @return A new object representing encoded representations of each sequence. + */ + public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) { + File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile; + try { + // Write the ann and amb for this reference sequence. + annFile = File.createTempFile("bwt","ann"); + ambFile = File.createTempFile("bwt","amb"); + + SAMSequenceDictionary dictionary = new SAMSequenceDictionary(); + dictionary.addSequence(new SAMSequenceRecord("autogenerated",referenceSequence.length)); + + ANNWriter annWriter = new ANNWriter(annFile); + annWriter.write(dictionary); + annWriter.close(); + + AMBWriter ambWriter = new AMBWriter(ambFile); + ambWriter.writeEmpty(dictionary); + ambWriter.close(); + + // Write the encoded files for the forward version of this reference sequence. + pacFile = File.createTempFile("bwt","pac"); + bwtFile = File.createTempFile("bwt","bwt"); + saFile = File.createTempFile("bwt","sa"); + + writeEncodedReferenceSequence(referenceSequence,pacFile,bwtFile,saFile); + + // Write the encoded files for the reverse version of this reference sequence. + byte[] reverseReferenceSequence = new byte[referenceSequence.length]; + System.arraycopy(referenceSequence,0,reverseReferenceSequence,0,referenceSequence.length); + + rpacFile = File.createTempFile("bwt","rpac"); + rbwtFile = File.createTempFile("bwt","rbwt"); + rsaFile = File.createTempFile("bwt","rsa"); + + writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile); + } + catch(IOException ex) { + throw new StingException("Unable to write autogenerated reference sequence to temporary files"); + } + + // Make sure that, at the very least, all temporary files are deleted on exit. + annFile.deleteOnExit(); + ambFile.deleteOnExit(); + pacFile.deleteOnExit(); + bwtFile.deleteOnExit(); + saFile.deleteOnExit(); + rpacFile.deleteOnExit(); + rbwtFile.deleteOnExit(); + rsaFile.deleteOnExit(); + + return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rbwtFile,rsaFile); + } + + /** + * Write the encoded form of the reference sequence. In the case of BWA, the encoded reference + * sequence is the reference itself in PAC format, the BWT, and the suffix array. + * @param referenceSequence The reference sequence to encode. + * @param pacFile Target for the PAC-encoded reference. + * @param bwtFile Target for the BWT representation of the reference. + * @param suffixArrayFile Target for the suffix array encoding of the reference. + * @throws java.io.IOException In case of issues writing to the file. + */ + private static void writeEncodedReferenceSequence(byte[] referenceSequence, + File pacFile, + File bwtFile, + File suffixArrayFile) throws IOException { + PackUtils.writeReferenceSequence(pacFile,referenceSequence); + + BWT bwt = BWT.createFromReferenceSequence(referenceSequence); + BWTWriter bwtWriter = new BWTWriter(bwtFile); + bwtWriter.write(bwt); + bwtWriter.close(); + + SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence); + SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile); + suffixArrayWriter.write(suffixArray); + suffixArrayWriter.close(); + } } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java index 9606295b4..69ca2bfcd 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java @@ -3,14 +3,12 @@ package org.broadinstitute.sting.alignment.bwa.c; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.alignment.Alignment; import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.BWAAligner; import java.util.*; -import java.io.File; /** * An aligner using the BWA/C implementation. @@ -33,17 +31,26 @@ public class BWACAligner extends BWAAligner { if(thunkPointer != 0) throw new StingException("BWA/C attempting to reinitialize."); - if(!new File(bwtFiles.annFileName).exists()) throw new StingException("ANN file is missing; please rerun 'bwa aln' to regenerate it."); - if(!new File(bwtFiles.ambFileName).exists()) throw new StingException("AMB file is missing; please rerun 'bwa aln' to regenerate it."); - if(!new File(bwtFiles.pacFileName).exists()) throw new StingException("PAC file is missing; please rerun 'bwa aln' to regenerate it."); - if(!new File(bwtFiles.forwardBWTFileName).exists()) throw new StingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it."); - if(!new File(bwtFiles.forwardSAFileName).exists()) throw new StingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it."); - if(!new File(bwtFiles.reverseBWTFileName).exists()) throw new StingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it."); - if(!new File(bwtFiles.reverseSAFileName).exists()) throw new StingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.annFile.exists()) throw new StingException("ANN file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.ambFile.exists()) throw new StingException("AMB file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.pacFile.exists()) throw new StingException("PAC file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.forwardBWTFile.exists()) throw new StingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.forwardSAFile.exists()) throw new StingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.reverseBWTFile.exists()) throw new StingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.reverseSAFile.exists()) throw new StingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it."); thunkPointer = create(bwtFiles,configuration); } + /** + * Create an aligner object using an array of bytes as a reference. + * @param referenceSequence Reference sequence to encode ad-hoc. + * @param configuration Configuration for the given aligner. + */ + public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) { + this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration); + } + /** * Update the configuration passed to the BWA aligner. * @param configuration New configuration to set. @@ -63,6 +70,7 @@ public class BWACAligner extends BWAAligner { if(thunkPointer == 0) throw new StingException("BWA/C close attempted, but BWA/C is not properly initialized."); destroy(thunkPointer); + super.close(); } /** diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java index 46393a68a..15aa7f49f 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java @@ -1,11 +1,9 @@ package org.broadinstitute.sting.alignment.bwa.java; import org.broadinstitute.sting.alignment.reference.bwt.*; -import org.broadinstitute.sting.alignment.bwa.java.LowerBound; -import org.broadinstitute.sting.alignment.bwa.java.BWAAlignment; -import org.broadinstitute.sting.alignment.bwa.java.AlignmentState; +import org.broadinstitute.sting.alignment.bwa.BWAAligner; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.alignment.Aligner; import org.broadinstitute.sting.utils.BaseUtils; import java.io.File; @@ -20,7 +18,7 @@ import net.sf.samtools.SAMFileHeader; * @author mhanna * @version 0.1 */ -public class BWAJavaAligner implements Aligner { +public class BWAJavaAligner extends BWAAligner { /** * BWT in the forward direction. */ @@ -77,6 +75,7 @@ public class BWAJavaAligner implements Aligner { public final int INDEL_END_SKIP = 5; public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) { + super(null,null); forwardBWT = new BWTReader(forwardBWTFile).read(); reverseBWT = new BWTReader(reverseBWTFile).read(); forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read(); @@ -86,7 +85,18 @@ public class BWAJavaAligner implements Aligner { /** * Close this instance of the BWA pointer and delete its resources. */ - public void close() { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } + @Override + public void close() { + super.close(); + } + + /** + * Update the current parameters of this aligner. + * @param configuration New configuration to set. + */ + public void updateConfiguration(BWAConfiguration configuration) { + throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed."); + } /** * Allow the aligner to choose one alignment randomly from the pile of best alignments. diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java index dac8f1523..ca1720339 100644 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java @@ -98,6 +98,25 @@ public class BWT { return counts.getTotal(); } + /** + * Create a new BWT from the given reference sequence. + * @param referenceSequence Sequence from which to derive the BWT. + * @return reference sequence-derived BWT. + */ + public static BWT createFromReferenceSequence(byte[] referenceSequence) { + SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence); + + byte[] bwt = new byte[(int)suffixArray.length()-1]; + int bwtIndex = 0; + for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) { + if(suffixArray.get(suffixArrayIndex) == 0) + continue; + bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1]; + } + + return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt); + } + /** * Gets the base at a given position in the BWT. * @param index The index to use. diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java index 772a194bb..113eff4fa 100644 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java @@ -20,12 +20,12 @@ public class BWTWriter { private final OutputStream outputStream; /** - * Create a new BWT reader. - * @param inputFile File in which the BWT is stored. + * Create a new BWT writer. + * @param outputFile File in which the BWT is stored. */ - public BWTWriter( File inputFile ) { + public BWTWriter( File outputFile ) { try { - this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile)); + this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); } catch( FileNotFoundException ex ) { throw new StingException("Unable to open output file", ex); diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java index 29ac6e790..aa4033231 100755 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java @@ -28,11 +28,8 @@ package org.broadinstitute.sting.alignment.reference.bwt; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.picard.reference.ReferenceSequence; -import net.sf.samtools.util.StringUtil; import java.io.*; -import java.util.TreeSet; -import java.util.Comparator; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.alignment.reference.packing.PackUtils; @@ -44,45 +41,29 @@ import org.broadinstitute.sting.alignment.reference.packing.PackUtils; * @version 0.1 */ public class CreateBWTFromReference { - private String loadReference( File inputFile ) { + private byte[] loadReference( File inputFile ) { // Read in the first sequence in the input file ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); ReferenceSequence sequence = reference.nextSequence(); - return StringUtil.bytesToString(sequence.getBases()); + return sequence.getBases(); } - private String loadReverseReference( File inputFile ) { + private byte[] loadReverseReference( File inputFile ) { ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); ReferenceSequence sequence = reference.nextSequence(); PackUtils.reverse(sequence.getBases()); - return StringUtil.bytesToString(sequence.getBases()); + return sequence.getBases(); } - private Counts countOccurrences( String sequence ) { + private Counts countOccurrences( byte[] sequence ) { Counts occurrences = new Counts(); - for( char base: sequence.toCharArray() ) - occurrences.increment((byte)base); + for( byte base: sequence ) + occurrences.increment(base); return occurrences; } - private long[] createSuffixArray( String sequence ) { - TreeSet suffixArrayBuilder = new TreeSet( new SuffixArrayComparator(sequence) ); - - // Build out the suffix array using a custom comparator. - System.out.printf("Creating sequence array of length %d%n", sequence.length() ); - for( int i = 0; i <= sequence.length(); i++ ) { - suffixArrayBuilder.add(i); - if( i % 100000 == 0 ) - System.out.printf("Added sequence %d%n", i); - } - - // Copy the suffix array into an int array. - long[] suffixArray = new long[suffixArrayBuilder.size()]; - int i = 0; - for( Integer element: suffixArrayBuilder ) - suffixArray[i++] = element; - - return suffixArray; + private long[] createSuffixArray( byte[] sequence ) { + return SuffixArray.createFromReferenceSequence(sequence).sequence; } private long[] invertSuffixArray( long[] suffixArray ) { @@ -107,17 +88,6 @@ public class CreateBWTFromReference { return inverseCompressedSuffixArray; } - private byte[] createBWT( String sequence, long[] suffixArray ) { - byte[] bwt = new byte[suffixArray.length-1]; - int i = 0; - for( long suffixArrayEntry: suffixArray ) { - if( suffixArrayEntry == 0 ) - continue; - bwt[i++] = (byte)sequence.charAt((int)suffixArrayEntry-1); - } - return bwt; - } - public static void main( String argv[] ) throws IOException { if( argv.length != 5 ) { System.out.println("USAGE: CreateBWTFromReference .fasta "); @@ -141,8 +111,8 @@ public class CreateBWTFromReference { CreateBWTFromReference creator = new CreateBWTFromReference(); - String sequence = creator.loadReference(inputFile); - String reverseSequence = creator.loadReverseReference(inputFile); + byte[] sequence = creator.loadReference(inputFile); + byte[] reverseSequence = creator.loadReverseReference(inputFile); // Count the occurences of each given base. Counts occurrences = creator.countOccurrences(sequence); @@ -162,13 +132,6 @@ public class CreateBWTFromReference { SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData ); SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData ); - /* - for( int i = 0; i < 8; i++ ) - System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length()))); - for( int i = 0; i < 8; i++ ) - System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length()))); - */ - /* // Create the data structure for the compressed suffix array and print diagnostics. int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray); @@ -186,8 +149,8 @@ public class CreateBWTFromReference { */ // Create the BWT. - BWT bwt = new BWT(inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence)); - BWT reverseBWT = new BWT( reverseInverseSuffixArray[0], occurrences, creator.createBWT(reverseSequence, reverseSuffixArray.sequence)); + BWT bwt = BWT.createFromReferenceSequence(sequence); + BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence); byte[] bwtSequence = bwt.getSequence(); System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length()); @@ -234,36 +197,4 @@ public class CreateBWTFromReference { } } - /** - * Compares two suffix arrays of the given sequence. Will return whichever string appears - * first in lexicographic order. - */ - public static class SuffixArrayComparator implements Comparator { - /** - * The data source for all suffix arrays. - */ - private final String sequence; - - /** - * Create a new comparator. - * @param sequence Reference sequence to use as basis for comparison. - */ - public SuffixArrayComparator( String sequence ) { - this.sequence = sequence; - } - - /** - * Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of - * the two substrings sequence[lhs:], sequence[rhs:]. - * @param lhs Left-hand side of comparison. - * @param rhs Right-hand side of comparison. - * @return How the suffix arrays represented by lhs, rhs compare. - */ - public int compare( Integer lhs, Integer rhs ) { - String lhsSuffixArray = sequence.substring(lhs); - String rhsSuffixArray = sequence.substring(rhs); - return lhsSuffixArray.compareTo(rhsSuffixArray); - } - } - } diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java index 0f808be2a..c4968cff5 100644 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java @@ -2,6 +2,11 @@ package org.broadinstitute.sting.alignment.reference.bwt; import org.broadinstitute.sting.utils.StingException; +import java.util.Comparator; +import java.util.TreeSet; + +import net.sf.samtools.util.StringUtil; + /** * An in-memory representation of a suffix array. * @@ -81,4 +86,74 @@ public class SuffixArray { } return (sequence[(int)(index/sequenceInterval)]+iterations) % length(); } + + /** + * Create a suffix array from a given reference sequence. + * @param sequence The reference sequence to use when building the suffix array. + * @return a constructed suffix array. + */ + public static SuffixArray createFromReferenceSequence(byte[] sequence) { + // The builder for the suffix array. Use an integer in this case because + // Java arrays can only hold an integer. + TreeSet suffixArrayBuilder = new TreeSet(new SuffixArrayComparator(sequence)); + + Counts occurrences = new Counts(); + for( byte base: sequence ) + occurrences.increment(base); + + // Build out the suffix array using a custom comparator. + for( int i = 0; i <= sequence.length; i++ ) + suffixArrayBuilder.add(i); + + // Copy the suffix array into an array. + long[] suffixArray = new long[suffixArrayBuilder.size()]; + int i = 0; + for( Integer element: suffixArrayBuilder ) + suffixArray[i++] = element; + + // Find the first element in the inverse suffix array. + long inverseSA0 = -1; + for(i = 0; i < sequence.length; i++) { + if(suffixArray[i] == 0) + inverseSA0 = i; + } + if(inverseSA0 < 0) + throw new StingException("Unable to find first inverse SA entry in generated suffix array."); + + return new SuffixArray(inverseSA0,occurrences,suffixArray); + } + + /** + * Compares two suffix arrays of the given sequence. Will return whichever string appears + * first in lexicographic order. + */ + private static class SuffixArrayComparator implements Comparator { + /** + * The data source for all suffix arrays. + */ + private final String sequence; + + /** + * Create a new comparator. + * @param sequence Reference sequence to use as basis for comparison. + */ + public SuffixArrayComparator( byte[] sequence ) { + // Processing the suffix array tends to be easier as a string. + this.sequence = StringUtil.bytesToString(sequence); + } + + /** + * Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of + * the two substrings sequence[lhs:], sequence[rhs:]. + * @param lhs Left-hand side of comparison. + * @param rhs Right-hand side of comparison. + * @return How the suffix arrays represented by lhs, rhs compare. + */ + public int compare( Integer lhs, Integer rhs ) { + String lhsSuffixArray = sequence.substring(lhs); + String rhsSuffixArray = sequence.substring(rhs); + return lhsSuffixArray.compareTo(rhsSuffixArray); + } + } + } diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java index 84469cc21..781d54965 100644 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java @@ -20,11 +20,11 @@ public class SuffixArrayWriter { /** * Create a new suffix array reader. - * @param inputFile File in which the suffix array is stored. + * @param outputFile File in which the suffix array is stored. */ - public SuffixArrayWriter( File inputFile ) { + public SuffixArrayWriter( File outputFile ) { try { - this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile)); + this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); } catch( FileNotFoundException ex ) { throw new StingException("Unable to open input file", ex); diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java index 06c822955..8211c97d8 100755 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java @@ -53,23 +53,12 @@ public class CreatePACFromReference { ReferenceSequence sequence = reference.nextSequence(); // Target file for output - writeSequence( new File(argv[1]), sequence.getBases() ); + PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() ); // Reverse the bases in the reference PackUtils.reverse(sequence.getBases()); // Target file for output - writeSequence( new File(argv[2]), sequence.getBases() ); - } - - private static void writeSequence( File outputFile, byte[] bases ) throws IOException { - OutputStream outputStream = new FileOutputStream(outputFile); - - BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Byte.class, outputStream, ByteOrder.BIG_ENDIAN); - basePackedOutputStream.write(bases); - - outputStream.write(bases.length%PackUtils.ALPHABET_SIZE); - - outputStream.close(); + PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() ); } } diff --git a/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java b/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java index 0c5d29fd6..abe2ae7e2 100644 --- a/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java @@ -2,6 +2,12 @@ package org.broadinstitute.sting.alignment.reference.packing; import org.broadinstitute.sting.utils.StingException; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.FileOutputStream; +import java.nio.ByteOrder; + /** * Utilities designed for packing / unpacking bases. * @@ -24,6 +30,24 @@ public class PackUtils { */ public static final int BITS_PER_BYTE = 8; + /** + * Writes a reference sequence to a PAC file. + * @param outputFile Filename for the PAC file. + * @param referenceSequence Reference sequence to write. + * @throws IOException If there's a problem writing to the output file. + */ + public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException { + OutputStream outputStream = new FileOutputStream(outputFile); + + BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Byte.class, outputStream, ByteOrder.BIG_ENDIAN); + basePackedOutputStream.write(referenceSequence); + + outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE); + + outputStream.close(); + } + + /** * How many bits can a given type hold? * @param type Type to test. diff --git a/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java b/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java index 9eccd3d71..374e5ee51 100644 --- a/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java @@ -16,7 +16,7 @@ import java.io.File; public class AlignerIntegrationTest extends WalkerTest { @Test public void testBasicAlignment() { - String md5 = "ed098018ffcbd055bee966466044428a"; + String md5 = "c6d95d8ae707e78fefdaa7375f130995"; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta" + " -T Align" + @@ -24,6 +24,6 @@ public class AlignerIntegrationTest extends WalkerTest { " -ob %s", 1, // just one output file Arrays.asList(md5)); - //executeTest("testBasicAlignment", spec); + executeTest("testBasicAlignment", spec); } }