Support for ad-hoc reference sequences. Also reenabled BWA/Java integration test, which was commented out

and the data backing it up deleted without my knowledge.  Unfortunately, since the data was deleted, I had
to regenerate the data and a new md5.  Hopefully the aligner output is still correct.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2493 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-01-02 20:19:14 +00:00
parent 89f3ee614a
commit b6ecc9e151
13 changed files with 391 additions and 159 deletions

View File

@ -12,27 +12,27 @@ typedef void (BWA::*int_setter)(int value);
typedef void (BWA::*float_setter)(float value);
static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
static jstring get_configuration_string(JNIEnv* env, jobject configuration, const char* field_name);
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration)
{
jstring java_ann = get_configuration_string(env,bwtFiles,"annFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_amb = get_configuration_string(env,bwtFiles,"ambFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_pac = get_configuration_string(env,bwtFiles,"pacFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_forward_bwt = get_configuration_string(env,bwtFiles,"forwardBWTFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_forward_sa = get_configuration_string(env,bwtFiles,"forwardSAFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_reverse_bwt = get_configuration_string(env,bwtFiles,"reverseBWTFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_reverse_sa = get_configuration_string(env,bwtFiles,"reverseSAFileName");
if(env->ExceptionCheck()) return 0L;
jstring java_ann = get_configuration_file(env,bwtFiles,"annFile");
if(java_ann == NULL) return 0L;
jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile");
if(java_amb == NULL) return 0L;
jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile");
if(java_pac == NULL) return 0L;
jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile");
if(java_forward_bwt == NULL) return 0L;
jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile");
if(java_forward_sa == NULL) return 0L;
jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile");
if(java_reverse_bwt == NULL) return 0L;
jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile");
if(java_reverse_sa == NULL) return 0L;
const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
@ -332,16 +332,29 @@ static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, c
return java_alignment;
}
static jstring get_configuration_string(JNIEnv* env, jobject configuration, const char* field_name) {
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) {
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return NULL;
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/String;");
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;");
if(configuration_field == NULL) return NULL;
jstring result = (jstring)env->GetObjectField(configuration,configuration_field);
env->DeleteLocalRef(configuration_class);
return result;
jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field);
jclass file_class = env->FindClass("java/io/File");
if(file_class == NULL) return NULL;
jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;");
if(path_extractor == NULL) return NULL;
jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor);
if(path == NULL) return NULL;
env->DeleteLocalRef(configuration_class);
env->DeleteLocalRef(file_class);
env->DeleteLocalRef(configuration_file);
return path;
}
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {

View File

@ -1,11 +1,6 @@
package org.broadinstitute.sting.alignment.bwa;
import org.broadinstitute.sting.alignment.Aligner;
import org.broadinstitute.sting.alignment.Alignment;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import java.util.Iterator;
/**
* Align reads using BWA.
@ -14,6 +9,15 @@ import java.util.Iterator;
* @version 0.1
*/
public abstract class BWAAligner implements Aligner {
/**
* The supporting files used by BWA.
*/
protected BWTFiles bwtFiles;
/**
* The current configuration for the BWA aligner.
*/
protected BWAConfiguration configuration;
/**
* Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct
@ -21,7 +25,17 @@ public abstract class BWAAligner implements Aligner {
* @param bwtFiles The many files representing BWTs persisted to disk.
* @param configuration Configuration parameters for the alignment.
*/
public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) { }
public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
this.bwtFiles = bwtFiles;
this.configuration = configuration;
}
/**
* Close the existing aligner, freeing resources it consumed.
*/
public void close() {
bwtFiles.close();
}
/**
* Update the configuration passed to the BWA aligner.

View File

@ -1,6 +1,19 @@
package org.broadinstitute.sting.alignment.bwa;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.alignment.reference.bwt.BWT;
import org.broadinstitute.sting.alignment.reference.bwt.BWTWriter;
import org.broadinstitute.sting.alignment.reference.bwt.SuffixArray;
import org.broadinstitute.sting.alignment.reference.bwt.SuffixArrayWriter;
import org.broadinstitute.sting.alignment.reference.bwt.ANNWriter;
import org.broadinstitute.sting.alignment.reference.bwt.AMBWriter;
import java.io.File;
import java.io.IOException;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
/**
* Support files for BWT.
@ -12,37 +25,42 @@ public class BWTFiles {
/**
* ANN (?) file name.
*/
public final String annFileName;
public final File annFile;
/**
* AMB (?) file name.
*/
public final String ambFileName;
public final File ambFile;
/**
* Packed reference sequence file.
*/
public final String pacFileName;
public final File pacFile;
/**
* Forward BWT file.
*/
public final String forwardBWTFileName;
public final File forwardBWTFile;
/**
* Forward suffix array file.
*/
public final String forwardSAFileName;
public final File forwardSAFile;
/**
* Reverse BWT file.
*/
public final String reverseBWTFileName;
public final File reverseBWTFile;
/**
* Reverse suffix array file.
*/
public final String reverseSAFileName;
public final File reverseSAFile;
/**
* Where these files autogenerated on the fly?
*/
private final boolean autogenerated;
/**
* Create a new BWA configuration file using the given prefix.
@ -51,12 +69,143 @@ public class BWTFiles {
public BWTFiles(String prefix) {
if(prefix == null)
throw new StingException("Prefix must not be null.");
annFileName = prefix + ".ann";
ambFileName = prefix + ".amb";
pacFileName = prefix + ".pac";
forwardBWTFileName = prefix + ".bwt";
forwardSAFileName = prefix + ".sa";
reverseBWTFileName = prefix + ".rbwt";
reverseSAFileName = prefix + ".rsa";
annFile = new File(prefix + ".ann");
ambFile = new File(prefix + ".amb");
pacFile = new File(prefix + ".pac");
forwardBWTFile = new File(prefix + ".bwt");
forwardSAFile = new File(prefix + ".sa");
reverseBWTFile = new File(prefix + ".rbwt");
reverseSAFile = new File(prefix + ".rsa");
autogenerated = false;
}
/**
* Hand-create a new BWTFiles object, specifying a unique file object for each type.
* @param annFile ANN (alternate dictionary) file.
* @param ambFile AMB (holes) files.
* @param pacFile Packed representation of the forward reference sequence.
* @param forwardBWTFile BWT representation of the forward reference sequence.
* @param forwardSAFile SA representation of the forward reference sequence.
* @param reverseBWTFile BWT representation of the reversed reference sequence.
* @param reverseSAFile SA representation of the reversed reference sequence.
*/
private BWTFiles(File annFile,
File ambFile,
File pacFile,
File forwardBWTFile,
File forwardSAFile,
File reverseBWTFile,
File reverseSAFile) {
this.annFile = annFile;
this.ambFile = ambFile;
this.pacFile = pacFile;
this.forwardBWTFile = forwardBWTFile;
this.forwardSAFile = forwardSAFile;
this.reverseBWTFile = reverseBWTFile;
this.reverseSAFile = reverseSAFile;
autogenerated = true;
}
/**
* Close out this files object, in the process deleting any temporary filse
* that were created.
*/
public void close() {
if(autogenerated) {
boolean success = true;
success = annFile.delete();
success &= ambFile.delete();
success &= pacFile.delete();
success &= forwardBWTFile.delete();
success &= forwardSAFile.delete();
success &= reverseBWTFile.delete();
success &= reverseSAFile.delete();
if(!success)
throw new StingException("Unable to clean up autogenerated representation");
}
}
/**
* Create a new set of BWT files from the given reference sequence.
* @param referenceSequence Sequence from which to build metadata.
* @return A new object representing encoded representations of each sequence.
*/
public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile;
try {
// Write the ann and amb for this reference sequence.
annFile = File.createTempFile("bwt","ann");
ambFile = File.createTempFile("bwt","amb");
SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
dictionary.addSequence(new SAMSequenceRecord("autogenerated",referenceSequence.length));
ANNWriter annWriter = new ANNWriter(annFile);
annWriter.write(dictionary);
annWriter.close();
AMBWriter ambWriter = new AMBWriter(ambFile);
ambWriter.writeEmpty(dictionary);
ambWriter.close();
// Write the encoded files for the forward version of this reference sequence.
pacFile = File.createTempFile("bwt","pac");
bwtFile = File.createTempFile("bwt","bwt");
saFile = File.createTempFile("bwt","sa");
writeEncodedReferenceSequence(referenceSequence,pacFile,bwtFile,saFile);
// Write the encoded files for the reverse version of this reference sequence.
byte[] reverseReferenceSequence = new byte[referenceSequence.length];
System.arraycopy(referenceSequence,0,reverseReferenceSequence,0,referenceSequence.length);
rpacFile = File.createTempFile("bwt","rpac");
rbwtFile = File.createTempFile("bwt","rbwt");
rsaFile = File.createTempFile("bwt","rsa");
writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);
}
catch(IOException ex) {
throw new StingException("Unable to write autogenerated reference sequence to temporary files");
}
// Make sure that, at the very least, all temporary files are deleted on exit.
annFile.deleteOnExit();
ambFile.deleteOnExit();
pacFile.deleteOnExit();
bwtFile.deleteOnExit();
saFile.deleteOnExit();
rpacFile.deleteOnExit();
rbwtFile.deleteOnExit();
rsaFile.deleteOnExit();
return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rbwtFile,rsaFile);
}
/**
* Write the encoded form of the reference sequence. In the case of BWA, the encoded reference
* sequence is the reference itself in PAC format, the BWT, and the suffix array.
* @param referenceSequence The reference sequence to encode.
* @param pacFile Target for the PAC-encoded reference.
* @param bwtFile Target for the BWT representation of the reference.
* @param suffixArrayFile Target for the suffix array encoding of the reference.
* @throws java.io.IOException In case of issues writing to the file.
*/
private static void writeEncodedReferenceSequence(byte[] referenceSequence,
File pacFile,
File bwtFile,
File suffixArrayFile) throws IOException {
PackUtils.writeReferenceSequence(pacFile,referenceSequence);
BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
BWTWriter bwtWriter = new BWTWriter(bwtFile);
bwtWriter.write(bwt);
bwtWriter.close();
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
suffixArrayWriter.write(suffixArray);
suffixArrayWriter.close();
}
}

View File

@ -3,14 +3,12 @@ package org.broadinstitute.sting.alignment.bwa.c;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
import java.util.*;
import java.io.File;
/**
* An aligner using the BWA/C implementation.
@ -33,17 +31,26 @@ public class BWACAligner extends BWAAligner {
if(thunkPointer != 0)
throw new StingException("BWA/C attempting to reinitialize.");
if(!new File(bwtFiles.annFileName).exists()) throw new StingException("ANN file is missing; please rerun 'bwa aln' to regenerate it.");
if(!new File(bwtFiles.ambFileName).exists()) throw new StingException("AMB file is missing; please rerun 'bwa aln' to regenerate it.");
if(!new File(bwtFiles.pacFileName).exists()) throw new StingException("PAC file is missing; please rerun 'bwa aln' to regenerate it.");
if(!new File(bwtFiles.forwardBWTFileName).exists()) throw new StingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it.");
if(!new File(bwtFiles.forwardSAFileName).exists()) throw new StingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it.");
if(!new File(bwtFiles.reverseBWTFileName).exists()) throw new StingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it.");
if(!new File(bwtFiles.reverseSAFileName).exists()) throw new StingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.annFile.exists()) throw new StingException("ANN file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.ambFile.exists()) throw new StingException("AMB file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.pacFile.exists()) throw new StingException("PAC file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.forwardBWTFile.exists()) throw new StingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.forwardSAFile.exists()) throw new StingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.reverseBWTFile.exists()) throw new StingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.reverseSAFile.exists()) throw new StingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it.");
thunkPointer = create(bwtFiles,configuration);
}
/**
* Create an aligner object using an array of bytes as a reference.
* @param referenceSequence Reference sequence to encode ad-hoc.
* @param configuration Configuration for the given aligner.
*/
public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) {
this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration);
}
/**
* Update the configuration passed to the BWA aligner.
* @param configuration New configuration to set.
@ -63,6 +70,7 @@ public class BWACAligner extends BWAAligner {
if(thunkPointer == 0)
throw new StingException("BWA/C close attempted, but BWA/C is not properly initialized.");
destroy(thunkPointer);
super.close();
}
/**

View File

@ -1,11 +1,9 @@
package org.broadinstitute.sting.alignment.bwa.java;
import org.broadinstitute.sting.alignment.reference.bwt.*;
import org.broadinstitute.sting.alignment.bwa.java.LowerBound;
import org.broadinstitute.sting.alignment.bwa.java.BWAAlignment;
import org.broadinstitute.sting.alignment.bwa.java.AlignmentState;
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.alignment.Aligner;
import org.broadinstitute.sting.utils.BaseUtils;
import java.io.File;
@ -20,7 +18,7 @@ import net.sf.samtools.SAMFileHeader;
* @author mhanna
* @version 0.1
*/
public class BWAJavaAligner implements Aligner {
public class BWAJavaAligner extends BWAAligner {
/**
* BWT in the forward direction.
*/
@ -77,6 +75,7 @@ public class BWAJavaAligner implements Aligner {
public final int INDEL_END_SKIP = 5;
public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
super(null,null);
forwardBWT = new BWTReader(forwardBWTFile).read();
reverseBWT = new BWTReader(reverseBWTFile).read();
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
@ -86,7 +85,18 @@ public class BWAJavaAligner implements Aligner {
/**
* Close this instance of the BWA pointer and delete its resources.
*/
public void close() { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
@Override
public void close() {
super.close();
}
/**
* Update the current parameters of this aligner.
* @param configuration New configuration to set.
*/
public void updateConfiguration(BWAConfiguration configuration) {
throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed.");
}
/**
* Allow the aligner to choose one alignment randomly from the pile of best alignments.

View File

@ -98,6 +98,25 @@ public class BWT {
return counts.getTotal();
}
/**
* Create a new BWT from the given reference sequence.
* @param referenceSequence Sequence from which to derive the BWT.
* @return reference sequence-derived BWT.
*/
public static BWT createFromReferenceSequence(byte[] referenceSequence) {
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
byte[] bwt = new byte[(int)suffixArray.length()-1];
int bwtIndex = 0;
for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) {
if(suffixArray.get(suffixArrayIndex) == 0)
continue;
bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1];
}
return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt);
}
/**
* Gets the base at a given position in the BWT.
* @param index The index to use.

View File

@ -20,12 +20,12 @@ public class BWTWriter {
private final OutputStream outputStream;
/**
* Create a new BWT reader.
* @param inputFile File in which the BWT is stored.
* Create a new BWT writer.
* @param outputFile File in which the BWT is stored.
*/
public BWTWriter( File inputFile ) {
public BWTWriter( File outputFile ) {
try {
this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile));
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open output file", ex);

View File

@ -28,11 +28,8 @@ package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
import java.io.*;
import java.util.TreeSet;
import java.util.Comparator;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
@ -44,45 +41,29 @@ import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
* @version 0.1
*/
public class CreateBWTFromReference {
private String loadReference( File inputFile ) {
private byte[] loadReference( File inputFile ) {
// Read in the first sequence in the input file
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
ReferenceSequence sequence = reference.nextSequence();
return StringUtil.bytesToString(sequence.getBases());
return sequence.getBases();
}
private String loadReverseReference( File inputFile ) {
private byte[] loadReverseReference( File inputFile ) {
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
ReferenceSequence sequence = reference.nextSequence();
PackUtils.reverse(sequence.getBases());
return StringUtil.bytesToString(sequence.getBases());
return sequence.getBases();
}
private Counts countOccurrences( String sequence ) {
private Counts countOccurrences( byte[] sequence ) {
Counts occurrences = new Counts();
for( char base: sequence.toCharArray() )
occurrences.increment((byte)base);
for( byte base: sequence )
occurrences.increment(base);
return occurrences;
}
private long[] createSuffixArray( String sequence ) {
TreeSet<Integer> suffixArrayBuilder = new TreeSet<Integer>( new SuffixArrayComparator(sequence) );
// Build out the suffix array using a custom comparator.
System.out.printf("Creating sequence array of length %d%n", sequence.length() );
for( int i = 0; i <= sequence.length(); i++ ) {
suffixArrayBuilder.add(i);
if( i % 100000 == 0 )
System.out.printf("Added sequence %d%n", i);
}
// Copy the suffix array into an int array.
long[] suffixArray = new long[suffixArrayBuilder.size()];
int i = 0;
for( Integer element: suffixArrayBuilder )
suffixArray[i++] = element;
return suffixArray;
private long[] createSuffixArray( byte[] sequence ) {
return SuffixArray.createFromReferenceSequence(sequence).sequence;
}
private long[] invertSuffixArray( long[] suffixArray ) {
@ -107,17 +88,6 @@ public class CreateBWTFromReference {
return inverseCompressedSuffixArray;
}
private byte[] createBWT( String sequence, long[] suffixArray ) {
byte[] bwt = new byte[suffixArray.length-1];
int i = 0;
for( long suffixArrayEntry: suffixArray ) {
if( suffixArrayEntry == 0 )
continue;
bwt[i++] = (byte)sequence.charAt((int)suffixArrayEntry-1);
}
return bwt;
}
public static void main( String argv[] ) throws IOException {
if( argv.length != 5 ) {
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
@ -141,8 +111,8 @@ public class CreateBWTFromReference {
CreateBWTFromReference creator = new CreateBWTFromReference();
String sequence = creator.loadReference(inputFile);
String reverseSequence = creator.loadReverseReference(inputFile);
byte[] sequence = creator.loadReference(inputFile);
byte[] reverseSequence = creator.loadReverseReference(inputFile);
// Count the occurences of each given base.
Counts occurrences = creator.countOccurrences(sequence);
@ -162,13 +132,6 @@ public class CreateBWTFromReference {
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
/*
for( int i = 0; i < 8; i++ )
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
for( int i = 0; i < 8; i++ )
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
*/
/*
// Create the data structure for the compressed suffix array and print diagnostics.
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
@ -186,8 +149,8 @@ public class CreateBWTFromReference {
*/
// Create the BWT.
BWT bwt = new BWT(inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence));
BWT reverseBWT = new BWT( reverseInverseSuffixArray[0], occurrences, creator.createBWT(reverseSequence, reverseSuffixArray.sequence));
BWT bwt = BWT.createFromReferenceSequence(sequence);
BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence);
byte[] bwtSequence = bwt.getSequence();
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
@ -234,36 +197,4 @@ public class CreateBWTFromReference {
}
}
/**
* Compares two suffix arrays of the given sequence. Will return whichever string appears
* first in lexicographic order.
*/
public static class SuffixArrayComparator implements Comparator<Integer> {
/**
* The data source for all suffix arrays.
*/
private final String sequence;
/**
* Create a new comparator.
* @param sequence Reference sequence to use as basis for comparison.
*/
public SuffixArrayComparator( String sequence ) {
this.sequence = sequence;
}
/**
* Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of
* the two substrings sequence[lhs:], sequence[rhs:].
* @param lhs Left-hand side of comparison.
* @param rhs Right-hand side of comparison.
* @return How the suffix arrays represented by lhs, rhs compare.
*/
public int compare( Integer lhs, Integer rhs ) {
String lhsSuffixArray = sequence.substring(lhs);
String rhsSuffixArray = sequence.substring(rhs);
return lhsSuffixArray.compareTo(rhsSuffixArray);
}
}
}

View File

@ -2,6 +2,11 @@ package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.utils.StingException;
import java.util.Comparator;
import java.util.TreeSet;
import net.sf.samtools.util.StringUtil;
/**
* An in-memory representation of a suffix array.
*
@ -81,4 +86,74 @@ public class SuffixArray {
}
return (sequence[(int)(index/sequenceInterval)]+iterations) % length();
}
/**
* Create a suffix array from a given reference sequence.
* @param sequence The reference sequence to use when building the suffix array.
* @return a constructed suffix array.
*/
public static SuffixArray createFromReferenceSequence(byte[] sequence) {
// The builder for the suffix array. Use an integer in this case because
// Java arrays can only hold an integer.
TreeSet<Integer> suffixArrayBuilder = new TreeSet<Integer>(new SuffixArrayComparator(sequence));
Counts occurrences = new Counts();
for( byte base: sequence )
occurrences.increment(base);
// Build out the suffix array using a custom comparator.
for( int i = 0; i <= sequence.length; i++ )
suffixArrayBuilder.add(i);
// Copy the suffix array into an array.
long[] suffixArray = new long[suffixArrayBuilder.size()];
int i = 0;
for( Integer element: suffixArrayBuilder )
suffixArray[i++] = element;
// Find the first element in the inverse suffix array.
long inverseSA0 = -1;
for(i = 0; i < sequence.length; i++) {
if(suffixArray[i] == 0)
inverseSA0 = i;
}
if(inverseSA0 < 0)
throw new StingException("Unable to find first inverse SA entry in generated suffix array.");
return new SuffixArray(inverseSA0,occurrences,suffixArray);
}
/**
* Compares two suffix arrays of the given sequence. Will return whichever string appears
* first in lexicographic order.
*/
private static class SuffixArrayComparator implements Comparator<Integer> {
/**
* The data source for all suffix arrays.
*/
private final String sequence;
/**
* Create a new comparator.
* @param sequence Reference sequence to use as basis for comparison.
*/
public SuffixArrayComparator( byte[] sequence ) {
// Processing the suffix array tends to be easier as a string.
this.sequence = StringUtil.bytesToString(sequence);
}
/**
* Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of
* the two substrings sequence[lhs:], sequence[rhs:].
* @param lhs Left-hand side of comparison.
* @param rhs Right-hand side of comparison.
* @return How the suffix arrays represented by lhs, rhs compare.
*/
public int compare( Integer lhs, Integer rhs ) {
String lhsSuffixArray = sequence.substring(lhs);
String rhsSuffixArray = sequence.substring(rhs);
return lhsSuffixArray.compareTo(rhsSuffixArray);
}
}
}

View File

@ -20,11 +20,11 @@ public class SuffixArrayWriter {
/**
* Create a new suffix array reader.
* @param inputFile File in which the suffix array is stored.
* @param outputFile File in which the suffix array is stored.
*/
public SuffixArrayWriter( File inputFile ) {
public SuffixArrayWriter( File outputFile ) {
try {
this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile));
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open input file", ex);

View File

@ -53,23 +53,12 @@ public class CreatePACFromReference {
ReferenceSequence sequence = reference.nextSequence();
// Target file for output
writeSequence( new File(argv[1]), sequence.getBases() );
PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() );
// Reverse the bases in the reference
PackUtils.reverse(sequence.getBases());
// Target file for output
writeSequence( new File(argv[2]), sequence.getBases() );
}
private static void writeSequence( File outputFile, byte[] bases ) throws IOException {
OutputStream outputStream = new FileOutputStream(outputFile);
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
basePackedOutputStream.write(bases);
outputStream.write(bases.length%PackUtils.ALPHABET_SIZE);
outputStream.close();
PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() );
}
}

View File

@ -2,6 +2,12 @@ package org.broadinstitute.sting.alignment.reference.packing;
import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.FileOutputStream;
import java.nio.ByteOrder;
/**
* Utilities designed for packing / unpacking bases.
*
@ -24,6 +30,24 @@ public class PackUtils {
*/
public static final int BITS_PER_BYTE = 8;
/**
* Writes a reference sequence to a PAC file.
* @param outputFile Filename for the PAC file.
* @param referenceSequence Reference sequence to write.
* @throws IOException If there's a problem writing to the output file.
*/
public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException {
OutputStream outputStream = new FileOutputStream(outputFile);
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
basePackedOutputStream.write(referenceSequence);
outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE);
outputStream.close();
}
/**
* How many bits can a given type hold?
* @param type Type to test.

View File

@ -16,7 +16,7 @@ import java.io.File;
public class AlignerIntegrationTest extends WalkerTest {
@Test
public void testBasicAlignment() {
String md5 = "ed098018ffcbd055bee966466044428a";
String md5 = "c6d95d8ae707e78fefdaa7375f130995";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta" +
" -T Align" +
@ -24,6 +24,6 @@ public class AlignerIntegrationTest extends WalkerTest {
" -ob %s",
1, // just one output file
Arrays.asList(md5));
//executeTest("testBasicAlignment", spec);
executeTest("testBasicAlignment", spec);
}
}