diff --git a/c/bwa/build_linux.sh b/c/bwa/build_linux.sh index 449b7211e..c713f3963 100755 --- a/c/bwa/build_linux.sh +++ b/c/bwa/build_linux.sh @@ -1,5 +1,5 @@ #!/bin/sh -export BWA_HOME="/humgen/gsa-hphome1/hanna/src/bwa-0.5.3" +export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa" export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux" export TARGET_LIB="libbwa.so" export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread" diff --git a/c/bwa/bwa_gateway.cpp b/c/bwa/bwa_gateway.cpp index 45e28b6d7..dbb3d39bd 100644 --- a/c/bwa/bwa_gateway.cpp +++ b/c/bwa/bwa_gateway.cpp @@ -36,22 +36,114 @@ BWA::~BWA() { bwt_destroy(bwts[1]); } -void BWA::align(const char* bases, const unsigned read_length, Alignment*& alignments, unsigned& num_alignments) +void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) { bwa_seq_t* sequence = create_sequence(); copy_bases_into_sequence(sequence, bases, read_length); - // Calculate the suffix array interval for each sequence, storing the result in sequence.aln (and sequence.n_aln). + // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln). // This method will destroy the contents of seq and rseq. bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); - // Translate suffix array indices into exactly how many alignments have been found. + paths = new bwt_aln1_t[sequence->n_aln]; + memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t)); + num_paths = sequence->n_aln; + + // Call aln2seq to initialize the type of match present. + bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); + best_path_count = sequence->c1; + second_best_path_count = sequence->c2; + + bwa_free_read_seq(1,sequence); +} + +void BWA::generate_alignments_from_paths(const char* bases, + const unsigned read_length, + bwt_aln1_t* paths, + const unsigned num_paths, + const unsigned best_count, + const unsigned second_best_count, + Alignment*& alignments, + unsigned& num_alignments) +{ + bwa_seq_t* sequence = create_sequence(); + copy_bases_into_sequence(sequence, bases, read_length); + + sequence->aln = paths; + sequence->n_aln = num_paths; + + // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); - // Calculate and refine the position for each alignment. This position may be inaccurate - // if the read contains indels, etc. Refinement requires the original sequences in the proper order. - copy_bases_into_sequence(sequence, bases, read_length); - create_alignments(sequence, alignments, num_alignments); + // But overwrite key parts of the sequence in case the user passed back only a smaller subset + // of the paths. + sequence->c1 = best_count; + sequence->c2 = second_best_count; + sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; + + num_alignments = 0; + for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) + num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; + + alignments = new Alignment[num_alignments]; + unsigned alignment_idx = 0; + + for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { + // Stub in a 'working' path, so that only the desired alignment is local-aligned. + const bwt_aln1_t* path = paths + path_idx; + bwt_aln1_t working_path = *path; + + // Loop through all alignments, aligning each one individually. + for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { + working_path.k = working_path.l = sa_idx; + sequence->aln = &working_path; + sequence->n_aln = 1; + + sequence->sa = sa_idx; + sequence->strand = path->a; + sequence->score = path->score; + + // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. + // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. + if(alignment_idx > 0) + seq_reverse(sequence->len, sequence->seq, 0); + + // Calculate the local coordinate and local alignment. + bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr); + bwa_refine_gapped(bns, 1, sequence, reference, NULL); + + // Copy the local alignment data into the alignment object. + Alignment& alignment = *(alignments + alignment_idx); + + // Populate basic path info + alignment.num_mismatches = sequence->n_mm; + alignment.num_gap_opens = sequence->n_gapo; + alignment.num_gap_extensions = sequence->n_gape; + alignment.num_best = sequence->c1; + alignment.num_second_best = sequence->c2; + + alignment.type = sequence->type; + bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig); + alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1; + alignment.negative_strand = sequence->strand; + alignment.mapping_quality = sequence->mapQ; + + alignment.cigar = NULL; + if(sequence->cigar) { + alignment.cigar = new uint16_t[sequence->n_cigar]; + memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t)); + } + alignment.n_cigar = sequence->n_cigar; + + delete[] sequence->md; + sequence->md = NULL; + + alignment_idx++; + } + } + + sequence->aln = NULL; + sequence->n_aln = 0; bwa_free_read_seq(1,sequence); } @@ -132,63 +224,3 @@ void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const sequence->full_len = sequence->len = read_length; } - -void BWA::create_alignments(bwa_seq_t* sequence, Alignment*& alignments, unsigned& num_alignments) { - num_alignments = 0; - for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) - num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; - - alignments = new Alignment[num_alignments]; - unsigned alignment_idx = 0; - - // backup existing alignment blocks. - bwt_aln1_t* alignment_blocks = sequence->aln; - int num_alignment_blocks = sequence->n_aln; - - for(unsigned alignment_block_idx = 0; alignment_block_idx < (unsigned)num_alignment_blocks; alignment_block_idx++) { - // Stub in a 'working' alignment block, so that only the desired alignment is local-aligned. - const bwt_aln1_t* alignment_block = alignment_blocks + alignment_block_idx; - bwt_aln1_t working_alignment_block = *alignment_block; - - // Loop through all alignments, aligning each one individually. - for(unsigned sa_idx = alignment_block->k; sa_idx <= alignment_block->l; sa_idx++) { - working_alignment_block.k = working_alignment_block.l = sa_idx; - sequence->aln = &working_alignment_block; - sequence->n_aln = 1; - - sequence->sa = sa_idx; - sequence->strand = alignment_block->a; - sequence->score = alignment_block->score; - - // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. - // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. - if(alignment_idx > 0) - seq_reverse(sequence->len, sequence->seq, 0); - - // Calculate the local coordinate and local alignment. - bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr); - bwa_refine_gapped(bns, 1, sequence, reference, NULL); - - // Copy the local alignment data into the alignment object. - Alignment& alignment = *(alignments + alignment_idx); - alignment.type = sequence->type; - bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig); - alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1; - alignment.negative_strand = sequence->strand; - alignment.mapQ = sequence->mapQ; - - alignment.cigar = NULL; - if(sequence->cigar) { - alignment.cigar = new uint16_t[sequence->n_cigar]; - memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t)); - } - alignment.n_cigar = sequence->n_cigar; - - alignment_idx++; - } - } - - // Restore original alignment blocks. - sequence->aln = alignment_blocks; - sequence->n_aln = num_alignment_blocks; -} diff --git a/c/bwa/bwa_gateway.h b/c/bwa/bwa_gateway.h index f559cd9e2..a794f7958 100644 --- a/c/bwa/bwa_gateway.h +++ b/c/bwa/bwa_gateway.h @@ -19,7 +19,6 @@ class BWA { void load_default_options(); bwa_seq_t* create_sequence(); void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length); - void create_alignments(bwa_seq_t* sequence, Alignment*& alignments, unsigned& num_alignments); public: BWA(const char* ann_filename, @@ -41,7 +40,20 @@ class BWA { void set_gap_extension_penalty(int penalty); // Perform the alignment - void align(const char* bases, const unsigned read_length, Alignment*& alignments, unsigned& num_alignments); + void find_paths(const char* bases, + const unsigned read_length, + bwt_aln1_t*& paths, + unsigned& num_paths, + unsigned& best_path_count, + unsigned& second_best_path_count); + void generate_alignments_from_paths(const char* bases, + const unsigned read_length, + bwt_aln1_t* paths, + const unsigned num_paths, + const unsigned best_count, + const unsigned second_best_count, + Alignment*& alignments, + unsigned& num_alignments); }; class Alignment { @@ -50,11 +62,16 @@ class Alignment { int contig; bwtint_t pos; bool negative_strand; + uint32_t mapping_quality; uint16_t *cigar; int n_cigar; - uint32_t mapQ; + uint8_t num_mismatches; + uint8_t num_gap_opens; + uint8_t num_gap_extensions; + uint32_t num_best; + uint32_t num_second_best; }; #endif // BWA_GATEWAY diff --git a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp index 77451b86e..7db61bf09 100644 --- a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp +++ b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp @@ -95,7 +95,63 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner delete bwa; } -JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getAlignments(JNIEnv* env, jobject object, jlong java_bwa, jbyteArray java_bases) +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) +{ + BWA* bwa = (BWA*)java_bwa; + + const jsize read_length = env->GetArrayLength(java_bases); + if(env->ExceptionCheck()) return NULL; + + jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); + if(read_bases == NULL) return NULL; + + bwt_aln1_t* paths = NULL; + unsigned num_paths = 0; + + unsigned best_path_count, second_best_path_count; + bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count); + + jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL); + if(java_paths == NULL) return NULL; + + for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { + bwt_aln1_t& path = *(paths + path_idx); + + jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"); + if(java_path_class == NULL) return NULL; + + jmethodID java_path_constructor = env->GetMethodID(java_path_class, "", "(IIIZJJIII)V"); + if(java_path_constructor == NULL) return NULL; + + // Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints. + jobject java_path = env->NewObject(java_path_class, + java_path_constructor, + path.n_mm, + path.n_gapo, + path.n_gape, + path.a, + (jlong)path.k, + (jlong)path.l, + path.score, + best_path_count, + second_best_path_count); + if(java_path == NULL) return NULL; + + env->SetObjectArrayElement(java_paths,path_idx,java_path); + if(env->ExceptionCheck()) return NULL; + + env->DeleteLocalRef(java_path_class); + if(env->ExceptionCheck()) return NULL; + } + + delete[] paths; + + env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); + + return env->ExceptionCheck() ? NULL : java_paths; +} + +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths) { BWA* bwa = (BWA*)java_bwa; @@ -105,10 +161,67 @@ JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWA jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); if(read_bases == NULL) return NULL; + const jsize num_paths = env->GetArrayLength(java_paths); + bwt_aln1_t* paths = new bwt_aln1_t[num_paths]; + unsigned best_count = 0, second_best_count = 0; + + for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { + jobject java_path = env->GetObjectArrayElement(java_paths,path_idx); + jclass java_path_class = env->GetObjectClass(java_path); + if(java_path_class == NULL) return NULL; + + bwt_aln1_t& path = *(paths + path_idx); + + jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I"); + if(mismatches_field == NULL) return NULL; + path.n_mm = env->GetIntField(java_path,mismatches_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I"); + if(gap_opens_field == NULL) return NULL; + path.n_gapo = env->GetIntField(java_path,gap_opens_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I"); + if(gap_extensions_field == NULL) return NULL; + path.n_gape = env->GetIntField(java_path,gap_extensions_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z"); + if(negative_strand_field == NULL) return NULL; + path.a = env->GetBooleanField(java_path,negative_strand_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID k_field = env->GetFieldID(java_path_class, "k", "J"); + if(k_field == NULL) return NULL; + path.k = env->GetLongField(java_path,k_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID l_field = env->GetFieldID(java_path_class, "l", "J"); + if(l_field == NULL) return NULL; + path.l = env->GetLongField(java_path,l_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID score_field = env->GetFieldID(java_path_class, "score", "I"); + if(score_field == NULL) return NULL; + path.score = env->GetIntField(java_path,score_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I"); + if(best_count_field == NULL) return NULL; + best_count = env->GetIntField(java_path,best_count_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I"); + if(second_best_count_field == NULL) return NULL; + second_best_count = env->GetIntField(java_path,second_best_count_field); + if(env->ExceptionCheck()) return NULL; + } + Alignment* alignments = NULL; unsigned num_alignments = 0; - - bwa->align((const char*)read_bases,read_length,alignments,num_alignments); + bwa->generate_alignments_from_paths((const char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments); + num_alignments = 0; jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL); if(java_alignments == NULL) return NULL; @@ -128,39 +241,44 @@ JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWA if(alignment.cigar) { for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) { - jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14]; - jint cigar_length = alignment.cigar[cigar_idx]&0x3fff; - - env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator); - if(env->ExceptionCheck()) return NULL; - env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length); - if(env->ExceptionCheck()) return NULL; + jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14]; + jint cigar_length = alignment.cigar[cigar_idx]&0x3fff; + + env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator); + if(env->ExceptionCheck()) return NULL; + env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length); + if(env->ExceptionCheck()) return NULL; } } else { if(alignment.type != BWA_TYPE_NO_MATCH) { - jchar cigar_operator = 'M'; - env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator); - if(env->ExceptionCheck()) return NULL; - env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length); - if(env->ExceptionCheck()) return NULL; + jchar cigar_operator = 'M'; + env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator); + if(env->ExceptionCheck()) return NULL; + env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length); + if(env->ExceptionCheck()) return NULL; } } - + jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment"); if(java_alignment_class == NULL) return NULL; - jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "", "(IIZI[C[I)V"); + jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "", "(IIZI[C[IIIIII)V"); if(java_alignment_constructor == NULL) return NULL; jobject java_alignment = env->NewObject(java_alignment_class, - java_alignment_constructor, - alignment.contig, - alignment.pos, - alignment.negative_strand, - alignment.mapQ, - java_cigar_operators, - java_cigar_lengths); + java_alignment_constructor, + alignment.contig, + alignment.pos, + alignment.negative_strand, + alignment.mapping_quality, + java_cigar_operators, + java_cigar_lengths, + alignment.num_mismatches, + alignment.num_gap_opens, + alignment.num_gap_extensions, + alignment.num_best, + alignment.num_second_best); if(java_alignment == NULL) return NULL; delete[] alignment.cigar; @@ -173,6 +291,7 @@ JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWA } delete[] alignments; + delete[] paths; env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); diff --git a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h index c2c081e16..afadb2493 100644 --- a/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h +++ b/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h @@ -25,12 +25,20 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner /* * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner - * Method: getAlignments - * Signature: (J[B)[Lorg/broadinstitute/sting/alignment/Alignment; + * Method: getPaths + * Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath; */ -JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getAlignments +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths (JNIEnv *, jobject, jlong, jbyteArray); +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: convertPathsToAlignments + * Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment; + */ +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments + (JNIEnv *, jobject, jlong, jbyteArray, jobjectArray); + #ifdef __cplusplus } #endif diff --git a/java/src/org/broadinstitute/sting/alignment/Alignment.java b/java/src/org/broadinstitute/sting/alignment/Alignment.java index 67857e6fa..e484e046d 100644 --- a/java/src/org/broadinstitute/sting/alignment/Alignment.java +++ b/java/src/org/broadinstitute/sting/alignment/Alignment.java @@ -16,8 +16,14 @@ public class Alignment { protected boolean negativeStrand; protected int mappingQuality; - private char[] cigarOperators; - private int[] cigarLengths; + protected char[] cigarOperators; + protected int[] cigarLengths; + + protected int numMismatches; + protected int numGapOpens; + protected int numGapExtensions; + protected int bestCount; + protected int secondBestCount; /** * Gets the index of the given contig. @@ -43,6 +49,36 @@ public class Alignment { */ public int getMappingQuality() { return mappingQuality; } + /** + * Gets the number of mismatches in the read. + * @return Number of mismatches. + */ + public int getNumMismatches() { return numMismatches; } + + /** + * Get the number of gap opens. + * @return Number of gap opens. + */ + public int getNumGapOpens() { return numGapOpens; } + + /** + * Get the number of gap extensions. + * @return Number of gap extensions. + */ + public int getNumGapExtensions() { return numGapExtensions; } + + /** + * Get the number of best alignments. + * @return Number of top scoring alignments. + */ + public int getBestCount() { return bestCount; } + + /** + * Get the number of second best alignments. + * @return Number of second best scoring alignments. + */ + public int getSecondBestCount() { return secondBestCount; } + /** * Gets the cigar for this alignment. * @return sam-jdk formatted alignment. @@ -87,13 +123,27 @@ public class Alignment { * @param cigarOperators The ordered operators in the cigar string. * @param cigarLengths The lengths to which each operator applies. */ - public Alignment(int contigIndex, int alignmentStart, boolean negativeStrand, int mappingQuality, - char[] cigarOperators, int[] cigarLengths) { + public Alignment(int contigIndex, + int alignmentStart, + boolean negativeStrand, + int mappingQuality, + char[] cigarOperators, + int[] cigarLengths, + int numMismatches, + int numGapOpens, + int numGapExtensions, + int bestCount, + int secondBestCount) { this.contigIndex = contigIndex; this.alignmentStart = alignmentStart; this.negativeStrand = negativeStrand; this.mappingQuality = mappingQuality; this.cigarOperators = cigarOperators; this.cigarLengths = cigarLengths; + this.numMismatches = numMismatches; + this.numGapOpens = numGapOpens; + this.numGapExtensions = numGapExtensions; + this.bestCount = bestCount; + this.secondBestCount = secondBestCount; } } diff --git a/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java b/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java index ae96fc07e..8b0ba978e 100644 --- a/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java +++ b/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java @@ -6,9 +6,14 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; import org.broadinstitute.sting.alignment.bwa.c.BWACConfiguration; +import org.broadinstitute.sting.alignment.bwa.c.BWAPath; import net.sf.samtools.SAMRecord; import java.util.Iterator; +import java.util.SortedMap; +import java.util.TreeMap; +import java.io.PrintWriter; +import java.io.FileNotFoundException; /** * Validates alignments against existing reads. @@ -23,6 +28,8 @@ public class AlignmentValidationWalker extends ReadWalker { @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false) String prefix = "/Users/mhanna/reference/Ecoli/Escherichia_coli_K12_MG1655.fasta"; + //SortedMap alignmentCounts = new TreeMap(); + /** * The instance used to generate alignments. */ @@ -48,7 +55,7 @@ public class AlignmentValidationWalker extends ReadWalker { @Override public boolean filter(char[] ref, SAMRecord read) { return true; - //return read.getReadName().contains("SL-XBC:1:61:1719:1512#0"); + //return read.getReadName().contains("SL-XBC:1:13:1296:1410#0"); } /** @@ -66,6 +73,15 @@ public class AlignmentValidationWalker extends ReadWalker { boolean matches = true; Iterator alignments = aligner.getAlignments(bases); + /* + BWAPath[] paths = aligner.getPaths(bases); + int bestCount = 0; + if(paths.length > 0) bestCount = paths[0].bestCount; + if(alignmentCounts.containsKey(bestCount)) + alignmentCounts.put(bestCount,alignmentCounts.get(bestCount)+1); + else + alignmentCounts.put(bestCount,1); + */ if(!alignments.hasNext()) { matches = read.getReadUnmappedFlag(); @@ -102,7 +118,9 @@ public class AlignmentValidationWalker extends ReadWalker { throw new StingException(String.format("Read %s mismatches!", read.getReadName())); } - if(++count % 10000 == 0) logger.info(String.format("Processed %d reads", count)); + if(++count % 10000 == 0) { + logger.info(String.format("Processed %d reads", count)); + } //logger.info(String.format("validated read %s", read.getReadName())); @@ -135,6 +153,18 @@ public class AlignmentValidationWalker extends ReadWalker { public void onTraversalDone(Integer result) { aligner.close(); super.onTraversalDone(result); + +/* + try { + PrintWriter output = new PrintWriter("GATK_distribution.out"); + for(SortedMap.Entry results: alignmentCounts.entrySet()) + output.printf("%d\t%d%n", results.getKey(), results.getValue()); + output.close(); + } + catch(FileNotFoundException ex) { + throw new StingException("Unable to open output file"); + } +*/ } } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java index e56caf0dd..c8c2bc6b9 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java @@ -4,11 +4,8 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.alignment.Alignment; -import org.broadinstitute.sting.alignment.bwa.c.BWACConfiguration; -import java.util.Comparator; -import java.util.Arrays; -import java.util.Iterator; +import java.util.*; /** * An aligner using the BWA/C implementation. @@ -34,7 +31,7 @@ public class BWACAligner { protected native long create(BWACConfiguration configuration); /** - * Destroy the + * Destroy the BWA/C thunk. * @param thunkPointer Pointer to the allocated thunk. */ protected native void destroy(long thunkPointer); @@ -59,8 +56,8 @@ public class BWACAligner { * @param bases List of bases. * @return Iterator to alignments. */ - public Iterator getAlignments(byte[] bases) { - final Alignment[] alignments = getAllAlignments(bases); + public Iterator getAlignments(final byte[] bases) { + final BWAPath[] paths = getPaths(bases); return new Iterator() { /** * The last position accessed. @@ -71,19 +68,19 @@ public class BWACAligner { * Whether all alignments have been seen based on the current position. * @return True if any more alignments are pending. False otherwise. */ - public boolean hasNext() { return position < alignments.length; } + public boolean hasNext() { return position < paths.length; } /** * Return the next cross-section of alignments, based on mapping quality. * @return Array of the next set of alignments of a given mapping quality. */ public Alignment[] next() { - if(position >= alignments.length) + if(position >= paths.length) throw new UnsupportedOperationException("Out of alignments to return."); - int mapQ = alignments[position].getMappingQuality(); + int score = paths[position].score; int startingPosition = position; - while(position < alignments.length && alignments[position].getMappingQuality() == mapQ) position++; - return Arrays.copyOfRange(alignments,startingPosition,position); + while(position < paths.length && paths[position].score == score) position++; + return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position)); } /** @@ -93,19 +90,6 @@ public class BWACAligner { }; } - /** - * Align the given base array to the BWT. The base array should be in ASCII format. - * @param bases ASCII representation of byte array. - * @return an array of indices into the bwa. - */ - public Alignment[] getAllAlignments(byte[] bases) { - if(thunkPointer == 0) - throw new StingException("BWA/C align attempted, but BWA/C was never properly initialized."); - Alignment[] alignments = getAlignments(thunkPointer,bases); - Arrays.sort(alignments,new AlignmentMapQComparator()); - return alignments; - } - /** * Get a iterator of aligned reads, batched by mapping quality. * @param read Read to align. @@ -182,28 +166,56 @@ public class BWACAligner { } /** - * Caller to the . The base array should be in - * ASCII format. - * @param thunkPointer pointer to the C++ object managing BWA/C. + * Align the given base array to the BWT. The base array should be in ASCII format. * @param bases ASCII representation of byte array. + * @return an array of indices into the bwa. */ - protected native Alignment[] getAlignments(long thunkPointer, byte[] bases); + public Alignment[] getAllAlignments(byte[] bases) { + if(thunkPointer == 0) + throw new StingException("BWA/C align attempted, but BWA/C is not properly initialized."); + BWAPath[] paths = getPaths(bases); + return convertPathsToAlignments(thunkPointer,bases,paths); + } /** - * Compares two alignments based on their mapping quality. The one with the highest mapping quality comes FIRST. + * Get the paths associated with the given base string. + * @param bases List of bases. + * @return A set of paths through the BWA. */ - private class AlignmentMapQComparator implements Comparator { - /** - * Compares two alignments based on their score. If lhs has a higher mapping quality than - * rhs, the score will be negative. - * @param lhs Left-hand side for comparison. - * @param rhs Right-hand side for comparison. - * @return Negative value if lhs' score is highe - */ - public int compare(Alignment lhs, Alignment rhs) { - // NOTE: this strategy works because Integer.MAX_VALUE, Integer.MIN_VALUE >> than valid range of mapping - // qualities. - return rhs.getMappingQuality() - lhs.getMappingQuality(); - } + public BWAPath[] getPaths(byte[] bases) { + if(thunkPointer == 0) + throw new StingException("BWA/C getPaths attempted, but BWA/C is not properly initialized."); + BWAPath[] paths = getPaths(thunkPointer,bases); + return paths; } + + /** + * Do the extra steps involved in converting a local alignment to a global alignment. + * @param bases ASCII representation of byte array. + * @param paths Paths through the current BWT. + * @return A list of alignments. + */ + protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) { + if(thunkPointer == 0) + throw new StingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized."); + return convertPathsToAlignments(thunkPointer,bases,paths); + } + + /** + * Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead. + * @param thunkPointer pointer to the C++ object managing BWA/C. + * @param bases ASCII representation of byte array. + * @return A list of paths through the specified BWT. + */ + protected native BWAPath[] getPaths(long thunkPointer, byte[] bases); + + /** + * Do the extra steps involved in converting a local alignment to a global alignment. + * Call this method's convertPathsToAlignments() wrapper (above) instead. + * @param thunkPointer pointer to the C++ object managing BWA/C. + * @param bases ASCII representation of byte array. + * @param paths Paths through the current BWT. + * @return A list of alignments. + */ + protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths); } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java new file mode 100755 index 000000000..347d4344f --- /dev/null +++ b/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.alignment.bwa.c; + +/** + * Models a BWA path. + * + * @author mhanna + * @version 0.1 + */ +public class BWAPath { + /** + * Number of mismatches encountered along this path. + */ + public final int numMismatches; + + /** + * Number of gap opens encountered along this path. + */ + public final int numGapOpens; + + /** + * Number of gap extensions along this path. + */ + public final int numGapExtensions; + + /** + * Whether this alignment was found on the positive or negative strand. + */ + public final boolean negativeStrand; + + /** + * Starting coordinate in the BWT. + */ + public final long k; + + /** + * Ending coordinate in the BWT. + */ + public final long l; + + /** + * The score of this path. + */ + public final int score; + + /** + * The number of best alignments seen along this path. + */ + public final int bestCount; + + /** + * The number of second best alignments seen along this path. + */ + public final int secondBestCount; + + /** + * Create a new path with the given attributes. + * @param numMismatches Number of mismatches along path. + * @param numGapOpens Number of gap opens along path. + * @param numGapExtensions Number of gap extensions along path. + * @param k Index to first coordinate within BWT. + * @param l Index to last coordinate within BWT. + * @param score Score of this alignment. Not the mapping quality. + */ + public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) { + this.numMismatches = numMismatches; + this.numGapOpens = numGapOpens; + this.numGapExtensions = numGapExtensions; + this.negativeStrand = negativeStrand; + this.k = k; + this.l = l; + this.score = score; + this.bestCount = bestCount; + this.secondBestCount = secondBestCount; + } + +}