#include #include #include #include "bwase.h" #include "bwa_gateway.h" BWA::BWA(const char* ann_filename, const char* amb_filename, const char* pac_filename, const char* forward_bwt_filename, const char* forward_sa_filename, const char* reverse_bwt_filename, const char* reverse_sa_filename) { // Load the bns (?) and reference bns = bns_restore_core(ann_filename,amb_filename,pac_filename); reference = new ubyte_t[bns->l_pac/4+1]; rewind(bns->fp_pac); fread(reference, 1, bns->l_pac/4+1, bns->fp_pac); fclose(bns->fp_pac); bns->fp_pac = NULL; // Load the BWTs (both directions) and suffix arrays (both directions) bwts[0] = bwt_restore_bwt(forward_bwt_filename); bwt_restore_sa(forward_sa_filename, bwts[0]); bwts[1] = bwt_restore_bwt(reverse_bwt_filename); bwt_restore_sa(reverse_sa_filename, bwts[1]); load_default_options(); // Always reinitialize the random seed whenever a new set of files are loaded. initialize_random_seed(); // initialize the bwase subsystem bwase_initialize(); } BWA::~BWA() { delete[] reference; bns_destroy(bns); bwt_destroy(bwts[0]); bwt_destroy(bwts[1]); } void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) { bwa_seq_t* sequence = create_sequence(bases, read_length); // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln). // This method will destroy the contents of seq and rseq. bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); paths = new bwt_aln1_t[sequence->n_aln]; memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t)); num_paths = sequence->n_aln; // Call aln2seq to initialize the type of match present. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); best_path_count = sequence->c1; second_best_path_count = sequence->c2; bwa_free_read_seq(1,sequence); } Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) { bwa_seq_t* sequence = create_sequence(bases,read_length); // Calculate paths. bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); // Check for no alignments found and return null. if(sequence->n_aln == 0) { bwa_free_read_seq(1,sequence); return NULL; } // bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in. copy_bases_into_sequence(sequence,bases,read_length); // Pick best alignment and propagate its information into the sequence. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); // Generate the best alignment from the sequence. Alignment* alignment = new Alignment; *alignment = generate_final_alignment_from_sequence(sequence); bwa_free_read_seq(1,sequence); return alignment; } void BWA::generate_alignments_from_paths(const char* bases, const unsigned read_length, bwt_aln1_t* paths, const unsigned num_paths, const unsigned best_count, const unsigned second_best_count, Alignment*& alignments, unsigned& num_alignments) { bwa_seq_t* sequence = create_sequence(bases,read_length); sequence->aln = paths; sequence->n_aln = num_paths; // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); // But overwrite key parts of the sequence in case the user passed back only a smaller subset // of the paths. sequence->c1 = best_count; sequence->c2 = second_best_count; sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; num_alignments = 0; for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; alignments = new Alignment[num_alignments]; unsigned alignment_idx = 0; for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { // Stub in a 'working' path, so that only the desired alignment is local-aligned. const bwt_aln1_t* path = paths + path_idx; bwt_aln1_t working_path = *path; // Loop through all alignments, aligning each one individually. for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { working_path.k = working_path.l = sa_idx; sequence->aln = &working_path; sequence->n_aln = 1; sequence->sa = sa_idx; sequence->strand = path->a; sequence->score = path->score; // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. if(alignment_idx > 0) seq_reverse(sequence->len, sequence->seq, 0); // Copy the local alignment data into the alignment object. *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence); alignment_idx++; } } sequence->aln = NULL; sequence->n_aln = 0; bwa_free_read_seq(1,sequence); } Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) { // Calculate the local coordinate and local alignment. bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr); bwa_refine_gapped(bns, 1, sequence, reference, NULL); // Copy the local alignment data into the alignment object. Alignment alignment; // Populate basic path info alignment.edit_distance = sequence->nm; alignment.num_mismatches = sequence->n_mm; alignment.num_gap_opens = sequence->n_gapo; alignment.num_gap_extensions = sequence->n_gape; alignment.num_best = sequence->c1; alignment.num_second_best = sequence->c2; // Final alignment position. alignment.type = sequence->type; bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig); alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1; alignment.negative_strand = sequence->strand; alignment.mapping_quality = sequence->mapQ; // Cigar step. alignment.cigar = NULL; if(sequence->cigar) { alignment.cigar = new uint16_t[sequence->n_cigar]; memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t)); } alignment.n_cigar = sequence->n_cigar; // MD tag with a better breakdown of differences in the cigar alignment.md = strdup(sequence->md); delete[] sequence->md; sequence->md = NULL; return alignment; } void BWA::load_default_options() { options.s_mm = 3; options.s_gapo = 11; options.s_gape = 4; options.mode = 3; options.indel_end_skip = 5; options.max_del_occ = 10; options.max_entries = 2000000; options.fnr = 0.04; options.max_diff = -1; options.max_gapo = 1; options.max_gape = 6; options.max_seed_diff = 2; options.seed_len = 2147483647; options.n_threads = 1; options.max_top2 = 30; options.trim_qual = 0; } void BWA::initialize_random_seed() { srand48(bns->seed); } void BWA::set_max_edit_distance(float edit_distance) { if(edit_distance > 0 && edit_distance < 1) { options.fnr = edit_distance; options.max_diff = -1; } else { options.fnr = -1.0; options.max_diff = (int)edit_distance; } } void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; } void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape = max_gap_extensions; } void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; } void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; } void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; } void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; } void BWA::set_mode_nonstop() { options.mode |= BWA_MODE_NONSTOP; options.max_top2 = 0x7fffffff; } void BWA::set_max_entries_in_queue(int max_entries) { options.max_entries = max_entries; } /** * Create a sequence with a set of reasonable initial defaults. * Will leave seq and rseq empty. */ bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length) { bwa_seq_t* sequence = new bwa_seq_t; sequence->tid = -1; sequence->name = 0; copy_bases_into_sequence(sequence, bases, read_length); sequence->qual = 0; sequence->aln = 0; sequence->md = 0; sequence->cigar = NULL; sequence->n_cigar = 0; sequence->multi = NULL; sequence->n_multi = 0; return sequence; } void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length) { // seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap sequence->seq = new ubyte_t[read_length]; sequence->rseq = new ubyte_t[read_length]; for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]]; memcpy(sequence->rseq,sequence->seq,read_length); // BWA expects the read bases to arrive reversed. seq_reverse(read_length,sequence->seq,0); seq_reverse(read_length,sequence->rseq,1); sequence->full_len = sequence->len = read_length; }