2013-02-01 02:59:48 +08:00
# ifndef BWAMEM_H_
# define BWAMEM_H_
# include "bwt.h"
2013-02-08 02:13:43 +08:00
# include "bntseq.h"
2013-02-24 04:30:46 +08:00
# include "bwa.h"
2013-02-01 02:59:48 +08:00
2013-02-26 02:00:35 +08:00
# define MEM_MAPQ_COEF 30.0
2013-02-19 13:50:39 +08:00
# define MEM_MAPQ_MAX 60
2013-02-13 05:15:26 +08:00
2013-02-02 03:38:44 +08:00
struct __smem_i ;
typedef struct __smem_i smem_i ;
2013-02-01 02:59:48 +08:00
2013-02-19 05:33:06 +08:00
# define MEM_F_PE 0x2
# define MEM_F_NOPAIRING 0x4
2013-02-25 02:23:43 +08:00
# define MEM_F_ALL 0x8
2013-02-27 01:49:48 +08:00
# define MEM_F_NO_MULTI 0x10
2013-04-10 04:13:55 +08:00
# define MEM_F_NO_RESCUE 0x20
2013-02-19 05:33:06 +08:00
2013-02-01 02:59:48 +08:00
typedef struct {
2013-02-26 00:56:02 +08:00
int a , b , q , r ; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
2013-02-28 10:13:39 +08:00
int pen_unpaired ; // phred-scaled penalty for unpaired reads
int pen_clip ; // clipping penalty. This score is not deducted from the DP score.
2013-02-26 00:56:02 +08:00
int w ; // band width
2013-03-05 13:34:33 +08:00
int zdrop ; // Z-dropoff
2013-02-28 10:13:39 +08:00
2013-03-06 11:49:38 +08:00
int T ; // output score threshold; only affecting output
2013-02-26 00:56:02 +08:00
int flag ; // see MEM_F_* macros
int min_seed_len ; // minimum seed length
float split_factor ; // split into a seed if MEM is longer than min_seed_len*split_factor
int split_width ; // split into a seed if its occurence is smaller than this value
int max_occ ; // skip a seed if its occurence is larger than this value
int max_chain_gap ; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads ; // number of threads
int chunk_size ; // process chunk_size-bp sequences in a batch
float mask_level ; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio ; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
int max_ins ; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw ; // perform maximally max_matesw rounds of mate-SW for each end
int8_t mat [ 25 ] ; // scoring matrix; mat[0] == 0 if unset
2013-02-02 05:39:50 +08:00
} mem_opt_t ;
2013-02-01 02:59:48 +08:00
2013-02-02 05:39:50 +08:00
typedef struct {
2013-02-25 02:09:29 +08:00
int64_t rb , re ; // [rb,re): reference sequence in the alignment
int qb , qe ; // [qb,qe): query sequence in the alignment
2013-03-16 09:26:37 +08:00
int score ; // best local SW score
int truesc ; // actual score corresponding to the aligned region; possibly smaller than $score
2013-02-25 02:09:29 +08:00
int sub ; // 2nd best SW score
int csub ; // SW score of a tandem hit
int sub_n ; // approximate number of suboptimal hits
2013-03-05 06:29:07 +08:00
int w ; // actual band width used in extension
2013-02-25 02:09:29 +08:00
int seedcov ; // length of regions coverged by seeds
int secondary ; // index of the parent hit shadowing the current hit; <0 if primary
2013-02-07 02:59:32 +08:00
} mem_alnreg_t ;
2013-02-01 04:55:22 +08:00
2013-02-28 11:28:29 +08:00
typedef struct { size_t n , m ; mem_alnreg_t * a ; } mem_alnreg_v ;
2013-02-11 23:59:38 +08:00
typedef struct {
2013-04-27 00:31:18 +08:00
int low , high ; // lower and upper bounds within which a read pair is considered to be properly paired
int failed ; // non-zero if the orientation is not supported by sufficient data
double avg , std ; // mean and stddev of the insert size distribution
2013-02-11 23:59:38 +08:00
} mem_pestat_t ;
2013-02-28 11:28:29 +08:00
typedef struct { // This struct is only used for the convenience of API.
2013-03-12 09:25:17 +08:00
int64_t pos ; // forward strand 5'-end mapping position
int rid ; // reference sequence index in bntseq_t; <0 for unmapped
int flag ; // extra flag
2013-03-02 00:14:51 +08:00
uint32_t is_rev : 1 , mapq : 8 , NM : 23 ; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar ; // number of CIGAR operations
uint32_t * cigar ; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
2013-03-12 09:25:17 +08:00
int score , sub ;
2013-02-28 11:28:29 +08:00
} mem_aln_t ;
2013-02-08 02:13:43 +08:00
2013-02-01 02:59:48 +08:00
# ifdef __cplusplus
extern " C " {
# endif
2013-02-25 01:17:29 +08:00
smem_i * smem_itr_init ( const bwt_t * bwt ) ;
void smem_itr_destroy ( smem_i * itr ) ;
void smem_set_query ( smem_i * itr , int len , const uint8_t * query ) ;
const bwtintv_v * smem_next ( smem_i * itr , int split_len , int split_width ) ;
2013-02-01 02:59:48 +08:00
2013-02-25 01:17:29 +08:00
mem_opt_t * mem_opt_init ( void ) ;
void mem_fill_scmat ( int a , int b , int8_t mat [ 25 ] ) ;
2013-02-01 05:26:05 +08:00
2013-02-25 02:09:29 +08:00
/**
* Align a batch of sequences and generate the alignments in the SAM format
*
* This routine requires $ seqs [ i ] . { l_seq , seq , name } and write $ seqs [ i ] . sam .
* Note that $ seqs [ i ] . sam may consist of several SAM lines if the
* corresponding sequence has multiple primary hits .
*
* In the paired - end mode ( i . e . MEM_F_PE is set in $ opt - > flag ) , query
* sequences must be interleaved : $ n must be an even number and the 2 i - th
* sequence and the ( 2 i + 1 ) - th sequence constitute a read pair . In this
* mode , there should be enough ( typically > 50 ) unique pairs for the
* routine to infer the orientation and insert size .
*
* @ param opt alignment parameters
* @ param bwt FM - index of the reference sequence
* @ param bns Information of the reference
* @ param pac 2 - bit encoded reference
* @ param n number of query sequences
* @ param seqs query sequences ; $ seqs [ i ] . seq / sam to be modified after the call
2013-04-27 00:31:18 +08:00
* @ param pes0 insert - size info ; if NULL , infer from data ; if not NULL , it should be an array with 4 elements ,
* corresponding to each FF , FR , RF and RR orientation . See mem_pestat ( ) for more info .
2013-02-25 02:09:29 +08:00
*/
2013-04-27 00:31:18 +08:00
void mem_process_seqs ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int n , bseq1_t * seqs , const mem_pestat_t * pes0 ) ;
2013-02-25 02:09:29 +08:00
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR . CIGAR should be
2013-03-02 00:47:51 +08:00
* generated later by mem_reg2aln ( ) below .
2013-02-25 02:09:29 +08:00
*
* @ param opt alignment parameters
* @ param bwt FM - index of the reference sequence
* @ param bns Information of the reference
* @ param pac 2 - bit encoded reference
* @ param l_seq length of query sequence
2013-03-02 00:14:51 +08:00
* @ param seq query sequence
2013-02-25 02:09:29 +08:00
*
* @ return list of aligned regions .
*/
2013-03-02 00:14:51 +08:00
mem_alnreg_v mem_align1 ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , const char * seq ) ;
2013-02-25 02:09:29 +08:00
2013-03-02 00:14:51 +08:00
/**
* Generate CIGAR and forward - strand position from alignment region
*
* @ param opt alignment parameters
* @ param bns Information of the reference
* @ param pac 2 - bit encoded reference
* @ param l_seq length of query sequence
* @ param seq query sequence
* @ param ar one alignment region
*
* @ return CIGAR , strand , mapping quality and forward - strand position
*/
mem_aln_t mem_reg2aln ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , int l_seq , const char * seq , const mem_alnreg_t * ar ) ;
2013-02-28 11:28:29 +08:00
2013-02-25 02:09:29 +08:00
/**
* Infer the insert size distribution from interleaved alignment regions
*
* This function can be called after mem_align1 ( ) , as long as paired - end
* reads are properly interleaved .
*
* @ param opt alignment parameters
* @ param l_pac length of concatenated reference sequence
* @ param n number of query sequences ; must be an even number
* @ param regs region array of size $ n ; 2 i - th and ( 2 i + 1 ) - th elements constitute a pair
* @ param pes inferred insert size distribution ( output )
*/
2013-02-25 01:17:29 +08:00
void mem_pestat ( const mem_opt_t * opt , int64_t l_pac , int n , const mem_alnreg_v * regs , mem_pestat_t pes [ 4 ] ) ;
2013-02-11 23:59:38 +08:00
2013-02-01 02:59:48 +08:00
# ifdef __cplusplus
}
# endif
# endif