2013-02-01 02:59:48 +08:00
# ifndef BWAMEM_H_
# define BWAMEM_H_
# include "bwt.h"
2013-02-08 02:13:43 +08:00
# include "bntseq.h"
2013-02-24 04:30:46 +08:00
# include "bwa.h"
2013-02-01 02:59:48 +08:00
2013-02-26 02:00:35 +08:00
# define MEM_MAPQ_COEF 30.0
2013-02-19 13:50:39 +08:00
# define MEM_MAPQ_MAX 60
2013-02-13 05:15:26 +08:00
2013-02-02 03:38:44 +08:00
struct __smem_i ;
typedef struct __smem_i smem_i ;
2013-02-01 02:59:48 +08:00
2013-02-19 05:33:06 +08:00
# define MEM_F_HARDCLIP 0x1
# define MEM_F_PE 0x2
# define MEM_F_NOPAIRING 0x4
2013-02-25 02:23:43 +08:00
# define MEM_F_ALL 0x8
2013-02-27 01:49:48 +08:00
# define MEM_F_NO_MULTI 0x10
2013-02-19 05:33:06 +08:00
2013-02-01 02:59:48 +08:00
typedef struct {
2013-02-26 00:56:02 +08:00
int a , b , q , r ; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
2013-02-28 10:13:39 +08:00
int pen_unpaired ; // phred-scaled penalty for unpaired reads
int pen_clip ; // clipping penalty. This score is not deducted from the DP score.
2013-02-26 00:56:02 +08:00
int w ; // band width
2013-02-28 10:13:39 +08:00
2013-02-26 00:56:02 +08:00
int flag ; // see MEM_F_* macros
int min_seed_len ; // minimum seed length
float split_factor ; // split into a seed if MEM is longer than min_seed_len*split_factor
int split_width ; // split into a seed if its occurence is smaller than this value
int max_occ ; // skip a seed if its occurence is larger than this value
int max_chain_gap ; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads ; // number of threads
int chunk_size ; // process chunk_size-bp sequences in a batch
float mask_level ; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio ; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
int max_ins ; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw ; // perform maximally max_matesw rounds of mate-SW for each end
int8_t mat [ 25 ] ; // scoring matrix; mat[0] == 0 if unset
2013-02-02 05:39:50 +08:00
} mem_opt_t ;
2013-02-01 02:59:48 +08:00
2013-02-02 05:39:50 +08:00
typedef struct {
2013-02-25 02:09:29 +08:00
int64_t rb , re ; // [rb,re): reference sequence in the alignment
int qb , qe ; // [qb,qe): query sequence in the alignment
int score ; // best SW score
int sub ; // 2nd best SW score
int csub ; // SW score of a tandem hit
int sub_n ; // approximate number of suboptimal hits
int seedcov ; // length of regions coverged by seeds
int secondary ; // index of the parent hit shadowing the current hit; <0 if primary
2013-02-07 02:59:32 +08:00
} mem_alnreg_t ;
2013-02-01 04:55:22 +08:00
2013-02-28 11:28:29 +08:00
typedef struct { size_t n , m ; mem_alnreg_t * a ; } mem_alnreg_v ;
2013-02-11 23:59:38 +08:00
typedef struct {
int low , high , failed ;
double avg , std ;
} mem_pestat_t ;
2013-02-28 11:28:29 +08:00
typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
2013-02-12 04:29:03 +08:00
int64_t rb , re ;
int qb , qe , flag , qual ;
// optional info
int score , sub ;
} bwahit_t ;
2013-02-28 11:28:29 +08:00
typedef struct { // This struct is only used for the convenience of API.
int rid ;
int pos ;
uint32_t is_rev : 1 , mapq : 8 , NM : 23 ;
int n_cigar ;
uint32_t * cigar ;
} mem_aln_t ;
2013-02-08 02:13:43 +08:00
2013-02-01 02:59:48 +08:00
# ifdef __cplusplus
extern " C " {
# endif
2013-02-25 01:17:29 +08:00
smem_i * smem_itr_init ( const bwt_t * bwt ) ;
void smem_itr_destroy ( smem_i * itr ) ;
void smem_set_query ( smem_i * itr , int len , const uint8_t * query ) ;
const bwtintv_v * smem_next ( smem_i * itr , int split_len , int split_width ) ;
2013-02-01 02:59:48 +08:00
2013-02-25 01:17:29 +08:00
mem_opt_t * mem_opt_init ( void ) ;
void mem_fill_scmat ( int a , int b , int8_t mat [ 25 ] ) ;
2013-02-01 05:26:05 +08:00
2013-02-25 02:09:29 +08:00
/**
* Align a batch of sequences and generate the alignments in the SAM format
*
* This routine requires $ seqs [ i ] . { l_seq , seq , name } and write $ seqs [ i ] . sam .
* Note that $ seqs [ i ] . sam may consist of several SAM lines if the
* corresponding sequence has multiple primary hits .
*
* In the paired - end mode ( i . e . MEM_F_PE is set in $ opt - > flag ) , query
* sequences must be interleaved : $ n must be an even number and the 2 i - th
* sequence and the ( 2 i + 1 ) - th sequence constitute a read pair . In this
* mode , there should be enough ( typically > 50 ) unique pairs for the
* routine to infer the orientation and insert size .
*
* @ param opt alignment parameters
* @ param bwt FM - index of the reference sequence
* @ param bns Information of the reference
* @ param pac 2 - bit encoded reference
* @ param n number of query sequences
* @ param seqs query sequences ; $ seqs [ i ] . seq / sam to be modified after the call
*/
void mem_process_seqs ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int n , bseq1_t * seqs ) ;
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR . CIGAR should be
* generated later by bwa_gen_cigar ( ) defined in bwa . c .
*
* @ param opt alignment parameters
* @ param bwt FM - index of the reference sequence
* @ param bns Information of the reference
* @ param pac 2 - bit encoded reference
* @ param l_seq length of query sequence
* @ param seq query sequence ; conversion ACGTN / acgtn = > 01234 to be applied
*
* @ return list of aligned regions .
*/
mem_alnreg_v mem_align1 ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , char * seq ) ;
2013-02-28 11:45:18 +08:00
mem_aln_t mem_reg2aln ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , int l_query , uint8_t * query , const mem_alnreg_t * ar ) ;
2013-02-28 11:28:29 +08:00
2013-02-25 02:09:29 +08:00
/**
* Infer the insert size distribution from interleaved alignment regions
*
* This function can be called after mem_align1 ( ) , as long as paired - end
* reads are properly interleaved .
*
* @ param opt alignment parameters
* @ param l_pac length of concatenated reference sequence
* @ param n number of query sequences ; must be an even number
* @ param regs region array of size $ n ; 2 i - th and ( 2 i + 1 ) - th elements constitute a pair
* @ param pes inferred insert size distribution ( output )
*/
2013-02-25 01:17:29 +08:00
void mem_pestat ( const mem_opt_t * opt , int64_t l_pac , int n , const mem_alnreg_v * regs , mem_pestat_t pes [ 4 ] ) ;
2013-02-11 23:59:38 +08:00
2013-02-01 02:59:48 +08:00
# ifdef __cplusplus
}
# endif
# endif