diff --git a/README.md b/README.md index 4bfdae1..e000bb8 100644 --- a/README.md +++ b/README.md @@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`. ```sh minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads +minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads ``` The difference between `map-pb` and `map-ont` is that `map-pb` uses homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary -minimizers as seeds. Emperical evaluation suggests HPC minimizers improve +minimizers as seeds. Empirical evaluation suggests HPC minimizers improve performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning -Nanopore reads. +Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that +accounts for the low overall error rate in the reads, with transversion errors +being less frequent than transitions. #### Map long mRNA/cDNA reads diff --git a/align.c b/align.c index 316af78..f13d264 100644 --- a/align.c +++ b/align.c @@ -21,6 +21,17 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc mat[(m - 1) * m + j] = sc_ambi; } +static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi) +{ + assert(m==5); + ksw_gen_simple_mat(m,mat,a,b,sc_ambi); + transition = transition > 0? -transition : transition; + mat[0*m+2]=transition; // A->G + mat[1*m+3]=transition; // C->T + mat[2*m+0]=transition; // G->A + mat[3*m+1]=transition; // T->C +} + static inline void mm_seq_rev(uint32_t len, uint8_t *seq) { uint32_t i; @@ -323,6 +334,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } + if (opt->b != opt->transition) flag |= KSW_EZ_GENERIC_SC; if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) { ksw_reset_extz(ez); ez->zdropped = 1; @@ -586,7 +598,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int r2->cnt = 0; if (r->cnt == 0) return; - ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); + ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi); bw = (int)(opt->bw * 1.5 + 1.); bw_long = (int)(opt->bw_long * 1.5 + 1.); if (bw_long < bw) bw_long = bw; @@ -844,7 +856,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ql < opt->min_chain_score || ql > opt->max_gap) return 0; if (tl < opt->min_chain_score || tl > opt->max_gap) return 0; - ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); + ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi); tseq = (uint8_t*)kmalloc(km, tl); mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs]; diff --git a/main.c b/main.c index 7db642d..d50b565 100644 --- a/main.c +++ b/main.c @@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -178,6 +178,7 @@ int main(int argc, char *argv[]) else if (c == 'm') opt.min_chain_score = atoi(o.arg); else if (c == 'A') opt.a = atoi(o.arg); else if (c == 'B') opt.b = atoi(o.arg); + else if (c == 'b') opt.transition = atoi(o.arg); else if (c == 's') opt.min_dp_max = atoi(o.arg); else if (c == 'C') opt.noncan = atoi(o.arg); else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg); @@ -367,7 +368,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); - fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-pb/map-ont/map-iclr-prerender/map-iclr - PacBio/Nanopore/ICLR vs reference mapping\n"); fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n"); fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n"); fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n"); diff --git a/minimap.h b/minimap.h index 504bdc8..59f6fa0 100644 --- a/minimap.h +++ b/minimap.h @@ -153,6 +153,7 @@ typedef struct { float alt_drop; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties + int transition; // transition mismatch score (A:G, C:T) int sc_ambi; // score when one or both bases are "N" int noncan; // cost of non-canonical splicing sites int junc_bonus; diff --git a/options.c b/options.c index b3344b5..cd63111 100644 --- a/options.c +++ b/options.c @@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->alt_drop = 0.15f; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; + opt->transition = opt->b; opt->sc_ambi = 1; opt->zdrop = 400, opt->zdrop_inv = 200; opt->end_bonus = -1; @@ -112,6 +113,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->occ_dist = 500; mo->min_mid_occ = 50, mo->max_mid_occ = 500; mo->min_dp_max = 200; + } else if (strcmp(preset, "map-iclr-prerender") == 0) { + io->flag = 0, io->k = 15; + mo->b = 6, mo->transition = 1; + mo->q = 10, mo->q2 = 50; + } else if (strcmp(preset, "map-iclr") == 0) { + io->flag = 0, io->k = 19; + mo->b = 6, mo->transition = 4; + mo->q = 10, mo->q2 = 50; } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->bw = 1000, mo->bw_long = 100000;