Illumina Complete Long Read presets (#1069)

* Implements a transition-aware alignment scoring scheme and configuration presets for ICLR

* Fix to enable use of general scoring matrix in ksw as suggested by lh3

---------

Co-authored-by: koadman <>
This commit is contained in:
Aaron Darling 2023-06-04 22:06:15 +07:00 committed by GitHub
parent e28a55be86
commit ace990c381
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 32 additions and 6 deletions

View File

@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
```sh
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
```
The difference between `map-pb` and `map-ont` is that `map-pb` uses
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
Nanopore reads.
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
accounts for the low overall error rate in the reads, with transversion errors
being less frequent than transitions.
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads

16
align.c
View File

@ -21,6 +21,17 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
mat[(m - 1) * m + j] = sc_ambi;
}
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
{
assert(m==5);
ksw_gen_simple_mat(m,mat,a,b,sc_ambi);
transition = transition > 0? -transition : transition;
mat[0*m+2]=transition; // A->G
mat[1*m+3]=transition; // C->T
mat[2*m+0]=transition; // G->A
mat[3*m+1]=transition; // T->C
}
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
{
uint32_t i;
@ -323,6 +334,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
fputc('\n', stderr);
}
if (opt->b != opt->transition) flag |= KSW_EZ_GENERIC_SC;
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
ksw_reset_extz(ez);
ez->zdropped = 1;
@ -586,7 +598,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
r2->cnt = 0;
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
bw = (int)(opt->bw * 1.5 + 1.);
bw_long = (int)(opt->bw_long * 1.5 + 1.);
if (bw_long < bw) bw_long = bw;
@ -844,7 +856,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
tseq = (uint8_t*)kmalloc(km, tl);
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];

5
main.c
View File

@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
@ -178,6 +178,7 @@ int main(int argc, char *argv[])
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
else if (c == 'A') opt.a = atoi(o.arg);
else if (c == 'B') opt.b = atoi(o.arg);
else if (c == 'b') opt.transition = atoi(o.arg);
else if (c == 's') opt.min_dp_max = atoi(o.arg);
else if (c == 'C') opt.noncan = atoi(o.arg);
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
@ -367,7 +368,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-pb/map-ont/map-iclr-prerender/map-iclr - PacBio/Nanopore/ICLR vs reference mapping\n");
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");

View File

@ -153,6 +153,7 @@ typedef struct {
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int transition; // transition mismatch score (A:G, C:T)
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;

View File

@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->alt_drop = 0.15f;
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->transition = opt->b;
opt->sc_ambi = 1;
opt->zdrop = 400, opt->zdrop_inv = 200;
opt->end_bonus = -1;
@ -112,6 +113,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->occ_dist = 500;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
io->flag = 0, io->k = 15;
mo->b = 6, mo->transition = 1;
mo->q = 10, mo->q2 = 50;
} else if (strcmp(preset, "map-iclr") == 0) {
io->flag = 0, io->k = 19;
mo->b = 6, mo->transition = 4;
mo->q = 10, mo->q2 = 50;
} else if (strncmp(preset, "asm", 3) == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->bw = 1000, mo->bw_long = 100000;