Illumina Complete Long Read presets (#1069)
* Implements a transition-aware alignment scoring scheme and configuration presets for ICLR * Fix to enable use of general scoring matrix in ksw as suggested by lh3 --------- Co-authored-by: koadman <>
This commit is contained in:
parent
e28a55be86
commit
ace990c381
|
|
@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
|
|||
```sh
|
||||
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
|
||||
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
|
||||
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
|
||||
```
|
||||
The difference between `map-pb` and `map-ont` is that `map-pb` uses
|
||||
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
|
||||
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
|
||||
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
|
||||
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
|
||||
Nanopore reads.
|
||||
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
|
||||
accounts for the low overall error rate in the reads, with transversion errors
|
||||
being less frequent than transitions.
|
||||
|
||||
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
|
||||
|
||||
|
|
|
|||
16
align.c
16
align.c
|
|
@ -21,6 +21,17 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
|
|||
mat[(m - 1) * m + j] = sc_ambi;
|
||||
}
|
||||
|
||||
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
|
||||
{
|
||||
assert(m==5);
|
||||
ksw_gen_simple_mat(m,mat,a,b,sc_ambi);
|
||||
transition = transition > 0? -transition : transition;
|
||||
mat[0*m+2]=transition; // A->G
|
||||
mat[1*m+3]=transition; // C->T
|
||||
mat[2*m+0]=transition; // G->A
|
||||
mat[3*m+1]=transition; // T->C
|
||||
}
|
||||
|
||||
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
|
||||
{
|
||||
uint32_t i;
|
||||
|
|
@ -323,6 +334,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|||
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
|
||||
fputc('\n', stderr);
|
||||
}
|
||||
if (opt->b != opt->transition) flag |= KSW_EZ_GENERIC_SC;
|
||||
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
|
||||
ksw_reset_extz(ez);
|
||||
ez->zdropped = 1;
|
||||
|
|
@ -586,7 +598,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
|
||||
r2->cnt = 0;
|
||||
if (r->cnt == 0) return;
|
||||
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
|
||||
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
||||
bw = (int)(opt->bw * 1.5 + 1.);
|
||||
bw_long = (int)(opt->bw_long * 1.5 + 1.);
|
||||
if (bw_long < bw) bw_long = bw;
|
||||
|
|
@ -844,7 +856,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|||
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
|
||||
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
|
||||
|
||||
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
|
||||
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
|
||||
tseq = (uint8_t*)kmalloc(km, tl);
|
||||
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
|
||||
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
|
||||
|
|
|
|||
5
main.c
5
main.c
|
|
@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
|
||||
ketopt_t o = KETOPT_INIT;
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
|
|
@ -178,6 +178,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
|
||||
else if (c == 'A') opt.a = atoi(o.arg);
|
||||
else if (c == 'B') opt.b = atoi(o.arg);
|
||||
else if (c == 'b') opt.transition = atoi(o.arg);
|
||||
else if (c == 's') opt.min_dp_max = atoi(o.arg);
|
||||
else if (c == 'C') opt.noncan = atoi(o.arg);
|
||||
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
|
||||
|
|
@ -367,7 +368,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " --version show version number\n");
|
||||
fprintf(fp_help, " Preset:\n");
|
||||
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
|
||||
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
|
||||
fprintf(fp_help, " - map-pb/map-ont/map-iclr-prerender/map-iclr - PacBio/Nanopore/ICLR vs reference mapping\n");
|
||||
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
|
||||
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
|
||||
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
|
||||
|
|
|
|||
|
|
@ -153,6 +153,7 @@ typedef struct {
|
|||
float alt_drop;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int transition; // transition mismatch score (A:G, C:T)
|
||||
int sc_ambi; // score when one or both bases are "N"
|
||||
int noncan; // cost of non-canonical splicing sites
|
||||
int junc_bonus;
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->alt_drop = 0.15f;
|
||||
|
||||
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
|
||||
opt->transition = opt->b;
|
||||
opt->sc_ambi = 1;
|
||||
opt->zdrop = 400, opt->zdrop_inv = 200;
|
||||
opt->end_bonus = -1;
|
||||
|
|
@ -112,6 +113,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mo->occ_dist = 500;
|
||||
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
|
||||
mo->min_dp_max = 200;
|
||||
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
|
||||
io->flag = 0, io->k = 15;
|
||||
mo->b = 6, mo->transition = 1;
|
||||
mo->q = 10, mo->q2 = 50;
|
||||
} else if (strcmp(preset, "map-iclr") == 0) {
|
||||
io->flag = 0, io->k = 19;
|
||||
mo->b = 6, mo->transition = 4;
|
||||
mo->q = 10, mo->q2 = 50;
|
||||
} else if (strncmp(preset, "asm", 3) == 0) {
|
||||
io->flag = 0, io->k = 19, io->w = 19;
|
||||
mo->bw = 1000, mo->bw_long = 100000;
|
||||
|
|
|
|||
Loading…
Reference in New Issue