r512: option to filter poorly aligned reads

This commit is contained in:
Heng Li 2017-10-16 10:38:22 -04:00
parent 858213d513
commit e6f525edaf
8 changed files with 131 additions and 38 deletions

56
align.c
View File

@ -117,7 +117,7 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq,
}
}
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *qual, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
{
uint32_t k, l, toff = 0, qoff = 0;
int32_t s = 0, max = 0, qshift, tshift;
@ -128,21 +128,30 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (op == 0) { // match/mismatch
int n_ambi = 0, n_diff = 0;
float n_diff2 = 0.0f;
for (l = 0; l < len; ++l) {
int cq = qseq[qoff + l], ct = tseq[toff + l];
if (ct > 3 || cq > 3) ++p->n_ambi;
else if (ct != cq) ++p->n_diff;
if (ct > 3 || cq > 3) ++n_ambi;
else if (ct != cq) {
++n_diff;
n_diff2 += qual == 0 || qual[qoff + l] >= 20? 1.0f : .05f * qual[qoff + l];
}
s += mat[ct * 5 + cq];
if (s < 0) s = 0;
else max = max > s? max : s;
}
p->n_ambi += n_ambi;
p->n_diff += n_diff;
toff += len, qoff += len, p->blen += len;
p->n_diff2 += n_diff2, p->blen2 += len - n_ambi;
} else if (op == 1) { // insertion
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
qoff += len, p->blen += len;
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
p->n_diff2 += 1.0f, ++p->blen2;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 2) { // deletion
@ -151,6 +160,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
if (tseq[toff + l] > 3) ++n_ambi;
toff += len, p->blen += len;
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
p->n_diff2 += 1.0f, ++p->blen2;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 3) { // intron
@ -335,7 +345,7 @@ static void mm_max_stretch(const mm_mapopt_t *opt, const mm_reg1_t *r, const mm1
*as = max_i, *cnt = max_len;
}
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], uint8_t *qual0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
{
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
@ -381,10 +391,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (is_sr) {
qs0 = 0, qe0 = qlen;
l = qs;
l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
rs0 = rs - l > 0? rs - l : 0;
l = qlen - qe;
l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0;
l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0;
re0 = re + l < mi->seq[rid].len? re + l : mi->seq[rid].len;
} else {
// compute rs0 and qs0
@ -523,7 +533,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
assert(re1 - rs1 <= re0 - rs0);
if (r->p) {
mm_idx_getseq(mi, rid, rs1, re1, tseq);
mm_update_extra(r, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
mm_update_extra(r, &qseq0[r->rev][qs1], qual0[r->rev]? &qual0[r->rev][qs1] : 0, tseq, mat, opt->q, opt->e);
if (rev && r->p->trans_strand)
r->p->trans_strand ^= 3; // flip to the read strand
}
@ -531,10 +541,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
kfree(km, tseq);
}
static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], uint8_t *qual0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
{
int tl, ql, score, ret = 0, q_off, t_off;
uint8_t *tseq, *qseq;
uint8_t *tseq, *qseq, *qual;
int8_t mat[25];
void *qp;
@ -552,6 +562,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
tseq = (uint8_t*)kmalloc(km, tl);
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
qseq = &qseq0[!r1->rev][qlen - r2->qs];
qual = qual0[!r1->rev]? &qseq0[!r1->rev][qlen - r2->qs] : 0;
mm_seq_rev(ql, qseq);
mm_seq_rev(tl, tseq);
@ -573,7 +584,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
r_inv->rid = r1->rid;
r_inv->qs = r1->qe + q_off, r_inv->qe = r_inv->qs + ez->max_q + 1;
r_inv->rs = r1->re + t_off, r_inv->re = r_inv->rs + ez->max_t + 1;
mm_update_extra(r_inv, qseq + q_off, tseq + t_off, mat, opt->q, opt->e);
mm_update_extra(r_inv, &qseq[q_off], qual? &qual[q_off] : 0, &tseq[t_off], mat, opt->q, opt->e);
ret = 1;
end_align1_inv:
kfree(km, tseq);
@ -590,20 +601,26 @@ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, m
return regs;
}
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, const char *qual, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
{
extern unsigned char seq_nt4_table[256];
int32_t i, n_regs = *n_regs_, n_a;
uint8_t *qseq0[2];
uint8_t *qseq0[2], *qual0[2];
ksw_extz_t ez;
// encode the query sequence
qseq0[0] = (uint8_t*)kmalloc(km, qlen);
qseq0[1] = (uint8_t*)kmalloc(km, qlen);
qseq0[0] = (uint8_t*)kmalloc(km, qlen * 2);
qseq0[1] = qseq0[0] + qlen;
for (i = 0; i < qlen; ++i) {
qseq0[0][i] = seq_nt4_table[(uint8_t)qstr[i]];
qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4;
}
if (qual) {
qual0[0] = (uint8_t*)kmalloc(km, qlen * 2);
qual0[1] = qual0[0] + qlen;
for (i = 0; i < qlen; ++i)
qual0[0][i] = qual0[1][qlen - 1 - i] = qual[i] - 33;
} else qual0[0] = qual0[1] = 0;
// align through seed hits
n_a = mm_squeeze_a(km, n_regs, regs, a);
@ -614,8 +631,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
mm_reg1_t s[2], s2[2];
int which, trans_strand;
s[0] = s[1] = regs[i];
mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR);
mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV);
mm_align1(km, opt, mi, qlen, qseq0, qual0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR);
mm_align1(km, opt, mi, qlen, qseq0, qual0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV);
if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1;
else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
@ -628,20 +645,21 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
}
regs[i].p->trans_strand = trans_strand;
} else { // one round of alignment
mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, n_a, a, &ez, opt->flag);
mm_align1(km, opt, mi, qlen, qseq0, qual0, &regs[i], &r2, n_a, a, &ez, opt->flag);
if (opt->flag&MM_F_SPLICE)
regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
}
if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
if (!(opt->flag&MM_F_SPLICE) && !(opt->flag&MM_F_SR) && i > 0) { // don't try inversion alignment for -xsplice or -xsr
if (mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
if (mm_align1_inv(km, opt, mi, qlen, qseq0, qual0, &regs[i-1], &regs[i], &r2, &ez)) {
regs = mm_insert_reg(&r2, i, &n_regs, regs);
++i; // skip the inserted INV alignment
}
}
}
*n_regs_ = n_regs;
kfree(km, qseq0[0]); kfree(km, qseq0[1]);
kfree(km, qseq0[0]);
if (qual0[0]) kfree(km, qual0[0]);
kfree(km, ez.cigar);
mm_filter_regs(km, opt, n_regs_, regs);
mm_hit_sort_by_dp(km, n_regs_, regs);

View File

@ -196,14 +196,15 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
{
int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S';
mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
if (r->iden_flt) mm_sprintf_lite(s, "\tom:i:%d", r->mapq);
if (r->p) {
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
}
mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
}
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
@ -303,8 +304,9 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1);
} else mm_sprintf_lite(s, "\t*\t0\t0\t*");
} else {
int mapq = !r->iden_flt? r->mapq : r->mapq < 3? r->mapq : 3;
this_rid = r->rid, this_pos = r->rs, this_rev = r->rev;
mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq);
mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, mapq);
if (r->p) { // actually this should always be true for SAM output
uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs;
int clip_char = (flag&0x800)? 'H' : 'S';

39
hit.c
View File

@ -239,6 +239,45 @@ void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *re
*n_regs = k;
}
void mm_filter_by_identity(void *km, int n_regs, mm_reg1_t *regs, float min_iden, int qlen, const char *qual) // TODO: make sure it is not beyond the ends of contigs
{
int i, j, n_aux = 0, en, blen = 0;
uint64_t *aux;
float n_diff = 0.0f;
if (n_regs <= 0) return;
for (i = 0; i < n_regs; ++i)
if (regs[i].id == regs[i].parent && regs[i].pe_thru) // sequenced through the fragment; don't filter
return;
for (i = 0; i < n_regs; ++i)
if (regs[i].id == regs[i].parent)
++n_aux;
assert(n_aux >= 1);
aux = (uint64_t*)kmalloc(km, n_aux * 8);
for (i = 0, n_aux = 0; i < n_regs; ++i)
if (regs[i].id == regs[i].parent)
aux[n_aux++] = (uint64_t)regs[i].qs<<32 | i;
radix_sort_64(aux, aux + n_aux);
for (i = 0, en = 0; i < n_aux; ++i) {
mm_reg1_t *r = &regs[(int32_t)aux[i]];
if (r->qs > en) {
for (j = en; j < r->qs; ++j)
n_diff += qual == 0 || qual[j] >= 53? 0.5f : 0.025f * (qual[j] - 33);
blen += r->qs - en;
}
assert(r->p);
blen += r->p->blen2;
n_diff += r->p->n_diff2;
en = en > r->qe? en : r->qe;
}
for (j = en; j < qlen; ++j)
n_diff += qual == 0 || qual[j] >= 53? 0.5f : 0.025f * (qual[j] - 33);
blen += qlen - en;
kfree(km, aux);
if (1.0f - n_diff / blen < min_iden)
for (i = 0; i < n_regs; ++i)
regs[i].iden_flt = 1;
}
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
{ // squeeze out regions in a[] that are not referenced by regs[]
int i, as = 0;

6
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.2-r511-dirty"
#define MM_VERSION "2.2-r512-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -65,7 +65,7 @@ static inline int64_t mm_parse_num(const char *str)
int main(int argc, char *argv[])
{
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:";
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:";
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx, max_gap_ref = 0;
@ -100,6 +100,7 @@ int main(int argc, char *argv[])
else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg);
else if (c == 'G') max_gap_ref = (int)mm_parse_num(optarg);
else if (c == 'F') opt.max_frag_len = (int)mm_parse_num(optarg);
else if (c == 'i') opt.min_iden = atof(optarg);
else if (c == 'N') opt.best_n = atoi(optarg);
else if (c == 'p') opt.pri_ratio = atof(optarg);
else if (c == 'M') opt.mask_level = atof(optarg);
@ -222,6 +223,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
fprintf(fp_help, " -i FLOAT min identity (mapQ reduced to 0 if below) [0]\n");
fprintf(fp_help, " Input/Output:\n");
fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
fprintf(fp_help, " -Q don't output base quality in SAM\n");

28
map.c
View File

@ -85,9 +85,9 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
io->is_hpc = 0, io->k = 21, io->w = 11;
mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS;
mo->pe_ori = 0<<1|1; // FR
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1;
mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1;
mo->zdrop = 100;
mo->end_bonus = 11;
mo->end_bonus = 15;
mo->max_frag_len = 800;
mo->max_gap = 100;
mo->bw = 100;
@ -251,10 +251,10 @@ static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_i
}
}
static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, int *n_regs, mm_reg1_t *regs, mm128_t *a)
static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, const char *qual, int *n_regs, mm_reg1_t *regs, mm128_t *a)
{
if (!(opt->flag & MM_F_CIGAR)) return regs;
regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
regs = mm_align_skeleton(km, opt, mi, qlen, seq, qual, n_regs, regs, a); // this calls mm_filter_regs()
if (!(opt->flag & MM_F_AVA)) {
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
@ -263,7 +263,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
return regs;
}
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, const char **quals, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
int i, j, rep_len, qlen_sum, n_regs0;
int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE);
@ -340,7 +340,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
chain_post(opt, max_chain_gap_ref, mi, b->km, qlen_sum, n_segs, qlens, &n_regs0, regs0, a);
if (n_segs == 1) { // uni-segment
regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a);
regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], quals? quals[0] : 0, &n_regs0, regs0, a);
mm_set_mapq(n_regs0, regs0, opt->min_chain_score, opt->a, rep_len);
n_regs[0] = n_regs0, regs[0] = regs0;
} else { // multi-segment
@ -349,13 +349,16 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
free(regs0);
for (i = 0; i < n_segs; ++i) {
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b); // update mm_reg1_t::parent
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], quals? quals[i] : 0, &n_regs[i], regs[i], seg[i].a);
mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len);
}
mm_seg_free(b->km, n_segs, seg);
if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR))
mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing
}
if (opt->min_iden > 0.0f)
for (i = 0; i < n_segs; ++i)
mm_filter_by_identity(b->km, n_regs[i], regs[i], opt->min_iden, qlens[i], quals[i]);
kfree(b->km, a);
kfree(b->km, u);
@ -364,7 +367,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
mm_reg1_t *regs;
mm_map_frag(mi, 1, &qlen, &seq, n_regs, &regs, b, opt, qname);
mm_map_frag(mi, 1, &qlen, &seq, 0, n_regs, &regs, b, opt, qname);
return regs;
}
@ -392,20 +395,22 @@ typedef struct {
static void worker_for(void *_data, long i, int tid) // kt_for() callback
{
step_t *s = (step_t*)_data;
int *qlens, j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori;
const char **qseqs;
int *qlens, j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori, is_sr = !!(s->p->opt->flag & MM_F_SR);
const char **qseqs, **quals = 0;
mm_tbuf_t *b = s->buf[tid];
if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
fprintf(stderr, "QR\t%s\t%d\n", s->seq[off].name, tid);
qlens = (int*)kmalloc(b->km, s->n_seg[i] * sizeof(int));
qseqs = (const char**)kmalloc(b->km, s->n_seg[i] * sizeof(const char**));
quals = (const char**)kmalloc(b->km, s->n_seg[i] * sizeof(const char**));
for (j = 0; j < s->n_seg[i]; ++j) {
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1))))
mm_revcomp_bseq(&s->seq[off + j]);
qlens[j] = s->seq[off + j].l_seq;
qseqs[j] = s->seq[off + j].seq;
quals[j] = is_sr? s->seq[off + j].qual : 0;
}
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, quals, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
int k, t;
@ -420,6 +425,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
}
kfree(b->km, qlens);
kfree(b->km, qseqs);
kfree(b->km, quals);
}
static void *worker_pipeline(void *shared, int step, void *in)

View File

@ -58,6 +58,8 @@ typedef struct {
uint32_t n_diff; // number of differences, including ambiguous bases
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
uint32_t n_cigar; // number of cigar operations in cigar[]
float n_diff2;
uint32_t blen2;
uint32_t cigar[];
} mm_extra_t;
@ -71,7 +73,7 @@ typedef struct {
int32_t as; // offset in the a[] array (for internal uses only)
int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate)
uint32_t mapq:8, split:2, n_sub:22; // mapQ; split pattern; number of suboptimal mappings
uint32_t sam_pri:1, proper_frag:1, dummy:30;
uint32_t sam_pri:1, proper_frag:1, iden_flt:1, pe_thru:1, dummy:29;
uint32_t hash;
mm_extra_t *p;
} mm_reg1_t;
@ -98,6 +100,7 @@ typedef struct {
float mask_level;
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
float min_iden;
int max_join_long, max_join_short;
int min_join_flank_sc;

View File

@ -64,7 +64,7 @@ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, const char *qual, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a);
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a);
@ -75,6 +75,7 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r);
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs);
void mm_filter_by_identity(void *km, int n_regs, mm_reg1_t *regs, float min_iden, int qlen, const char *qual);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r);
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len);

22
pe.c
View File

@ -42,6 +42,26 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int
}
}
void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs)
{
int s, i, n_pri[2], pri[2];
n_pri[0] = n_pri[1] = 0;
pri[0] = pri[1] = -1;
for (s = 0; s < 2; ++s)
for (i = 0; i < n_regs[s]; ++i)
if (regs[s][i].id == regs[s][i].parent)
++n_pri[s], pri[s] = i;
if (n_pri[0] == 1 && n_pri[1] == 1) {
mm_reg1_t *p = &regs[0][pri[0]];
mm_reg1_t *q = &regs[1][pri[1]];
if (p->rid == q->rid && p->rev == q->rev && abs(p->rs - q->rs) < 3 && abs(p->re - p->re) < 3
&& ((p->qs == 0 && qlens[1] - q->qe == 0) || (q->qs == 0 && qlens[0] - p->qe == 0)))
{
p->pe_thru = q->pe_thru = 1;
}
}
}
#include "ksort.h"
typedef struct {
@ -152,4 +172,6 @@ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc
kfree(km, a);
kfree(km, sc.a);
mm_set_pe_thru(qlens, n_regs, regs);
}