From 7dceae8c5d8a47592df747b94129cce4d7f351ad Mon Sep 17 00:00:00 2001 From: zzh Date: Fri, 16 Feb 2024 20:59:59 +0800 Subject: [PATCH] =?UTF-8?q?seed=E4=B8=89=E4=B8=AA=E6=AD=A5=E9=AA=A4?= =?UTF-8?q?=EF=BC=8C=E5=85=A8=E9=83=A8=E7=94=A8fmt=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=EF=BC=8C=E8=80=8C=E4=B8=94=E7=BB=93=E6=9E=9C=E4=B8=80=E8=87=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bwamem.c | 60 ++++++++++++++++++++++++++++++++++++------------------- fmt_idx.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ fmt_idx.h | 1 + run.sh | 10 +++++----- 4 files changed, 96 insertions(+), 26 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5a9d606..b0dd8a9 100644 --- a/bwamem.c +++ b/bwamem.c @@ -144,6 +144,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn int start_width = 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); int max_seed_len = 0; + int start_N_num = 0, start_flag = 1; a->mem.n = 0; // char *BASE = "ACGT"; // for (i = 0; i < len; ++i) @@ -157,8 +158,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn //fprintf(fp1, "seq: %ld\n", dn++); //fprintf(stderr, "seq: %ld\n", dn++); //dn ++; + //goto third_seed; + while (x < len) { if (seq[x] < 4) { + start_flag = 0; #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif @@ -172,22 +176,27 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn bwtintv_t *p = &a->mem1.a[i]; //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - int slen = (uint32_t)p->info - (p->info >> 32); // seed length - if (slen < 16) ++n16; - if (slen < 17) ++n17; - if (slen < 18) ++n18; - if (slen < 19) ++n19; - ++nall; + //int slen = (uint32_t)p->info - (p->info >> 32); // seed length + //if (slen < 16) ++n16; + //if (slen < 17) ++n17; + //if (slen < 18) ++n18; + //if (slen < 19) ++n19; + //++nall; max_seed_len = fmax(max_seed_len, slen); if (slen >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, *p); } - } else ++x; + } else { + ++x; + if (start_flag) + ++start_N_num; + } } // second pass: find MEMs inside a long SMEM - //if (max_seed_len == len) - //goto collect_intv_end; - + //if (max_seed_len == len - start_N_num) + // goto collect_intv_end; + // goto third_seed; + old_n = a->mem.n; for (k = 0; k < old_n; ++k) { bwtintv_t *p = &a->mem.a[k]; @@ -206,19 +215,20 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn bwtintv_t *p = &a->mem1.a[i]; //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - int slen = (uint32_t)p->info - (p->info >> 32); - if (slen < 16) ++n16; - if (slen < 17) ++n17; - if (slen < 18) ++n18; - if (slen < 19) ++n19; - ++nall; + //int slen = (uint32_t)p->info - (p->info >> 32); + //if (slen < 16) ++n16; + //if (slen < 17) ++n17; + //if (slen < 18) ++n18; + //if (slen < 19) ++n19; + //++nall; if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } } - - goto collect_intv_end; - + //if (max_seed_len == len - start_N_num) + // goto collect_intv_end; + //goto collect_intv_end; +third_seed: // third pass: LAST-like if (opt->max_mem_intv > 0) { x = 0; @@ -229,12 +239,20 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); + //x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); + x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_bwt_smem1a, tmp_time); #endif - if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); + //bwtintv_t *p = &m; + //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); + + if (m.x[2] > 0) { + kv_push(bwtintv_t, a->mem, m); + //bwtintv_t *p = &m; + //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); + } } else { // for now, we never come to this block which is slower x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) diff --git a/fmt_idx.c b/fmt_idx.c index 406a904..8a3f834 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -854,4 +854,55 @@ fmt_smem_end: if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); return ret; +} + +int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) +{ + int i, kmer_len; + bwtintv_t ik, ok1, ok2; + uint64_t qbit; + + + memset(mem, 0, sizeof(bwtintv_t)); + if (q[x] > 3) + return x + 1; + + qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); + + fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置 + ik.info = x + kmer_len; + +#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \ + if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \ + { \ + (ov) = (iv); \ + (ov).info = (uint64_t)start_pos << 32 | (end_pos + 1); \ + return end_pos + 1; \ + } + + for (i = (int)ik.info; i + 1 < len; i += 2) + { // forward search + if (q[i] < 4 && q[i + 1] < 4) + { + fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); + COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); + COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len); + ik = ok2; + } + else if (q[i] < 4) // q[i+1] >= 4 + { + fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); + COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); + return i + 2; + } + else // q[i] >= 4 + { + return i + 1; + } + } + if (i == len - 1) { + fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); + COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); + } + return len; } \ No newline at end of file diff --git a/fmt_idx.h b/fmt_idx.h index cbf3b5b..07efff6 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -131,4 +131,5 @@ void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos); // 找smem(seed) int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); +int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); #endif \ No newline at end of file diff --git a/run.sh b/run.sh index 59d46db..c116ec9 100755 --- a/run.sh +++ b/run.sh @@ -4,16 +4,16 @@ thread=1 #n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq #n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq #reference=~/data/reference/human_g1k_v37_decoy.fasta -n_r1=~/fastq/sn_r1.fq -n_r2=~/fastq/sn_r2.fq -#n_r1=~/fastq/tiny_n_r1.fq -#n_r2=~/fastq/tiny_n_r3.fq +#n_r1=~/fastq/sn_r1.fq +#n_r2=~/fastq/sn_r2.fq +n_r1=~/fastq/tiny_n_r1.fq +n_r2=~/fastq/tiny_n_r2.fq #n_r1=~/fastq/diff_r1.fq #n_r2=~/fastq/diff_r2.fq #n_r1=~/fastq/d_r1.fq #n_r2=~/fastq/d_r2.fq reference=~/reference/human_g1k_v37_decoy.fasta -out=./fmt_all.sam +out=./out.sam #out=/dev/null #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \