加入了获取reads和extension数据的代码

This commit is contained in:
zzh 2024-03-27 23:47:39 +08:00
parent 1e3965cb7d
commit dd03596997
6 changed files with 88 additions and 15 deletions

2
.gitignore vendored
View File

@ -2,6 +2,8 @@
*.txt *.txt
*.sam *.sam
*.log *.log
*.fa
dataset/
bwa bwa
test test
test64 test64

View File

@ -1482,7 +1482,23 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
#endif #endif
#ifdef FILTER_FULL_MATCH #ifdef FILTER_FULL_MATCH
if (max_seed_len == len - start_N_num) goto collect_intv_end; if (max_seed_len == len - start_N_num) {
#ifdef DEBUG_OUTPUT
if (start_N_num == 0) {
for (i = 0; i < len; ++i)
fprintf(gfp1, "%c", "ACGT"[seq[i]]);
fprintf(gfp1, "\n");
#ifdef SHOW_DATA_PERF
gdat[2]++;
if (gdat[2] % 100 == 0) {
fprintf(stderr, "reads num: %ld\n", gdat[2]);
}
#endif
}
#endif
goto collect_intv_end;
}
#endif #endif
// 2. second pass: find MEMs inside a long SMEM // 2. second pass: find MEMs inside a long SMEM

View File

@ -92,7 +92,8 @@ int64_t gdat[100];
#endif #endif
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
FILE * gfp1, *gfp2, *gfp3; FILE * gfp1, *gfp2, *gfp3, *gfp4;
FILE *gfq[4], *gft[4], *gfi[4];
#endif #endif
extern unsigned char nst_nt4_table[256]; extern unsigned char nst_nt4_table[256];
@ -414,6 +415,18 @@ int main_mem(int argc, char *argv[])
gfp1 = fopen("./fp1.txt", "w"); gfp1 = fopen("./fp1.txt", "w");
gfp2 = fopen("./fp2.txt", "w"); gfp2 = fopen("./fp2.txt", "w");
gfp3 = fopen("./fp3.txt", "w"); gfp3 = fopen("./fp3.txt", "w");
gfp4 = fopen("./fp4.txt", "w");
char fnbuf[1024];
int ii;
for (ii = 0; ii < 4; ++ii)
{
sprintf(fnbuf, "./ext_out/q_%d.fa", ii);
gfq[ii] = fopen(fnbuf, "w");
sprintf(fnbuf, "./ext_out/t_%d.fa", ii);
gft[ii] = fopen(fnbuf, "w");
sprintf(fnbuf, "./ext_out/i_%d.txt", ii);
gfi[ii] = fopen(fnbuf, "w");
}
#endif #endif
#ifdef SHOW_PERF #ifdef SHOW_PERF
@ -776,13 +789,15 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, "time_process_seq: %0.2lf s\n", (time_work_kernel + time_work_sam) * 1.0 / proc_freq); fprintf(stderr, "time_process_seq: %0.2lf s\n", (time_work_kernel + time_work_sam) * 1.0 / proc_freq);
fprintf(stderr, "time_seed_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg_seed_all * 1.0 / proc_freq / opt->n_threads, min_seed_all * 1.0 / proc_freq, max_seed_all * 1.0 / proc_freq); fprintf(stderr, "time_seed_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg_seed_all * 1.0 / proc_freq / opt->n_threads, min_seed_all * 1.0 / proc_freq, max_seed_all * 1.0 / proc_freq);
fprintf(stderr, "time_seed_1: %0.2lf s\n", avg_seed_1 * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_seed_2: %0.2lf s\n", avg_seed_2 * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_seed_3: %0.2lf s\n", avg_seed_3 * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_sa: %0.2lf s\n", avg_sa * 1.0 / proc_freq / opt->n_threads); fprintf(stderr, "time_sa: %0.2lf s\n", avg_sa * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_chain: %0.2lf s\n", avg_chain * 1.0 / proc_freq / opt->n_threads); fprintf(stderr, "time_chain: %0.2lf s\n", avg_chain * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_bsw_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg_bsw_all * 1.0 / proc_freq / opt->n_threads, min_bsw_all * 1.0 / proc_freq, max_bsw_all * 1.0 / proc_freq); fprintf(stderr, "time_bsw_all: %0.2lf (%0.2lf, %0.2lf) s\n", avg_bsw_all * 1.0 / proc_freq / opt->n_threads, min_bsw_all * 1.0 / proc_freq, max_bsw_all * 1.0 / proc_freq);
fprintf(stderr, "\n");
fprintf(stderr, "time_seed_1: %0.2lf s\n", avg_seed_1 * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_seed_2: %0.2lf s\n", avg_seed_2 * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_seed_3: %0.2lf s\n", avg_seed_3 * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_seed_chain: %0.2lf s\n", (avg_seed_all + avg_chain) * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "time_bsw: %0.2lf s\n", avg_bsw * 1.0 / proc_freq / opt->n_threads); fprintf(stderr, "time_bsw: %0.2lf s\n", avg_bsw * 1.0 / proc_freq / opt->n_threads);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -798,6 +813,13 @@ int main_mem(int argc, char *argv[])
fclose(gfp1); fclose(gfp1);
fclose(gfp2); fclose(gfp2);
fclose(gfp3); fclose(gfp3);
fclose(gfp4);
for (ii = 0; ii < 4; ++ii)
{
fclose(gfq[ii]);
fclose(gft[ii]);
fclose(gfi[ii]);
}
#endif #endif
return 0; return 0;

View File

@ -6,8 +6,9 @@
#include <immintrin.h> #include <immintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
#ifdef SHOW_PERF #ifdef DEBUG_OUTPUT
extern FILE *gfp1, *gfp2, *gfp3; extern FILE *gfp1, *gfp2, *gfp3;
extern FILE *gfq[4], *gft[4], *gfi[4];
#endif #endif
#define NO_VAL -1 #define NO_VAL -1
@ -172,6 +173,25 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
tmp = fA1; fA1 = fA2; fA2 = tmp; \ tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp; tmp = mA1; mA1 = mA2; mA2 = tmp;
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
*info_f = gfi[fnum];
const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
int i;
// 处理query
for (i = 0; i < qlen; ++i)
fprintf(query_f, "%c", seq_map[query[i]]);
fprintf(query_f, "\n");
// 处理target
for (i = 0; i < tlen; ++i)
fprintf(target_f, "%c", seq_map[target[i]]);
fprintf(target_f, "\n");
// 处理其他信息
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
}
int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度 int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列 const uint8_t *query, // read碱基序列
@ -201,6 +221,15 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
//fprintf(gfp1, "%d\n", qlen); //fprintf(gfp1, "%d\n", qlen);
if (qlen <= 30) {
write_query_target_sequence(qlen, query, tlen, target, h0, 0);
} else if (qlen < 60) {
write_query_target_sequence(qlen, query, tlen, target, h0, 1);
} else if (qlen < 90) {
write_query_target_sequence(qlen, query, tlen, target, h0, 2);
} else {
write_query_target_sequence(qlen, query, tlen, target, h0, 3);
}
#endif #endif
if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf); if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf);

17
run.sh
View File

@ -3,12 +3,14 @@ thread=1
#n_r2=~/data/fastq/dataset/na12878_wgs_150/bn2.fq #n_r2=~/data/fastq/dataset/na12878_wgs_150/bn2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq #n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq #n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
#n_r1=~/data/fastq/dataset/na12878_wgs_150/n1.fq #n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_150/n2.fq #n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_101/na_1.fq #n_r1=~/data/fastq/dataset/na12878_wgs_101/na_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/na_2.fq #n_r2=~/data/fastq/dataset/na12878_wgs_101/na_2.fq
n_r1=~/data/fastq/dataset/zy_wes/sn1.fq n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
n_r2=~/data/fastq/dataset/zy_wes/sn2.fq n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
#n_r1=~/data/fastq/dataset/zy_wes/bloodgDNA_r1.fastq
#n_r2=~/data/fastq/dataset/zy_wes/bloodgDNA_r2.fastq
#n_r1=~/data/fastq/zy/wgs/n1.fq #n_r1=~/data/fastq/zy/wgs/n1.fq
@ -54,12 +56,13 @@ n_r2=~/data/fastq/dataset/zy_wes/sn2.fq
#n_r2=~/fastq/diff_r2.fq #n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq #n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq #n_r2=~/fastq/d_r2.fq
reference=~/data/reference/human_g1k_v37_decoy.fasta reference=~/reference/human_g1k_v37_decoy.fasta
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#out=./all.sam #out=./all.sam
#out=./sn.sam #out=./sn.sam
#out=./ssn-x1.sam #out=./ssn-x1.sam
out=./out.sam #out=./out.sam
#out=/dev/null out=/dev/null
#out=./na12878.sam #out=./na12878.sam
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \

View File

@ -31,7 +31,7 @@
#include <stdio.h> #include <stdio.h>
#include <zlib.h> #include <zlib.h>
#define USE_RDTSC 0 #define USE_RDTSC 1
#ifdef SHOW_PERF #ifdef SHOW_PERF
extern uint64_t time_process_data, time_read, time_write, time_compute, extern uint64_t time_process_data, time_read, time_write, time_compute,
@ -48,7 +48,8 @@ extern int64_t gdat[100];
#endif #endif
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
extern FILE *gfp1, *gfp2, *gfp3; extern FILE *gfp1, *gfp2, *gfp3, *gfp4;
extern FILE *gfq[4], *gft[4], *gfi[4];
#endif #endif
#undef MAX #undef MAX