diff --git a/.gitignore b/.gitignore index 09de0bd..e32c3fb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# for fast-markdup +*.sam +*.log # ---> C++ # Prerequisites *.d diff --git a/.vscode/launch.json b/.vscode/launch.json index ce739e7..08f219a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,10 +13,10 @@ "program": "${workspaceRoot}/build/bin/picard_cpp", "args": [ "MarkDuplicates", - "--INPUT", "/mnt/d/data/100w.bam", + "--INPUT", "~/data/bam/100w.bam", "--OUTPUT", "out.bam", "--METRICS_FILE", "metrics.txt", - "--num_threads", "12", + "--num_threads", "1", "--max_mem", "4G", "--verbosity", "DEBUG", "--asyncio", "true", diff --git a/.vscode/settings.json b/.vscode/settings.json index 37e59d1..90de365 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -88,6 +88,13 @@ "typeindex": "cpp", "typeinfo": "cpp", "valarray": "cpp", - "variant": "cpp" + "variant": "cpp", + "__split_buffer": "cpp", + "executor": "cpp", + "io_context": "cpp", + "netfwd": "cpp", + "timer": "cpp", + "__nullptr": "cpp", + "__node_handle": "c" } } \ No newline at end of file diff --git a/build.sh b/build.sh index b4dd51f..a71b197 100755 --- a/build.sh +++ b/build.sh @@ -1,8 +1,8 @@ #!/bin/bash -dir="/home/zzh/work/GeneKit/picard_cpp/build" +dir="/home/zzh/work/ngs/picard_cpp/build" [ -d "$dir" ] && rm -rf "$dir" mkdir "$dir" cd "$dir" #cmake .. -DCMAKE_BUILD_TYPE=Debug cmake .. -DCMAKE_BUILD_TYPE=Release -make -j 8 +make -j 16 diff --git a/run.sh b/run.sh index a2fc5c3..46d89c0 100755 --- a/run.sh +++ b/run.sh @@ -1,9 +1,14 @@ -time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \ +input=~/data/bam/zy_normal.bam +#input=~/data/bam/zy_tumor.bam +#input=~/data/bam/100w.bam + +time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ MarkDuplicates \ - --INPUT /mnt/d/data/zy_tumor.bam \ - --OUTPUT /mnt/d/data/out.bam \ + --INPUT $input \ + --OUTPUT ~/data/bam/out.bam \ + --INDEX_FORMAT BAI \ --num_threads 1 \ - --max_mem 4G \ + --max_mem 2G \ --verbosity DEBUG \ --asyncio true #\ #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" @@ -12,4 +17,4 @@ time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \ # --INPUT /mnt/d/data/100w.bam \ # --INPUT /mnt/d/data/zy_normal.bam \ # zy_tumor -# tumor_region \ No newline at end of file +# tumor_region diff --git a/src/common/hts/bam_buf.cpp b/src/common/hts/bam_buf.cpp index 685cea9..dfe509e 100755 --- a/src/common/hts/bam_buf.cpp +++ b/src/common/hts/bam_buf.cpp @@ -13,116 +13,87 @@ * BamBuf类 */ // 读取数据直到读完,或者缓冲区满 -int BamBuf::ReadBam() -{ +int BamBuf::ReadBam() { int read_num = 0; - if (handle_last) - { // 处理上次读入的最后一个bam - if (has_enough_space()) - { // 必须调用,在边界处调整memffset + if (handle_last) { // 处理上次读入的最后一个bam + if (has_enough_space()) { // 必须调用,在边界处调整memffset ++read_num; append_one_bam(); - } - else - { - return read_num; // 还是没空间 + } else { + return read_num; // 还是没空间 } } - while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) - { + while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) { bw->end_pos_ = BamWrap::BamEndPos(bw->b); - if (has_enough_space()) - { // 还有空间 + if (has_enough_space()) { // 还有空间 append_one_bam(); - ++read_num; // 放进缓存才算读取到 - } - else - { + ++read_num; // 放进缓存才算读取到 + } else { break; } } - if (read_stat_ >= 0) - { + if (read_stat_ >= 0) { handle_last = true; - } - else - { + } else { handle_last = false; } return read_num; } // 初始化缓存 -void BamBuf::Init(samFile *fp, - sam_hdr_t *hdr, - int64_t mem_size) -{ +void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { this->fp = fp; this->hdr = hdr; this->mem_size = mem_size; this->mem = (uint8_t *)malloc(mem_size); this->bw = (BamWrap *)malloc(sizeof(BamWrap)); this->bw->b = bam_init1(); - if (bw == NULL || - this->mem == NULL || - this->bw->b == NULL) - { + if (bw == NULL || this->mem == NULL || this->bw->b == NULL) { fprintf(stderr, "allocate memory failed! Abort\n"); exit(-1); } } -void BamBuf::ClearBeforeIdx(size_t idxInBv) -{ +void BamBuf::ClearBeforeIdx(size_t idxInBv) { if (idxInBv < 1) return; int i = 0, j = idxInBv; - for (; j < bv.size(); ++i, ++j) - { + for (; j < bv.size(); ++i, ++j) { bv[i] = bv[j]; } bv.resize(i); prepare_read(); } -void BamBuf::ClearAll() -{ - +void BamBuf::ClearAll() { bv.clear(); prepare_read(); } // 为下一次读取做准备, 计算一些边界条件 -inline void BamBuf::prepare_read() -{ +inline void BamBuf::prepare_read() { // 计算余留的下次计算可能用到的bam所占的位置 - if (bv.size() > 0) - { + if (bv.size() > 0) { BamWrap *bw = bv[0]; legacy_start = (int64_t)bw - (int64_t)mem; bw = bv.back(); legacy_end = (int64_t)bw + bw->length() - (int64_t)mem; - } - else - { + } else { legacy_start = legacy_end = 0; - mem_offset = 0; // 上次没剩下,那就从头存储 + mem_offset = 0; // 上次没剩下,那就从头存储 } } // 检查缓存是否还有空间 -inline bool BamBuf::has_enough_space() -{ +inline bool BamBuf::has_enough_space() { const uint32_t bam_len = bw->length(); int64_t potential_end = mem_offset + bam_len; if (legacy_start <= legacy_end) legacy_start += mem_size; - if (potential_end >= legacy_start) - { + if (potential_end >= legacy_start) { return false; } - if (potential_end >= mem_size) - { + if (potential_end >= mem_size) { mem_offset = 0; } int64_t virtual_offset = mem_offset; @@ -133,8 +104,7 @@ inline bool BamBuf::has_enough_space() } // 处理一个读取后的bam -inline void BamBuf::append_one_bam() -{ +inline void BamBuf::append_one_bam() { BamWrap *bwp = (BamWrap *)(mem + mem_offset); *bwp = *bw; bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp)); @@ -148,12 +118,9 @@ inline void BamBuf::append_one_bam() } // 处理上次读入的最后一个read -inline bool BamBuf::handle_last_read() -{ - if (handle_last) - { // 处理上次读入的最后一个bam - if (has_enough_space()) - { // 必须调用,在边界处调整memffset +inline bool BamBuf::handle_last_read() { + if (handle_last) { // 处理上次读入的最后一个bam + if (has_enough_space()) { // 必须调用,在边界处调整memffset append_one_bam(); handle_last = false; return true; @@ -166,49 +133,35 @@ inline bool BamBuf::handle_last_read() * AsyncIoBamBuf 类 */ // 初始化缓存 -void AsyncIoBamBuf::Init(samFile *fp, - sam_hdr_t *hdr, - int64_t mem_size) -{ - if (use_async_io_) - { +void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { + if (use_async_io_) { buf1_.Init(fp, hdr, mem_size >> 1); buf2_.Init(fp, hdr, mem_size >> 1); pi_ = &buf1_; po_ = &buf2_; tid_ = (pthread_t *)malloc(sizeof(pthread_t)); - } - else - { + } else { buf1_.Init(fp, hdr, mem_size); pi_ = &buf1_; } } // 读取数据 -int AsyncIoBamBuf::ReadBam() -{ - if (use_async_io_) - { +int AsyncIoBamBuf::ReadBam() { + if (use_async_io_) { hasThread = true; return async_read_bam(); - } - else - { + } else { return sync_read_bam(); } } -int AsyncIoBamBuf::sync_read_bam() -{ +int AsyncIoBamBuf::sync_read_bam() { int read_num = 0; - if (clear_all_) - { + if (clear_all_) { clear_all_ = false; pi_->ClearAll(); - } - else if (clear_before_idx_ > 0) - { + } else if (clear_before_idx_ > 0) { pi_->ClearBeforeIdx(clear_before_idx_); clear_before_idx_ = 0; } @@ -217,23 +170,18 @@ int AsyncIoBamBuf::sync_read_bam() return read_num; } -int AsyncIoBamBuf::async_read_bam() -{ +int AsyncIoBamBuf::async_read_bam() { int read_num = 0; - if (first_read_) - { + if (first_read_) { read_num = pi_->ReadBam(); first_read_ = false; refresh_bam_arr(); - } - else - { + } else { // join, 交换缓冲区指针 pthread_join(*tid_, 0); resize_buf(); - if (need_read_) - { // 需要交换指针 + if (need_read_) { // 需要交换指针 BamBuf *tmp = pi_; pi_ = po_; po_ = tmp; @@ -246,72 +194,52 @@ int AsyncIoBamBuf::async_read_bam() return read_num; } -void *AsyncIoBamBuf::async_read(void *data) -{ +void *AsyncIoBamBuf::async_read(void *data) { AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data; - if (ab->need_read_ && ab->ReadStat() >= 0) - { // 需要读取 + if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取 ab->last_read_num_ = ab->po_->ReadBam(); - } - else - { + } else { ab->last_read_num_ = 0; } pthread_exit(0); } -// 为下一次读取做准备, 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取 -void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) -{ +// 为下一次读取做准备, +// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取 +void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; } // 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取 -void AsyncIoBamBuf::ClearAll() -{ - clear_all_ = true; -} +void AsyncIoBamBuf::ClearAll() { clear_all_ = true; } -inline void AsyncIoBamBuf::resize_buf() -{ - if (clear_all_) - { // 清理上一轮的数据 +inline void AsyncIoBamBuf::resize_buf() { + if (clear_all_) { // 清理上一轮的数据 clear_all_ = false; po_->ClearBeforeIdx(legacy_size_); pi_->ClearAll(); - if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存 - { + if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 last_read_num_ += 1; - legacy_size_ = pi_->Size(); // 应该只有一个read + legacy_size_ = pi_->Size(); // 应该只有一个read need_read_ = true; - } - else // 没空间存放,则不交换指针,或者文件已经读取完毕 - { + } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 legacy_size_ = 0; need_read_ = false; } - } - else if (clear_before_idx_ > 0) - { - if (clear_before_idx_ < legacy_size_) - { + } else if (clear_before_idx_ > 0) { + if (clear_before_idx_ < legacy_size_) { po_->ClearBeforeIdx(clear_before_idx_); legacy_size_ -= clear_before_idx_; // 不需要交换指针,不需要读取 need_read_ = false; - } - else - { + } else { po_->ClearBeforeIdx(legacy_size_); pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_); - if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存 - { + if (pi_->handle_last_read()) {// 上次读取有一个read没放入缓存 last_read_num_ += 1; - legacy_size_ = pi_->Size(); // 应该只有一个read + legacy_size_ = pi_->Size(); // 应该只有一个read need_read_ = true; - } - else // 没空间存放,则不交换指针,或者文件已经读取完毕 - { + } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 legacy_size_ = 0; need_read_ = false; } diff --git a/src/common/hts/bam_buf.h b/src/common/hts/bam_buf.h index 61c0934..5ea0c99 100644 --- a/src/common/hts/bam_buf.h +++ b/src/common/hts/bam_buf.h @@ -10,19 +10,18 @@ #ifndef BAM_BUF_H_ #define BAM_BUF_H_ -#include -#include +#include +#include #include +#include #include #include -#include -#include +#include +#include #include #include -#include - -#include +#include #include "bam_wrap.h" @@ -32,24 +31,22 @@ using namespace std; /* * 存放读入的bam数据 */ -struct BamBuf -{ - sam_hdr_t *hdr; // sam文件的header信息 - samFile *fp; // sam文件指针 - BamWrap *bw = nullptr; // 用来循环读入bam - uint8_t *mem = nullptr; // 用来存放bam的数据, 程序结束后自动释放,所以没在析构函数里释放 - int64_t mem_offset = 0; // 下一次要存放的位置 - int64_t mem_size; // 缓存大小 - int read_stat_ = 0; // 读取状态,是否读完 - vector bv; // 方便对bam数据的访问 - int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间 - int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间 - bool handle_last = false; // 上次最后读入的bam是否需要处理 +struct BamBuf { + sam_hdr_t *hdr; // sam文件的header信息 + samFile *fp; // sam文件指针 + BamWrap *bw = nullptr; // 用来循环读入bam + uint8_t *mem = nullptr; // 用来存放bam的数据, + // 程序结束后自动释放,所以没在析构函数里释放 + int64_t mem_offset = 0; // 下一次要存放的位置 + int64_t mem_size; // 缓存大小 + int read_stat_ = 0; // 读取状态,是否读完 + vector bv; // 方便对bam数据的访问 + int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间 + int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间 + bool handle_last = false; // 上次最后读入的bam是否需要处理 // 初始化缓存 - void Init(samFile *fp, - sam_hdr_t *hdr, - int64_t mem_size); + void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); // 读取数据直到读完,或者缓冲区满 int ReadBam(); // 为下一次读取做准备, 计算一些边界条件 @@ -57,27 +54,24 @@ struct BamBuf // 清空上一次所有读入的数据 void ClearAll(); inline int64_t Size() { return bv.size(); } // 包含多少个read - inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全) - ~BamBuf() - { - if (this->mem != nullptr) - { + inline int ReadStat() { + return read_stat_; + } // 文件的读取状态,是否可读(读取完全) + ~BamBuf() { + if (this->mem != nullptr) { free(this->mem); } - if (this->bw != nullptr) - { + if (this->bw != nullptr) { bam_destroy1(bw->b); free(bw); } } - void FreeMemory() // 释放开辟的内存 + void FreeMemory() // 释放开辟的内存 { - if (this->mem != nullptr) - { + if (this->mem != nullptr) { free(this->mem); } - if (this->bw != nullptr) - { + if (this->bw != nullptr) { bam_destroy1(bw->b); free(bw); } @@ -102,31 +96,31 @@ struct BamBuf /* * io异步缓冲区 */ -struct AsyncIoBamBuf -{ +struct AsyncIoBamBuf { BamBuf buf1_; BamBuf buf2_; - BamBuf *pi_; // 当前用的buf - BamBuf *po_; // 后台在读取的buf + BamBuf *pi_; // 当前用的buf + BamBuf *po_; // 后台在读取的buf pthread_t *tid_ = NULL; bool hasThread = false; - int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量 + int64_t legacy_size_ = + 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量 bool first_read_ = true; - int last_read_num_ = 0; // 上一次读取了多少reads + int last_read_num_ = 0; // 上一次读取了多少reads bool need_read_ = true; bool use_async_io_ = true; - int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads - bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads + int64_t clear_before_idx_ = + 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads + bool clear_all_ = + false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads - vector bam_arr_; // 用来访问buf中的bam + vector bam_arr_; // 用来访问buf中的bam AsyncIoBamBuf() {} AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {} // 析构 - ~AsyncIoBamBuf() - { - if (tid_ != NULL) - { + ~AsyncIoBamBuf() { + if (tid_ != NULL) { if (hasThread) pthread_join(*tid_, 0); free(tid_); @@ -136,35 +130,29 @@ struct AsyncIoBamBuf } // 初始化缓存 - void Init(samFile *fp, - sam_hdr_t *hdr, - int64_t mem_size); + void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); // 读取数据 int ReadBam(); // 为下一次读取做准备, 计算一些边界条件 void ClearBeforeIdx(size_t idxInBv); - vector &GetBamArr() { return bam_arr_; } // 获取bam array + vector &GetBamArr() { return bam_arr_; } // 获取bam array // 清空上一次所有读入的数据 void ClearAll(); // 包含的read数量 inline int64_t Size() { return legacy_size_ + pi_->Size(); } inline int ReadStat() { return pi_->read_stat_; } - inline BamWrap *operator[](int64_t pos) - { - return bam_arr_[pos]; - } + inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; } // 获取某一段reads - inline vector Slice(size_t startIdx, size_t endIdx) - { + inline vector Slice(size_t startIdx, size_t endIdx) { if (endIdx > startIdx) { auto begItr = bam_arr_.begin(); - return std::move(vector(begItr + startIdx, begItr + endIdx)); + return std::move( + vector(begItr + startIdx, begItr + endIdx)); } return std::move(vector()); } - void FreeMemory() - { + void FreeMemory() { buf1_.FreeMemory(); buf2_.FreeMemory(); } @@ -176,11 +164,9 @@ struct AsyncIoBamBuf // 异步读取线程函数 static void *async_read(void *data); void resize_buf(); - inline void refresh_bam_arr() - { + inline void refresh_bam_arr() { bam_arr_.resize(this->Size()); - for (int i = 0; i < bam_arr_.size(); ++i) - { + for (int i = 0; i < bam_arr_.size(); ++i) { if (i < legacy_size_) bam_arr_[i] = (*po_)[i]; else diff --git a/src/common/hts/bam_wrap.h b/src/common/hts/bam_wrap.h index d338256..1562514 100644 --- a/src/common/hts/bam_wrap.h +++ b/src/common/hts/bam_wrap.h @@ -9,15 +9,14 @@ #ifndef BAM_WRAP_H_ #define BAM_WRAP_H_ -#include -#include -#include -#include - +#include #include #include -#include +#include +#include +#include +#include using namespace std; @@ -28,52 +27,32 @@ using namespace std; /* * sam read的封装 */ -struct BamWrap -{ +struct BamWrap { // 将contig左移后加上pos作为全局位置 - const static int MAX_CONTIG_LEN_SHIFT = 30; + const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起 const static int READ_MAX_LENGTH = 200; - const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系 + const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系 // 成员变量尽量少,减少占用内存空间 bam1_t *b; - int64_t end_pos_; // bam的全局结束位置, 闭区间 + int64_t end_pos_; // bam的全局结束位置, 闭区间 // 全局开始位置 - inline int64_t start_pos() - { - return bam_global_pos(b); - } + inline int64_t start_pos() { return bam_global_pos(b); } // 全局结束位置 - inline int64_t end_pos() - { - return end_pos_; - } + inline int64_t end_pos() { return end_pos_; } // 和reference对应的序列长度 - inline int16_t read_len() - { - return (end_pos_ - start_pos() + 1); - } + inline int16_t read_len() { return (end_pos_ - start_pos() + 1); } // 在contig内的开始位置 - inline int32_t contig_pos() - { - return b->core.pos; - } + inline int32_t contig_pos() { return b->core.pos; } // 在contig内部的结束位置 - inline int32_t contig_end_pos() - { - return bam_pos(end_pos_); - } + inline int32_t contig_end_pos() { return bam_pos(end_pos_); } // 序列的长度(AGTC字母个数) - inline int16_t seq_len() - { - return b->core.l_qseq; - } + inline int16_t seq_len() { return b->core.l_qseq; } // 算上开头的softclip - inline int32_t softclip_start() - { + inline int32_t softclip_start() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; const char c = bam_cigar_opchr(cigar[0]); @@ -84,8 +63,7 @@ struct BamWrap } // 算上结尾的softclip - inline int32_t softclip_end() - { + inline int32_t softclip_end() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]); @@ -96,8 +74,7 @@ struct BamWrap } // 算上结尾的softclip - inline int32_t right_softclip_len() - { + inline int32_t right_softclip_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]); @@ -108,14 +85,12 @@ struct BamWrap } // 获取序列 - inline std::string sequence() - { + inline std::string sequence() { ostringstream oss; char *seq = (char *)bam_get_seq(b); const bam1_core_t &bc = b->core; const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; - for (int i = 0; i < bc.l_qseq; ++i) - { + for (int i = 0; i < bc.l_qseq; ++i) { char base = base_to_char[bam_seqi(seq, i)]; oss << base; } @@ -123,18 +98,13 @@ struct BamWrap } // 获取名字 - inline const char *query_name() - { - return bam_get_qname(b); - } + inline const char *query_name() { return bam_get_qname(b); } // 获取cigar 字符串 - inline string cigar_str() - { + inline string cigar_str() { ostringstream oss; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); oss << len << c; @@ -143,21 +113,14 @@ struct BamWrap } // 占用的内存大小 - inline int16_t length() - { - return sizeof(*this) + - sizeof(bam1_t) + - b->l_data; - } + inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; } // 获取cigar中insert的总长度 - inline int32_t insert_cigar_len() - { + inline int32_t insert_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int ret = 0; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'I') @@ -167,13 +130,11 @@ struct BamWrap } // 获取cigar中delete的总长度 - inline int32_t del_cigar_len() - { + inline int32_t del_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int ret = 0; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'D') @@ -183,13 +144,11 @@ struct BamWrap } // 计算sam read的终点位置 - static inline int64_t BamEndPos(const bam1_t *b) - { + static inline int64_t BamEndPos(const bam1_t *b) { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int start_offset = -1; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X') @@ -198,31 +157,22 @@ struct BamWrap return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset)); }; - bool HasWellDefinedFragmentSize() - { + bool HasWellDefinedFragmentSize() { const bam1_core_t &bc = b->core; bool hasWellDefinedFragmentSize = true; - if (bc.isize == 0 || - !(bc.flag & BAM_FPAIRED) || - ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) || - ((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) - { + if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) || + ((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) { hasWellDefinedFragmentSize = false; - } - else if (bc.flag & BAM_FREVERSE) - { + } else if (bc.flag & BAM_FREVERSE) { hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false; - } - else - { + } else { hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false; } return hasWellDefinedFragmentSize; } // 计算bam的adapterBoundary - int GetAdapterBoundary() - { + int GetAdapterBoundary() { const bam1_core_t &bc = b->core; int adapterBoundary; if (!HasWellDefinedFragmentSize()) @@ -230,39 +180,34 @@ struct BamWrap else if (bc.flag & BAM_FREVERSE) adapterBoundary = bc.mpos - 1; else - adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1 + adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1 return adapterBoundary; } // 获取开头的I的长度 - inline int GetHeadInsertLen() - { + inline int GetHeadInsertLen() { int insLen = 0; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); - if (c == 'I') - { + if (c == 'I') { insLen = len; break; - } - else if (c != 'H' && c != 'S') + } else if (c != 'H' && c != 'S') break; } return insLen; } - // 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, 注意开头的I要当做S?) - inline int64_t GetSoftStart() - { + // 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, + // 注意开头的I要当做S?) + inline int64_t GetSoftStart() { int64_t softStart = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'S' || c == 'I') @@ -274,13 +219,11 @@ struct BamWrap } // 获取unclipped开始位置(包括hardclip) - inline int64_t GetUnclippedStart() - { + inline int64_t GetUnclippedStart() { int64_t start = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'S' || c == 'H') @@ -292,13 +235,11 @@ struct BamWrap } // 获取unclipped结束位置(包括hardclip) - inline int64_t GetUnclippedEnd() - { + inline int64_t GetUnclippedEnd() { int64_t end_pos = bam_endpos(b); const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - for (int i = bc.n_cigar - 1; i >= 0; --i) - { + for (int i = bc.n_cigar - 1; i >= 0; --i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'S' || c == 'H') @@ -311,12 +252,10 @@ struct BamWrap /* 获取碱基质量分数的加和 */ /** Calculates a score for the read which is the sum of scores over Q15. */ - inline int GetSumOfBaseQualities() - { + inline int GetSumOfBaseQualities() { int score = 0; uint8_t *qual = bam_get_qual(b); - for (int i = 0; i < b->core.l_qseq; ++i) - { + for (int i = 0; i < b->core.l_qseq; ++i) { if (qual[i] >= 15) score += qual[i]; } @@ -327,93 +266,63 @@ struct BamWrap /* 与flag相关的检测 */ /* 没有比对上 unmapped */ - inline bool GetReadUnmappedFlag() - { - return b->core.flag & BAM_FUNMAP; - } + inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; } /* Template having multiple segments in sequencing */ - inline bool GetReadPairedFlag() - { - return b->core.flag & BAM_FPAIRED; - } + inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; } /** * the read fails platform/vendor quality checks. */ - inline bool GetReadFailsVendorQualityCheckFlag() - { - return b->core.flag & BAM_FQCFAIL; - } + inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; } /** * the mate is unmapped. */ - bool GetMateUnmappedFlag() - { - return b->core.flag & BAM_FMUNMAP; - } + bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; } /** - * @return whether the alignment is secondary (an alternative alignment of the read). + * @return whether the alignment is secondary (an alternative alignment of + * the read). */ - bool IsSecondaryAlignment() - { - return b->core.flag & BAM_FSECONDARY; - } + bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; } /** - * @return whether the alignment is supplementary (a split alignment such as a chimeric alignment). + * @return whether the alignment is supplementary (a split alignment such as + * a chimeric alignment). */ - bool GetSupplementaryAlignmentFlag() - { - return b->core.flag & BAM_FSUPPLEMENTARY; - } + bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; } /* - * Tests if this record is either a secondary and/or supplementary alignment; + * Tests if this record is either a secondary and/or supplementary + * alignment; */ - bool IsSecondaryOrSupplementary() - { - return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); - } + bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); } /** * the read is the first read in a pair. */ - bool GetFirstOfPairFlag() - { - return b->core.flag & BAM_FREAD1; - } + bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; } /** * strand of the query (false for forward; true for reverse strand). */ - bool GetReadNegativeStrandFlag() - { - return b->core.flag & BAM_FREVERSE; - } + bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; } /** * strand of the mate (false for forward; true for reverse strand). */ - bool GetMateNegativeStrandFlag() - { - return b->core.flag & BAM_FMREVERSE; - } + bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; } /* 其他的一些信息 */ - inline int GetReferenceLength() - { + inline int GetReferenceLength() { int length = 0; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - for (int i = 0; i < bc.n_cigar; ++i) - { + for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); - switch (c) - { + switch (c) { case 'M': case 'D': case 'N': @@ -429,24 +338,20 @@ struct BamWrap } // 计算bam的全局位置,算上染色体序号和比对位置 - static inline int64_t bam_global_pos(bam1_t *b) - { + static inline int64_t bam_global_pos(bam1_t *b) { return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos); } - static inline int64_t bam_global_pos(int tid, int pos) - { + static inline int64_t bam_global_pos(int tid, int pos) { return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos); } // 根据全局位置获取bam的染色体序号 - static inline int32_t bam_tid(int64_t global_pos) - { + static inline int32_t bam_tid(int64_t global_pos) { const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1); const int64_t high_tid = global_pos & mask; return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT); } // 根据全局位置获取bam的比对位置(染色体内) - static inline int32_t bam_pos(int64_t global_pos) - { + static inline int32_t bam_pos(int64_t global_pos) { const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; return (int32_t)(global_pos & mask); } diff --git a/src/common/utils/debug.cpp b/src/common/utils/debug.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/common/utils/debug.h b/src/common/utils/debug.h new file mode 100644 index 0000000..e69de29 diff --git a/src/common/utils/global_arg.cpp b/src/common/utils/global_arg.cpp index 1588147..d707dd6 100644 --- a/src/common/utils/global_arg.cpp +++ b/src/common/utils/global_arg.cpp @@ -23,8 +23,7 @@ using std::vector; struct option *GlobalArg::GLOBAL_OPT = nullptr; // 初始化参数 -void GlobalArg::initGlobalOptions() -{ +void GlobalArg::initGlobalOptions() { vector v; v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件 v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件 @@ -51,59 +50,53 @@ void GlobalArg::initGlobalOptions() } // 解析参数 -void GlobalArg::parseArgument(int argNum) -{ +void GlobalArg::parseArgument(int argNum) { using namespace ns_ga; - switch (argNum) - { - case OPT_INPUT: - in_fn = optarg; - break; - case OPT_OUTPUT: - out_fn = optarg; - break; - case OPT_NUM_THREADS: - num_threads = std::stoi(optarg); - break; - case OPT_MAX_MEM: - { - char *q; - size_t mem_arg = strtol(optarg, &q, 0); - if (*q == 'k' || *q == 'K') - mem_arg <<= 10; - else if (*q == 'm' || *q == 'M') - mem_arg <<= 20; - else if (*q == 'g' || *q == 'G') - mem_arg <<= 30; - if (mem_arg >= max_mem) - max_mem = mem_arg; - else - { - std::cerr << "[Warn] Too small mem size, use default" << std::endl; - } - break; + switch (argNum) { + case OPT_INPUT: + in_fn = optarg; + break; + case OPT_OUTPUT: + out_fn = optarg; + break; + case OPT_NUM_THREADS: + num_threads = std::stoi(optarg); + break; + case OPT_MAX_MEM: { + char *q; + size_t mem_arg = strtol(optarg, &q, 0); + if (*q == 'k' || *q == 'K') + mem_arg <<= 10; + else if (*q == 'm' || *q == 'M') + mem_arg <<= 20; + else if (*q == 'g' || *q == 'G') + mem_arg <<= 30; + if (mem_arg >= max_mem) + max_mem = mem_arg; + else { + std::cerr << "[Warn] Too small mem size, use default" << std::endl; } - case OPT_LOG_LEVEL: - { - if (strcmp("ERROR", optarg) == 0) - verbosity = ns_ga::ERROR; - else if (strcmp("WARNING", optarg) == 0) - verbosity = ns_ga::WARNING; - else if (strcmp("INFO", optarg) == 0) - verbosity = ns_ga::INFO; - else if (strcmp("DEBUG", optarg) == 0) - verbosity = ns_ga::DEBUG; - break; - } - case OPT_ASYNCIO: - { - if (strcmp("true", optarg) == 0) - use_asyncio = true; - else if (strcmp("false", optarg) == 0) - use_asyncio = false; - break; - } - default: - break; + break; + } + case OPT_LOG_LEVEL: { + if (strcmp("ERROR", optarg) == 0) + verbosity = ns_ga::ERROR; + else if (strcmp("WARNING", optarg) == 0) + verbosity = ns_ga::WARNING; + else if (strcmp("INFO", optarg) == 0) + verbosity = ns_ga::INFO; + else if (strcmp("DEBUG", optarg) == 0) + verbosity = ns_ga::DEBUG; + break; + } + case OPT_ASYNCIO: { + if (strcmp("true", optarg) == 0) + use_asyncio = true; + else if (strcmp("false", optarg) == 0) + use_asyncio = false; + break; + } + default: + break; } } \ No newline at end of file diff --git a/src/common/utils/global_arg.h b/src/common/utils/global_arg.h index 8ff64be..903eb14 100644 --- a/src/common/utils/global_arg.h +++ b/src/common/utils/global_arg.h @@ -9,63 +9,57 @@ Date : 2023/10/23 #ifndef GLOBAL_ARG_H_ #define GLOBAL_ARG_H_ -#include -#include -#include #include #include +#include +#include +#include + using std::map; using std::string; using std::vector; namespace ns_ga { - enum GlobalOptEnum - { - _START_NUM = 1, - OPT_INPUT, - OPT_OUTPUT, - OPT_NUM_THREADS, - OPT_MAX_MEM, - OPT_LOG_LEVEL, - OPT_ASYNCIO, - OPT_VERSION, - OPT_HELP, - _END_NUM - }; +enum GlobalOptEnum { + _START_NUM = 1, + OPT_INPUT, + OPT_OUTPUT, + OPT_NUM_THREADS, + OPT_MAX_MEM, + OPT_LOG_LEVEL, + OPT_ASYNCIO, + OPT_VERSION, + OPT_HELP, + _END_NUM +}; - // log level - enum LogLevelEnum - { - ERROR, - WARNING, - INFO, - DEBUG - }; -} +// log level +enum LogLevelEnum { ERROR, WARNING, INFO, DEBUG }; +} // namespace ns_ga /* 全局共享的一些参数 */ -struct GlobalArg -{ - const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1 +struct GlobalArg { + const static int GLOBAL_ARG_CNT = + ns_ga::GlobalOptEnum::_END_NUM - + ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1 static struct option *GLOBAL_OPT; - string in_fn; // input bam filename - string out_fn; // output bam filename - int num_threads = 1; // 线程个数 - size_t max_mem = ((size_t)2) << 30; // 最小2G - ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别 - bool use_asyncio = true; // 是否使用异步io + string in_fn; // input bam filename + string out_fn; // output bam filename + int num_threads = 1; // 线程个数 + size_t max_mem = ((size_t)1) << 30; // 最小1G + ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别 + bool use_asyncio = true; // 是否使用异步io - vector vArgInfo; // 每个参数的帮助信息 + vector vArgInfo; // 每个参数的帮助信息 // 单例模式 GlobalArg(const GlobalArg &) = delete; GlobalArg &operator=(const GlobalArg &) = delete; // 获取单例 - static GlobalArg &Instance() - { + static GlobalArg &Instance() { static GlobalArg instance; return instance; } @@ -76,8 +70,7 @@ struct GlobalArg void parseArgument(int argNum); // 获取对应参数在数组(option和help info)中的索引 - int getArgIndx(ns_ga::GlobalOptEnum opt) - { + int getArgIndx(ns_ga::GlobalOptEnum opt) { return opt - ns_ga::GlobalOptEnum::OPT_INPUT; } @@ -95,11 +88,9 @@ struct GlobalArg printf("--verbosity = %d\n", verbosity); printf("--asyncio = %d\n", use_asyncio); } -private : - GlobalArg() - { - initGlobalOptions(); - }; + +private: + GlobalArg() { initGlobalOptions(); }; }; #endif \ No newline at end of file diff --git a/src/common/utils/murmur3.h b/src/common/utils/murmur3.h index 5ffadef..5dcdee3 100644 --- a/src/common/utils/murmur3.h +++ b/src/common/utils/murmur3.h @@ -7,36 +7,34 @@ Author : Zhang Zhonghai Date : 2023/11/6 */ -#include #include +#include using std::string; /** - * Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness - * and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling. + * Provides an implementation of the Murmur3_32 hash algorithm that has + * desirable properties in terms of randomness and uniformity of the + * distribution of output values that make it a useful hashing algorithm for + * downsampling. */ -struct Murmur3 -{ +struct Murmur3 { int seed_ = 0; /** Hashes a character stream to an int using Murmur3. */ - int HashUnencodedChars(const string &input) - { + int HashUnencodedChars(const string &input) { int h1 = this->seed_; // step through the CharSequence 2 chars at a time const int length = input.size(); - for (int i = 1; i < length; i += 2) - { + for (int i = 1; i < length; i += 2) { int k1 = input.at(i - 1) | (input.at(i) << 16); k1 = mixK1(k1); h1 = mixH1(h1, k1); } // deal with any remaining characters - if ((length & 1) == 1) - { + if ((length & 1) == 1) { int k1 = input.at(length - 1); k1 = mixK1(k1); h1 ^= k1; @@ -45,14 +43,12 @@ struct Murmur3 return fmix(h1, 2 * length); } - static Murmur3 &Instance() - { + static Murmur3 &Instance() { static Murmur3 instance; return instance; } - static int mixK1(int k1) - { + static int mixK1(int k1) { const int c1 = 0xcc9e2d51; const int c2 = 0x1b873593; k1 *= c1; @@ -61,8 +57,7 @@ struct Murmur3 return k1; } - static int mixH1(int h1, int k1) - { + static int mixH1(int h1, int k1) { h1 ^= k1; h1 = h1 << 13; h1 = h1 * 5 + 0xe6546b64; @@ -70,8 +65,7 @@ struct Murmur3 } // Finalization mix - force all bits of a hash block to avalanche - static int fmix(int h1, int length) - { + static int fmix(int h1, int length) { h1 ^= length; h1 ^= (unsigned int)h1 >> 16; h1 *= 0x85ebca6b; @@ -82,8 +76,7 @@ struct Murmur3 } private: - Murmur3() - { + Murmur3() { auto &&rd = std::random_device{}; seed_ = rd(); } diff --git a/src/common/utils/profiling.cpp b/src/common/utils/profiling.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/common/utils/profiling.h b/src/common/utils/profiling.h new file mode 100644 index 0000000..e69de29 diff --git a/src/sam/markdups/dup_metrics.h b/src/sam/markdups/dup_metrics.h new file mode 100644 index 0000000..508c4e8 --- /dev/null +++ b/src/sam/markdups/dup_metrics.h @@ -0,0 +1,70 @@ +#pragma once + +#include +#include + +using std::string; + +/* + * 标记冗余过程中的一些数据统计 + */ +struct DuplicationMetrics { + /** + * The library on which the duplicate marking was performed. + */ + string LIBRARY = ""; + /** + * The number of mapped reads examined which did not have a mapped mate pair, + * either because the read is unpaired, or the read is paired to an unmapped mate. + */ + uint64_t UNPAIRED_READS_EXAMINED = 0; + /** + * The number of mapped read pairs examined. (Primary, non-supplemental) + */ + uint64_t READ_PAIRS_EXAMINED = 0; + /** + * The number of reads that were either secondary or supplementary + */ + uint64_t SECONDARY_OR_SUPPLEMENTARY_RDS = 0; + /** + * The total number of unmapped reads examined. (Primary, non-supplemental) + */ + uint64_t UNMAPPED_READS = 0; + /** + * The number of fragments that were marked as duplicates. + */ + uint64_t UNPAIRED_READ_DUPLICATES = 0; + /** + * The number of read pairs that were marked as duplicates. + */ + uint64_t READ_PAIR_DUPLICATES = 0; + /** + * The number of read pairs duplicates that were caused by optical duplication. + * Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source. + */ + uint64_t READ_PAIR_OPTICAL_DUPLICATES = 0; + /** + * The fraction of mapped sequence that is marked as duplicate. + */ + double PERCENT_DUPLICATION = 0.0; + /** + * The estimated number of unique molecules in the library based on PE duplication. + */ + uint64_t ESTIMATED_LIBRARY_SIZE = 0; + + // 其他的统计数据 + + // addSingletonToCount需要记录的数据 + uint64_t DuplicateCountHist = 0; + uint64_t NonOpticalDuplicateCountHist = 0; + + // track optical duplicates 需要记录的数据 + uint64_t OpticalDuplicatesCountHist = 0; + uint64_t OpticalDuplicatesByLibraryId = 0; + + // 统计相关的函数 + void AddSingletonToCount() { + ++this->DuplicateCountHist; + ++this->NonOpticalDuplicateCountHist; + } +}; \ No newline at end of file diff --git a/src/sam/markdups/markdups.cpp b/src/sam/markdups/markdups.cpp index 6dba1d5..7860205 100644 --- a/src/sam/markdups/markdups.cpp +++ b/src/sam/markdups/markdups.cpp @@ -1,100 +1,99 @@ /* -Description: 标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据 +Description: +标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2023/10/23 */ -#include "markdups_arg.h" -// 有太多define冲突,放到最后include - - #include #include +#include #include #include #include -#include #include +#include +#include #include #include -#include -#include - #include -#include -#include #include +#include #include #include +#include + +#include "markdups_arg.h" +#include "md_funcs.h" +#include "parallel_md.h" +#include "serial_md.h" +#include "shared_args.h" +#include "dup_metrics.h" using namespace std; using std::cout; - #define SMA_TAG_PG "PG" -#define BAM_BLOCK_SIZE 2 * 1024 * 1024 +#define BAM_BLOCK_SIZE 16L * 1024 * 1024 #define NO_SUCH_INDEX INT64_MAX -static Timer tm_arr[20]; // 用来测试性能 +Timer tm_arr[20]; // 用来测试性能 /* 全局本地变量 */ -static vector g_vRnParser; // 每个线程一个read name parser -static samFile *g_inBamFp; // 输入的bam文件 -static sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息 -static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式 -static sam_hdr_t *g_outBamHeader; // 输出文件的header +vector g_vRnParser; // 每个线程一个read name parser +samFile *g_inBamFp; // 输入的bam文件 +sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息 +samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式 +sam_hdr_t *g_outBamHeader; // 输出文件的header /* 参数对象作为全局对象,免得多次作为参数传入函数中 */ -static GlobalArg &g_gArg = GlobalArg::Instance(); -static MarkDupsArg g_mdArg; - - -#include "md_funcs.h" -#include "serial_md.h" -#include "parallel_md.h" - - - +GlobalArg &g_gArg = GlobalArg::Instance(); +static MarkDupsArg g_mdArg_; +MarkDupsArg &g_mdArg = g_mdArg_; +static GlobalDataArg gData_; +GlobalDataArg &gData = gData_; +DuplicationMetrics gMetrics_; +DuplicationMetrics &gMetrics = gMetrics_; /* - * mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题 + * mark duplicate + * 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题 */ -int MarkDuplicates(int argc, char *argv[]) -{ +int MarkDuplicates(int argc, char *argv[]) { Timer::log_time("程序开始"); Timer time_all; - + /* 读取命令行参数 */ - g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数 - if (g_gArg.num_threads < 1) // 线程数不能小于1 - g_gArg.num_threads = 1; + g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数 + if (g_gArg.num_threads < 1) + g_gArg.num_threads = 1; // 线程数不能小于1 /* 初始化一些参数和变量*/ g_vRnParser.resize(g_gArg.num_threads); for (auto &parser : g_vRnParser) - parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息 - + parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息 + /* 打开输入bam文件 */ g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr); - if (!g_inBamFp) - { + if (!g_inBamFp) { Error("[%s] load sam/bam file failed.\n", __func__); return -1; } hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); - g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header + g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header + // 获取样本名称(libraryId) + gMetrics.LIBRARY = sam_hdr_line_name(g_inBamHeader, "RG", 0); /* 利用线程池对输入输出文件进行读写 */ - htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 - htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 - // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); - htsPoolRead.pool = hts_tpool_init(16); - // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); - htsPoolWrite.pool = hts_tpool_init(16); - if (!htsPoolRead.pool || !htsPoolWrite.pool) - { + htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 + htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 + //htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); + //htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); + htsPoolRead.pool = hts_tpool_init(32); + htsPoolWrite.pool = hts_tpool_init(32); + if (!htsPoolRead.pool || !htsPoolWrite.pool) { Error("[%d] failed to set up thread pool", __LINE__); sam_close(g_inBamFp); return -1; @@ -106,86 +105,94 @@ int MarkDuplicates(int argc, char *argv[]) sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL); g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout); g_outBamHeader = sam_hdr_dup(g_inBamHeader); + // 用同样的线程池处理输出文件 hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); - hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件 - + hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); /* 冗余检查和标记 */ - if (g_gArg.num_threads == 1) - { - serialMarkDups(); // 串行运行 - } - else - { - parallelMarkDups(); // 并行运行 + if (g_gArg.num_threads == 1) { + serialMarkDups(); // 串行运行 + } else { + parallelMarkDups(); // 并行运行 } /* 标记冗余, 将处理后的结果写入文件 */ - sam_close(g_inBamFp); // 重新打开bam文件 + sam_close(g_inBamFp); // 重新打开bam文件 g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr); - if (!g_inBamFp) - { + if (!g_inBamFp) { Error("[%s] load sam/bam file failed.\n", __func__); return -1; } hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); - g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header - if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0) - { + g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header + if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0) { Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str()); sam_close(g_outBamFp); sam_close(g_inBamFp); return -1; } // 输出index文件 - string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的 - if (sam_idx_init(g_outBamFp, g_outBamHeader, 14 /*csi*/, indexFn.c_str()) < 0) - { + // string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的 + string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式 + int index_min_shift = 0; + if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) { + indexFn = g_gArg.out_fn + ".csi"; + index_min_shift = 14; + } + if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) { Error("failed to open index \"%s\" for writing", indexFn.c_str()); sam_close(g_outBamFp); sam_close(g_inBamFp); return -1; } // 读取输入文件 - // BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio); + // BamBufType inBuf(false); BamBufType inBuf(g_gArg.use_asyncio); inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem); + DupIdxQueue idxQue; + idxQue.Init(&gData.dupIdxArr); Timer tw; - while (inBuf.ReadStat() >= 0) - { + cout << "dupsize: " << idxQue.Size() << endl; + uint64_t bamIdx = 0; + uint64_t dupIdx = idxQue.Pop(); + cout << "dup arr size: " << gData.dupIdxArr.size() << endl; + cout << "first dup: " << dupIdx << endl; + while (inBuf.ReadStat() >= 0) { Timer tw1; size_t readNum = inBuf.ReadBam(); cout << "read: " << readNum << endl; - for (size_t i = 0; i < inBuf.Size(); ++i) - { + for (size_t i = 0; i < inBuf.Size(); ++i) { /* 判断是否冗余 */ - if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) - { + if (bamIdx == dupIdx) { + // cout << "冗余" << bamIdx << endl; + dupIdx = idxQue.Pop(); + } + if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) { Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str()); sam_close(g_outBamFp); sam_close(g_inBamFp); return -1; } + ++bamIdx; } inBuf.ClearAll(); cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl; } - if (sam_idx_save(g_outBamFp) < 0) - { + cout << "dupsize: " << idxQue.Size() << endl; + if (sam_idx_save(g_outBamFp) < 0) { Error("writing index failed"); sam_close(g_outBamFp); sam_close(g_inBamFp); return -1; } cout << "write time: " << tw.seconds_elapsed() << " s" << endl; - /* 关闭文件,收尾清理 */ sam_close(g_outBamFp); sam_close(g_inBamFp); cout << " 总时间: " << time_all.seconds_elapsed() << endl; -// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl; + // cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl; Timer::log_time("程序结束"); return 0; -} \ No newline at end of file + } \ No newline at end of file diff --git a/src/sam/markdups/markdups_arg.cpp b/src/sam/markdups/markdups_arg.cpp index c8a9453..f00eb51 100644 --- a/src/sam/markdups/markdups_arg.cpp +++ b/src/sam/markdups/markdups_arg.cpp @@ -64,6 +64,7 @@ const static struct option kMdOpts[] = { {"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL}, {"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM}, {"CREATE_INDEX", required_argument, NULL, CREATE_INDEX}, + {"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT}, {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}}; // 判断bool类型的参数 @@ -220,6 +221,11 @@ void MarkDupsArg::parseArgument(int argc, case ns_md::CREATE_INDEX: setBoolArg(&CREATE_INDEX); break; + case ns_md::INDEX_FORMAT: + if (strcmp("CSI", optarg) == 0) + INDEX_FORMAT = ns_md::IndexFormat::CSI; + else + INDEX_FORMAT = ns_md::IndexFormat::BAI; case ns_md::CREATE_MD5_FILE: setBoolArg(&CREATE_MD5_FILE); break; @@ -235,7 +241,8 @@ void MarkDupsArg::parseArgument(int argc, /* 打印参数信息 */ void MarkDupsArg::printArgValue() { - printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str()); + printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str()); + printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi"); } // 打印版本信息 @@ -272,6 +279,8 @@ void MarkDupsArg::PrintHelp() "\n" "Optional Arguments:\n" "\n" + "--INDEX_FORMAT Format for bam index file. Possible values: {BAI, CSI}\n" + "\n" "--ADD_PG_TAG_TO_READS \n" " Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n" " false}\n" diff --git a/src/sam/markdups/markdups_arg.h b/src/sam/markdups/markdups_arg.h index 1abf8a7..af5fb41 100644 --- a/src/sam/markdups/markdups_arg.h +++ b/src/sam/markdups/markdups_arg.h @@ -21,91 +21,92 @@ using std::vector; class GlobalArg; namespace ns_md { - /* 用于markduplicate模块的参数,这个枚举用于getoption */ - enum MarkDupsArgEnum - { - _START_NUM = 100, - MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP, - MAX_FILE_HANDLES_FOR_READ_ENDS_MAP, - SORTING_COLLECTION_SIZE_RATIO, - BARCODE_TAG, - READ_ONE_BARCODE_TAG, - READ_TWO_BARCODE_TAG, - TAG_DUPLICATE_SET_MEMBERS, - REMOVE_SEQUENCING_DUPLICATES, - TAGGING_POLICY, - CLEAR_DT, - DUPLEX_UMI, - MOLECULAR_IDENTIFIER_TAG, - METRICS_FILE, - REMOVE_DUPLICATES, - ASSUME_SORTED, - ASSUME_SORT_ORDER, - DUPLICATE_SCORING_STRATEGY, - PROGRAM_RECORD_ID, - PROGRAM_GROUP_VERSION, - PROGRAM_GROUP_COMMAND_LINE, - PROGRAM_GROUP_NAME, - COMMENT, - READ_NAME_REGEX, - OPTICAL_DUPLICATE_PIXEL_DISTANCE, - MAX_OPTICAL_DUPLICATE_SET_SIZE, - QUIET, - VALIDATION_STRINGENCY, - COMPRESSION_LEVEL, - MAX_RECORDS_IN_RAM, - CREATE_INDEX, - CREATE_MD5_FILE, - _END_NUM - }; - - /* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */ - enum ValidationStringency - { - /** - * Do the right thing, throw an exception if something looks wrong. - */ - STRICT, - /** - * Emit warnings but keep going if possible. - */ - LENIENT, - /** - * Like LENIENT, only don't emit warning messages. - */ - SILENT, - - DEFAULT_STRINGENCY = SILENT - }; +/* 用于markduplicate模块的参数,这个枚举用于getoption */ +enum MarkDupsArgEnum { + _START_NUM = 100, + MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP, + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP, + SORTING_COLLECTION_SIZE_RATIO, + BARCODE_TAG, + READ_ONE_BARCODE_TAG, + READ_TWO_BARCODE_TAG, + TAG_DUPLICATE_SET_MEMBERS, + REMOVE_SEQUENCING_DUPLICATES, + TAGGING_POLICY, + CLEAR_DT, + DUPLEX_UMI, + MOLECULAR_IDENTIFIER_TAG, + METRICS_FILE, + REMOVE_DUPLICATES, + ASSUME_SORTED, + ASSUME_SORT_ORDER, + DUPLICATE_SCORING_STRATEGY, + PROGRAM_RECORD_ID, + PROGRAM_GROUP_VERSION, + PROGRAM_GROUP_COMMAND_LINE, + PROGRAM_GROUP_NAME, + COMMENT, + READ_NAME_REGEX, + OPTICAL_DUPLICATE_PIXEL_DISTANCE, + MAX_OPTICAL_DUPLICATE_SET_SIZE, + QUIET, + VALIDATION_STRINGENCY, + COMPRESSION_LEVEL, + MAX_RECORDS_IN_RAM, + CREATE_INDEX, + INDEX_FORMAT, + CREATE_MD5_FILE, + _END_NUM +}; +/* How strict to be when reading a SAM or BAM, beyond bare minimum validation. + */ +enum ValidationStringency { /** - * Enum used to control how duplicates are flagged in the DT optional tag on each read. + * Do the right thing, throw an exception if something looks wrong. */ - enum DuplicateTaggingPolicy - { - DontTag, - OpticalOnly, - All - }; + STRICT, + /** + * Emit warnings but keep going if possible. + */ + LENIENT, + /** + * Like LENIENT, only don't emit warning messages. + */ + SILENT, - /* 排序的方式 */ - enum SortOrder - { - unsorted, - queryname, - coordinate, - duplicate, // NB: this is not in the SAM spec! - unknown - }; + DEFAULT_STRINGENCY = SILENT +}; - /* 计算reads分数的方式(比那个read得分更高) */ - enum ScoringStrategy - { - SUM_OF_BASE_QUALITIES, - TOTAL_MAPPED_REFERENCE_LENGTH, - RANDOM - }; -} +/** + * Enum used to control how duplicates are flagged in the DT optional tag on + * each read. + */ +enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All }; + +/* 排序的方式 */ +enum SortOrder { + unsorted, + queryname, + coordinate, + duplicate, // NB: this is not in the SAM spec! + unknown +}; + +/* 计算reads分数的方式(比那个read得分更高) */ +enum ScoringStrategy { + SUM_OF_BASE_QUALITIES, + TOTAL_MAPPED_REFERENCE_LENGTH, + RANDOM +}; + +/* 索引文件的格式 (bai或者csi) */ +enum IndexFormat { + BAI, + CSI +}; + +} // namespace ns_md /* markduplicate 需要的参数*/ struct MarkDupsArg @@ -287,6 +288,8 @@ struct MarkDupsArg /* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */ bool CREATE_INDEX = false; + ns_md::IndexFormat INDEX_FORMAT = ns_md::IndexFormat::BAI; + /* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */ bool CREATE_MD5_FILE = false; diff --git a/src/sam/markdups/md_funcs.cpp b/src/sam/markdups/md_funcs.cpp new file mode 100644 index 0000000..eb0eb99 --- /dev/null +++ b/src/sam/markdups/md_funcs.cpp @@ -0,0 +1,456 @@ + +#include "md_funcs.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "dup_metrics.h" +#include "markdups_arg.h" +#include "shared_args.h" + +using std::cerr; +using std::endl; +using std::map; +using std::set; +using std::unordered_map; +using std::vector; + +/* 清除key位置的数据 */ +void clearIdxAtPos(int64_t key, map> *pmsIdx) { + auto &msIdx = *pmsIdx; + if (msIdx.find(key) != msIdx.end()) + msIdx[key].clear(); // 清除该位点的冗余结果 +} + +/* 删除key位置的数据 */ +void delIdxAtPos(int64_t key, map> *pmsIdx) { + auto &msIdx = *pmsIdx; + if (msIdx.find(key) != msIdx.end()) + msIdx.erase(key); +} + +/* + * 计算read的分数 + */ +int16_t computeDuplicateScore(BamWrap &bw) { + int16_t score = 0; + switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) { + case ns_md::SUM_OF_BASE_QUALITIES: + // two (very) long reads worth of high-quality bases can go over + // Short.MAX_VALUE/2 and risk overflow. + score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2); + break; + case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH: + if (!bw.GetReadUnmappedFlag()) + // no need to remember the score since this scoring mechanism is + // symmetric + score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2); + break; + case ns_md::RANDOM: + // The RANDOM score gives the same score to both reads so that they get + // filtered together. it's not critical do use the readName since the + // scores from both ends get added, but it seem to be clearer this way. + score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111); + // subtract Short.MIN_VALUE/4 from it to end up with a number between + // 0 and Short.MAX_VALUE/2. This number can be then discounted in case + // the read is not passing filters. We need to stay far from overflow so + // that when we add the two scores from the two read mates we do not + // overflow since that could cause us to chose a failing read-pair + // instead of a passing one. + score -= INT16_MIN / 4; + default: + break; + } + // make sure that filter-failing records are heavily discounted. (the + // discount can happen twice, once for each mate, so need to make sure we do + // not subtract more than Short.MIN_VALUE overall.) + score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0; + + return score; +} + +/* + * Builds a read ends object that represents a single read. + * 用来表示一个read的特征结构 + */ +void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) { + auto &k = *pKey; + auto &bc = bw.b->core; + k.read1FirstOfPair = bw.GetFirstOfPairFlag(); + k.read1ReferenceIndex = bc.tid; + k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart(); + k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F; + + k.read1IndexInFile = index; + k.score = computeDuplicateScore(bw); + // Doing this lets the ends object know that it's part of a pair + if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) { + k.read2ReferenceIndex = bc.mtid; + } + // Fill in the location information for optical duplicates + rnParser.AddLocationInformation(bw.query_name(), pKey); + // cout << k.tile << ' ' << k.x << ' ' << k.y << endl; + // 计算位置key + k.posKey = + BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation; +} + +/** + * Takes a list of ReadEndsForMarkDuplicates objects and identify the + * representative read based on quality score. For all members of the duplicate + * set, add the read1 index-in-file of the representative read to the records of + * the first and second in a pair. This value becomes is used for the 'DI' tag. + */ +void addRepresentativeReadIndex(vector &vpRe) {} + +/* 处理一组pairend的readends,标记冗余 */ +void markDuplicatePairs(int64_t posKey, vector &vpRe, + DupContainer *dupIdx, DupContainer *opticalDupIdx) { + if (vpRe.size() < 2) { + if (vpRe.size() == 1) { + // addSingletonToCount(libraryIdGenerator); + } + return; + } + // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl; + auto &vDupIdx = dupIdx->AtPos(posKey); + auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey); + + int maxScore = 0; + const ReadEnds *pBest = nullptr; + /** All read ends should have orientation FF, FR, RF, or RR **/ + for (auto pe : vpRe) // 找分数最高的readend + { + if (pe->score > maxScore || pBest == nullptr) { + maxScore = pe->score; + pBest = pe; + } + } + if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余 + { + // trackOpticalDuplicates + } + for (auto pe : vpRe) // 对非best read标记冗余 + { + if (pe != pBest) // 非best + { + vDupIdx.push_back(pe->read1IndexInFile); // 添加read1 + if (pe->read2IndexInFile != pe->read1IndexInFile) + vDupIdx.push_back(pe->read2IndexInFile); // 添加read2 + } + } + + if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) { + addRepresentativeReadIndex(vpRe); + } +} + +/* 处理一组非paired的readends,标记冗余 */ +void markDuplicateFragments(int64_t posKey, vector &vpRe, bool containsPairs, + DupContainer *dupIdx) { + auto &vDupIdx = dupIdx->AtPos(posKey); + + if (containsPairs) { + for (auto pe : vpRe) { + if (!pe->IsPaired()) { + vDupIdx.push_back(pe->read1IndexInFile); + } + } + } else { + int maxScore = 0; + const ReadEnds *pBest = nullptr; + for (auto pe : vpRe) { + if (pe->score > maxScore || pBest == nullptr) { + maxScore = pe->score; + pBest = pe; + } + } + + for (auto pe : vpRe) { + if (pe != pBest) { + vDupIdx.push_back(pe->read1IndexInFile); + } + } + } +} + +/* 处理位于某个坐标的pairend reads */ +void handlePairs(int64_t posKey, vector &readEnds, + vector &vpCache, DupContainer *dupIdx, + DupContainer *opticalDupIdx) { + if (readEnds.size() > 1) { // 有潜在的冗余 + vpCache.clear(); + // std::sort(readEnds.begin(), readEnds.end()); + const ReadEnds *pReadEnd = nullptr; + for (auto &re : readEnds) { + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 + vpCache.push_back(&re); // 处理一个潜在的冗余组 + else { + markDuplicatePairs(posKey, vpCache, dupIdx, + opticalDupIdx); // 不一样 + vpCache.clear(); + vpCache.push_back(&re); + pReadEnd = &re; + } + } + markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); + } +} + +/* 处理位于某个坐标的 reads */ +void handleFrags(int64_t posKey, vector &readEnds, + vector &vpCache, DupContainer *dupIdx) { + if (readEnds.size() > 1) // 有潜在的冗余 + { + vpCache.clear(); + // std::sort(readEnds.begin(), readEnds.end()); + const ReadEnds *pReadEnd = nullptr; + bool containsPairs = false; + bool containsFrags = false; + for (auto &re : readEnds) { + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) { + vpCache.push_back(&re); + containsPairs = containsPairs || re.IsPaired(); + containsFrags = containsFrags || !re.IsPaired(); + } else { + if (vpCache.size() > 1 && containsFrags) { + markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx); + } + vpCache.clear(); + vpCache.push_back(&re); + pReadEnd = &re; + containsPairs = re.IsPaired(); + containsFrags = !re.IsPaired(); + } + } + if (vpCache.size() > 1 && containsFrags) { + markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx); + } + } +} + +/* 对找到的pairend read end添加一些信息 */ +void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { + auto &pairedEnds = *pPairedEnds; + int64_t bamIdx = fragEnd.read1IndexInFile; + const int matesRefIndex = fragEnd.read1ReferenceIndex; + const int matesCoordinate = fragEnd.read1Coordinate; + // Set orientationForOpticalDuplicates, which always goes by the first then + // the second end for the strands. NB: must do this before updating the + // orientation later. + if (fragEnd.read1FirstOfPair) { + pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte( + fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); + } else { + pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte( + pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand()); + } + // If the other read is actually later, simply add the other read's data as + // read2, else flip the reads + if (matesRefIndex > pairedEnds.read1ReferenceIndex || + (matesRefIndex == pairedEnds.read1ReferenceIndex && + matesCoordinate >= pairedEnds.read1Coordinate)) { + pairedEnds.read2ReferenceIndex = matesRefIndex; + pairedEnds.read2Coordinate = matesCoordinate; + pairedEnds.read2IndexInFile = bamIdx; + pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, + fragEnd.IsNegativeStrand()); + + // if the two read ends are in the same position, pointing in opposite + // directions, the orientation is undefined and the procedure above will + // depend on the order of the reads in the file. To avoid this, we set + // it explicitly (to FR): + if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex && + pairedEnds.read2Coordinate == pairedEnds.read1Coordinate && + pairedEnds.orientation == ReadEnds::RF) { + pairedEnds.orientation = ReadEnds::FR; + } + } else { + pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex; + pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; + pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; + pairedEnds.read1ReferenceIndex = matesRefIndex; + pairedEnds.read1Coordinate = matesCoordinate; + pairedEnds.read1IndexInFile = bamIdx; + pairedEnds.orientation = ReadEnds::GetOrientationByte( + fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); + pairedEnds.posKey = fragEnd.posKey; + } + pairedEnds.score += fragEnd.score; +} + +static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) { + return lhs != rhs && // no comparing an object to itself (checked using object identity)! + (lhs->tile != ReadEnds::NO_VALUE) && + (rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations + lhs->tile == rhs->tile && // and the same tile + abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance; +} + +/** + * Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of + * one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for + * optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered + * optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the + * read appeared in the input list of physical locations. + * + * @param list a list of reads that are determined to be duplicates of one another + * @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be + * annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not + * contained within the list! (always not be null!) + * @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates + */ +static void findOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe, + vector *pOpticalDuplicateFlags) { + const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100; + const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000; + + vector &opticalDuplicateFlags = *pOpticalDuplicateFlags; + opticalDuplicateFlags.push_back(true); + int len = readEndsArr.size(); + // If there is only one or zero reads passed in (so there are obviously no optical duplicates), + // or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm), + // then just return an array of all false + if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) { + return; + } + + if (len >= 4) { + /** + * Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive + * optical duplicates + * getOpticalDuplicatesFlagWithGraph + */ + // Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other, + // we will then use the union-find algorithm to cluster the graph and find optical duplicate groups + Graph opticalDistanceRelationGraph; + unordered_map> tileRGmap; + int keeperIndex = -1; + for (int i = 0; i < readEndsArr.size(); ++i) { + const ReadEnds *currentLoc = readEndsArr[i]; + if (currentLoc == pBestRe) + keeperIndex = i; + if (currentLoc->tile != ReadEnds::NO_VALUE) { + int key = currentLoc->tile; // 只处理一个样本,所以只有一个read group + tileRGmap[key].push_back(i); + } + } + + } else { + /** + * Compute optical duplicates quickly in the standard case where we know that there won't be any transitive + * distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a + * readgroup or 3x with the keeper present + * getOpticalDuplicatesFlagFast + */ + // First go through and compare all the reads to the keeper + for (int i = 0; i < len; ++i) { + const ReadEnds *other = readEndsArr[i]; + opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE); + } + // Now go through and do each pairwise comparison not involving the actualKeeper + for (int i = 0; i < len; ++i) { + const ReadEnds *lhs = readEndsArr[i]; + if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above + continue; + for (int j = i + 1; j < len; ++j) { + const ReadEnds *rhs = readEndsArr[j]; + if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above + continue; + if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j]) + continue; // both already marked, no need to check + if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) { + // At this point we want to mark either lhs or rhs as duplicate. Either could have been marked + // as a duplicate of the keeper (but not both - that's checked above), so be careful about which + // one to now mark as a duplicate. + int index = opticalDuplicateFlags[j] ? i : j; + opticalDuplicateFlags[index] = true; + } + } + } + } +} + +/** + * Looks through the set of reads and identifies how many of the duplicates are + * in fact optical duplicates, and stores the data in the instance level histogram. + * + * We expect only reads with FR or RF orientations, not a mixture of both. + * + * In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end + * of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF + * when we order orientation by the first mate sequenced (read #1 of the pair). + */ +static int checkOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { + vector opticalDuplicateFlags(readEndsArr.size(), false); + // find OpticalDuplicates + findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags); + int opticalDuplicates = 0; + for (int i = 0; i < opticalDuplicateFlags.size(); ++i) { + if (opticalDuplicateFlags[i]) { + ++opticalDuplicates; + ReadEnds *pRe = const_cast(readEndsArr[i]); + pRe->isOpticalDuplicate = true; + } + } + if (opticalDuplicates > 0) + gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates; + return opticalDuplicates; +} + +/** + * 记录光学原因造成的冗余 + */ +void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { + bool hasFR = false, hasRF = false; + // Check to see if we have a mixture of FR/RF + for (auto pRe : readEndsArr) { + if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) + hasFR = true; + else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) + hasRF = true; + } + + // Check if we need to partition since the orientations could have changed + int nOpticalDup; + if (hasFR && hasRF) { // need to track them independently + vector trackOpticalDuplicatesF; + vector trackOpticalDuplicatesR; + // Split into two lists: first of pairs and second of pairs, + // since they must have orientation and same starting end + for (auto pRe: readEndsArr) { + if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) + trackOpticalDuplicatesF.push_back(pRe); + else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) + trackOpticalDuplicatesR.push_back(pRe); + else + cerr << "Found an unexpected orientation: " << pRe->orientation << endl; + } + // track the duplicates + int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe); + int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe); + nOpticalDup = nOpticalDupF + nOpticalDupR; + } else { // No need to partition + nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe); + } + + // trackDuplicateCounts + gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup; + if (readEndsArr.size() > nOpticalDup) + gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup; + if (nOpticalDup) + gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1; +} \ No newline at end of file diff --git a/src/sam/markdups/md_funcs.h b/src/sam/markdups/md_funcs.h index 89655b8..b723f65 100644 --- a/src/sam/markdups/md_funcs.h +++ b/src/sam/markdups/md_funcs.h @@ -1,60 +1,64 @@ +#pragma once + #include + +#include +#include +#include +#include + +using std::priority_queue; +using std::unordered_map; +using std::unordered_set; +using std::vector; + /* 前向声明 */ +class BamWrap; +class ReadEnds; +class ReadNameParser; /* 存放readend或者冗余idx,避免频繁的开辟和释放内存 */ template -struct DupContainer -{ - vector> arr; // 类似map> 或 map> - vector pos; // arr中每个元素对应的position +struct DupContainer { + vector> arr; // 类似map> 或 map> + vector pos; // arr中每个元素对应的position // unordered_map idx; // 某个位点对应在vector中的坐标 - tsl::robin_map idx; // 某个位点对应在vector中的坐标 - int64_t size = 0; // 实际使用的空间 - int64_t capacity = 0; // 内存容量 - inline void Init() - { + tsl::robin_map idx; // 某个位点对应在vector中的坐标 + int64_t size = 0; // 实际使用的空间 + int64_t capacity = 0; // 内存容量 + inline void Init() { idx.clear(); size = 0; } - inline void SortAtPos(int64_t p) // 这里的pos表示位点 + inline void SortAtPos(int64_t p) // 这里的pos表示位点 { - if (idx.find(p) != idx.end()) - { + if (idx.find(p) != idx.end()) { const int64_t i = idx.at(p); std::sort(arr[i].begin(), arr[i].end()); } } - inline void SortAtIdx(int64_t i) // 这里的i表示vector的索引 + inline void SortAtIdx(int64_t i) // 这里的i表示vector的索引 { std::sort(arr[i].begin(), arr[i].end()); } - inline void RemoveAtPos(int64_t p) - { - if (idx.find(p) != idx.end()) - { + inline void RemoveAtPos(int64_t p) { + if (idx.find(p) != idx.end()) { const int64_t i = idx.at(p); arr[i].clear(); } } - inline void RemoveAtIdx(int64_t i) // 这里的i表示vector的索引 + inline void RemoveAtIdx(int64_t i) // 这里的i表示vector的索引 { arr[i].clear(); } - inline void AddAtPos(int64_t p, T &val) - { - AtPos(p).push_back(val); - } - - inline vector &AtPos(int64_t p) - { - if (idx.find(p) != idx.end()) - { + inline void AddAtPos(int64_t p, T &val) { AtPos(p).push_back(val); } + + inline vector &AtPos(int64_t p) { + if (idx.find(p) != idx.end()) { const int64_t i = idx.at(p); return arr[i]; } - - if (size >= capacity) - { + if (size >= capacity) { capacity += 1; arr.push_back(vector()); pos.push_back(0); @@ -67,308 +71,131 @@ struct DupContainer } }; -/* 清除key位置的数据 */ -static inline void clearIdxAtPos(int64_t key, map> *pmsIdx) -{ - auto &msIdx = *pmsIdx; - if (msIdx.find(key) != msIdx.end()) - msIdx[key].clear(); // 清除该位点的冗余结果 -} -/* 删除key位置的数据 */ -static inline void delIdxAtPos(int64_t key, map> *pmsIdx) -{ - auto &msIdx = *pmsIdx; - if (msIdx.find(key) != msIdx.end()) - msIdx.erase(key); -} +/* + * 优先队列,用最小堆来实现对所有冗余索引的排序 + */ +struct PairArrIdIdx { + int arrId = 0; + uint64_t arrIdx = 0; + int64_t dupIdx = 0; +}; + +struct IdxGreaterThan { + bool operator()(const PairArrIdIdx &a, const PairArrIdIdx &b) { return a.dupIdx > b.dupIdx; } +}; + +struct DupIdxQueue { + // 将冗余索引和他对应的task vector对应起来 + + // 由于是多个task来查找冗余,所以每次找到的冗余index都放在一个独立的vector中,vector之间可能有重叠,所以需要用一个最小堆来维护 + vector> *dupIdx2DArr; + priority_queue, IdxGreaterThan> minHeap; + uint64_t popNum = 0; + + int Init(vector> *_dupIdx2DArr) { + dupIdx2DArr = _dupIdx2DArr; + if (dupIdx2DArr == nullptr) { + return -1; + } + for (int i = 0; i < dupIdx2DArr->size(); ++i) { + auto &v = (*dupIdx2DArr)[i]; + if (!v.empty()) { + minHeap.push({i, 1, v[0]}); + } + } + return 0; + } + + int64_t Pop() { + int64_t ret = -1; + if (!minHeap.empty()) { + auto idx = minHeap.top(); + minHeap.pop(); + ++popNum; + ret = idx.dupIdx; + auto &v = (*dupIdx2DArr)[idx.arrId]; + if (v.size() > idx.arrIdx) { + minHeap.push({idx.arrId, idx.arrIdx + 1, v[idx.arrIdx]}); + } + } + return ret; + } + + uint64_t Size() { + uint64_t len = 0; + if (dupIdx2DArr != nullptr) { + for (auto &v : *dupIdx2DArr) { + len += v.size(); + } + } + return len - popNum; + } +}; + +/* + * 用来检测optical duplication的graph + */ +template +struct Graph { // 用set? + unordered_set nodes; // 图中的结点 + unordered_map> neighbors; // 邻接列表 + + Node *addNode(const Node &singleton) { + if (nodes.find(singleton) == nodes.end()) { + Node *n = const_cast(&(*nodes.insert(singleton).first)); + neighbors[n].clear(); + return n; + } + return const_cast(&(*nodes.find(singleton))); + } +}; /* * 计算read的分数 */ -static int16_t computeDuplicateScore(BamWrap &bw) -{ - int16_t score = 0; - switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) - { - case ns_md::SUM_OF_BASE_QUALITIES: - // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2 - // and risk overflow. - score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2); - break; - case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH: - if (!bw.GetReadUnmappedFlag()) - // no need to remember the score since this scoring mechanism is symmetric - score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2); - break; - case ns_md::RANDOM: - // The RANDOM score gives the same score to both reads so that they get filtered together. - // it's not critical do use the readName since the scores from both ends get added, but it seem - // to be clearer this way. - score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111); - // subtract Short.MIN_VALUE/4 from it to end up with a number between - // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is - // not passing filters. We need to stay far from overflow so that when we add the two - // scores from the two read mates we do not overflow since that could cause us to chose a - // failing read-pair instead of a passing one. - score -= INT16_MIN / 4; - default: - break; - } - // make sure that filter-failing records are heavily discounted. (the discount can happen twice, once - // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.) - score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0; - - return score; -} +int16_t computeDuplicateScore(BamWrap &bw); /* - * Builds a read ends object that represents a single read. 用来表示一个read的特征结构 + * Builds a read ends object that represents a single read. + * 用来表示一个read的特征结构 */ -static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) -{ - auto &k = *pKey; - auto &bc = bw.b->core; - k.read1FirstOfPair = bw.GetFirstOfPairFlag(); - k.read1ReferenceIndex = bc.tid; - k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart(); - k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F; +void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey); - k.read1IndexInFile = index; - k.score = computeDuplicateScore(bw); - // Doing this lets the ends object know that it's part of a pair - if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) - { - k.read2ReferenceIndex = bc.mtid; - } - // Fill in the location information for optical duplicates - rnParser.AddLocationInformation(bw.query_name(), pKey); - // cout << k.tile << ' ' << k.x << ' ' << k.y << endl; - // 计算位置key - k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation; -} +/* + * 处理一组pairend的readends,标记冗余 + */ +void markDuplicatePairs(int64_t posKey, vector &vpRe, + DupContainer *dupIdx, DupContainer *opticalDupIdx); + +/* + * 处理一组非paired的readends,标记冗余 + */ +void markDuplicateFragments(int64_t posKey, vector &vpRe, bool containsPairs, + DupContainer *dupIdx); + +/* + * 处理位于某个坐标的pairend reads + */ +void handlePairs(int64_t posKey, vector &readEnds, vector &vpCache, + DupContainer *dupIdx, DupContainer *opticalDupIdx); + +/* + * 处理位于某个坐标的非配对的frag reads + */ +void handleFrags(int64_t posKey, vector &readEnds, vector &vpCache, + DupContainer *dupIdx); + +/* + * 对找到的pairend read end添加一些信息 + */ +void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds); /** - * Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on - * quality score. For all members of the duplicate set, add the read1 index-in-file of the representative - * read to the records of the first and second in a pair. This value becomes is used for - * the 'DI' tag. + * Looks through the set of reads and identifies how many of the duplicates are + * in fact optical duplicates, and stores the data in the instance level histogram. + * Additionally sets the transient isOpticalDuplicate flag on each read end that is + * identified as an optical duplicate. + * 记录光学原因造成的冗余 */ -static void addRepresentativeReadIndex(vector &vpRe) -{ -} - -/* 处理一组pairend的readends,标记冗余 */ -static void markDuplicatePairs(int64_t posKey, - vector &vpRe, - DupContainer *dupIdx, - DupContainer *opticalDupIdx) -{ - if (vpRe.size() < 2) - { - if (vpRe.size() == 1) - { - // addSingletonToCount(libraryIdGenerator); - } - return; - } - // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl; - auto &vDupIdx = dupIdx->AtPos(posKey); - auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey); - - int maxScore = 0; - const ReadEnds *pBest = nullptr; - /** All read ends should have orientation FF, FR, RF, or RR **/ - for (auto pe : vpRe) // 找分数最高的readend - { - if (pe->score > maxScore || pBest == nullptr) - { - maxScore = pe->score; - pBest = pe; - } - } - if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余 - { - // trackOpticalDuplicates - } - for (auto pe : vpRe) // 对非best read标记冗余 - { - if (pe != pBest) // 非best - { - vDupIdx.push_back(pe->read1IndexInFile); // 添加read1 - if (pe->read2IndexInFile != pe->read1IndexInFile) - vDupIdx.push_back(pe->read2IndexInFile); // 添加read2 - } - } - - if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) - { - addRepresentativeReadIndex(vpRe); - } -} - -/* 处理一组非paired的readends,标记冗余 */ -static void markDuplicateFragments(int64_t posKey, - vector &vpRe, - bool containsPairs, - DupContainer *dupIdx) -{ - auto &vDupIdx = dupIdx->AtPos(posKey); - - if (containsPairs) - { - for (auto pe : vpRe) - { - if (!pe->IsPaired()) - { - vDupIdx.push_back(pe->read1IndexInFile); - } - } - } - else - { - int maxScore = 0; - const ReadEnds *pBest = nullptr; - for (auto pe : vpRe) - { - if (pe->score > maxScore || pBest == nullptr) - { - maxScore = pe->score; - pBest = pe; - } - } - - for (auto pe : vpRe) - { - if (pe != pBest) - { - vDupIdx.push_back(pe->read1IndexInFile); - } - } - } -} - -/* 处理位于某个坐标的pairend reads */ -static inline void handlePairs(int64_t posKey, - vector &readEnds, - vector &vpCache, - DupContainer *dupIdx, - DupContainer *opticalDupIdx) -{ - if (readEnds.size() > 1) // 有潜在的冗余 - { - vpCache.clear(); -// std::sort(readEnds.begin(), readEnds.end()); - const ReadEnds *pReadEnd = nullptr; - for (auto &re : readEnds) - { - if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 - vpCache.push_back(&re); // 处理一个潜在的冗余组 - else - { - markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); // 不一样 - vpCache.clear(); - vpCache.push_back(&re); - pReadEnd = &re; - } - } - markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); - } -} - -/* 处理位于某个坐标的 reads */ -static inline void handleFrags( - int64_t posKey, - vector &readEnds, - vector &vpCache, - DupContainer *dupIdx) -{ - if (readEnds.size() > 1) // 有潜在的冗余 - { - vpCache.clear(); -// std::sort(readEnds.begin(), readEnds.end()); - const ReadEnds *pReadEnd = nullptr; - bool containsPairs = false; - bool containsFrags = false; - for (auto &re : readEnds) - { - if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) - { - vpCache.push_back(&re); - containsPairs = containsPairs || re.IsPaired(); - containsFrags = containsFrags || !re.IsPaired(); - } - else - { - if (vpCache.size() > 1 && containsFrags) - { - markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx); - } - vpCache.clear(); - vpCache.push_back(&re); - pReadEnd = &re; - containsPairs = re.IsPaired(); - containsFrags = !re.IsPaired(); - } - } - if (vpCache.size() > 1 && containsFrags) - { - markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx); - } - } -} - -/* 对找到的pairend read end添加一些信息 */ -static inline void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) -{ - auto &pairedEnds = *pPairedEnds; - int64_t bamIdx = fragEnd.read1IndexInFile; - const int matesRefIndex = fragEnd.read1ReferenceIndex; - const int matesCoordinate = fragEnd.read1Coordinate; - // Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this - // before updating the orientation later. - if (fragEnd.read1FirstOfPair) - { - pairedEnds.orientationForOpticalDuplicates = - ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); - } - else - { - pairedEnds.orientationForOpticalDuplicates = - ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand()); - } - // If the other read is actually later, simply add the other read's data as read2, else flip the reads - if (matesRefIndex > pairedEnds.read1ReferenceIndex || - (matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate)) - { - pairedEnds.read2ReferenceIndex = matesRefIndex; - pairedEnds.read2Coordinate = matesCoordinate; - pairedEnds.read2IndexInFile = bamIdx; - pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, - fragEnd.IsNegativeStrand()); - - // if the two read ends are in the same position, pointing in opposite directions, - // the orientation is undefined and the procedure above - // will depend on the order of the reads in the file. - // To avoid this, we set it explicitly (to FR): - if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex && - pairedEnds.read2Coordinate == pairedEnds.read1Coordinate && - pairedEnds.orientation == ReadEnds::RF) - { - pairedEnds.orientation = ReadEnds::FR; - } - } - else - { - pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex; - pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; - pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; - pairedEnds.read1ReferenceIndex = matesRefIndex; - pairedEnds.read1Coordinate = matesCoordinate; - pairedEnds.read1IndexInFile = bamIdx; - pairedEnds.orientation = ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), - pairedEnds.orientation == ReadEnds::R); - pairedEnds.posKey = fragEnd.posKey; - } - pairedEnds.score += fragEnd.score; -} \ No newline at end of file +void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe); \ No newline at end of file diff --git a/src/sam/markdups/parallel_md.cpp b/src/sam/markdups/parallel_md.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/sam/markdups/serial_md.cpp b/src/sam/markdups/serial_md.cpp new file mode 100644 index 0000000..ed1dcc7 --- /dev/null +++ b/src/sam/markdups/serial_md.cpp @@ -0,0 +1,783 @@ +#include "serial_md.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "markdups_arg.h" +#include "md_funcs.h" +#include "shared_args.h" +#include "dup_metrics.h" + +using std::cout; +using std::set; +using std::vector; + +/* 查找 */ +// template +// static inline Itr binaryFind(Itr first, Itr last, const T &val) +// { +// first = std::lower_bound(first, last, val); +// return (first != last && *first == val) ? first : last; +// } + +/* 排序 */ +static inline void sortReadEndsArr(vector &arr) { + size_t blockSize = 64 * 1024; + if (arr.size() < blockSize) { + std::sort(arr.begin(), arr.end()); + return; + } + size_t blockNum = (arr.size() + blockSize - 1) / blockSize; + size_t crossNum = 1024; + size_t start, end, i, left, right; + std::sort(arr.begin(), arr.begin() + blockSize); + for (i = 1; i < blockNum; ++i) { + start = i * blockSize; + end = min(start + blockSize, arr.size()); + std::sort(arr.begin() + start, arr.begin() + end); + left = crossNum; + while (!(arr[start - left] < arr[start])) { + left = left << 1; + if (left >= blockSize) { + std::sort(arr.begin(), arr.end()); // 退化到普通排序 + return; + } + } + right = min(crossNum, end - start - 1); + + while (!(arr[start - 1] < arr[start + right])) { + right = min(right << 1, end - start - 1); + if (right == end - start - 1) + break; + } + std::sort(arr.begin() + start - left, arr.begin() + start + right); + } +} + +/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ +static void markDupsForPairs(vector &vpRe, set *dupIdx, set *opticalDupIdx, + set *notDupIdx = nullptr) { + if (vpRe.size() < 2) { + if (vpRe.size() == 1) { + // addSingletonToCount(libraryIdGenerator); + // 这个统计可能会有误差,因为当前位点可能还有没匹配上的read,导致当前位点的read(paired)数量为1 + // 可以通过后续的补充计算来解决这个问题,有必要么? + gMetrics.AddSingletonToCount(); + } + return; + } + int maxScore = 0; + const ReadEnds *pBest = nullptr; + /** All read ends should have orientation FF, FR, RF, or RR **/ + for (auto pe : vpRe) { // 找分数最高的readend + if (pe->score > maxScore || pBest == nullptr) { + maxScore = pe->score; + pBest = pe; + } + } + if (notDupIdx != nullptr) { + notDupIdx->insert(pBest->read1IndexInFile); + notDupIdx->insert(pBest->read2IndexInFile); + } + if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 + // trackOpticalDuplicates + trackOpticalDuplicates(vpRe, pBest); + } + for (auto pe : vpRe) // 对非best read标记冗余 + { + if (pe != pBest) // 非best + { + dupIdx->insert(pe->read1IndexInFile); // 添加read1 + if (pe->read2IndexInFile != pe->read1IndexInFile) + dupIdx->insert(pe->read2IndexInFile); // 添加read2 + } + } + // 在输出的bam文件中添加tag + if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) { + // addRepresentativeReadIndex(vpRe); // 每次都更新就行,用最新的覆盖之前的(如果之前有) + } +} + +/* 处理一组非paired的readends,标记冗余 */ +static void markDupsForFrags(vector &vpRe, bool containsPairs, set *dupIdx, + set *notDupIdx = nullptr) { + if (containsPairs) { + for (auto pe : vpRe) { + if (!pe->IsPaired()) { + dupIdx->insert(pe->read1IndexInFile); + } + } + } else { + int maxScore = 0; + const ReadEnds *pBest = nullptr; + for (auto pe : vpRe) { + if (pe->score > maxScore || pBest == nullptr) { + maxScore = pe->score; + pBest = pe; + } + } + if (notDupIdx != nullptr) { + notDupIdx->insert(pBest->read1IndexInFile); + } + for (auto pe : vpRe) { + if (pe != pBest) { + dupIdx->insert(pe->read1IndexInFile); + } + } + } +} + +/* 找到与readend pos相等的所有readend */ +static void getEqualRE(const ReadEnds &re, vector &src, vector *dst) { + auto range = std::equal_range(src.begin(), src.end(), re, + ReadEnds::PairsLittleThan); // 只比对位点 + dst->insert(dst->end(), range.first, range.second); +} + +/* 单线程生成readends (第一步)*/ +static void generateReadEnds(SerailMarkDupArg *arg) { + auto &p = *arg; + auto &rnParser = g_vRnParser[0]; + + p.pairs.clear(); + p.frags.clear(); + p.unpairedDic.clear(); + p.unpairedPosArr.clear(); + + /* 处理每个read,创建ReadEnd,并放入frag和pair中 */ + set reSet; + + ReadEnds lastRe; + + for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read + { + BamWrap *bw = p.bams[i]; + const int64_t bamIdx = p.bamStartIdx + i; + if (bw->GetReadUnmappedFlag()) { + if (bw->b->core.tid == -1) + // When we hit the unmapped reads with no coordinate, no reason + // to continue (only in coordinate sort). + break; + } else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对 + { + ReadEnds fragEnd; + tm_arr[8].acc_start(); + buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); + tm_arr[8].acc_end(); + p.frags.push_back(fragEnd); // 添加进frag集合 + if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了 + { + string key = bw->query_name(); + if (p.unpairedDic.find(key) == p.unpairedDic.end()) { + p.unpairedDic[key] = {p.taskSeq, fragEnd}; + } else // 找到了pairend + { + auto &pairedEnds = p.unpairedDic.at(key).unpairedRE; + modifyPairedEnds(fragEnd, &pairedEnds); + p.pairs.push_back(pairedEnds); + p.unpairedDic.erase(key); // 删除找到的pairend + } + } + } + } + tm_arr[9].acc_start(); + sortReadEndsArr(p.frags); + // sort(p.frags.begin(), p.frags.end()); + tm_arr[9].acc_end(); + // cout << "sort pairs" << endl; + tm_arr[10].acc_start(); + sort(p.pairs.begin(), p.pairs.end()); + tm_arr[10].acc_end(); + // 记录位点上的未匹配的read个数 + for (auto &e : p.unpairedDic) { + auto posKey = e.second.unpairedRE.posKey; + auto &unpairArrInfo = p.unpairedPosArr[posKey]; + unpairArrInfo.unpairedNum++; + unpairArrInfo.taskSeq = p.taskSeq; + unpairArrInfo.readNameSet.insert(e.first); + } + // cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << + // endl; +} + +/* 处理pairs */ +static void processPairs(vector &readEnds, set *dupIdx, set *opticalDupIdx, + set *notDupIdx = nullptr) { + vector vpCache; // 有可能是冗余的reads + const ReadEnds *pReadEnd = nullptr; + for (auto &re : readEnds) { + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 + vpCache.push_back(&re); // 处理一个潜在的冗余组 + else { + markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样 + vpCache.clear(); + vpCache.push_back(&re); + pReadEnd = &re; + } + } + markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); +} + +/* 处理frags */ +static void processFrags(vector &readEnds, set *dupIdx, set *notDupIdx = nullptr) { + bool containsPairs = false; + bool containsFrags = false; + vector vpCache; // 有可能是冗余的reads + const ReadEnds *pReadEnd = nullptr; + for (auto &re : readEnds) { + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) { + vpCache.push_back(&re); + containsPairs = containsPairs || re.IsPaired(); + containsFrags = containsFrags || !re.IsPaired(); + } else { + if (vpCache.size() > 1 && containsFrags) { + markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); + } + vpCache.clear(); + vpCache.push_back(&re); + pReadEnd = &re; + containsPairs = re.IsPaired(); + containsFrags = !re.IsPaired(); + } + } + if (vpCache.size() > 1 && containsFrags) { + markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); + } +} + +/* 单线程markdup (第二步)*/ +static void markdups(SerailMarkDupArg *arg) { + auto &p = *arg; + p.pairDupIdx.clear(); + p.pairOpticalDupIdx.clear(); + p.fragDupIdx.clear(); + /* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */ + // 先处理 pair + processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx); + + // 再处理frag + processFrags(p.frags, &p.fragDupIdx); +} + +/* 获取交叉部分的数据 */ +static inline void getIntersectData(vector &leftArr, vector &rightArr, vector *dst, + bool isPairCmp = false) { + if (leftArr.empty() || rightArr.empty()) { + cout << "bad size: " << leftArr.size() << '\t' << rightArr.size() << endl; + return; + } + const size_t leftEndIdx = leftArr.size() - 1; + const size_t rightStartIdx = 0; + size_t leftSpan = 0; + size_t rightSpan = 0; + + while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) { + leftSpan += 1; + if (leftSpan > leftEndIdx) { + leftSpan = leftArr.size() - 1; + break; + } + } + + while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) { + rightSpan += 1; + if (rightSpan == rightArr.size() - 1) + break; + } + dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); + dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan); + std::sort(dst->begin(), dst->end()); +} + +/* 将frags重叠部分的dup idx放进数据中 */ +static inline void refreshFragDupIdx(set &dupIdx, set ¬DupIdx, SerailMarkDupArg *lastArg, + SerailMarkDupArg *curArg) { + auto &lp = *lastArg; + auto &p = *curArg; + for (auto idx : dupIdx) { + lp.fragDupIdx.insert(idx); + p.fragDupIdx.erase(idx); + } + for (auto idx : notDupIdx) { + lp.fragDupIdx.erase(idx); + p.fragDupIdx.erase(idx); + } +} + +/* 将pairs重叠部分的dup idx放进数据中 */ +static inline void refreshPairDupIdx(set &dupIdx, set &opticalDupIdx, set ¬DupIdx, + SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { + auto &lp = *lastArg; + auto &p = *curArg; + for (auto idx : dupIdx) { + lp.pairDupIdx.insert(idx); + p.pairDupIdx.erase(idx); + } + for (auto idx : opticalDupIdx) { + lp.pairOpticalDupIdx.insert(idx); + p.pairOpticalDupIdx.erase(idx); + } + for (auto idx : notDupIdx) { + lp.pairDupIdx.erase(idx); + lp.pairOpticalDupIdx.erase(idx); + p.pairDupIdx.erase(idx); + p.pairOpticalDupIdx.erase(idx); + } +} + +// 用来分别处理dup和optical dup +static void refeshTaskDupInfo(set &dupIdx, set &opticalDupIdx, set ¬DupIdx, + set &latterDupIdx, set &latterOpticalDupIdx, + set &latterNotDupIdx) { + for (auto idx : dupIdx) latterDupIdx.insert(idx); + for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx); + for (auto idx : notDupIdx) latterNotDupIdx.insert(idx); +} + +/* 最后合并数据并排序 */ +static void refeshFinalTaskDupInfo(set &dupIdx, set ¬DupIdx, vector &dupArr) { + vector midArr; + + auto ai = dupArr.begin(); + auto bi = dupIdx.begin(); + auto ae = dupArr.end(); + auto be = dupIdx.end(); + + int64_t val = 0; + while (ai != ae && bi != be) { + if (*ai < *bi) { + val = *ai++; + } else if (*bi < *ai) { + val = *bi++; + } else { + val = *ai++; + bi++; + } + if (notDupIdx.find(val) == notDupIdx.end()) { + midArr.push_back(val); + } + } + while (ai != ae) { + val = *ai++; + if (notDupIdx.find(val) == notDupIdx.end()) { + midArr.push_back(val); + } + } + while (bi != be) { + val = *bi++; + if (notDupIdx.find(val) == notDupIdx.end()) { + midArr.push_back(val); + } + } + dupArr = midArr; +} + +/* 处理相邻的两个任务,有相交叉的数据 */ +static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) { + auto &lp = *lastArg; + auto &p = *curArg; + auto &g = *gDataArg; + + vector reArr; + set dupIdx; + set notDupIdx; + // 先处理重叠的frags + getIntersectData(lp.frags, p.frags, &reArr); + processFrags(reArr, &dupIdx, ¬DupIdx); + refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p); + + // 再处理重叠的pairs + reArr.clear(); + dupIdx.clear(); + notDupIdx.clear(); + set opticalDupIdx; + getIntersectData(lp.pairs, p.pairs, &reArr, true); + processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx); + refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p); + + // 处理之前未匹配的部分 + map recalcPos; + set alreadyAdd; // 与该位点相同的pair都添加到数组里了 + set addToGlobal; + int64_t prevLastPos = 0, nextFirstPos = 0; + if (lp.frags.size() > 0) + prevLastPos = lp.frags.back().posKey; + if (p.frags.size() > 0) + nextFirstPos = p.frags[0].posKey; + // cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl; + for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read + auto &readName = prevUnpair.first; + auto &prevPosInfo = prevUnpair.second; + auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end + + if (p.unpairedDic.find(readName) != p.unpairedDic.end()) { // 在当前这个任务里找到了这个未匹配的read + auto &nextPosInfo = p.unpairedDic[readName]; + auto &nextFragEnd = nextPosInfo.unpairedRE; + int64_t prevPosKey = prevFragEnd.posKey; + modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read + int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey); + CalcKey ck = {prevPosKey, nextPosKey}; + UnpairedPosInfo *prevUnpairInfoP = nullptr; + UnpairedPosInfo *nextUnpairInfoP = nullptr; + if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end()) + prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey]; + if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end()) + nextUnpairInfoP = &p.unpairedPosArr[prevPosKey]; + + // pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定 + // 1. + // prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据; + // 2. + // prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read + // pair进行重新计算 + // 复杂情况1. + // g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去 + // 3. + // prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read + // pair进行重新计算 + // 复杂情况2. p中是否包含prevPosKey对应的unpair + // 4. + // prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read + // pair进行重新计算 + + bool addDataToPos = true; + if (alreadyAdd.find(ck) != alreadyAdd.end()) { + addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了 + } else + alreadyAdd.insert(ck); + + if (prevPosKey < nextFirstPos) { // prevpos在交叉部分之前 + auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr + prevPairArr.push_back(prevFragEnd); + if (nextPosKey <= prevLastPos && addDataToPos) { // 第二种情况 + getEqualRE(prevFragEnd, lp.pairs, &prevPairArr); + } + // 第一种情况,第二种情况下都会出现,复杂情况一 + auto gPosInfo = g.unpairedPosArr.find(prevPosKey); + if (gPosInfo != g.unpairedPosArr.end()) { // 可能g和p有匹配的,刚好和该位点一致 + auto &gUnpairInfo = gPosInfo->second; + auto pPosInfo = p.unpairedPosArr.find(nextPosKey); + if (pPosInfo != p.unpairedPosArr.end()) { + auto &pUnpairInfo = pPosInfo->second; + for (auto &rn : gUnpairInfo.readNameSet) { // 遍历每一个readname,看是否有匹配的 + if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) { + auto pe = g.unpairedDic[rn].unpairedRE; + auto fe = p.unpairedDic[rn].unpairedRE; + modifyPairedEnds(fe, &pe); + prevPairArr.push_back(pe); + g.unpairedDic.erase(rn); + p.unpairedDic.erase(rn); + // cout << "找到了!" << rn << endl; + } + } + } + } + recalcPos[ck] = prevPosInfo.taskSeq; + std::sort(prevPairArr.begin(), prevPairArr.end()); + } else { // prevpos在交叉部分 + if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况 + if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里 + auto &nextPairArr = nextUnpairInfoP->pairArr; + nextPairArr.push_back(prevFragEnd); + auto &prevPairArr = prevUnpairInfoP->pairArr; + prevPairArr.push_back(prevFragEnd); + if (addDataToPos) { + getEqualRE(prevFragEnd, p.pairs, &prevPairArr); + } + // 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除) + recalcPos[ck] = nextPosInfo.taskSeq; + + std::sort(prevPairArr.begin(), prevPairArr.end()); + } else { // next task在该位点没有unpair,那就把数据放到prev task里 + auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr + prevPairArr.push_back(prevFragEnd); + if (addDataToPos) // 第二种情况 + getEqualRE(prevFragEnd, p.pairs, &prevPairArr); + recalcPos[ck] = prevPosInfo.taskSeq; + std::sort(prevPairArr.begin(), prevPairArr.end()); + } + } else { // 第四种情况 + if (prevUnpairInfoP == nullptr) { + prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey]; + prevUnpairInfoP->taskSeq = lp.taskSeq; + } + auto &prevPairArr = prevUnpairInfoP->pairArr; + prevPairArr.push_back(prevFragEnd); + if (addDataToPos) { + getEqualRE(prevFragEnd, lp.pairs, &prevPairArr); + getEqualRE(prevFragEnd, p.pairs, &prevPairArr); + } + recalcPos[ck] = prevPosInfo.taskSeq; + std::sort(prevPairArr.begin(), prevPairArr.end()); + } + } + p.unpairedDic.erase(readName); // 在next task里删除该read + } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read + auto &remainPosInfo = g.unpairedDic[readName]; + auto remainFragEnd = remainPosInfo.unpairedRE; + int64_t remainPosKey = remainFragEnd.posKey; + modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read + auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey]; + auto &remainPairArr = remainUnpairInfo.pairArr; + remainPairArr.push_back(remainFragEnd); + CalcKey ck = {remainPosKey, prevFragEnd.posKey}; + recalcPos[ck] = remainPosInfo.taskSeq; + std::sort(remainPairArr.begin(), remainPairArr.end()); + + g.unpairedDic.erase(readName); + } else { // 都没找到,那就保存到遗留数据里 + int64_t prevPosKey = prevFragEnd.posKey; + g.unpairedDic.insert(prevUnpair); + addToGlobal.insert(prevPosKey); + } + } + // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据 + for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; + + map taskChanged; + set posProcessed; + for (auto &e : recalcPos) { + auto posKey = e.first.read1Pos; + if (posProcessed.find(posKey) != posProcessed.end()) + continue; + posProcessed.insert(posKey); + auto taskSeq = e.second; + auto &t = taskChanged[taskSeq]; + // 在对应的任务包含的dup idx里修改结果数据 + vector *pairArrP = nullptr; + if (taskSeq < lp.taskSeq) + pairArrP = &g.unpairedPosArr[posKey].pairArr; + else + pairArrP = &lp.unpairedPosArr[posKey].pairArr; + processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx); + if (taskSeq < lp.taskSeq) + g.unpairedPosArr.erase(posKey); + } + // 更新结果 + + for (auto &e : taskChanged) { + auto taskSeq = e.first; + auto &t = e.second; + if (taskSeq < lp.taskSeq) { + refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq], + g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]); + } else if (taskSeq == lp.taskSeq) { + refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p); + } else { + refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中 + } + } + + // cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << + // g.unpairedPosArr.size() << endl; cout << "calc g time: " << + // t.seconds_elapsed() << " s" << endl; 将dupidx放进全局数据 + g.latterDupIdxArr.push_back(set()); + g.latterOpticalDupIdxArr.push_back(set()); + g.latterNotDupIdxArr.push_back(set()); + + g.dupIdxArr.push_back(vector()); + auto &vIdx = g.dupIdxArr.back(); + lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); + vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); + + g.opticalDupIdxArr.push_back(vector()); + auto &vOpticalIdx = g.opticalDupIdxArr.back(); + vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); +} + +/* 当所有任务结束后,global data里还有未处理的数据 */ +static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { + auto &lp = *task; + auto &g = *gDataArg; + // 遗留的未匹配的pair + for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read + auto &readName = prevUnpair.first; + auto &prevPosInfo = prevUnpair.second; + auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end + + if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read + auto &remainPosInfo = g.unpairedDic[readName]; + auto remainFragEnd = remainPosInfo.unpairedRE; + int64_t remainPosKey = remainFragEnd.posKey; + modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read + auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey]; + + remainUnpairInfo.pairArr.push_back(remainFragEnd); + g.unpairedDic.erase(readName); + } + } + map taskChanged; + for (auto &e : g.unpairedPosArr) { + auto posKey = e.first; + auto taskSeq = e.second.taskSeq; + auto &t = taskChanged[taskSeq]; + auto &arr = g.unpairedPosArr[posKey].pairArr; + + if (arr.size() > 1) { + std::sort(arr.begin(), arr.end()); + processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx); + } + } + // 更新结果 + vector addDup; + map ndPosVal; + for (auto &e : taskChanged) { + auto taskSeq = e.first; + auto &t = e.second; + refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq], + g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]); + } + + cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; + g.unpairedPosArr.clear(); + g.unpairedDic.clear(); + + // 将dupidx放进全局数据 + for (int i = 0; i < (int)g.dupIdxArr.size() - 1; ++i) + refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]); + for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i) + refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]); + + g.dupIdxArr.push_back(vector()); + auto &vIdx = g.dupIdxArr.back(); + lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); + vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); + + g.opticalDupIdxArr.push_back(vector()); + auto &vOpticalIdx = g.opticalDupIdxArr.back(); + vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); +} + +/* 串行处理数据,标记冗余 */ +void serialMarkDups() { + tm_arr[5].acc_start(); + Timer::log_time("serial start"); + // 读取缓存初始化 + BamBufType inBamBuf(g_gArg.use_asyncio); + inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem); + // BamBufType inBamBuf(false); + // inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024); + int64_t processedBamNum = 0; + + SerailMarkDupArg smdArg1, smdArg2; + SerailMarkDupArg *lastArgP = &smdArg1; + SerailMarkDupArg *curArgP = &smdArg2; + + bool isFirstRound = true; + int roundNum = 0; + int64_t readNumSum = 0; + while (inBamBuf.ReadStat() >= 0) { + Timer t_round; + // 读取bam文件中的read + tm_arr[4].acc_start(); + size_t readNum = inBamBuf.ReadBam(); + readNumSum += readNum; + tm_arr[4].acc_end(); + cout << "read num: " << readNum << '\t' << roundNum << endl; + // lastArgP = curArgP; + tm_arr[6].acc_start(); + curArgP->taskSeq = roundNum; + curArgP->bamStartIdx = processedBamNum; + curArgP->bams = inBamBuf.GetBamArr(); + tm_arr[6].acc_end(); + + tm_arr[0].acc_start(); + Timer t1; + generateReadEnds(curArgP); + // cout << "calc read end time: " << t1.seconds_elapsed() << " s" << + // endl; + tm_arr[0].acc_end(); + + tm_arr[1].acc_start(); + t1.reinit(); + markdups(curArgP); + // cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl; + tm_arr[1].acc_end(); + + if (!isFirstRound) { + tm_arr[2].acc_start(); + t1.reinit(); + handleIntersectData(lastArgP, curArgP, &gData); + // cout << "intersect time: " << t1.seconds_elapsed() << " s" << + // endl; + // addTaskIdxToSet(lastArgP, &gData); + tm_arr[2].acc_end(); + } else { + isFirstRound = false; + } + inBamBuf.ClearAll(); // 清理上一轮读入的数据 + processedBamNum += readNum; + + // 交换 + auto tmp = lastArgP; + lastArgP = curArgP; + curArgP = tmp; + // cout << "round time: " << t_round.seconds_elapsed() << endl; + roundNum++; + if (roundNum % 100 == 0) { + cout << "read sum: " << readNumSum << endl; + cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; + } + } + // cout << "here" << endl; + tm_arr[3].acc_start(); + // 处理剩下的全局数据 + handleLastTask(lastArgP, &gData); + // cout << "here 2" << endl; + tm_arr[3].acc_end(); + + tm_arr[5].acc_end(); + // 统计所有冗余index数量 + int64_t dupNum = 0; + map dup; + + int taskSeq = 0; + for (auto &arr : gData.dupIdxArr) { + for (auto idx : arr) { + if (dup.find(idx) != dup.end()) { + // cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t' + // << idx << endl; + } + dup[idx] = taskSeq; + } + taskSeq++; + } + // #include + // ofstream out("tumor_dup.txt"); + // for (auto idx : dup) + // { + // out << idx << endl; + // } + // out.close(); + + for (auto &arr : gData.dupIdxArr) dupNum += arr.size(); + + cout << "dup num : " << dupNum << '\t' << dup.size() << endl; + + cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl; + cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl; + cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl; + cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl; + cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl; + cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl; + cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl; + cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl; + cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; + cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; + cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; + + Timer::log_time("serial end "); + + // for (auto i : gData.dupArr) + // cout << i << endl; +} \ No newline at end of file diff --git a/src/sam/markdups/serial_md.h b/src/sam/markdups/serial_md.h index 5aa7f56..4a1f392 100644 --- a/src/sam/markdups/serial_md.h +++ b/src/sam/markdups/serial_md.h @@ -1,20 +1,28 @@ -#include +#pragma once + +#include #include +#include + +#include +#include +#include + +using std::set; +using std::string; +using std::vector; /* 存放未匹配readend相同位点的所有readend */ -struct UnpairedREInfo -{ +struct UnpairedREInfo { int64_t taskSeq; ReadEnds unpairedRE; }; /* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */ -struct CalcKey -{ +struct CalcKey { int64_t read1Pos; int64_t read2Pos; - bool operator<(const CalcKey &o) const - { + bool operator<(const CalcKey &o) const { int comp = (int)(read1Pos - o.read1Pos); if (comp == 0) comp = (int)(read2Pos - o.read2Pos); @@ -23,16 +31,14 @@ struct CalcKey }; /* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */ -struct TaskSeqDupInfo -{ +struct TaskSeqDupInfo { set dupIdx; set opticalDupIdx; set notDupIdx; }; /* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */ -struct UnpairedPosInfo -{ +struct UnpairedPosInfo { int unpairedNum = 0; int64_t taskSeq; vector pairArr; @@ -41,28 +47,26 @@ struct UnpairedPosInfo // typedef unordered_map UnpairedNameMap; // typedef unordered_map UnpairedPositionMap; -typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read -typedef tsl::robin_map UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 +typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read +typedef tsl::robin_map UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 /* 单线程处理冗余参数结构体 */ -struct SerailMarkDupArg -{ - int64_t taskSeq; - int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 - vector bams; // 存放待处理的bam read - vector pairs; - vector frags; - set pairDupIdx; // pair的冗余read的索引 - set pairOpticalDupIdx; // optical冗余read的索引 - set fragDupIdx; // frag的冗余read的索引 - UnpairedNameMap unpairedDic; // 用来寻找pair end - UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 +struct SerailMarkDupArg { + int64_t taskSeq; // 任务序号 + int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 + vector bams; // 存放待处理的bam read + vector pairs; // 成对的reads + vector frags; // 暂未找到配对的reads + set pairDupIdx; // pair的冗余read的索引 + set pairOpticalDupIdx; // optical冗余read的索引 + set fragDupIdx; // frag的冗余read的索引 + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 }; /* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */ -struct GlobalDataArg -{ - UnpairedNameMap unpairedDic; // 用来寻找pair end +struct GlobalDataArg { + UnpairedNameMap unpairedDic; // 用来寻找pair end UnpairedPositionMap unpairedPosArr; // 每个task对应一个vector @@ -75,869 +79,5 @@ struct GlobalDataArg vector> latterNotDupIdxArr; }; -static GlobalDataArg gData; - - -/* 查找 */ -// template -// static inline Itr binaryFind(Itr first, Itr last, const T &val) -// { -// first = std::lower_bound(first, last, val); -// return (first != last && *first == val) ? first : last; -// } - -/* 排序 */ -static inline void sortReadEndsArr(vector &arr) -{ - size_t blockSize = 64 * 1024; - blockSize = min(blockSize, arr.size()); - size_t blockNum = (arr.size() + blockSize - 1) / blockSize; - size_t crossNum = 1024; - size_t start, end, i, left, right; - std::sort(arr.begin(), arr.begin() + blockSize); - for (i = 1; i < blockNum; ++i) - { - start = i * blockSize; - end = min(start + blockSize, arr.size()); - std::sort(arr.begin() + start, arr.begin() + end); - left = crossNum; - while (!(arr[start - left] < arr[start])) - { - left = left << 1; - if (left >= blockSize) - { - std::sort(arr.begin(), arr.end()); // 退化到普通排序 - return; - } - } - right = min(crossNum, end - start - 1); - - while (!(arr[start - 1] < arr[start + right])) - { - right = min(right << 1, end - start - 1); - if (right == end - start - 1) - break; - } - std::sort(arr.begin() + start - left, arr.begin() + start + right); - } -} - -/* 处理一组pairend的readends,标记冗余 */ -static void markDupsForPairs(vector &vpRe, - set *dupIdx, - set *opticalDupIdx, - set *notDupIdx = nullptr) -{ - if (vpRe.size() < 2) - { - if (vpRe.size() == 1) - { - // addSingletonToCount(libraryIdGenerator); - } - return; - } - int maxScore = 0; - const ReadEnds *pBest = nullptr; - /** All read ends should have orientation FF, FR, RF, or RR **/ - for (auto pe : vpRe) // 找分数最高的readend - { - if (pe->score > maxScore || pBest == nullptr) - { - maxScore = pe->score; - pBest = pe; - } - } - if (notDupIdx != nullptr) - { - notDupIdx->insert(pBest->read1IndexInFile); - notDupIdx->insert(pBest->read2IndexInFile); - } - if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余 - { - // trackOpticalDuplicates - } - for (auto pe : vpRe) // 对非best read标记冗余 - { - if (pe != pBest) // 非best - { - dupIdx->insert(pe->read1IndexInFile); // 添加read1 - if (pe->read2IndexInFile != pe->read1IndexInFile) - dupIdx->insert(pe->read2IndexInFile); // 添加read2 - } - } - - // if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) - // { - // addRepresentativeReadIndex(vpRe); - // } -} - -/* 处理一组非paired的readends,标记冗余 */ -static void markDupsForFrags(vector &vpRe, - bool containsPairs, - set *dupIdx, - set *notDupIdx = nullptr) -{ - if (containsPairs) - { - for (auto pe : vpRe) - { - if (!pe->IsPaired()) - { - dupIdx->insert(pe->read1IndexInFile); - } - } - } - else - { - int maxScore = 0; - const ReadEnds *pBest = nullptr; - for (auto pe : vpRe) - { - if (pe->score > maxScore || pBest == nullptr) - { - maxScore = pe->score; - pBest = pe; - } - } - if (notDupIdx != nullptr) - { - notDupIdx->insert(pBest->read1IndexInFile); - } - for (auto pe : vpRe) - { - if (pe != pBest) - { - dupIdx->insert(pe->read1IndexInFile); - } - } - } -} - -/* 找到与readend pos相等的所有readend */ -static void getEqualRE(const ReadEnds &re, vector &src, vector *dst) -{ - auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsLittleThan); // 只比对位点 - dst->insert(dst->end(), range.first, range.second); -} - -/* 单线程生成readends (第一步)*/ -static void generateReadEnds(SerailMarkDupArg *arg) -{ - auto &p = *arg; - auto &rnParser = g_vRnParser[0]; - - p.pairs.clear(); - p.frags.clear(); - p.unpairedDic.clear(); - p.unpairedPosArr.clear(); - - /* 处理每个read,创建ReadEnd,并放入frag和pair中 */ - set reSet; - - ReadEnds lastRe; - - for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read - { - BamWrap *bw = p.bams[i]; - const int64_t bamIdx = p.bamStartIdx + i; - if (bw->GetReadUnmappedFlag()) - { - if (bw->b->core.tid == -1) - // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). - break; - } - else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对 - { - ReadEnds fragEnd; - tm_arr[8].acc_start(); - buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); - tm_arr[8].acc_end(); - p.frags.push_back(fragEnd); // 添加进frag集合 - if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了 - { - string key = bw->query_name(); - if (p.unpairedDic.find(key) == p.unpairedDic.end()) - { - p.unpairedDic[key] = {p.taskSeq, fragEnd}; - } - else // 找到了pairend - { - auto &pairedEnds = p.unpairedDic.at(key).unpairedRE; - modifyPairedEnds(fragEnd, &pairedEnds); - p.pairs.push_back(pairedEnds); - p.unpairedDic.erase(key); // 删除找到的pairend - } - } - } - } - tm_arr[9].acc_start(); - sortReadEndsArr(p.frags); - // sort(p.frags.begin(), p.frags.end()); - tm_arr[9].acc_end(); - // cout << "sort pairs" << endl; - tm_arr[10].acc_start(); - sort(p.pairs.begin(), p.pairs.end()); - tm_arr[10].acc_end(); - // 记录位点上的未匹配的read个数 - for (auto &e : p.unpairedDic) { - auto posKey = e.second.unpairedRE.posKey; - auto &unpairArrInfo = p.unpairedPosArr[posKey]; - unpairArrInfo.unpairedNum++; - unpairArrInfo.taskSeq = p.taskSeq; - unpairArrInfo.readNameSet.insert(e.first); - } - cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << endl; -} - -/* 处理pairs */ -static void processPairs(vector &readEnds, - set *dupIdx, - set *opticalDupIdx, - set *notDupIdx = nullptr) -{ - vector vpCache; // 有可能是冗余的reads - const ReadEnds *pReadEnd = nullptr; - for (auto &re : readEnds) - { - if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 - vpCache.push_back(&re); // 处理一个潜在的冗余组 - else - { - markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样 - vpCache.clear(); - vpCache.push_back(&re); - pReadEnd = &re; - } - } - markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); -} - -/* 处理frags */ -static void processFrags(vector &readEnds, - set *dupIdx, - set *notDupIdx = nullptr) -{ - bool containsPairs = false; - bool containsFrags = false; - vector vpCache; // 有可能是冗余的reads - const ReadEnds *pReadEnd = nullptr; - for (auto &re : readEnds) - { - if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) - { - vpCache.push_back(&re); - containsPairs = containsPairs || re.IsPaired(); - containsFrags = containsFrags || !re.IsPaired(); - } - else - { - if (vpCache.size() > 1 && containsFrags) - { - markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); - } - vpCache.clear(); - vpCache.push_back(&re); - pReadEnd = &re; - containsPairs = re.IsPaired(); - containsFrags = !re.IsPaired(); - } - } - if (vpCache.size() > 1 && containsFrags) - { - markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); - } -} - -/* 单线程markdup (第二步)*/ -static void markdups(SerailMarkDupArg *arg) -{ - auto &p = *arg; - p.pairDupIdx.clear(); - p.pairOpticalDupIdx.clear(); - p.fragDupIdx.clear(); - /* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */ - // 先处理 pair - processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx); - - // 再处理frag - processFrags(p.frags, &p.fragDupIdx); -} - -/* 获取交叉部分的数据 */ -static inline void getIntersectData(vector &leftArr, - vector &rightArr, - vector *dst, - bool isPairCmp = false) -{ - const size_t leftEndIdx = leftArr.size() - 1; - const size_t rightStartIdx = 0; - size_t leftSpan = 0; - size_t rightSpan = 0; - - while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) - { - leftSpan += 1; - if (leftSpan > leftEndIdx) - { - leftSpan = leftArr.size() - 1; - break; - } - } - - while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) - { - rightSpan += 1; - if (rightSpan == rightArr.size() - 1) - break; - } - dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); - dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan); - std::sort(dst->begin(), dst->end()); -} - -/* 将frags重叠部分的dup idx放进数据中 */ -static inline void refreshFragDupIdx(set &dupIdx, - set ¬DupIdx, - SerailMarkDupArg * lastArg, - SerailMarkDupArg *curArg) -{ - auto &lp = *lastArg; - auto &p = *curArg; - for (auto idx : dupIdx) - { - lp.fragDupIdx.insert(idx); - p.fragDupIdx.erase(idx); - } - for (auto idx : notDupIdx) - { - lp.fragDupIdx.erase(idx); - p.fragDupIdx.erase(idx); - } -} - -/* 将pairs重叠部分的dup idx放进数据中 */ -static inline void refreshPairDupIdx(set &dupIdx, - set &opticalDupIdx, - set ¬DupIdx, - SerailMarkDupArg *lastArg, - SerailMarkDupArg *curArg) -{ - auto &lp = *lastArg; - auto &p = *curArg; - for (auto idx : dupIdx) - { - lp.pairDupIdx.insert(idx); - p.pairDupIdx.erase(idx); - } - for (auto idx : opticalDupIdx) - { - lp.pairOpticalDupIdx.insert(idx); - p.pairOpticalDupIdx.erase(idx); - } - for (auto idx : notDupIdx) - { - lp.pairDupIdx.erase(idx); - lp.pairOpticalDupIdx.erase(idx); - p.pairDupIdx.erase(idx); - p.pairOpticalDupIdx.erase(idx); - } -} - -// 用来分别处理dup和optical dup -static void refeshTaskDupInfo(set &dupIdx, - set &opticalDupIdx, - set ¬DupIdx, - set &latterDupIdx, - set &latterOpticalDupIdx, - set &latterNotDupIdx) -{ - for (auto idx : dupIdx) - latterDupIdx.insert(idx); - for (auto idx : opticalDupIdx) - latterOpticalDupIdx.insert(idx); - for (auto idx : notDupIdx) - latterNotDupIdx.insert(idx); -} - -/* 最后合并数据并排序 */ -static void refeshFinalTaskDupInfo(set &dupIdx, - set ¬DupIdx, - vector &dupArr) -{ - vector midArr; - - auto ai = dupArr.begin(); - auto bi = dupIdx.begin(); - auto ae = dupArr.end(); - auto be = dupIdx.end(); - - int64_t val = 0; - while (ai != ae && bi != be) - { - if (*ai < *bi) - { - val = *ai++; - } - else if (*bi < *ai) - { - val = *bi++; - } - else - { - val = *ai++; - bi++; - } - if (notDupIdx.find(val) == notDupIdx.end()) - { - midArr.push_back(val); - } - } - while (ai != ae) - { - val = *ai++; - if (notDupIdx.find(val) == notDupIdx.end()) - { - midArr.push_back(val); - } - } - while (bi != be) - { - val = *bi++; - if (notDupIdx.find(val) == notDupIdx.end()) - { - midArr.push_back(val); - } - } - dupArr = midArr; -} - -/* 处理相邻的两个任务,有相交叉的数据 */ -static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) -{ - auto &lp = *lastArg; - auto &p = *curArg; - auto &g = *gDataArg; - - vector reArr; - set dupIdx; - set notDupIdx; - // 先处理重叠的frags - getIntersectData(lp.frags, p.frags, &reArr); - processFrags(reArr, &dupIdx, ¬DupIdx); - refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p); - - // 再处理重叠的pairs - reArr.clear(); - dupIdx.clear(); - notDupIdx.clear(); - set opticalDupIdx; - getIntersectData(lp.pairs, p.pairs, &reArr, true); - processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx); - refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p); - - // 处理之前未匹配的部分 - map recalcPos; - set alreadyAdd; // 与该位点相同的pair都添加到数组里了 - set addToGlobal; - int64_t prevLastPos = 0, nextFirstPos = 0; - if (lp.frags.size() > 0) - prevLastPos = lp.frags.back().posKey; - if (p.frags.size() > 0) - nextFirstPos = p.frags[0].posKey; - // cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl; - for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read - { - auto &readName = prevUnpair.first; - auto &prevPosInfo = prevUnpair.second; - auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end - - if (p.unpairedDic.find(readName) != p.unpairedDic.end()) // 在当前这个任务里找到了这个未匹配的read - { - auto &nextPosInfo = p.unpairedDic[readName]; - auto &nextFragEnd = nextPosInfo.unpairedRE; - int64_t prevPosKey = prevFragEnd.posKey; - modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read - int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey); - CalcKey ck = {prevPosKey, nextPosKey}; - UnpairedPosInfo *prevUnpairInfoP = nullptr; - UnpairedPosInfo *nextUnpairInfoP = nullptr; - if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end()) - prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey]; - if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end()) - nextUnpairInfoP = &p.unpairedPosArr[prevPosKey]; - - // pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定 - // 1. prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据; - // 2. prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算 - // 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去 - // 3. prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算 - // 复杂情况2. p中是否包含prevPosKey对应的unpair - // 4. prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算 - - bool addDataToPos = true; - if (alreadyAdd.find(ck) != alreadyAdd.end()) - { - addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了 - } - else - alreadyAdd.insert(ck); - - if (prevPosKey < nextFirstPos) // prevpos在交叉部分之前 - { - auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr - prevPairArr.push_back(prevFragEnd); - if (nextPosKey <= prevLastPos && addDataToPos) // 第二种情况 - { - getEqualRE(prevFragEnd, lp.pairs, &prevPairArr); - } - // 第一种情况,第二种情况下都会出现,复杂情况一 - auto gPosInfo = g.unpairedPosArr.find(prevPosKey); - if (gPosInfo != g.unpairedPosArr.end()) // 可能g和p有匹配的,刚好和该位点一致 - { - auto &gUnpairInfo = gPosInfo->second; - auto pPosInfo = p.unpairedPosArr.find(nextPosKey); - if (pPosInfo != p.unpairedPosArr.end()) - { - auto &pUnpairInfo = pPosInfo->second; - for (auto &rn : gUnpairInfo.readNameSet) // 遍历每一个readname,看是否有匹配的 - { - if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) - { - auto pe = g.unpairedDic[rn].unpairedRE; - auto fe = p.unpairedDic[rn].unpairedRE; - modifyPairedEnds(fe, &pe); - prevPairArr.push_back(pe); - g.unpairedDic.erase(rn); - p.unpairedDic.erase(rn); - // cout << "找到了!" << rn << endl; - } - } - } - } - recalcPos[ck] = prevPosInfo.taskSeq; - std::sort(prevPairArr.begin(), prevPairArr.end()); - } - else // prevpos在交叉部分 - { - if (nextPosKey > prevLastPos) // nextpos在交叉部分之后 - { // 第三种情况 - if (nextUnpairInfoP != nullptr) // 且在pos点,next task有unpair,这样才把这些数据放到next task里 - { - auto &nextPairArr = nextUnpairInfoP->pairArr; - nextPairArr.push_back(prevFragEnd); - auto &prevPairArr = prevUnpairInfoP->pairArr; - prevPairArr.push_back(prevFragEnd); - if (addDataToPos) - { - getEqualRE(prevFragEnd, p.pairs, &prevPairArr); - } - recalcPos[ck] = nextPosInfo.taskSeq; // 将数据放到next task里, (这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除) - std::sort(prevPairArr.begin(), prevPairArr.end()); - } - else // next task在该位点没有unpair,那就把数据放到prev task里 - { - auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr - prevPairArr.push_back(prevFragEnd); - if (addDataToPos) // 第二种情况 - getEqualRE(prevFragEnd, p.pairs, &prevPairArr); - recalcPos[ck] = prevPosInfo.taskSeq; - std::sort(prevPairArr.begin(), prevPairArr.end()); - } - } - else - { // 第四种情况 - if (prevUnpairInfoP == nullptr) { - prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey]; - prevUnpairInfoP->taskSeq = lp.taskSeq; - } - auto &prevPairArr = prevUnpairInfoP->pairArr; - prevPairArr.push_back(prevFragEnd); - if (addDataToPos) - { - getEqualRE(prevFragEnd, lp.pairs, &prevPairArr); - getEqualRE(prevFragEnd, p.pairs, &prevPairArr); - } - recalcPos[ck] = prevPosInfo.taskSeq; - std::sort(prevPairArr.begin(), prevPairArr.end()); - } - } - p.unpairedDic.erase(readName); // 在next task里删除该read - } - else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read - { - auto &remainPosInfo = g.unpairedDic[readName]; - auto remainFragEnd = remainPosInfo.unpairedRE; - int64_t remainPosKey = remainFragEnd.posKey; - modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read - auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey]; - auto &remainPairArr = remainUnpairInfo.pairArr; - remainPairArr.push_back(remainFragEnd); - CalcKey ck = {remainPosKey, prevFragEnd.posKey}; - recalcPos[ck] = remainPosInfo.taskSeq; - std::sort(remainPairArr.begin(), remainPairArr.end()); - - g.unpairedDic.erase(readName); - } - else // 都没找到,那就保存到遗留数据里 - { - int64_t prevPosKey = prevFragEnd.posKey; - g.unpairedDic.insert(prevUnpair); - addToGlobal.insert(prevPosKey); - } - } - for (auto posKey : addToGlobal) // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据 - g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; - - map taskChanged; - set posProcessed; - for (auto &e : recalcPos) - { - auto posKey = e.first.read1Pos; - if (posProcessed.find(posKey) != posProcessed.end()) - continue; - posProcessed.insert(posKey); - auto taskSeq = e.second; - auto &t = taskChanged[taskSeq]; - // 在对应的任务包含的dup idx里修改结果数据 - vector *pairArrP = nullptr; - if (taskSeq < lp.taskSeq) - pairArrP = &g.unpairedPosArr[posKey].pairArr; - else - pairArrP = &lp.unpairedPosArr[posKey].pairArr; - processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx); - if (taskSeq < lp.taskSeq) - g.unpairedPosArr.erase(posKey); - } - // 更新结果 - - for (auto &e : taskChanged) - { - auto taskSeq = e.first; - auto &t = e.second; - if (taskSeq < lp.taskSeq) - { - refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, - g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]); - } - else if (taskSeq == lp.taskSeq) - { - refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p); - } - else - { - refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中 - } - } - - // cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; - // cout << "calc g time: " << t.seconds_elapsed() << " s" << endl; - // 将dupidx放进全局数据 - g.latterDupIdxArr.push_back(set()); - g.latterOpticalDupIdxArr.push_back(set()); - g.latterNotDupIdxArr.push_back(set()); - - g.dupIdxArr.push_back(vector()); - auto &vIdx = g.dupIdxArr.back(); - lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); - vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); - - g.opticalDupIdxArr.push_back(vector()); - auto &vOpticalIdx = g.opticalDupIdxArr.back(); - vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); -} - -/* 当所有任务结束后,global data里还有未处理的数据 */ -static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) -{ - auto &lp = *task; - auto &g = *gDataArg; - // 遗留的未匹配的pair - for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read - { - auto &readName = prevUnpair.first; - auto &prevPosInfo = prevUnpair.second; - auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end - - if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read - { - auto &remainPosInfo = g.unpairedDic[readName]; - auto remainFragEnd = remainPosInfo.unpairedRE; - int64_t remainPosKey = remainFragEnd.posKey; - modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read - auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey]; - - remainUnpairInfo.pairArr.push_back(remainFragEnd); - g.unpairedDic.erase(readName); - } - } - map taskChanged; - for (auto &e : g.unpairedPosArr) - { - auto posKey = e.first; - auto taskSeq = e.second.taskSeq; - auto &t = taskChanged[taskSeq]; - auto &arr = g.unpairedPosArr[posKey].pairArr; - - if (arr.size() > 1) - { - std::sort(arr.begin(), arr.end()); - processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx); - } - } - // 更新结果 - vector addDup; - map ndPosVal; - for (auto &e : taskChanged) - { - auto taskSeq = e.first; - auto &t = e.second; - refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, - g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]); - } - - cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; - g.unpairedPosArr.clear(); - g.unpairedDic.clear(); - - // 将dupidx放进全局数据 - for (int i = 0; i < g.dupIdxArr.size() - 1; ++i) - refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]); - for (int i = 0; i < g.opticalDupIdxArr.size() - 1; ++i) - refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]); - - g.dupIdxArr.push_back(vector()); - auto &vIdx = g.dupIdxArr.back(); - lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); - vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); - - g.opticalDupIdxArr.push_back(vector()); - auto &vOpticalIdx = g.opticalDupIdxArr.back(); - vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); -} - -/* 串行处理数据,标记冗余 */ -static void serialMarkDups() -{ - tm_arr[5].acc_start(); - Timer::log_time("serial start"); - // 读取缓存初始化 - BamBufType inBamBuf(g_gArg.use_asyncio); - inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem); - // BamBufType inBamBuf(false); - // inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024); - int64_t processedBamNum = 0; - - SerailMarkDupArg smdArg1, smdArg2; - SerailMarkDupArg *lastArgP = &smdArg1; - SerailMarkDupArg *curArgP = &smdArg2; - - bool isFirstRound = true; - int roundNum = 0; - while (inBamBuf.ReadStat() >= 0) - { - Timer t_round; - // 读取bam文件中的read - tm_arr[4].acc_start(); - size_t readNum = inBamBuf.ReadBam(); - tm_arr[4].acc_end(); - cout << "read num: " << readNum << endl; - // lastArgP = curArgP; - tm_arr[6].acc_start(); - curArgP->taskSeq = roundNum; - curArgP->bamStartIdx = processedBamNum; - curArgP->bams = inBamBuf.GetBamArr(); - tm_arr[6].acc_end(); - - tm_arr[0].acc_start(); - Timer t1; - generateReadEnds(curArgP); - //cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl; - tm_arr[0].acc_end(); - - tm_arr[1].acc_start(); - t1.reinit(); - markdups(curArgP); - //cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl; - tm_arr[1].acc_end(); - - if (!isFirstRound) - { - tm_arr[2].acc_start(); - t1.reinit(); - handleIntersectData(lastArgP, curArgP, &gData); - //cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl; - // addTaskIdxToSet(lastArgP, &gData); - tm_arr[2].acc_end(); - } - else - { - isFirstRound = false; - } - inBamBuf.ClearAll(); // 清理上一轮读入的数据 - processedBamNum += readNum; - - // 交换 - auto tmp = lastArgP; - lastArgP = curArgP; - curArgP = tmp; - cout << "round time: " << t_round.seconds_elapsed() << endl; - roundNum++; - if (roundNum > 9){ -// break; - } - } - // cout << "here" << endl; - tm_arr[3].acc_start(); - // 处理剩下的全局数据 - handleLastTask(lastArgP, &gData); - // cout << "here 2" << endl; - tm_arr[3].acc_end(); - - tm_arr[5].acc_end(); - // 统计所有冗余index数量 - int64_t dupNum = 0; - set dup; - - // int taskSeq = 0; - // for (auto &arr : gData.dupIdxArr) - // { - // for (auto idx : arr) { - // if (dup.find(idx) != dup.end()) - // { - // cout << "dup index: " << taskSeq << '\t' << idx << endl; - // } - // dup.insert(idx); - // } - // taskSeq++; - // } - // #include - // ofstream out("tumor_dup.txt"); - // for (auto idx : dup) - // { - // out << idx << endl; - // } - // out.close(); - - for (auto &arr : gData.dupIdxArr) - dupNum += arr.size(); - - cout << "dup num : " << dupNum << '\t' << dup.size() << endl; - - cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl; - cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl; - cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl; - cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl; - cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl; - cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl; - cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl; - cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl; - cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; - cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; - cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; - - Timer::log_time("serial end "); - - //for (auto i : gData.dupArr) - // cout << i << endl; -} \ No newline at end of file +// 串行运行mark duplicate +void serialMarkDups(); \ No newline at end of file diff --git a/src/sam/markdups/shared_args.h b/src/sam/markdups/shared_args.h new file mode 100644 index 0000000..3f739ec --- /dev/null +++ b/src/sam/markdups/shared_args.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +extern Timer tm_arr[20]; // 用来测试性能 +/* 全局本地变量 */ +extern vector g_vRnParser; // 每个线程一个read name parser +extern samFile *g_inBamFp; // 输入的bam文件 +extern sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息 +extern samFile *g_outBamFp; // 输出文件, sam或者bam格式 +extern sam_hdr_t *g_outBamHeader; // 输出文件的header + +/* 参数对象作为全局对象,免得多次作为参数传入函数中 */ +class GlobalArg; +extern GlobalArg &g_gArg; +class MarkDupsArg; +extern MarkDupsArg &g_mdArg; +class GlobalDataArg; +extern GlobalDataArg &gData; +class DuplicationMetrics; +extern DuplicationMetrics &gMetrics; \ No newline at end of file diff --git a/src/sam/utils/read_ends.h b/src/sam/utils/read_ends.h index 1ec6b0e..25ff0b8 100644 --- a/src/sam/utils/read_ends.h +++ b/src/sam/utils/read_ends.h @@ -1,5 +1,6 @@ /* -Description: read ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等 +Description: read +ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等 Copyright : All right reserved by ICT @@ -13,18 +14,20 @@ Date : 2023/11/3 #include /** - * Small interface that provides access to the physical location information about a cluster. - * All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow - * non-zero positive integers, x and y coordinates may be negative. + * Small interface that provides access to the physical location information + * about a cluster. All values should be defaulted to -1 if unavailable. + * ReadGroup and Tile should only allow non-zero positive integers, x and y + * coordinates may be negative. */ -struct PhysicalLocation -{ +struct PhysicalLocation { + static const int NO_VALUE = -1; /** - * Small class that provides access to the physical location information about a cluster. - * All values should be defaulted to -1 if unavailable. Tile should only allow - * non-zero positive integers, x and y coordinates must be non-negative. - * This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts - * thus, they do not overflow within a HiSeqX tile. + * Small class that provides access to the physical location information + * about a cluster. All values should be defaulted to -1 if unavailable. + * Tile should only allow non-zero positive integers, x and y coordinates + * must be non-negative. This is different from PhysicalLocationShort in + * that the x and y positions are ints, not shorts thus, they do not + * overflow within a HiSeqX tile. */ int16_t tile = -1; int32_t x = -1; @@ -32,28 +35,33 @@ struct PhysicalLocation }; /* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/ -struct ReadEnds : PhysicalLocation -{ +struct ReadEnds : PhysicalLocation { static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5; /* 保留一些bam记录中的数据 */ bool read1FirstOfPair = true; /* ReadEnds中的成员变量 */ - /** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */ + /** Little struct-like class to hold read pair (and fragment) end data for + * duplicate marking. */ // int16_t libraryId; // 没用,不考虑多样本 int8_t orientation = -1; int32_t read1ReferenceIndex = -1; int32_t read1Coordinate = -1; int32_t read2ReferenceIndex = -1; - int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported) + // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported) + int32_t read2Coordinate = -1; /* Additional information used to detect optical dupes */ - // int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read group,normal或者tumor - /** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */ + // int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read + // group,normal或者tumor + /** For optical duplicate detection the orientation matters regard to 1st or + * 2nd end of a mate */ int8_t orientationForOpticalDuplicates = -1; - /** A *transient* flag marking this read end as being an optical duplicate. */ + /** A *transient* flag marking this read end as being an optical duplicate. + */ bool isOpticalDuplicate = false; /* ReadEndsForMarkDuplicates中的成员变量 */ - /** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/ + /** Little struct-like class to hold read pair (and fragment) end data for + * MarkDuplicatesWithMateCigar **/ int16_t score = 0; int64_t read1IndexInFile = -1; int64_t read2IndexInFile = -1; @@ -62,23 +70,22 @@ struct ReadEnds : PhysicalLocation /* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */ // int32_t barcode = 0; // primary barcode for this read (and pair) // int32_t readOneBarcode = 0; // read one barcode, 0 if not present - // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired + // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not + // paired /* zzh增加的成员变量 */ - int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos; + int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << + // MAX_CONTIG_LEN_SHIFT | (int64_t)pos; /* 根据pairend read的比对方向,来确定整体的比对方向 */ - static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand) - { - if (read1NegativeStrand) - { + static int8_t GetOrientationByte(bool read1NegativeStrand, + bool read2NegativeStrand) { + if (read1NegativeStrand) { if (read2NegativeStrand) return RR; else return RF; - } - else - { + } else { if (read2NegativeStrand) return FR; else @@ -87,47 +94,38 @@ struct ReadEnds : PhysicalLocation } /* 比较两个readends是否一样(有个冗余) */ - static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2) - { + static bool AreComparableForDuplicates(const ReadEnds &lhs, + const ReadEnds &rhs, + bool compareRead2) { bool areComparable = true; areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex && lhs.read1Coordinate == rhs.read1Coordinate && lhs.orientation == rhs.orientation; - if (areComparable && compareRead2) - { - areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex && - lhs.read2Coordinate == rhs.read2Coordinate; + if (areComparable && compareRead2) { + areComparable = + lhs.read2ReferenceIndex == rhs.read2ReferenceIndex && + lhs.read2Coordinate == rhs.read2Coordinate; } return areComparable; } /* 比对方向是否正向 */ - bool IsPositiveStrand() const - { - return orientation == F; - } + bool IsPositiveStrand() const { return orientation == F; } /* pairend是否合适的比对上了 */ - bool IsPaired() const - { - return read2ReferenceIndex != -1; - } + bool IsPaired() const { return read2ReferenceIndex != -1; } - bool IsNegativeStrand() const - { - return orientation == R; - } + bool IsNegativeStrand() const { return orientation == R; } // 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来 - static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false) - { + static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, + bool compareRead2 = false) { int comp = a.read1ReferenceIndex - b.read1ReferenceIndex; if (comp == 0) comp = a.read1Coordinate - b.read1Coordinate; if (comp == 0) comp = a.orientation - b.orientation; - if (compareRead2) - { + if (compareRead2) { if (comp == 0) comp = a.read2ReferenceIndex - b.read2ReferenceIndex; if (comp == 0) @@ -137,14 +135,12 @@ struct ReadEnds : PhysicalLocation } // 找某一个位置的所有readend时需要 - static bool pairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) - { + static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) { return ReadLittleThan(lhs, rhs, true); } // 比较函数 - bool operator < (const ReadEnds &o) const - { + bool operator<(const ReadEnds &o) const { int comp = read1ReferenceIndex - o.read1ReferenceIndex; if (comp == 0) comp = read1Coordinate - o.read1Coordinate; diff --git a/src/sam/utils/read_name_parser.h b/src/sam/utils/read_name_parser.h index 34dfcb0..a7572a7 100644 --- a/src/sam/utils/read_name_parser.h +++ b/src/sam/utils/read_name_parser.h @@ -9,11 +9,12 @@ Date : 2023/11/6 #ifndef READ_NAME_PARSER_H_ #define READ_NAME_PARSER_H_ -#include "read_ends.h" #include - #include + #include + +#include "read_ends.h" // #include #include @@ -24,103 +25,107 @@ using std::string; /** * Provides access to the physical location information about a cluster. - * All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow - * non-zero positive integers, x and y coordinates may be negative. - * 非线程安全 + * All values should be defaulted to -1 if unavailable. ReadGroup and Tile + * should only allow non-zero positive integers, x and y coordinates may be + * negative. 非线程安全 */ -struct ReadNameParser -{ +struct ReadNameParser { /** - * The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location, - * and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is: + * The read name regular expression (regex) is used to extract three pieces + * of information from the read name: tile, x location, and y location. Any + * read name regex should parse the read name to produce these and only + * these values. An example regex is: * (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$ - * which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations, - * ignoring any trailing non-digit characters. + * which assumes that fields in the read name are delimited by ':' and the + * last three fields correspond to the tile, x and y locations, ignoring any + * trailing non-digit characters. * - * The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last - * three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names - * where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by - * Illumina technology. + * The default regex is optimized for fast parsing (see {@link + * #getLastThreeFields(String, char, int[])}) by searching for the last + * three fields, ignoring any trailing non-digit characters, assuming the + * delimiter ':'. This should consider correctly read names where we have 5 + * or 7 field with the last three fields being tile/x/y, as is the case for + * the majority of read names produced by Illumina technology. */ - const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$"; + const string DEFAULT_READ_NAME_REGEX = + "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$"; string readNameStored = ""; PhysicalLocation physicalLocationStored; - int tmpLocationFields[3]; // for optimization of addLocationInformation - bool useOptimizedDefaultParsing = true; // was the regex default? + int tmpLocationFields[3]; // for optimization of addLocationInformation + bool useOptimizedDefaultParsing = true; // was the regex default? string readNameRegex = DEFAULT_READ_NAME_REGEX; regex readNamePattern; bool warnedAboutRegexNotMatching = true; ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {} - ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {} - ReadNameParser(const string &strReadNameRegex, bool isWarn) - { + ReadNameParser(const string &strReadNameRegex) + : ReadNameParser(strReadNameRegex, true) {} + ReadNameParser(const string &strReadNameRegex, bool isWarn) { readNameRegex = strReadNameRegex; if (strReadNameRegex == DEFAULT_READ_NAME_REGEX) useOptimizedDefaultParsing = true; else useOptimizedDefaultParsing = false; - readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize); + readNamePattern = + boost::regex(strReadNameRegex, boost::regex_constants::optimize); warnedAboutRegexNotMatching = isWarn; } /* 重新设置readNameRegex */ - void SetReadNameRegex(const string &strReadNameRegex) - { + void SetReadNameRegex(const string &strReadNameRegex) { readNameRegex = strReadNameRegex; if (strReadNameRegex == DEFAULT_READ_NAME_REGEX) useOptimizedDefaultParsing = true; else useOptimizedDefaultParsing = false; - readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize); + readNamePattern = + boost::regex(strReadNameRegex, boost::regex_constants::optimize); // readNamePattern = strReadNameRegex; } /* 添加测序时候的tile x y 信息 */ - bool AddLocationInformation(const string &readName, PhysicalLocation *loc) - { - if (!(readName == readNameStored)) - { - if (ReadLocationInformation(readName, loc)) - { + bool AddLocationInformation(const string &readName, PhysicalLocation *loc) { + if (!(readName == readNameStored)) { + if (ReadLocationInformation(readName, loc)) { readNameStored = readName; physicalLocationStored = *loc; return true; } // return false if read name cannot be parsed return false; - } - else - { + } else { *loc = physicalLocationStored; return true; } } /** - * Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it - * can be used later to determine optical duplication + * Method used to extract tile/x/y from the read name and add it to the + * PhysicalLocationShort so that it can be used later to determine optical + * duplication * * @param readName the name of the read/cluster * @param loc the object to add tile/x/y to - * @return true if the read name contained the information in parsable form, false otherwise + * @return true if the read name contained the information in parsable form, + * false otherwise */ - bool ReadLocationInformation(const string &readName, PhysicalLocation *loc) - { + bool ReadLocationInformation(const string &readName, + PhysicalLocation *loc) { try { - // Optimized version if using the default read name regex (== used on purpose): - if (useOptimizedDefaultParsing) - { + // Optimized version if using the default read name regex (== used + // on purpose): + if (useOptimizedDefaultParsing) { const int fields = getLastThreeFields(readName, ':'); - if (!(fields == 5 || fields == 7)) - { - if (warnedAboutRegexNotMatching) - { + if (!(fields == 5 || fields == 7)) { + if (warnedAboutRegexNotMatching) { Warn( - "Default READ_NAME_REGEX '%s' did not match read name '%s'." - "You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. " - "Note that this message will not be emitted again even if other read names do not match the regex.", + "Default READ_NAME_REGEX '%s' did not match read " + "name '%s'." + "You may need to specify a READ_NAME_REGEX in " + "order to correctly identify optical duplicates. " + "Note that this message will not be emitted again " + "even if other read names do not match the regex.", readNameRegex.c_str(), readName.c_str()); warnedAboutRegexNotMatching = false; } @@ -130,13 +135,9 @@ struct ReadNameParser loc->x = tmpLocationFields[1]; loc->y = tmpLocationFields[2]; return true; - } - else if (readNameRegex.empty()) - { + } else if (readNameRegex.empty()) { return false; - } - else - { + } else { // Standard version that will use the regex cmatch m; if (boost::regex_match(readName.c_str(), m, readNamePattern)) { @@ -144,28 +145,25 @@ struct ReadNameParser loc->x = std::stoi(m[2].str()); loc->y = std::stoi(m[3].str()); return true; - } - else - { - if (warnedAboutRegexNotMatching) - { + } else { + if (warnedAboutRegexNotMatching) { Warn( "READ_NAME_REGEX '%s' did not match read name '%s'." "Your regex may not be correct. " - "Note that this message will not be emitted again even if other read names do not match the regex.", + "Note that this message will not be emitted again " + "even if other read names do not match the regex.", readNameRegex.c_str(), readName.c_str()); warnedAboutRegexNotMatching = false; } return false; } } - } - catch (const std::runtime_error &e) - { - if (warnedAboutRegexNotMatching) - { + } catch (const std::runtime_error &e) { + if (warnedAboutRegexNotMatching) { Warn( - "A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s", + "A field parsed out of a read name was expected to contain " + "an integer and did not. READ_NAME_REGEX: %s; Read name: " + "%s; Error Msg: %s", readNameRegex.c_str(), readName.c_str(), e.what()); warnedAboutRegexNotMatching = false; } @@ -175,39 +173,39 @@ struct ReadNameParser } /** - * Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field - * considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array. + * Given a string, splits the string by the delimiter, and returns the the + * last three fields parsed as integers. Parsing a field considers only a + * sequence of digits up until the first non-digit character. The three + * values are stored in the passed-in array. * - * @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers + * @throws NumberFormatException if any of the tokens that should contain + * numbers do not start with parsable numbers */ - int getLastThreeFields(const string &readName, char delim) - { - int tokensIdx = 2; // start at the last token + int getLastThreeFields(const string &readName, char delim) { + int tokensIdx = 2; // start at the last token int numFields = 0; int i, endIdx; endIdx = readName.size(); // find the last three tokens only - for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) - { - if (readName.at(i) == delim || 0 == i) - { + for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) { + if (readName.at(i) == delim || 0 == i) { numFields++; const int startIdx = (0 == i) ? 0 : (i + 1); - tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx)); + tmpLocationFields[tokensIdx] = + std::stoi(readName.substr(startIdx, endIdx - startIdx)); tokensIdx--; endIdx = i; } } // continue to find the # of fields - while (0 <= i) - { + while (0 <= i) { if (readName.at(i) == delim || 0 == i) numFields++; i--; } - if (numFields < 3) - { - tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1; + if (numFields < 3) { + tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = + -1; numFields = -1; }