检查optical duplication,用graph来检测,正在写这个

This commit is contained in:
zzh 2024-08-22 02:28:36 +08:00
parent 38bc489004
commit 022be611cd
27 changed files with 2133 additions and 2007 deletions

3
.gitignore vendored
View File

@ -1,3 +1,6 @@
# for fast-markdup
*.sam
*.log
# ---> C++
# Prerequisites
*.d

4
.vscode/launch.json vendored
View File

@ -13,10 +13,10 @@
"program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [
"MarkDuplicates",
"--INPUT", "/mnt/d/data/100w.bam",
"--INPUT", "~/data/bam/100w.bam",
"--OUTPUT", "out.bam",
"--METRICS_FILE", "metrics.txt",
"--num_threads", "12",
"--num_threads", "1",
"--max_mem", "4G",
"--verbosity", "DEBUG",
"--asyncio", "true",

View File

@ -88,6 +88,13 @@
"typeindex": "cpp",
"typeinfo": "cpp",
"valarray": "cpp",
"variant": "cpp"
"variant": "cpp",
"__split_buffer": "cpp",
"executor": "cpp",
"io_context": "cpp",
"netfwd": "cpp",
"timer": "cpp",
"__nullptr": "cpp",
"__node_handle": "c"
}
}

View File

@ -1,8 +1,8 @@
#!/bin/bash
dir="/home/zzh/work/GeneKit/picard_cpp/build"
dir="/home/zzh/work/ngs/picard_cpp/build"
[ -d "$dir" ] && rm -rf "$dir"
mkdir "$dir"
cd "$dir"
#cmake .. -DCMAKE_BUILD_TYPE=Debug
cmake .. -DCMAKE_BUILD_TYPE=Release
make -j 8
make -j 16

15
run.sh
View File

@ -1,9 +1,14 @@
time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
input=~/data/bam/zy_normal.bam
#input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \
--INPUT /mnt/d/data/zy_tumor.bam \
--OUTPUT /mnt/d/data/out.bam \
--INPUT $input \
--OUTPUT ~/data/bam/out.bam \
--INDEX_FORMAT BAI \
--num_threads 1 \
--max_mem 4G \
--max_mem 2G \
--verbosity DEBUG \
--asyncio true #\
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
@ -12,4 +17,4 @@ time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
# --INPUT /mnt/d/data/100w.bam \
# --INPUT /mnt/d/data/zy_normal.bam \
# zy_tumor
# tumor_region
# tumor_region

View File

@ -13,116 +13,87 @@
* BamBuf
*/
// 读取数据直到读完,或者缓冲区满
int BamBuf::ReadBam()
{
int BamBuf::ReadBam() {
int read_num = 0;
if (handle_last)
{ // 处理上次读入的最后一个bam
if (has_enough_space())
{ // 必须调用在边界处调整memffset
if (handle_last) { // 处理上次读入的最后一个bam
if (has_enough_space()) { // 必须调用在边界处调整memffset
++read_num;
append_one_bam();
}
else
{
return read_num; // 还是没空间
} else {
return read_num; // 还是没空间
}
}
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0)
{
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
if (has_enough_space())
{ // 还有空间
if (has_enough_space()) { // 还有空间
append_one_bam();
++read_num; // 放进缓存才算读取到
}
else
{
++read_num; // 放进缓存才算读取到
} else {
break;
}
}
if (read_stat_ >= 0)
{
if (read_stat_ >= 0) {
handle_last = true;
}
else
{
} else {
handle_last = false;
}
return read_num;
}
// 初始化缓存
void BamBuf::Init(samFile *fp,
sam_hdr_t *hdr,
int64_t mem_size)
{
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
this->fp = fp;
this->hdr = hdr;
this->mem_size = mem_size;
this->mem = (uint8_t *)malloc(mem_size);
this->bw = (BamWrap *)malloc(sizeof(BamWrap));
this->bw->b = bam_init1();
if (bw == NULL ||
this->mem == NULL ||
this->bw->b == NULL)
{
if (bw == NULL || this->mem == NULL || this->bw->b == NULL) {
fprintf(stderr, "allocate memory failed! Abort\n");
exit(-1);
}
}
void BamBuf::ClearBeforeIdx(size_t idxInBv)
{
void BamBuf::ClearBeforeIdx(size_t idxInBv) {
if (idxInBv < 1)
return;
int i = 0, j = idxInBv;
for (; j < bv.size(); ++i, ++j)
{
for (; j < bv.size(); ++i, ++j) {
bv[i] = bv[j];
}
bv.resize(i);
prepare_read();
}
void BamBuf::ClearAll()
{
void BamBuf::ClearAll() {
bv.clear();
prepare_read();
}
// 为下一次读取做准备, 计算一些边界条件
inline void BamBuf::prepare_read()
{
inline void BamBuf::prepare_read() {
// 计算余留的下次计算可能用到的bam所占的位置
if (bv.size() > 0)
{
if (bv.size() > 0) {
BamWrap *bw = bv[0];
legacy_start = (int64_t)bw - (int64_t)mem;
bw = bv.back();
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
}
else
{
} else {
legacy_start = legacy_end = 0;
mem_offset = 0; // 上次没剩下,那就从头存储
mem_offset = 0; // 上次没剩下,那就从头存储
}
}
// 检查缓存是否还有空间
inline bool BamBuf::has_enough_space()
{
inline bool BamBuf::has_enough_space() {
const uint32_t bam_len = bw->length();
int64_t potential_end = mem_offset + bam_len;
if (legacy_start <= legacy_end)
legacy_start += mem_size;
if (potential_end >= legacy_start)
{
if (potential_end >= legacy_start) {
return false;
}
if (potential_end >= mem_size)
{
if (potential_end >= mem_size) {
mem_offset = 0;
}
int64_t virtual_offset = mem_offset;
@ -133,8 +104,7 @@ inline bool BamBuf::has_enough_space()
}
// 处理一个读取后的bam
inline void BamBuf::append_one_bam()
{
inline void BamBuf::append_one_bam() {
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
*bwp = *bw;
bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
@ -148,12 +118,9 @@ inline void BamBuf::append_one_bam()
}
// 处理上次读入的最后一个read
inline bool BamBuf::handle_last_read()
{
if (handle_last)
{ // 处理上次读入的最后一个bam
if (has_enough_space())
{ // 必须调用在边界处调整memffset
inline bool BamBuf::handle_last_read() {
if (handle_last) { // 处理上次读入的最后一个bam
if (has_enough_space()) { // 必须调用在边界处调整memffset
append_one_bam();
handle_last = false;
return true;
@ -166,49 +133,35 @@ inline bool BamBuf::handle_last_read()
* AsyncIoBamBuf
*/
// 初始化缓存
void AsyncIoBamBuf::Init(samFile *fp,
sam_hdr_t *hdr,
int64_t mem_size)
{
if (use_async_io_)
{
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
if (use_async_io_) {
buf1_.Init(fp, hdr, mem_size >> 1);
buf2_.Init(fp, hdr, mem_size >> 1);
pi_ = &buf1_;
po_ = &buf2_;
tid_ = (pthread_t *)malloc(sizeof(pthread_t));
}
else
{
} else {
buf1_.Init(fp, hdr, mem_size);
pi_ = &buf1_;
}
}
// 读取数据
int AsyncIoBamBuf::ReadBam()
{
if (use_async_io_)
{
int AsyncIoBamBuf::ReadBam() {
if (use_async_io_) {
hasThread = true;
return async_read_bam();
}
else
{
} else {
return sync_read_bam();
}
}
int AsyncIoBamBuf::sync_read_bam()
{
int AsyncIoBamBuf::sync_read_bam() {
int read_num = 0;
if (clear_all_)
{
if (clear_all_) {
clear_all_ = false;
pi_->ClearAll();
}
else if (clear_before_idx_ > 0)
{
} else if (clear_before_idx_ > 0) {
pi_->ClearBeforeIdx(clear_before_idx_);
clear_before_idx_ = 0;
}
@ -217,23 +170,18 @@ int AsyncIoBamBuf::sync_read_bam()
return read_num;
}
int AsyncIoBamBuf::async_read_bam()
{
int AsyncIoBamBuf::async_read_bam() {
int read_num = 0;
if (first_read_)
{
if (first_read_) {
read_num = pi_->ReadBam();
first_read_ = false;
refresh_bam_arr();
}
else
{
} else {
// join, 交换缓冲区指针
pthread_join(*tid_, 0);
resize_buf();
if (need_read_)
{ // 需要交换指针
if (need_read_) { // 需要交换指针
BamBuf *tmp = pi_;
pi_ = po_;
po_ = tmp;
@ -246,72 +194,52 @@ int AsyncIoBamBuf::async_read_bam()
return read_num;
}
void *AsyncIoBamBuf::async_read(void *data)
{
void *AsyncIoBamBuf::async_read(void *data) {
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
if (ab->need_read_ && ab->ReadStat() >= 0)
{ // 需要读取
if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取
ab->last_read_num_ = ab->po_->ReadBam();
}
else
{
} else {
ab->last_read_num_ = 0;
}
pthread_exit(0);
}
// 为下一次读取做准备, 计算一些边界条件延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv)
{
// 为下一次读取做准备,
// 计算一些边界条件延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) {
clear_before_idx_ = idxInBv;
}
// 清空上一次所有读入的数据延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearAll()
{
clear_all_ = true;
}
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
inline void AsyncIoBamBuf::resize_buf()
{
if (clear_all_)
{ // 清理上一轮的数据
inline void AsyncIoBamBuf::resize_buf() {
if (clear_all_) { // 清理上一轮的数据
clear_all_ = false;
po_->ClearBeforeIdx(legacy_size_);
pi_->ClearAll();
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
{
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
last_read_num_ += 1;
legacy_size_ = pi_->Size(); // 应该只有一个read
legacy_size_ = pi_->Size(); // 应该只有一个read
need_read_ = true;
}
else // 没空间存放,则不交换指针,或者文件已经读取完毕
{
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
legacy_size_ = 0;
need_read_ = false;
}
}
else if (clear_before_idx_ > 0)
{
if (clear_before_idx_ < legacy_size_)
{
} else if (clear_before_idx_ > 0) {
if (clear_before_idx_ < legacy_size_) {
po_->ClearBeforeIdx(clear_before_idx_);
legacy_size_ -= clear_before_idx_;
// 不需要交换指针,不需要读取
need_read_ = false;
}
else
{
} else {
po_->ClearBeforeIdx(legacy_size_);
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
{
if (pi_->handle_last_read()) {// 上次读取有一个read没放入缓存
last_read_num_ += 1;
legacy_size_ = pi_->Size(); // 应该只有一个read
legacy_size_ = pi_->Size(); // 应该只有一个read
need_read_ = true;
}
else // 没空间存放,则不交换指针,或者文件已经读取完毕
{
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
legacy_size_ = 0;
need_read_ = false;
}

View File

@ -10,19 +10,18 @@
#ifndef BAM_BUF_H_
#define BAM_BUF_H_
#include <vector>
#include <stdint.h>
#include <htslib/sam.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <functional>
#include <pthread.h>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <fstream>
#include <htslib/sam.h>
#include <vector>
#include "bam_wrap.h"
@ -32,24 +31,22 @@ using namespace std;
/*
* bam
*/
struct BamBuf
{
sam_hdr_t *hdr; // sam文件的header信息
samFile *fp; // sam文件指针
BamWrap *bw = nullptr; // 用来循环读入bam
uint8_t *mem = nullptr; // 用来存放bam的数据, 程序结束后自动释放,所以没在析构函数里释放
int64_t mem_offset = 0; // 下一次要存放的位置
int64_t mem_size; // 缓存大小
int read_stat_ = 0; // 读取状态,是否读完
vector<BamWrap *> bv; // 方便对bam数据的访问
int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间
int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间
bool handle_last = false; // 上次最后读入的bam是否需要处理
struct BamBuf {
sam_hdr_t *hdr; // sam文件的header信息
samFile *fp; // sam文件指针
BamWrap *bw = nullptr; // 用来循环读入bam
uint8_t *mem = nullptr; // 用来存放bam的数据,
// 程序结束后自动释放,所以没在析构函数里释放
int64_t mem_offset = 0; // 下一次要存放的位置
int64_t mem_size; // 缓存大小
int read_stat_ = 0; // 读取状态,是否读完
vector<BamWrap *> bv; // 方便对bam数据的访问
int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间
int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间
bool handle_last = false; // 上次最后读入的bam是否需要处理
// 初始化缓存
void Init(samFile *fp,
sam_hdr_t *hdr,
int64_t mem_size);
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
// 读取数据直到读完,或者缓冲区满
int ReadBam();
// 为下一次读取做准备, 计算一些边界条件
@ -57,27 +54,24 @@ struct BamBuf
// 清空上一次所有读入的数据
void ClearAll();
inline int64_t Size() { return bv.size(); } // 包含多少个read
inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全)
~BamBuf()
{
if (this->mem != nullptr)
{
inline int ReadStat() {
return read_stat_;
} // 文件的读取状态,是否可读(读取完全)
~BamBuf() {
if (this->mem != nullptr) {
free(this->mem);
}
if (this->bw != nullptr)
{
if (this->bw != nullptr) {
bam_destroy1(bw->b);
free(bw);
}
}
void FreeMemory() // 释放开辟的内存
void FreeMemory() // 释放开辟的内存
{
if (this->mem != nullptr)
{
if (this->mem != nullptr) {
free(this->mem);
}
if (this->bw != nullptr)
{
if (this->bw != nullptr) {
bam_destroy1(bw->b);
free(bw);
}
@ -102,31 +96,31 @@ struct BamBuf
/*
* io
*/
struct AsyncIoBamBuf
{
struct AsyncIoBamBuf {
BamBuf buf1_;
BamBuf buf2_;
BamBuf *pi_; // 当前用的buf
BamBuf *po_; // 后台在读取的buf
BamBuf *pi_; // 当前用的buf
BamBuf *po_; // 后台在读取的buf
pthread_t *tid_ = NULL;
bool hasThread = false;
int64_t legacy_size_ = 0; // 上一轮运算之后缓存中还剩余的上次读取的read数量
int64_t legacy_size_ =
0; // 上一轮运算之后缓存中还剩余的上次读取的read数量
bool first_read_ = true;
int last_read_num_ = 0; // 上一次读取了多少reads
int last_read_num_ = 0; // 上一次读取了多少reads
bool need_read_ = true;
bool use_async_io_ = true;
int64_t clear_before_idx_ = 0; // 用户异步读取下一轮读取之前清理掉clear_before_idx_之前的所有reads
bool clear_all_ = false; // 用于异步读取下一轮读取之前清理掉之前的所有reads
int64_t clear_before_idx_ =
0; // 用户异步读取下一轮读取之前清理掉clear_before_idx_之前的所有reads
bool clear_all_ =
false; // 用于异步读取下一轮读取之前清理掉之前的所有reads
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
AsyncIoBamBuf() {}
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
// 析构
~AsyncIoBamBuf()
{
if (tid_ != NULL)
{
~AsyncIoBamBuf() {
if (tid_ != NULL) {
if (hasThread)
pthread_join(*tid_, 0);
free(tid_);
@ -136,35 +130,29 @@ struct AsyncIoBamBuf
}
// 初始化缓存
void Init(samFile *fp,
sam_hdr_t *hdr,
int64_t mem_size);
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
// 读取数据
int ReadBam();
// 为下一次读取做准备, 计算一些边界条件
void ClearBeforeIdx(size_t idxInBv);
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // 获取bam array
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // 获取bam array
// 清空上一次所有读入的数据
void ClearAll();
// 包含的read数量
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
inline int ReadStat() { return pi_->read_stat_; }
inline BamWrap *operator[](int64_t pos)
{
return bam_arr_[pos];
}
inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
// 获取某一段reads
inline vector<BamWrap*> Slice(size_t startIdx, size_t endIdx)
{
inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
if (endIdx > startIdx) {
auto begItr = bam_arr_.begin();
return std::move(vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
return std::move(
vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
}
return std::move(vector<BamWrap *>());
}
void FreeMemory()
{
void FreeMemory() {
buf1_.FreeMemory();
buf2_.FreeMemory();
}
@ -176,11 +164,9 @@ struct AsyncIoBamBuf
// 异步读取线程函数
static void *async_read(void *data);
void resize_buf();
inline void refresh_bam_arr()
{
inline void refresh_bam_arr() {
bam_arr_.resize(this->Size());
for (int i = 0; i < bam_arr_.size(); ++i)
{
for (int i = 0; i < bam_arr_.size(); ++i) {
if (i < legacy_size_)
bam_arr_[i] = (*po_)[i];
else

View File

@ -9,15 +9,14 @@
#ifndef BAM_WRAP_H_
#define BAM_WRAP_H_
#include <map>
#include <string>
#include <vector>
#include <sstream>
#include <htslib/sam.h>
#include <limits.h>
#include <math.h>
#include <htslib/sam.h>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
@ -28,52 +27,32 @@ using namespace std;
/*
* sam read
*/
struct BamWrap
{
struct BamWrap {
// 将contig左移后加上pos作为全局位置
const static int MAX_CONTIG_LEN_SHIFT = 30;
const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位和位点拼合在一起
const static int READ_MAX_LENGTH = 200;
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
// 成员变量尽量少,减少占用内存空间
bam1_t *b;
int64_t end_pos_; // bam的全局结束位置, 闭区间
int64_t end_pos_; // bam的全局结束位置, 闭区间
// 全局开始位置
inline int64_t start_pos()
{
return bam_global_pos(b);
}
inline int64_t start_pos() { return bam_global_pos(b); }
// 全局结束位置
inline int64_t end_pos()
{
return end_pos_;
}
inline int64_t end_pos() { return end_pos_; }
// 和reference对应的序列长度
inline int16_t read_len()
{
return (end_pos_ - start_pos() + 1);
}
inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
// 在contig内的开始位置
inline int32_t contig_pos()
{
return b->core.pos;
}
inline int32_t contig_pos() { return b->core.pos; }
// 在contig内部的结束位置
inline int32_t contig_end_pos()
{
return bam_pos(end_pos_);
}
inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
// 序列的长度AGTC字母个数
inline int16_t seq_len()
{
return b->core.l_qseq;
}
inline int16_t seq_len() { return b->core.l_qseq; }
// 算上开头的softclip
inline int32_t softclip_start()
{
inline int32_t softclip_start() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
const char c = bam_cigar_opchr(cigar[0]);
@ -84,8 +63,7 @@ struct BamWrap
}
// 算上结尾的softclip
inline int32_t softclip_end()
{
inline int32_t softclip_end() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
@ -96,8 +74,7 @@ struct BamWrap
}
// 算上结尾的softclip
inline int32_t right_softclip_len()
{
inline int32_t right_softclip_len() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
@ -108,14 +85,12 @@ struct BamWrap
}
// 获取序列
inline std::string sequence()
{
inline std::string sequence() {
ostringstream oss;
char *seq = (char *)bam_get_seq(b);
const bam1_core_t &bc = b->core;
const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
for (int i = 0; i < bc.l_qseq; ++i)
{
for (int i = 0; i < bc.l_qseq; ++i) {
char base = base_to_char[bam_seqi(seq, i)];
oss << base;
}
@ -123,18 +98,13 @@ struct BamWrap
}
// 获取名字
inline const char *query_name()
{
return bam_get_qname(b);
}
inline const char *query_name() { return bam_get_qname(b); }
// 获取cigar 字符串
inline string cigar_str()
{
inline string cigar_str() {
ostringstream oss;
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
oss << len << c;
@ -143,21 +113,14 @@ struct BamWrap
}
// 占用的内存大小
inline int16_t length()
{
return sizeof(*this) +
sizeof(bam1_t) +
b->l_data;
}
inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; }
// 获取cigar中insert的总长度
inline int32_t insert_cigar_len()
{
inline int32_t insert_cigar_len() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
int ret = 0;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'I')
@ -167,13 +130,11 @@ struct BamWrap
}
// 获取cigar中delete的总长度
inline int32_t del_cigar_len()
{
inline int32_t del_cigar_len() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
int ret = 0;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'D')
@ -183,13 +144,11 @@ struct BamWrap
}
// 计算sam read的终点位置
static inline int64_t BamEndPos(const bam1_t *b)
{
static inline int64_t BamEndPos(const bam1_t *b) {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
int start_offset = -1;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X')
@ -198,31 +157,22 @@ struct BamWrap
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
};
bool HasWellDefinedFragmentSize()
{
bool HasWellDefinedFragmentSize() {
const bam1_core_t &bc = b->core;
bool hasWellDefinedFragmentSize = true;
if (bc.isize == 0 ||
!(bc.flag & BAM_FPAIRED) ||
((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE)))
{
if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) {
hasWellDefinedFragmentSize = false;
}
else if (bc.flag & BAM_FREVERSE)
{
} else if (bc.flag & BAM_FREVERSE) {
hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false;
}
else
{
} else {
hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false;
}
return hasWellDefinedFragmentSize;
}
// 计算bam的adapterBoundary
int GetAdapterBoundary()
{
int GetAdapterBoundary() {
const bam1_core_t &bc = b->core;
int adapterBoundary;
if (!HasWellDefinedFragmentSize())
@ -230,39 +180,34 @@ struct BamWrap
else if (bc.flag & BAM_FREVERSE)
adapterBoundary = bc.mpos - 1;
else
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样3.5的这里+1
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样3.5的这里+1
return adapterBoundary;
}
// 获取开头的I的长度
inline int GetHeadInsertLen()
{
inline int GetHeadInsertLen() {
int insLen = 0;
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'I')
{
if (c == 'I') {
insLen = len;
break;
}
else if (c != 'H' && c != 'S')
} else if (c != 'H' && c != 'S')
break;
}
return insLen;
}
// 获取soft clip开始位置(能处理H和S相连的情况有这种情况么, 注意开头的I要当做S)
inline int64_t GetSoftStart()
{
// 获取soft clip开始位置(能处理H和S相连的情况有这种情况么,
// 注意开头的I要当做S)
inline int64_t GetSoftStart() {
int64_t softStart = b->core.pos;
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'S' || c == 'I')
@ -274,13 +219,11 @@ struct BamWrap
}
// 获取unclipped开始位置(包括hardclip)
inline int64_t GetUnclippedStart()
{
inline int64_t GetUnclippedStart() {
int64_t start = b->core.pos;
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'S' || c == 'H')
@ -292,13 +235,11 @@ struct BamWrap
}
// 获取unclipped结束位置(包括hardclip)
inline int64_t GetUnclippedEnd()
{
inline int64_t GetUnclippedEnd() {
int64_t end_pos = bam_endpos(b);
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = bc.n_cigar - 1; i >= 0; --i)
{
for (int i = bc.n_cigar - 1; i >= 0; --i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'S' || c == 'H')
@ -311,12 +252,10 @@ struct BamWrap
/* 获取碱基质量分数的加和 */
/** Calculates a score for the read which is the sum of scores over Q15. */
inline int GetSumOfBaseQualities()
{
inline int GetSumOfBaseQualities() {
int score = 0;
uint8_t *qual = bam_get_qual(b);
for (int i = 0; i < b->core.l_qseq; ++i)
{
for (int i = 0; i < b->core.l_qseq; ++i) {
if (qual[i] >= 15)
score += qual[i];
}
@ -327,93 +266,63 @@ struct BamWrap
/* 与flag相关的检测 */
/* 没有比对上 unmapped */
inline bool GetReadUnmappedFlag()
{
return b->core.flag & BAM_FUNMAP;
}
inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
/* Template having multiple segments in sequencing */
inline bool GetReadPairedFlag()
{
return b->core.flag & BAM_FPAIRED;
}
inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; }
/**
* the read fails platform/vendor quality checks.
*/
inline bool GetReadFailsVendorQualityCheckFlag()
{
return b->core.flag & BAM_FQCFAIL;
}
inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; }
/**
* the mate is unmapped.
*/
bool GetMateUnmappedFlag()
{
return b->core.flag & BAM_FMUNMAP;
}
bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; }
/**
* @return whether the alignment is secondary (an alternative alignment of the read).
* @return whether the alignment is secondary (an alternative alignment of
* the read).
*/
bool IsSecondaryAlignment()
{
return b->core.flag & BAM_FSECONDARY;
}
bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; }
/**
* @return whether the alignment is supplementary (a split alignment such as a chimeric alignment).
* @return whether the alignment is supplementary (a split alignment such as
* a chimeric alignment).
*/
bool GetSupplementaryAlignmentFlag()
{
return b->core.flag & BAM_FSUPPLEMENTARY;
}
bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; }
/*
* Tests if this record is either a secondary and/or supplementary alignment;
* Tests if this record is either a secondary and/or supplementary
* alignment;
*/
bool IsSecondaryOrSupplementary()
{
return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag();
}
bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); }
/**
* the read is the first read in a pair.
*/
bool GetFirstOfPairFlag()
{
return b->core.flag & BAM_FREAD1;
}
bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; }
/**
* strand of the query (false for forward; true for reverse strand).
*/
bool GetReadNegativeStrandFlag()
{
return b->core.flag & BAM_FREVERSE;
}
bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; }
/**
* strand of the mate (false for forward; true for reverse strand).
*/
bool GetMateNegativeStrandFlag()
{
return b->core.flag & BAM_FMREVERSE;
}
bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
/* 其他的一些信息 */
inline int GetReferenceLength()
{
inline int GetReferenceLength() {
int length = 0;
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = 0; i < bc.n_cigar; ++i)
{
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
switch (c)
{
switch (c) {
case 'M':
case 'D':
case 'N':
@ -429,24 +338,20 @@ struct BamWrap
}
// 计算bam的全局位置算上染色体序号和比对位置
static inline int64_t bam_global_pos(bam1_t *b)
{
static inline int64_t bam_global_pos(bam1_t *b) {
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
}
static inline int64_t bam_global_pos(int tid, int pos)
{
static inline int64_t bam_global_pos(int tid, int pos) {
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
}
// 根据全局位置获取bam的染色体序号
static inline int32_t bam_tid(int64_t global_pos)
{
static inline int32_t bam_tid(int64_t global_pos) {
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
const int64_t high_tid = global_pos & mask;
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
}
// 根据全局位置获取bam的比对位置(染色体内)
static inline int32_t bam_pos(int64_t global_pos)
{
static inline int32_t bam_pos(int64_t global_pos) {
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
return (int32_t)(global_pos & mask);
}

View File

View File

View File

@ -23,8 +23,7 @@ using std::vector;
struct option *GlobalArg::GLOBAL_OPT = nullptr;
// 初始化参数
void GlobalArg::initGlobalOptions()
{
void GlobalArg::initGlobalOptions() {
vector<struct option> v;
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
@ -51,59 +50,53 @@ void GlobalArg::initGlobalOptions()
}
// 解析参数
void GlobalArg::parseArgument(int argNum)
{
void GlobalArg::parseArgument(int argNum) {
using namespace ns_ga;
switch (argNum)
{
case OPT_INPUT:
in_fn = optarg;
break;
case OPT_OUTPUT:
out_fn = optarg;
break;
case OPT_NUM_THREADS:
num_threads = std::stoi(optarg);
break;
case OPT_MAX_MEM:
{
char *q;
size_t mem_arg = strtol(optarg, &q, 0);
if (*q == 'k' || *q == 'K')
mem_arg <<= 10;
else if (*q == 'm' || *q == 'M')
mem_arg <<= 20;
else if (*q == 'g' || *q == 'G')
mem_arg <<= 30;
if (mem_arg >= max_mem)
max_mem = mem_arg;
else
{
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
}
break;
switch (argNum) {
case OPT_INPUT:
in_fn = optarg;
break;
case OPT_OUTPUT:
out_fn = optarg;
break;
case OPT_NUM_THREADS:
num_threads = std::stoi(optarg);
break;
case OPT_MAX_MEM: {
char *q;
size_t mem_arg = strtol(optarg, &q, 0);
if (*q == 'k' || *q == 'K')
mem_arg <<= 10;
else if (*q == 'm' || *q == 'M')
mem_arg <<= 20;
else if (*q == 'g' || *q == 'G')
mem_arg <<= 30;
if (mem_arg >= max_mem)
max_mem = mem_arg;
else {
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
}
case OPT_LOG_LEVEL:
{
if (strcmp("ERROR", optarg) == 0)
verbosity = ns_ga::ERROR;
else if (strcmp("WARNING", optarg) == 0)
verbosity = ns_ga::WARNING;
else if (strcmp("INFO", optarg) == 0)
verbosity = ns_ga::INFO;
else if (strcmp("DEBUG", optarg) == 0)
verbosity = ns_ga::DEBUG;
break;
}
case OPT_ASYNCIO:
{
if (strcmp("true", optarg) == 0)
use_asyncio = true;
else if (strcmp("false", optarg) == 0)
use_asyncio = false;
break;
}
default:
break;
break;
}
case OPT_LOG_LEVEL: {
if (strcmp("ERROR", optarg) == 0)
verbosity = ns_ga::ERROR;
else if (strcmp("WARNING", optarg) == 0)
verbosity = ns_ga::WARNING;
else if (strcmp("INFO", optarg) == 0)
verbosity = ns_ga::INFO;
else if (strcmp("DEBUG", optarg) == 0)
verbosity = ns_ga::DEBUG;
break;
}
case OPT_ASYNCIO: {
if (strcmp("true", optarg) == 0)
use_asyncio = true;
else if (strcmp("false", optarg) == 0)
use_asyncio = false;
break;
}
default:
break;
}
}

View File

@ -9,63 +9,57 @@ Date : 2023/10/23
#ifndef GLOBAL_ARG_H_
#define GLOBAL_ARG_H_
#include <string>
#include <map>
#include <vector>
#include <getopt.h>
#include <stdio.h>
#include <map>
#include <string>
#include <vector>
using std::map;
using std::string;
using std::vector;
namespace ns_ga {
enum GlobalOptEnum
{
_START_NUM = 1,
OPT_INPUT,
OPT_OUTPUT,
OPT_NUM_THREADS,
OPT_MAX_MEM,
OPT_LOG_LEVEL,
OPT_ASYNCIO,
OPT_VERSION,
OPT_HELP,
_END_NUM
};
enum GlobalOptEnum {
_START_NUM = 1,
OPT_INPUT,
OPT_OUTPUT,
OPT_NUM_THREADS,
OPT_MAX_MEM,
OPT_LOG_LEVEL,
OPT_ASYNCIO,
OPT_VERSION,
OPT_HELP,
_END_NUM
};
// log level
enum LogLevelEnum
{
ERROR,
WARNING,
INFO,
DEBUG
};
}
// log level
enum LogLevelEnum { ERROR, WARNING, INFO, DEBUG };
} // namespace ns_ga
/* 全局共享的一些参数 */
struct GlobalArg
{
const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
struct GlobalArg {
const static int GLOBAL_ARG_CNT =
ns_ga::GlobalOptEnum::_END_NUM -
ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
static struct option *GLOBAL_OPT;
string in_fn; // input bam filename
string out_fn; // output bam filename
int num_threads = 1; // 线程个数
size_t max_mem = ((size_t)2) << 30; // 最小2G
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
bool use_asyncio = true; // 是否使用异步io
string in_fn; // input bam filename
string out_fn; // output bam filename
int num_threads = 1; // 线程个数
size_t max_mem = ((size_t)1) << 30; // 最小1G
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
bool use_asyncio = true; // 是否使用异步io
vector<string> vArgInfo; // 每个参数的帮助信息
vector<string> vArgInfo; // 每个参数的帮助信息
// 单例模式
GlobalArg(const GlobalArg &) = delete;
GlobalArg &operator=(const GlobalArg &) = delete;
// 获取单例
static GlobalArg &Instance()
{
static GlobalArg &Instance() {
static GlobalArg instance;
return instance;
}
@ -76,8 +70,7 @@ struct GlobalArg
void parseArgument(int argNum);
// 获取对应参数在数组option和help info中的索引
int getArgIndx(ns_ga::GlobalOptEnum opt)
{
int getArgIndx(ns_ga::GlobalOptEnum opt) {
return opt - ns_ga::GlobalOptEnum::OPT_INPUT;
}
@ -95,11 +88,9 @@ struct GlobalArg
printf("--verbosity = %d\n", verbosity);
printf("--asyncio = %d\n", use_asyncio);
}
private :
GlobalArg()
{
initGlobalOptions();
};
private:
GlobalArg() { initGlobalOptions(); };
};
#endif

View File

@ -7,36 +7,34 @@ Author : Zhang Zhonghai
Date : 2023/11/6
*/
#include <string>
#include <random>
#include <string>
using std::string;
/**
* Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness
* and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling.
* Provides an implementation of the Murmur3_32 hash algorithm that has
* desirable properties in terms of randomness and uniformity of the
* distribution of output values that make it a useful hashing algorithm for
* downsampling.
*/
struct Murmur3
{
struct Murmur3 {
int seed_ = 0;
/** Hashes a character stream to an int using Murmur3. */
int HashUnencodedChars(const string &input)
{
int HashUnencodedChars(const string &input) {
int h1 = this->seed_;
// step through the CharSequence 2 chars at a time
const int length = input.size();
for (int i = 1; i < length; i += 2)
{
for (int i = 1; i < length; i += 2) {
int k1 = input.at(i - 1) | (input.at(i) << 16);
k1 = mixK1(k1);
h1 = mixH1(h1, k1);
}
// deal with any remaining characters
if ((length & 1) == 1)
{
if ((length & 1) == 1) {
int k1 = input.at(length - 1);
k1 = mixK1(k1);
h1 ^= k1;
@ -45,14 +43,12 @@ struct Murmur3
return fmix(h1, 2 * length);
}
static Murmur3 &Instance()
{
static Murmur3 &Instance() {
static Murmur3 instance;
return instance;
}
static int mixK1(int k1)
{
static int mixK1(int k1) {
const int c1 = 0xcc9e2d51;
const int c2 = 0x1b873593;
k1 *= c1;
@ -61,8 +57,7 @@ struct Murmur3
return k1;
}
static int mixH1(int h1, int k1)
{
static int mixH1(int h1, int k1) {
h1 ^= k1;
h1 = h1 << 13;
h1 = h1 * 5 + 0xe6546b64;
@ -70,8 +65,7 @@ struct Murmur3
}
// Finalization mix - force all bits of a hash block to avalanche
static int fmix(int h1, int length)
{
static int fmix(int h1, int length) {
h1 ^= length;
h1 ^= (unsigned int)h1 >> 16;
h1 *= 0x85ebca6b;
@ -82,8 +76,7 @@ struct Murmur3
}
private:
Murmur3()
{
Murmur3() {
auto &&rd = std::random_device{};
seed_ = rd();
}

View File

View File

View File

@ -0,0 +1,70 @@
#pragma once
#include <string>
#include <stdint.h>
using std::string;
/*
*
*/
struct DuplicationMetrics {
/**
* The library on which the duplicate marking was performed.
*/
string LIBRARY = "";
/**
* The number of mapped reads examined which did not have a mapped mate pair,
* either because the read is unpaired, or the read is paired to an unmapped mate.
*/
uint64_t UNPAIRED_READS_EXAMINED = 0;
/**
* The number of mapped read pairs examined. (Primary, non-supplemental)
*/
uint64_t READ_PAIRS_EXAMINED = 0;
/**
* The number of reads that were either secondary or supplementary
*/
uint64_t SECONDARY_OR_SUPPLEMENTARY_RDS = 0;
/**
* The total number of unmapped reads examined. (Primary, non-supplemental)
*/
uint64_t UNMAPPED_READS = 0;
/**
* The number of fragments that were marked as duplicates.
*/
uint64_t UNPAIRED_READ_DUPLICATES = 0;
/**
* The number of read pairs that were marked as duplicates.
*/
uint64_t READ_PAIR_DUPLICATES = 0;
/**
* The number of read pairs duplicates that were caused by optical duplication.
* Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source.
*/
uint64_t READ_PAIR_OPTICAL_DUPLICATES = 0;
/**
* The fraction of mapped sequence that is marked as duplicate.
*/
double PERCENT_DUPLICATION = 0.0;
/**
* The estimated number of unique molecules in the library based on PE duplication.
*/
uint64_t ESTIMATED_LIBRARY_SIZE = 0;
// 其他的统计数据
// addSingletonToCount需要记录的数据
uint64_t DuplicateCountHist = 0;
uint64_t NonOpticalDuplicateCountHist = 0;
// track optical duplicates 需要记录的数据
uint64_t OpticalDuplicatesCountHist = 0;
uint64_t OpticalDuplicatesByLibraryId = 0;
// 统计相关的函数
void AddSingletonToCount() {
++this->DuplicateCountHist;
++this->NonOpticalDuplicateCountHist;
}
};

View File

@ -1,100 +1,99 @@
/*
Description: bambambam
Description:
bambambam
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/10/23
*/
#include "markdups_arg.h"
// 有太多define冲突放到最后include
#include <common/hts/bam_buf.h>
#include <common/utils/global_arg.h>
#include <common/utils/murmur3.h>
#include <common/utils/thpool.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <common/utils/murmur3.h>
#include <common/utils/yarn.h>
#include <htslib/sam.h>
#include <htslib/thread_pool.h>
#include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h>
#include <htslib/sam.h>
#include <htslib/thread_pool.h>
#include <iostream>
#include <vector>
#include <set>
#include <queue>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "markdups_arg.h"
#include "md_funcs.h"
#include "parallel_md.h"
#include "serial_md.h"
#include "shared_args.h"
#include "dup_metrics.h"
using namespace std;
using std::cout;
#define SMA_TAG_PG "PG"
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
#define NO_SUCH_INDEX INT64_MAX
static Timer tm_arr[20]; // 用来测试性能
Timer tm_arr[20]; // 用来测试性能
/* 全局本地变量 */
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
static samFile *g_inBamFp; // 输入的bam文件
static sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
static sam_hdr_t *g_outBamHeader; // 输出文件的header
vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
samFile *g_inBamFp; // 输入的bam文件
sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
sam_hdr_t *g_outBamHeader; // 输出文件的header
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
static GlobalArg &g_gArg = GlobalArg::Instance();
static MarkDupsArg g_mdArg;
#include "md_funcs.h"
#include "serial_md.h"
#include "parallel_md.h"
GlobalArg &g_gArg = GlobalArg::Instance();
static MarkDupsArg g_mdArg_;
MarkDupsArg &g_mdArg = g_mdArg_;
static GlobalDataArg gData_;
GlobalDataArg &gData = gData_;
DuplicationMetrics gMetrics_;
DuplicationMetrics &gMetrics = gMetrics_;
/*
* mark duplicate bambarcode
* mark duplicate
* bambarcode
*/
int MarkDuplicates(int argc, char *argv[])
{
int MarkDuplicates(int argc, char *argv[]) {
Timer::log_time("程序开始");
Timer time_all;
/* 读取命令行参数 */
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
if (g_gArg.num_threads < 1) // 线程数不能小于1
g_gArg.num_threads = 1;
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
if (g_gArg.num_threads < 1)
g_gArg.num_threads = 1; // 线程数不能小于1
/* 初始化一些参数和变量*/
g_vRnParser.resize(g_gArg.num_threads);
for (auto &parser : g_vRnParser)
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tilexy信息
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tilexy信息
/* 打开输入bam文件 */
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
if (!g_inBamFp)
{
if (!g_inBamFp) {
Error("[%s] load sam/bam file failed.\n", __func__);
return -1;
}
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
// 获取样本名称(libraryId)
gMetrics.LIBRARY = sam_hdr_line_name(g_inBamHeader, "RG", 0);
/* 利用线程池对输入输出文件进行读写 */
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolRead.pool = hts_tpool_init(16);
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolWrite.pool = hts_tpool_init(16);
if (!htsPoolRead.pool || !htsPoolWrite.pool)
{
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
//htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
//htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolRead.pool = hts_tpool_init(32);
htsPoolWrite.pool = hts_tpool_init(32);
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
Error("[%d] failed to set up thread pool", __LINE__);
sam_close(g_inBamFp);
return -1;
@ -106,86 +105,94 @@ int MarkDuplicates(int argc, char *argv[])
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
g_outBamHeader = sam_hdr_dup(g_inBamHeader);
// 用同样的线程池处理输出文件
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
/* 冗余检查和标记 */
if (g_gArg.num_threads == 1)
{
serialMarkDups(); // 串行运行
}
else
{
parallelMarkDups(); // 并行运行
if (g_gArg.num_threads == 1) {
serialMarkDups(); // 串行运行
} else {
parallelMarkDups(); // 并行运行
}
/* 标记冗余, 将处理后的结果写入文件 */
sam_close(g_inBamFp); // 重新打开bam文件
sam_close(g_inBamFp); // 重新打开bam文件
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
if (!g_inBamFp)
{
if (!g_inBamFp) {
Error("[%s] load sam/bam file failed.\n", __func__);
return -1;
}
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
{
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0) {
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
sam_close(g_outBamFp);
sam_close(g_inBamFp);
return -1;
}
// 输出index文件
string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
if (sam_idx_init(g_outBamFp, g_outBamHeader, 14 /*csi*/, indexFn.c_str()) < 0)
{
// string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式
int index_min_shift = 0;
if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) {
indexFn = g_gArg.out_fn + ".csi";
index_min_shift = 14;
}
if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
Error("failed to open index \"%s\" for writing", indexFn.c_str());
sam_close(g_outBamFp);
sam_close(g_inBamFp);
return -1;
}
// 读取输入文件
// BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
// BamBufType inBuf(false);
BamBufType inBuf(g_gArg.use_asyncio);
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
DupIdxQueue idxQue;
idxQue.Init(&gData.dupIdxArr);
Timer tw;
while (inBuf.ReadStat() >= 0)
{
cout << "dupsize: " << idxQue.Size() << endl;
uint64_t bamIdx = 0;
uint64_t dupIdx = idxQue.Pop();
cout << "dup arr size: " << gData.dupIdxArr.size() << endl;
cout << "first dup: " << dupIdx << endl;
while (inBuf.ReadStat() >= 0) {
Timer tw1;
size_t readNum = inBuf.ReadBam();
cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i)
{
for (size_t i = 0; i < inBuf.Size(); ++i) {
/* 判断是否冗余 */
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
{
if (bamIdx == dupIdx) {
// cout << "冗余" << bamIdx << endl;
dupIdx = idxQue.Pop();
}
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
sam_close(g_outBamFp);
sam_close(g_inBamFp);
return -1;
}
++bamIdx;
}
inBuf.ClearAll();
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
}
if (sam_idx_save(g_outBamFp) < 0)
{
cout << "dupsize: " << idxQue.Size() << endl;
if (sam_idx_save(g_outBamFp) < 0) {
Error("writing index failed");
sam_close(g_outBamFp);
sam_close(g_inBamFp);
return -1;
}
cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
/* 关闭文件,收尾清理 */
sam_close(g_outBamFp);
sam_close(g_inBamFp);
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
Timer::log_time("程序结束");
return 0;
}
}

View File

@ -64,6 +64,7 @@ const static struct option kMdOpts[] = {
{"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
{"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT},
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
// 判断bool类型的参数
@ -220,6 +221,11 @@ void MarkDupsArg::parseArgument(int argc,
case ns_md::CREATE_INDEX:
setBoolArg(&CREATE_INDEX);
break;
case ns_md::INDEX_FORMAT:
if (strcmp("CSI", optarg) == 0)
INDEX_FORMAT = ns_md::IndexFormat::CSI;
else
INDEX_FORMAT = ns_md::IndexFormat::BAI;
case ns_md::CREATE_MD5_FILE:
setBoolArg(&CREATE_MD5_FILE);
break;
@ -235,7 +241,8 @@ void MarkDupsArg::parseArgument(int argc,
/* 打印参数信息 */
void MarkDupsArg::printArgValue()
{
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
}
// 打印版本信息
@ -272,6 +279,8 @@ void MarkDupsArg::PrintHelp()
"\n"
"Optional Arguments:\n"
"\n"
"--INDEX_FORMAT <FORMAT> Format for bam index file. Possible values: {BAI, CSI}\n"
"\n"
"--ADD_PG_TAG_TO_READS <Boolean>\n"
" Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n"
" false}\n"

View File

@ -21,91 +21,92 @@ using std::vector;
class GlobalArg;
namespace ns_md {
/* 用于markduplicate模块的参数这个枚举用于getoption */
enum MarkDupsArgEnum
{
_START_NUM = 100,
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
SORTING_COLLECTION_SIZE_RATIO,
BARCODE_TAG,
READ_ONE_BARCODE_TAG,
READ_TWO_BARCODE_TAG,
TAG_DUPLICATE_SET_MEMBERS,
REMOVE_SEQUENCING_DUPLICATES,
TAGGING_POLICY,
CLEAR_DT,
DUPLEX_UMI,
MOLECULAR_IDENTIFIER_TAG,
METRICS_FILE,
REMOVE_DUPLICATES,
ASSUME_SORTED,
ASSUME_SORT_ORDER,
DUPLICATE_SCORING_STRATEGY,
PROGRAM_RECORD_ID,
PROGRAM_GROUP_VERSION,
PROGRAM_GROUP_COMMAND_LINE,
PROGRAM_GROUP_NAME,
COMMENT,
READ_NAME_REGEX,
OPTICAL_DUPLICATE_PIXEL_DISTANCE,
MAX_OPTICAL_DUPLICATE_SET_SIZE,
QUIET,
VALIDATION_STRINGENCY,
COMPRESSION_LEVEL,
MAX_RECORDS_IN_RAM,
CREATE_INDEX,
CREATE_MD5_FILE,
_END_NUM
};
/* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */
enum ValidationStringency
{
/**
* Do the right thing, throw an exception if something looks wrong.
*/
STRICT,
/**
* Emit warnings but keep going if possible.
*/
LENIENT,
/**
* Like LENIENT, only don't emit warning messages.
*/
SILENT,
DEFAULT_STRINGENCY = SILENT
};
/* 用于markduplicate模块的参数这个枚举用于getoption */
enum MarkDupsArgEnum {
_START_NUM = 100,
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
SORTING_COLLECTION_SIZE_RATIO,
BARCODE_TAG,
READ_ONE_BARCODE_TAG,
READ_TWO_BARCODE_TAG,
TAG_DUPLICATE_SET_MEMBERS,
REMOVE_SEQUENCING_DUPLICATES,
TAGGING_POLICY,
CLEAR_DT,
DUPLEX_UMI,
MOLECULAR_IDENTIFIER_TAG,
METRICS_FILE,
REMOVE_DUPLICATES,
ASSUME_SORTED,
ASSUME_SORT_ORDER,
DUPLICATE_SCORING_STRATEGY,
PROGRAM_RECORD_ID,
PROGRAM_GROUP_VERSION,
PROGRAM_GROUP_COMMAND_LINE,
PROGRAM_GROUP_NAME,
COMMENT,
READ_NAME_REGEX,
OPTICAL_DUPLICATE_PIXEL_DISTANCE,
MAX_OPTICAL_DUPLICATE_SET_SIZE,
QUIET,
VALIDATION_STRINGENCY,
COMPRESSION_LEVEL,
MAX_RECORDS_IN_RAM,
CREATE_INDEX,
INDEX_FORMAT,
CREATE_MD5_FILE,
_END_NUM
};
/* How strict to be when reading a SAM or BAM, beyond bare minimum validation.
*/
enum ValidationStringency {
/**
* Enum used to control how duplicates are flagged in the DT optional tag on each read.
* Do the right thing, throw an exception if something looks wrong.
*/
enum DuplicateTaggingPolicy
{
DontTag,
OpticalOnly,
All
};
STRICT,
/**
* Emit warnings but keep going if possible.
*/
LENIENT,
/**
* Like LENIENT, only don't emit warning messages.
*/
SILENT,
/* 排序的方式 */
enum SortOrder
{
unsorted,
queryname,
coordinate,
duplicate, // NB: this is not in the SAM spec!
unknown
};
DEFAULT_STRINGENCY = SILENT
};
/* 计算reads分数的方式比那个read得分更高 */
enum ScoringStrategy
{
SUM_OF_BASE_QUALITIES,
TOTAL_MAPPED_REFERENCE_LENGTH,
RANDOM
};
}
/**
* Enum used to control how duplicates are flagged in the DT optional tag on
* each read.
*/
enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All };
/* 排序的方式 */
enum SortOrder {
unsorted,
queryname,
coordinate,
duplicate, // NB: this is not in the SAM spec!
unknown
};
/* 计算reads分数的方式比那个read得分更高 */
enum ScoringStrategy {
SUM_OF_BASE_QUALITIES,
TOTAL_MAPPED_REFERENCE_LENGTH,
RANDOM
};
/* 索引文件的格式 bai或者csi */
enum IndexFormat {
BAI,
CSI
};
} // namespace ns_md
/* markduplicate 需要的参数*/
struct MarkDupsArg
@ -287,6 +288,8 @@ struct MarkDupsArg
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
bool CREATE_INDEX = false;
ns_md::IndexFormat INDEX_FORMAT = ns_md::IndexFormat::BAI;
/* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */
bool CREATE_MD5_FILE = false;

View File

@ -0,0 +1,456 @@
#include "md_funcs.h"
#include <common/hts/bam_buf.h>
#include <common/utils/debug.h>
#include <common/utils/murmur3.h>
#include <common/utils/profiling.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <sam/utils/read_ends.h>
#include <stdint.h>
#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <math.h>
#include <unordered_map>
#include "dup_metrics.h"
#include "markdups_arg.h"
#include "shared_args.h"
using std::cerr;
using std::endl;
using std::map;
using std::set;
using std::unordered_map;
using std::vector;
/* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx[key].clear(); // 清除该位点的冗余结果
}
/* 删除key位置的数据 */
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx.erase(key);
}
/*
* read
*/
int16_t computeDuplicateScore(BamWrap &bw) {
int16_t score = 0;
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
case ns_md::SUM_OF_BASE_QUALITIES:
// two (very) long reads worth of high-quality bases can go over
// Short.MAX_VALUE/2 and risk overflow.
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
break;
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
if (!bw.GetReadUnmappedFlag())
// no need to remember the score since this scoring mechanism is
// symmetric
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
break;
case ns_md::RANDOM:
// The RANDOM score gives the same score to both reads so that they get
// filtered together. it's not critical do use the readName since the
// scores from both ends get added, but it seem to be clearer this way.
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
// subtract Short.MIN_VALUE/4 from it to end up with a number between
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case
// the read is not passing filters. We need to stay far from overflow so
// that when we add the two scores from the two read mates we do not
// overflow since that could cause us to chose a failing read-pair
// instead of a passing one.
score -= INT16_MIN / 4;
default:
break;
}
// make sure that filter-failing records are heavily discounted. (the
// discount can happen twice, once for each mate, so need to make sure we do
// not subtract more than Short.MIN_VALUE overall.)
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
return score;
}
/*
* Builds a read ends object that represents a single read.
* read
*/
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
auto &k = *pKey;
auto &bc = bw.b->core;
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
k.read1ReferenceIndex = bc.tid;
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
k.read1IndexInFile = index;
k.score = computeDuplicateScore(bw);
// Doing this lets the ends object know that it's part of a pair
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
k.read2ReferenceIndex = bc.mtid;
}
// Fill in the location information for optical duplicates
rnParser.AddLocationInformation(bw.query_name(), pKey);
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
// 计算位置key
k.posKey =
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
* representative read based on quality score. For all members of the duplicate
* set, add the read1 index-in-file of the representative read to the records of
* the first and second in a pair. This value becomes is used for the 'DI' tag.
*/
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
/* 处理一组pairend的readends标记冗余 */
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
if (vpRe.size() < 2) {
if (vpRe.size() == 1) {
// addSingletonToCount(libraryIdGenerator);
}
return;
}
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
auto &vDupIdx = dupIdx->AtPos(posKey);
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) // 找分数最高的readend
{
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
{
// trackOpticalDuplicates
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
}
}
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
addRepresentativeReadIndex(vpRe);
}
}
/* 处理一组非paired的readends标记冗余 */
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
DupContainer<int64_t> *dupIdx) {
auto &vDupIdx = dupIdx->AtPos(posKey);
if (containsPairs) {
for (auto pe : vpRe) {
if (!pe->IsPaired()) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
} else {
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe) {
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
for (auto pe : vpRe) {
if (pe != pBest) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
}
}
/* 处理位于某个坐标的pairend reads */
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
DupContainer<int64_t> *opticalDupIdx) {
if (readEnds.size() > 1) { // 有潜在的冗余
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else {
markDuplicatePairs(posKey, vpCache, dupIdx,
opticalDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
}
}
/* 处理位于某个坐标的 reads */
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
if (readEnds.size() > 1) // 有潜在的冗余
{
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
bool containsPairs = false;
bool containsFrags = false;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
} else {
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
}
}
/* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds;
int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then
// the second end for the strands. NB: must do this before updating the
// orientation later.
if (fragEnd.read1FirstOfPair) {
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
} else {
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
}
// If the other read is actually later, simply add the other read's data as
// read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex &&
matesCoordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
fragEnd.IsNegativeStrand());
// if the two read ends are in the same position, pointing in opposite
// directions, the orientation is undefined and the procedure above will
// depend on the order of the reads in the file. To avoid this, we set
// it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds::RF) {
pairedEnds.orientation = ReadEnds::FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
pairedEnds.posKey = fragEnd.posKey;
}
pairedEnds.score += fragEnd.score;
}
static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
return lhs != rhs && // no comparing an object to itself (checked using object identity)!
(lhs->tile != ReadEnds::NO_VALUE) &&
(rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations
lhs->tile == rhs->tile && // and the same tile
abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}
/**
* Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
* one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
* optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered
* optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
* read appeared in the input list of physical locations.
*
* @param list a list of reads that are determined to be duplicates of one another
* @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
* annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
* contained within the list! (always not be null!)
* @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
*/
static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
vector<bool> *pOpticalDuplicateFlags) {
const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;
vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
opticalDuplicateFlags.push_back(true);
int len = readEndsArr.size();
// If there is only one or zero reads passed in (so there are obviously no optical duplicates),
// or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
// then just return an array of all false
if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
return;
}
if (len >= 4) {
/**
* Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
* optical duplicates
* getOpticalDuplicatesFlagWithGraph
*/
// Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
// we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
Graph<int> opticalDistanceRelationGraph;
unordered_map<int, vector<int>> tileRGmap;
int keeperIndex = -1;
for (int i = 0; i < readEndsArr.size(); ++i) {
const ReadEnds *currentLoc = readEndsArr[i];
if (currentLoc == pBestRe)
keeperIndex = i;
if (currentLoc->tile != ReadEnds::NO_VALUE) {
int key = currentLoc->tile; // 只处理一个样本所以只有一个read group
tileRGmap[key].push_back(i);
}
}
} else {
/**
* Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
* distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
* readgroup or 3x with the keeper present
* getOpticalDuplicatesFlagFast
*/
// First go through and compare all the reads to the keeper
for (int i = 0; i < len; ++i) {
const ReadEnds *other = readEndsArr[i];
opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
}
// Now go through and do each pairwise comparison not involving the actualKeeper
for (int i = 0; i < len; ++i) {
const ReadEnds *lhs = readEndsArr[i];
if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
continue;
for (int j = i + 1; j < len; ++j) {
const ReadEnds *rhs = readEndsArr[j];
if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
continue;
if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
continue; // both already marked, no need to check
if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
// At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
// as a duplicate of the keeper (but not both - that's checked above), so be careful about which
// one to now mark as a duplicate.
int index = opticalDuplicateFlags[j] ? i : j;
opticalDuplicateFlags[index] = true;
}
}
}
}
}
/**
* Looks through the set of reads and identifies how many of the duplicates are
* in fact optical duplicates, and stores the data in the instance level histogram.
*
* We expect only reads with FR or RF orientations, not a mixture of both.
*
* In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
* of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
* when we order orientation by the first mate sequenced (read #1 of the pair).
*/
static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
// find OpticalDuplicates
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
int opticalDuplicates = 0;
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
if (opticalDuplicateFlags[i]) {
++opticalDuplicates;
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
pRe->isOpticalDuplicate = true;
}
}
if (opticalDuplicates > 0)
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
return opticalDuplicates;
}
/**
*
*/
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false;
// Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) {
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
hasRF = true;
}
// Check if we need to partition since the orientations could have changed
int nOpticalDup;
if (hasFR && hasRF) { // need to track them independently
vector<const ReadEnds *> trackOpticalDuplicatesF;
vector<const ReadEnds *> trackOpticalDuplicatesR;
// Split into two lists: first of pairs and second of pairs,
// since they must have orientation and same starting end
for (auto pRe: readEndsArr) {
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
trackOpticalDuplicatesF.push_back(pRe);
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
trackOpticalDuplicatesR.push_back(pRe);
else
cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
}
// track the duplicates
int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
nOpticalDup = nOpticalDupF + nOpticalDupR;
} else { // No need to partition
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
}
// trackDuplicateCounts
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
if (readEndsArr.size() > nOpticalDup)
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
}

View File

@ -1,60 +1,64 @@
#pragma once
#include <robin-map/include/tsl/robin_map.h>
#include <vector>
#include <queue>
#include <unordered_set>
#include <unordered_map>
using std::priority_queue;
using std::unordered_map;
using std::unordered_set;
using std::vector;
/* 前向声明 */
class BamWrap;
class ReadEnds;
class ReadNameParser;
/* 存放readend或者冗余idx避免频繁的开辟和释放内存 */
template <class T>
struct DupContainer
{
vector<vector<T>> arr; // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
vector<int64_t> pos; // arr中每个元素对应的position
struct DupContainer {
vector<vector<T>> arr; // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
vector<int64_t> pos; // arr中每个元素对应的position
// unordered_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
tsl::robin_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
int64_t size = 0; // 实际使用的空间
int64_t capacity = 0; // 内存容量
inline void Init()
{
tsl::robin_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
int64_t size = 0; // 实际使用的空间
int64_t capacity = 0; // 内存容量
inline void Init() {
idx.clear();
size = 0;
}
inline void SortAtPos(int64_t p) // 这里的pos表示位点
inline void SortAtPos(int64_t p) // 这里的pos表示位点
{
if (idx.find(p) != idx.end())
{
if (idx.find(p) != idx.end()) {
const int64_t i = idx.at(p);
std::sort(arr[i].begin(), arr[i].end());
}
}
inline void SortAtIdx(int64_t i) // 这里的i表示vector的索引
inline void SortAtIdx(int64_t i) // 这里的i表示vector的索引
{
std::sort(arr[i].begin(), arr[i].end());
}
inline void RemoveAtPos(int64_t p)
{
if (idx.find(p) != idx.end())
{
inline void RemoveAtPos(int64_t p) {
if (idx.find(p) != idx.end()) {
const int64_t i = idx.at(p);
arr[i].clear();
}
}
inline void RemoveAtIdx(int64_t i) // 这里的i表示vector的索引
inline void RemoveAtIdx(int64_t i) // 这里的i表示vector的索引
{
arr[i].clear();
}
inline void AddAtPos(int64_t p, T &val)
{
AtPos(p).push_back(val);
}
inline vector<T> &AtPos(int64_t p)
{
if (idx.find(p) != idx.end())
{
inline void AddAtPos(int64_t p, T &val) { AtPos(p).push_back(val); }
inline vector<T> &AtPos(int64_t p) {
if (idx.find(p) != idx.end()) {
const int64_t i = idx.at(p);
return arr[i];
}
if (size >= capacity)
{
if (size >= capacity) {
capacity += 1;
arr.push_back(vector<T>());
pos.push_back(0);
@ -67,308 +71,131 @@ struct DupContainer
}
};
/* 清除key位置的数据 */
static inline void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
{
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx[key].clear(); // 清除该位点的冗余结果
}
/* 删除key位置的数据 */
static inline void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
{
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx.erase(key);
}
/*
*
*/
struct PairArrIdIdx {
int arrId = 0;
uint64_t arrIdx = 0;
int64_t dupIdx = 0;
};
struct IdxGreaterThan {
bool operator()(const PairArrIdIdx &a, const PairArrIdIdx &b) { return a.dupIdx > b.dupIdx; }
};
struct DupIdxQueue {
// 将冗余索引和他对应的task vector对应起来
// 由于是多个task来查找冗余所以每次找到的冗余index都放在一个独立的vector中vector之间可能有重叠所以需要用一个最小堆来维护
vector<vector<int64_t>> *dupIdx2DArr;
priority_queue<PairArrIdIdx, vector<PairArrIdIdx>, IdxGreaterThan> minHeap;
uint64_t popNum = 0;
int Init(vector<vector<int64_t>> *_dupIdx2DArr) {
dupIdx2DArr = _dupIdx2DArr;
if (dupIdx2DArr == nullptr) {
return -1;
}
for (int i = 0; i < dupIdx2DArr->size(); ++i) {
auto &v = (*dupIdx2DArr)[i];
if (!v.empty()) {
minHeap.push({i, 1, v[0]});
}
}
return 0;
}
int64_t Pop() {
int64_t ret = -1;
if (!minHeap.empty()) {
auto idx = minHeap.top();
minHeap.pop();
++popNum;
ret = idx.dupIdx;
auto &v = (*dupIdx2DArr)[idx.arrId];
if (v.size() > idx.arrIdx) {
minHeap.push({idx.arrId, idx.arrIdx + 1, v[idx.arrIdx]});
}
}
return ret;
}
uint64_t Size() {
uint64_t len = 0;
if (dupIdx2DArr != nullptr) {
for (auto &v : *dupIdx2DArr) {
len += v.size();
}
}
return len - popNum;
}
};
/*
* optical duplicationgraph
*/
template <class Node>
struct Graph { // 用set
unordered_set<Node> nodes; // 图中的结点
unordered_map<Node*, unordered_set<Node*>> neighbors; // 邻接列表
Node *addNode(const Node &singleton) {
if (nodes.find(singleton) == nodes.end()) {
Node *n = const_cast<Node *>(&(*nodes.insert(singleton).first));
neighbors[n].clear();
return n;
}
return const_cast<Node *>(&(*nodes.find(singleton)));
}
};
/*
* read
*/
static int16_t computeDuplicateScore(BamWrap &bw)
{
int16_t score = 0;
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
{
case ns_md::SUM_OF_BASE_QUALITIES:
// two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
// and risk overflow.
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
break;
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
if (!bw.GetReadUnmappedFlag())
// no need to remember the score since this scoring mechanism is symmetric
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
break;
case ns_md::RANDOM:
// The RANDOM score gives the same score to both reads so that they get filtered together.
// it's not critical do use the readName since the scores from both ends get added, but it seem
// to be clearer this way.
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
// subtract Short.MIN_VALUE/4 from it to end up with a number between
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
// not passing filters. We need to stay far from overflow so that when we add the two
// scores from the two read mates we do not overflow since that could cause us to chose a
// failing read-pair instead of a passing one.
score -= INT16_MIN / 4;
default:
break;
}
// make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
// for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
return score;
}
int16_t computeDuplicateScore(BamWrap &bw);
/*
* Builds a read ends object that represents a single read. read
* Builds a read ends object that represents a single read.
* read
*/
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
{
auto &k = *pKey;
auto &bc = bw.b->core;
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
k.read1ReferenceIndex = bc.tid;
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
k.read1IndexInFile = index;
k.score = computeDuplicateScore(bw);
// Doing this lets the ends object know that it's part of a pair
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
{
k.read2ReferenceIndex = bc.mtid;
}
// Fill in the location information for optical duplicates
rnParser.AddLocationInformation(bw.query_name(), pKey);
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
// 计算位置key
k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
}
/*
* pairendreadends
*/
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
/*
* pairedreadends
*/
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
DupContainer<int64_t> *dupIdx);
/*
* pairend reads
*/
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
/*
* frag reads
*/
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
DupContainer<int64_t> *dupIdx);
/*
* pairend read end
*/
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds);
/**
* Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
* quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
* read to the records of the first and second in a pair. This value becomes is used for
* the 'DI' tag.
* Looks through the set of reads and identifies how many of the duplicates are
* in fact optical duplicates, and stores the data in the instance level histogram.
* Additionally sets the transient isOpticalDuplicate flag on each read end that is
* identified as an optical duplicate.
*
*/
static void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe)
{
}
/* 处理一组pairend的readends标记冗余 */
static void markDuplicatePairs(int64_t posKey,
vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx,
DupContainer<int64_t> *opticalDupIdx)
{
if (vpRe.size() < 2)
{
if (vpRe.size() == 1)
{
// addSingletonToCount(libraryIdGenerator);
}
return;
}
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
auto &vDupIdx = dupIdx->AtPos(posKey);
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) // 找分数最高的readend
{
if (pe->score > maxScore || pBest == nullptr)
{
maxScore = pe->score;
pBest = pe;
}
}
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
{
// trackOpticalDuplicates
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
}
}
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
{
addRepresentativeReadIndex(vpRe);
}
}
/* 处理一组非paired的readends标记冗余 */
static void markDuplicateFragments(int64_t posKey,
vector<const ReadEnds *> &vpRe,
bool containsPairs,
DupContainer<int64_t> *dupIdx)
{
auto &vDupIdx = dupIdx->AtPos(posKey);
if (containsPairs)
{
for (auto pe : vpRe)
{
if (!pe->IsPaired())
{
vDupIdx.push_back(pe->read1IndexInFile);
}
}
}
else
{
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe)
{
if (pe->score > maxScore || pBest == nullptr)
{
maxScore = pe->score;
pBest = pe;
}
}
for (auto pe : vpRe)
{
if (pe != pBest)
{
vDupIdx.push_back(pe->read1IndexInFile);
}
}
}
}
/* 处理位于某个坐标的pairend reads */
static inline void handlePairs(int64_t posKey,
vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache,
DupContainer<int64_t> *dupIdx,
DupContainer<int64_t> *opticalDupIdx)
{
if (readEnds.size() > 1) // 有潜在的冗余
{
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds)
{
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else
{
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
}
}
/* 处理位于某个坐标的 reads */
static inline void handleFrags(
int64_t posKey,
vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache,
DupContainer<int64_t> *dupIdx)
{
if (readEnds.size() > 1) // 有潜在的冗余
{
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
bool containsPairs = false;
bool containsFrags = false;
for (auto &re : readEnds)
{
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
{
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
}
else
{
if (vpCache.size() > 1 && containsFrags)
{
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags)
{
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
}
}
/* 对找到的pairend read end添加一些信息 */
static inline void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds)
{
auto &pairedEnds = *pPairedEnds;
int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
// before updating the orientation later.
if (fragEnd.read1FirstOfPair)
{
pairedEnds.orientationForOpticalDuplicates =
ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
}
else
{
pairedEnds.orientationForOpticalDuplicates =
ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
}
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
{
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
fragEnd.IsNegativeStrand());
// if the two read ends are in the same position, pointing in opposite directions,
// the orientation is undefined and the procedure above
// will depend on the order of the reads in the file.
// To avoid this, we set it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds::RF)
{
pairedEnds.orientation = ReadEnds::FR;
}
}
else
{
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(),
pairedEnds.orientation == ReadEnds::R);
pairedEnds.posKey = fragEnd.posKey;
}
pairedEnds.score += fragEnd.score;
}
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe);

View File

View File

@ -0,0 +1,783 @@
#include "serial_md.h"
#include <common/hts/bam_buf.h>
#include <common/utils/debug.h>
#include <common/utils/global_arg.h>
#include <common/utils/profiling.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <sam/utils/read_ends.h>
#include <algorithm>
#include <iostream>
#include <set>
#include <vector>
#include "markdups_arg.h"
#include "md_funcs.h"
#include "shared_args.h"
#include "dup_metrics.h"
using std::cout;
using std::set;
using std::vector;
/* 查找 */
// template<class Itr, class T>
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
// {
// first = std::lower_bound(first, last, val);
// return (first != last && *first == val) ? first : last;
// }
/* 排序 */
static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
size_t blockSize = 64 * 1024;
if (arr.size() < blockSize) {
std::sort(arr.begin(), arr.end());
return;
}
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
size_t crossNum = 1024;
size_t start, end, i, left, right;
std::sort(arr.begin(), arr.begin() + blockSize);
for (i = 1; i < blockNum; ++i) {
start = i * blockSize;
end = min(start + blockSize, arr.size());
std::sort(arr.begin() + start, arr.begin() + end);
left = crossNum;
while (!(arr[start - left] < arr[start])) {
left = left << 1;
if (left >= blockSize) {
std::sort(arr.begin(), arr.end()); // 退化到普通排序
return;
}
}
right = min(crossNum, end - start - 1);
while (!(arr[start - 1] < arr[start + right])) {
right = min(right << 1, end - start - 1);
if (right == end - start - 1)
break;
}
std::sort(arr.begin() + start - left, arr.begin() + start + right);
}
}
/* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
set<int64_t> *notDupIdx = nullptr) {
if (vpRe.size() < 2) {
if (vpRe.size() == 1) {
// addSingletonToCount(libraryIdGenerator);
// 这个统计可能会有误差因为当前位点可能还有没匹配上的read导致当前位点的readpaired数量为1
// 可以通过后续的补充计算来解决这个问题,有必要么?
gMetrics.AddSingletonToCount();
}
return;
}
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) { // 找分数最高的readend
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile);
}
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
// trackOpticalDuplicates
trackOpticalDuplicates(vpRe, pBest);
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
dupIdx->insert(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(pe->read2IndexInFile); // 添加read2
}
}
// 在输出的bam文件中添加tag
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
// addRepresentativeReadIndex(vpRe); // 每次都更新就行,用最新的覆盖之前的(如果之前有)
}
}
/* 处理一组非paired的readends标记冗余 */
static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs, set<int64_t> *dupIdx,
set<int64_t> *notDupIdx = nullptr) {
if (containsPairs) {
for (auto pe : vpRe) {
if (!pe->IsPaired()) {
dupIdx->insert(pe->read1IndexInFile);
}
}
} else {
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe) {
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile);
}
for (auto pe : vpRe) {
if (pe != pBest) {
dupIdx->insert(pe->read1IndexInFile);
}
}
}
}
/* 找到与readend pos相等的所有readend */
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
auto range = std::equal_range(src.begin(), src.end(), re,
ReadEnds::PairsLittleThan); // 只比对位点
dst->insert(dst->end(), range.first, range.second);
}
/* 单线程生成readends (第一步)*/
static void generateReadEnds(SerailMarkDupArg *arg) {
auto &p = *arg;
auto &rnParser = g_vRnParser[0];
p.pairs.clear();
p.frags.clear();
p.unpairedDic.clear();
p.unpairedPosArr.clear();
/* 处理每个read创建ReadEnd并放入frag和pair中 */
set<ReadEnds> reSet;
ReadEnds lastRe;
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
{
BamWrap *bw = p.bams[i];
const int64_t bamIdx = p.bamStartIdx + i;
if (bw->GetReadUnmappedFlag()) {
if (bw->b->core.tid == -1)
// When we hit the unmapped reads with no coordinate, no reason
// to continue (only in coordinate sort).
break;
} else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
{
ReadEnds fragEnd;
tm_arr[8].acc_start();
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
tm_arr[8].acc_end();
p.frags.push_back(fragEnd); // 添加进frag集合
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
{
string key = bw->query_name();
if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
p.unpairedDic[key] = {p.taskSeq, fragEnd};
} else // 找到了pairend
{
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
modifyPairedEnds(fragEnd, &pairedEnds);
p.pairs.push_back(pairedEnds);
p.unpairedDic.erase(key); // 删除找到的pairend
}
}
}
}
tm_arr[9].acc_start();
sortReadEndsArr(p.frags);
// sort(p.frags.begin(), p.frags.end());
tm_arr[9].acc_end();
// cout << "sort pairs" << endl;
tm_arr[10].acc_start();
sort(p.pairs.begin(), p.pairs.end());
tm_arr[10].acc_end();
// 记录位点上的未匹配的read个数
for (auto &e : p.unpairedDic) {
auto posKey = e.second.unpairedRE.posKey;
auto &unpairArrInfo = p.unpairedPosArr[posKey];
unpairArrInfo.unpairedNum++;
unpairArrInfo.taskSeq = p.taskSeq;
unpairArrInfo.readNameSet.insert(e.first);
}
// cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() <<
// endl;
}
/* 处理pairs */
static void processPairs(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
set<int64_t> *notDupIdx = nullptr) {
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else {
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
}
/* 处理frags */
static void processFrags(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *notDupIdx = nullptr) {
bool containsPairs = false;
bool containsFrags = false;
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
} else {
if (vpCache.size() > 1 && containsFrags) {
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags) {
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
}
}
/* 单线程markdup (第二步)*/
static void markdups(SerailMarkDupArg *arg) {
auto &p = *arg;
p.pairDupIdx.clear();
p.pairOpticalDupIdx.clear();
p.fragDupIdx.clear();
/* generateDuplicateIndexes计算冗余read在所有read中的位置索引 */
// 先处理 pair
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
// 再处理frag
processFrags(p.frags, &p.fragDupIdx);
}
/* 获取交叉部分的数据 */
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
bool isPairCmp = false) {
if (leftArr.empty() || rightArr.empty()) {
cout << "bad size: " << leftArr.size() << '\t' << rightArr.size() << endl;
return;
}
const size_t leftEndIdx = leftArr.size() - 1;
const size_t rightStartIdx = 0;
size_t leftSpan = 0;
size_t rightSpan = 0;
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
leftSpan += 1;
if (leftSpan > leftEndIdx) {
leftSpan = leftArr.size() - 1;
break;
}
}
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
rightSpan += 1;
if (rightSpan == rightArr.size() - 1)
break;
}
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
std::sort(dst->begin(), dst->end());
}
/* 将frags重叠部分的dup idx放进数据中 */
static inline void refreshFragDupIdx(set<int64_t> &dupIdx, set<int64_t> &notDupIdx, SerailMarkDupArg *lastArg,
SerailMarkDupArg *curArg) {
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx) {
lp.fragDupIdx.insert(idx);
p.fragDupIdx.erase(idx);
}
for (auto idx : notDupIdx) {
lp.fragDupIdx.erase(idx);
p.fragDupIdx.erase(idx);
}
}
/* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> &notDupIdx,
SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx) {
lp.pairDupIdx.insert(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : opticalDupIdx) {
lp.pairOpticalDupIdx.insert(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : notDupIdx) {
lp.pairDupIdx.erase(idx);
lp.pairOpticalDupIdx.erase(idx);
p.pairDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx);
}
}
// 用来分别处理dup和optical dup
static void refeshTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> &notDupIdx,
set<int64_t> &latterDupIdx, set<int64_t> &latterOpticalDupIdx,
set<int64_t> &latterNotDupIdx) {
for (auto idx : dupIdx) latterDupIdx.insert(idx);
for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx);
for (auto idx : notDupIdx) latterNotDupIdx.insert(idx);
}
/* 最后合并数据并排序 */
static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &notDupIdx, vector<int64_t> &dupArr) {
vector<int64_t> midArr;
auto ai = dupArr.begin();
auto bi = dupIdx.begin();
auto ae = dupArr.end();
auto be = dupIdx.end();
int64_t val = 0;
while (ai != ae && bi != be) {
if (*ai < *bi) {
val = *ai++;
} else if (*bi < *ai) {
val = *bi++;
} else {
val = *ai++;
bi++;
}
if (notDupIdx.find(val) == notDupIdx.end()) {
midArr.push_back(val);
}
}
while (ai != ae) {
val = *ai++;
if (notDupIdx.find(val) == notDupIdx.end()) {
midArr.push_back(val);
}
}
while (bi != be) {
val = *bi++;
if (notDupIdx.find(val) == notDupIdx.end()) {
midArr.push_back(val);
}
}
dupArr = midArr;
}
/* 处理相邻的两个任务,有相交叉的数据 */
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) {
auto &lp = *lastArg;
auto &p = *curArg;
auto &g = *gDataArg;
vector<ReadEnds> reArr;
set<int64_t> dupIdx;
set<int64_t> notDupIdx;
// 先处理重叠的frags
getIntersectData(lp.frags, p.frags, &reArr);
processFrags(reArr, &dupIdx, &notDupIdx);
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
// 再处理重叠的pairs
reArr.clear();
dupIdx.clear();
notDupIdx.clear();
set<int64_t> opticalDupIdx;
getIntersectData(lp.pairs, p.pairs, &reArr, true);
processPairs(reArr, &dupIdx, &opticalDupIdx, &notDupIdx);
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
// 处理之前未匹配的部分
map<CalcKey, int64_t> recalcPos;
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
set<int64_t> addToGlobal;
int64_t prevLastPos = 0, nextFirstPos = 0;
if (lp.frags.size() > 0)
prevLastPos = lp.frags.back().posKey;
if (p.frags.size() > 0)
nextFirstPos = p.frags[0].posKey;
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
auto &readName = prevUnpair.first;
auto &prevPosInfo = prevUnpair.second;
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) { // 在当前这个任务里找到了这个未匹配的read
auto &nextPosInfo = p.unpairedDic[readName];
auto &nextFragEnd = nextPosInfo.unpairedRE;
int64_t prevPosKey = prevFragEnd.posKey;
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下poskey可能是后面的read
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
CalcKey ck = {prevPosKey, nextPosKey};
UnpairedPosInfo *prevUnpairInfoP = nullptr;
UnpairedPosInfo *nextUnpairInfoP = nullptr;
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
// pos分为两种情况根据poskey(pair中两个read分别的pos)的位置确定
// 1.
// prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据;
// 2.
// prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read
// pair进行重新计算
// 复杂情况1.
// g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 3.
// prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read
// pair进行重新计算
// 复杂情况2. p中是否包含prevPosKey对应的unpair
// 4.
// prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read
// pair进行重新计算
bool addDataToPos = true;
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
} else
alreadyAdd.insert(ck);
if (prevPosKey < nextFirstPos) { // prevpos在交叉部分之前
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
prevPairArr.push_back(prevFragEnd);
if (nextPosKey <= prevLastPos && addDataToPos) { // 第二种情况
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
}
// 第一种情况,第二种情况下都会出现,复杂情况一
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
if (gPosInfo != g.unpairedPosArr.end()) { // 可能g和p有匹配的刚好和该位点一致
auto &gUnpairInfo = gPosInfo->second;
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
if (pPosInfo != p.unpairedPosArr.end()) {
auto &pUnpairInfo = pPosInfo->second;
for (auto &rn : gUnpairInfo.readNameSet) { // 遍历每一个readname看是否有匹配的
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) {
auto pe = g.unpairedDic[rn].unpairedRE;
auto fe = p.unpairedDic[rn].unpairedRE;
modifyPairedEnds(fe, &pe);
prevPairArr.push_back(pe);
g.unpairedDic.erase(rn);
p.unpairedDic.erase(rn);
// cout << "找到了!" << rn << endl;
}
}
}
}
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
} else { // prevpos在交叉部分
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
if (nextUnpairInfoP != nullptr) { // 且在pos点next task有unpair这样才把这些数据放到next task里
auto &nextPairArr = nextUnpairInfoP->pairArr;
nextPairArr.push_back(prevFragEnd);
auto &prevPairArr = prevUnpairInfoP->pairArr;
prevPairArr.push_back(prevFragEnd);
if (addDataToPos) {
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
}
// 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
recalcPos[ck] = nextPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
} else { // next task在该位点没有unpair那就把数据放到prev task里
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
prevPairArr.push_back(prevFragEnd);
if (addDataToPos) // 第二种情况
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
}
} else { // 第四种情况
if (prevUnpairInfoP == nullptr) {
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
prevUnpairInfoP->taskSeq = lp.taskSeq;
}
auto &prevPairArr = prevUnpairInfoP->pairArr;
prevPairArr.push_back(prevFragEnd);
if (addDataToPos) {
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
}
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
}
}
p.unpairedDic.erase(readName); // 在next task里删除该read
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey;
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下poskey可能是后面的read
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
auto &remainPairArr = remainUnpairInfo.pairArr;
remainPairArr.push_back(remainFragEnd);
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
recalcPos[ck] = remainPosInfo.taskSeq;
std::sort(remainPairArr.begin(), remainPairArr.end());
g.unpairedDic.erase(readName);
} else { // 都没找到,那就保存到遗留数据里
int64_t prevPosKey = prevFragEnd.posKey;
g.unpairedDic.insert(prevUnpair);
addToGlobal.insert(prevPosKey);
}
}
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
map<int64_t, TaskSeqDupInfo> taskChanged;
set<int64_t> posProcessed;
for (auto &e : recalcPos) {
auto posKey = e.first.read1Pos;
if (posProcessed.find(posKey) != posProcessed.end())
continue;
posProcessed.insert(posKey);
auto taskSeq = e.second;
auto &t = taskChanged[taskSeq];
// 在对应的任务包含的dup idx里修改结果数据
vector<ReadEnds> *pairArrP = nullptr;
if (taskSeq < lp.taskSeq)
pairArrP = &g.unpairedPosArr[posKey].pairArr;
else
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey);
}
// 更新结果
for (auto &e : taskChanged) {
auto taskSeq = e.first;
auto &t = e.second;
if (taskSeq < lp.taskSeq) {
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
} else if (taskSeq == lp.taskSeq) {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
} else {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
}
}
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' <<
// g.unpairedPosArr.size() << endl; cout << "calc g time: " <<
// t.seconds_elapsed() << " s" << endl; 将dupidx放进全局数据
g.latterDupIdxArr.push_back(set<int64_t>());
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
g.latterNotDupIdxArr.push_back(set<int64_t>());
g.dupIdxArr.push_back(vector<int64_t>());
auto &vIdx = g.dupIdxArr.back();
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
g.opticalDupIdxArr.push_back(vector<int64_t>());
auto &vOpticalIdx = g.opticalDupIdxArr.back();
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
}
/* 当所有任务结束后global data里还有未处理的数据 */
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
auto &lp = *task;
auto &g = *gDataArg;
// 遗留的未匹配的pair
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
auto &readName = prevUnpair.first;
auto &prevPosInfo = prevUnpair.second;
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey;
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下poskey可能是后面的read
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
remainUnpairInfo.pairArr.push_back(remainFragEnd);
g.unpairedDic.erase(readName);
}
}
map<int64_t, TaskSeqDupInfo> taskChanged;
for (auto &e : g.unpairedPosArr) {
auto posKey = e.first;
auto taskSeq = e.second.taskSeq;
auto &t = taskChanged[taskSeq];
auto &arr = g.unpairedPosArr[posKey].pairArr;
if (arr.size() > 1) {
std::sort(arr.begin(), arr.end());
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
}
}
// 更新结果
vector<int64_t> addDup;
map<int64_t, int64_t> ndPosVal;
for (auto &e : taskChanged) {
auto taskSeq = e.first;
auto &t = e.second;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
}
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
g.unpairedPosArr.clear();
g.unpairedDic.clear();
// 将dupidx放进全局数据
for (int i = 0; i < (int)g.dupIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
g.dupIdxArr.push_back(vector<int64_t>());
auto &vIdx = g.dupIdxArr.back();
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
g.opticalDupIdxArr.push_back(vector<int64_t>());
auto &vOpticalIdx = g.opticalDupIdxArr.back();
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
}
/* 串行处理数据,标记冗余 */
void serialMarkDups() {
tm_arr[5].acc_start();
Timer::log_time("serial start");
// 读取缓存初始化
BamBufType inBamBuf(g_gArg.use_asyncio);
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
// BamBufType inBamBuf(false);
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
int64_t processedBamNum = 0;
SerailMarkDupArg smdArg1, smdArg2;
SerailMarkDupArg *lastArgP = &smdArg1;
SerailMarkDupArg *curArgP = &smdArg2;
bool isFirstRound = true;
int roundNum = 0;
int64_t readNumSum = 0;
while (inBamBuf.ReadStat() >= 0) {
Timer t_round;
// 读取bam文件中的read
tm_arr[4].acc_start();
size_t readNum = inBamBuf.ReadBam();
readNumSum += readNum;
tm_arr[4].acc_end();
cout << "read num: " << readNum << '\t' << roundNum << endl;
// lastArgP = curArgP;
tm_arr[6].acc_start();
curArgP->taskSeq = roundNum;
curArgP->bamStartIdx = processedBamNum;
curArgP->bams = inBamBuf.GetBamArr();
tm_arr[6].acc_end();
tm_arr[0].acc_start();
Timer t1;
generateReadEnds(curArgP);
// cout << "calc read end time: " << t1.seconds_elapsed() << " s" <<
// endl;
tm_arr[0].acc_end();
tm_arr[1].acc_start();
t1.reinit();
markdups(curArgP);
// cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
tm_arr[1].acc_end();
if (!isFirstRound) {
tm_arr[2].acc_start();
t1.reinit();
handleIntersectData(lastArgP, curArgP, &gData);
// cout << "intersect time: " << t1.seconds_elapsed() << " s" <<
// endl;
// addTaskIdxToSet(lastArgP, &gData);
tm_arr[2].acc_end();
} else {
isFirstRound = false;
}
inBamBuf.ClearAll(); // 清理上一轮读入的数据
processedBamNum += readNum;
// 交换
auto tmp = lastArgP;
lastArgP = curArgP;
curArgP = tmp;
// cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++;
if (roundNum % 100 == 0) {
cout << "read sum: " << readNumSum << endl;
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
}
}
// cout << "here" << endl;
tm_arr[3].acc_start();
// 处理剩下的全局数据
handleLastTask(lastArgP, &gData);
// cout << "here 2" << endl;
tm_arr[3].acc_end();
tm_arr[5].acc_end();
// 统计所有冗余index数量
int64_t dupNum = 0;
map<int64_t, int> dup;
int taskSeq = 0;
for (auto &arr : gData.dupIdxArr) {
for (auto idx : arr) {
if (dup.find(idx) != dup.end()) {
// cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
// << idx << endl;
}
dup[idx] = taskSeq;
}
taskSeq++;
}
// #include <fstream>
// ofstream out("tumor_dup.txt");
// for (auto idx : dup)
// {
// out << idx << endl;
// }
// out.close();
for (auto &arr : gData.dupIdxArr) dupNum += arr.size();
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
Timer::log_time("serial end ");
// for (auto i : gData.dupArr)
// cout << i << endl;
}

View File

@ -1,20 +1,28 @@
#include <algorithm>
#pragma once
#include <common/hts/bam_buf.h>
#include <robin-map/include/tsl/robin_map.h>
#include <sam/utils/read_ends.h>
#include <set>
#include <string>
#include <vector>
using std::set;
using std::string;
using std::vector;
/* 存放未匹配readend相同位点的所有readend */
struct UnpairedREInfo
{
struct UnpairedREInfo {
int64_t taskSeq;
ReadEnds unpairedRE;
};
/* 对于一个pair数据一个完整的计算点包含read1的比对位置和read2的比对位置 */
struct CalcKey
{
struct CalcKey {
int64_t read1Pos;
int64_t read2Pos;
bool operator<(const CalcKey &o) const
{
bool operator<(const CalcKey &o) const {
int comp = (int)(read1Pos - o.read1Pos);
if (comp == 0)
comp = (int)(read2Pos - o.read2Pos);
@ -23,16 +31,14 @@ struct CalcKey
};
/* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */
struct TaskSeqDupInfo
{
struct TaskSeqDupInfo {
set<int64_t> dupIdx;
set<int64_t> opticalDupIdx;
set<int64_t> notDupIdx;
};
/* 保存有未匹配pair位点的信息包括read end数组和有几个未匹配的read end */
struct UnpairedPosInfo
{
struct UnpairedPosInfo {
int unpairedNum = 0;
int64_t taskSeq;
vector<ReadEnds> pairArr;
@ -41,28 +47,26 @@ struct UnpairedPosInfo
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
/* 单线程处理冗余参数结构体 */
struct SerailMarkDupArg
{
int64_t taskSeq;
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs;
vector<ReadEnds> frags;
set<int64_t> pairDupIdx; // pair的冗余read的索引
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
set<int64_t> fragDupIdx; // frag的冗余read的索引
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
struct SerailMarkDupArg {
int64_t taskSeq; // 任务序号
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs; // 成对的reads
vector<ReadEnds> frags; // 暂未找到配对的reads
set<int64_t> pairDupIdx; // pair的冗余read的索引
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
set<int64_t> fragDupIdx; // frag的冗余read的索引
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
};
/* 全局保留的数据因为有些paired数据比对到了不同的染色体相距甚远 */
struct GlobalDataArg
{
UnpairedNameMap unpairedDic; // 用来寻找pair end
struct GlobalDataArg {
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr;
// 每个task对应一个vector
@ -75,869 +79,5 @@ struct GlobalDataArg
vector<set<int64_t>> latterNotDupIdxArr;
};
static GlobalDataArg gData;
/* 查找 */
// template<class Itr, class T>
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
// {
// first = std::lower_bound(first, last, val);
// return (first != last && *first == val) ? first : last;
// }
/* 排序 */
static inline void sortReadEndsArr(vector<ReadEnds> &arr)
{
size_t blockSize = 64 * 1024;
blockSize = min(blockSize, arr.size());
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
size_t crossNum = 1024;
size_t start, end, i, left, right;
std::sort(arr.begin(), arr.begin() + blockSize);
for (i = 1; i < blockNum; ++i)
{
start = i * blockSize;
end = min(start + blockSize, arr.size());
std::sort(arr.begin() + start, arr.begin() + end);
left = crossNum;
while (!(arr[start - left] < arr[start]))
{
left = left << 1;
if (left >= blockSize)
{
std::sort(arr.begin(), arr.end()); // 退化到普通排序
return;
}
}
right = min(crossNum, end - start - 1);
while (!(arr[start - 1] < arr[start + right]))
{
right = min(right << 1, end - start - 1);
if (right == end - start - 1)
break;
}
std::sort(arr.begin() + start - left, arr.begin() + start + right);
}
}
/* 处理一组pairend的readends标记冗余 */
static void markDupsForPairs(vector<const ReadEnds *> &vpRe,
set<int64_t> *dupIdx,
set<int64_t> *opticalDupIdx,
set<int64_t> *notDupIdx = nullptr)
{
if (vpRe.size() < 2)
{
if (vpRe.size() == 1)
{
// addSingletonToCount(libraryIdGenerator);
}
return;
}
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) // 找分数最高的readend
{
if (pe->score > maxScore || pBest == nullptr)
{
maxScore = pe->score;
pBest = pe;
}
}
if (notDupIdx != nullptr)
{
notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile);
}
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
{
// trackOpticalDuplicates
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
dupIdx->insert(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(pe->read2IndexInFile); // 添加read2
}
}
// if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
// {
// addRepresentativeReadIndex(vpRe);
// }
}
/* 处理一组非paired的readends标记冗余 */
static void markDupsForFrags(vector<const ReadEnds *> &vpRe,
bool containsPairs,
set<int64_t> *dupIdx,
set<int64_t> *notDupIdx = nullptr)
{
if (containsPairs)
{
for (auto pe : vpRe)
{
if (!pe->IsPaired())
{
dupIdx->insert(pe->read1IndexInFile);
}
}
}
else
{
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe)
{
if (pe->score > maxScore || pBest == nullptr)
{
maxScore = pe->score;
pBest = pe;
}
}
if (notDupIdx != nullptr)
{
notDupIdx->insert(pBest->read1IndexInFile);
}
for (auto pe : vpRe)
{
if (pe != pBest)
{
dupIdx->insert(pe->read1IndexInFile);
}
}
}
}
/* 找到与readend pos相等的所有readend */
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
{
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsLittleThan); // 只比对位点
dst->insert(dst->end(), range.first, range.second);
}
/* 单线程生成readends (第一步)*/
static void generateReadEnds(SerailMarkDupArg *arg)
{
auto &p = *arg;
auto &rnParser = g_vRnParser[0];
p.pairs.clear();
p.frags.clear();
p.unpairedDic.clear();
p.unpairedPosArr.clear();
/* 处理每个read创建ReadEnd并放入frag和pair中 */
set<ReadEnds> reSet;
ReadEnds lastRe;
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
{
BamWrap *bw = p.bams[i];
const int64_t bamIdx = p.bamStartIdx + i;
if (bw->GetReadUnmappedFlag())
{
if (bw->b->core.tid == -1)
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
break;
}
else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
{
ReadEnds fragEnd;
tm_arr[8].acc_start();
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
tm_arr[8].acc_end();
p.frags.push_back(fragEnd); // 添加进frag集合
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
{
string key = bw->query_name();
if (p.unpairedDic.find(key) == p.unpairedDic.end())
{
p.unpairedDic[key] = {p.taskSeq, fragEnd};
}
else // 找到了pairend
{
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
modifyPairedEnds(fragEnd, &pairedEnds);
p.pairs.push_back(pairedEnds);
p.unpairedDic.erase(key); // 删除找到的pairend
}
}
}
}
tm_arr[9].acc_start();
sortReadEndsArr(p.frags);
// sort(p.frags.begin(), p.frags.end());
tm_arr[9].acc_end();
// cout << "sort pairs" << endl;
tm_arr[10].acc_start();
sort(p.pairs.begin(), p.pairs.end());
tm_arr[10].acc_end();
// 记录位点上的未匹配的read个数
for (auto &e : p.unpairedDic) {
auto posKey = e.second.unpairedRE.posKey;
auto &unpairArrInfo = p.unpairedPosArr[posKey];
unpairArrInfo.unpairedNum++;
unpairArrInfo.taskSeq = p.taskSeq;
unpairArrInfo.readNameSet.insert(e.first);
}
cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << endl;
}
/* 处理pairs */
static void processPairs(vector<ReadEnds> &readEnds,
set<int64_t> *dupIdx,
set<int64_t> *opticalDupIdx,
set<int64_t> *notDupIdx = nullptr)
{
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds)
{
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else
{
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
}
/* 处理frags */
static void processFrags(vector<ReadEnds> &readEnds,
set<int64_t> *dupIdx,
set<int64_t> *notDupIdx = nullptr)
{
bool containsPairs = false;
bool containsFrags = false;
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds)
{
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
{
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
}
else
{
if (vpCache.size() > 1 && containsFrags)
{
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags)
{
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
}
}
/* 单线程markdup (第二步)*/
static void markdups(SerailMarkDupArg *arg)
{
auto &p = *arg;
p.pairDupIdx.clear();
p.pairOpticalDupIdx.clear();
p.fragDupIdx.clear();
/* generateDuplicateIndexes计算冗余read在所有read中的位置索引 */
// 先处理 pair
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
// 再处理frag
processFrags(p.frags, &p.fragDupIdx);
}
/* 获取交叉部分的数据 */
static inline void getIntersectData(vector<ReadEnds> &leftArr,
vector<ReadEnds> &rightArr,
vector<ReadEnds> *dst,
bool isPairCmp = false)
{
const size_t leftEndIdx = leftArr.size() - 1;
const size_t rightStartIdx = 0;
size_t leftSpan = 0;
size_t rightSpan = 0;
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp))
{
leftSpan += 1;
if (leftSpan > leftEndIdx)
{
leftSpan = leftArr.size() - 1;
break;
}
}
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp))
{
rightSpan += 1;
if (rightSpan == rightArr.size() - 1)
break;
}
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
std::sort(dst->begin(), dst->end());
}
/* 将frags重叠部分的dup idx放进数据中 */
static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
set<int64_t> &notDupIdx,
SerailMarkDupArg * lastArg,
SerailMarkDupArg *curArg)
{
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx)
{
lp.fragDupIdx.insert(idx);
p.fragDupIdx.erase(idx);
}
for (auto idx : notDupIdx)
{
lp.fragDupIdx.erase(idx);
p.fragDupIdx.erase(idx);
}
}
/* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
set<int64_t> &opticalDupIdx,
set<int64_t> &notDupIdx,
SerailMarkDupArg *lastArg,
SerailMarkDupArg *curArg)
{
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx)
{
lp.pairDupIdx.insert(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : opticalDupIdx)
{
lp.pairOpticalDupIdx.insert(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : notDupIdx)
{
lp.pairDupIdx.erase(idx);
lp.pairOpticalDupIdx.erase(idx);
p.pairDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx);
}
}
// 用来分别处理dup和optical dup
static void refeshTaskDupInfo(set<int64_t> &dupIdx,
set<int64_t> &opticalDupIdx,
set<int64_t> &notDupIdx,
set<int64_t> &latterDupIdx,
set<int64_t> &latterOpticalDupIdx,
set<int64_t> &latterNotDupIdx)
{
for (auto idx : dupIdx)
latterDupIdx.insert(idx);
for (auto idx : opticalDupIdx)
latterOpticalDupIdx.insert(idx);
for (auto idx : notDupIdx)
latterNotDupIdx.insert(idx);
}
/* 最后合并数据并排序 */
static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx,
set<int64_t> &notDupIdx,
vector<int64_t> &dupArr)
{
vector<int64_t> midArr;
auto ai = dupArr.begin();
auto bi = dupIdx.begin();
auto ae = dupArr.end();
auto be = dupIdx.end();
int64_t val = 0;
while (ai != ae && bi != be)
{
if (*ai < *bi)
{
val = *ai++;
}
else if (*bi < *ai)
{
val = *bi++;
}
else
{
val = *ai++;
bi++;
}
if (notDupIdx.find(val) == notDupIdx.end())
{
midArr.push_back(val);
}
}
while (ai != ae)
{
val = *ai++;
if (notDupIdx.find(val) == notDupIdx.end())
{
midArr.push_back(val);
}
}
while (bi != be)
{
val = *bi++;
if (notDupIdx.find(val) == notDupIdx.end())
{
midArr.push_back(val);
}
}
dupArr = midArr;
}
/* 处理相邻的两个任务,有相交叉的数据 */
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg)
{
auto &lp = *lastArg;
auto &p = *curArg;
auto &g = *gDataArg;
vector<ReadEnds> reArr;
set<int64_t> dupIdx;
set<int64_t> notDupIdx;
// 先处理重叠的frags
getIntersectData(lp.frags, p.frags, &reArr);
processFrags(reArr, &dupIdx, &notDupIdx);
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
// 再处理重叠的pairs
reArr.clear();
dupIdx.clear();
notDupIdx.clear();
set<int64_t> opticalDupIdx;
getIntersectData(lp.pairs, p.pairs, &reArr, true);
processPairs(reArr, &dupIdx, &opticalDupIdx, &notDupIdx);
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
// 处理之前未匹配的部分
map<CalcKey, int64_t> recalcPos;
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
set<int64_t> addToGlobal;
int64_t prevLastPos = 0, nextFirstPos = 0;
if (lp.frags.size() > 0)
prevLastPos = lp.frags.back().posKey;
if (p.frags.size() > 0)
nextFirstPos = p.frags[0].posKey;
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
{
auto &readName = prevUnpair.first;
auto &prevPosInfo = prevUnpair.second;
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) // 在当前这个任务里找到了这个未匹配的read
{
auto &nextPosInfo = p.unpairedDic[readName];
auto &nextFragEnd = nextPosInfo.unpairedRE;
int64_t prevPosKey = prevFragEnd.posKey;
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下poskey可能是后面的read
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
CalcKey ck = {prevPosKey, nextPosKey};
UnpairedPosInfo *prevUnpairInfoP = nullptr;
UnpairedPosInfo *nextUnpairInfoP = nullptr;
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
// pos分为两种情况根据poskey(pair中两个read分别的pos)的位置确定
// 1. prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据;
// 2. prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read pair进行重新计算
// 复杂情况1. g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 3. prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read pair进行重新计算
// 复杂情况2. p中是否包含prevPosKey对应的unpair
// 4. prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read pair进行重新计算
bool addDataToPos = true;
if (alreadyAdd.find(ck) != alreadyAdd.end())
{
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
}
else
alreadyAdd.insert(ck);
if (prevPosKey < nextFirstPos) // prevpos在交叉部分之前
{
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
prevPairArr.push_back(prevFragEnd);
if (nextPosKey <= prevLastPos && addDataToPos) // 第二种情况
{
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
}
// 第一种情况,第二种情况下都会出现,复杂情况一
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
if (gPosInfo != g.unpairedPosArr.end()) // 可能g和p有匹配的刚好和该位点一致
{
auto &gUnpairInfo = gPosInfo->second;
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
if (pPosInfo != p.unpairedPosArr.end())
{
auto &pUnpairInfo = pPosInfo->second;
for (auto &rn : gUnpairInfo.readNameSet) // 遍历每一个readname看是否有匹配的
{
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end())
{
auto pe = g.unpairedDic[rn].unpairedRE;
auto fe = p.unpairedDic[rn].unpairedRE;
modifyPairedEnds(fe, &pe);
prevPairArr.push_back(pe);
g.unpairedDic.erase(rn);
p.unpairedDic.erase(rn);
// cout << "找到了!" << rn << endl;
}
}
}
}
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
}
else // prevpos在交叉部分
{
if (nextPosKey > prevLastPos) // nextpos在交叉部分之后
{ // 第三种情况
if (nextUnpairInfoP != nullptr) // 且在pos点next task有unpair这样才把这些数据放到next task里
{
auto &nextPairArr = nextUnpairInfoP->pairArr;
nextPairArr.push_back(prevFragEnd);
auto &prevPairArr = prevUnpairInfoP->pairArr;
prevPairArr.push_back(prevFragEnd);
if (addDataToPos)
{
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
}
recalcPos[ck] = nextPosInfo.taskSeq; // 将数据放到next task里, (这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
std::sort(prevPairArr.begin(), prevPairArr.end());
}
else // next task在该位点没有unpair那就把数据放到prev task里
{
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
prevPairArr.push_back(prevFragEnd);
if (addDataToPos) // 第二种情况
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
}
}
else
{ // 第四种情况
if (prevUnpairInfoP == nullptr) {
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
prevUnpairInfoP->taskSeq = lp.taskSeq;
}
auto &prevPairArr = prevUnpairInfoP->pairArr;
prevPairArr.push_back(prevFragEnd);
if (addDataToPos)
{
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
}
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
}
}
p.unpairedDic.erase(readName); // 在next task里删除该read
}
else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
{
auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey;
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下poskey可能是后面的read
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
auto &remainPairArr = remainUnpairInfo.pairArr;
remainPairArr.push_back(remainFragEnd);
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
recalcPos[ck] = remainPosInfo.taskSeq;
std::sort(remainPairArr.begin(), remainPairArr.end());
g.unpairedDic.erase(readName);
}
else // 都没找到,那就保存到遗留数据里
{
int64_t prevPosKey = prevFragEnd.posKey;
g.unpairedDic.insert(prevUnpair);
addToGlobal.insert(prevPosKey);
}
}
for (auto posKey : addToGlobal) // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
map<int64_t, TaskSeqDupInfo> taskChanged;
set<int64_t> posProcessed;
for (auto &e : recalcPos)
{
auto posKey = e.first.read1Pos;
if (posProcessed.find(posKey) != posProcessed.end())
continue;
posProcessed.insert(posKey);
auto taskSeq = e.second;
auto &t = taskChanged[taskSeq];
// 在对应的任务包含的dup idx里修改结果数据
vector<ReadEnds> *pairArrP = nullptr;
if (taskSeq < lp.taskSeq)
pairArrP = &g.unpairedPosArr[posKey].pairArr;
else
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey);
}
// 更新结果
for (auto &e : taskChanged)
{
auto taskSeq = e.first;
auto &t = e.second;
if (taskSeq < lp.taskSeq)
{
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
}
else if (taskSeq == lp.taskSeq)
{
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
}
else
{
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
}
}
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
// cout << "calc g time: " << t.seconds_elapsed() << " s" << endl;
// 将dupidx放进全局数据
g.latterDupIdxArr.push_back(set<int64_t>());
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
g.latterNotDupIdxArr.push_back(set<int64_t>());
g.dupIdxArr.push_back(vector<int64_t>());
auto &vIdx = g.dupIdxArr.back();
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
g.opticalDupIdxArr.push_back(vector<int64_t>());
auto &vOpticalIdx = g.opticalDupIdxArr.back();
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
}
/* 当所有任务结束后global data里还有未处理的数据 */
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg)
{
auto &lp = *task;
auto &g = *gDataArg;
// 遗留的未匹配的pair
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
{
auto &readName = prevUnpair.first;
auto &prevPosInfo = prevUnpair.second;
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
{
auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey;
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下poskey可能是后面的read
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
remainUnpairInfo.pairArr.push_back(remainFragEnd);
g.unpairedDic.erase(readName);
}
}
map<int64_t, TaskSeqDupInfo> taskChanged;
for (auto &e : g.unpairedPosArr)
{
auto posKey = e.first;
auto taskSeq = e.second.taskSeq;
auto &t = taskChanged[taskSeq];
auto &arr = g.unpairedPosArr[posKey].pairArr;
if (arr.size() > 1)
{
std::sort(arr.begin(), arr.end());
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
}
}
// 更新结果
vector<int64_t> addDup;
map<int64_t, int64_t> ndPosVal;
for (auto &e : taskChanged)
{
auto taskSeq = e.first;
auto &t = e.second;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
}
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
g.unpairedPosArr.clear();
g.unpairedDic.clear();
// 将dupidx放进全局数据
for (int i = 0; i < g.dupIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
for (int i = 0; i < g.opticalDupIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
g.dupIdxArr.push_back(vector<int64_t>());
auto &vIdx = g.dupIdxArr.back();
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
g.opticalDupIdxArr.push_back(vector<int64_t>());
auto &vOpticalIdx = g.opticalDupIdxArr.back();
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
}
/* 串行处理数据,标记冗余 */
static void serialMarkDups()
{
tm_arr[5].acc_start();
Timer::log_time("serial start");
// 读取缓存初始化
BamBufType inBamBuf(g_gArg.use_asyncio);
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
// BamBufType inBamBuf(false);
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
int64_t processedBamNum = 0;
SerailMarkDupArg smdArg1, smdArg2;
SerailMarkDupArg *lastArgP = &smdArg1;
SerailMarkDupArg *curArgP = &smdArg2;
bool isFirstRound = true;
int roundNum = 0;
while (inBamBuf.ReadStat() >= 0)
{
Timer t_round;
// 读取bam文件中的read
tm_arr[4].acc_start();
size_t readNum = inBamBuf.ReadBam();
tm_arr[4].acc_end();
cout << "read num: " << readNum << endl;
// lastArgP = curArgP;
tm_arr[6].acc_start();
curArgP->taskSeq = roundNum;
curArgP->bamStartIdx = processedBamNum;
curArgP->bams = inBamBuf.GetBamArr();
tm_arr[6].acc_end();
tm_arr[0].acc_start();
Timer t1;
generateReadEnds(curArgP);
//cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
tm_arr[0].acc_end();
tm_arr[1].acc_start();
t1.reinit();
markdups(curArgP);
//cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
tm_arr[1].acc_end();
if (!isFirstRound)
{
tm_arr[2].acc_start();
t1.reinit();
handleIntersectData(lastArgP, curArgP, &gData);
//cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
// addTaskIdxToSet(lastArgP, &gData);
tm_arr[2].acc_end();
}
else
{
isFirstRound = false;
}
inBamBuf.ClearAll(); // 清理上一轮读入的数据
processedBamNum += readNum;
// 交换
auto tmp = lastArgP;
lastArgP = curArgP;
curArgP = tmp;
cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++;
if (roundNum > 9){
// break;
}
}
// cout << "here" << endl;
tm_arr[3].acc_start();
// 处理剩下的全局数据
handleLastTask(lastArgP, &gData);
// cout << "here 2" << endl;
tm_arr[3].acc_end();
tm_arr[5].acc_end();
// 统计所有冗余index数量
int64_t dupNum = 0;
set<int64_t> dup;
// int taskSeq = 0;
// for (auto &arr : gData.dupIdxArr)
// {
// for (auto idx : arr) {
// if (dup.find(idx) != dup.end())
// {
// cout << "dup index: " << taskSeq << '\t' << idx << endl;
// }
// dup.insert(idx);
// }
// taskSeq++;
// }
// #include <fstream>
// ofstream out("tumor_dup.txt");
// for (auto idx : dup)
// {
// out << idx << endl;
// }
// out.close();
for (auto &arr : gData.dupIdxArr)
dupNum += arr.size();
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
Timer::log_time("serial end ");
//for (auto i : gData.dupArr)
// cout << i << endl;
}
// 串行运行mark duplicate
void serialMarkDups();

View File

@ -0,0 +1,26 @@
#pragma once
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <htslib/sam.h>
#include <htslib/thread_pool.h>
#include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h>
extern Timer tm_arr[20]; // 用来测试性能
/* 全局本地变量 */
extern vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
extern samFile *g_inBamFp; // 输入的bam文件
extern sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
extern samFile *g_outBamFp; // 输出文件, sam或者bam格式
extern sam_hdr_t *g_outBamHeader; // 输出文件的header
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
class GlobalArg;
extern GlobalArg &g_gArg;
class MarkDupsArg;
extern MarkDupsArg &g_mdArg;
class GlobalDataArg;
extern GlobalDataArg &gData;
class DuplicationMetrics;
extern DuplicationMetrics &gMetrics;

View File

@ -1,5 +1,6 @@
/*
Description: read ends
Description: read
ends
Copyright : All right reserved by ICT
@ -13,18 +14,20 @@ Date : 2023/11/3
#include <stdint.h>
/**
* Small interface that provides access to the physical location information about a cluster.
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
* non-zero positive integers, x and y coordinates may be negative.
* Small interface that provides access to the physical location information
* about a cluster. All values should be defaulted to -1 if unavailable.
* ReadGroup and Tile should only allow non-zero positive integers, x and y
* coordinates may be negative.
*/
struct PhysicalLocation
{
struct PhysicalLocation {
static const int NO_VALUE = -1;
/**
* Small class that provides access to the physical location information about a cluster.
* All values should be defaulted to -1 if unavailable. Tile should only allow
* non-zero positive integers, x and y coordinates must be non-negative.
* This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
* thus, they do not overflow within a HiSeqX tile.
* Small class that provides access to the physical location information
* about a cluster. All values should be defaulted to -1 if unavailable.
* Tile should only allow non-zero positive integers, x and y coordinates
* must be non-negative. This is different from PhysicalLocationShort in
* that the x and y positions are ints, not shorts thus, they do not
* overflow within a HiSeqX tile.
*/
int16_t tile = -1;
int32_t x = -1;
@ -32,28 +35,33 @@ struct PhysicalLocation
};
/* 包含了所有read ends信息如picard里边的 ReadEndsForMarkDuplicates*/
struct ReadEnds : PhysicalLocation
{
struct ReadEnds : PhysicalLocation {
static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
/* 保留一些bam记录中的数据 */
bool read1FirstOfPair = true;
/* ReadEnds中的成员变量 */
/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
/** Little struct-like class to hold read pair (and fragment) end data for
* duplicate marking. */
// int16_t libraryId; // 没用,不考虑多样本
int8_t orientation = -1;
int32_t read1ReferenceIndex = -1;
int32_t read1Coordinate = -1;
int32_t read2ReferenceIndex = -1;
int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
// This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
int32_t read2Coordinate = -1;
/* Additional information used to detect optical dupes */
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read groupnormal或者tumor
/** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read
// groupnormal或者tumor
/** For optical duplicate detection the orientation matters regard to 1st or
* 2nd end of a mate */
int8_t orientationForOpticalDuplicates = -1;
/** A *transient* flag marking this read end as being an optical duplicate. */
/** A *transient* flag marking this read end as being an optical duplicate.
*/
bool isOpticalDuplicate = false;
/* ReadEndsForMarkDuplicates中的成员变量 */
/** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
/** Little struct-like class to hold read pair (and fragment) end data for
* MarkDuplicatesWithMateCigar **/
int16_t score = 0;
int64_t read1IndexInFile = -1;
int64_t read2IndexInFile = -1;
@ -62,23 +70,22 @@ struct ReadEnds : PhysicalLocation
/* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */
// int32_t barcode = 0; // primary barcode for this read (and pair)
// int32_t readOneBarcode = 0; // read one barcode, 0 if not present
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not
// paired
/* zzh增加的成员变量 */
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
/* 根据pairend read的比对方向来确定整体的比对方向 */
static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand)
{
if (read1NegativeStrand)
{
static int8_t GetOrientationByte(bool read1NegativeStrand,
bool read2NegativeStrand) {
if (read1NegativeStrand) {
if (read2NegativeStrand)
return RR;
else
return RF;
}
else
{
} else {
if (read2NegativeStrand)
return FR;
else
@ -87,47 +94,38 @@ struct ReadEnds : PhysicalLocation
}
/* 比较两个readends是否一样有个冗余 */
static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2)
{
static bool AreComparableForDuplicates(const ReadEnds &lhs,
const ReadEnds &rhs,
bool compareRead2) {
bool areComparable = true;
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
lhs.read1Coordinate == rhs.read1Coordinate &&
lhs.orientation == rhs.orientation;
if (areComparable && compareRead2)
{
areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
lhs.read2Coordinate == rhs.read2Coordinate;
if (areComparable && compareRead2) {
areComparable =
lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
lhs.read2Coordinate == rhs.read2Coordinate;
}
return areComparable;
}
/* 比对方向是否正向 */
bool IsPositiveStrand() const
{
return orientation == F;
}
bool IsPositiveStrand() const { return orientation == F; }
/* pairend是否合适的比对上了 */
bool IsPaired() const
{
return read2ReferenceIndex != -1;
}
bool IsPaired() const { return read2ReferenceIndex != -1; }
bool IsNegativeStrand() const
{
return orientation == R;
}
bool IsNegativeStrand() const { return orientation == R; }
// 对于相交的数据进行比对a是否小于b根据AreComparableForDuplicates函数得来
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false)
{
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b,
bool compareRead2 = false) {
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
if (comp == 0)
comp = a.read1Coordinate - b.read1Coordinate;
if (comp == 0)
comp = a.orientation - b.orientation;
if (compareRead2)
{
if (compareRead2) {
if (comp == 0)
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
if (comp == 0)
@ -137,14 +135,12 @@ struct ReadEnds : PhysicalLocation
}
// 找某一个位置的所有readend时需要
static bool pairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs)
{
static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) {
return ReadLittleThan(lhs, rhs, true);
}
// 比较函数
bool operator < (const ReadEnds &o) const
{
bool operator<(const ReadEnds &o) const {
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
if (comp == 0)
comp = read1Coordinate - o.read1Coordinate;

View File

@ -9,11 +9,12 @@ Date : 2023/11/6
#ifndef READ_NAME_PARSER_H_
#define READ_NAME_PARSER_H_
#include "read_ends.h"
#include <common/utils/util.h>
#include <stdint.h>
#include <string>
#include "read_ends.h"
// #include <regex>
#include <boost/regex.hpp>
@ -24,103 +25,107 @@ using std::string;
/**
* Provides access to the physical location information about a cluster.
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
* non-zero positive integers, x and y coordinates may be negative.
* 线
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile
* should only allow non-zero positive integers, x and y coordinates may be
* negative. 线
*/
struct ReadNameParser
{
struct ReadNameParser {
/**
* The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
* and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
* The read name regular expression (regex) is used to extract three pieces
* of information from the read name: tile, x location, and y location. Any
* read name regex should parse the read name to produce these and only
* these values. An example regex is:
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
* which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
* ignoring any trailing non-digit characters.
* which assumes that fields in the read name are delimited by ':' and the
* last three fields correspond to the tile, x and y locations, ignoring any
* trailing non-digit characters.
*
* The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
* three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
* where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
* Illumina technology.
* The default regex is optimized for fast parsing (see {@link
* #getLastThreeFields(String, char, int[])}) by searching for the last
* three fields, ignoring any trailing non-digit characters, assuming the
* delimiter ':'. This should consider correctly read names where we have 5
* or 7 field with the last three fields being tile/x/y, as is the case for
* the majority of read names produced by Illumina technology.
*/
const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
const string DEFAULT_READ_NAME_REGEX =
"(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
string readNameStored = "";
PhysicalLocation physicalLocationStored;
int tmpLocationFields[3]; // for optimization of addLocationInformation
bool useOptimizedDefaultParsing = true; // was the regex default?
int tmpLocationFields[3]; // for optimization of addLocationInformation
bool useOptimizedDefaultParsing = true; // was the regex default?
string readNameRegex = DEFAULT_READ_NAME_REGEX;
regex readNamePattern;
bool warnedAboutRegexNotMatching = true;
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
ReadNameParser(const string &strReadNameRegex, bool isWarn)
{
ReadNameParser(const string &strReadNameRegex)
: ReadNameParser(strReadNameRegex, true) {}
ReadNameParser(const string &strReadNameRegex, bool isWarn) {
readNameRegex = strReadNameRegex;
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
useOptimizedDefaultParsing = true;
else
useOptimizedDefaultParsing = false;
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
readNamePattern =
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
warnedAboutRegexNotMatching = isWarn;
}
/* 重新设置readNameRegex */
void SetReadNameRegex(const string &strReadNameRegex)
{
void SetReadNameRegex(const string &strReadNameRegex) {
readNameRegex = strReadNameRegex;
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
useOptimizedDefaultParsing = true;
else
useOptimizedDefaultParsing = false;
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
readNamePattern =
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
// readNamePattern = strReadNameRegex;
}
/* 添加测序时候的tile x y 信息 */
bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
{
if (!(readName == readNameStored))
{
if (ReadLocationInformation(readName, loc))
{
bool AddLocationInformation(const string &readName, PhysicalLocation *loc) {
if (!(readName == readNameStored)) {
if (ReadLocationInformation(readName, loc)) {
readNameStored = readName;
physicalLocationStored = *loc;
return true;
}
// return false if read name cannot be parsed
return false;
}
else
{
} else {
*loc = physicalLocationStored;
return true;
}
}
/**
* Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
* can be used later to determine optical duplication
* Method used to extract tile/x/y from the read name and add it to the
* PhysicalLocationShort so that it can be used later to determine optical
* duplication
*
* @param readName the name of the read/cluster
* @param loc the object to add tile/x/y to
* @return true if the read name contained the information in parsable form, false otherwise
* @return true if the read name contained the information in parsable form,
* false otherwise
*/
bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
{
bool ReadLocationInformation(const string &readName,
PhysicalLocation *loc) {
try {
// Optimized version if using the default read name regex (== used on purpose):
if (useOptimizedDefaultParsing)
{
// Optimized version if using the default read name regex (== used
// on purpose):
if (useOptimizedDefaultParsing) {
const int fields = getLastThreeFields(readName, ':');
if (!(fields == 5 || fields == 7))
{
if (warnedAboutRegexNotMatching)
{
if (!(fields == 5 || fields == 7)) {
if (warnedAboutRegexNotMatching) {
Warn(
"Default READ_NAME_REGEX '%s' did not match read name '%s'."
"You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
"Note that this message will not be emitted again even if other read names do not match the regex.",
"Default READ_NAME_REGEX '%s' did not match read "
"name '%s'."
"You may need to specify a READ_NAME_REGEX in "
"order to correctly identify optical duplicates. "
"Note that this message will not be emitted again "
"even if other read names do not match the regex.",
readNameRegex.c_str(), readName.c_str());
warnedAboutRegexNotMatching = false;
}
@ -130,13 +135,9 @@ struct ReadNameParser
loc->x = tmpLocationFields[1];
loc->y = tmpLocationFields[2];
return true;
}
else if (readNameRegex.empty())
{
} else if (readNameRegex.empty()) {
return false;
}
else
{
} else {
// Standard version that will use the regex
cmatch m;
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
@ -144,28 +145,25 @@ struct ReadNameParser
loc->x = std::stoi(m[2].str());
loc->y = std::stoi(m[3].str());
return true;
}
else
{
if (warnedAboutRegexNotMatching)
{
} else {
if (warnedAboutRegexNotMatching) {
Warn(
"READ_NAME_REGEX '%s' did not match read name '%s'."
"Your regex may not be correct. "
"Note that this message will not be emitted again even if other read names do not match the regex.",
"Note that this message will not be emitted again "
"even if other read names do not match the regex.",
readNameRegex.c_str(), readName.c_str());
warnedAboutRegexNotMatching = false;
}
return false;
}
}
}
catch (const std::runtime_error &e)
{
if (warnedAboutRegexNotMatching)
{
} catch (const std::runtime_error &e) {
if (warnedAboutRegexNotMatching) {
Warn(
"A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
"A field parsed out of a read name was expected to contain "
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
"%s; Error Msg: %s",
readNameRegex.c_str(), readName.c_str(), e.what());
warnedAboutRegexNotMatching = false;
}
@ -175,39 +173,39 @@ struct ReadNameParser
}
/**
* Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
* considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
* Given a string, splits the string by the delimiter, and returns the the
* last three fields parsed as integers. Parsing a field considers only a
* sequence of digits up until the first non-digit character. The three
* values are stored in the passed-in array.
*
* @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
* @throws NumberFormatException if any of the tokens that should contain
* numbers do not start with parsable numbers
*/
int getLastThreeFields(const string &readName, char delim)
{
int tokensIdx = 2; // start at the last token
int getLastThreeFields(const string &readName, char delim) {
int tokensIdx = 2; // start at the last token
int numFields = 0;
int i, endIdx;
endIdx = readName.size();
// find the last three tokens only
for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
{
if (readName.at(i) == delim || 0 == i)
{
for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) {
if (readName.at(i) == delim || 0 == i) {
numFields++;
const int startIdx = (0 == i) ? 0 : (i + 1);
tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
tmpLocationFields[tokensIdx] =
std::stoi(readName.substr(startIdx, endIdx - startIdx));
tokensIdx--;
endIdx = i;
}
}
// continue to find the # of fields
while (0 <= i)
{
while (0 <= i) {
if (readName.at(i) == delim || 0 == i)
numFields++;
i--;
}
if (numFields < 3)
{
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
if (numFields < 3) {
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] =
-1;
numFields = -1;
}