检查optical duplication,用graph来检测,正在写这个
This commit is contained in:
parent
38bc489004
commit
022be611cd
|
|
@ -1,3 +1,6 @@
|
||||||
|
# for fast-markdup
|
||||||
|
*.sam
|
||||||
|
*.log
|
||||||
# ---> C++
|
# ---> C++
|
||||||
# Prerequisites
|
# Prerequisites
|
||||||
*.d
|
*.d
|
||||||
|
|
|
||||||
|
|
@ -13,10 +13,10 @@
|
||||||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||||
"args": [
|
"args": [
|
||||||
"MarkDuplicates",
|
"MarkDuplicates",
|
||||||
"--INPUT", "/mnt/d/data/100w.bam",
|
"--INPUT", "~/data/bam/100w.bam",
|
||||||
"--OUTPUT", "out.bam",
|
"--OUTPUT", "out.bam",
|
||||||
"--METRICS_FILE", "metrics.txt",
|
"--METRICS_FILE", "metrics.txt",
|
||||||
"--num_threads", "12",
|
"--num_threads", "1",
|
||||||
"--max_mem", "4G",
|
"--max_mem", "4G",
|
||||||
"--verbosity", "DEBUG",
|
"--verbosity", "DEBUG",
|
||||||
"--asyncio", "true",
|
"--asyncio", "true",
|
||||||
|
|
|
||||||
|
|
@ -88,6 +88,13 @@
|
||||||
"typeindex": "cpp",
|
"typeindex": "cpp",
|
||||||
"typeinfo": "cpp",
|
"typeinfo": "cpp",
|
||||||
"valarray": "cpp",
|
"valarray": "cpp",
|
||||||
"variant": "cpp"
|
"variant": "cpp",
|
||||||
|
"__split_buffer": "cpp",
|
||||||
|
"executor": "cpp",
|
||||||
|
"io_context": "cpp",
|
||||||
|
"netfwd": "cpp",
|
||||||
|
"timer": "cpp",
|
||||||
|
"__nullptr": "cpp",
|
||||||
|
"__node_handle": "c"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
4
build.sh
4
build.sh
|
|
@ -1,8 +1,8 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
dir="/home/zzh/work/GeneKit/picard_cpp/build"
|
dir="/home/zzh/work/ngs/picard_cpp/build"
|
||||||
[ -d "$dir" ] && rm -rf "$dir"
|
[ -d "$dir" ] && rm -rf "$dir"
|
||||||
mkdir "$dir"
|
mkdir "$dir"
|
||||||
cd "$dir"
|
cd "$dir"
|
||||||
#cmake .. -DCMAKE_BUILD_TYPE=Debug
|
#cmake .. -DCMAKE_BUILD_TYPE=Debug
|
||||||
cmake .. -DCMAKE_BUILD_TYPE=Release
|
cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||||
make -j 8
|
make -j 16
|
||||||
|
|
|
||||||
13
run.sh
13
run.sh
|
|
@ -1,9 +1,14 @@
|
||||||
time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
input=~/data/bam/zy_normal.bam
|
||||||
|
#input=~/data/bam/zy_tumor.bam
|
||||||
|
#input=~/data/bam/100w.bam
|
||||||
|
|
||||||
|
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
||||||
MarkDuplicates \
|
MarkDuplicates \
|
||||||
--INPUT /mnt/d/data/zy_tumor.bam \
|
--INPUT $input \
|
||||||
--OUTPUT /mnt/d/data/out.bam \
|
--OUTPUT ~/data/bam/out.bam \
|
||||||
|
--INDEX_FORMAT BAI \
|
||||||
--num_threads 1 \
|
--num_threads 1 \
|
||||||
--max_mem 4G \
|
--max_mem 2G \
|
||||||
--verbosity DEBUG \
|
--verbosity DEBUG \
|
||||||
--asyncio true #\
|
--asyncio true #\
|
||||||
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||||
|
|
|
||||||
|
|
@ -13,116 +13,87 @@
|
||||||
* BamBuf类
|
* BamBuf类
|
||||||
*/
|
*/
|
||||||
// 读取数据直到读完,或者缓冲区满
|
// 读取数据直到读完,或者缓冲区满
|
||||||
int BamBuf::ReadBam()
|
int BamBuf::ReadBam() {
|
||||||
{
|
|
||||||
int read_num = 0;
|
int read_num = 0;
|
||||||
if (handle_last)
|
if (handle_last) { // 处理上次读入的最后一个bam
|
||||||
{ // 处理上次读入的最后一个bam
|
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||||
if (has_enough_space())
|
|
||||||
{ // 必须调用,在边界处调整memffset
|
|
||||||
++read_num;
|
++read_num;
|
||||||
append_one_bam();
|
append_one_bam();
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return read_num; // 还是没空间
|
return read_num; // 还是没空间
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0)
|
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
|
||||||
{
|
|
||||||
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
|
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
|
||||||
if (has_enough_space())
|
if (has_enough_space()) { // 还有空间
|
||||||
{ // 还有空间
|
|
||||||
append_one_bam();
|
append_one_bam();
|
||||||
++read_num; // 放进缓存才算读取到
|
++read_num; // 放进缓存才算读取到
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (read_stat_ >= 0)
|
if (read_stat_ >= 0) {
|
||||||
{
|
|
||||||
handle_last = true;
|
handle_last = true;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
handle_last = false;
|
handle_last = false;
|
||||||
}
|
}
|
||||||
return read_num;
|
return read_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 初始化缓存
|
// 初始化缓存
|
||||||
void BamBuf::Init(samFile *fp,
|
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||||
sam_hdr_t *hdr,
|
|
||||||
int64_t mem_size)
|
|
||||||
{
|
|
||||||
this->fp = fp;
|
this->fp = fp;
|
||||||
this->hdr = hdr;
|
this->hdr = hdr;
|
||||||
this->mem_size = mem_size;
|
this->mem_size = mem_size;
|
||||||
this->mem = (uint8_t *)malloc(mem_size);
|
this->mem = (uint8_t *)malloc(mem_size);
|
||||||
this->bw = (BamWrap *)malloc(sizeof(BamWrap));
|
this->bw = (BamWrap *)malloc(sizeof(BamWrap));
|
||||||
this->bw->b = bam_init1();
|
this->bw->b = bam_init1();
|
||||||
if (bw == NULL ||
|
if (bw == NULL || this->mem == NULL || this->bw->b == NULL) {
|
||||||
this->mem == NULL ||
|
|
||||||
this->bw->b == NULL)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "allocate memory failed! Abort\n");
|
fprintf(stderr, "allocate memory failed! Abort\n");
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void BamBuf::ClearBeforeIdx(size_t idxInBv)
|
void BamBuf::ClearBeforeIdx(size_t idxInBv) {
|
||||||
{
|
|
||||||
if (idxInBv < 1)
|
if (idxInBv < 1)
|
||||||
return;
|
return;
|
||||||
int i = 0, j = idxInBv;
|
int i = 0, j = idxInBv;
|
||||||
for (; j < bv.size(); ++i, ++j)
|
for (; j < bv.size(); ++i, ++j) {
|
||||||
{
|
|
||||||
bv[i] = bv[j];
|
bv[i] = bv[j];
|
||||||
}
|
}
|
||||||
bv.resize(i);
|
bv.resize(i);
|
||||||
prepare_read();
|
prepare_read();
|
||||||
}
|
}
|
||||||
|
|
||||||
void BamBuf::ClearAll()
|
void BamBuf::ClearAll() {
|
||||||
{
|
|
||||||
|
|
||||||
bv.clear();
|
bv.clear();
|
||||||
prepare_read();
|
prepare_read();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 为下一次读取做准备, 计算一些边界条件
|
// 为下一次读取做准备, 计算一些边界条件
|
||||||
inline void BamBuf::prepare_read()
|
inline void BamBuf::prepare_read() {
|
||||||
{
|
|
||||||
// 计算余留的下次计算可能用到的bam所占的位置
|
// 计算余留的下次计算可能用到的bam所占的位置
|
||||||
if (bv.size() > 0)
|
if (bv.size() > 0) {
|
||||||
{
|
|
||||||
BamWrap *bw = bv[0];
|
BamWrap *bw = bv[0];
|
||||||
legacy_start = (int64_t)bw - (int64_t)mem;
|
legacy_start = (int64_t)bw - (int64_t)mem;
|
||||||
bw = bv.back();
|
bw = bv.back();
|
||||||
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
|
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
legacy_start = legacy_end = 0;
|
legacy_start = legacy_end = 0;
|
||||||
mem_offset = 0; // 上次没剩下,那就从头存储
|
mem_offset = 0; // 上次没剩下,那就从头存储
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 检查缓存是否还有空间
|
// 检查缓存是否还有空间
|
||||||
inline bool BamBuf::has_enough_space()
|
inline bool BamBuf::has_enough_space() {
|
||||||
{
|
|
||||||
const uint32_t bam_len = bw->length();
|
const uint32_t bam_len = bw->length();
|
||||||
int64_t potential_end = mem_offset + bam_len;
|
int64_t potential_end = mem_offset + bam_len;
|
||||||
if (legacy_start <= legacy_end)
|
if (legacy_start <= legacy_end)
|
||||||
legacy_start += mem_size;
|
legacy_start += mem_size;
|
||||||
if (potential_end >= legacy_start)
|
if (potential_end >= legacy_start) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (potential_end >= mem_size)
|
if (potential_end >= mem_size) {
|
||||||
{
|
|
||||||
mem_offset = 0;
|
mem_offset = 0;
|
||||||
}
|
}
|
||||||
int64_t virtual_offset = mem_offset;
|
int64_t virtual_offset = mem_offset;
|
||||||
|
|
@ -133,8 +104,7 @@ inline bool BamBuf::has_enough_space()
|
||||||
}
|
}
|
||||||
|
|
||||||
// 处理一个读取后的bam
|
// 处理一个读取后的bam
|
||||||
inline void BamBuf::append_one_bam()
|
inline void BamBuf::append_one_bam() {
|
||||||
{
|
|
||||||
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
|
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
|
||||||
*bwp = *bw;
|
*bwp = *bw;
|
||||||
bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
|
bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
|
||||||
|
|
@ -148,12 +118,9 @@ inline void BamBuf::append_one_bam()
|
||||||
}
|
}
|
||||||
|
|
||||||
// 处理上次读入的最后一个read
|
// 处理上次读入的最后一个read
|
||||||
inline bool BamBuf::handle_last_read()
|
inline bool BamBuf::handle_last_read() {
|
||||||
{
|
if (handle_last) { // 处理上次读入的最后一个bam
|
||||||
if (handle_last)
|
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||||
{ // 处理上次读入的最后一个bam
|
|
||||||
if (has_enough_space())
|
|
||||||
{ // 必须调用,在边界处调整memffset
|
|
||||||
append_one_bam();
|
append_one_bam();
|
||||||
handle_last = false;
|
handle_last = false;
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -166,49 +133,35 @@ inline bool BamBuf::handle_last_read()
|
||||||
* AsyncIoBamBuf 类
|
* AsyncIoBamBuf 类
|
||||||
*/
|
*/
|
||||||
// 初始化缓存
|
// 初始化缓存
|
||||||
void AsyncIoBamBuf::Init(samFile *fp,
|
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||||
sam_hdr_t *hdr,
|
if (use_async_io_) {
|
||||||
int64_t mem_size)
|
|
||||||
{
|
|
||||||
if (use_async_io_)
|
|
||||||
{
|
|
||||||
buf1_.Init(fp, hdr, mem_size >> 1);
|
buf1_.Init(fp, hdr, mem_size >> 1);
|
||||||
buf2_.Init(fp, hdr, mem_size >> 1);
|
buf2_.Init(fp, hdr, mem_size >> 1);
|
||||||
pi_ = &buf1_;
|
pi_ = &buf1_;
|
||||||
po_ = &buf2_;
|
po_ = &buf2_;
|
||||||
tid_ = (pthread_t *)malloc(sizeof(pthread_t));
|
tid_ = (pthread_t *)malloc(sizeof(pthread_t));
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
buf1_.Init(fp, hdr, mem_size);
|
buf1_.Init(fp, hdr, mem_size);
|
||||||
pi_ = &buf1_;
|
pi_ = &buf1_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 读取数据
|
// 读取数据
|
||||||
int AsyncIoBamBuf::ReadBam()
|
int AsyncIoBamBuf::ReadBam() {
|
||||||
{
|
if (use_async_io_) {
|
||||||
if (use_async_io_)
|
|
||||||
{
|
|
||||||
hasThread = true;
|
hasThread = true;
|
||||||
return async_read_bam();
|
return async_read_bam();
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return sync_read_bam();
|
return sync_read_bam();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int AsyncIoBamBuf::sync_read_bam()
|
int AsyncIoBamBuf::sync_read_bam() {
|
||||||
{
|
|
||||||
int read_num = 0;
|
int read_num = 0;
|
||||||
if (clear_all_)
|
if (clear_all_) {
|
||||||
{
|
|
||||||
clear_all_ = false;
|
clear_all_ = false;
|
||||||
pi_->ClearAll();
|
pi_->ClearAll();
|
||||||
}
|
} else if (clear_before_idx_ > 0) {
|
||||||
else if (clear_before_idx_ > 0)
|
|
||||||
{
|
|
||||||
pi_->ClearBeforeIdx(clear_before_idx_);
|
pi_->ClearBeforeIdx(clear_before_idx_);
|
||||||
clear_before_idx_ = 0;
|
clear_before_idx_ = 0;
|
||||||
}
|
}
|
||||||
|
|
@ -217,23 +170,18 @@ int AsyncIoBamBuf::sync_read_bam()
|
||||||
return read_num;
|
return read_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
int AsyncIoBamBuf::async_read_bam()
|
int AsyncIoBamBuf::async_read_bam() {
|
||||||
{
|
|
||||||
int read_num = 0;
|
int read_num = 0;
|
||||||
if (first_read_)
|
if (first_read_) {
|
||||||
{
|
|
||||||
read_num = pi_->ReadBam();
|
read_num = pi_->ReadBam();
|
||||||
first_read_ = false;
|
first_read_ = false;
|
||||||
refresh_bam_arr();
|
refresh_bam_arr();
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
// join, 交换缓冲区指针
|
// join, 交换缓冲区指针
|
||||||
pthread_join(*tid_, 0);
|
pthread_join(*tid_, 0);
|
||||||
resize_buf();
|
resize_buf();
|
||||||
|
|
||||||
if (need_read_)
|
if (need_read_) { // 需要交换指针
|
||||||
{ // 需要交换指针
|
|
||||||
BamBuf *tmp = pi_;
|
BamBuf *tmp = pi_;
|
||||||
pi_ = po_;
|
pi_ = po_;
|
||||||
po_ = tmp;
|
po_ = tmp;
|
||||||
|
|
@ -246,72 +194,52 @@ int AsyncIoBamBuf::async_read_bam()
|
||||||
return read_num;
|
return read_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *AsyncIoBamBuf::async_read(void *data)
|
void *AsyncIoBamBuf::async_read(void *data) {
|
||||||
{
|
|
||||||
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
|
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
|
||||||
if (ab->need_read_ && ab->ReadStat() >= 0)
|
if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取
|
||||||
{ // 需要读取
|
|
||||||
ab->last_read_num_ = ab->po_->ReadBam();
|
ab->last_read_num_ = ab->po_->ReadBam();
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
ab->last_read_num_ = 0;
|
ab->last_read_num_ = 0;
|
||||||
}
|
}
|
||||||
pthread_exit(0);
|
pthread_exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 为下一次读取做准备, 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取
|
// 为下一次读取做准备,
|
||||||
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv)
|
// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取
|
||||||
{
|
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) {
|
||||||
clear_before_idx_ = idxInBv;
|
clear_before_idx_ = idxInBv;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取
|
// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取
|
||||||
void AsyncIoBamBuf::ClearAll()
|
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
|
||||||
{
|
|
||||||
clear_all_ = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void AsyncIoBamBuf::resize_buf()
|
inline void AsyncIoBamBuf::resize_buf() {
|
||||||
{
|
if (clear_all_) { // 清理上一轮的数据
|
||||||
if (clear_all_)
|
|
||||||
{ // 清理上一轮的数据
|
|
||||||
clear_all_ = false;
|
clear_all_ = false;
|
||||||
po_->ClearBeforeIdx(legacy_size_);
|
po_->ClearBeforeIdx(legacy_size_);
|
||||||
pi_->ClearAll();
|
pi_->ClearAll();
|
||||||
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
|
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
|
||||||
{
|
|
||||||
last_read_num_ += 1;
|
last_read_num_ += 1;
|
||||||
legacy_size_ = pi_->Size(); // 应该只有一个read
|
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||||
need_read_ = true;
|
need_read_ = true;
|
||||||
}
|
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||||
else // 没空间存放,则不交换指针,或者文件已经读取完毕
|
|
||||||
{
|
|
||||||
legacy_size_ = 0;
|
legacy_size_ = 0;
|
||||||
need_read_ = false;
|
need_read_ = false;
|
||||||
}
|
}
|
||||||
}
|
} else if (clear_before_idx_ > 0) {
|
||||||
else if (clear_before_idx_ > 0)
|
if (clear_before_idx_ < legacy_size_) {
|
||||||
{
|
|
||||||
if (clear_before_idx_ < legacy_size_)
|
|
||||||
{
|
|
||||||
po_->ClearBeforeIdx(clear_before_idx_);
|
po_->ClearBeforeIdx(clear_before_idx_);
|
||||||
legacy_size_ -= clear_before_idx_;
|
legacy_size_ -= clear_before_idx_;
|
||||||
// 不需要交换指针,不需要读取
|
// 不需要交换指针,不需要读取
|
||||||
need_read_ = false;
|
need_read_ = false;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
po_->ClearBeforeIdx(legacy_size_);
|
po_->ClearBeforeIdx(legacy_size_);
|
||||||
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
|
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
|
||||||
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
|
if (pi_->handle_last_read()) {// 上次读取有一个read没放入缓存
|
||||||
{
|
|
||||||
last_read_num_ += 1;
|
last_read_num_ += 1;
|
||||||
legacy_size_ = pi_->Size(); // 应该只有一个read
|
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||||
need_read_ = true;
|
need_read_ = true;
|
||||||
}
|
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||||
else // 没空间存放,则不交换指针,或者文件已经读取完毕
|
|
||||||
{
|
|
||||||
legacy_size_ = 0;
|
legacy_size_ = 0;
|
||||||
need_read_ = false;
|
need_read_ = false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,19 +10,18 @@
|
||||||
#ifndef BAM_BUF_H_
|
#ifndef BAM_BUF_H_
|
||||||
#define BAM_BUF_H_
|
#define BAM_BUF_H_
|
||||||
|
|
||||||
#include <vector>
|
#include <htslib/sam.h>
|
||||||
#include <stdint.h>
|
#include <pthread.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <functional>
|
|
||||||
#include <pthread.h>
|
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <functional>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <fstream>
|
#include <vector>
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
|
||||||
|
|
||||||
#include "bam_wrap.h"
|
#include "bam_wrap.h"
|
||||||
|
|
||||||
|
|
@ -32,12 +31,12 @@ using namespace std;
|
||||||
/*
|
/*
|
||||||
* 存放读入的bam数据
|
* 存放读入的bam数据
|
||||||
*/
|
*/
|
||||||
struct BamBuf
|
struct BamBuf {
|
||||||
{
|
|
||||||
sam_hdr_t *hdr; // sam文件的header信息
|
sam_hdr_t *hdr; // sam文件的header信息
|
||||||
samFile *fp; // sam文件指针
|
samFile *fp; // sam文件指针
|
||||||
BamWrap *bw = nullptr; // 用来循环读入bam
|
BamWrap *bw = nullptr; // 用来循环读入bam
|
||||||
uint8_t *mem = nullptr; // 用来存放bam的数据, 程序结束后自动释放,所以没在析构函数里释放
|
uint8_t *mem = nullptr; // 用来存放bam的数据,
|
||||||
|
// 程序结束后自动释放,所以没在析构函数里释放
|
||||||
int64_t mem_offset = 0; // 下一次要存放的位置
|
int64_t mem_offset = 0; // 下一次要存放的位置
|
||||||
int64_t mem_size; // 缓存大小
|
int64_t mem_size; // 缓存大小
|
||||||
int read_stat_ = 0; // 读取状态,是否读完
|
int read_stat_ = 0; // 读取状态,是否读完
|
||||||
|
|
@ -47,9 +46,7 @@ struct BamBuf
|
||||||
bool handle_last = false; // 上次最后读入的bam是否需要处理
|
bool handle_last = false; // 上次最后读入的bam是否需要处理
|
||||||
|
|
||||||
// 初始化缓存
|
// 初始化缓存
|
||||||
void Init(samFile *fp,
|
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
||||||
sam_hdr_t *hdr,
|
|
||||||
int64_t mem_size);
|
|
||||||
// 读取数据直到读完,或者缓冲区满
|
// 读取数据直到读完,或者缓冲区满
|
||||||
int ReadBam();
|
int ReadBam();
|
||||||
// 为下一次读取做准备, 计算一些边界条件
|
// 为下一次读取做准备, 计算一些边界条件
|
||||||
|
|
@ -57,27 +54,24 @@ struct BamBuf
|
||||||
// 清空上一次所有读入的数据
|
// 清空上一次所有读入的数据
|
||||||
void ClearAll();
|
void ClearAll();
|
||||||
inline int64_t Size() { return bv.size(); } // 包含多少个read
|
inline int64_t Size() { return bv.size(); } // 包含多少个read
|
||||||
inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全)
|
inline int ReadStat() {
|
||||||
~BamBuf()
|
return read_stat_;
|
||||||
{
|
} // 文件的读取状态,是否可读(读取完全)
|
||||||
if (this->mem != nullptr)
|
~BamBuf() {
|
||||||
{
|
if (this->mem != nullptr) {
|
||||||
free(this->mem);
|
free(this->mem);
|
||||||
}
|
}
|
||||||
if (this->bw != nullptr)
|
if (this->bw != nullptr) {
|
||||||
{
|
|
||||||
bam_destroy1(bw->b);
|
bam_destroy1(bw->b);
|
||||||
free(bw);
|
free(bw);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void FreeMemory() // 释放开辟的内存
|
void FreeMemory() // 释放开辟的内存
|
||||||
{
|
{
|
||||||
if (this->mem != nullptr)
|
if (this->mem != nullptr) {
|
||||||
{
|
|
||||||
free(this->mem);
|
free(this->mem);
|
||||||
}
|
}
|
||||||
if (this->bw != nullptr)
|
if (this->bw != nullptr) {
|
||||||
{
|
|
||||||
bam_destroy1(bw->b);
|
bam_destroy1(bw->b);
|
||||||
free(bw);
|
free(bw);
|
||||||
}
|
}
|
||||||
|
|
@ -102,31 +96,31 @@ struct BamBuf
|
||||||
/*
|
/*
|
||||||
* io异步缓冲区
|
* io异步缓冲区
|
||||||
*/
|
*/
|
||||||
struct AsyncIoBamBuf
|
struct AsyncIoBamBuf {
|
||||||
{
|
|
||||||
BamBuf buf1_;
|
BamBuf buf1_;
|
||||||
BamBuf buf2_;
|
BamBuf buf2_;
|
||||||
BamBuf *pi_; // 当前用的buf
|
BamBuf *pi_; // 当前用的buf
|
||||||
BamBuf *po_; // 后台在读取的buf
|
BamBuf *po_; // 后台在读取的buf
|
||||||
pthread_t *tid_ = NULL;
|
pthread_t *tid_ = NULL;
|
||||||
bool hasThread = false;
|
bool hasThread = false;
|
||||||
int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量
|
int64_t legacy_size_ =
|
||||||
|
0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量
|
||||||
bool first_read_ = true;
|
bool first_read_ = true;
|
||||||
int last_read_num_ = 0; // 上一次读取了多少reads
|
int last_read_num_ = 0; // 上一次读取了多少reads
|
||||||
bool need_read_ = true;
|
bool need_read_ = true;
|
||||||
bool use_async_io_ = true;
|
bool use_async_io_ = true;
|
||||||
int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads
|
int64_t clear_before_idx_ =
|
||||||
bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads
|
0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads
|
||||||
|
bool clear_all_ =
|
||||||
|
false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads
|
||||||
|
|
||||||
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
|
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
|
||||||
|
|
||||||
AsyncIoBamBuf() {}
|
AsyncIoBamBuf() {}
|
||||||
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
|
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
|
||||||
// 析构
|
// 析构
|
||||||
~AsyncIoBamBuf()
|
~AsyncIoBamBuf() {
|
||||||
{
|
if (tid_ != NULL) {
|
||||||
if (tid_ != NULL)
|
|
||||||
{
|
|
||||||
if (hasThread)
|
if (hasThread)
|
||||||
pthread_join(*tid_, 0);
|
pthread_join(*tid_, 0);
|
||||||
free(tid_);
|
free(tid_);
|
||||||
|
|
@ -136,9 +130,7 @@ struct AsyncIoBamBuf
|
||||||
}
|
}
|
||||||
|
|
||||||
// 初始化缓存
|
// 初始化缓存
|
||||||
void Init(samFile *fp,
|
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
||||||
sam_hdr_t *hdr,
|
|
||||||
int64_t mem_size);
|
|
||||||
|
|
||||||
// 读取数据
|
// 读取数据
|
||||||
int ReadBam();
|
int ReadBam();
|
||||||
|
|
@ -150,21 +142,17 @@ struct AsyncIoBamBuf
|
||||||
// 包含的read数量
|
// 包含的read数量
|
||||||
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
|
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
|
||||||
inline int ReadStat() { return pi_->read_stat_; }
|
inline int ReadStat() { return pi_->read_stat_; }
|
||||||
inline BamWrap *operator[](int64_t pos)
|
inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
|
||||||
{
|
|
||||||
return bam_arr_[pos];
|
|
||||||
}
|
|
||||||
// 获取某一段reads
|
// 获取某一段reads
|
||||||
inline vector<BamWrap*> Slice(size_t startIdx, size_t endIdx)
|
inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
|
||||||
{
|
|
||||||
if (endIdx > startIdx) {
|
if (endIdx > startIdx) {
|
||||||
auto begItr = bam_arr_.begin();
|
auto begItr = bam_arr_.begin();
|
||||||
return std::move(vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
|
return std::move(
|
||||||
|
vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
|
||||||
}
|
}
|
||||||
return std::move(vector<BamWrap *>());
|
return std::move(vector<BamWrap *>());
|
||||||
}
|
}
|
||||||
void FreeMemory()
|
void FreeMemory() {
|
||||||
{
|
|
||||||
buf1_.FreeMemory();
|
buf1_.FreeMemory();
|
||||||
buf2_.FreeMemory();
|
buf2_.FreeMemory();
|
||||||
}
|
}
|
||||||
|
|
@ -176,11 +164,9 @@ struct AsyncIoBamBuf
|
||||||
// 异步读取线程函数
|
// 异步读取线程函数
|
||||||
static void *async_read(void *data);
|
static void *async_read(void *data);
|
||||||
void resize_buf();
|
void resize_buf();
|
||||||
inline void refresh_bam_arr()
|
inline void refresh_bam_arr() {
|
||||||
{
|
|
||||||
bam_arr_.resize(this->Size());
|
bam_arr_.resize(this->Size());
|
||||||
for (int i = 0; i < bam_arr_.size(); ++i)
|
for (int i = 0; i < bam_arr_.size(); ++i) {
|
||||||
{
|
|
||||||
if (i < legacy_size_)
|
if (i < legacy_size_)
|
||||||
bam_arr_[i] = (*po_)[i];
|
bam_arr_[i] = (*po_)[i];
|
||||||
else
|
else
|
||||||
|
|
|
||||||
|
|
@ -9,15 +9,14 @@
|
||||||
|
|
||||||
#ifndef BAM_WRAP_H_
|
#ifndef BAM_WRAP_H_
|
||||||
#define BAM_WRAP_H_
|
#define BAM_WRAP_H_
|
||||||
#include <map>
|
#include <htslib/sam.h>
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
#include <map>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
|
@ -28,10 +27,9 @@ using namespace std;
|
||||||
/*
|
/*
|
||||||
* sam read的封装
|
* sam read的封装
|
||||||
*/
|
*/
|
||||||
struct BamWrap
|
struct BamWrap {
|
||||||
{
|
|
||||||
// 将contig左移后加上pos作为全局位置
|
// 将contig左移后加上pos作为全局位置
|
||||||
const static int MAX_CONTIG_LEN_SHIFT = 30;
|
const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起
|
||||||
const static int READ_MAX_LENGTH = 200;
|
const static int READ_MAX_LENGTH = 200;
|
||||||
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
|
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
|
||||||
|
|
||||||
|
|
@ -40,40 +38,21 @@ struct BamWrap
|
||||||
int64_t end_pos_; // bam的全局结束位置, 闭区间
|
int64_t end_pos_; // bam的全局结束位置, 闭区间
|
||||||
|
|
||||||
// 全局开始位置
|
// 全局开始位置
|
||||||
inline int64_t start_pos()
|
inline int64_t start_pos() { return bam_global_pos(b); }
|
||||||
{
|
|
||||||
return bam_global_pos(b);
|
|
||||||
}
|
|
||||||
// 全局结束位置
|
// 全局结束位置
|
||||||
inline int64_t end_pos()
|
inline int64_t end_pos() { return end_pos_; }
|
||||||
{
|
|
||||||
return end_pos_;
|
|
||||||
}
|
|
||||||
// 和reference对应的序列长度
|
// 和reference对应的序列长度
|
||||||
inline int16_t read_len()
|
inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
|
||||||
{
|
|
||||||
return (end_pos_ - start_pos() + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 在contig内的开始位置
|
// 在contig内的开始位置
|
||||||
inline int32_t contig_pos()
|
inline int32_t contig_pos() { return b->core.pos; }
|
||||||
{
|
|
||||||
return b->core.pos;
|
|
||||||
}
|
|
||||||
// 在contig内部的结束位置
|
// 在contig内部的结束位置
|
||||||
inline int32_t contig_end_pos()
|
inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
|
||||||
{
|
|
||||||
return bam_pos(end_pos_);
|
|
||||||
}
|
|
||||||
// 序列的长度(AGTC字母个数)
|
// 序列的长度(AGTC字母个数)
|
||||||
inline int16_t seq_len()
|
inline int16_t seq_len() { return b->core.l_qseq; }
|
||||||
{
|
|
||||||
return b->core.l_qseq;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 算上开头的softclip
|
// 算上开头的softclip
|
||||||
inline int32_t softclip_start()
|
inline int32_t softclip_start() {
|
||||||
{
|
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
const char c = bam_cigar_opchr(cigar[0]);
|
const char c = bam_cigar_opchr(cigar[0]);
|
||||||
|
|
@ -84,8 +63,7 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 算上结尾的softclip
|
// 算上结尾的softclip
|
||||||
inline int32_t softclip_end()
|
inline int32_t softclip_end() {
|
||||||
{
|
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
||||||
|
|
@ -96,8 +74,7 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 算上结尾的softclip
|
// 算上结尾的softclip
|
||||||
inline int32_t right_softclip_len()
|
inline int32_t right_softclip_len() {
|
||||||
{
|
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
||||||
|
|
@ -108,14 +85,12 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取序列
|
// 获取序列
|
||||||
inline std::string sequence()
|
inline std::string sequence() {
|
||||||
{
|
|
||||||
ostringstream oss;
|
ostringstream oss;
|
||||||
char *seq = (char *)bam_get_seq(b);
|
char *seq = (char *)bam_get_seq(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
||||||
for (int i = 0; i < bc.l_qseq; ++i)
|
for (int i = 0; i < bc.l_qseq; ++i) {
|
||||||
{
|
|
||||||
char base = base_to_char[bam_seqi(seq, i)];
|
char base = base_to_char[bam_seqi(seq, i)];
|
||||||
oss << base;
|
oss << base;
|
||||||
}
|
}
|
||||||
|
|
@ -123,18 +98,13 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取名字
|
// 获取名字
|
||||||
inline const char *query_name()
|
inline const char *query_name() { return bam_get_qname(b); }
|
||||||
{
|
|
||||||
return bam_get_qname(b);
|
|
||||||
}
|
|
||||||
// 获取cigar 字符串
|
// 获取cigar 字符串
|
||||||
inline string cigar_str()
|
inline string cigar_str() {
|
||||||
{
|
|
||||||
ostringstream oss;
|
ostringstream oss;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
oss << len << c;
|
oss << len << c;
|
||||||
|
|
@ -143,21 +113,14 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 占用的内存大小
|
// 占用的内存大小
|
||||||
inline int16_t length()
|
inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; }
|
||||||
{
|
|
||||||
return sizeof(*this) +
|
|
||||||
sizeof(bam1_t) +
|
|
||||||
b->l_data;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 获取cigar中insert的总长度
|
// 获取cigar中insert的总长度
|
||||||
inline int32_t insert_cigar_len()
|
inline int32_t insert_cigar_len() {
|
||||||
{
|
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'I')
|
if (c == 'I')
|
||||||
|
|
@ -167,13 +130,11 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取cigar中delete的总长度
|
// 获取cigar中delete的总长度
|
||||||
inline int32_t del_cigar_len()
|
inline int32_t del_cigar_len() {
|
||||||
{
|
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'D')
|
if (c == 'D')
|
||||||
|
|
@ -183,13 +144,11 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算sam read的终点位置
|
// 计算sam read的终点位置
|
||||||
static inline int64_t BamEndPos(const bam1_t *b)
|
static inline int64_t BamEndPos(const bam1_t *b) {
|
||||||
{
|
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
int start_offset = -1;
|
int start_offset = -1;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X')
|
if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X')
|
||||||
|
|
@ -198,31 +157,22 @@ struct BamWrap
|
||||||
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
|
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
|
||||||
};
|
};
|
||||||
|
|
||||||
bool HasWellDefinedFragmentSize()
|
bool HasWellDefinedFragmentSize() {
|
||||||
{
|
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
bool hasWellDefinedFragmentSize = true;
|
bool hasWellDefinedFragmentSize = true;
|
||||||
if (bc.isize == 0 ||
|
if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
|
||||||
!(bc.flag & BAM_FPAIRED) ||
|
((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) {
|
||||||
((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
|
|
||||||
((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE)))
|
|
||||||
{
|
|
||||||
hasWellDefinedFragmentSize = false;
|
hasWellDefinedFragmentSize = false;
|
||||||
}
|
} else if (bc.flag & BAM_FREVERSE) {
|
||||||
else if (bc.flag & BAM_FREVERSE)
|
|
||||||
{
|
|
||||||
hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false;
|
hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false;
|
hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false;
|
||||||
}
|
}
|
||||||
return hasWellDefinedFragmentSize;
|
return hasWellDefinedFragmentSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算bam的adapterBoundary
|
// 计算bam的adapterBoundary
|
||||||
int GetAdapterBoundary()
|
int GetAdapterBoundary() {
|
||||||
{
|
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
int adapterBoundary;
|
int adapterBoundary;
|
||||||
if (!HasWellDefinedFragmentSize())
|
if (!HasWellDefinedFragmentSize())
|
||||||
|
|
@ -235,34 +185,29 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取开头的I的长度
|
// 获取开头的I的长度
|
||||||
inline int GetHeadInsertLen()
|
inline int GetHeadInsertLen() {
|
||||||
{
|
|
||||||
int insLen = 0;
|
int insLen = 0;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'I')
|
if (c == 'I') {
|
||||||
{
|
|
||||||
insLen = len;
|
insLen = len;
|
||||||
break;
|
break;
|
||||||
}
|
} else if (c != 'H' && c != 'S')
|
||||||
else if (c != 'H' && c != 'S')
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
return insLen;
|
return insLen;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, 注意开头的I要当做S?)
|
// 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?,
|
||||||
inline int64_t GetSoftStart()
|
// 注意开头的I要当做S?)
|
||||||
{
|
inline int64_t GetSoftStart() {
|
||||||
int64_t softStart = b->core.pos;
|
int64_t softStart = b->core.pos;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'S' || c == 'I')
|
if (c == 'S' || c == 'I')
|
||||||
|
|
@ -274,13 +219,11 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取unclipped开始位置(包括hardclip)
|
// 获取unclipped开始位置(包括hardclip)
|
||||||
inline int64_t GetUnclippedStart()
|
inline int64_t GetUnclippedStart() {
|
||||||
{
|
|
||||||
int64_t start = b->core.pos;
|
int64_t start = b->core.pos;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'S' || c == 'H')
|
if (c == 'S' || c == 'H')
|
||||||
|
|
@ -292,13 +235,11 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取unclipped结束位置(包括hardclip)
|
// 获取unclipped结束位置(包括hardclip)
|
||||||
inline int64_t GetUnclippedEnd()
|
inline int64_t GetUnclippedEnd() {
|
||||||
{
|
|
||||||
int64_t end_pos = bam_endpos(b);
|
int64_t end_pos = bam_endpos(b);
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
for (int i = bc.n_cigar - 1; i >= 0; --i)
|
for (int i = bc.n_cigar - 1; i >= 0; --i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'S' || c == 'H')
|
if (c == 'S' || c == 'H')
|
||||||
|
|
@ -311,12 +252,10 @@ struct BamWrap
|
||||||
|
|
||||||
/* 获取碱基质量分数的加和 */
|
/* 获取碱基质量分数的加和 */
|
||||||
/** Calculates a score for the read which is the sum of scores over Q15. */
|
/** Calculates a score for the read which is the sum of scores over Q15. */
|
||||||
inline int GetSumOfBaseQualities()
|
inline int GetSumOfBaseQualities() {
|
||||||
{
|
|
||||||
int score = 0;
|
int score = 0;
|
||||||
uint8_t *qual = bam_get_qual(b);
|
uint8_t *qual = bam_get_qual(b);
|
||||||
for (int i = 0; i < b->core.l_qseq; ++i)
|
for (int i = 0; i < b->core.l_qseq; ++i) {
|
||||||
{
|
|
||||||
if (qual[i] >= 15)
|
if (qual[i] >= 15)
|
||||||
score += qual[i];
|
score += qual[i];
|
||||||
}
|
}
|
||||||
|
|
@ -327,93 +266,63 @@ struct BamWrap
|
||||||
/* 与flag相关的检测 */
|
/* 与flag相关的检测 */
|
||||||
|
|
||||||
/* 没有比对上 unmapped */
|
/* 没有比对上 unmapped */
|
||||||
inline bool GetReadUnmappedFlag()
|
inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FUNMAP;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Template having multiple segments in sequencing */
|
/* Template having multiple segments in sequencing */
|
||||||
inline bool GetReadPairedFlag()
|
inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FPAIRED;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the read fails platform/vendor quality checks.
|
* the read fails platform/vendor quality checks.
|
||||||
*/
|
*/
|
||||||
inline bool GetReadFailsVendorQualityCheckFlag()
|
inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FQCFAIL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the mate is unmapped.
|
* the mate is unmapped.
|
||||||
*/
|
*/
|
||||||
bool GetMateUnmappedFlag()
|
bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FMUNMAP;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return whether the alignment is secondary (an alternative alignment of the read).
|
* @return whether the alignment is secondary (an alternative alignment of
|
||||||
|
* the read).
|
||||||
*/
|
*/
|
||||||
bool IsSecondaryAlignment()
|
bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FSECONDARY;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return whether the alignment is supplementary (a split alignment such as a chimeric alignment).
|
* @return whether the alignment is supplementary (a split alignment such as
|
||||||
|
* a chimeric alignment).
|
||||||
*/
|
*/
|
||||||
bool GetSupplementaryAlignmentFlag()
|
bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FSUPPLEMENTARY;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Tests if this record is either a secondary and/or supplementary alignment;
|
* Tests if this record is either a secondary and/or supplementary
|
||||||
|
* alignment;
|
||||||
*/
|
*/
|
||||||
bool IsSecondaryOrSupplementary()
|
bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); }
|
||||||
{
|
|
||||||
return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the read is the first read in a pair.
|
* the read is the first read in a pair.
|
||||||
*/
|
*/
|
||||||
bool GetFirstOfPairFlag()
|
bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FREAD1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* strand of the query (false for forward; true for reverse strand).
|
* strand of the query (false for forward; true for reverse strand).
|
||||||
*/
|
*/
|
||||||
bool GetReadNegativeStrandFlag()
|
bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FREVERSE;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* strand of the mate (false for forward; true for reverse strand).
|
* strand of the mate (false for forward; true for reverse strand).
|
||||||
*/
|
*/
|
||||||
bool GetMateNegativeStrandFlag()
|
bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
|
||||||
{
|
|
||||||
return b->core.flag & BAM_FMREVERSE;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 其他的一些信息 */
|
/* 其他的一些信息 */
|
||||||
inline int GetReferenceLength()
|
inline int GetReferenceLength() {
|
||||||
{
|
|
||||||
int length = 0;
|
int length = 0;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
for (int i = 0; i < bc.n_cigar; ++i)
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
{
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
switch (c)
|
switch (c) {
|
||||||
{
|
|
||||||
case 'M':
|
case 'M':
|
||||||
case 'D':
|
case 'D':
|
||||||
case 'N':
|
case 'N':
|
||||||
|
|
@ -429,24 +338,20 @@ struct BamWrap
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算bam的全局位置,算上染色体序号和比对位置
|
// 计算bam的全局位置,算上染色体序号和比对位置
|
||||||
static inline int64_t bam_global_pos(bam1_t *b)
|
static inline int64_t bam_global_pos(bam1_t *b) {
|
||||||
{
|
|
||||||
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
|
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
|
||||||
}
|
}
|
||||||
static inline int64_t bam_global_pos(int tid, int pos)
|
static inline int64_t bam_global_pos(int tid, int pos) {
|
||||||
{
|
|
||||||
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
|
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
|
||||||
}
|
}
|
||||||
// 根据全局位置获取bam的染色体序号
|
// 根据全局位置获取bam的染色体序号
|
||||||
static inline int32_t bam_tid(int64_t global_pos)
|
static inline int32_t bam_tid(int64_t global_pos) {
|
||||||
{
|
|
||||||
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
|
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
|
||||||
const int64_t high_tid = global_pos & mask;
|
const int64_t high_tid = global_pos & mask;
|
||||||
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
|
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
|
||||||
}
|
}
|
||||||
// 根据全局位置获取bam的比对位置(染色体内)
|
// 根据全局位置获取bam的比对位置(染色体内)
|
||||||
static inline int32_t bam_pos(int64_t global_pos)
|
static inline int32_t bam_pos(int64_t global_pos) {
|
||||||
{
|
|
||||||
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
|
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
|
||||||
return (int32_t)(global_pos & mask);
|
return (int32_t)(global_pos & mask);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,8 +23,7 @@ using std::vector;
|
||||||
struct option *GlobalArg::GLOBAL_OPT = nullptr;
|
struct option *GlobalArg::GLOBAL_OPT = nullptr;
|
||||||
|
|
||||||
// 初始化参数
|
// 初始化参数
|
||||||
void GlobalArg::initGlobalOptions()
|
void GlobalArg::initGlobalOptions() {
|
||||||
{
|
|
||||||
vector<struct option> v;
|
vector<struct option> v;
|
||||||
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
|
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
|
||||||
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
|
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
|
||||||
|
|
@ -51,11 +50,9 @@ void GlobalArg::initGlobalOptions()
|
||||||
}
|
}
|
||||||
|
|
||||||
// 解析参数
|
// 解析参数
|
||||||
void GlobalArg::parseArgument(int argNum)
|
void GlobalArg::parseArgument(int argNum) {
|
||||||
{
|
|
||||||
using namespace ns_ga;
|
using namespace ns_ga;
|
||||||
switch (argNum)
|
switch (argNum) {
|
||||||
{
|
|
||||||
case OPT_INPUT:
|
case OPT_INPUT:
|
||||||
in_fn = optarg;
|
in_fn = optarg;
|
||||||
break;
|
break;
|
||||||
|
|
@ -65,8 +62,7 @@ void GlobalArg::parseArgument(int argNum)
|
||||||
case OPT_NUM_THREADS:
|
case OPT_NUM_THREADS:
|
||||||
num_threads = std::stoi(optarg);
|
num_threads = std::stoi(optarg);
|
||||||
break;
|
break;
|
||||||
case OPT_MAX_MEM:
|
case OPT_MAX_MEM: {
|
||||||
{
|
|
||||||
char *q;
|
char *q;
|
||||||
size_t mem_arg = strtol(optarg, &q, 0);
|
size_t mem_arg = strtol(optarg, &q, 0);
|
||||||
if (*q == 'k' || *q == 'K')
|
if (*q == 'k' || *q == 'K')
|
||||||
|
|
@ -77,14 +73,12 @@ void GlobalArg::parseArgument(int argNum)
|
||||||
mem_arg <<= 30;
|
mem_arg <<= 30;
|
||||||
if (mem_arg >= max_mem)
|
if (mem_arg >= max_mem)
|
||||||
max_mem = mem_arg;
|
max_mem = mem_arg;
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OPT_LOG_LEVEL:
|
case OPT_LOG_LEVEL: {
|
||||||
{
|
|
||||||
if (strcmp("ERROR", optarg) == 0)
|
if (strcmp("ERROR", optarg) == 0)
|
||||||
verbosity = ns_ga::ERROR;
|
verbosity = ns_ga::ERROR;
|
||||||
else if (strcmp("WARNING", optarg) == 0)
|
else if (strcmp("WARNING", optarg) == 0)
|
||||||
|
|
@ -95,8 +89,7 @@ void GlobalArg::parseArgument(int argNum)
|
||||||
verbosity = ns_ga::DEBUG;
|
verbosity = ns_ga::DEBUG;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OPT_ASYNCIO:
|
case OPT_ASYNCIO: {
|
||||||
{
|
|
||||||
if (strcmp("true", optarg) == 0)
|
if (strcmp("true", optarg) == 0)
|
||||||
use_asyncio = true;
|
use_asyncio = true;
|
||||||
else if (strcmp("false", optarg) == 0)
|
else if (strcmp("false", optarg) == 0)
|
||||||
|
|
|
||||||
|
|
@ -9,19 +9,19 @@ Date : 2023/10/23
|
||||||
#ifndef GLOBAL_ARG_H_
|
#ifndef GLOBAL_ARG_H_
|
||||||
#define GLOBAL_ARG_H_
|
#define GLOBAL_ARG_H_
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
using std::map;
|
using std::map;
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
namespace ns_ga {
|
namespace ns_ga {
|
||||||
enum GlobalOptEnum
|
enum GlobalOptEnum {
|
||||||
{
|
|
||||||
_START_NUM = 1,
|
_START_NUM = 1,
|
||||||
OPT_INPUT,
|
OPT_INPUT,
|
||||||
OPT_OUTPUT,
|
OPT_OUTPUT,
|
||||||
|
|
@ -32,28 +32,23 @@ namespace ns_ga {
|
||||||
OPT_VERSION,
|
OPT_VERSION,
|
||||||
OPT_HELP,
|
OPT_HELP,
|
||||||
_END_NUM
|
_END_NUM
|
||||||
};
|
};
|
||||||
|
|
||||||
// log level
|
// log level
|
||||||
enum LogLevelEnum
|
enum LogLevelEnum { ERROR, WARNING, INFO, DEBUG };
|
||||||
{
|
} // namespace ns_ga
|
||||||
ERROR,
|
|
||||||
WARNING,
|
|
||||||
INFO,
|
|
||||||
DEBUG
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 全局共享的一些参数 */
|
/* 全局共享的一些参数 */
|
||||||
struct GlobalArg
|
struct GlobalArg {
|
||||||
{
|
const static int GLOBAL_ARG_CNT =
|
||||||
const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
|
ns_ga::GlobalOptEnum::_END_NUM -
|
||||||
|
ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
|
||||||
static struct option *GLOBAL_OPT;
|
static struct option *GLOBAL_OPT;
|
||||||
|
|
||||||
string in_fn; // input bam filename
|
string in_fn; // input bam filename
|
||||||
string out_fn; // output bam filename
|
string out_fn; // output bam filename
|
||||||
int num_threads = 1; // 线程个数
|
int num_threads = 1; // 线程个数
|
||||||
size_t max_mem = ((size_t)2) << 30; // 最小2G
|
size_t max_mem = ((size_t)1) << 30; // 最小1G
|
||||||
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
|
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
|
||||||
bool use_asyncio = true; // 是否使用异步io
|
bool use_asyncio = true; // 是否使用异步io
|
||||||
|
|
||||||
|
|
@ -64,8 +59,7 @@ struct GlobalArg
|
||||||
GlobalArg &operator=(const GlobalArg &) = delete;
|
GlobalArg &operator=(const GlobalArg &) = delete;
|
||||||
|
|
||||||
// 获取单例
|
// 获取单例
|
||||||
static GlobalArg &Instance()
|
static GlobalArg &Instance() {
|
||||||
{
|
|
||||||
static GlobalArg instance;
|
static GlobalArg instance;
|
||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
@ -76,8 +70,7 @@ struct GlobalArg
|
||||||
void parseArgument(int argNum);
|
void parseArgument(int argNum);
|
||||||
|
|
||||||
// 获取对应参数在数组(option和help info)中的索引
|
// 获取对应参数在数组(option和help info)中的索引
|
||||||
int getArgIndx(ns_ga::GlobalOptEnum opt)
|
int getArgIndx(ns_ga::GlobalOptEnum opt) {
|
||||||
{
|
|
||||||
return opt - ns_ga::GlobalOptEnum::OPT_INPUT;
|
return opt - ns_ga::GlobalOptEnum::OPT_INPUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -95,11 +88,9 @@ struct GlobalArg
|
||||||
printf("--verbosity = %d\n", verbosity);
|
printf("--verbosity = %d\n", verbosity);
|
||||||
printf("--asyncio = %d\n", use_asyncio);
|
printf("--asyncio = %d\n", use_asyncio);
|
||||||
}
|
}
|
||||||
private :
|
|
||||||
GlobalArg()
|
private:
|
||||||
{
|
GlobalArg() { initGlobalOptions(); };
|
||||||
initGlobalOptions();
|
|
||||||
};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -7,36 +7,34 @@ Author : Zhang Zhonghai
|
||||||
Date : 2023/11/6
|
Date : 2023/11/6
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <random>
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness
|
* Provides an implementation of the Murmur3_32 hash algorithm that has
|
||||||
* and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling.
|
* desirable properties in terms of randomness and uniformity of the
|
||||||
|
* distribution of output values that make it a useful hashing algorithm for
|
||||||
|
* downsampling.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct Murmur3
|
struct Murmur3 {
|
||||||
{
|
|
||||||
int seed_ = 0;
|
int seed_ = 0;
|
||||||
/** Hashes a character stream to an int using Murmur3. */
|
/** Hashes a character stream to an int using Murmur3. */
|
||||||
int HashUnencodedChars(const string &input)
|
int HashUnencodedChars(const string &input) {
|
||||||
{
|
|
||||||
int h1 = this->seed_;
|
int h1 = this->seed_;
|
||||||
|
|
||||||
// step through the CharSequence 2 chars at a time
|
// step through the CharSequence 2 chars at a time
|
||||||
const int length = input.size();
|
const int length = input.size();
|
||||||
for (int i = 1; i < length; i += 2)
|
for (int i = 1; i < length; i += 2) {
|
||||||
{
|
|
||||||
int k1 = input.at(i - 1) | (input.at(i) << 16);
|
int k1 = input.at(i - 1) | (input.at(i) << 16);
|
||||||
k1 = mixK1(k1);
|
k1 = mixK1(k1);
|
||||||
h1 = mixH1(h1, k1);
|
h1 = mixH1(h1, k1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// deal with any remaining characters
|
// deal with any remaining characters
|
||||||
if ((length & 1) == 1)
|
if ((length & 1) == 1) {
|
||||||
{
|
|
||||||
int k1 = input.at(length - 1);
|
int k1 = input.at(length - 1);
|
||||||
k1 = mixK1(k1);
|
k1 = mixK1(k1);
|
||||||
h1 ^= k1;
|
h1 ^= k1;
|
||||||
|
|
@ -45,14 +43,12 @@ struct Murmur3
|
||||||
return fmix(h1, 2 * length);
|
return fmix(h1, 2 * length);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Murmur3 &Instance()
|
static Murmur3 &Instance() {
|
||||||
{
|
|
||||||
static Murmur3 instance;
|
static Murmur3 instance;
|
||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int mixK1(int k1)
|
static int mixK1(int k1) {
|
||||||
{
|
|
||||||
const int c1 = 0xcc9e2d51;
|
const int c1 = 0xcc9e2d51;
|
||||||
const int c2 = 0x1b873593;
|
const int c2 = 0x1b873593;
|
||||||
k1 *= c1;
|
k1 *= c1;
|
||||||
|
|
@ -61,8 +57,7 @@ struct Murmur3
|
||||||
return k1;
|
return k1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int mixH1(int h1, int k1)
|
static int mixH1(int h1, int k1) {
|
||||||
{
|
|
||||||
h1 ^= k1;
|
h1 ^= k1;
|
||||||
h1 = h1 << 13;
|
h1 = h1 << 13;
|
||||||
h1 = h1 * 5 + 0xe6546b64;
|
h1 = h1 * 5 + 0xe6546b64;
|
||||||
|
|
@ -70,8 +65,7 @@ struct Murmur3
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finalization mix - force all bits of a hash block to avalanche
|
// Finalization mix - force all bits of a hash block to avalanche
|
||||||
static int fmix(int h1, int length)
|
static int fmix(int h1, int length) {
|
||||||
{
|
|
||||||
h1 ^= length;
|
h1 ^= length;
|
||||||
h1 ^= (unsigned int)h1 >> 16;
|
h1 ^= (unsigned int)h1 >> 16;
|
||||||
h1 *= 0x85ebca6b;
|
h1 *= 0x85ebca6b;
|
||||||
|
|
@ -82,8 +76,7 @@ struct Murmur3
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Murmur3()
|
Murmur3() {
|
||||||
{
|
|
||||||
auto &&rd = std::random_device{};
|
auto &&rd = std::random_device{};
|
||||||
seed_ = rd();
|
seed_ = rd();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,70 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 标记冗余过程中的一些数据统计
|
||||||
|
*/
|
||||||
|
struct DuplicationMetrics {
|
||||||
|
/**
|
||||||
|
* The library on which the duplicate marking was performed.
|
||||||
|
*/
|
||||||
|
string LIBRARY = "";
|
||||||
|
/**
|
||||||
|
* The number of mapped reads examined which did not have a mapped mate pair,
|
||||||
|
* either because the read is unpaired, or the read is paired to an unmapped mate.
|
||||||
|
*/
|
||||||
|
uint64_t UNPAIRED_READS_EXAMINED = 0;
|
||||||
|
/**
|
||||||
|
* The number of mapped read pairs examined. (Primary, non-supplemental)
|
||||||
|
*/
|
||||||
|
uint64_t READ_PAIRS_EXAMINED = 0;
|
||||||
|
/**
|
||||||
|
* The number of reads that were either secondary or supplementary
|
||||||
|
*/
|
||||||
|
uint64_t SECONDARY_OR_SUPPLEMENTARY_RDS = 0;
|
||||||
|
/**
|
||||||
|
* The total number of unmapped reads examined. (Primary, non-supplemental)
|
||||||
|
*/
|
||||||
|
uint64_t UNMAPPED_READS = 0;
|
||||||
|
/**
|
||||||
|
* The number of fragments that were marked as duplicates.
|
||||||
|
*/
|
||||||
|
uint64_t UNPAIRED_READ_DUPLICATES = 0;
|
||||||
|
/**
|
||||||
|
* The number of read pairs that were marked as duplicates.
|
||||||
|
*/
|
||||||
|
uint64_t READ_PAIR_DUPLICATES = 0;
|
||||||
|
/**
|
||||||
|
* The number of read pairs duplicates that were caused by optical duplication.
|
||||||
|
* Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source.
|
||||||
|
*/
|
||||||
|
uint64_t READ_PAIR_OPTICAL_DUPLICATES = 0;
|
||||||
|
/**
|
||||||
|
* The fraction of mapped sequence that is marked as duplicate.
|
||||||
|
*/
|
||||||
|
double PERCENT_DUPLICATION = 0.0;
|
||||||
|
/**
|
||||||
|
* The estimated number of unique molecules in the library based on PE duplication.
|
||||||
|
*/
|
||||||
|
uint64_t ESTIMATED_LIBRARY_SIZE = 0;
|
||||||
|
|
||||||
|
// 其他的统计数据
|
||||||
|
|
||||||
|
// addSingletonToCount需要记录的数据
|
||||||
|
uint64_t DuplicateCountHist = 0;
|
||||||
|
uint64_t NonOpticalDuplicateCountHist = 0;
|
||||||
|
|
||||||
|
// track optical duplicates 需要记录的数据
|
||||||
|
uint64_t OpticalDuplicatesCountHist = 0;
|
||||||
|
uint64_t OpticalDuplicatesByLibraryId = 0;
|
||||||
|
|
||||||
|
// 统计相关的函数
|
||||||
|
void AddSingletonToCount() {
|
||||||
|
++this->DuplicateCountHist;
|
||||||
|
++this->NonOpticalDuplicateCountHist;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
@ -1,75 +1,74 @@
|
||||||
/*
|
/*
|
||||||
Description: 标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
Description:
|
||||||
|
标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
Author : Zhang Zhonghai
|
Author : Zhang Zhonghai
|
||||||
Date : 2023/10/23
|
Date : 2023/10/23
|
||||||
*/
|
*/
|
||||||
#include "markdups_arg.h"
|
|
||||||
// 有太多define冲突,放到最后include
|
|
||||||
|
|
||||||
|
|
||||||
#include <common/hts/bam_buf.h>
|
#include <common/hts/bam_buf.h>
|
||||||
#include <common/utils/global_arg.h>
|
#include <common/utils/global_arg.h>
|
||||||
|
#include <common/utils/murmur3.h>
|
||||||
#include <common/utils/thpool.h>
|
#include <common/utils/thpool.h>
|
||||||
#include <common/utils/timer.h>
|
#include <common/utils/timer.h>
|
||||||
#include <common/utils/util.h>
|
#include <common/utils/util.h>
|
||||||
#include <common/utils/murmur3.h>
|
|
||||||
#include <common/utils/yarn.h>
|
#include <common/utils/yarn.h>
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
#include <htslib/thread_pool.h>
|
||||||
#include <sam/utils/read_ends.h>
|
#include <sam/utils/read_ends.h>
|
||||||
#include <sam/utils/read_name_parser.h>
|
#include <sam/utils/read_name_parser.h>
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
|
||||||
#include <htslib/thread_pool.h>
|
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <vector>
|
|
||||||
#include <set>
|
|
||||||
#include <queue>
|
#include <queue>
|
||||||
|
#include <set>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "markdups_arg.h"
|
||||||
|
#include "md_funcs.h"
|
||||||
|
#include "parallel_md.h"
|
||||||
|
#include "serial_md.h"
|
||||||
|
#include "shared_args.h"
|
||||||
|
#include "dup_metrics.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using std::cout;
|
using std::cout;
|
||||||
|
|
||||||
|
|
||||||
#define SMA_TAG_PG "PG"
|
#define SMA_TAG_PG "PG"
|
||||||
|
|
||||||
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
|
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
||||||
#define NO_SUCH_INDEX INT64_MAX
|
#define NO_SUCH_INDEX INT64_MAX
|
||||||
|
|
||||||
static Timer tm_arr[20]; // 用来测试性能
|
Timer tm_arr[20]; // 用来测试性能
|
||||||
/* 全局本地变量 */
|
/* 全局本地变量 */
|
||||||
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||||
static samFile *g_inBamFp; // 输入的bam文件
|
samFile *g_inBamFp; // 输入的bam文件
|
||||||
static sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
||||||
static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
||||||
static sam_hdr_t *g_outBamHeader; // 输出文件的header
|
sam_hdr_t *g_outBamHeader; // 输出文件的header
|
||||||
|
|
||||||
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
||||||
static GlobalArg &g_gArg = GlobalArg::Instance();
|
GlobalArg &g_gArg = GlobalArg::Instance();
|
||||||
static MarkDupsArg g_mdArg;
|
static MarkDupsArg g_mdArg_;
|
||||||
|
MarkDupsArg &g_mdArg = g_mdArg_;
|
||||||
|
static GlobalDataArg gData_;
|
||||||
#include "md_funcs.h"
|
GlobalDataArg &gData = gData_;
|
||||||
#include "serial_md.h"
|
DuplicationMetrics gMetrics_;
|
||||||
#include "parallel_md.h"
|
DuplicationMetrics &gMetrics = gMetrics_;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
* mark duplicate
|
||||||
|
* 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||||
*/
|
*/
|
||||||
int MarkDuplicates(int argc, char *argv[])
|
int MarkDuplicates(int argc, char *argv[]) {
|
||||||
{
|
|
||||||
Timer::log_time("程序开始");
|
Timer::log_time("程序开始");
|
||||||
Timer time_all;
|
Timer time_all;
|
||||||
|
|
||||||
/* 读取命令行参数 */
|
/* 读取命令行参数 */
|
||||||
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
||||||
if (g_gArg.num_threads < 1) // 线程数不能小于1
|
if (g_gArg.num_threads < 1)
|
||||||
g_gArg.num_threads = 1;
|
g_gArg.num_threads = 1; // 线程数不能小于1
|
||||||
|
|
||||||
/* 初始化一些参数和变量*/
|
/* 初始化一些参数和变量*/
|
||||||
g_vRnParser.resize(g_gArg.num_threads);
|
g_vRnParser.resize(g_gArg.num_threads);
|
||||||
|
|
@ -78,23 +77,23 @@ int MarkDuplicates(int argc, char *argv[])
|
||||||
|
|
||||||
/* 打开输入bam文件 */
|
/* 打开输入bam文件 */
|
||||||
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
||||||
if (!g_inBamFp)
|
if (!g_inBamFp) {
|
||||||
{
|
|
||||||
Error("[%s] load sam/bam file failed.\n", __func__);
|
Error("[%s] load sam/bam file failed.\n", __func__);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
||||||
|
// 获取样本名称(libraryId)
|
||||||
|
gMetrics.LIBRARY = sam_hdr_line_name(g_inBamHeader, "RG", 0);
|
||||||
|
|
||||||
/* 利用线程池对输入输出文件进行读写 */
|
/* 利用线程池对输入输出文件进行读写 */
|
||||||
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
||||||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||||
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
//htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||||
htsPoolRead.pool = hts_tpool_init(16);
|
//htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||||
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
htsPoolRead.pool = hts_tpool_init(32);
|
||||||
htsPoolWrite.pool = hts_tpool_init(16);
|
htsPoolWrite.pool = hts_tpool_init(32);
|
||||||
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
|
||||||
{
|
|
||||||
Error("[%d] failed to set up thread pool", __LINE__);
|
Error("[%d] failed to set up thread pool", __LINE__);
|
||||||
sam_close(g_inBamFp);
|
sam_close(g_inBamFp);
|
||||||
return -1;
|
return -1;
|
||||||
|
|
@ -106,86 +105,94 @@ int MarkDuplicates(int argc, char *argv[])
|
||||||
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
||||||
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
||||||
g_outBamHeader = sam_hdr_dup(g_inBamHeader);
|
g_outBamHeader = sam_hdr_dup(g_inBamHeader);
|
||||||
|
// 用同样的线程池处理输出文件
|
||||||
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
|
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||||
|
|
||||||
|
|
||||||
/* 冗余检查和标记 */
|
/* 冗余检查和标记 */
|
||||||
if (g_gArg.num_threads == 1)
|
if (g_gArg.num_threads == 1) {
|
||||||
{
|
|
||||||
serialMarkDups(); // 串行运行
|
serialMarkDups(); // 串行运行
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
parallelMarkDups(); // 并行运行
|
parallelMarkDups(); // 并行运行
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 标记冗余, 将处理后的结果写入文件 */
|
/* 标记冗余, 将处理后的结果写入文件 */
|
||||||
sam_close(g_inBamFp); // 重新打开bam文件
|
sam_close(g_inBamFp); // 重新打开bam文件
|
||||||
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
||||||
if (!g_inBamFp)
|
if (!g_inBamFp) {
|
||||||
{
|
|
||||||
Error("[%s] load sam/bam file failed.\n", __func__);
|
Error("[%s] load sam/bam file failed.\n", __func__);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||||
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
||||||
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
|
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0) {
|
||||||
{
|
|
||||||
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||||
sam_close(g_outBamFp);
|
sam_close(g_outBamFp);
|
||||||
sam_close(g_inBamFp);
|
sam_close(g_inBamFp);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
// 输出index文件
|
// 输出index文件
|
||||||
string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
|
// string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
|
||||||
if (sam_idx_init(g_outBamFp, g_outBamHeader, 14 /*csi*/, indexFn.c_str()) < 0)
|
string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式
|
||||||
{
|
int index_min_shift = 0;
|
||||||
|
if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) {
|
||||||
|
indexFn = g_gArg.out_fn + ".csi";
|
||||||
|
index_min_shift = 14;
|
||||||
|
}
|
||||||
|
if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
|
||||||
Error("failed to open index \"%s\" for writing", indexFn.c_str());
|
Error("failed to open index \"%s\" for writing", indexFn.c_str());
|
||||||
sam_close(g_outBamFp);
|
sam_close(g_outBamFp);
|
||||||
sam_close(g_inBamFp);
|
sam_close(g_inBamFp);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
// 读取输入文件
|
// 读取输入文件
|
||||||
// BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
|
// BamBufType inBuf(false);
|
||||||
BamBufType inBuf(g_gArg.use_asyncio);
|
BamBufType inBuf(g_gArg.use_asyncio);
|
||||||
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||||
|
DupIdxQueue idxQue;
|
||||||
|
idxQue.Init(&gData.dupIdxArr);
|
||||||
Timer tw;
|
Timer tw;
|
||||||
while (inBuf.ReadStat() >= 0)
|
cout << "dupsize: " << idxQue.Size() << endl;
|
||||||
{
|
uint64_t bamIdx = 0;
|
||||||
|
uint64_t dupIdx = idxQue.Pop();
|
||||||
|
cout << "dup arr size: " << gData.dupIdxArr.size() << endl;
|
||||||
|
cout << "first dup: " << dupIdx << endl;
|
||||||
|
while (inBuf.ReadStat() >= 0) {
|
||||||
Timer tw1;
|
Timer tw1;
|
||||||
size_t readNum = inBuf.ReadBam();
|
size_t readNum = inBuf.ReadBam();
|
||||||
cout << "read: " << readNum << endl;
|
cout << "read: " << readNum << endl;
|
||||||
for (size_t i = 0; i < inBuf.Size(); ++i)
|
for (size_t i = 0; i < inBuf.Size(); ++i) {
|
||||||
{
|
|
||||||
/* 判断是否冗余 */
|
/* 判断是否冗余 */
|
||||||
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
|
if (bamIdx == dupIdx) {
|
||||||
{
|
// cout << "冗余" << bamIdx << endl;
|
||||||
|
dupIdx = idxQue.Pop();
|
||||||
|
}
|
||||||
|
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
|
||||||
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||||
sam_close(g_outBamFp);
|
sam_close(g_outBamFp);
|
||||||
sam_close(g_inBamFp);
|
sam_close(g_inBamFp);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
++bamIdx;
|
||||||
}
|
}
|
||||||
inBuf.ClearAll();
|
inBuf.ClearAll();
|
||||||
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
|
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
|
||||||
}
|
}
|
||||||
if (sam_idx_save(g_outBamFp) < 0)
|
cout << "dupsize: " << idxQue.Size() << endl;
|
||||||
{
|
if (sam_idx_save(g_outBamFp) < 0) {
|
||||||
Error("writing index failed");
|
Error("writing index failed");
|
||||||
sam_close(g_outBamFp);
|
sam_close(g_outBamFp);
|
||||||
sam_close(g_inBamFp);
|
sam_close(g_inBamFp);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
|
cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
|
||||||
|
|
||||||
/* 关闭文件,收尾清理 */
|
/* 关闭文件,收尾清理 */
|
||||||
sam_close(g_outBamFp);
|
sam_close(g_outBamFp);
|
||||||
sam_close(g_inBamFp);
|
sam_close(g_inBamFp);
|
||||||
|
|
||||||
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
|
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
|
||||||
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||||
Timer::log_time("程序结束");
|
Timer::log_time("程序结束");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -64,6 +64,7 @@ const static struct option kMdOpts[] = {
|
||||||
{"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
|
{"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
|
||||||
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
|
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
|
||||||
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
|
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
|
||||||
|
{"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT},
|
||||||
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
|
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
|
||||||
|
|
||||||
// 判断bool类型的参数
|
// 判断bool类型的参数
|
||||||
|
|
@ -220,6 +221,11 @@ void MarkDupsArg::parseArgument(int argc,
|
||||||
case ns_md::CREATE_INDEX:
|
case ns_md::CREATE_INDEX:
|
||||||
setBoolArg(&CREATE_INDEX);
|
setBoolArg(&CREATE_INDEX);
|
||||||
break;
|
break;
|
||||||
|
case ns_md::INDEX_FORMAT:
|
||||||
|
if (strcmp("CSI", optarg) == 0)
|
||||||
|
INDEX_FORMAT = ns_md::IndexFormat::CSI;
|
||||||
|
else
|
||||||
|
INDEX_FORMAT = ns_md::IndexFormat::BAI;
|
||||||
case ns_md::CREATE_MD5_FILE:
|
case ns_md::CREATE_MD5_FILE:
|
||||||
setBoolArg(&CREATE_MD5_FILE);
|
setBoolArg(&CREATE_MD5_FILE);
|
||||||
break;
|
break;
|
||||||
|
|
@ -236,6 +242,7 @@ void MarkDupsArg::parseArgument(int argc,
|
||||||
void MarkDupsArg::printArgValue()
|
void MarkDupsArg::printArgValue()
|
||||||
{
|
{
|
||||||
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
|
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
|
||||||
|
printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
|
||||||
}
|
}
|
||||||
|
|
||||||
// 打印版本信息
|
// 打印版本信息
|
||||||
|
|
@ -272,6 +279,8 @@ void MarkDupsArg::PrintHelp()
|
||||||
"\n"
|
"\n"
|
||||||
"Optional Arguments:\n"
|
"Optional Arguments:\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
"--INDEX_FORMAT <FORMAT> Format for bam index file. Possible values: {BAI, CSI}\n"
|
||||||
|
"\n"
|
||||||
"--ADD_PG_TAG_TO_READS <Boolean>\n"
|
"--ADD_PG_TAG_TO_READS <Boolean>\n"
|
||||||
" Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n"
|
" Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n"
|
||||||
" false}\n"
|
" false}\n"
|
||||||
|
|
|
||||||
|
|
@ -21,9 +21,8 @@ using std::vector;
|
||||||
class GlobalArg;
|
class GlobalArg;
|
||||||
|
|
||||||
namespace ns_md {
|
namespace ns_md {
|
||||||
/* 用于markduplicate模块的参数,这个枚举用于getoption */
|
/* 用于markduplicate模块的参数,这个枚举用于getoption */
|
||||||
enum MarkDupsArgEnum
|
enum MarkDupsArgEnum {
|
||||||
{
|
|
||||||
_START_NUM = 100,
|
_START_NUM = 100,
|
||||||
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
|
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
|
||||||
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
|
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
|
||||||
|
|
@ -55,13 +54,14 @@ namespace ns_md {
|
||||||
COMPRESSION_LEVEL,
|
COMPRESSION_LEVEL,
|
||||||
MAX_RECORDS_IN_RAM,
|
MAX_RECORDS_IN_RAM,
|
||||||
CREATE_INDEX,
|
CREATE_INDEX,
|
||||||
|
INDEX_FORMAT,
|
||||||
CREATE_MD5_FILE,
|
CREATE_MD5_FILE,
|
||||||
_END_NUM
|
_END_NUM
|
||||||
};
|
};
|
||||||
|
|
||||||
/* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */
|
/* How strict to be when reading a SAM or BAM, beyond bare minimum validation.
|
||||||
enum ValidationStringency
|
*/
|
||||||
{
|
enum ValidationStringency {
|
||||||
/**
|
/**
|
||||||
* Do the right thing, throw an exception if something looks wrong.
|
* Do the right thing, throw an exception if something looks wrong.
|
||||||
*/
|
*/
|
||||||
|
|
@ -76,36 +76,37 @@ namespace ns_md {
|
||||||
SILENT,
|
SILENT,
|
||||||
|
|
||||||
DEFAULT_STRINGENCY = SILENT
|
DEFAULT_STRINGENCY = SILENT
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enum used to control how duplicates are flagged in the DT optional tag on each read.
|
* Enum used to control how duplicates are flagged in the DT optional tag on
|
||||||
|
* each read.
|
||||||
*/
|
*/
|
||||||
enum DuplicateTaggingPolicy
|
enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All };
|
||||||
{
|
|
||||||
DontTag,
|
|
||||||
OpticalOnly,
|
|
||||||
All
|
|
||||||
};
|
|
||||||
|
|
||||||
/* 排序的方式 */
|
/* 排序的方式 */
|
||||||
enum SortOrder
|
enum SortOrder {
|
||||||
{
|
|
||||||
unsorted,
|
unsorted,
|
||||||
queryname,
|
queryname,
|
||||||
coordinate,
|
coordinate,
|
||||||
duplicate, // NB: this is not in the SAM spec!
|
duplicate, // NB: this is not in the SAM spec!
|
||||||
unknown
|
unknown
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 计算reads分数的方式(比那个read得分更高) */
|
/* 计算reads分数的方式(比那个read得分更高) */
|
||||||
enum ScoringStrategy
|
enum ScoringStrategy {
|
||||||
{
|
|
||||||
SUM_OF_BASE_QUALITIES,
|
SUM_OF_BASE_QUALITIES,
|
||||||
TOTAL_MAPPED_REFERENCE_LENGTH,
|
TOTAL_MAPPED_REFERENCE_LENGTH,
|
||||||
RANDOM
|
RANDOM
|
||||||
};
|
};
|
||||||
}
|
|
||||||
|
/* 索引文件的格式 (bai或者csi) */
|
||||||
|
enum IndexFormat {
|
||||||
|
BAI,
|
||||||
|
CSI
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace ns_md
|
||||||
|
|
||||||
/* markduplicate 需要的参数*/
|
/* markduplicate 需要的参数*/
|
||||||
struct MarkDupsArg
|
struct MarkDupsArg
|
||||||
|
|
@ -287,6 +288,8 @@ struct MarkDupsArg
|
||||||
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
|
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
|
||||||
bool CREATE_INDEX = false;
|
bool CREATE_INDEX = false;
|
||||||
|
|
||||||
|
ns_md::IndexFormat INDEX_FORMAT = ns_md::IndexFormat::BAI;
|
||||||
|
|
||||||
/* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */
|
/* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */
|
||||||
bool CREATE_MD5_FILE = false;
|
bool CREATE_MD5_FILE = false;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,456 @@
|
||||||
|
|
||||||
|
#include "md_funcs.h"
|
||||||
|
|
||||||
|
#include <common/hts/bam_buf.h>
|
||||||
|
#include <common/utils/debug.h>
|
||||||
|
#include <common/utils/murmur3.h>
|
||||||
|
#include <common/utils/profiling.h>
|
||||||
|
#include <common/utils/timer.h>
|
||||||
|
#include <common/utils/util.h>
|
||||||
|
#include <sam/utils/read_ends.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
#include <vector>
|
||||||
|
#include <math.h>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "dup_metrics.h"
|
||||||
|
#include "markdups_arg.h"
|
||||||
|
#include "shared_args.h"
|
||||||
|
|
||||||
|
using std::cerr;
|
||||||
|
using std::endl;
|
||||||
|
using std::map;
|
||||||
|
using std::set;
|
||||||
|
using std::unordered_map;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
/**
 * Empties the index set stored under `key`, keeping the (now empty) entry in
 * the map. Does nothing when the map has no entry for `key`.
 *
 * @param key    position key to look up
 * @param pmsIdx map from position key to a set of indices; must not be null
 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    // One lookup only: reuse the iterator instead of find() followed by operator[].
    auto it = pmsIdx->find(key);
    if (it != pmsIdx->end())
        it->second.clear();  // drop the duplicate results recorded at this position
}
|
||||||
|
|
||||||
|
/**
 * Removes the entry stored under `key` from the map, if there is one.
 *
 * @param key    position key whose entry should be removed
 * @param pmsIdx map from position key to a set of indices; must not be null
 */
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    // erase-by-key is already a no-op for a missing key, so no prior find() is needed.
    pmsIdx->erase(key);
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 计算read的分数
|
||||||
|
*/
|
||||||
|
int16_t computeDuplicateScore(BamWrap &bw) {
|
||||||
|
int16_t score = 0;
|
||||||
|
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
|
||||||
|
case ns_md::SUM_OF_BASE_QUALITIES:
|
||||||
|
// two (very) long reads worth of high-quality bases can go over
|
||||||
|
// Short.MAX_VALUE/2 and risk overflow.
|
||||||
|
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
||||||
|
break;
|
||||||
|
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
|
||||||
|
if (!bw.GetReadUnmappedFlag())
|
||||||
|
// no need to remember the score since this scoring mechanism is
|
||||||
|
// symmetric
|
||||||
|
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
||||||
|
break;
|
||||||
|
case ns_md::RANDOM:
|
||||||
|
// The RANDOM score gives the same score to both reads so that they get
|
||||||
|
// filtered together. it's not critical do use the readName since the
|
||||||
|
// scores from both ends get added, but it seem to be clearer this way.
|
||||||
|
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
||||||
|
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
||||||
|
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case
|
||||||
|
// the read is not passing filters. We need to stay far from overflow so
|
||||||
|
// that when we add the two scores from the two read mates we do not
|
||||||
|
// overflow since that could cause us to chose a failing read-pair
|
||||||
|
// instead of a passing one.
|
||||||
|
score -= INT16_MIN / 4;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// make sure that filter-failing records are heavily discounted. (the
|
||||||
|
// discount can happen twice, once for each mate, so need to make sure we do
|
||||||
|
// not subtract more than Short.MIN_VALUE overall.)
|
||||||
|
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Builds a read ends object that represents a single read.
|
||||||
|
* 用来表示一个read的特征结构
|
||||||
|
*/
|
||||||
|
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
|
||||||
|
auto &k = *pKey;
|
||||||
|
auto &bc = bw.b->core;
|
||||||
|
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
|
||||||
|
k.read1ReferenceIndex = bc.tid;
|
||||||
|
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
||||||
|
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
||||||
|
|
||||||
|
k.read1IndexInFile = index;
|
||||||
|
k.score = computeDuplicateScore(bw);
|
||||||
|
// Doing this lets the ends object know that it's part of a pair
|
||||||
|
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
|
||||||
|
k.read2ReferenceIndex = bc.mtid;
|
||||||
|
}
|
||||||
|
// Fill in the location information for optical duplicates
|
||||||
|
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
||||||
|
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
||||||
|
// 计算位置key
|
||||||
|
k.posKey =
|
||||||
|
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
|
||||||
|
* representative read based on quality score. For all members of the duplicate
|
||||||
|
* set, add the read1 index-in-file of the representative read to the records of
|
||||||
|
* the first and second in a pair. This value becomes is used for the 'DI' tag.
|
||||||
|
*/
|
||||||
|
// NOTE: placeholder - the body is empty, so no representative read index is recorded yet.
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
|
||||||
|
|
||||||
|
/* 处理一组pairend的readends,标记冗余 */
|
||||||
|
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
||||||
|
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
|
||||||
|
if (vpRe.size() < 2) {
|
||||||
|
if (vpRe.size() == 1) {
|
||||||
|
// addSingletonToCount(libraryIdGenerator);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
|
||||||
|
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||||
|
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
|
||||||
|
|
||||||
|
int maxScore = 0;
|
||||||
|
const ReadEnds *pBest = nullptr;
|
||||||
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||||
|
for (auto pe : vpRe) // 找分数最高的readend
|
||||||
|
{
|
||||||
|
if (pe->score > maxScore || pBest == nullptr) {
|
||||||
|
maxScore = pe->score;
|
||||||
|
pBest = pe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
||||||
|
{
|
||||||
|
// trackOpticalDuplicates
|
||||||
|
}
|
||||||
|
for (auto pe : vpRe) // 对非best read标记冗余
|
||||||
|
{
|
||||||
|
if (pe != pBest) // 非best
|
||||||
|
{
|
||||||
|
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
|
||||||
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||||
|
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
|
||||||
|
addRepresentativeReadIndex(vpRe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理一组非paired的readends,标记冗余 */
|
||||||
|
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||||
|
DupContainer<int64_t> *dupIdx) {
|
||||||
|
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||||
|
|
||||||
|
if (containsPairs) {
|
||||||
|
for (auto pe : vpRe) {
|
||||||
|
if (!pe->IsPaired()) {
|
||||||
|
vDupIdx.push_back(pe->read1IndexInFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int maxScore = 0;
|
||||||
|
const ReadEnds *pBest = nullptr;
|
||||||
|
for (auto pe : vpRe) {
|
||||||
|
if (pe->score > maxScore || pBest == nullptr) {
|
||||||
|
maxScore = pe->score;
|
||||||
|
pBest = pe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto pe : vpRe) {
|
||||||
|
if (pe != pBest) {
|
||||||
|
vDupIdx.push_back(pe->read1IndexInFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理位于某个坐标的pairend reads */
|
||||||
|
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||||
|
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
|
||||||
|
DupContainer<int64_t> *opticalDupIdx) {
|
||||||
|
if (readEnds.size() > 1) { // 有潜在的冗余
|
||||||
|
vpCache.clear();
|
||||||
|
// std::sort(readEnds.begin(), readEnds.end());
|
||||||
|
const ReadEnds *pReadEnd = nullptr;
|
||||||
|
for (auto &re : readEnds) {
|
||||||
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||||
|
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||||
|
else {
|
||||||
|
markDuplicatePairs(posKey, vpCache, dupIdx,
|
||||||
|
opticalDupIdx); // 不一样
|
||||||
|
vpCache.clear();
|
||||||
|
vpCache.push_back(&re);
|
||||||
|
pReadEnd = &re;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理位于某个坐标的 reads */
|
||||||
|
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||||
|
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
|
||||||
|
if (readEnds.size() > 1) // 有潜在的冗余
|
||||||
|
{
|
||||||
|
vpCache.clear();
|
||||||
|
// std::sort(readEnds.begin(), readEnds.end());
|
||||||
|
const ReadEnds *pReadEnd = nullptr;
|
||||||
|
bool containsPairs = false;
|
||||||
|
bool containsFrags = false;
|
||||||
|
for (auto &re : readEnds) {
|
||||||
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
|
||||||
|
vpCache.push_back(&re);
|
||||||
|
containsPairs = containsPairs || re.IsPaired();
|
||||||
|
containsFrags = containsFrags || !re.IsPaired();
|
||||||
|
} else {
|
||||||
|
if (vpCache.size() > 1 && containsFrags) {
|
||||||
|
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||||
|
}
|
||||||
|
vpCache.clear();
|
||||||
|
vpCache.push_back(&re);
|
||||||
|
pReadEnd = &re;
|
||||||
|
containsPairs = re.IsPaired();
|
||||||
|
containsFrags = !re.IsPaired();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (vpCache.size() > 1 && containsFrags) {
|
||||||
|
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 对找到的pairend read end添加一些信息 */
|
||||||
|
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
||||||
|
auto &pairedEnds = *pPairedEnds;
|
||||||
|
int64_t bamIdx = fragEnd.read1IndexInFile;
|
||||||
|
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
||||||
|
const int matesCoordinate = fragEnd.read1Coordinate;
|
||||||
|
// Set orientationForOpticalDuplicates, which always goes by the first then
|
||||||
|
// the second end for the strands. NB: must do this before updating the
|
||||||
|
// orientation later.
|
||||||
|
if (fragEnd.read1FirstOfPair) {
|
||||||
|
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
|
||||||
|
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
|
||||||
|
} else {
|
||||||
|
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
|
||||||
|
pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
|
||||||
|
}
|
||||||
|
// If the other read is actually later, simply add the other read's data as
|
||||||
|
// read2, else flip the reads
|
||||||
|
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
|
||||||
|
(matesRefIndex == pairedEnds.read1ReferenceIndex &&
|
||||||
|
matesCoordinate >= pairedEnds.read1Coordinate)) {
|
||||||
|
pairedEnds.read2ReferenceIndex = matesRefIndex;
|
||||||
|
pairedEnds.read2Coordinate = matesCoordinate;
|
||||||
|
pairedEnds.read2IndexInFile = bamIdx;
|
||||||
|
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
|
||||||
|
fragEnd.IsNegativeStrand());
|
||||||
|
|
||||||
|
// if the two read ends are in the same position, pointing in opposite
|
||||||
|
// directions, the orientation is undefined and the procedure above will
|
||||||
|
// depend on the order of the reads in the file. To avoid this, we set
|
||||||
|
// it explicitly (to FR):
|
||||||
|
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
||||||
|
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
||||||
|
pairedEnds.orientation == ReadEnds::RF) {
|
||||||
|
pairedEnds.orientation = ReadEnds::FR;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
||||||
|
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
||||||
|
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
|
||||||
|
pairedEnds.read1ReferenceIndex = matesRefIndex;
|
||||||
|
pairedEnds.read1Coordinate = matesCoordinate;
|
||||||
|
pairedEnds.read1IndexInFile = bamIdx;
|
||||||
|
pairedEnds.orientation = ReadEnds::GetOrientationByte(
|
||||||
|
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
|
||||||
|
pairedEnds.posKey = fragEnd.posKey;
|
||||||
|
}
|
||||||
|
pairedEnds.score += fragEnd.score;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
|
||||||
|
return lhs != rhs && // no comparing an object to itself (checked using object identity)!
|
||||||
|
(lhs->tile != ReadEnds::NO_VALUE) &&
|
||||||
|
(rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations
|
||||||
|
lhs->tile == rhs->tile && // and the same tile
|
||||||
|
abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
|
||||||
|
* one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
|
||||||
|
* optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered
|
||||||
|
* optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
|
||||||
|
* read appeared in the input list of physical locations.
|
||||||
|
*
|
||||||
|
* @param list a list of reads that are determined to be duplicates of one another
|
||||||
|
* @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
|
||||||
|
* annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
|
||||||
|
* contained within the list! (always not be null!)
|
||||||
|
* @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
|
||||||
|
*/
|
||||||
|
static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
|
||||||
|
vector<bool> *pOpticalDuplicateFlags) {
|
||||||
|
const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
|
||||||
|
const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;
|
||||||
|
|
||||||
|
vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
|
||||||
|
opticalDuplicateFlags.push_back(true);
|
||||||
|
int len = readEndsArr.size();
|
||||||
|
// If there is only one or zero reads passed in (so there are obviously no optical duplicates),
|
||||||
|
// or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
|
||||||
|
// then just return an array of all false
|
||||||
|
if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 4) {
|
||||||
|
/**
|
||||||
|
* Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
|
||||||
|
* optical duplicates
|
||||||
|
* getOpticalDuplicatesFlagWithGraph
|
||||||
|
*/
|
||||||
|
// Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
|
||||||
|
// we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
|
||||||
|
Graph<int> opticalDistanceRelationGraph;
|
||||||
|
unordered_map<int, vector<int>> tileRGmap;
|
||||||
|
int keeperIndex = -1;
|
||||||
|
for (int i = 0; i < readEndsArr.size(); ++i) {
|
||||||
|
const ReadEnds *currentLoc = readEndsArr[i];
|
||||||
|
if (currentLoc == pBestRe)
|
||||||
|
keeperIndex = i;
|
||||||
|
if (currentLoc->tile != ReadEnds::NO_VALUE) {
|
||||||
|
int key = currentLoc->tile; // 只处理一个样本,所以只有一个read group
|
||||||
|
tileRGmap[key].push_back(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
/**
|
||||||
|
* Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
|
||||||
|
* distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
|
||||||
|
* readgroup or 3x with the keeper present
|
||||||
|
* getOpticalDuplicatesFlagFast
|
||||||
|
*/
|
||||||
|
// First go through and compare all the reads to the keeper
|
||||||
|
for (int i = 0; i < len; ++i) {
|
||||||
|
const ReadEnds *other = readEndsArr[i];
|
||||||
|
opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
|
||||||
|
}
|
||||||
|
// Now go through and do each pairwise comparison not involving the actualKeeper
|
||||||
|
for (int i = 0; i < len; ++i) {
|
||||||
|
const ReadEnds *lhs = readEndsArr[i];
|
||||||
|
if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
|
||||||
|
continue;
|
||||||
|
for (int j = i + 1; j < len; ++j) {
|
||||||
|
const ReadEnds *rhs = readEndsArr[j];
|
||||||
|
if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
|
||||||
|
continue;
|
||||||
|
if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
|
||||||
|
continue; // both already marked, no need to check
|
||||||
|
if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
|
||||||
|
// At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
|
||||||
|
// as a duplicate of the keeper (but not both - that's checked above), so be careful about which
|
||||||
|
// one to now mark as a duplicate.
|
||||||
|
int index = opticalDuplicateFlags[j] ? i : j;
|
||||||
|
opticalDuplicateFlags[index] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Looks through the set of reads and identifies how many of the duplicates are
|
||||||
|
* in fact optical duplicates, and stores the data in the instance level histogram.
|
||||||
|
*
|
||||||
|
* We expect only reads with FR or RF orientations, not a mixture of both.
|
||||||
|
*
|
||||||
|
* In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
|
||||||
|
* of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
|
||||||
|
* when we order orientation by the first mate sequenced (read #1 of the pair).
|
||||||
|
*/
|
||||||
|
static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||||
|
vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
|
||||||
|
// find OpticalDuplicates
|
||||||
|
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
|
||||||
|
int opticalDuplicates = 0;
|
||||||
|
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
|
||||||
|
if (opticalDuplicateFlags[i]) {
|
||||||
|
++opticalDuplicates;
|
||||||
|
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
||||||
|
pRe->isOpticalDuplicate = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (opticalDuplicates > 0)
|
||||||
|
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
|
||||||
|
return opticalDuplicates;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 记录光学原因造成的冗余
|
||||||
|
*/
|
||||||
|
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||||
|
bool hasFR = false, hasRF = false;
|
||||||
|
// Check to see if we have a mixture of FR/RF
|
||||||
|
for (auto pRe : readEndsArr) {
|
||||||
|
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||||
|
hasFR = true;
|
||||||
|
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||||
|
hasRF = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we need to partition since the orientations could have changed
|
||||||
|
int nOpticalDup;
|
||||||
|
if (hasFR && hasRF) { // need to track them independently
|
||||||
|
vector<const ReadEnds *> trackOpticalDuplicatesF;
|
||||||
|
vector<const ReadEnds *> trackOpticalDuplicatesR;
|
||||||
|
// Split into two lists: first of pairs and second of pairs,
|
||||||
|
// since they must have orientation and same starting end
|
||||||
|
for (auto pRe: readEndsArr) {
|
||||||
|
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||||
|
trackOpticalDuplicatesF.push_back(pRe);
|
||||||
|
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||||
|
trackOpticalDuplicatesR.push_back(pRe);
|
||||||
|
else
|
||||||
|
cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
|
||||||
|
}
|
||||||
|
// track the duplicates
|
||||||
|
int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
|
||||||
|
int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
|
||||||
|
nOpticalDup = nOpticalDupF + nOpticalDupR;
|
||||||
|
} else { // No need to partition
|
||||||
|
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
|
||||||
|
}
|
||||||
|
|
||||||
|
// trackDuplicateCounts
|
||||||
|
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
|
||||||
|
if (readEndsArr.size() > nOpticalDup)
|
||||||
|
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
|
||||||
|
if (nOpticalDup)
|
||||||
|
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
|
||||||
|
}
|
||||||
|
|
@ -1,25 +1,38 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
#include <robin-map/include/tsl/robin_map.h>
|
#include <robin-map/include/tsl/robin_map.h>
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <queue>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
using std::priority_queue;
|
||||||
|
using std::unordered_map;
|
||||||
|
using std::unordered_set;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
/* 前向声明 */
|
/* 前向声明 */
|
||||||
|
class BamWrap;
|
||||||
|
class ReadEnds;
|
||||||
|
class ReadNameParser;
|
||||||
|
|
||||||
/* 存放readend或者冗余idx,避免频繁的开辟和释放内存 */
|
/* 存放readend或者冗余idx,避免频繁的开辟和释放内存 */
|
||||||
template <class T>
|
template <class T>
|
||||||
struct DupContainer
|
struct DupContainer {
|
||||||
{
|
|
||||||
vector<vector<T>> arr; // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
|
vector<vector<T>> arr; // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
|
||||||
vector<int64_t> pos; // arr中每个元素对应的position
|
vector<int64_t> pos; // arr中每个元素对应的position
|
||||||
// unordered_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
// unordered_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
||||||
tsl::robin_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
tsl::robin_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
||||||
int64_t size = 0; // 实际使用的空间
|
int64_t size = 0; // 实际使用的空间
|
||||||
int64_t capacity = 0; // 内存容量
|
int64_t capacity = 0; // 内存容量
|
||||||
inline void Init()
|
inline void Init() {
|
||||||
{
|
|
||||||
idx.clear();
|
idx.clear();
|
||||||
size = 0;
|
size = 0;
|
||||||
}
|
}
|
||||||
inline void SortAtPos(int64_t p) // 这里的pos表示位点
|
inline void SortAtPos(int64_t p) // 这里的pos表示位点
|
||||||
{
|
{
|
||||||
if (idx.find(p) != idx.end())
|
if (idx.find(p) != idx.end()) {
|
||||||
{
|
|
||||||
const int64_t i = idx.at(p);
|
const int64_t i = idx.at(p);
|
||||||
std::sort(arr[i].begin(), arr[i].end());
|
std::sort(arr[i].begin(), arr[i].end());
|
||||||
}
|
}
|
||||||
|
|
@ -28,10 +41,8 @@ struct DupContainer
|
||||||
{
|
{
|
||||||
std::sort(arr[i].begin(), arr[i].end());
|
std::sort(arr[i].begin(), arr[i].end());
|
||||||
}
|
}
|
||||||
inline void RemoveAtPos(int64_t p)
|
inline void RemoveAtPos(int64_t p) {
|
||||||
{
|
if (idx.find(p) != idx.end()) {
|
||||||
if (idx.find(p) != idx.end())
|
|
||||||
{
|
|
||||||
const int64_t i = idx.at(p);
|
const int64_t i = idx.at(p);
|
||||||
arr[i].clear();
|
arr[i].clear();
|
||||||
}
|
}
|
||||||
|
|
@ -40,21 +51,14 @@ struct DupContainer
|
||||||
{
|
{
|
||||||
arr[i].clear();
|
arr[i].clear();
|
||||||
}
|
}
|
||||||
inline void AddAtPos(int64_t p, T &val)
|
inline void AddAtPos(int64_t p, T &val) { AtPos(p).push_back(val); }
|
||||||
{
|
|
||||||
AtPos(p).push_back(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline vector<T> &AtPos(int64_t p)
|
inline vector<T> &AtPos(int64_t p) {
|
||||||
{
|
if (idx.find(p) != idx.end()) {
|
||||||
if (idx.find(p) != idx.end())
|
|
||||||
{
|
|
||||||
const int64_t i = idx.at(p);
|
const int64_t i = idx.at(p);
|
||||||
return arr[i];
|
return arr[i];
|
||||||
}
|
}
|
||||||
|
if (size >= capacity) {
|
||||||
if (size >= capacity)
|
|
||||||
{
|
|
||||||
capacity += 1;
|
capacity += 1;
|
||||||
arr.push_back(vector<T>());
|
arr.push_back(vector<T>());
|
||||||
pos.push_back(0);
|
pos.push_back(0);
|
||||||
|
|
@ -67,308 +71,131 @@ struct DupContainer
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 清除key位置的数据 */
|
|
||||||
static inline void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
|
|
||||||
{
|
|
||||||
auto &msIdx = *pmsIdx;
|
|
||||||
if (msIdx.find(key) != msIdx.end())
|
|
||||||
msIdx[key].clear(); // 清除该位点的冗余结果
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 删除key位置的数据 */
|
/*
|
||||||
static inline void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
|
* 优先队列,用最小堆来实现对所有冗余索引的排序
|
||||||
{
|
*/
|
||||||
auto &msIdx = *pmsIdx;
|
struct PairArrIdIdx {
|
||||||
if (msIdx.find(key) != msIdx.end())
|
int arrId = 0;
|
||||||
msIdx.erase(key);
|
uint64_t arrIdx = 0;
|
||||||
}
|
int64_t dupIdx = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct IdxGreaterThan {
|
||||||
|
bool operator()(const PairArrIdIdx &a, const PairArrIdIdx &b) { return a.dupIdx > b.dupIdx; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DupIdxQueue {
|
||||||
|
// 将冗余索引和他对应的task vector对应起来
|
||||||
|
|
||||||
|
// 由于是多个task来查找冗余,所以每次找到的冗余index都放在一个独立的vector中,vector之间可能有重叠,所以需要用一个最小堆来维护
|
||||||
|
vector<vector<int64_t>> *dupIdx2DArr;
|
||||||
|
priority_queue<PairArrIdIdx, vector<PairArrIdIdx>, IdxGreaterThan> minHeap;
|
||||||
|
uint64_t popNum = 0;
|
||||||
|
|
||||||
|
int Init(vector<vector<int64_t>> *_dupIdx2DArr) {
|
||||||
|
dupIdx2DArr = _dupIdx2DArr;
|
||||||
|
if (dupIdx2DArr == nullptr) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < dupIdx2DArr->size(); ++i) {
|
||||||
|
auto &v = (*dupIdx2DArr)[i];
|
||||||
|
if (!v.empty()) {
|
||||||
|
minHeap.push({i, 1, v[0]});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t Pop() {
|
||||||
|
int64_t ret = -1;
|
||||||
|
if (!minHeap.empty()) {
|
||||||
|
auto idx = minHeap.top();
|
||||||
|
minHeap.pop();
|
||||||
|
++popNum;
|
||||||
|
ret = idx.dupIdx;
|
||||||
|
auto &v = (*dupIdx2DArr)[idx.arrId];
|
||||||
|
if (v.size() > idx.arrIdx) {
|
||||||
|
minHeap.push({idx.arrId, idx.arrIdx + 1, v[idx.arrIdx]});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t Size() {
|
||||||
|
uint64_t len = 0;
|
||||||
|
if (dupIdx2DArr != nullptr) {
|
||||||
|
for (auto &v : *dupIdx2DArr) {
|
||||||
|
len += v.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return len - popNum;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Graph used to detect optical duplication.
 *
 * Nodes are stored by value in `nodes`; the adjacency list is keyed by the
 * address of the stored node. (Open question from the original author:
 * should an ordered set be used instead?)
 */
template <class Node>
struct Graph {
    unordered_set<Node> nodes;                              // nodes of the graph
    unordered_map<Node *, unordered_set<Node *>> neighbors; // adjacency list

    // Insert `singleton` if it is not present yet and return a pointer to the
    // stored node. A newly inserted node gets an empty neighbour set; an
    // existing node keeps its neighbours untouched.
    Node *addNode(const Node &singleton) {
        // A single insert() both looks the node up and adds it when missing,
        // replacing the find + insert + find sequence.
        const auto result = nodes.insert(singleton);
        // Elements of an unordered_set are const; the cast only exists so the
        // address can serve as a key/value in `neighbors`.
        Node *n = const_cast<Node *>(&(*result.first));
        if (result.second) {
            neighbors[n].clear();
        }
        return n;
    }
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 计算read的分数
|
* 计算read的分数
|
||||||
*/
|
*/
|
||||||
static int16_t computeDuplicateScore(BamWrap &bw)
|
int16_t computeDuplicateScore(BamWrap &bw);
|
||||||
{
|
|
||||||
int16_t score = 0;
|
|
||||||
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
|
|
||||||
{
|
|
||||||
case ns_md::SUM_OF_BASE_QUALITIES:
|
|
||||||
// two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
|
|
||||||
// and risk overflow.
|
|
||||||
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
|
||||||
break;
|
|
||||||
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
|
|
||||||
if (!bw.GetReadUnmappedFlag())
|
|
||||||
// no need to remember the score since this scoring mechanism is symmetric
|
|
||||||
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
|
||||||
break;
|
|
||||||
case ns_md::RANDOM:
|
|
||||||
// The RANDOM score gives the same score to both reads so that they get filtered together.
|
|
||||||
// it's not critical do use the readName since the scores from both ends get added, but it seem
|
|
||||||
// to be clearer this way.
|
|
||||||
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
|
||||||
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
|
||||||
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
|
|
||||||
// not passing filters. We need to stay far from overflow so that when we add the two
|
|
||||||
// scores from the two read mates we do not overflow since that could cause us to chose a
|
|
||||||
// failing read-pair instead of a passing one.
|
|
||||||
score -= INT16_MIN / 4;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
|
|
||||||
// for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
|
|
||||||
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
|
||||||
|
|
||||||
return score;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Builds a read ends object that represents a single read. 用来表示一个read的特征结构
|
* Builds a read ends object that represents a single read.
|
||||||
|
* 用来表示一个read的特征结构
|
||||||
*/
|
*/
|
||||||
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
|
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
|
||||||
{
|
|
||||||
auto &k = *pKey;
|
|
||||||
auto &bc = bw.b->core;
|
|
||||||
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
|
|
||||||
k.read1ReferenceIndex = bc.tid;
|
|
||||||
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
|
||||||
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
|
||||||
|
|
||||||
k.read1IndexInFile = index;
|
/*
|
||||||
k.score = computeDuplicateScore(bw);
|
* 处理一组pairend的readends,标记冗余
|
||||||
// Doing this lets the ends object know that it's part of a pair
|
*/
|
||||||
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
|
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
||||||
{
|
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
||||||
k.read2ReferenceIndex = bc.mtid;
|
|
||||||
}
|
/*
|
||||||
// Fill in the location information for optical duplicates
|
* 处理一组非paired的readends,标记冗余
|
||||||
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
*/
|
||||||
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||||
// 计算位置key
|
DupContainer<int64_t> *dupIdx);
|
||||||
k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
|
||||||
}
|
/*
|
||||||
|
* 处理位于某个坐标的pairend reads
|
||||||
|
*/
|
||||||
|
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
||||||
|
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 处理位于某个坐标的非配对的frag reads
|
||||||
|
*/
|
||||||
|
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
||||||
|
DupContainer<int64_t> *dupIdx);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 对找到的pairend read end添加一些信息
|
||||||
|
*/
|
||||||
|
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
|
* Looks through the set of reads and identifies how many of the duplicates are
|
||||||
* quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
|
* in fact optical duplicates, and stores the data in the instance level histogram.
|
||||||
* read to the records of the first and second in a pair. This value becomes is used for
|
* Additionally sets the transient isOpticalDuplicate flag on each read end that is
|
||||||
* the 'DI' tag.
|
* identified as an optical duplicate.
|
||||||
|
* 记录光学原因造成的冗余
|
||||||
*/
|
*/
|
||||||
static void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe)
|
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe);
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理一组pairend的readends,标记冗余 */
|
|
||||||
static void markDuplicatePairs(int64_t posKey,
|
|
||||||
vector<const ReadEnds *> &vpRe,
|
|
||||||
DupContainer<int64_t> *dupIdx,
|
|
||||||
DupContainer<int64_t> *opticalDupIdx)
|
|
||||||
{
|
|
||||||
if (vpRe.size() < 2)
|
|
||||||
{
|
|
||||||
if (vpRe.size() == 1)
|
|
||||||
{
|
|
||||||
// addSingletonToCount(libraryIdGenerator);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
|
|
||||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
|
||||||
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
|
|
||||||
|
|
||||||
int maxScore = 0;
|
|
||||||
const ReadEnds *pBest = nullptr;
|
|
||||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
|
||||||
for (auto pe : vpRe) // 找分数最高的readend
|
|
||||||
{
|
|
||||||
if (pe->score > maxScore || pBest == nullptr)
|
|
||||||
{
|
|
||||||
maxScore = pe->score;
|
|
||||||
pBest = pe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
|
||||||
{
|
|
||||||
// trackOpticalDuplicates
|
|
||||||
}
|
|
||||||
for (auto pe : vpRe) // 对非best read标记冗余
|
|
||||||
{
|
|
||||||
if (pe != pBest) // 非best
|
|
||||||
{
|
|
||||||
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
|
|
||||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
|
||||||
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
|
||||||
{
|
|
||||||
addRepresentativeReadIndex(vpRe);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理一组非paired的readends,标记冗余 */
|
|
||||||
static void markDuplicateFragments(int64_t posKey,
|
|
||||||
vector<const ReadEnds *> &vpRe,
|
|
||||||
bool containsPairs,
|
|
||||||
DupContainer<int64_t> *dupIdx)
|
|
||||||
{
|
|
||||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
|
||||||
|
|
||||||
if (containsPairs)
|
|
||||||
{
|
|
||||||
for (auto pe : vpRe)
|
|
||||||
{
|
|
||||||
if (!pe->IsPaired())
|
|
||||||
{
|
|
||||||
vDupIdx.push_back(pe->read1IndexInFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
int maxScore = 0;
|
|
||||||
const ReadEnds *pBest = nullptr;
|
|
||||||
for (auto pe : vpRe)
|
|
||||||
{
|
|
||||||
if (pe->score > maxScore || pBest == nullptr)
|
|
||||||
{
|
|
||||||
maxScore = pe->score;
|
|
||||||
pBest = pe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto pe : vpRe)
|
|
||||||
{
|
|
||||||
if (pe != pBest)
|
|
||||||
{
|
|
||||||
vDupIdx.push_back(pe->read1IndexInFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理位于某个坐标的pairend reads */
|
|
||||||
static inline void handlePairs(int64_t posKey,
|
|
||||||
vector<ReadEnds> &readEnds,
|
|
||||||
vector<const ReadEnds *> &vpCache,
|
|
||||||
DupContainer<int64_t> *dupIdx,
|
|
||||||
DupContainer<int64_t> *opticalDupIdx)
|
|
||||||
{
|
|
||||||
if (readEnds.size() > 1) // 有潜在的冗余
|
|
||||||
{
|
|
||||||
vpCache.clear();
|
|
||||||
// std::sort(readEnds.begin(), readEnds.end());
|
|
||||||
const ReadEnds *pReadEnd = nullptr;
|
|
||||||
for (auto &re : readEnds)
|
|
||||||
{
|
|
||||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
|
||||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
|
||||||
else
|
|
||||||
{
|
|
||||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); // 不一样
|
|
||||||
vpCache.clear();
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
pReadEnd = &re;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理位于某个坐标的 reads */
|
|
||||||
static inline void handleFrags(
|
|
||||||
int64_t posKey,
|
|
||||||
vector<ReadEnds> &readEnds,
|
|
||||||
vector<const ReadEnds *> &vpCache,
|
|
||||||
DupContainer<int64_t> *dupIdx)
|
|
||||||
{
|
|
||||||
if (readEnds.size() > 1) // 有潜在的冗余
|
|
||||||
{
|
|
||||||
vpCache.clear();
|
|
||||||
// std::sort(readEnds.begin(), readEnds.end());
|
|
||||||
const ReadEnds *pReadEnd = nullptr;
|
|
||||||
bool containsPairs = false;
|
|
||||||
bool containsFrags = false;
|
|
||||||
for (auto &re : readEnds)
|
|
||||||
{
|
|
||||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
|
||||||
{
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
containsPairs = containsPairs || re.IsPaired();
|
|
||||||
containsFrags = containsFrags || !re.IsPaired();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (vpCache.size() > 1 && containsFrags)
|
|
||||||
{
|
|
||||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
|
||||||
}
|
|
||||||
vpCache.clear();
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
pReadEnd = &re;
|
|
||||||
containsPairs = re.IsPaired();
|
|
||||||
containsFrags = !re.IsPaired();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (vpCache.size() > 1 && containsFrags)
|
|
||||||
{
|
|
||||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 对找到的pairend read end添加一些信息 */
|
|
||||||
static inline void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds)
|
|
||||||
{
|
|
||||||
auto &pairedEnds = *pPairedEnds;
|
|
||||||
int64_t bamIdx = fragEnd.read1IndexInFile;
|
|
||||||
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
|
||||||
const int matesCoordinate = fragEnd.read1Coordinate;
|
|
||||||
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
|
|
||||||
// before updating the orientation later.
|
|
||||||
if (fragEnd.read1FirstOfPair)
|
|
||||||
{
|
|
||||||
pairedEnds.orientationForOpticalDuplicates =
|
|
||||||
ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
pairedEnds.orientationForOpticalDuplicates =
|
|
||||||
ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
|
|
||||||
}
|
|
||||||
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
|
|
||||||
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
|
|
||||||
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
|
|
||||||
{
|
|
||||||
pairedEnds.read2ReferenceIndex = matesRefIndex;
|
|
||||||
pairedEnds.read2Coordinate = matesCoordinate;
|
|
||||||
pairedEnds.read2IndexInFile = bamIdx;
|
|
||||||
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
|
|
||||||
fragEnd.IsNegativeStrand());
|
|
||||||
|
|
||||||
// if the two read ends are in the same position, pointing in opposite directions,
|
|
||||||
// the orientation is undefined and the procedure above
|
|
||||||
// will depend on the order of the reads in the file.
|
|
||||||
// To avoid this, we set it explicitly (to FR):
|
|
||||||
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
|
||||||
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
|
||||||
pairedEnds.orientation == ReadEnds::RF)
|
|
||||||
{
|
|
||||||
pairedEnds.orientation = ReadEnds::FR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
|
||||||
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
|
||||||
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
|
|
||||||
pairedEnds.read1ReferenceIndex = matesRefIndex;
|
|
||||||
pairedEnds.read1Coordinate = matesCoordinate;
|
|
||||||
pairedEnds.read1IndexInFile = bamIdx;
|
|
||||||
pairedEnds.orientation = ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(),
|
|
||||||
pairedEnds.orientation == ReadEnds::R);
|
|
||||||
pairedEnds.posKey = fragEnd.posKey;
|
|
||||||
}
|
|
||||||
pairedEnds.score += fragEnd.score;
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,783 @@
|
||||||
|
#include "serial_md.h"
|
||||||
|
|
||||||
|
#include <common/hts/bam_buf.h>
|
||||||
|
#include <common/utils/debug.h>
|
||||||
|
#include <common/utils/global_arg.h>
|
||||||
|
#include <common/utils/profiling.h>
|
||||||
|
#include <common/utils/timer.h>
|
||||||
|
#include <common/utils/util.h>
|
||||||
|
#include <sam/utils/read_ends.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <set>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "markdups_arg.h"
|
||||||
|
#include "md_funcs.h"
|
||||||
|
#include "shared_args.h"
|
||||||
|
#include "dup_metrics.h"
|
||||||
|
|
||||||
|
using std::cout;
|
||||||
|
using std::set;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
/* 查找 */
|
||||||
|
// template<class Itr, class T>
|
||||||
|
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
|
||||||
|
// {
|
||||||
|
// first = std::lower_bound(first, last, val);
|
||||||
|
// return (first != last && *first == val) ? first : last;
|
||||||
|
// }
|
||||||
|
|
||||||
|
/* 排序 */
|
||||||
|
static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
|
||||||
|
size_t blockSize = 64 * 1024;
|
||||||
|
if (arr.size() < blockSize) {
|
||||||
|
std::sort(arr.begin(), arr.end());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
|
||||||
|
size_t crossNum = 1024;
|
||||||
|
size_t start, end, i, left, right;
|
||||||
|
std::sort(arr.begin(), arr.begin() + blockSize);
|
||||||
|
for (i = 1; i < blockNum; ++i) {
|
||||||
|
start = i * blockSize;
|
||||||
|
end = min(start + blockSize, arr.size());
|
||||||
|
std::sort(arr.begin() + start, arr.begin() + end);
|
||||||
|
left = crossNum;
|
||||||
|
while (!(arr[start - left] < arr[start])) {
|
||||||
|
left = left << 1;
|
||||||
|
if (left >= blockSize) {
|
||||||
|
std::sort(arr.begin(), arr.end()); // 退化到普通排序
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
right = min(crossNum, end - start - 1);
|
||||||
|
|
||||||
|
while (!(arr[start - 1] < arr[start + right])) {
|
||||||
|
right = min(right << 1, end - start - 1);
|
||||||
|
if (right == end - start - 1)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
|
||||||
|
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
|
||||||
|
set<int64_t> *notDupIdx = nullptr) {
|
||||||
|
if (vpRe.size() < 2) {
|
||||||
|
if (vpRe.size() == 1) {
|
||||||
|
// addSingletonToCount(libraryIdGenerator);
|
||||||
|
// 这个统计可能会有误差,因为当前位点可能还有没匹配上的read,导致当前位点的read(paired)数量为1
|
||||||
|
// 可以通过后续的补充计算来解决这个问题,有必要么?
|
||||||
|
gMetrics.AddSingletonToCount();
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int maxScore = 0;
|
||||||
|
const ReadEnds *pBest = nullptr;
|
||||||
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||||
|
for (auto pe : vpRe) { // 找分数最高的readend
|
||||||
|
if (pe->score > maxScore || pBest == nullptr) {
|
||||||
|
maxScore = pe->score;
|
||||||
|
pBest = pe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (notDupIdx != nullptr) {
|
||||||
|
notDupIdx->insert(pBest->read1IndexInFile);
|
||||||
|
notDupIdx->insert(pBest->read2IndexInFile);
|
||||||
|
}
|
||||||
|
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
|
||||||
|
// trackOpticalDuplicates
|
||||||
|
trackOpticalDuplicates(vpRe, pBest);
|
||||||
|
}
|
||||||
|
for (auto pe : vpRe) // 对非best read标记冗余
|
||||||
|
{
|
||||||
|
if (pe != pBest) // 非best
|
||||||
|
{
|
||||||
|
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
||||||
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||||
|
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 在输出的bam文件中添加tag
|
||||||
|
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
|
||||||
|
// addRepresentativeReadIndex(vpRe); // 每次都更新就行,用最新的覆盖之前的(如果之前有)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理一组非paired的readends,标记冗余 */
|
||||||
|
static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs, set<int64_t> *dupIdx,
|
||||||
|
set<int64_t> *notDupIdx = nullptr) {
|
||||||
|
if (containsPairs) {
|
||||||
|
for (auto pe : vpRe) {
|
||||||
|
if (!pe->IsPaired()) {
|
||||||
|
dupIdx->insert(pe->read1IndexInFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int maxScore = 0;
|
||||||
|
const ReadEnds *pBest = nullptr;
|
||||||
|
for (auto pe : vpRe) {
|
||||||
|
if (pe->score > maxScore || pBest == nullptr) {
|
||||||
|
maxScore = pe->score;
|
||||||
|
pBest = pe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (notDupIdx != nullptr) {
|
||||||
|
notDupIdx->insert(pBest->read1IndexInFile);
|
||||||
|
}
|
||||||
|
for (auto pe : vpRe) {
|
||||||
|
if (pe != pBest) {
|
||||||
|
dupIdx->insert(pe->read1IndexInFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 找到与readend pos相等的所有readend */
|
||||||
|
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
||||||
|
auto range = std::equal_range(src.begin(), src.end(), re,
|
||||||
|
ReadEnds::PairsLittleThan); // 只比对位点
|
||||||
|
dst->insert(dst->end(), range.first, range.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 单线程生成readends (第一步)*/
|
||||||
|
static void generateReadEnds(SerailMarkDupArg *arg) {
|
||||||
|
auto &p = *arg;
|
||||||
|
auto &rnParser = g_vRnParser[0];
|
||||||
|
|
||||||
|
p.pairs.clear();
|
||||||
|
p.frags.clear();
|
||||||
|
p.unpairedDic.clear();
|
||||||
|
p.unpairedPosArr.clear();
|
||||||
|
|
||||||
|
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
||||||
|
set<ReadEnds> reSet;
|
||||||
|
|
||||||
|
ReadEnds lastRe;
|
||||||
|
|
||||||
|
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
|
||||||
|
{
|
||||||
|
BamWrap *bw = p.bams[i];
|
||||||
|
const int64_t bamIdx = p.bamStartIdx + i;
|
||||||
|
if (bw->GetReadUnmappedFlag()) {
|
||||||
|
if (bw->b->core.tid == -1)
|
||||||
|
// When we hit the unmapped reads with no coordinate, no reason
|
||||||
|
// to continue (only in coordinate sort).
|
||||||
|
break;
|
||||||
|
} else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
||||||
|
{
|
||||||
|
ReadEnds fragEnd;
|
||||||
|
tm_arr[8].acc_start();
|
||||||
|
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
||||||
|
tm_arr[8].acc_end();
|
||||||
|
p.frags.push_back(fragEnd); // 添加进frag集合
|
||||||
|
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
||||||
|
{
|
||||||
|
string key = bw->query_name();
|
||||||
|
if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
|
||||||
|
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
||||||
|
} else // 找到了pairend
|
||||||
|
{
|
||||||
|
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
||||||
|
modifyPairedEnds(fragEnd, &pairedEnds);
|
||||||
|
p.pairs.push_back(pairedEnds);
|
||||||
|
p.unpairedDic.erase(key); // 删除找到的pairend
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tm_arr[9].acc_start();
|
||||||
|
sortReadEndsArr(p.frags);
|
||||||
|
// sort(p.frags.begin(), p.frags.end());
|
||||||
|
tm_arr[9].acc_end();
|
||||||
|
// cout << "sort pairs" << endl;
|
||||||
|
tm_arr[10].acc_start();
|
||||||
|
sort(p.pairs.begin(), p.pairs.end());
|
||||||
|
tm_arr[10].acc_end();
|
||||||
|
// 记录位点上的未匹配的read个数
|
||||||
|
for (auto &e : p.unpairedDic) {
|
||||||
|
auto posKey = e.second.unpairedRE.posKey;
|
||||||
|
auto &unpairArrInfo = p.unpairedPosArr[posKey];
|
||||||
|
unpairArrInfo.unpairedNum++;
|
||||||
|
unpairArrInfo.taskSeq = p.taskSeq;
|
||||||
|
unpairArrInfo.readNameSet.insert(e.first);
|
||||||
|
}
|
||||||
|
// cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() <<
|
||||||
|
// endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理pairs */
|
||||||
|
static void processPairs(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
|
||||||
|
set<int64_t> *notDupIdx = nullptr) {
|
||||||
|
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
||||||
|
const ReadEnds *pReadEnd = nullptr;
|
||||||
|
for (auto &re : readEnds) {
|
||||||
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||||
|
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||||
|
else {
|
||||||
|
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
|
||||||
|
vpCache.clear();
|
||||||
|
vpCache.push_back(&re);
|
||||||
|
pReadEnd = &re;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理frags */
|
||||||
|
static void processFrags(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *notDupIdx = nullptr) {
|
||||||
|
bool containsPairs = false;
|
||||||
|
bool containsFrags = false;
|
||||||
|
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
||||||
|
const ReadEnds *pReadEnd = nullptr;
|
||||||
|
for (auto &re : readEnds) {
|
||||||
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
|
||||||
|
vpCache.push_back(&re);
|
||||||
|
containsPairs = containsPairs || re.IsPaired();
|
||||||
|
containsFrags = containsFrags || !re.IsPaired();
|
||||||
|
} else {
|
||||||
|
if (vpCache.size() > 1 && containsFrags) {
|
||||||
|
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
||||||
|
}
|
||||||
|
vpCache.clear();
|
||||||
|
vpCache.push_back(&re);
|
||||||
|
pReadEnd = &re;
|
||||||
|
containsPairs = re.IsPaired();
|
||||||
|
containsFrags = !re.IsPaired();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (vpCache.size() > 1 && containsFrags) {
|
||||||
|
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 单线程markdup (第二步)*/
|
||||||
|
static void markdups(SerailMarkDupArg *arg) {
|
||||||
|
auto &p = *arg;
|
||||||
|
p.pairDupIdx.clear();
|
||||||
|
p.pairOpticalDupIdx.clear();
|
||||||
|
p.fragDupIdx.clear();
|
||||||
|
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
||||||
|
// 先处理 pair
|
||||||
|
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
|
||||||
|
|
||||||
|
// 再处理frag
|
||||||
|
processFrags(p.frags, &p.fragDupIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 获取交叉部分的数据 */
|
||||||
|
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
|
||||||
|
bool isPairCmp = false) {
|
||||||
|
if (leftArr.empty() || rightArr.empty()) {
|
||||||
|
cout << "bad size: " << leftArr.size() << '\t' << rightArr.size() << endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const size_t leftEndIdx = leftArr.size() - 1;
|
||||||
|
const size_t rightStartIdx = 0;
|
||||||
|
size_t leftSpan = 0;
|
||||||
|
size_t rightSpan = 0;
|
||||||
|
|
||||||
|
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
|
||||||
|
leftSpan += 1;
|
||||||
|
if (leftSpan > leftEndIdx) {
|
||||||
|
leftSpan = leftArr.size() - 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
|
||||||
|
rightSpan += 1;
|
||||||
|
if (rightSpan == rightArr.size() - 1)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
||||||
|
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
||||||
|
std::sort(dst->begin(), dst->end());
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 将frags重叠部分的dup idx放进数据中 */
|
||||||
|
static inline void refreshFragDupIdx(set<int64_t> &dupIdx, set<int64_t> ¬DupIdx, SerailMarkDupArg *lastArg,
|
||||||
|
SerailMarkDupArg *curArg) {
|
||||||
|
auto &lp = *lastArg;
|
||||||
|
auto &p = *curArg;
|
||||||
|
for (auto idx : dupIdx) {
|
||||||
|
lp.fragDupIdx.insert(idx);
|
||||||
|
p.fragDupIdx.erase(idx);
|
||||||
|
}
|
||||||
|
for (auto idx : notDupIdx) {
|
||||||
|
lp.fragDupIdx.erase(idx);
|
||||||
|
p.fragDupIdx.erase(idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 将pairs重叠部分的dup idx放进数据中 */
|
||||||
|
static inline void refreshPairDupIdx(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> ¬DupIdx,
|
||||||
|
SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
|
||||||
|
auto &lp = *lastArg;
|
||||||
|
auto &p = *curArg;
|
||||||
|
for (auto idx : dupIdx) {
|
||||||
|
lp.pairDupIdx.insert(idx);
|
||||||
|
p.pairDupIdx.erase(idx);
|
||||||
|
}
|
||||||
|
for (auto idx : opticalDupIdx) {
|
||||||
|
lp.pairOpticalDupIdx.insert(idx);
|
||||||
|
p.pairOpticalDupIdx.erase(idx);
|
||||||
|
}
|
||||||
|
for (auto idx : notDupIdx) {
|
||||||
|
lp.pairDupIdx.erase(idx);
|
||||||
|
lp.pairOpticalDupIdx.erase(idx);
|
||||||
|
p.pairDupIdx.erase(idx);
|
||||||
|
p.pairOpticalDupIdx.erase(idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Merge one batch of recomputed results into the per-task "latter" sets.
 * Duplicate, optical-duplicate and not-duplicate indexes are handled separately;
 * each source set is added to its matching destination set and left unchanged.
 * A source set must not be the same object as its destination set.
 */
static void refeshTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> &notDupIdx,
                              set<int64_t> &latterDupIdx, set<int64_t> &latterOpticalDupIdx,
                              set<int64_t> &latterNotDupIdx) {
    latterDupIdx.insert(dupIdx.begin(), dupIdx.end());
    latterOpticalDupIdx.insert(opticalDupIdx.begin(), opticalDupIdx.end());
    latterNotDupIdx.insert(notDupIdx.begin(), notDupIdx.end());
}
|
||||||
|
|
||||||
|
/*
 * Final merge for one task: combine the ascending vector dupArr with the set
 * dupIdx into one ascending sequence, emit a value present in both only once,
 * and drop every value found in notDupIdx. The result replaces dupArr.
 *
 * dupIdx     additional duplicate indexes
 * notDupIdx  indexes that must not be reported as duplicates
 * dupArr     in/out; must already be in ascending order for the merge to be correct
 */
static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &notDupIdx, vector<int64_t> &dupArr) {
    vector<int64_t> midArr;
    midArr.reserve(dupArr.size() + dupIdx.size());

    // Append val unless it has been declared "not a duplicate".
    auto keepIfDup = [&](int64_t val) {
        if (notDupIdx.find(val) == notDupIdx.end()) {
            midArr.push_back(val);
        }
    };

    auto ai = dupArr.begin();
    auto bi = dupIdx.begin();
    const auto ae = dupArr.end();
    const auto be = dupIdx.end();

    // Classic two-way merge; equal heads are consumed together.
    while (ai != ae && bi != be) {
        if (*ai < *bi) {
            keepIfDup(*ai);
            ++ai;
        } else if (*bi < *ai) {
            keepIfDup(*bi);
            ++bi;
        } else {
            keepIfDup(*ai);
            ++ai;
            ++bi;
        }
    }
    for (; ai != ae; ++ai) keepIfDup(*ai);
    for (; bi != be; ++bi) keepIfDup(*bi);

    // Hand the buffer over instead of copying it.
    dupArr.swap(midArr);
}
|
||||||
|
|
||||||
|
/* 处理相邻的两个任务,有相交叉的数据 */
|
||||||
|
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) {
|
||||||
|
auto &lp = *lastArg;
|
||||||
|
auto &p = *curArg;
|
||||||
|
auto &g = *gDataArg;
|
||||||
|
|
||||||
|
vector<ReadEnds> reArr;
|
||||||
|
set<int64_t> dupIdx;
|
||||||
|
set<int64_t> notDupIdx;
|
||||||
|
// 先处理重叠的frags
|
||||||
|
getIntersectData(lp.frags, p.frags, &reArr);
|
||||||
|
processFrags(reArr, &dupIdx, ¬DupIdx);
|
||||||
|
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
|
||||||
|
|
||||||
|
// 再处理重叠的pairs
|
||||||
|
reArr.clear();
|
||||||
|
dupIdx.clear();
|
||||||
|
notDupIdx.clear();
|
||||||
|
set<int64_t> opticalDupIdx;
|
||||||
|
getIntersectData(lp.pairs, p.pairs, &reArr, true);
|
||||||
|
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
||||||
|
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
||||||
|
|
||||||
|
// 处理之前未匹配的部分
|
||||||
|
map<CalcKey, int64_t> recalcPos;
|
||||||
|
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
|
||||||
|
set<int64_t> addToGlobal;
|
||||||
|
int64_t prevLastPos = 0, nextFirstPos = 0;
|
||||||
|
if (lp.frags.size() > 0)
|
||||||
|
prevLastPos = lp.frags.back().posKey;
|
||||||
|
if (p.frags.size() > 0)
|
||||||
|
nextFirstPos = p.frags[0].posKey;
|
||||||
|
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
|
||||||
|
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
|
||||||
|
auto &readName = prevUnpair.first;
|
||||||
|
auto &prevPosInfo = prevUnpair.second;
|
||||||
|
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||||
|
|
||||||
|
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) { // 在当前这个任务里找到了这个未匹配的read
|
||||||
|
auto &nextPosInfo = p.unpairedDic[readName];
|
||||||
|
auto &nextFragEnd = nextPosInfo.unpairedRE;
|
||||||
|
int64_t prevPosKey = prevFragEnd.posKey;
|
||||||
|
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||||
|
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
|
||||||
|
CalcKey ck = {prevPosKey, nextPosKey};
|
||||||
|
UnpairedPosInfo *prevUnpairInfoP = nullptr;
|
||||||
|
UnpairedPosInfo *nextUnpairInfoP = nullptr;
|
||||||
|
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
|
||||||
|
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||||
|
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
|
||||||
|
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
|
||||||
|
|
||||||
|
// pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定
|
||||||
|
// 1.
|
||||||
|
// prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
||||||
|
// 2.
|
||||||
|
// prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read
|
||||||
|
// pair进行重新计算
|
||||||
|
// 复杂情况1.
|
||||||
|
// g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||||
|
// 3.
|
||||||
|
// prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read
|
||||||
|
// pair进行重新计算
|
||||||
|
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
||||||
|
// 4.
|
||||||
|
// prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read
|
||||||
|
// pair进行重新计算
|
||||||
|
|
||||||
|
bool addDataToPos = true;
|
||||||
|
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
|
||||||
|
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
||||||
|
} else
|
||||||
|
alreadyAdd.insert(ck);
|
||||||
|
|
||||||
|
if (prevPosKey < nextFirstPos) { // prevpos在交叉部分之前
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (nextPosKey <= prevLastPos && addDataToPos) { // 第二种情况
|
||||||
|
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||||
|
}
|
||||||
|
// 第一种情况,第二种情况下都会出现,复杂情况一
|
||||||
|
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
|
||||||
|
if (gPosInfo != g.unpairedPosArr.end()) { // 可能g和p有匹配的,刚好和该位点一致
|
||||||
|
auto &gUnpairInfo = gPosInfo->second;
|
||||||
|
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
|
||||||
|
if (pPosInfo != p.unpairedPosArr.end()) {
|
||||||
|
auto &pUnpairInfo = pPosInfo->second;
|
||||||
|
for (auto &rn : gUnpairInfo.readNameSet) { // 遍历每一个readname,看是否有匹配的
|
||||||
|
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) {
|
||||||
|
auto pe = g.unpairedDic[rn].unpairedRE;
|
||||||
|
auto fe = p.unpairedDic[rn].unpairedRE;
|
||||||
|
modifyPairedEnds(fe, &pe);
|
||||||
|
prevPairArr.push_back(pe);
|
||||||
|
g.unpairedDic.erase(rn);
|
||||||
|
p.unpairedDic.erase(rn);
|
||||||
|
// cout << "找到了!" << rn << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
} else { // prevpos在交叉部分
|
||||||
|
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
|
||||||
|
if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
||||||
|
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
||||||
|
nextPairArr.push_back(prevFragEnd);
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (addDataToPos) {
|
||||||
|
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||||
|
}
|
||||||
|
// 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
||||||
|
recalcPos[ck] = nextPosInfo.taskSeq;
|
||||||
|
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
} else { // next task在该位点没有unpair,那就把数据放到prev task里
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (addDataToPos) // 第二种情况
|
||||||
|
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||||
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
}
|
||||||
|
} else { // 第四种情况
|
||||||
|
if (prevUnpairInfoP == nullptr) {
|
||||||
|
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||||
|
prevUnpairInfoP->taskSeq = lp.taskSeq;
|
||||||
|
}
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (addDataToPos) {
|
||||||
|
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||||
|
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||||
|
}
|
||||||
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p.unpairedDic.erase(readName); // 在next task里删除该read
|
||||||
|
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
||||||
|
auto &remainPosInfo = g.unpairedDic[readName];
|
||||||
|
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||||
|
int64_t remainPosKey = remainFragEnd.posKey;
|
||||||
|
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||||
|
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||||
|
auto &remainPairArr = remainUnpairInfo.pairArr;
|
||||||
|
remainPairArr.push_back(remainFragEnd);
|
||||||
|
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
|
||||||
|
recalcPos[ck] = remainPosInfo.taskSeq;
|
||||||
|
std::sort(remainPairArr.begin(), remainPairArr.end());
|
||||||
|
|
||||||
|
g.unpairedDic.erase(readName);
|
||||||
|
} else { // 都没找到,那就保存到遗留数据里
|
||||||
|
int64_t prevPosKey = prevFragEnd.posKey;
|
||||||
|
g.unpairedDic.insert(prevUnpair);
|
||||||
|
addToGlobal.insert(prevPosKey);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||||
|
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||||
|
|
||||||
|
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||||
|
set<int64_t> posProcessed;
|
||||||
|
for (auto &e : recalcPos) {
|
||||||
|
auto posKey = e.first.read1Pos;
|
||||||
|
if (posProcessed.find(posKey) != posProcessed.end())
|
||||||
|
continue;
|
||||||
|
posProcessed.insert(posKey);
|
||||||
|
auto taskSeq = e.second;
|
||||||
|
auto &t = taskChanged[taskSeq];
|
||||||
|
// 在对应的任务包含的dup idx里修改结果数据
|
||||||
|
vector<ReadEnds> *pairArrP = nullptr;
|
||||||
|
if (taskSeq < lp.taskSeq)
|
||||||
|
pairArrP = &g.unpairedPosArr[posKey].pairArr;
|
||||||
|
else
|
||||||
|
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
|
||||||
|
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||||
|
if (taskSeq < lp.taskSeq)
|
||||||
|
g.unpairedPosArr.erase(posKey);
|
||||||
|
}
|
||||||
|
// 更新结果
|
||||||
|
|
||||||
|
for (auto &e : taskChanged) {
|
||||||
|
auto taskSeq = e.first;
|
||||||
|
auto &t = e.second;
|
||||||
|
if (taskSeq < lp.taskSeq) {
|
||||||
|
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
|
||||||
|
g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||||
|
} else if (taskSeq == lp.taskSeq) {
|
||||||
|
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
|
||||||
|
} else {
|
||||||
|
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' <<
|
||||||
|
// g.unpairedPosArr.size() << endl; cout << "calc g time: " <<
|
||||||
|
// t.seconds_elapsed() << " s" << endl; 将dupidx放进全局数据
|
||||||
|
g.latterDupIdxArr.push_back(set<int64_t>());
|
||||||
|
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
|
||||||
|
g.latterNotDupIdxArr.push_back(set<int64_t>());
|
||||||
|
|
||||||
|
g.dupIdxArr.push_back(vector<int64_t>());
|
||||||
|
auto &vIdx = g.dupIdxArr.back();
|
||||||
|
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||||
|
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
||||||
|
|
||||||
|
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
||||||
|
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
||||||
|
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 当所有任务结束后,global data里还有未处理的数据 */
|
||||||
|
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
||||||
|
auto &lp = *task;
|
||||||
|
auto &g = *gDataArg;
|
||||||
|
// 遗留的未匹配的pair
|
||||||
|
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
|
||||||
|
auto &readName = prevUnpair.first;
|
||||||
|
auto &prevPosInfo = prevUnpair.second;
|
||||||
|
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||||
|
|
||||||
|
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
||||||
|
auto &remainPosInfo = g.unpairedDic[readName];
|
||||||
|
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||||
|
int64_t remainPosKey = remainFragEnd.posKey;
|
||||||
|
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||||
|
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||||
|
|
||||||
|
remainUnpairInfo.pairArr.push_back(remainFragEnd);
|
||||||
|
g.unpairedDic.erase(readName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||||
|
for (auto &e : g.unpairedPosArr) {
|
||||||
|
auto posKey = e.first;
|
||||||
|
auto taskSeq = e.second.taskSeq;
|
||||||
|
auto &t = taskChanged[taskSeq];
|
||||||
|
auto &arr = g.unpairedPosArr[posKey].pairArr;
|
||||||
|
|
||||||
|
if (arr.size() > 1) {
|
||||||
|
std::sort(arr.begin(), arr.end());
|
||||||
|
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 更新结果
|
||||||
|
vector<int64_t> addDup;
|
||||||
|
map<int64_t, int64_t> ndPosVal;
|
||||||
|
for (auto &e : taskChanged) {
|
||||||
|
auto taskSeq = e.first;
|
||||||
|
auto &t = e.second;
|
||||||
|
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
|
||||||
|
g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||||
|
g.unpairedPosArr.clear();
|
||||||
|
g.unpairedDic.clear();
|
||||||
|
|
||||||
|
// 将dupidx放进全局数据
|
||||||
|
for (int i = 0; i < (int)g.dupIdxArr.size() - 1; ++i)
|
||||||
|
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
|
||||||
|
for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i)
|
||||||
|
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
|
||||||
|
|
||||||
|
g.dupIdxArr.push_back(vector<int64_t>());
|
||||||
|
auto &vIdx = g.dupIdxArr.back();
|
||||||
|
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||||
|
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
||||||
|
|
||||||
|
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
||||||
|
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
||||||
|
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 串行处理数据,标记冗余 */
|
||||||
|
void serialMarkDups() {
|
||||||
|
tm_arr[5].acc_start();
|
||||||
|
Timer::log_time("serial start");
|
||||||
|
// 读取缓存初始化
|
||||||
|
BamBufType inBamBuf(g_gArg.use_asyncio);
|
||||||
|
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||||
|
// BamBufType inBamBuf(false);
|
||||||
|
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
|
||||||
|
int64_t processedBamNum = 0;
|
||||||
|
|
||||||
|
SerailMarkDupArg smdArg1, smdArg2;
|
||||||
|
SerailMarkDupArg *lastArgP = &smdArg1;
|
||||||
|
SerailMarkDupArg *curArgP = &smdArg2;
|
||||||
|
|
||||||
|
bool isFirstRound = true;
|
||||||
|
int roundNum = 0;
|
||||||
|
int64_t readNumSum = 0;
|
||||||
|
while (inBamBuf.ReadStat() >= 0) {
|
||||||
|
Timer t_round;
|
||||||
|
// 读取bam文件中的read
|
||||||
|
tm_arr[4].acc_start();
|
||||||
|
size_t readNum = inBamBuf.ReadBam();
|
||||||
|
readNumSum += readNum;
|
||||||
|
tm_arr[4].acc_end();
|
||||||
|
cout << "read num: " << readNum << '\t' << roundNum << endl;
|
||||||
|
// lastArgP = curArgP;
|
||||||
|
tm_arr[6].acc_start();
|
||||||
|
curArgP->taskSeq = roundNum;
|
||||||
|
curArgP->bamStartIdx = processedBamNum;
|
||||||
|
curArgP->bams = inBamBuf.GetBamArr();
|
||||||
|
tm_arr[6].acc_end();
|
||||||
|
|
||||||
|
tm_arr[0].acc_start();
|
||||||
|
Timer t1;
|
||||||
|
generateReadEnds(curArgP);
|
||||||
|
// cout << "calc read end time: " << t1.seconds_elapsed() << " s" <<
|
||||||
|
// endl;
|
||||||
|
tm_arr[0].acc_end();
|
||||||
|
|
||||||
|
tm_arr[1].acc_start();
|
||||||
|
t1.reinit();
|
||||||
|
markdups(curArgP);
|
||||||
|
// cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
||||||
|
tm_arr[1].acc_end();
|
||||||
|
|
||||||
|
if (!isFirstRound) {
|
||||||
|
tm_arr[2].acc_start();
|
||||||
|
t1.reinit();
|
||||||
|
handleIntersectData(lastArgP, curArgP, &gData);
|
||||||
|
// cout << "intersect time: " << t1.seconds_elapsed() << " s" <<
|
||||||
|
// endl;
|
||||||
|
// addTaskIdxToSet(lastArgP, &gData);
|
||||||
|
tm_arr[2].acc_end();
|
||||||
|
} else {
|
||||||
|
isFirstRound = false;
|
||||||
|
}
|
||||||
|
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
||||||
|
processedBamNum += readNum;
|
||||||
|
|
||||||
|
// 交换
|
||||||
|
auto tmp = lastArgP;
|
||||||
|
lastArgP = curArgP;
|
||||||
|
curArgP = tmp;
|
||||||
|
// cout << "round time: " << t_round.seconds_elapsed() << endl;
|
||||||
|
roundNum++;
|
||||||
|
if (roundNum % 100 == 0) {
|
||||||
|
cout << "read sum: " << readNumSum << endl;
|
||||||
|
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// cout << "here" << endl;
|
||||||
|
tm_arr[3].acc_start();
|
||||||
|
// 处理剩下的全局数据
|
||||||
|
handleLastTask(lastArgP, &gData);
|
||||||
|
// cout << "here 2" << endl;
|
||||||
|
tm_arr[3].acc_end();
|
||||||
|
|
||||||
|
tm_arr[5].acc_end();
|
||||||
|
// 统计所有冗余index数量
|
||||||
|
int64_t dupNum = 0;
|
||||||
|
map<int64_t, int> dup;
|
||||||
|
|
||||||
|
int taskSeq = 0;
|
||||||
|
for (auto &arr : gData.dupIdxArr) {
|
||||||
|
for (auto idx : arr) {
|
||||||
|
if (dup.find(idx) != dup.end()) {
|
||||||
|
// cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
|
||||||
|
// << idx << endl;
|
||||||
|
}
|
||||||
|
dup[idx] = taskSeq;
|
||||||
|
}
|
||||||
|
taskSeq++;
|
||||||
|
}
|
||||||
|
// #include <fstream>
|
||||||
|
// ofstream out("tumor_dup.txt");
|
||||||
|
// for (auto idx : dup)
|
||||||
|
// {
|
||||||
|
// out << idx << endl;
|
||||||
|
// }
|
||||||
|
// out.close();
|
||||||
|
|
||||||
|
for (auto &arr : gData.dupIdxArr) dupNum += arr.size();
|
||||||
|
|
||||||
|
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
|
||||||
|
|
||||||
|
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||||
|
|
||||||
|
Timer::log_time("serial end ");
|
||||||
|
|
||||||
|
// for (auto i : gData.dupArr)
|
||||||
|
// cout << i << endl;
|
||||||
|
}
|
||||||
|
|
@ -1,20 +1,28 @@
|
||||||
#include <algorithm>
|
#pragma once
|
||||||
|
|
||||||
|
#include <common/hts/bam_buf.h>
|
||||||
#include <robin-map/include/tsl/robin_map.h>
|
#include <robin-map/include/tsl/robin_map.h>
|
||||||
|
#include <sam/utils/read_ends.h>
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using std::set;
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
/* 存放未匹配readend相同位点的所有readend */
|
/* 存放未匹配readend相同位点的所有readend */
|
||||||
struct UnpairedREInfo
|
struct UnpairedREInfo {
|
||||||
{
|
|
||||||
int64_t taskSeq;
|
int64_t taskSeq;
|
||||||
ReadEnds unpairedRE;
|
ReadEnds unpairedRE;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */
|
/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */
|
||||||
struct CalcKey
|
struct CalcKey {
|
||||||
{
|
|
||||||
int64_t read1Pos;
|
int64_t read1Pos;
|
||||||
int64_t read2Pos;
|
int64_t read2Pos;
|
||||||
bool operator<(const CalcKey &o) const
|
bool operator<(const CalcKey &o) const {
|
||||||
{
|
|
||||||
int comp = (int)(read1Pos - o.read1Pos);
|
int comp = (int)(read1Pos - o.read1Pos);
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = (int)(read2Pos - o.read2Pos);
|
comp = (int)(read2Pos - o.read2Pos);
|
||||||
|
|
@ -23,16 +31,14 @@ struct CalcKey
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
||||||
struct TaskSeqDupInfo
|
struct TaskSeqDupInfo {
|
||||||
{
|
|
||||||
set<int64_t> dupIdx;
|
set<int64_t> dupIdx;
|
||||||
set<int64_t> opticalDupIdx;
|
set<int64_t> opticalDupIdx;
|
||||||
set<int64_t> notDupIdx;
|
set<int64_t> notDupIdx;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
||||||
struct UnpairedPosInfo
|
struct UnpairedPosInfo {
|
||||||
{
|
|
||||||
int unpairedNum = 0;
|
int unpairedNum = 0;
|
||||||
int64_t taskSeq;
|
int64_t taskSeq;
|
||||||
vector<ReadEnds> pairArr;
|
vector<ReadEnds> pairArr;
|
||||||
|
|
@ -45,13 +51,12 @@ typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name
|
||||||
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||||||
|
|
||||||
/* 单线程处理冗余参数结构体 */
|
/* 单线程处理冗余参数结构体 */
|
||||||
struct SerailMarkDupArg
|
struct SerailMarkDupArg {
|
||||||
{
|
int64_t taskSeq; // 任务序号
|
||||||
int64_t taskSeq;
|
|
||||||
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||||||
vector<BamWrap *> bams; // 存放待处理的bam read
|
vector<BamWrap *> bams; // 存放待处理的bam read
|
||||||
vector<ReadEnds> pairs;
|
vector<ReadEnds> pairs; // 成对的reads
|
||||||
vector<ReadEnds> frags;
|
vector<ReadEnds> frags; // 暂未找到配对的reads
|
||||||
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
||||||
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
||||||
set<int64_t> fragDupIdx; // frag的冗余read的索引
|
set<int64_t> fragDupIdx; // frag的冗余read的索引
|
||||||
|
|
@ -60,8 +65,7 @@ struct SerailMarkDupArg
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
||||||
struct GlobalDataArg
|
struct GlobalDataArg {
|
||||||
{
|
|
||||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||||
UnpairedPositionMap unpairedPosArr;
|
UnpairedPositionMap unpairedPosArr;
|
||||||
|
|
||||||
|
|
@ -75,869 +79,5 @@ struct GlobalDataArg
|
||||||
vector<set<int64_t>> latterNotDupIdxArr;
|
vector<set<int64_t>> latterNotDupIdxArr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static GlobalDataArg gData;
|
// 串行运行mark duplicate
|
||||||
|
void serialMarkDups();
|
||||||
|
|
||||||
/* 查找 */
|
|
||||||
// template<class Itr, class T>
|
|
||||||
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
|
|
||||||
// {
|
|
||||||
// first = std::lower_bound(first, last, val);
|
|
||||||
// return (first != last && *first == val) ? first : last;
|
|
||||||
// }
|
|
||||||
|
|
||||||
/* 排序 */
|
|
||||||
static inline void sortReadEndsArr(vector<ReadEnds> &arr)
|
|
||||||
{
|
|
||||||
size_t blockSize = 64 * 1024;
|
|
||||||
blockSize = min(blockSize, arr.size());
|
|
||||||
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
|
|
||||||
size_t crossNum = 1024;
|
|
||||||
size_t start, end, i, left, right;
|
|
||||||
std::sort(arr.begin(), arr.begin() + blockSize);
|
|
||||||
for (i = 1; i < blockNum; ++i)
|
|
||||||
{
|
|
||||||
start = i * blockSize;
|
|
||||||
end = min(start + blockSize, arr.size());
|
|
||||||
std::sort(arr.begin() + start, arr.begin() + end);
|
|
||||||
left = crossNum;
|
|
||||||
while (!(arr[start - left] < arr[start]))
|
|
||||||
{
|
|
||||||
left = left << 1;
|
|
||||||
if (left >= blockSize)
|
|
||||||
{
|
|
||||||
std::sort(arr.begin(), arr.end()); // 退化到普通排序
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
right = min(crossNum, end - start - 1);
|
|
||||||
|
|
||||||
while (!(arr[start - 1] < arr[start + right]))
|
|
||||||
{
|
|
||||||
right = min(right << 1, end - start - 1);
|
|
||||||
if (right == end - start - 1)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理一组pairend的readends,标记冗余 */
|
|
||||||
static void markDupsForPairs(vector<const ReadEnds *> &vpRe,
|
|
||||||
set<int64_t> *dupIdx,
|
|
||||||
set<int64_t> *opticalDupIdx,
|
|
||||||
set<int64_t> *notDupIdx = nullptr)
|
|
||||||
{
|
|
||||||
if (vpRe.size() < 2)
|
|
||||||
{
|
|
||||||
if (vpRe.size() == 1)
|
|
||||||
{
|
|
||||||
// addSingletonToCount(libraryIdGenerator);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
int maxScore = 0;
|
|
||||||
const ReadEnds *pBest = nullptr;
|
|
||||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
|
||||||
for (auto pe : vpRe) // 找分数最高的readend
|
|
||||||
{
|
|
||||||
if (pe->score > maxScore || pBest == nullptr)
|
|
||||||
{
|
|
||||||
maxScore = pe->score;
|
|
||||||
pBest = pe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (notDupIdx != nullptr)
|
|
||||||
{
|
|
||||||
notDupIdx->insert(pBest->read1IndexInFile);
|
|
||||||
notDupIdx->insert(pBest->read2IndexInFile);
|
|
||||||
}
|
|
||||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
|
||||||
{
|
|
||||||
// trackOpticalDuplicates
|
|
||||||
}
|
|
||||||
for (auto pe : vpRe) // 对非best read标记冗余
|
|
||||||
{
|
|
||||||
if (pe != pBest) // 非best
|
|
||||||
{
|
|
||||||
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
|
||||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
|
||||||
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
|
||||||
// {
|
|
||||||
// addRepresentativeReadIndex(vpRe);
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理一组非paired的readends,标记冗余 */
|
|
||||||
static void markDupsForFrags(vector<const ReadEnds *> &vpRe,
|
|
||||||
bool containsPairs,
|
|
||||||
set<int64_t> *dupIdx,
|
|
||||||
set<int64_t> *notDupIdx = nullptr)
|
|
||||||
{
|
|
||||||
if (containsPairs)
|
|
||||||
{
|
|
||||||
for (auto pe : vpRe)
|
|
||||||
{
|
|
||||||
if (!pe->IsPaired())
|
|
||||||
{
|
|
||||||
dupIdx->insert(pe->read1IndexInFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
int maxScore = 0;
|
|
||||||
const ReadEnds *pBest = nullptr;
|
|
||||||
for (auto pe : vpRe)
|
|
||||||
{
|
|
||||||
if (pe->score > maxScore || pBest == nullptr)
|
|
||||||
{
|
|
||||||
maxScore = pe->score;
|
|
||||||
pBest = pe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (notDupIdx != nullptr)
|
|
||||||
{
|
|
||||||
notDupIdx->insert(pBest->read1IndexInFile);
|
|
||||||
}
|
|
||||||
for (auto pe : vpRe)
|
|
||||||
{
|
|
||||||
if (pe != pBest)
|
|
||||||
{
|
|
||||||
dupIdx->insert(pe->read1IndexInFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 找到与readend pos相等的所有readend */
|
|
||||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
|
|
||||||
{
|
|
||||||
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsLittleThan); // 只比对位点
|
|
||||||
dst->insert(dst->end(), range.first, range.second);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 单线程生成readends (第一步)*/
|
|
||||||
static void generateReadEnds(SerailMarkDupArg *arg)
|
|
||||||
{
|
|
||||||
auto &p = *arg;
|
|
||||||
auto &rnParser = g_vRnParser[0];
|
|
||||||
|
|
||||||
p.pairs.clear();
|
|
||||||
p.frags.clear();
|
|
||||||
p.unpairedDic.clear();
|
|
||||||
p.unpairedPosArr.clear();
|
|
||||||
|
|
||||||
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
|
||||||
set<ReadEnds> reSet;
|
|
||||||
|
|
||||||
ReadEnds lastRe;
|
|
||||||
|
|
||||||
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
|
|
||||||
{
|
|
||||||
BamWrap *bw = p.bams[i];
|
|
||||||
const int64_t bamIdx = p.bamStartIdx + i;
|
|
||||||
if (bw->GetReadUnmappedFlag())
|
|
||||||
{
|
|
||||||
if (bw->b->core.tid == -1)
|
|
||||||
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
|
||||||
{
|
|
||||||
ReadEnds fragEnd;
|
|
||||||
tm_arr[8].acc_start();
|
|
||||||
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
|
||||||
tm_arr[8].acc_end();
|
|
||||||
p.frags.push_back(fragEnd); // 添加进frag集合
|
|
||||||
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
|
||||||
{
|
|
||||||
string key = bw->query_name();
|
|
||||||
if (p.unpairedDic.find(key) == p.unpairedDic.end())
|
|
||||||
{
|
|
||||||
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
|
||||||
}
|
|
||||||
else // 找到了pairend
|
|
||||||
{
|
|
||||||
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
|
||||||
modifyPairedEnds(fragEnd, &pairedEnds);
|
|
||||||
p.pairs.push_back(pairedEnds);
|
|
||||||
p.unpairedDic.erase(key); // 删除找到的pairend
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
tm_arr[9].acc_start();
|
|
||||||
sortReadEndsArr(p.frags);
|
|
||||||
// sort(p.frags.begin(), p.frags.end());
|
|
||||||
tm_arr[9].acc_end();
|
|
||||||
// cout << "sort pairs" << endl;
|
|
||||||
tm_arr[10].acc_start();
|
|
||||||
sort(p.pairs.begin(), p.pairs.end());
|
|
||||||
tm_arr[10].acc_end();
|
|
||||||
// 记录位点上的未匹配的read个数
|
|
||||||
for (auto &e : p.unpairedDic) {
|
|
||||||
auto posKey = e.second.unpairedRE.posKey;
|
|
||||||
auto &unpairArrInfo = p.unpairedPosArr[posKey];
|
|
||||||
unpairArrInfo.unpairedNum++;
|
|
||||||
unpairArrInfo.taskSeq = p.taskSeq;
|
|
||||||
unpairArrInfo.readNameSet.insert(e.first);
|
|
||||||
}
|
|
||||||
cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理pairs */
|
|
||||||
static void processPairs(vector<ReadEnds> &readEnds,
|
|
||||||
set<int64_t> *dupIdx,
|
|
||||||
set<int64_t> *opticalDupIdx,
|
|
||||||
set<int64_t> *notDupIdx = nullptr)
|
|
||||||
{
|
|
||||||
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
|
||||||
const ReadEnds *pReadEnd = nullptr;
|
|
||||||
for (auto &re : readEnds)
|
|
||||||
{
|
|
||||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
|
||||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
|
||||||
else
|
|
||||||
{
|
|
||||||
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
|
|
||||||
vpCache.clear();
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
pReadEnd = &re;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理frags */
|
|
||||||
static void processFrags(vector<ReadEnds> &readEnds,
|
|
||||||
set<int64_t> *dupIdx,
|
|
||||||
set<int64_t> *notDupIdx = nullptr)
|
|
||||||
{
|
|
||||||
bool containsPairs = false;
|
|
||||||
bool containsFrags = false;
|
|
||||||
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
|
||||||
const ReadEnds *pReadEnd = nullptr;
|
|
||||||
for (auto &re : readEnds)
|
|
||||||
{
|
|
||||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
|
||||||
{
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
containsPairs = containsPairs || re.IsPaired();
|
|
||||||
containsFrags = containsFrags || !re.IsPaired();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (vpCache.size() > 1 && containsFrags)
|
|
||||||
{
|
|
||||||
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
|
||||||
}
|
|
||||||
vpCache.clear();
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
pReadEnd = &re;
|
|
||||||
containsPairs = re.IsPaired();
|
|
||||||
containsFrags = !re.IsPaired();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (vpCache.size() > 1 && containsFrags)
|
|
||||||
{
|
|
||||||
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 单线程markdup (第二步)*/
|
|
||||||
static void markdups(SerailMarkDupArg *arg)
|
|
||||||
{
|
|
||||||
auto &p = *arg;
|
|
||||||
p.pairDupIdx.clear();
|
|
||||||
p.pairOpticalDupIdx.clear();
|
|
||||||
p.fragDupIdx.clear();
|
|
||||||
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
|
||||||
// 先处理 pair
|
|
||||||
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
|
|
||||||
|
|
||||||
// 再处理frag
|
|
||||||
processFrags(p.frags, &p.fragDupIdx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 获取交叉部分的数据 */
|
|
||||||
static inline void getIntersectData(vector<ReadEnds> &leftArr,
|
|
||||||
vector<ReadEnds> &rightArr,
|
|
||||||
vector<ReadEnds> *dst,
|
|
||||||
bool isPairCmp = false)
|
|
||||||
{
|
|
||||||
const size_t leftEndIdx = leftArr.size() - 1;
|
|
||||||
const size_t rightStartIdx = 0;
|
|
||||||
size_t leftSpan = 0;
|
|
||||||
size_t rightSpan = 0;
|
|
||||||
|
|
||||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp))
|
|
||||||
{
|
|
||||||
leftSpan += 1;
|
|
||||||
if (leftSpan > leftEndIdx)
|
|
||||||
{
|
|
||||||
leftSpan = leftArr.size() - 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp))
|
|
||||||
{
|
|
||||||
rightSpan += 1;
|
|
||||||
if (rightSpan == rightArr.size() - 1)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
|
||||||
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
|
||||||
std::sort(dst->begin(), dst->end());
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 将frags重叠部分的dup idx放进数据中 */
|
|
||||||
static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
|
|
||||||
set<int64_t> ¬DupIdx,
|
|
||||||
SerailMarkDupArg * lastArg,
|
|
||||||
SerailMarkDupArg *curArg)
|
|
||||||
{
|
|
||||||
auto &lp = *lastArg;
|
|
||||||
auto &p = *curArg;
|
|
||||||
for (auto idx : dupIdx)
|
|
||||||
{
|
|
||||||
lp.fragDupIdx.insert(idx);
|
|
||||||
p.fragDupIdx.erase(idx);
|
|
||||||
}
|
|
||||||
for (auto idx : notDupIdx)
|
|
||||||
{
|
|
||||||
lp.fragDupIdx.erase(idx);
|
|
||||||
p.fragDupIdx.erase(idx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 将pairs重叠部分的dup idx放进数据中 */
|
|
||||||
static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
|
|
||||||
set<int64_t> &opticalDupIdx,
|
|
||||||
set<int64_t> ¬DupIdx,
|
|
||||||
SerailMarkDupArg *lastArg,
|
|
||||||
SerailMarkDupArg *curArg)
|
|
||||||
{
|
|
||||||
auto &lp = *lastArg;
|
|
||||||
auto &p = *curArg;
|
|
||||||
for (auto idx : dupIdx)
|
|
||||||
{
|
|
||||||
lp.pairDupIdx.insert(idx);
|
|
||||||
p.pairDupIdx.erase(idx);
|
|
||||||
}
|
|
||||||
for (auto idx : opticalDupIdx)
|
|
||||||
{
|
|
||||||
lp.pairOpticalDupIdx.insert(idx);
|
|
||||||
p.pairOpticalDupIdx.erase(idx);
|
|
||||||
}
|
|
||||||
for (auto idx : notDupIdx)
|
|
||||||
{
|
|
||||||
lp.pairDupIdx.erase(idx);
|
|
||||||
lp.pairOpticalDupIdx.erase(idx);
|
|
||||||
p.pairDupIdx.erase(idx);
|
|
||||||
p.pairOpticalDupIdx.erase(idx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Accumulate a recomputed duplicate result into the "latter" sets of a task
 * that was already finalised: each source set is merged into its counterpart.
 * The source sets are left untouched.
 *
 * @param dupIdx               newly found duplicate indexes
 * @param opticalDupIdx        newly found optical-duplicate indexes
 * @param notDupIdx            indexes found not to be duplicates
 * @param latterDupIdx         receives dupIdx
 * @param latterOpticalDupIdx  receives opticalDupIdx
 * @param latterNotDupIdx      receives notDupIdx
 */
static void refeshTaskDupInfo(std::set<int64_t> &dupIdx,
                              std::set<int64_t> &opticalDupIdx,
                              std::set<int64_t> &notDupIdx,
                              std::set<int64_t> &latterDupIdx,
                              std::set<int64_t> &latterOpticalDupIdx,
                              std::set<int64_t> &latterNotDupIdx)
{
    latterDupIdx.insert(dupIdx.begin(), dupIdx.end());
    latterOpticalDupIdx.insert(opticalDupIdx.begin(), opticalDupIdx.end());
    latterNotDupIdx.insert(notDupIdx.begin(), notDupIdx.end());
}
|
|
||||||
|
|
||||||
/**
 * Final merge for one task: combine the task's sorted duplicate index array
 * with the duplicate indexes discovered later, drop every index that was
 * later found not to be a duplicate, and store the sorted result back.
 *
 * An index present in both inputs appears once in the result.
 *
 * @param dupIdx     duplicate indexes discovered after the task was finalised
 * @param notDupIdx  indexes that must be removed from the result
 * @param dupArr     in: the task's duplicate indexes in ascending order;
 *                   out: the merged, filtered indexes in ascending order
 */
static void refeshFinalTaskDupInfo(std::set<int64_t> &dupIdx,
                                   std::set<int64_t> &notDupIdx,
                                   std::vector<int64_t> &dupArr)
{
    // Upper bound on the merged size; the unused tail is trimmed below.
    std::vector<int64_t> merged(dupArr.size() + dupIdx.size());
    auto mergedEnd = std::set_union(dupArr.begin(), dupArr.end(),
                                    dupIdx.begin(), dupIdx.end(),
                                    merged.begin());
    if (!notDupIdx.empty())
    {
        mergedEnd = std::remove_if(merged.begin(), mergedEnd,
                                   [&notDupIdx](int64_t idx) { return notDupIdx.count(idx) != 0; });
    }
    merged.erase(mergedEnd, merged.end());
    dupArr.swap(merged); // hand the buffer over instead of copying it
}
|
|
||||||
|
|
||||||
/* 处理相邻的两个任务,有相交叉的数据 */
|
|
||||||
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg)
|
|
||||||
{
|
|
||||||
auto &lp = *lastArg;
|
|
||||||
auto &p = *curArg;
|
|
||||||
auto &g = *gDataArg;
|
|
||||||
|
|
||||||
vector<ReadEnds> reArr;
|
|
||||||
set<int64_t> dupIdx;
|
|
||||||
set<int64_t> notDupIdx;
|
|
||||||
// 先处理重叠的frags
|
|
||||||
getIntersectData(lp.frags, p.frags, &reArr);
|
|
||||||
processFrags(reArr, &dupIdx, ¬DupIdx);
|
|
||||||
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
|
|
||||||
|
|
||||||
// 再处理重叠的pairs
|
|
||||||
reArr.clear();
|
|
||||||
dupIdx.clear();
|
|
||||||
notDupIdx.clear();
|
|
||||||
set<int64_t> opticalDupIdx;
|
|
||||||
getIntersectData(lp.pairs, p.pairs, &reArr, true);
|
|
||||||
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
|
||||||
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
|
||||||
|
|
||||||
// 处理之前未匹配的部分
|
|
||||||
map<CalcKey, int64_t> recalcPos;
|
|
||||||
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
|
|
||||||
set<int64_t> addToGlobal;
|
|
||||||
int64_t prevLastPos = 0, nextFirstPos = 0;
|
|
||||||
if (lp.frags.size() > 0)
|
|
||||||
prevLastPos = lp.frags.back().posKey;
|
|
||||||
if (p.frags.size() > 0)
|
|
||||||
nextFirstPos = p.frags[0].posKey;
|
|
||||||
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
|
|
||||||
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
|
|
||||||
{
|
|
||||||
auto &readName = prevUnpair.first;
|
|
||||||
auto &prevPosInfo = prevUnpair.second;
|
|
||||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
|
||||||
|
|
||||||
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) // 在当前这个任务里找到了这个未匹配的read
|
|
||||||
{
|
|
||||||
auto &nextPosInfo = p.unpairedDic[readName];
|
|
||||||
auto &nextFragEnd = nextPosInfo.unpairedRE;
|
|
||||||
int64_t prevPosKey = prevFragEnd.posKey;
|
|
||||||
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
|
||||||
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
|
|
||||||
CalcKey ck = {prevPosKey, nextPosKey};
|
|
||||||
UnpairedPosInfo *prevUnpairInfoP = nullptr;
|
|
||||||
UnpairedPosInfo *nextUnpairInfoP = nullptr;
|
|
||||||
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
|
|
||||||
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
|
||||||
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
|
|
||||||
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
|
|
||||||
|
|
||||||
// pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定
|
|
||||||
// 1. prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
|
||||||
// 2. prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算
|
|
||||||
// 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
|
||||||
// 3. prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算
|
|
||||||
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
|
||||||
// 4. prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算
|
|
||||||
|
|
||||||
bool addDataToPos = true;
|
|
||||||
if (alreadyAdd.find(ck) != alreadyAdd.end())
|
|
||||||
{
|
|
||||||
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
|
||||||
}
|
|
||||||
else
|
|
||||||
alreadyAdd.insert(ck);
|
|
||||||
|
|
||||||
if (prevPosKey < nextFirstPos) // prevpos在交叉部分之前
|
|
||||||
{
|
|
||||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
|
||||||
prevPairArr.push_back(prevFragEnd);
|
|
||||||
if (nextPosKey <= prevLastPos && addDataToPos) // 第二种情况
|
|
||||||
{
|
|
||||||
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
|
||||||
}
|
|
||||||
// 第一种情况,第二种情况下都会出现,复杂情况一
|
|
||||||
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
|
|
||||||
if (gPosInfo != g.unpairedPosArr.end()) // 可能g和p有匹配的,刚好和该位点一致
|
|
||||||
{
|
|
||||||
auto &gUnpairInfo = gPosInfo->second;
|
|
||||||
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
|
|
||||||
if (pPosInfo != p.unpairedPosArr.end())
|
|
||||||
{
|
|
||||||
auto &pUnpairInfo = pPosInfo->second;
|
|
||||||
for (auto &rn : gUnpairInfo.readNameSet) // 遍历每一个readname,看是否有匹配的
|
|
||||||
{
|
|
||||||
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end())
|
|
||||||
{
|
|
||||||
auto pe = g.unpairedDic[rn].unpairedRE;
|
|
||||||
auto fe = p.unpairedDic[rn].unpairedRE;
|
|
||||||
modifyPairedEnds(fe, &pe);
|
|
||||||
prevPairArr.push_back(pe);
|
|
||||||
g.unpairedDic.erase(rn);
|
|
||||||
p.unpairedDic.erase(rn);
|
|
||||||
// cout << "找到了!" << rn << endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
|
||||||
}
|
|
||||||
else // prevpos在交叉部分
|
|
||||||
{
|
|
||||||
if (nextPosKey > prevLastPos) // nextpos在交叉部分之后
|
|
||||||
{ // 第三种情况
|
|
||||||
if (nextUnpairInfoP != nullptr) // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
|
||||||
{
|
|
||||||
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
|
||||||
nextPairArr.push_back(prevFragEnd);
|
|
||||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
|
||||||
prevPairArr.push_back(prevFragEnd);
|
|
||||||
if (addDataToPos)
|
|
||||||
{
|
|
||||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
|
||||||
}
|
|
||||||
recalcPos[ck] = nextPosInfo.taskSeq; // 将数据放到next task里, (这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
|
||||||
}
|
|
||||||
else // next task在该位点没有unpair,那就把数据放到prev task里
|
|
||||||
{
|
|
||||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
|
||||||
prevPairArr.push_back(prevFragEnd);
|
|
||||||
if (addDataToPos) // 第二种情况
|
|
||||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
|
||||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{ // 第四种情况
|
|
||||||
if (prevUnpairInfoP == nullptr) {
|
|
||||||
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
|
||||||
prevUnpairInfoP->taskSeq = lp.taskSeq;
|
|
||||||
}
|
|
||||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
|
||||||
prevPairArr.push_back(prevFragEnd);
|
|
||||||
if (addDataToPos)
|
|
||||||
{
|
|
||||||
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
|
||||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
|
||||||
}
|
|
||||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
p.unpairedDic.erase(readName); // 在next task里删除该read
|
|
||||||
}
|
|
||||||
else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
|
|
||||||
{
|
|
||||||
auto &remainPosInfo = g.unpairedDic[readName];
|
|
||||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
|
||||||
int64_t remainPosKey = remainFragEnd.posKey;
|
|
||||||
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
|
||||||
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
|
||||||
auto &remainPairArr = remainUnpairInfo.pairArr;
|
|
||||||
remainPairArr.push_back(remainFragEnd);
|
|
||||||
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
|
|
||||||
recalcPos[ck] = remainPosInfo.taskSeq;
|
|
||||||
std::sort(remainPairArr.begin(), remainPairArr.end());
|
|
||||||
|
|
||||||
g.unpairedDic.erase(readName);
|
|
||||||
}
|
|
||||||
else // 都没找到,那就保存到遗留数据里
|
|
||||||
{
|
|
||||||
int64_t prevPosKey = prevFragEnd.posKey;
|
|
||||||
g.unpairedDic.insert(prevUnpair);
|
|
||||||
addToGlobal.insert(prevPosKey);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto posKey : addToGlobal) // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
|
||||||
g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
|
||||||
|
|
||||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
|
||||||
set<int64_t> posProcessed;
|
|
||||||
for (auto &e : recalcPos)
|
|
||||||
{
|
|
||||||
auto posKey = e.first.read1Pos;
|
|
||||||
if (posProcessed.find(posKey) != posProcessed.end())
|
|
||||||
continue;
|
|
||||||
posProcessed.insert(posKey);
|
|
||||||
auto taskSeq = e.second;
|
|
||||||
auto &t = taskChanged[taskSeq];
|
|
||||||
// 在对应的任务包含的dup idx里修改结果数据
|
|
||||||
vector<ReadEnds> *pairArrP = nullptr;
|
|
||||||
if (taskSeq < lp.taskSeq)
|
|
||||||
pairArrP = &g.unpairedPosArr[posKey].pairArr;
|
|
||||||
else
|
|
||||||
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
|
|
||||||
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
|
||||||
if (taskSeq < lp.taskSeq)
|
|
||||||
g.unpairedPosArr.erase(posKey);
|
|
||||||
}
|
|
||||||
// 更新结果
|
|
||||||
|
|
||||||
for (auto &e : taskChanged)
|
|
||||||
{
|
|
||||||
auto taskSeq = e.first;
|
|
||||||
auto &t = e.second;
|
|
||||||
if (taskSeq < lp.taskSeq)
|
|
||||||
{
|
|
||||||
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
|
|
||||||
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
|
||||||
}
|
|
||||||
else if (taskSeq == lp.taskSeq)
|
|
||||||
{
|
|
||||||
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
|
||||||
// cout << "calc g time: " << t.seconds_elapsed() << " s" << endl;
|
|
||||||
// 将dupidx放进全局数据
|
|
||||||
g.latterDupIdxArr.push_back(set<int64_t>());
|
|
||||||
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
|
|
||||||
g.latterNotDupIdxArr.push_back(set<int64_t>());
|
|
||||||
|
|
||||||
g.dupIdxArr.push_back(vector<int64_t>());
|
|
||||||
auto &vIdx = g.dupIdxArr.back();
|
|
||||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
|
||||||
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
|
||||||
|
|
||||||
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
|
||||||
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
|
||||||
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 当所有任务结束后,global data里还有未处理的数据 */
|
|
||||||
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg)
|
|
||||||
{
|
|
||||||
auto &lp = *task;
|
|
||||||
auto &g = *gDataArg;
|
|
||||||
// 遗留的未匹配的pair
|
|
||||||
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
|
|
||||||
{
|
|
||||||
auto &readName = prevUnpair.first;
|
|
||||||
auto &prevPosInfo = prevUnpair.second;
|
|
||||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
|
||||||
|
|
||||||
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
|
|
||||||
{
|
|
||||||
auto &remainPosInfo = g.unpairedDic[readName];
|
|
||||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
|
||||||
int64_t remainPosKey = remainFragEnd.posKey;
|
|
||||||
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
|
||||||
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
|
||||||
|
|
||||||
remainUnpairInfo.pairArr.push_back(remainFragEnd);
|
|
||||||
g.unpairedDic.erase(readName);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
|
||||||
for (auto &e : g.unpairedPosArr)
|
|
||||||
{
|
|
||||||
auto posKey = e.first;
|
|
||||||
auto taskSeq = e.second.taskSeq;
|
|
||||||
auto &t = taskChanged[taskSeq];
|
|
||||||
auto &arr = g.unpairedPosArr[posKey].pairArr;
|
|
||||||
|
|
||||||
if (arr.size() > 1)
|
|
||||||
{
|
|
||||||
std::sort(arr.begin(), arr.end());
|
|
||||||
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 更新结果
|
|
||||||
vector<int64_t> addDup;
|
|
||||||
map<int64_t, int64_t> ndPosVal;
|
|
||||||
for (auto &e : taskChanged)
|
|
||||||
{
|
|
||||||
auto taskSeq = e.first;
|
|
||||||
auto &t = e.second;
|
|
||||||
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
|
|
||||||
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
|
||||||
}
|
|
||||||
|
|
||||||
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
|
||||||
g.unpairedPosArr.clear();
|
|
||||||
g.unpairedDic.clear();
|
|
||||||
|
|
||||||
// 将dupidx放进全局数据
|
|
||||||
for (int i = 0; i < g.dupIdxArr.size() - 1; ++i)
|
|
||||||
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
|
|
||||||
for (int i = 0; i < g.opticalDupIdxArr.size() - 1; ++i)
|
|
||||||
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
|
|
||||||
|
|
||||||
g.dupIdxArr.push_back(vector<int64_t>());
|
|
||||||
auto &vIdx = g.dupIdxArr.back();
|
|
||||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
|
||||||
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
|
||||||
|
|
||||||
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
|
||||||
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
|
||||||
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 串行处理数据,标记冗余 */
|
|
||||||
static void serialMarkDups()
|
|
||||||
{
|
|
||||||
tm_arr[5].acc_start();
|
|
||||||
Timer::log_time("serial start");
|
|
||||||
// 读取缓存初始化
|
|
||||||
BamBufType inBamBuf(g_gArg.use_asyncio);
|
|
||||||
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
|
||||||
// BamBufType inBamBuf(false);
|
|
||||||
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
|
|
||||||
int64_t processedBamNum = 0;
|
|
||||||
|
|
||||||
SerailMarkDupArg smdArg1, smdArg2;
|
|
||||||
SerailMarkDupArg *lastArgP = &smdArg1;
|
|
||||||
SerailMarkDupArg *curArgP = &smdArg2;
|
|
||||||
|
|
||||||
bool isFirstRound = true;
|
|
||||||
int roundNum = 0;
|
|
||||||
while (inBamBuf.ReadStat() >= 0)
|
|
||||||
{
|
|
||||||
Timer t_round;
|
|
||||||
// 读取bam文件中的read
|
|
||||||
tm_arr[4].acc_start();
|
|
||||||
size_t readNum = inBamBuf.ReadBam();
|
|
||||||
tm_arr[4].acc_end();
|
|
||||||
cout << "read num: " << readNum << endl;
|
|
||||||
// lastArgP = curArgP;
|
|
||||||
tm_arr[6].acc_start();
|
|
||||||
curArgP->taskSeq = roundNum;
|
|
||||||
curArgP->bamStartIdx = processedBamNum;
|
|
||||||
curArgP->bams = inBamBuf.GetBamArr();
|
|
||||||
tm_arr[6].acc_end();
|
|
||||||
|
|
||||||
tm_arr[0].acc_start();
|
|
||||||
Timer t1;
|
|
||||||
generateReadEnds(curArgP);
|
|
||||||
//cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
|
|
||||||
tm_arr[0].acc_end();
|
|
||||||
|
|
||||||
tm_arr[1].acc_start();
|
|
||||||
t1.reinit();
|
|
||||||
markdups(curArgP);
|
|
||||||
//cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
|
||||||
tm_arr[1].acc_end();
|
|
||||||
|
|
||||||
if (!isFirstRound)
|
|
||||||
{
|
|
||||||
tm_arr[2].acc_start();
|
|
||||||
t1.reinit();
|
|
||||||
handleIntersectData(lastArgP, curArgP, &gData);
|
|
||||||
//cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
|
|
||||||
// addTaskIdxToSet(lastArgP, &gData);
|
|
||||||
tm_arr[2].acc_end();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
isFirstRound = false;
|
|
||||||
}
|
|
||||||
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
|
||||||
processedBamNum += readNum;
|
|
||||||
|
|
||||||
// 交换
|
|
||||||
auto tmp = lastArgP;
|
|
||||||
lastArgP = curArgP;
|
|
||||||
curArgP = tmp;
|
|
||||||
cout << "round time: " << t_round.seconds_elapsed() << endl;
|
|
||||||
roundNum++;
|
|
||||||
if (roundNum > 9){
|
|
||||||
// break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// cout << "here" << endl;
|
|
||||||
tm_arr[3].acc_start();
|
|
||||||
// 处理剩下的全局数据
|
|
||||||
handleLastTask(lastArgP, &gData);
|
|
||||||
// cout << "here 2" << endl;
|
|
||||||
tm_arr[3].acc_end();
|
|
||||||
|
|
||||||
tm_arr[5].acc_end();
|
|
||||||
// 统计所有冗余index数量
|
|
||||||
int64_t dupNum = 0;
|
|
||||||
set<int64_t> dup;
|
|
||||||
|
|
||||||
// int taskSeq = 0;
|
|
||||||
// for (auto &arr : gData.dupIdxArr)
|
|
||||||
// {
|
|
||||||
// for (auto idx : arr) {
|
|
||||||
// if (dup.find(idx) != dup.end())
|
|
||||||
// {
|
|
||||||
// cout << "dup index: " << taskSeq << '\t' << idx << endl;
|
|
||||||
// }
|
|
||||||
// dup.insert(idx);
|
|
||||||
// }
|
|
||||||
// taskSeq++;
|
|
||||||
// }
|
|
||||||
// #include <fstream>
|
|
||||||
// ofstream out("tumor_dup.txt");
|
|
||||||
// for (auto idx : dup)
|
|
||||||
// {
|
|
||||||
// out << idx << endl;
|
|
||||||
// }
|
|
||||||
// out.close();
|
|
||||||
|
|
||||||
for (auto &arr : gData.dupIdxArr)
|
|
||||||
dupNum += arr.size();
|
|
||||||
|
|
||||||
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
|
|
||||||
|
|
||||||
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
|
||||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
|
||||||
|
|
||||||
Timer::log_time("serial end ");
|
|
||||||
|
|
||||||
//for (auto i : gData.dupArr)
|
|
||||||
// cout << i << endl;
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <common/utils/timer.h>
|
||||||
|
#include <common/utils/util.h>
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
#include <htslib/thread_pool.h>
|
||||||
|
#include <sam/utils/read_ends.h>
|
||||||
|
#include <sam/utils/read_name_parser.h>
|
||||||
|
|
||||||
|
extern Timer tm_arr[20]; // 用来测试性能
|
||||||
|
/* 全局本地变量 */
|
||||||
|
extern vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||||
|
extern samFile *g_inBamFp; // 输入的bam文件
|
||||||
|
extern sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
||||||
|
extern samFile *g_outBamFp; // 输出文件, sam或者bam格式
|
||||||
|
extern sam_hdr_t *g_outBamHeader; // 输出文件的header
|
||||||
|
|
||||||
|
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
||||||
|
class GlobalArg;
|
||||||
|
extern GlobalArg &g_gArg;
|
||||||
|
class MarkDupsArg;
|
||||||
|
extern MarkDupsArg &g_mdArg;
|
||||||
|
class GlobalDataArg;
|
||||||
|
extern GlobalDataArg &gData;
|
||||||
|
class DuplicationMetrics;
|
||||||
|
extern DuplicationMetrics &gMetrics;
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
/*
|
/*
|
||||||
Description: read ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等
|
Description: read
|
||||||
|
ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
|
@ -13,18 +14,20 @@ Date : 2023/11/3
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Small interface that provides access to the physical location information about a cluster.
|
* Small interface that provides access to the physical location information
|
||||||
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
|
* about a cluster. All values should be defaulted to -1 if unavailable.
|
||||||
* non-zero positive integers, x and y coordinates may be negative.
|
* ReadGroup and Tile should only allow non-zero positive integers, x and y
|
||||||
|
* coordinates may be negative.
|
||||||
*/
|
*/
|
||||||
struct PhysicalLocation
|
struct PhysicalLocation {
|
||||||
{
|
static const int NO_VALUE = -1;
|
||||||
/**
|
/**
|
||||||
* Small class that provides access to the physical location information about a cluster.
|
* Small class that provides access to the physical location information
|
||||||
* All values should be defaulted to -1 if unavailable. Tile should only allow
|
* about a cluster. All values should be defaulted to -1 if unavailable.
|
||||||
* non-zero positive integers, x and y coordinates must be non-negative.
|
* Tile should only allow non-zero positive integers, x and y coordinates
|
||||||
* This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
|
* must be non-negative. This is different from PhysicalLocationShort in
|
||||||
* thus, they do not overflow within a HiSeqX tile.
|
* that the x and y positions are ints, not shorts thus, they do not
|
||||||
|
* overflow within a HiSeqX tile.
|
||||||
*/
|
*/
|
||||||
int16_t tile = -1;
|
int16_t tile = -1;
|
||||||
int32_t x = -1;
|
int32_t x = -1;
|
||||||
|
|
@ -32,28 +35,33 @@ struct PhysicalLocation
|
||||||
};
|
};
|
||||||
|
|
||||||
/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/
|
/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/
|
||||||
struct ReadEnds : PhysicalLocation
|
struct ReadEnds : PhysicalLocation {
|
||||||
{
|
|
||||||
static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
|
static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
|
||||||
/* 保留一些bam记录中的数据 */
|
/* 保留一些bam记录中的数据 */
|
||||||
bool read1FirstOfPair = true;
|
bool read1FirstOfPair = true;
|
||||||
/* ReadEnds中的成员变量 */
|
/* ReadEnds中的成员变量 */
|
||||||
/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
|
/** Little struct-like class to hold read pair (and fragment) end data for
|
||||||
|
* duplicate marking. */
|
||||||
// int16_t libraryId; // 没用,不考虑多样本
|
// int16_t libraryId; // 没用,不考虑多样本
|
||||||
int8_t orientation = -1;
|
int8_t orientation = -1;
|
||||||
int32_t read1ReferenceIndex = -1;
|
int32_t read1ReferenceIndex = -1;
|
||||||
int32_t read1Coordinate = -1;
|
int32_t read1Coordinate = -1;
|
||||||
int32_t read2ReferenceIndex = -1;
|
int32_t read2ReferenceIndex = -1;
|
||||||
int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
|
// This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
|
||||||
|
int32_t read2Coordinate = -1;
|
||||||
/* Additional information used to detect optical dupes */
|
/* Additional information used to detect optical dupes */
|
||||||
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read group,normal或者tumor
|
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read
|
||||||
/** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
|
// group,normal或者tumor
|
||||||
|
/** For optical duplicate detection the orientation matters regard to 1st or
|
||||||
|
* 2nd end of a mate */
|
||||||
int8_t orientationForOpticalDuplicates = -1;
|
int8_t orientationForOpticalDuplicates = -1;
|
||||||
/** A *transient* flag marking this read end as being an optical duplicate. */
|
/** A *transient* flag marking this read end as being an optical duplicate.
|
||||||
|
*/
|
||||||
bool isOpticalDuplicate = false;
|
bool isOpticalDuplicate = false;
|
||||||
|
|
||||||
/* ReadEndsForMarkDuplicates中的成员变量 */
|
/* ReadEndsForMarkDuplicates中的成员变量 */
|
||||||
/** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
|
/** Little struct-like class to hold read pair (and fragment) end data for
|
||||||
|
* MarkDuplicatesWithMateCigar **/
|
||||||
int16_t score = 0;
|
int16_t score = 0;
|
||||||
int64_t read1IndexInFile = -1;
|
int64_t read1IndexInFile = -1;
|
||||||
int64_t read2IndexInFile = -1;
|
int64_t read2IndexInFile = -1;
|
||||||
|
|
@ -62,23 +70,22 @@ struct ReadEnds : PhysicalLocation
|
||||||
/* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */
|
/* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */
|
||||||
// int32_t barcode = 0; // primary barcode for this read (and pair)
|
// int32_t barcode = 0; // primary barcode for this read (and pair)
|
||||||
// int32_t readOneBarcode = 0; // read one barcode, 0 if not present
|
// int32_t readOneBarcode = 0; // read one barcode, 0 if not present
|
||||||
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
|
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not
|
||||||
|
// paired
|
||||||
|
|
||||||
/* zzh增加的成员变量 */
|
/* zzh增加的成员变量 */
|
||||||
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
|
||||||
|
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
||||||
|
|
||||||
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
||||||
static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand)
|
static int8_t GetOrientationByte(bool read1NegativeStrand,
|
||||||
{
|
bool read2NegativeStrand) {
|
||||||
if (read1NegativeStrand)
|
if (read1NegativeStrand) {
|
||||||
{
|
|
||||||
if (read2NegativeStrand)
|
if (read2NegativeStrand)
|
||||||
return RR;
|
return RR;
|
||||||
else
|
else
|
||||||
return RF;
|
return RF;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
if (read2NegativeStrand)
|
if (read2NegativeStrand)
|
||||||
return FR;
|
return FR;
|
||||||
else
|
else
|
||||||
|
|
@ -87,47 +94,38 @@ struct ReadEnds : PhysicalLocation
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 比较两个readends是否一样(有个冗余) */
|
/* 比较两个readends是否一样(有个冗余) */
|
||||||
static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2)
|
static bool AreComparableForDuplicates(const ReadEnds &lhs,
|
||||||
{
|
const ReadEnds &rhs,
|
||||||
|
bool compareRead2) {
|
||||||
bool areComparable = true;
|
bool areComparable = true;
|
||||||
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
|
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
|
||||||
lhs.read1Coordinate == rhs.read1Coordinate &&
|
lhs.read1Coordinate == rhs.read1Coordinate &&
|
||||||
lhs.orientation == rhs.orientation;
|
lhs.orientation == rhs.orientation;
|
||||||
if (areComparable && compareRead2)
|
if (areComparable && compareRead2) {
|
||||||
{
|
areComparable =
|
||||||
areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
|
lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
|
||||||
lhs.read2Coordinate == rhs.read2Coordinate;
|
lhs.read2Coordinate == rhs.read2Coordinate;
|
||||||
}
|
}
|
||||||
return areComparable;
|
return areComparable;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 比对方向是否正向 */
|
/* 比对方向是否正向 */
|
||||||
bool IsPositiveStrand() const
|
bool IsPositiveStrand() const { return orientation == F; }
|
||||||
{
|
|
||||||
return orientation == F;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* pairend是否合适的比对上了 */
|
/* pairend是否合适的比对上了 */
|
||||||
bool IsPaired() const
|
bool IsPaired() const { return read2ReferenceIndex != -1; }
|
||||||
{
|
|
||||||
return read2ReferenceIndex != -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsNegativeStrand() const
|
bool IsNegativeStrand() const { return orientation == R; }
|
||||||
{
|
|
||||||
return orientation == R;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来
|
// 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来
|
||||||
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false)
|
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b,
|
||||||
{
|
bool compareRead2 = false) {
|
||||||
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read1Coordinate - b.read1Coordinate;
|
comp = a.read1Coordinate - b.read1Coordinate;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.orientation - b.orientation;
|
comp = a.orientation - b.orientation;
|
||||||
if (compareRead2)
|
if (compareRead2) {
|
||||||
{
|
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
|
|
@ -137,14 +135,12 @@ struct ReadEnds : PhysicalLocation
|
||||||
}
|
}
|
||||||
|
|
||||||
// 找某一个位置的所有readend时需要
|
// 找某一个位置的所有readend时需要
|
||||||
static bool pairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs)
|
static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) {
|
||||||
{
|
|
||||||
return ReadLittleThan(lhs, rhs, true);
|
return ReadLittleThan(lhs, rhs, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 比较函数
|
// 比较函数
|
||||||
bool operator < (const ReadEnds &o) const
|
bool operator<(const ReadEnds &o) const {
|
||||||
{
|
|
||||||
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read1Coordinate - o.read1Coordinate;
|
comp = read1Coordinate - o.read1Coordinate;
|
||||||
|
|
|
||||||
|
|
@ -9,11 +9,12 @@ Date : 2023/11/6
|
||||||
#ifndef READ_NAME_PARSER_H_
|
#ifndef READ_NAME_PARSER_H_
|
||||||
#define READ_NAME_PARSER_H_
|
#define READ_NAME_PARSER_H_
|
||||||
|
|
||||||
#include "read_ends.h"
|
|
||||||
#include <common/utils/util.h>
|
#include <common/utils/util.h>
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#include "read_ends.h"
|
||||||
// #include <regex>
|
// #include <regex>
|
||||||
#include <boost/regex.hpp>
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
|
|
@ -24,25 +25,30 @@ using std::string;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides access to the physical location information about a cluster.
|
* Provides access to the physical location information about a cluster.
|
||||||
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
|
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile
|
||||||
* non-zero positive integers, x and y coordinates may be negative.
|
* should only allow non-zero positive integers, x and y coordinates may be
|
||||||
* 非线程安全
|
* negative. 非线程安全
|
||||||
*/
|
*/
|
||||||
struct ReadNameParser
|
struct ReadNameParser {
|
||||||
{
|
|
||||||
/**
|
/**
|
||||||
* The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
|
* The read name regular expression (regex) is used to extract three pieces
|
||||||
* and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
|
* of information from the read name: tile, x location, and y location. Any
|
||||||
|
* read name regex should parse the read name to produce these and only
|
||||||
|
* these values. An example regex is:
|
||||||
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
||||||
* which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
|
* which assumes that fields in the read name are delimited by ':' and the
|
||||||
* ignoring any trailing non-digit characters.
|
* last three fields correspond to the tile, x and y locations, ignoring any
|
||||||
|
* trailing non-digit characters.
|
||||||
*
|
*
|
||||||
* The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
|
* The default regex is optimized for fast parsing (see {@link
|
||||||
* three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
|
* #getLastThreeFields(String, char, int[])}) by searching for the last
|
||||||
* where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
|
* three fields, ignoring any trailing non-digit characters, assuming the
|
||||||
* Illumina technology.
|
* delimiter ':'. This should consider correctly read names where we have 5
|
||||||
|
* or 7 field with the last three fields being tile/x/y, as is the case for
|
||||||
|
* the majority of read names produced by Illumina technology.
|
||||||
*/
|
*/
|
||||||
const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
const string DEFAULT_READ_NAME_REGEX =
|
||||||
|
"(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
||||||
|
|
||||||
string readNameStored = "";
|
string readNameStored = "";
|
||||||
PhysicalLocation physicalLocationStored;
|
PhysicalLocation physicalLocationStored;
|
||||||
|
|
@ -53,74 +59,73 @@ struct ReadNameParser
|
||||||
bool warnedAboutRegexNotMatching = true;
|
bool warnedAboutRegexNotMatching = true;
|
||||||
|
|
||||||
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
||||||
ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
|
ReadNameParser(const string &strReadNameRegex)
|
||||||
ReadNameParser(const string &strReadNameRegex, bool isWarn)
|
: ReadNameParser(strReadNameRegex, true) {}
|
||||||
{
|
ReadNameParser(const string &strReadNameRegex, bool isWarn) {
|
||||||
readNameRegex = strReadNameRegex;
|
readNameRegex = strReadNameRegex;
|
||||||
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||||||
useOptimizedDefaultParsing = true;
|
useOptimizedDefaultParsing = true;
|
||||||
else
|
else
|
||||||
useOptimizedDefaultParsing = false;
|
useOptimizedDefaultParsing = false;
|
||||||
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
readNamePattern =
|
||||||
|
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||||
warnedAboutRegexNotMatching = isWarn;
|
warnedAboutRegexNotMatching = isWarn;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 重新设置readNameRegex */
|
/* 重新设置readNameRegex */
|
||||||
void SetReadNameRegex(const string &strReadNameRegex)
|
void SetReadNameRegex(const string &strReadNameRegex) {
|
||||||
{
|
|
||||||
readNameRegex = strReadNameRegex;
|
readNameRegex = strReadNameRegex;
|
||||||
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||||||
useOptimizedDefaultParsing = true;
|
useOptimizedDefaultParsing = true;
|
||||||
else
|
else
|
||||||
useOptimizedDefaultParsing = false;
|
useOptimizedDefaultParsing = false;
|
||||||
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
readNamePattern =
|
||||||
|
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||||
// readNamePattern = strReadNameRegex;
|
// readNamePattern = strReadNameRegex;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 添加测序时候的tile x y 信息 */
|
/* 添加测序时候的tile x y 信息 */
|
||||||
bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
|
bool AddLocationInformation(const string &readName, PhysicalLocation *loc) {
|
||||||
{
|
if (!(readName == readNameStored)) {
|
||||||
if (!(readName == readNameStored))
|
if (ReadLocationInformation(readName, loc)) {
|
||||||
{
|
|
||||||
if (ReadLocationInformation(readName, loc))
|
|
||||||
{
|
|
||||||
readNameStored = readName;
|
readNameStored = readName;
|
||||||
physicalLocationStored = *loc;
|
physicalLocationStored = *loc;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// return false if read name cannot be parsed
|
// return false if read name cannot be parsed
|
||||||
return false;
|
return false;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
*loc = physicalLocationStored;
|
*loc = physicalLocationStored;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
|
* Method used to extract tile/x/y from the read name and add it to the
|
||||||
* can be used later to determine optical duplication
|
* PhysicalLocationShort so that it can be used later to determine optical
|
||||||
|
* duplication
|
||||||
*
|
*
|
||||||
* @param readName the name of the read/cluster
|
* @param readName the name of the read/cluster
|
||||||
* @param loc the object to add tile/x/y to
|
* @param loc the object to add tile/x/y to
|
||||||
* @return true if the read name contained the information in parsable form, false otherwise
|
* @return true if the read name contained the information in parsable form,
|
||||||
|
* false otherwise
|
||||||
*/
|
*/
|
||||||
bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
|
bool ReadLocationInformation(const string &readName,
|
||||||
{
|
PhysicalLocation *loc) {
|
||||||
try {
|
try {
|
||||||
// Optimized version if using the default read name regex (== used on purpose):
|
// Optimized version if using the default read name regex (== used
|
||||||
if (useOptimizedDefaultParsing)
|
// on purpose):
|
||||||
{
|
if (useOptimizedDefaultParsing) {
|
||||||
const int fields = getLastThreeFields(readName, ':');
|
const int fields = getLastThreeFields(readName, ':');
|
||||||
if (!(fields == 5 || fields == 7))
|
if (!(fields == 5 || fields == 7)) {
|
||||||
{
|
if (warnedAboutRegexNotMatching) {
|
||||||
if (warnedAboutRegexNotMatching)
|
|
||||||
{
|
|
||||||
Warn(
|
Warn(
|
||||||
"Default READ_NAME_REGEX '%s' did not match read name '%s'."
|
"Default READ_NAME_REGEX '%s' did not match read "
|
||||||
"You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
|
"name '%s'."
|
||||||
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
"You may need to specify a READ_NAME_REGEX in "
|
||||||
|
"order to correctly identify optical duplicates. "
|
||||||
|
"Note that this message will not be emitted again "
|
||||||
|
"even if other read names do not match the regex.",
|
||||||
readNameRegex.c_str(), readName.c_str());
|
readNameRegex.c_str(), readName.c_str());
|
||||||
warnedAboutRegexNotMatching = false;
|
warnedAboutRegexNotMatching = false;
|
||||||
}
|
}
|
||||||
|
|
@ -130,13 +135,9 @@ struct ReadNameParser
|
||||||
loc->x = tmpLocationFields[1];
|
loc->x = tmpLocationFields[1];
|
||||||
loc->y = tmpLocationFields[2];
|
loc->y = tmpLocationFields[2];
|
||||||
return true;
|
return true;
|
||||||
}
|
} else if (readNameRegex.empty()) {
|
||||||
else if (readNameRegex.empty())
|
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
// Standard version that will use the regex
|
// Standard version that will use the regex
|
||||||
cmatch m;
|
cmatch m;
|
||||||
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
||||||
|
|
@ -144,28 +145,25 @@ struct ReadNameParser
|
||||||
loc->x = std::stoi(m[2].str());
|
loc->x = std::stoi(m[2].str());
|
||||||
loc->y = std::stoi(m[3].str());
|
loc->y = std::stoi(m[3].str());
|
||||||
return true;
|
return true;
|
||||||
}
|
} else {
|
||||||
else
|
if (warnedAboutRegexNotMatching) {
|
||||||
{
|
|
||||||
if (warnedAboutRegexNotMatching)
|
|
||||||
{
|
|
||||||
Warn(
|
Warn(
|
||||||
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
||||||
"Your regex may not be correct. "
|
"Your regex may not be correct. "
|
||||||
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
"Note that this message will not be emitted again "
|
||||||
|
"even if other read names do not match the regex.",
|
||||||
readNameRegex.c_str(), readName.c_str());
|
readNameRegex.c_str(), readName.c_str());
|
||||||
warnedAboutRegexNotMatching = false;
|
warnedAboutRegexNotMatching = false;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} catch (const std::runtime_error &e) {
|
||||||
catch (const std::runtime_error &e)
|
if (warnedAboutRegexNotMatching) {
|
||||||
{
|
|
||||||
if (warnedAboutRegexNotMatching)
|
|
||||||
{
|
|
||||||
Warn(
|
Warn(
|
||||||
"A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
|
"A field parsed out of a read name was expected to contain "
|
||||||
|
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
||||||
|
"%s; Error Msg: %s",
|
||||||
readNameRegex.c_str(), readName.c_str(), e.what());
|
readNameRegex.c_str(), readName.c_str(), e.what());
|
||||||
warnedAboutRegexNotMatching = false;
|
warnedAboutRegexNotMatching = false;
|
||||||
}
|
}
|
||||||
|
|
@ -175,39 +173,39 @@ struct ReadNameParser
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
|
* Given a string, splits the string by the delimiter, and returns the the
|
||||||
* considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
|
* last three fields parsed as integers. Parsing a field considers only a
|
||||||
|
* sequence of digits up until the first non-digit character. The three
|
||||||
|
* values are stored in the passed-in array.
|
||||||
*
|
*
|
||||||
* @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
|
* @throws NumberFormatException if any of the tokens that should contain
|
||||||
|
* numbers do not start with parsable numbers
|
||||||
*/
|
*/
|
||||||
int getLastThreeFields(const string &readName, char delim)
|
int getLastThreeFields(const string &readName, char delim) {
|
||||||
{
|
|
||||||
int tokensIdx = 2; // start at the last token
|
int tokensIdx = 2; // start at the last token
|
||||||
int numFields = 0;
|
int numFields = 0;
|
||||||
int i, endIdx;
|
int i, endIdx;
|
||||||
endIdx = readName.size();
|
endIdx = readName.size();
|
||||||
// find the last three tokens only
|
// find the last three tokens only
|
||||||
for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
|
for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) {
|
||||||
{
|
if (readName.at(i) == delim || 0 == i) {
|
||||||
if (readName.at(i) == delim || 0 == i)
|
|
||||||
{
|
|
||||||
numFields++;
|
numFields++;
|
||||||
const int startIdx = (0 == i) ? 0 : (i + 1);
|
const int startIdx = (0 == i) ? 0 : (i + 1);
|
||||||
tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
tmpLocationFields[tokensIdx] =
|
||||||
|
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||||||
tokensIdx--;
|
tokensIdx--;
|
||||||
endIdx = i;
|
endIdx = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// continue to find the # of fields
|
// continue to find the # of fields
|
||||||
while (0 <= i)
|
while (0 <= i) {
|
||||||
{
|
|
||||||
if (readName.at(i) == delim || 0 == i)
|
if (readName.at(i) == delim || 0 == i)
|
||||||
numFields++;
|
numFields++;
|
||||||
i--;
|
i--;
|
||||||
}
|
}
|
||||||
if (numFields < 3)
|
if (numFields < 3) {
|
||||||
{
|
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] =
|
||||||
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
|
-1;
|
||||||
numFields = -1;
|
numFields = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue