检查optical duplication,用graph来检测,正在写这个
This commit is contained in:
parent
38bc489004
commit
022be611cd
|
|
@ -1,3 +1,6 @@
|
|||
# for fast-markdup
|
||||
*.sam
|
||||
*.log
|
||||
# ---> C++
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
|
|
|||
|
|
@ -13,10 +13,10 @@
|
|||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||
"args": [
|
||||
"MarkDuplicates",
|
||||
"--INPUT", "/mnt/d/data/100w.bam",
|
||||
"--INPUT", "~/data/bam/100w.bam",
|
||||
"--OUTPUT", "out.bam",
|
||||
"--METRICS_FILE", "metrics.txt",
|
||||
"--num_threads", "12",
|
||||
"--num_threads", "1",
|
||||
"--max_mem", "4G",
|
||||
"--verbosity", "DEBUG",
|
||||
"--asyncio", "true",
|
||||
|
|
|
|||
|
|
@ -88,6 +88,13 @@
|
|||
"typeindex": "cpp",
|
||||
"typeinfo": "cpp",
|
||||
"valarray": "cpp",
|
||||
"variant": "cpp"
|
||||
"variant": "cpp",
|
||||
"__split_buffer": "cpp",
|
||||
"executor": "cpp",
|
||||
"io_context": "cpp",
|
||||
"netfwd": "cpp",
|
||||
"timer": "cpp",
|
||||
"__nullptr": "cpp",
|
||||
"__node_handle": "c"
|
||||
}
|
||||
}
|
||||
4
build.sh
4
build.sh
|
|
@ -1,8 +1,8 @@
|
|||
#!/bin/bash
|
||||
dir="/home/zzh/work/GeneKit/picard_cpp/build"
|
||||
dir="/home/zzh/work/ngs/picard_cpp/build"
|
||||
[ -d "$dir" ] && rm -rf "$dir"
|
||||
mkdir "$dir"
|
||||
cd "$dir"
|
||||
#cmake .. -DCMAKE_BUILD_TYPE=Debug
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||
make -j 8
|
||||
make -j 16
|
||||
|
|
|
|||
15
run.sh
15
run.sh
|
|
@ -1,9 +1,14 @@
|
|||
time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
||||
input=~/data/bam/zy_normal.bam
|
||||
#input=~/data/bam/zy_tumor.bam
|
||||
#input=~/data/bam/100w.bam
|
||||
|
||||
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
||||
MarkDuplicates \
|
||||
--INPUT /mnt/d/data/zy_tumor.bam \
|
||||
--OUTPUT /mnt/d/data/out.bam \
|
||||
--INPUT $input \
|
||||
--OUTPUT ~/data/bam/out.bam \
|
||||
--INDEX_FORMAT BAI \
|
||||
--num_threads 1 \
|
||||
--max_mem 4G \
|
||||
--max_mem 2G \
|
||||
--verbosity DEBUG \
|
||||
--asyncio true #\
|
||||
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||
|
|
@ -12,4 +17,4 @@ time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
|||
# --INPUT /mnt/d/data/100w.bam \
|
||||
# --INPUT /mnt/d/data/zy_normal.bam \
|
||||
# zy_tumor
|
||||
# tumor_region
|
||||
# tumor_region
|
||||
|
|
|
|||
|
|
@ -13,116 +13,87 @@
|
|||
* BamBuf类
|
||||
*/
|
||||
// 读取数据直到读完,或者缓冲区满
|
||||
int BamBuf::ReadBam()
|
||||
{
|
||||
int BamBuf::ReadBam() {
|
||||
int read_num = 0;
|
||||
if (handle_last)
|
||||
{ // 处理上次读入的最后一个bam
|
||||
if (has_enough_space())
|
||||
{ // 必须调用,在边界处调整memffset
|
||||
if (handle_last) { // 处理上次读入的最后一个bam
|
||||
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||
++read_num;
|
||||
append_one_bam();
|
||||
}
|
||||
else
|
||||
{
|
||||
return read_num; // 还是没空间
|
||||
} else {
|
||||
return read_num; // 还是没空间
|
||||
}
|
||||
}
|
||||
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0)
|
||||
{
|
||||
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
|
||||
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
|
||||
if (has_enough_space())
|
||||
{ // 还有空间
|
||||
if (has_enough_space()) { // 还有空间
|
||||
append_one_bam();
|
||||
++read_num; // 放进缓存才算读取到
|
||||
}
|
||||
else
|
||||
{
|
||||
++read_num; // 放进缓存才算读取到
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (read_stat_ >= 0)
|
||||
{
|
||||
if (read_stat_ >= 0) {
|
||||
handle_last = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
handle_last = false;
|
||||
}
|
||||
return read_num;
|
||||
}
|
||||
|
||||
// 初始化缓存
|
||||
void BamBuf::Init(samFile *fp,
|
||||
sam_hdr_t *hdr,
|
||||
int64_t mem_size)
|
||||
{
|
||||
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||
this->fp = fp;
|
||||
this->hdr = hdr;
|
||||
this->mem_size = mem_size;
|
||||
this->mem = (uint8_t *)malloc(mem_size);
|
||||
this->bw = (BamWrap *)malloc(sizeof(BamWrap));
|
||||
this->bw->b = bam_init1();
|
||||
if (bw == NULL ||
|
||||
this->mem == NULL ||
|
||||
this->bw->b == NULL)
|
||||
{
|
||||
if (bw == NULL || this->mem == NULL || this->bw->b == NULL) {
|
||||
fprintf(stderr, "allocate memory failed! Abort\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void BamBuf::ClearBeforeIdx(size_t idxInBv)
|
||||
{
|
||||
void BamBuf::ClearBeforeIdx(size_t idxInBv) {
|
||||
if (idxInBv < 1)
|
||||
return;
|
||||
int i = 0, j = idxInBv;
|
||||
for (; j < bv.size(); ++i, ++j)
|
||||
{
|
||||
for (; j < bv.size(); ++i, ++j) {
|
||||
bv[i] = bv[j];
|
||||
}
|
||||
bv.resize(i);
|
||||
prepare_read();
|
||||
}
|
||||
|
||||
void BamBuf::ClearAll()
|
||||
{
|
||||
|
||||
void BamBuf::ClearAll() {
|
||||
bv.clear();
|
||||
prepare_read();
|
||||
}
|
||||
|
||||
// 为下一次读取做准备, 计算一些边界条件
|
||||
inline void BamBuf::prepare_read()
|
||||
{
|
||||
inline void BamBuf::prepare_read() {
|
||||
// 计算余留的下次计算可能用到的bam所占的位置
|
||||
if (bv.size() > 0)
|
||||
{
|
||||
if (bv.size() > 0) {
|
||||
BamWrap *bw = bv[0];
|
||||
legacy_start = (int64_t)bw - (int64_t)mem;
|
||||
bw = bv.back();
|
||||
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
legacy_start = legacy_end = 0;
|
||||
mem_offset = 0; // 上次没剩下,那就从头存储
|
||||
mem_offset = 0; // 上次没剩下,那就从头存储
|
||||
}
|
||||
}
|
||||
|
||||
// 检查缓存是否还有空间
|
||||
inline bool BamBuf::has_enough_space()
|
||||
{
|
||||
inline bool BamBuf::has_enough_space() {
|
||||
const uint32_t bam_len = bw->length();
|
||||
int64_t potential_end = mem_offset + bam_len;
|
||||
if (legacy_start <= legacy_end)
|
||||
legacy_start += mem_size;
|
||||
if (potential_end >= legacy_start)
|
||||
{
|
||||
if (potential_end >= legacy_start) {
|
||||
return false;
|
||||
}
|
||||
if (potential_end >= mem_size)
|
||||
{
|
||||
if (potential_end >= mem_size) {
|
||||
mem_offset = 0;
|
||||
}
|
||||
int64_t virtual_offset = mem_offset;
|
||||
|
|
@ -133,8 +104,7 @@ inline bool BamBuf::has_enough_space()
|
|||
}
|
||||
|
||||
// 处理一个读取后的bam
|
||||
inline void BamBuf::append_one_bam()
|
||||
{
|
||||
inline void BamBuf::append_one_bam() {
|
||||
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
|
||||
*bwp = *bw;
|
||||
bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
|
||||
|
|
@ -148,12 +118,9 @@ inline void BamBuf::append_one_bam()
|
|||
}
|
||||
|
||||
// 处理上次读入的最后一个read
|
||||
inline bool BamBuf::handle_last_read()
|
||||
{
|
||||
if (handle_last)
|
||||
{ // 处理上次读入的最后一个bam
|
||||
if (has_enough_space())
|
||||
{ // 必须调用,在边界处调整memffset
|
||||
inline bool BamBuf::handle_last_read() {
|
||||
if (handle_last) { // 处理上次读入的最后一个bam
|
||||
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||
append_one_bam();
|
||||
handle_last = false;
|
||||
return true;
|
||||
|
|
@ -166,49 +133,35 @@ inline bool BamBuf::handle_last_read()
|
|||
* AsyncIoBamBuf 类
|
||||
*/
|
||||
// 初始化缓存
|
||||
void AsyncIoBamBuf::Init(samFile *fp,
|
||||
sam_hdr_t *hdr,
|
||||
int64_t mem_size)
|
||||
{
|
||||
if (use_async_io_)
|
||||
{
|
||||
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||
if (use_async_io_) {
|
||||
buf1_.Init(fp, hdr, mem_size >> 1);
|
||||
buf2_.Init(fp, hdr, mem_size >> 1);
|
||||
pi_ = &buf1_;
|
||||
po_ = &buf2_;
|
||||
tid_ = (pthread_t *)malloc(sizeof(pthread_t));
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
buf1_.Init(fp, hdr, mem_size);
|
||||
pi_ = &buf1_;
|
||||
}
|
||||
}
|
||||
|
||||
// 读取数据
|
||||
int AsyncIoBamBuf::ReadBam()
|
||||
{
|
||||
if (use_async_io_)
|
||||
{
|
||||
int AsyncIoBamBuf::ReadBam() {
|
||||
if (use_async_io_) {
|
||||
hasThread = true;
|
||||
return async_read_bam();
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
return sync_read_bam();
|
||||
}
|
||||
}
|
||||
|
||||
int AsyncIoBamBuf::sync_read_bam()
|
||||
{
|
||||
int AsyncIoBamBuf::sync_read_bam() {
|
||||
int read_num = 0;
|
||||
if (clear_all_)
|
||||
{
|
||||
if (clear_all_) {
|
||||
clear_all_ = false;
|
||||
pi_->ClearAll();
|
||||
}
|
||||
else if (clear_before_idx_ > 0)
|
||||
{
|
||||
} else if (clear_before_idx_ > 0) {
|
||||
pi_->ClearBeforeIdx(clear_before_idx_);
|
||||
clear_before_idx_ = 0;
|
||||
}
|
||||
|
|
@ -217,23 +170,18 @@ int AsyncIoBamBuf::sync_read_bam()
|
|||
return read_num;
|
||||
}
|
||||
|
||||
int AsyncIoBamBuf::async_read_bam()
|
||||
{
|
||||
int AsyncIoBamBuf::async_read_bam() {
|
||||
int read_num = 0;
|
||||
if (first_read_)
|
||||
{
|
||||
if (first_read_) {
|
||||
read_num = pi_->ReadBam();
|
||||
first_read_ = false;
|
||||
refresh_bam_arr();
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
// join, 交换缓冲区指针
|
||||
pthread_join(*tid_, 0);
|
||||
resize_buf();
|
||||
|
||||
if (need_read_)
|
||||
{ // 需要交换指针
|
||||
if (need_read_) { // 需要交换指针
|
||||
BamBuf *tmp = pi_;
|
||||
pi_ = po_;
|
||||
po_ = tmp;
|
||||
|
|
@ -246,72 +194,52 @@ int AsyncIoBamBuf::async_read_bam()
|
|||
return read_num;
|
||||
}
|
||||
|
||||
void *AsyncIoBamBuf::async_read(void *data)
|
||||
{
|
||||
void *AsyncIoBamBuf::async_read(void *data) {
|
||||
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
|
||||
if (ab->need_read_ && ab->ReadStat() >= 0)
|
||||
{ // 需要读取
|
||||
if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取
|
||||
ab->last_read_num_ = ab->po_->ReadBam();
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
ab->last_read_num_ = 0;
|
||||
}
|
||||
pthread_exit(0);
|
||||
}
|
||||
|
||||
// 为下一次读取做准备, 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取
|
||||
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv)
|
||||
{
|
||||
// 为下一次读取做准备,
|
||||
// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取
|
||||
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) {
|
||||
clear_before_idx_ = idxInBv;
|
||||
}
|
||||
|
||||
// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取
|
||||
void AsyncIoBamBuf::ClearAll()
|
||||
{
|
||||
clear_all_ = true;
|
||||
}
|
||||
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
|
||||
|
||||
inline void AsyncIoBamBuf::resize_buf()
|
||||
{
|
||||
if (clear_all_)
|
||||
{ // 清理上一轮的数据
|
||||
inline void AsyncIoBamBuf::resize_buf() {
|
||||
if (clear_all_) { // 清理上一轮的数据
|
||||
clear_all_ = false;
|
||||
po_->ClearBeforeIdx(legacy_size_);
|
||||
pi_->ClearAll();
|
||||
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
|
||||
{
|
||||
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
|
||||
last_read_num_ += 1;
|
||||
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||
need_read_ = true;
|
||||
}
|
||||
else // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||
{
|
||||
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||
legacy_size_ = 0;
|
||||
need_read_ = false;
|
||||
}
|
||||
}
|
||||
else if (clear_before_idx_ > 0)
|
||||
{
|
||||
if (clear_before_idx_ < legacy_size_)
|
||||
{
|
||||
} else if (clear_before_idx_ > 0) {
|
||||
if (clear_before_idx_ < legacy_size_) {
|
||||
po_->ClearBeforeIdx(clear_before_idx_);
|
||||
legacy_size_ -= clear_before_idx_;
|
||||
// 不需要交换指针,不需要读取
|
||||
need_read_ = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
po_->ClearBeforeIdx(legacy_size_);
|
||||
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
|
||||
if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
|
||||
{
|
||||
if (pi_->handle_last_read()) {// 上次读取有一个read没放入缓存
|
||||
last_read_num_ += 1;
|
||||
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||
need_read_ = true;
|
||||
}
|
||||
else // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||
{
|
||||
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||
legacy_size_ = 0;
|
||||
need_read_ = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,19 +10,18 @@
|
|||
#ifndef BAM_BUF_H_
|
||||
#define BAM_BUF_H_
|
||||
|
||||
#include <vector>
|
||||
#include <stdint.h>
|
||||
#include <htslib/sam.h>
|
||||
#include <pthread.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <functional>
|
||||
#include <pthread.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
#include <htslib/sam.h>
|
||||
#include <vector>
|
||||
|
||||
#include "bam_wrap.h"
|
||||
|
||||
|
|
@ -32,24 +31,22 @@ using namespace std;
|
|||
/*
|
||||
* 存放读入的bam数据
|
||||
*/
|
||||
struct BamBuf
|
||||
{
|
||||
sam_hdr_t *hdr; // sam文件的header信息
|
||||
samFile *fp; // sam文件指针
|
||||
BamWrap *bw = nullptr; // 用来循环读入bam
|
||||
uint8_t *mem = nullptr; // 用来存放bam的数据, 程序结束后自动释放,所以没在析构函数里释放
|
||||
int64_t mem_offset = 0; // 下一次要存放的位置
|
||||
int64_t mem_size; // 缓存大小
|
||||
int read_stat_ = 0; // 读取状态,是否读完
|
||||
vector<BamWrap *> bv; // 方便对bam数据的访问
|
||||
int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间
|
||||
int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间
|
||||
bool handle_last = false; // 上次最后读入的bam是否需要处理
|
||||
struct BamBuf {
|
||||
sam_hdr_t *hdr; // sam文件的header信息
|
||||
samFile *fp; // sam文件指针
|
||||
BamWrap *bw = nullptr; // 用来循环读入bam
|
||||
uint8_t *mem = nullptr; // 用来存放bam的数据,
|
||||
// 程序结束后自动释放,所以没在析构函数里释放
|
||||
int64_t mem_offset = 0; // 下一次要存放的位置
|
||||
int64_t mem_size; // 缓存大小
|
||||
int read_stat_ = 0; // 读取状态,是否读完
|
||||
vector<BamWrap *> bv; // 方便对bam数据的访问
|
||||
int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间
|
||||
int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间
|
||||
bool handle_last = false; // 上次最后读入的bam是否需要处理
|
||||
|
||||
// 初始化缓存
|
||||
void Init(samFile *fp,
|
||||
sam_hdr_t *hdr,
|
||||
int64_t mem_size);
|
||||
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
||||
// 读取数据直到读完,或者缓冲区满
|
||||
int ReadBam();
|
||||
// 为下一次读取做准备, 计算一些边界条件
|
||||
|
|
@ -57,27 +54,24 @@ struct BamBuf
|
|||
// 清空上一次所有读入的数据
|
||||
void ClearAll();
|
||||
inline int64_t Size() { return bv.size(); } // 包含多少个read
|
||||
inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全)
|
||||
~BamBuf()
|
||||
{
|
||||
if (this->mem != nullptr)
|
||||
{
|
||||
inline int ReadStat() {
|
||||
return read_stat_;
|
||||
} // 文件的读取状态,是否可读(读取完全)
|
||||
~BamBuf() {
|
||||
if (this->mem != nullptr) {
|
||||
free(this->mem);
|
||||
}
|
||||
if (this->bw != nullptr)
|
||||
{
|
||||
if (this->bw != nullptr) {
|
||||
bam_destroy1(bw->b);
|
||||
free(bw);
|
||||
}
|
||||
}
|
||||
void FreeMemory() // 释放开辟的内存
|
||||
void FreeMemory() // 释放开辟的内存
|
||||
{
|
||||
if (this->mem != nullptr)
|
||||
{
|
||||
if (this->mem != nullptr) {
|
||||
free(this->mem);
|
||||
}
|
||||
if (this->bw != nullptr)
|
||||
{
|
||||
if (this->bw != nullptr) {
|
||||
bam_destroy1(bw->b);
|
||||
free(bw);
|
||||
}
|
||||
|
|
@ -102,31 +96,31 @@ struct BamBuf
|
|||
/*
|
||||
* io异步缓冲区
|
||||
*/
|
||||
struct AsyncIoBamBuf
|
||||
{
|
||||
struct AsyncIoBamBuf {
|
||||
BamBuf buf1_;
|
||||
BamBuf buf2_;
|
||||
BamBuf *pi_; // 当前用的buf
|
||||
BamBuf *po_; // 后台在读取的buf
|
||||
BamBuf *pi_; // 当前用的buf
|
||||
BamBuf *po_; // 后台在读取的buf
|
||||
pthread_t *tid_ = NULL;
|
||||
bool hasThread = false;
|
||||
int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量
|
||||
int64_t legacy_size_ =
|
||||
0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量
|
||||
bool first_read_ = true;
|
||||
int last_read_num_ = 0; // 上一次读取了多少reads
|
||||
int last_read_num_ = 0; // 上一次读取了多少reads
|
||||
bool need_read_ = true;
|
||||
bool use_async_io_ = true;
|
||||
int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads
|
||||
bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads
|
||||
int64_t clear_before_idx_ =
|
||||
0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads
|
||||
bool clear_all_ =
|
||||
false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads
|
||||
|
||||
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
|
||||
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
|
||||
|
||||
AsyncIoBamBuf() {}
|
||||
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
|
||||
// 析构
|
||||
~AsyncIoBamBuf()
|
||||
{
|
||||
if (tid_ != NULL)
|
||||
{
|
||||
~AsyncIoBamBuf() {
|
||||
if (tid_ != NULL) {
|
||||
if (hasThread)
|
||||
pthread_join(*tid_, 0);
|
||||
free(tid_);
|
||||
|
|
@ -136,35 +130,29 @@ struct AsyncIoBamBuf
|
|||
}
|
||||
|
||||
// 初始化缓存
|
||||
void Init(samFile *fp,
|
||||
sam_hdr_t *hdr,
|
||||
int64_t mem_size);
|
||||
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
||||
|
||||
// 读取数据
|
||||
int ReadBam();
|
||||
// 为下一次读取做准备, 计算一些边界条件
|
||||
void ClearBeforeIdx(size_t idxInBv);
|
||||
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // 获取bam array
|
||||
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // 获取bam array
|
||||
// 清空上一次所有读入的数据
|
||||
void ClearAll();
|
||||
// 包含的read数量
|
||||
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
|
||||
inline int ReadStat() { return pi_->read_stat_; }
|
||||
inline BamWrap *operator[](int64_t pos)
|
||||
{
|
||||
return bam_arr_[pos];
|
||||
}
|
||||
inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
|
||||
// 获取某一段reads
|
||||
inline vector<BamWrap*> Slice(size_t startIdx, size_t endIdx)
|
||||
{
|
||||
inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
|
||||
if (endIdx > startIdx) {
|
||||
auto begItr = bam_arr_.begin();
|
||||
return std::move(vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
|
||||
return std::move(
|
||||
vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
|
||||
}
|
||||
return std::move(vector<BamWrap *>());
|
||||
}
|
||||
void FreeMemory()
|
||||
{
|
||||
void FreeMemory() {
|
||||
buf1_.FreeMemory();
|
||||
buf2_.FreeMemory();
|
||||
}
|
||||
|
|
@ -176,11 +164,9 @@ struct AsyncIoBamBuf
|
|||
// 异步读取线程函数
|
||||
static void *async_read(void *data);
|
||||
void resize_buf();
|
||||
inline void refresh_bam_arr()
|
||||
{
|
||||
inline void refresh_bam_arr() {
|
||||
bam_arr_.resize(this->Size());
|
||||
for (int i = 0; i < bam_arr_.size(); ++i)
|
||||
{
|
||||
for (int i = 0; i < bam_arr_.size(); ++i) {
|
||||
if (i < legacy_size_)
|
||||
bam_arr_[i] = (*po_)[i];
|
||||
else
|
||||
|
|
|
|||
|
|
@ -9,15 +9,14 @@
|
|||
|
||||
#ifndef BAM_WRAP_H_
|
||||
#define BAM_WRAP_H_
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
#include <htslib/sam.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <htslib/sam.h>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
|
@ -28,52 +27,32 @@ using namespace std;
|
|||
/*
|
||||
* sam read的封装
|
||||
*/
|
||||
struct BamWrap
|
||||
{
|
||||
struct BamWrap {
|
||||
// 将contig左移后加上pos作为全局位置
|
||||
const static int MAX_CONTIG_LEN_SHIFT = 30;
|
||||
const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起
|
||||
const static int READ_MAX_LENGTH = 200;
|
||||
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
|
||||
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
|
||||
|
||||
// 成员变量尽量少,减少占用内存空间
|
||||
bam1_t *b;
|
||||
int64_t end_pos_; // bam的全局结束位置, 闭区间
|
||||
int64_t end_pos_; // bam的全局结束位置, 闭区间
|
||||
|
||||
// 全局开始位置
|
||||
inline int64_t start_pos()
|
||||
{
|
||||
return bam_global_pos(b);
|
||||
}
|
||||
inline int64_t start_pos() { return bam_global_pos(b); }
|
||||
// 全局结束位置
|
||||
inline int64_t end_pos()
|
||||
{
|
||||
return end_pos_;
|
||||
}
|
||||
inline int64_t end_pos() { return end_pos_; }
|
||||
// 和reference对应的序列长度
|
||||
inline int16_t read_len()
|
||||
{
|
||||
return (end_pos_ - start_pos() + 1);
|
||||
}
|
||||
inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
|
||||
|
||||
// 在contig内的开始位置
|
||||
inline int32_t contig_pos()
|
||||
{
|
||||
return b->core.pos;
|
||||
}
|
||||
inline int32_t contig_pos() { return b->core.pos; }
|
||||
// 在contig内部的结束位置
|
||||
inline int32_t contig_end_pos()
|
||||
{
|
||||
return bam_pos(end_pos_);
|
||||
}
|
||||
inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
|
||||
// 序列的长度(AGTC字母个数)
|
||||
inline int16_t seq_len()
|
||||
{
|
||||
return b->core.l_qseq;
|
||||
}
|
||||
inline int16_t seq_len() { return b->core.l_qseq; }
|
||||
|
||||
// 算上开头的softclip
|
||||
inline int32_t softclip_start()
|
||||
{
|
||||
inline int32_t softclip_start() {
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
const char c = bam_cigar_opchr(cigar[0]);
|
||||
|
|
@ -84,8 +63,7 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 算上结尾的softclip
|
||||
inline int32_t softclip_end()
|
||||
{
|
||||
inline int32_t softclip_end() {
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
||||
|
|
@ -96,8 +74,7 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 算上结尾的softclip
|
||||
inline int32_t right_softclip_len()
|
||||
{
|
||||
inline int32_t right_softclip_len() {
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
||||
|
|
@ -108,14 +85,12 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 获取序列
|
||||
inline std::string sequence()
|
||||
{
|
||||
inline std::string sequence() {
|
||||
ostringstream oss;
|
||||
char *seq = (char *)bam_get_seq(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
||||
for (int i = 0; i < bc.l_qseq; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.l_qseq; ++i) {
|
||||
char base = base_to_char[bam_seqi(seq, i)];
|
||||
oss << base;
|
||||
}
|
||||
|
|
@ -123,18 +98,13 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 获取名字
|
||||
inline const char *query_name()
|
||||
{
|
||||
return bam_get_qname(b);
|
||||
}
|
||||
inline const char *query_name() { return bam_get_qname(b); }
|
||||
// 获取cigar 字符串
|
||||
inline string cigar_str()
|
||||
{
|
||||
inline string cigar_str() {
|
||||
ostringstream oss;
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
oss << len << c;
|
||||
|
|
@ -143,21 +113,14 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 占用的内存大小
|
||||
inline int16_t length()
|
||||
{
|
||||
return sizeof(*this) +
|
||||
sizeof(bam1_t) +
|
||||
b->l_data;
|
||||
}
|
||||
inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; }
|
||||
|
||||
// 获取cigar中insert的总长度
|
||||
inline int32_t insert_cigar_len()
|
||||
{
|
||||
inline int32_t insert_cigar_len() {
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
int ret = 0;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'I')
|
||||
|
|
@ -167,13 +130,11 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 获取cigar中delete的总长度
|
||||
inline int32_t del_cigar_len()
|
||||
{
|
||||
inline int32_t del_cigar_len() {
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
int ret = 0;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'D')
|
||||
|
|
@ -183,13 +144,11 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 计算sam read的终点位置
|
||||
static inline int64_t BamEndPos(const bam1_t *b)
|
||||
{
|
||||
static inline int64_t BamEndPos(const bam1_t *b) {
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
int start_offset = -1;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X')
|
||||
|
|
@ -198,31 +157,22 @@ struct BamWrap
|
|||
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
|
||||
};
|
||||
|
||||
bool HasWellDefinedFragmentSize()
|
||||
{
|
||||
bool HasWellDefinedFragmentSize() {
|
||||
const bam1_core_t &bc = b->core;
|
||||
bool hasWellDefinedFragmentSize = true;
|
||||
if (bc.isize == 0 ||
|
||||
!(bc.flag & BAM_FPAIRED) ||
|
||||
((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
|
||||
((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE)))
|
||||
{
|
||||
if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
|
||||
((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) {
|
||||
hasWellDefinedFragmentSize = false;
|
||||
}
|
||||
else if (bc.flag & BAM_FREVERSE)
|
||||
{
|
||||
} else if (bc.flag & BAM_FREVERSE) {
|
||||
hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false;
|
||||
}
|
||||
return hasWellDefinedFragmentSize;
|
||||
}
|
||||
|
||||
// 计算bam的adapterBoundary
|
||||
int GetAdapterBoundary()
|
||||
{
|
||||
int GetAdapterBoundary() {
|
||||
const bam1_core_t &bc = b->core;
|
||||
int adapterBoundary;
|
||||
if (!HasWellDefinedFragmentSize())
|
||||
|
|
@ -230,39 +180,34 @@ struct BamWrap
|
|||
else if (bc.flag & BAM_FREVERSE)
|
||||
adapterBoundary = bc.mpos - 1;
|
||||
else
|
||||
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1
|
||||
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1
|
||||
return adapterBoundary;
|
||||
}
|
||||
|
||||
// 获取开头的I的长度
|
||||
inline int GetHeadInsertLen()
|
||||
{
|
||||
inline int GetHeadInsertLen() {
|
||||
int insLen = 0;
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'I')
|
||||
{
|
||||
if (c == 'I') {
|
||||
insLen = len;
|
||||
break;
|
||||
}
|
||||
else if (c != 'H' && c != 'S')
|
||||
} else if (c != 'H' && c != 'S')
|
||||
break;
|
||||
}
|
||||
return insLen;
|
||||
}
|
||||
|
||||
// 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, 注意开头的I要当做S?)
|
||||
inline int64_t GetSoftStart()
|
||||
{
|
||||
// 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?,
|
||||
// 注意开头的I要当做S?)
|
||||
inline int64_t GetSoftStart() {
|
||||
int64_t softStart = b->core.pos;
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'S' || c == 'I')
|
||||
|
|
@ -274,13 +219,11 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 获取unclipped开始位置(包括hardclip)
|
||||
inline int64_t GetUnclippedStart()
|
||||
{
|
||||
inline int64_t GetUnclippedStart() {
|
||||
int64_t start = b->core.pos;
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'S' || c == 'H')
|
||||
|
|
@ -292,13 +235,11 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 获取unclipped结束位置(包括hardclip)
|
||||
inline int64_t GetUnclippedEnd()
|
||||
{
|
||||
inline int64_t GetUnclippedEnd() {
|
||||
int64_t end_pos = bam_endpos(b);
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
for (int i = bc.n_cigar - 1; i >= 0; --i)
|
||||
{
|
||||
for (int i = bc.n_cigar - 1; i >= 0; --i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
if (c == 'S' || c == 'H')
|
||||
|
|
@ -311,12 +252,10 @@ struct BamWrap
|
|||
|
||||
/* 获取碱基质量分数的加和 */
|
||||
/** Calculates a score for the read which is the sum of scores over Q15. */
|
||||
inline int GetSumOfBaseQualities()
|
||||
{
|
||||
inline int GetSumOfBaseQualities() {
|
||||
int score = 0;
|
||||
uint8_t *qual = bam_get_qual(b);
|
||||
for (int i = 0; i < b->core.l_qseq; ++i)
|
||||
{
|
||||
for (int i = 0; i < b->core.l_qseq; ++i) {
|
||||
if (qual[i] >= 15)
|
||||
score += qual[i];
|
||||
}
|
||||
|
|
@ -327,93 +266,63 @@ struct BamWrap
|
|||
/* 与flag相关的检测 */
|
||||
|
||||
/* 没有比对上 unmapped */
|
||||
inline bool GetReadUnmappedFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FUNMAP;
|
||||
}
|
||||
inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
|
||||
|
||||
/* Template having multiple segments in sequencing */
|
||||
inline bool GetReadPairedFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FPAIRED;
|
||||
}
|
||||
inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; }
|
||||
|
||||
/**
|
||||
* the read fails platform/vendor quality checks.
|
||||
*/
|
||||
inline bool GetReadFailsVendorQualityCheckFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FQCFAIL;
|
||||
}
|
||||
inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; }
|
||||
|
||||
/**
|
||||
* the mate is unmapped.
|
||||
*/
|
||||
bool GetMateUnmappedFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FMUNMAP;
|
||||
}
|
||||
bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; }
|
||||
|
||||
/**
|
||||
* @return whether the alignment is secondary (an alternative alignment of the read).
|
||||
* @return whether the alignment is secondary (an alternative alignment of
|
||||
* the read).
|
||||
*/
|
||||
bool IsSecondaryAlignment()
|
||||
{
|
||||
return b->core.flag & BAM_FSECONDARY;
|
||||
}
|
||||
bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; }
|
||||
|
||||
/**
|
||||
* @return whether the alignment is supplementary (a split alignment such as a chimeric alignment).
|
||||
* @return whether the alignment is supplementary (a split alignment such as
|
||||
* a chimeric alignment).
|
||||
*/
|
||||
bool GetSupplementaryAlignmentFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FSUPPLEMENTARY;
|
||||
}
|
||||
bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; }
|
||||
|
||||
/*
|
||||
* Tests if this record is either a secondary and/or supplementary alignment;
|
||||
* Tests if this record is either a secondary and/or supplementary
|
||||
* alignment;
|
||||
*/
|
||||
bool IsSecondaryOrSupplementary()
|
||||
{
|
||||
return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag();
|
||||
}
|
||||
bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); }
|
||||
|
||||
/**
|
||||
* the read is the first read in a pair.
|
||||
*/
|
||||
bool GetFirstOfPairFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FREAD1;
|
||||
}
|
||||
bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; }
|
||||
|
||||
/**
|
||||
* strand of the query (false for forward; true for reverse strand).
|
||||
*/
|
||||
bool GetReadNegativeStrandFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FREVERSE;
|
||||
}
|
||||
bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; }
|
||||
|
||||
/**
|
||||
* strand of the mate (false for forward; true for reverse strand).
|
||||
*/
|
||||
bool GetMateNegativeStrandFlag()
|
||||
{
|
||||
return b->core.flag & BAM_FMREVERSE;
|
||||
}
|
||||
bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
|
||||
|
||||
/* 其他的一些信息 */
|
||||
inline int GetReferenceLength()
|
||||
{
|
||||
inline int GetReferenceLength() {
|
||||
int length = 0;
|
||||
const uint32_t *cigar = bam_get_cigar(b);
|
||||
const bam1_core_t &bc = b->core;
|
||||
for (int i = 0; i < bc.n_cigar; ++i)
|
||||
{
|
||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||
const char c = bam_cigar_opchr(cigar[i]);
|
||||
const int len = bam_cigar_oplen(cigar[i]);
|
||||
switch (c)
|
||||
{
|
||||
switch (c) {
|
||||
case 'M':
|
||||
case 'D':
|
||||
case 'N':
|
||||
|
|
@ -429,24 +338,20 @@ struct BamWrap
|
|||
}
|
||||
|
||||
// 计算bam的全局位置,算上染色体序号和比对位置
|
||||
static inline int64_t bam_global_pos(bam1_t *b)
|
||||
{
|
||||
static inline int64_t bam_global_pos(bam1_t *b) {
|
||||
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
|
||||
}
|
||||
static inline int64_t bam_global_pos(int tid, int pos)
|
||||
{
|
||||
static inline int64_t bam_global_pos(int tid, int pos) {
|
||||
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
|
||||
}
|
||||
// 根据全局位置获取bam的染色体序号
|
||||
static inline int32_t bam_tid(int64_t global_pos)
|
||||
{
|
||||
static inline int32_t bam_tid(int64_t global_pos) {
|
||||
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
|
||||
const int64_t high_tid = global_pos & mask;
|
||||
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
|
||||
}
|
||||
// 根据全局位置获取bam的比对位置(染色体内)
|
||||
static inline int32_t bam_pos(int64_t global_pos)
|
||||
{
|
||||
static inline int32_t bam_pos(int64_t global_pos) {
|
||||
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
|
||||
return (int32_t)(global_pos & mask);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,8 +23,7 @@ using std::vector;
|
|||
struct option *GlobalArg::GLOBAL_OPT = nullptr;
|
||||
|
||||
// 初始化参数
|
||||
void GlobalArg::initGlobalOptions()
|
||||
{
|
||||
void GlobalArg::initGlobalOptions() {
|
||||
vector<struct option> v;
|
||||
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
|
||||
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
|
||||
|
|
@ -51,59 +50,53 @@ void GlobalArg::initGlobalOptions()
|
|||
}
|
||||
|
||||
// 解析参数
|
||||
void GlobalArg::parseArgument(int argNum)
|
||||
{
|
||||
void GlobalArg::parseArgument(int argNum) {
|
||||
using namespace ns_ga;
|
||||
switch (argNum)
|
||||
{
|
||||
case OPT_INPUT:
|
||||
in_fn = optarg;
|
||||
break;
|
||||
case OPT_OUTPUT:
|
||||
out_fn = optarg;
|
||||
break;
|
||||
case OPT_NUM_THREADS:
|
||||
num_threads = std::stoi(optarg);
|
||||
break;
|
||||
case OPT_MAX_MEM:
|
||||
{
|
||||
char *q;
|
||||
size_t mem_arg = strtol(optarg, &q, 0);
|
||||
if (*q == 'k' || *q == 'K')
|
||||
mem_arg <<= 10;
|
||||
else if (*q == 'm' || *q == 'M')
|
||||
mem_arg <<= 20;
|
||||
else if (*q == 'g' || *q == 'G')
|
||||
mem_arg <<= 30;
|
||||
if (mem_arg >= max_mem)
|
||||
max_mem = mem_arg;
|
||||
else
|
||||
{
|
||||
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||
}
|
||||
break;
|
||||
switch (argNum) {
|
||||
case OPT_INPUT:
|
||||
in_fn = optarg;
|
||||
break;
|
||||
case OPT_OUTPUT:
|
||||
out_fn = optarg;
|
||||
break;
|
||||
case OPT_NUM_THREADS:
|
||||
num_threads = std::stoi(optarg);
|
||||
break;
|
||||
case OPT_MAX_MEM: {
|
||||
char *q;
|
||||
size_t mem_arg = strtol(optarg, &q, 0);
|
||||
if (*q == 'k' || *q == 'K')
|
||||
mem_arg <<= 10;
|
||||
else if (*q == 'm' || *q == 'M')
|
||||
mem_arg <<= 20;
|
||||
else if (*q == 'g' || *q == 'G')
|
||||
mem_arg <<= 30;
|
||||
if (mem_arg >= max_mem)
|
||||
max_mem = mem_arg;
|
||||
else {
|
||||
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||
}
|
||||
case OPT_LOG_LEVEL:
|
||||
{
|
||||
if (strcmp("ERROR", optarg) == 0)
|
||||
verbosity = ns_ga::ERROR;
|
||||
else if (strcmp("WARNING", optarg) == 0)
|
||||
verbosity = ns_ga::WARNING;
|
||||
else if (strcmp("INFO", optarg) == 0)
|
||||
verbosity = ns_ga::INFO;
|
||||
else if (strcmp("DEBUG", optarg) == 0)
|
||||
verbosity = ns_ga::DEBUG;
|
||||
break;
|
||||
}
|
||||
case OPT_ASYNCIO:
|
||||
{
|
||||
if (strcmp("true", optarg) == 0)
|
||||
use_asyncio = true;
|
||||
else if (strcmp("false", optarg) == 0)
|
||||
use_asyncio = false;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
break;
|
||||
}
|
||||
case OPT_LOG_LEVEL: {
|
||||
if (strcmp("ERROR", optarg) == 0)
|
||||
verbosity = ns_ga::ERROR;
|
||||
else if (strcmp("WARNING", optarg) == 0)
|
||||
verbosity = ns_ga::WARNING;
|
||||
else if (strcmp("INFO", optarg) == 0)
|
||||
verbosity = ns_ga::INFO;
|
||||
else if (strcmp("DEBUG", optarg) == 0)
|
||||
verbosity = ns_ga::DEBUG;
|
||||
break;
|
||||
}
|
||||
case OPT_ASYNCIO: {
|
||||
if (strcmp("true", optarg) == 0)
|
||||
use_asyncio = true;
|
||||
else if (strcmp("false", optarg) == 0)
|
||||
use_asyncio = false;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -9,63 +9,57 @@ Date : 2023/10/23
|
|||
#ifndef GLOBAL_ARG_H_
|
||||
#define GLOBAL_ARG_H_
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <getopt.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using std::map;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
namespace ns_ga {
|
||||
enum GlobalOptEnum
|
||||
{
|
||||
_START_NUM = 1,
|
||||
OPT_INPUT,
|
||||
OPT_OUTPUT,
|
||||
OPT_NUM_THREADS,
|
||||
OPT_MAX_MEM,
|
||||
OPT_LOG_LEVEL,
|
||||
OPT_ASYNCIO,
|
||||
OPT_VERSION,
|
||||
OPT_HELP,
|
||||
_END_NUM
|
||||
};
|
||||
enum GlobalOptEnum {
|
||||
_START_NUM = 1,
|
||||
OPT_INPUT,
|
||||
OPT_OUTPUT,
|
||||
OPT_NUM_THREADS,
|
||||
OPT_MAX_MEM,
|
||||
OPT_LOG_LEVEL,
|
||||
OPT_ASYNCIO,
|
||||
OPT_VERSION,
|
||||
OPT_HELP,
|
||||
_END_NUM
|
||||
};
|
||||
|
||||
// log level
|
||||
enum LogLevelEnum
|
||||
{
|
||||
ERROR,
|
||||
WARNING,
|
||||
INFO,
|
||||
DEBUG
|
||||
};
|
||||
}
|
||||
// log level
|
||||
enum LogLevelEnum { ERROR, WARNING, INFO, DEBUG };
|
||||
} // namespace ns_ga
|
||||
|
||||
/* 全局共享的一些参数 */
|
||||
struct GlobalArg
|
||||
{
|
||||
const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
|
||||
struct GlobalArg {
|
||||
const static int GLOBAL_ARG_CNT =
|
||||
ns_ga::GlobalOptEnum::_END_NUM -
|
||||
ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
|
||||
static struct option *GLOBAL_OPT;
|
||||
|
||||
string in_fn; // input bam filename
|
||||
string out_fn; // output bam filename
|
||||
int num_threads = 1; // 线程个数
|
||||
size_t max_mem = ((size_t)2) << 30; // 最小2G
|
||||
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
|
||||
bool use_asyncio = true; // 是否使用异步io
|
||||
string in_fn; // input bam filename
|
||||
string out_fn; // output bam filename
|
||||
int num_threads = 1; // 线程个数
|
||||
size_t max_mem = ((size_t)1) << 30; // 最小1G
|
||||
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
|
||||
bool use_asyncio = true; // 是否使用异步io
|
||||
|
||||
vector<string> vArgInfo; // 每个参数的帮助信息
|
||||
vector<string> vArgInfo; // 每个参数的帮助信息
|
||||
|
||||
// 单例模式
|
||||
GlobalArg(const GlobalArg &) = delete;
|
||||
GlobalArg &operator=(const GlobalArg &) = delete;
|
||||
|
||||
// 获取单例
|
||||
static GlobalArg &Instance()
|
||||
{
|
||||
static GlobalArg &Instance() {
|
||||
static GlobalArg instance;
|
||||
return instance;
|
||||
}
|
||||
|
|
@ -76,8 +70,7 @@ struct GlobalArg
|
|||
void parseArgument(int argNum);
|
||||
|
||||
// 获取对应参数在数组(option和help info)中的索引
|
||||
int getArgIndx(ns_ga::GlobalOptEnum opt)
|
||||
{
|
||||
int getArgIndx(ns_ga::GlobalOptEnum opt) {
|
||||
return opt - ns_ga::GlobalOptEnum::OPT_INPUT;
|
||||
}
|
||||
|
||||
|
|
@ -95,11 +88,9 @@ struct GlobalArg
|
|||
printf("--verbosity = %d\n", verbosity);
|
||||
printf("--asyncio = %d\n", use_asyncio);
|
||||
}
|
||||
private :
|
||||
GlobalArg()
|
||||
{
|
||||
initGlobalOptions();
|
||||
};
|
||||
|
||||
private:
|
||||
GlobalArg() { initGlobalOptions(); };
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
@ -7,36 +7,34 @@ Author : Zhang Zhonghai
|
|||
Date : 2023/11/6
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <random>
|
||||
#include <string>
|
||||
|
||||
using std::string;
|
||||
|
||||
/**
|
||||
* Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness
|
||||
* and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling.
|
||||
* Provides an implementation of the Murmur3_32 hash algorithm that has
|
||||
* desirable properties in terms of randomness and uniformity of the
|
||||
* distribution of output values that make it a useful hashing algorithm for
|
||||
* downsampling.
|
||||
*/
|
||||
|
||||
struct Murmur3
|
||||
{
|
||||
struct Murmur3 {
|
||||
int seed_ = 0;
|
||||
/** Hashes a character stream to an int using Murmur3. */
|
||||
int HashUnencodedChars(const string &input)
|
||||
{
|
||||
int HashUnencodedChars(const string &input) {
|
||||
int h1 = this->seed_;
|
||||
|
||||
// step through the CharSequence 2 chars at a time
|
||||
const int length = input.size();
|
||||
for (int i = 1; i < length; i += 2)
|
||||
{
|
||||
for (int i = 1; i < length; i += 2) {
|
||||
int k1 = input.at(i - 1) | (input.at(i) << 16);
|
||||
k1 = mixK1(k1);
|
||||
h1 = mixH1(h1, k1);
|
||||
}
|
||||
|
||||
// deal with any remaining characters
|
||||
if ((length & 1) == 1)
|
||||
{
|
||||
if ((length & 1) == 1) {
|
||||
int k1 = input.at(length - 1);
|
||||
k1 = mixK1(k1);
|
||||
h1 ^= k1;
|
||||
|
|
@ -45,14 +43,12 @@ struct Murmur3
|
|||
return fmix(h1, 2 * length);
|
||||
}
|
||||
|
||||
static Murmur3 &Instance()
|
||||
{
|
||||
static Murmur3 &Instance() {
|
||||
static Murmur3 instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
static int mixK1(int k1)
|
||||
{
|
||||
static int mixK1(int k1) {
|
||||
const int c1 = 0xcc9e2d51;
|
||||
const int c2 = 0x1b873593;
|
||||
k1 *= c1;
|
||||
|
|
@ -61,8 +57,7 @@ struct Murmur3
|
|||
return k1;
|
||||
}
|
||||
|
||||
static int mixH1(int h1, int k1)
|
||||
{
|
||||
static int mixH1(int h1, int k1) {
|
||||
h1 ^= k1;
|
||||
h1 = h1 << 13;
|
||||
h1 = h1 * 5 + 0xe6546b64;
|
||||
|
|
@ -70,8 +65,7 @@ struct Murmur3
|
|||
}
|
||||
|
||||
// Finalization mix - force all bits of a hash block to avalanche
|
||||
static int fmix(int h1, int length)
|
||||
{
|
||||
static int fmix(int h1, int length) {
|
||||
h1 ^= length;
|
||||
h1 ^= (unsigned int)h1 >> 16;
|
||||
h1 *= 0x85ebca6b;
|
||||
|
|
@ -82,8 +76,7 @@ struct Murmur3
|
|||
}
|
||||
|
||||
private:
|
||||
Murmur3()
|
||||
{
|
||||
Murmur3() {
|
||||
auto &&rd = std::random_device{};
|
||||
seed_ = rd();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,70 @@
|
|||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
|
||||
using std::string;
|
||||
|
||||
/*
|
||||
* 标记冗余过程中的一些数据统计
|
||||
*/
|
||||
struct DuplicationMetrics {
|
||||
/**
|
||||
* The library on which the duplicate marking was performed.
|
||||
*/
|
||||
string LIBRARY = "";
|
||||
/**
|
||||
* The number of mapped reads examined which did not have a mapped mate pair,
|
||||
* either because the read is unpaired, or the read is paired to an unmapped mate.
|
||||
*/
|
||||
uint64_t UNPAIRED_READS_EXAMINED = 0;
|
||||
/**
|
||||
* The number of mapped read pairs examined. (Primary, non-supplemental)
|
||||
*/
|
||||
uint64_t READ_PAIRS_EXAMINED = 0;
|
||||
/**
|
||||
* The number of reads that were either secondary or supplementary
|
||||
*/
|
||||
uint64_t SECONDARY_OR_SUPPLEMENTARY_RDS = 0;
|
||||
/**
|
||||
* The total number of unmapped reads examined. (Primary, non-supplemental)
|
||||
*/
|
||||
uint64_t UNMAPPED_READS = 0;
|
||||
/**
|
||||
* The number of fragments that were marked as duplicates.
|
||||
*/
|
||||
uint64_t UNPAIRED_READ_DUPLICATES = 0;
|
||||
/**
|
||||
* The number of read pairs that were marked as duplicates.
|
||||
*/
|
||||
uint64_t READ_PAIR_DUPLICATES = 0;
|
||||
/**
|
||||
* The number of read pairs duplicates that were caused by optical duplication.
|
||||
* Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source.
|
||||
*/
|
||||
uint64_t READ_PAIR_OPTICAL_DUPLICATES = 0;
|
||||
/**
|
||||
* The fraction of mapped sequence that is marked as duplicate.
|
||||
*/
|
||||
double PERCENT_DUPLICATION = 0.0;
|
||||
/**
|
||||
* The estimated number of unique molecules in the library based on PE duplication.
|
||||
*/
|
||||
uint64_t ESTIMATED_LIBRARY_SIZE = 0;
|
||||
|
||||
// 其他的统计数据
|
||||
|
||||
// addSingletonToCount需要记录的数据
|
||||
uint64_t DuplicateCountHist = 0;
|
||||
uint64_t NonOpticalDuplicateCountHist = 0;
|
||||
|
||||
// track optical duplicates 需要记录的数据
|
||||
uint64_t OpticalDuplicatesCountHist = 0;
|
||||
uint64_t OpticalDuplicatesByLibraryId = 0;
|
||||
|
||||
// 统计相关的函数
|
||||
void AddSingletonToCount() {
|
||||
++this->DuplicateCountHist;
|
||||
++this->NonOpticalDuplicateCountHist;
|
||||
}
|
||||
};
|
||||
|
|
@ -1,100 +1,99 @@
|
|||
/*
|
||||
Description: 标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
||||
Description:
|
||||
标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2023/10/23
|
||||
*/
|
||||
#include "markdups_arg.h"
|
||||
// 有太多define冲突,放到最后include
|
||||
|
||||
|
||||
#include <common/hts/bam_buf.h>
|
||||
#include <common/utils/global_arg.h>
|
||||
#include <common/utils/murmur3.h>
|
||||
#include <common/utils/thpool.h>
|
||||
#include <common/utils/timer.h>
|
||||
#include <common/utils/util.h>
|
||||
#include <common/utils/murmur3.h>
|
||||
#include <common/utils/yarn.h>
|
||||
#include <htslib/sam.h>
|
||||
#include <htslib/thread_pool.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
#include <sam/utils/read_name_parser.h>
|
||||
|
||||
#include <htslib/sam.h>
|
||||
#include <htslib/thread_pool.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "markdups_arg.h"
|
||||
#include "md_funcs.h"
|
||||
#include "parallel_md.h"
|
||||
#include "serial_md.h"
|
||||
#include "shared_args.h"
|
||||
#include "dup_metrics.h"
|
||||
|
||||
using namespace std;
|
||||
using std::cout;
|
||||
|
||||
|
||||
#define SMA_TAG_PG "PG"
|
||||
|
||||
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
|
||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
||||
#define NO_SUCH_INDEX INT64_MAX
|
||||
|
||||
static Timer tm_arr[20]; // 用来测试性能
|
||||
Timer tm_arr[20]; // 用来测试性能
|
||||
/* 全局本地变量 */
|
||||
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||
static samFile *g_inBamFp; // 输入的bam文件
|
||||
static sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
||||
static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
||||
static sam_hdr_t *g_outBamHeader; // 输出文件的header
|
||||
vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||
samFile *g_inBamFp; // 输入的bam文件
|
||||
sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
||||
samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
||||
sam_hdr_t *g_outBamHeader; // 输出文件的header
|
||||
|
||||
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
||||
static GlobalArg &g_gArg = GlobalArg::Instance();
|
||||
static MarkDupsArg g_mdArg;
|
||||
|
||||
|
||||
#include "md_funcs.h"
|
||||
#include "serial_md.h"
|
||||
#include "parallel_md.h"
|
||||
|
||||
|
||||
|
||||
GlobalArg &g_gArg = GlobalArg::Instance();
|
||||
static MarkDupsArg g_mdArg_;
|
||||
MarkDupsArg &g_mdArg = g_mdArg_;
|
||||
static GlobalDataArg gData_;
|
||||
GlobalDataArg &gData = gData_;
|
||||
DuplicationMetrics gMetrics_;
|
||||
DuplicationMetrics &gMetrics = gMetrics_;
|
||||
/*
|
||||
* mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||
* mark duplicate
|
||||
* 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||
*/
|
||||
int MarkDuplicates(int argc, char *argv[])
|
||||
{
|
||||
int MarkDuplicates(int argc, char *argv[]) {
|
||||
Timer::log_time("程序开始");
|
||||
Timer time_all;
|
||||
|
||||
|
||||
/* 读取命令行参数 */
|
||||
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
||||
if (g_gArg.num_threads < 1) // 线程数不能小于1
|
||||
g_gArg.num_threads = 1;
|
||||
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
||||
if (g_gArg.num_threads < 1)
|
||||
g_gArg.num_threads = 1; // 线程数不能小于1
|
||||
|
||||
/* 初始化一些参数和变量*/
|
||||
g_vRnParser.resize(g_gArg.num_threads);
|
||||
for (auto &parser : g_vRnParser)
|
||||
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息
|
||||
|
||||
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息
|
||||
|
||||
/* 打开输入bam文件 */
|
||||
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
||||
if (!g_inBamFp)
|
||||
{
|
||||
if (!g_inBamFp) {
|
||||
Error("[%s] load sam/bam file failed.\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
||||
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
||||
// 获取样本名称(libraryId)
|
||||
gMetrics.LIBRARY = sam_hdr_line_name(g_inBamHeader, "RG", 0);
|
||||
|
||||
/* 利用线程池对输入输出文件进行读写 */
|
||||
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
||||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
htsPoolRead.pool = hts_tpool_init(16);
|
||||
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
htsPoolWrite.pool = hts_tpool_init(16);
|
||||
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
||||
{
|
||||
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
||||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||
//htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
//htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
htsPoolRead.pool = hts_tpool_init(32);
|
||||
htsPoolWrite.pool = hts_tpool_init(32);
|
||||
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
|
||||
Error("[%d] failed to set up thread pool", __LINE__);
|
||||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
|
|
@ -106,86 +105,94 @@ int MarkDuplicates(int argc, char *argv[])
|
|||
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
||||
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
||||
g_outBamHeader = sam_hdr_dup(g_inBamHeader);
|
||||
// 用同样的线程池处理输出文件
|
||||
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
|
||||
|
||||
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||
|
||||
/* 冗余检查和标记 */
|
||||
if (g_gArg.num_threads == 1)
|
||||
{
|
||||
serialMarkDups(); // 串行运行
|
||||
}
|
||||
else
|
||||
{
|
||||
parallelMarkDups(); // 并行运行
|
||||
if (g_gArg.num_threads == 1) {
|
||||
serialMarkDups(); // 串行运行
|
||||
} else {
|
||||
parallelMarkDups(); // 并行运行
|
||||
}
|
||||
|
||||
/* 标记冗余, 将处理后的结果写入文件 */
|
||||
sam_close(g_inBamFp); // 重新打开bam文件
|
||||
sam_close(g_inBamFp); // 重新打开bam文件
|
||||
g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
||||
if (!g_inBamFp)
|
||||
{
|
||||
if (!g_inBamFp) {
|
||||
Error("[%s] load sam/bam file failed.\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
||||
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
|
||||
{
|
||||
g_inBamHeader = sam_hdr_read(g_inBamFp); // 读取header
|
||||
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0) {
|
||||
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
}
|
||||
// 输出index文件
|
||||
string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
|
||||
if (sam_idx_init(g_outBamFp, g_outBamHeader, 14 /*csi*/, indexFn.c_str()) < 0)
|
||||
{
|
||||
// string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
|
||||
string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式
|
||||
int index_min_shift = 0;
|
||||
if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) {
|
||||
indexFn = g_gArg.out_fn + ".csi";
|
||||
index_min_shift = 14;
|
||||
}
|
||||
if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
|
||||
Error("failed to open index \"%s\" for writing", indexFn.c_str());
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
}
|
||||
// 读取输入文件
|
||||
// BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
|
||||
// BamBufType inBuf(false);
|
||||
BamBufType inBuf(g_gArg.use_asyncio);
|
||||
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||
DupIdxQueue idxQue;
|
||||
idxQue.Init(&gData.dupIdxArr);
|
||||
Timer tw;
|
||||
while (inBuf.ReadStat() >= 0)
|
||||
{
|
||||
cout << "dupsize: " << idxQue.Size() << endl;
|
||||
uint64_t bamIdx = 0;
|
||||
uint64_t dupIdx = idxQue.Pop();
|
||||
cout << "dup arr size: " << gData.dupIdxArr.size() << endl;
|
||||
cout << "first dup: " << dupIdx << endl;
|
||||
while (inBuf.ReadStat() >= 0) {
|
||||
Timer tw1;
|
||||
size_t readNum = inBuf.ReadBam();
|
||||
cout << "read: " << readNum << endl;
|
||||
for (size_t i = 0; i < inBuf.Size(); ++i)
|
||||
{
|
||||
for (size_t i = 0; i < inBuf.Size(); ++i) {
|
||||
/* 判断是否冗余 */
|
||||
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
|
||||
{
|
||||
if (bamIdx == dupIdx) {
|
||||
// cout << "冗余" << bamIdx << endl;
|
||||
dupIdx = idxQue.Pop();
|
||||
}
|
||||
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
|
||||
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
}
|
||||
++bamIdx;
|
||||
}
|
||||
inBuf.ClearAll();
|
||||
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
|
||||
}
|
||||
if (sam_idx_save(g_outBamFp) < 0)
|
||||
{
|
||||
cout << "dupsize: " << idxQue.Size() << endl;
|
||||
if (sam_idx_save(g_outBamFp) < 0) {
|
||||
Error("writing index failed");
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
}
|
||||
cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
|
||||
|
||||
/* 关闭文件,收尾清理 */
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
|
||||
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
|
||||
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||
Timer::log_time("程序结束");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -64,6 +64,7 @@ const static struct option kMdOpts[] = {
|
|||
{"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
|
||||
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
|
||||
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
|
||||
{"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT},
|
||||
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
|
||||
|
||||
// 判断bool类型的参数
|
||||
|
|
@ -220,6 +221,11 @@ void MarkDupsArg::parseArgument(int argc,
|
|||
case ns_md::CREATE_INDEX:
|
||||
setBoolArg(&CREATE_INDEX);
|
||||
break;
|
||||
case ns_md::INDEX_FORMAT:
|
||||
if (strcmp("CSI", optarg) == 0)
|
||||
INDEX_FORMAT = ns_md::IndexFormat::CSI;
|
||||
else
|
||||
INDEX_FORMAT = ns_md::IndexFormat::BAI;
|
||||
case ns_md::CREATE_MD5_FILE:
|
||||
setBoolArg(&CREATE_MD5_FILE);
|
||||
break;
|
||||
|
|
@ -235,7 +241,8 @@ void MarkDupsArg::parseArgument(int argc,
|
|||
/* 打印参数信息 */
|
||||
void MarkDupsArg::printArgValue()
|
||||
{
|
||||
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
|
||||
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
|
||||
printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
|
||||
}
|
||||
|
||||
// 打印版本信息
|
||||
|
|
@ -272,6 +279,8 @@ void MarkDupsArg::PrintHelp()
|
|||
"\n"
|
||||
"Optional Arguments:\n"
|
||||
"\n"
|
||||
"--INDEX_FORMAT <FORMAT> Format for bam index file. Possible values: {BAI, CSI}\n"
|
||||
"\n"
|
||||
"--ADD_PG_TAG_TO_READS <Boolean>\n"
|
||||
" Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n"
|
||||
" false}\n"
|
||||
|
|
|
|||
|
|
@ -21,91 +21,92 @@ using std::vector;
|
|||
class GlobalArg;
|
||||
|
||||
namespace ns_md {
|
||||
/* 用于markduplicate模块的参数,这个枚举用于getoption */
|
||||
enum MarkDupsArgEnum
|
||||
{
|
||||
_START_NUM = 100,
|
||||
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
|
||||
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
|
||||
SORTING_COLLECTION_SIZE_RATIO,
|
||||
BARCODE_TAG,
|
||||
READ_ONE_BARCODE_TAG,
|
||||
READ_TWO_BARCODE_TAG,
|
||||
TAG_DUPLICATE_SET_MEMBERS,
|
||||
REMOVE_SEQUENCING_DUPLICATES,
|
||||
TAGGING_POLICY,
|
||||
CLEAR_DT,
|
||||
DUPLEX_UMI,
|
||||
MOLECULAR_IDENTIFIER_TAG,
|
||||
METRICS_FILE,
|
||||
REMOVE_DUPLICATES,
|
||||
ASSUME_SORTED,
|
||||
ASSUME_SORT_ORDER,
|
||||
DUPLICATE_SCORING_STRATEGY,
|
||||
PROGRAM_RECORD_ID,
|
||||
PROGRAM_GROUP_VERSION,
|
||||
PROGRAM_GROUP_COMMAND_LINE,
|
||||
PROGRAM_GROUP_NAME,
|
||||
COMMENT,
|
||||
READ_NAME_REGEX,
|
||||
OPTICAL_DUPLICATE_PIXEL_DISTANCE,
|
||||
MAX_OPTICAL_DUPLICATE_SET_SIZE,
|
||||
QUIET,
|
||||
VALIDATION_STRINGENCY,
|
||||
COMPRESSION_LEVEL,
|
||||
MAX_RECORDS_IN_RAM,
|
||||
CREATE_INDEX,
|
||||
CREATE_MD5_FILE,
|
||||
_END_NUM
|
||||
};
|
||||
|
||||
/* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */
|
||||
enum ValidationStringency
|
||||
{
|
||||
/**
|
||||
* Do the right thing, throw an exception if something looks wrong.
|
||||
*/
|
||||
STRICT,
|
||||
/**
|
||||
* Emit warnings but keep going if possible.
|
||||
*/
|
||||
LENIENT,
|
||||
/**
|
||||
* Like LENIENT, only don't emit warning messages.
|
||||
*/
|
||||
SILENT,
|
||||
|
||||
DEFAULT_STRINGENCY = SILENT
|
||||
};
|
||||
/* 用于markduplicate模块的参数,这个枚举用于getoption */
|
||||
enum MarkDupsArgEnum {
|
||||
_START_NUM = 100,
|
||||
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
|
||||
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
|
||||
SORTING_COLLECTION_SIZE_RATIO,
|
||||
BARCODE_TAG,
|
||||
READ_ONE_BARCODE_TAG,
|
||||
READ_TWO_BARCODE_TAG,
|
||||
TAG_DUPLICATE_SET_MEMBERS,
|
||||
REMOVE_SEQUENCING_DUPLICATES,
|
||||
TAGGING_POLICY,
|
||||
CLEAR_DT,
|
||||
DUPLEX_UMI,
|
||||
MOLECULAR_IDENTIFIER_TAG,
|
||||
METRICS_FILE,
|
||||
REMOVE_DUPLICATES,
|
||||
ASSUME_SORTED,
|
||||
ASSUME_SORT_ORDER,
|
||||
DUPLICATE_SCORING_STRATEGY,
|
||||
PROGRAM_RECORD_ID,
|
||||
PROGRAM_GROUP_VERSION,
|
||||
PROGRAM_GROUP_COMMAND_LINE,
|
||||
PROGRAM_GROUP_NAME,
|
||||
COMMENT,
|
||||
READ_NAME_REGEX,
|
||||
OPTICAL_DUPLICATE_PIXEL_DISTANCE,
|
||||
MAX_OPTICAL_DUPLICATE_SET_SIZE,
|
||||
QUIET,
|
||||
VALIDATION_STRINGENCY,
|
||||
COMPRESSION_LEVEL,
|
||||
MAX_RECORDS_IN_RAM,
|
||||
CREATE_INDEX,
|
||||
INDEX_FORMAT,
|
||||
CREATE_MD5_FILE,
|
||||
_END_NUM
|
||||
};
|
||||
|
||||
/* How strict to be when reading a SAM or BAM, beyond bare minimum validation.
|
||||
*/
|
||||
enum ValidationStringency {
|
||||
/**
|
||||
* Enum used to control how duplicates are flagged in the DT optional tag on each read.
|
||||
* Do the right thing, throw an exception if something looks wrong.
|
||||
*/
|
||||
enum DuplicateTaggingPolicy
|
||||
{
|
||||
DontTag,
|
||||
OpticalOnly,
|
||||
All
|
||||
};
|
||||
STRICT,
|
||||
/**
|
||||
* Emit warnings but keep going if possible.
|
||||
*/
|
||||
LENIENT,
|
||||
/**
|
||||
* Like LENIENT, only don't emit warning messages.
|
||||
*/
|
||||
SILENT,
|
||||
|
||||
/* 排序的方式 */
|
||||
enum SortOrder
|
||||
{
|
||||
unsorted,
|
||||
queryname,
|
||||
coordinate,
|
||||
duplicate, // NB: this is not in the SAM spec!
|
||||
unknown
|
||||
};
|
||||
DEFAULT_STRINGENCY = SILENT
|
||||
};
|
||||
|
||||
/* 计算reads分数的方式(比那个read得分更高) */
|
||||
enum ScoringStrategy
|
||||
{
|
||||
SUM_OF_BASE_QUALITIES,
|
||||
TOTAL_MAPPED_REFERENCE_LENGTH,
|
||||
RANDOM
|
||||
};
|
||||
}
|
||||
/**
|
||||
* Enum used to control how duplicates are flagged in the DT optional tag on
|
||||
* each read.
|
||||
*/
|
||||
enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All };
|
||||
|
||||
/* 排序的方式 */
|
||||
enum SortOrder {
|
||||
unsorted,
|
||||
queryname,
|
||||
coordinate,
|
||||
duplicate, // NB: this is not in the SAM spec!
|
||||
unknown
|
||||
};
|
||||
|
||||
/* 计算reads分数的方式(比那个read得分更高) */
|
||||
enum ScoringStrategy {
|
||||
SUM_OF_BASE_QUALITIES,
|
||||
TOTAL_MAPPED_REFERENCE_LENGTH,
|
||||
RANDOM
|
||||
};
|
||||
|
||||
/* 索引文件的格式 (bai或者csi) */
|
||||
enum IndexFormat {
|
||||
BAI,
|
||||
CSI
|
||||
};
|
||||
|
||||
} // namespace ns_md
|
||||
|
||||
/* markduplicate 需要的参数*/
|
||||
struct MarkDupsArg
|
||||
|
|
@ -287,6 +288,8 @@ struct MarkDupsArg
|
|||
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
|
||||
bool CREATE_INDEX = false;
|
||||
|
||||
ns_md::IndexFormat INDEX_FORMAT = ns_md::IndexFormat::BAI;
|
||||
|
||||
/* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */
|
||||
bool CREATE_MD5_FILE = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,456 @@
|
|||
|
||||
#include "md_funcs.h"
|
||||
|
||||
#include <common/hts/bam_buf.h>
|
||||
#include <common/utils/debug.h>
|
||||
#include <common/utils/murmur3.h>
|
||||
#include <common/utils/profiling.h>
|
||||
#include <common/utils/timer.h>
|
||||
#include <common/utils/util.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <math.h>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "dup_metrics.h"
|
||||
#include "markdups_arg.h"
|
||||
#include "shared_args.h"
|
||||
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
using std::map;
|
||||
using std::set;
|
||||
using std::unordered_map;
|
||||
using std::vector;
|
||||
|
||||
/* 清除key位置的数据 */
|
||||
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
||||
auto &msIdx = *pmsIdx;
|
||||
if (msIdx.find(key) != msIdx.end())
|
||||
msIdx[key].clear(); // 清除该位点的冗余结果
|
||||
}
|
||||
|
||||
/* 删除key位置的数据 */
|
||||
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
||||
auto &msIdx = *pmsIdx;
|
||||
if (msIdx.find(key) != msIdx.end())
|
||||
msIdx.erase(key);
|
||||
}
|
||||
|
||||
/*
|
||||
* 计算read的分数
|
||||
*/
|
||||
int16_t computeDuplicateScore(BamWrap &bw) {
|
||||
int16_t score = 0;
|
||||
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
|
||||
case ns_md::SUM_OF_BASE_QUALITIES:
|
||||
// two (very) long reads worth of high-quality bases can go over
|
||||
// Short.MAX_VALUE/2 and risk overflow.
|
||||
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
||||
break;
|
||||
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
|
||||
if (!bw.GetReadUnmappedFlag())
|
||||
// no need to remember the score since this scoring mechanism is
|
||||
// symmetric
|
||||
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
||||
break;
|
||||
case ns_md::RANDOM:
|
||||
// The RANDOM score gives the same score to both reads so that they get
|
||||
// filtered together. it's not critical do use the readName since the
|
||||
// scores from both ends get added, but it seem to be clearer this way.
|
||||
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
||||
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
||||
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case
|
||||
// the read is not passing filters. We need to stay far from overflow so
|
||||
// that when we add the two scores from the two read mates we do not
|
||||
// overflow since that could cause us to chose a failing read-pair
|
||||
// instead of a passing one.
|
||||
score -= INT16_MIN / 4;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// make sure that filter-failing records are heavily discounted. (the
|
||||
// discount can happen twice, once for each mate, so need to make sure we do
|
||||
// not subtract more than Short.MIN_VALUE overall.)
|
||||
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/*
|
||||
* Builds a read ends object that represents a single read.
|
||||
* 用来表示一个read的特征结构
|
||||
*/
|
||||
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
|
||||
auto &k = *pKey;
|
||||
auto &bc = bw.b->core;
|
||||
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
|
||||
k.read1ReferenceIndex = bc.tid;
|
||||
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
||||
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
||||
|
||||
k.read1IndexInFile = index;
|
||||
k.score = computeDuplicateScore(bw);
|
||||
// Doing this lets the ends object know that it's part of a pair
|
||||
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
|
||||
k.read2ReferenceIndex = bc.mtid;
|
||||
}
|
||||
// Fill in the location information for optical duplicates
|
||||
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
||||
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
||||
// 计算位置key
|
||||
k.posKey =
|
||||
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
|
||||
* representative read based on quality score. For all members of the duplicate
|
||||
* set, add the read1 index-in-file of the representative read to the records of
|
||||
* the first and second in a pair. This value becomes is used for the 'DI' tag.
|
||||
*/
|
||||
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
|
||||
|
||||
/* 处理一组pairend的readends,标记冗余 */
|
||||
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
|
||||
if (vpRe.size() < 2) {
|
||||
if (vpRe.size() == 1) {
|
||||
// addSingletonToCount(libraryIdGenerator);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
|
||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
|
||||
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) // 找分数最高的readend
|
||||
{
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
||||
{
|
||||
// trackOpticalDuplicates
|
||||
}
|
||||
for (auto pe : vpRe) // 对非best read标记冗余
|
||||
{
|
||||
if (pe != pBest) // 非best
|
||||
{
|
||||
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
|
||||
}
|
||||
}
|
||||
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
|
||||
addRepresentativeReadIndex(vpRe);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理一组非paired的readends,标记冗余 */
|
||||
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||
DupContainer<int64_t> *dupIdx) {
|
||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||
|
||||
if (containsPairs) {
|
||||
for (auto pe : vpRe) {
|
||||
if (!pe->IsPaired()) {
|
||||
vDupIdx.push_back(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
for (auto pe : vpRe) {
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto pe : vpRe) {
|
||||
if (pe != pBest) {
|
||||
vDupIdx.push_back(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理位于某个坐标的pairend reads */
|
||||
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
|
||||
DupContainer<int64_t> *opticalDupIdx) {
|
||||
if (readEnds.size() > 1) { // 有潜在的冗余
|
||||
vpCache.clear();
|
||||
// std::sort(readEnds.begin(), readEnds.end());
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds) {
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||
else {
|
||||
markDuplicatePairs(posKey, vpCache, dupIdx,
|
||||
opticalDupIdx); // 不一样
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
}
|
||||
}
|
||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理位于某个坐标的 reads */
|
||||
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
|
||||
if (readEnds.size() > 1) // 有潜在的冗余
|
||||
{
|
||||
vpCache.clear();
|
||||
// std::sort(readEnds.begin(), readEnds.end());
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
bool containsPairs = false;
|
||||
bool containsFrags = false;
|
||||
for (auto &re : readEnds) {
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
|
||||
vpCache.push_back(&re);
|
||||
containsPairs = containsPairs || re.IsPaired();
|
||||
containsFrags = containsFrags || !re.IsPaired();
|
||||
} else {
|
||||
if (vpCache.size() > 1 && containsFrags) {
|
||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||
}
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
containsPairs = re.IsPaired();
|
||||
containsFrags = !re.IsPaired();
|
||||
}
|
||||
}
|
||||
if (vpCache.size() > 1 && containsFrags) {
|
||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 对找到的pairend read end添加一些信息 */
|
||||
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
||||
auto &pairedEnds = *pPairedEnds;
|
||||
int64_t bamIdx = fragEnd.read1IndexInFile;
|
||||
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
||||
const int matesCoordinate = fragEnd.read1Coordinate;
|
||||
// Set orientationForOpticalDuplicates, which always goes by the first then
|
||||
// the second end for the strands. NB: must do this before updating the
|
||||
// orientation later.
|
||||
if (fragEnd.read1FirstOfPair) {
|
||||
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
|
||||
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
|
||||
} else {
|
||||
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
|
||||
pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
|
||||
}
|
||||
// If the other read is actually later, simply add the other read's data as
|
||||
// read2, else flip the reads
|
||||
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
|
||||
(matesRefIndex == pairedEnds.read1ReferenceIndex &&
|
||||
matesCoordinate >= pairedEnds.read1Coordinate)) {
|
||||
pairedEnds.read2ReferenceIndex = matesRefIndex;
|
||||
pairedEnds.read2Coordinate = matesCoordinate;
|
||||
pairedEnds.read2IndexInFile = bamIdx;
|
||||
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
|
||||
fragEnd.IsNegativeStrand());
|
||||
|
||||
// if the two read ends are in the same position, pointing in opposite
|
||||
// directions, the orientation is undefined and the procedure above will
|
||||
// depend on the order of the reads in the file. To avoid this, we set
|
||||
// it explicitly (to FR):
|
||||
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
||||
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
||||
pairedEnds.orientation == ReadEnds::RF) {
|
||||
pairedEnds.orientation = ReadEnds::FR;
|
||||
}
|
||||
} else {
|
||||
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
||||
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
||||
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
|
||||
pairedEnds.read1ReferenceIndex = matesRefIndex;
|
||||
pairedEnds.read1Coordinate = matesCoordinate;
|
||||
pairedEnds.read1IndexInFile = bamIdx;
|
||||
pairedEnds.orientation = ReadEnds::GetOrientationByte(
|
||||
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
|
||||
pairedEnds.posKey = fragEnd.posKey;
|
||||
}
|
||||
pairedEnds.score += fragEnd.score;
|
||||
}
|
||||
|
||||
static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
|
||||
return lhs != rhs && // no comparing an object to itself (checked using object identity)!
|
||||
(lhs->tile != ReadEnds::NO_VALUE) &&
|
||||
(rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations
|
||||
lhs->tile == rhs->tile && // and the same tile
|
||||
abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
|
||||
* one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
|
||||
* optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered
|
||||
* optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
|
||||
* read appeared in the input list of physical locations.
|
||||
*
|
||||
* @param list a list of reads that are determined to be duplicates of one another
|
||||
* @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
|
||||
* annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
|
||||
* contained within the list! (always not be null!)
|
||||
* @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
|
||||
*/
|
||||
static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
|
||||
vector<bool> *pOpticalDuplicateFlags) {
|
||||
const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
|
||||
const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;
|
||||
|
||||
vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
|
||||
opticalDuplicateFlags.push_back(true);
|
||||
int len = readEndsArr.size();
|
||||
// If there is only one or zero reads passed in (so there are obviously no optical duplicates),
|
||||
// or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
|
||||
// then just return an array of all false
|
||||
if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (len >= 4) {
|
||||
/**
|
||||
* Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
|
||||
* optical duplicates
|
||||
* getOpticalDuplicatesFlagWithGraph
|
||||
*/
|
||||
// Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
|
||||
// we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
|
||||
Graph<int> opticalDistanceRelationGraph;
|
||||
unordered_map<int, vector<int>> tileRGmap;
|
||||
int keeperIndex = -1;
|
||||
for (int i = 0; i < readEndsArr.size(); ++i) {
|
||||
const ReadEnds *currentLoc = readEndsArr[i];
|
||||
if (currentLoc == pBestRe)
|
||||
keeperIndex = i;
|
||||
if (currentLoc->tile != ReadEnds::NO_VALUE) {
|
||||
int key = currentLoc->tile; // 只处理一个样本,所以只有一个read group
|
||||
tileRGmap[key].push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
/**
|
||||
* Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
|
||||
* distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
|
||||
* readgroup or 3x with the keeper present
|
||||
* getOpticalDuplicatesFlagFast
|
||||
*/
|
||||
// First go through and compare all the reads to the keeper
|
||||
for (int i = 0; i < len; ++i) {
|
||||
const ReadEnds *other = readEndsArr[i];
|
||||
opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
|
||||
}
|
||||
// Now go through and do each pairwise comparison not involving the actualKeeper
|
||||
for (int i = 0; i < len; ++i) {
|
||||
const ReadEnds *lhs = readEndsArr[i];
|
||||
if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
|
||||
continue;
|
||||
for (int j = i + 1; j < len; ++j) {
|
||||
const ReadEnds *rhs = readEndsArr[j];
|
||||
if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
|
||||
continue;
|
||||
if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
|
||||
continue; // both already marked, no need to check
|
||||
if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
|
||||
// At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
|
||||
// as a duplicate of the keeper (but not both - that's checked above), so be careful about which
|
||||
// one to now mark as a duplicate.
|
||||
int index = opticalDuplicateFlags[j] ? i : j;
|
||||
opticalDuplicateFlags[index] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks through the set of reads and identifies how many of the duplicates are
|
||||
* in fact optical duplicates, and stores the data in the instance level histogram.
|
||||
*
|
||||
* We expect only reads with FR or RF orientations, not a mixture of both.
|
||||
*
|
||||
* In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
|
||||
* of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
|
||||
* when we order orientation by the first mate sequenced (read #1 of the pair).
|
||||
*/
|
||||
static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||
vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
|
||||
// find OpticalDuplicates
|
||||
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
|
||||
int opticalDuplicates = 0;
|
||||
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
|
||||
if (opticalDuplicateFlags[i]) {
|
||||
++opticalDuplicates;
|
||||
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
||||
pRe->isOpticalDuplicate = true;
|
||||
}
|
||||
}
|
||||
if (opticalDuplicates > 0)
|
||||
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
|
||||
return opticalDuplicates;
|
||||
}
|
||||
|
||||
/**
|
||||
* 记录光学原因造成的冗余
|
||||
*/
|
||||
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||
bool hasFR = false, hasRF = false;
|
||||
// Check to see if we have a mixture of FR/RF
|
||||
for (auto pRe : readEndsArr) {
|
||||
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||
hasFR = true;
|
||||
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||
hasRF = true;
|
||||
}
|
||||
|
||||
// Check if we need to partition since the orientations could have changed
|
||||
int nOpticalDup;
|
||||
if (hasFR && hasRF) { // need to track them independently
|
||||
vector<const ReadEnds *> trackOpticalDuplicatesF;
|
||||
vector<const ReadEnds *> trackOpticalDuplicatesR;
|
||||
// Split into two lists: first of pairs and second of pairs,
|
||||
// since they must have orientation and same starting end
|
||||
for (auto pRe: readEndsArr) {
|
||||
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||
trackOpticalDuplicatesF.push_back(pRe);
|
||||
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||
trackOpticalDuplicatesR.push_back(pRe);
|
||||
else
|
||||
cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
|
||||
}
|
||||
// track the duplicates
|
||||
int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
|
||||
int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
|
||||
nOpticalDup = nOpticalDupF + nOpticalDupR;
|
||||
} else { // No need to partition
|
||||
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
|
||||
}
|
||||
|
||||
// trackDuplicateCounts
|
||||
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
|
||||
if (readEndsArr.size() > nOpticalDup)
|
||||
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
|
||||
if (nOpticalDup)
|
||||
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
|
||||
}
|
||||
|
|
@ -1,60 +1,64 @@
|
|||
#pragma once
|
||||
|
||||
#include <robin-map/include/tsl/robin_map.h>
|
||||
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
|
||||
using std::priority_queue;
|
||||
using std::unordered_map;
|
||||
using std::unordered_set;
|
||||
using std::vector;
|
||||
|
||||
/* 前向声明 */
|
||||
class BamWrap;
|
||||
class ReadEnds;
|
||||
class ReadNameParser;
|
||||
|
||||
/* 存放readend或者冗余idx,避免频繁的开辟和释放内存 */
|
||||
template <class T>
|
||||
struct DupContainer
|
||||
{
|
||||
vector<vector<T>> arr; // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
|
||||
vector<int64_t> pos; // arr中每个元素对应的position
|
||||
struct DupContainer {
|
||||
vector<vector<T>> arr; // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
|
||||
vector<int64_t> pos; // arr中每个元素对应的position
|
||||
// unordered_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
||||
tsl::robin_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
||||
int64_t size = 0; // 实际使用的空间
|
||||
int64_t capacity = 0; // 内存容量
|
||||
inline void Init()
|
||||
{
|
||||
tsl::robin_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
|
||||
int64_t size = 0; // 实际使用的空间
|
||||
int64_t capacity = 0; // 内存容量
|
||||
inline void Init() {
|
||||
idx.clear();
|
||||
size = 0;
|
||||
}
|
||||
inline void SortAtPos(int64_t p) // 这里的pos表示位点
|
||||
inline void SortAtPos(int64_t p) // 这里的pos表示位点
|
||||
{
|
||||
if (idx.find(p) != idx.end())
|
||||
{
|
||||
if (idx.find(p) != idx.end()) {
|
||||
const int64_t i = idx.at(p);
|
||||
std::sort(arr[i].begin(), arr[i].end());
|
||||
}
|
||||
}
|
||||
inline void SortAtIdx(int64_t i) // 这里的i表示vector的索引
|
||||
inline void SortAtIdx(int64_t i) // 这里的i表示vector的索引
|
||||
{
|
||||
std::sort(arr[i].begin(), arr[i].end());
|
||||
}
|
||||
inline void RemoveAtPos(int64_t p)
|
||||
{
|
||||
if (idx.find(p) != idx.end())
|
||||
{
|
||||
inline void RemoveAtPos(int64_t p) {
|
||||
if (idx.find(p) != idx.end()) {
|
||||
const int64_t i = idx.at(p);
|
||||
arr[i].clear();
|
||||
}
|
||||
}
|
||||
inline void RemoveAtIdx(int64_t i) // 这里的i表示vector的索引
|
||||
inline void RemoveAtIdx(int64_t i) // 这里的i表示vector的索引
|
||||
{
|
||||
arr[i].clear();
|
||||
}
|
||||
inline void AddAtPos(int64_t p, T &val)
|
||||
{
|
||||
AtPos(p).push_back(val);
|
||||
}
|
||||
|
||||
inline vector<T> &AtPos(int64_t p)
|
||||
{
|
||||
if (idx.find(p) != idx.end())
|
||||
{
|
||||
inline void AddAtPos(int64_t p, T &val) { AtPos(p).push_back(val); }
|
||||
|
||||
inline vector<T> &AtPos(int64_t p) {
|
||||
if (idx.find(p) != idx.end()) {
|
||||
const int64_t i = idx.at(p);
|
||||
return arr[i];
|
||||
}
|
||||
|
||||
if (size >= capacity)
|
||||
{
|
||||
if (size >= capacity) {
|
||||
capacity += 1;
|
||||
arr.push_back(vector<T>());
|
||||
pos.push_back(0);
|
||||
|
|
@ -67,308 +71,131 @@ struct DupContainer
|
|||
}
|
||||
};
|
||||
|
||||
/* 清除key位置的数据 */
|
||||
static inline void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
|
||||
{
|
||||
auto &msIdx = *pmsIdx;
|
||||
if (msIdx.find(key) != msIdx.end())
|
||||
msIdx[key].clear(); // 清除该位点的冗余结果
|
||||
}
|
||||
|
||||
/* 删除key位置的数据 */
|
||||
static inline void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
|
||||
{
|
||||
auto &msIdx = *pmsIdx;
|
||||
if (msIdx.find(key) != msIdx.end())
|
||||
msIdx.erase(key);
|
||||
}
|
||||
/*
|
||||
* 优先队列,用最小堆来实现对所有冗余索引的排序
|
||||
*/
|
||||
struct PairArrIdIdx {
|
||||
int arrId = 0;
|
||||
uint64_t arrIdx = 0;
|
||||
int64_t dupIdx = 0;
|
||||
};
|
||||
|
||||
struct IdxGreaterThan {
|
||||
bool operator()(const PairArrIdIdx &a, const PairArrIdIdx &b) { return a.dupIdx > b.dupIdx; }
|
||||
};
|
||||
|
||||
struct DupIdxQueue {
|
||||
// 将冗余索引和他对应的task vector对应起来
|
||||
|
||||
// 由于是多个task来查找冗余,所以每次找到的冗余index都放在一个独立的vector中,vector之间可能有重叠,所以需要用一个最小堆来维护
|
||||
vector<vector<int64_t>> *dupIdx2DArr;
|
||||
priority_queue<PairArrIdIdx, vector<PairArrIdIdx>, IdxGreaterThan> minHeap;
|
||||
uint64_t popNum = 0;
|
||||
|
||||
int Init(vector<vector<int64_t>> *_dupIdx2DArr) {
|
||||
dupIdx2DArr = _dupIdx2DArr;
|
||||
if (dupIdx2DArr == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
for (int i = 0; i < dupIdx2DArr->size(); ++i) {
|
||||
auto &v = (*dupIdx2DArr)[i];
|
||||
if (!v.empty()) {
|
||||
minHeap.push({i, 1, v[0]});
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t Pop() {
|
||||
int64_t ret = -1;
|
||||
if (!minHeap.empty()) {
|
||||
auto idx = minHeap.top();
|
||||
minHeap.pop();
|
||||
++popNum;
|
||||
ret = idx.dupIdx;
|
||||
auto &v = (*dupIdx2DArr)[idx.arrId];
|
||||
if (v.size() > idx.arrIdx) {
|
||||
minHeap.push({idx.arrId, idx.arrIdx + 1, v[idx.arrIdx]});
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
uint64_t Size() {
|
||||
uint64_t len = 0;
|
||||
if (dupIdx2DArr != nullptr) {
|
||||
for (auto &v : *dupIdx2DArr) {
|
||||
len += v.size();
|
||||
}
|
||||
}
|
||||
return len - popNum;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* 用来检测optical duplication的graph
|
||||
*/
|
||||
template <class Node>
|
||||
struct Graph { // 用set?
|
||||
unordered_set<Node> nodes; // 图中的结点
|
||||
unordered_map<Node*, unordered_set<Node*>> neighbors; // 邻接列表
|
||||
|
||||
Node *addNode(const Node &singleton) {
|
||||
if (nodes.find(singleton) == nodes.end()) {
|
||||
Node *n = const_cast<Node *>(&(*nodes.insert(singleton).first));
|
||||
neighbors[n].clear();
|
||||
return n;
|
||||
}
|
||||
return const_cast<Node *>(&(*nodes.find(singleton)));
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* 计算read的分数
|
||||
*/
|
||||
static int16_t computeDuplicateScore(BamWrap &bw)
|
||||
{
|
||||
int16_t score = 0;
|
||||
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
|
||||
{
|
||||
case ns_md::SUM_OF_BASE_QUALITIES:
|
||||
// two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
|
||||
// and risk overflow.
|
||||
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
||||
break;
|
||||
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
|
||||
if (!bw.GetReadUnmappedFlag())
|
||||
// no need to remember the score since this scoring mechanism is symmetric
|
||||
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
||||
break;
|
||||
case ns_md::RANDOM:
|
||||
// The RANDOM score gives the same score to both reads so that they get filtered together.
|
||||
// it's not critical do use the readName since the scores from both ends get added, but it seem
|
||||
// to be clearer this way.
|
||||
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
||||
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
||||
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
|
||||
// not passing filters. We need to stay far from overflow so that when we add the two
|
||||
// scores from the two read mates we do not overflow since that could cause us to chose a
|
||||
// failing read-pair instead of a passing one.
|
||||
score -= INT16_MIN / 4;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
|
||||
// for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
|
||||
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
||||
|
||||
return score;
|
||||
}
|
||||
int16_t computeDuplicateScore(BamWrap &bw);
|
||||
|
||||
/*
|
||||
* Builds a read ends object that represents a single read. 用来表示一个read的特征结构
|
||||
* Builds a read ends object that represents a single read.
|
||||
* 用来表示一个read的特征结构
|
||||
*/
|
||||
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
|
||||
{
|
||||
auto &k = *pKey;
|
||||
auto &bc = bw.b->core;
|
||||
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
|
||||
k.read1ReferenceIndex = bc.tid;
|
||||
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
||||
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
||||
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
|
||||
|
||||
k.read1IndexInFile = index;
|
||||
k.score = computeDuplicateScore(bw);
|
||||
// Doing this lets the ends object know that it's part of a pair
|
||||
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
|
||||
{
|
||||
k.read2ReferenceIndex = bc.mtid;
|
||||
}
|
||||
// Fill in the location information for optical duplicates
|
||||
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
||||
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
||||
// 计算位置key
|
||||
k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||
}
|
||||
/*
|
||||
* 处理一组pairend的readends,标记冗余
|
||||
*/
|
||||
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
||||
|
||||
/*
|
||||
* 处理一组非paired的readends,标记冗余
|
||||
*/
|
||||
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||
DupContainer<int64_t> *dupIdx);
|
||||
|
||||
/*
|
||||
* 处理位于某个坐标的pairend reads
|
||||
*/
|
||||
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
||||
|
||||
/*
|
||||
* 处理位于某个坐标的非配对的frag reads
|
||||
*/
|
||||
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
||||
DupContainer<int64_t> *dupIdx);
|
||||
|
||||
/*
|
||||
* 对找到的pairend read end添加一些信息
|
||||
*/
|
||||
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds);
|
||||
|
||||
/**
|
||||
* Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
|
||||
* quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
|
||||
* read to the records of the first and second in a pair. This value becomes is used for
|
||||
* the 'DI' tag.
|
||||
* Looks through the set of reads and identifies how many of the duplicates are
|
||||
* in fact optical duplicates, and stores the data in the instance level histogram.
|
||||
* Additionally sets the transient isOpticalDuplicate flag on each read end that is
|
||||
* identified as an optical duplicate.
|
||||
* 记录光学原因造成的冗余
|
||||
*/
|
||||
static void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe)
|
||||
{
|
||||
}
|
||||
|
||||
/* 处理一组pairend的readends,标记冗余 */
|
||||
static void markDuplicatePairs(int64_t posKey,
|
||||
vector<const ReadEnds *> &vpRe,
|
||||
DupContainer<int64_t> *dupIdx,
|
||||
DupContainer<int64_t> *opticalDupIdx)
|
||||
{
|
||||
if (vpRe.size() < 2)
|
||||
{
|
||||
if (vpRe.size() == 1)
|
||||
{
|
||||
// addSingletonToCount(libraryIdGenerator);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
|
||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
|
||||
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) // 找分数最高的readend
|
||||
{
|
||||
if (pe->score > maxScore || pBest == nullptr)
|
||||
{
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
||||
{
|
||||
// trackOpticalDuplicates
|
||||
}
|
||||
for (auto pe : vpRe) // 对非best read标记冗余
|
||||
{
|
||||
if (pe != pBest) // 非best
|
||||
{
|
||||
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
|
||||
}
|
||||
}
|
||||
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
||||
{
|
||||
addRepresentativeReadIndex(vpRe);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理一组非paired的readends,标记冗余 */
|
||||
static void markDuplicateFragments(int64_t posKey,
|
||||
vector<const ReadEnds *> &vpRe,
|
||||
bool containsPairs,
|
||||
DupContainer<int64_t> *dupIdx)
|
||||
{
|
||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||
|
||||
if (containsPairs)
|
||||
{
|
||||
for (auto pe : vpRe)
|
||||
{
|
||||
if (!pe->IsPaired())
|
||||
{
|
||||
vDupIdx.push_back(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
for (auto pe : vpRe)
|
||||
{
|
||||
if (pe->score > maxScore || pBest == nullptr)
|
||||
{
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto pe : vpRe)
|
||||
{
|
||||
if (pe != pBest)
|
||||
{
|
||||
vDupIdx.push_back(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理位于某个坐标的pairend reads */
|
||||
static inline void handlePairs(int64_t posKey,
|
||||
vector<ReadEnds> &readEnds,
|
||||
vector<const ReadEnds *> &vpCache,
|
||||
DupContainer<int64_t> *dupIdx,
|
||||
DupContainer<int64_t> *opticalDupIdx)
|
||||
{
|
||||
if (readEnds.size() > 1) // 有潜在的冗余
|
||||
{
|
||||
vpCache.clear();
|
||||
// std::sort(readEnds.begin(), readEnds.end());
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds)
|
||||
{
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||
else
|
||||
{
|
||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); // 不一样
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
}
|
||||
}
|
||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理位于某个坐标的 reads */
|
||||
static inline void handleFrags(
|
||||
int64_t posKey,
|
||||
vector<ReadEnds> &readEnds,
|
||||
vector<const ReadEnds *> &vpCache,
|
||||
DupContainer<int64_t> *dupIdx)
|
||||
{
|
||||
if (readEnds.size() > 1) // 有潜在的冗余
|
||||
{
|
||||
vpCache.clear();
|
||||
// std::sort(readEnds.begin(), readEnds.end());
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
bool containsPairs = false;
|
||||
bool containsFrags = false;
|
||||
for (auto &re : readEnds)
|
||||
{
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
||||
{
|
||||
vpCache.push_back(&re);
|
||||
containsPairs = containsPairs || re.IsPaired();
|
||||
containsFrags = containsFrags || !re.IsPaired();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (vpCache.size() > 1 && containsFrags)
|
||||
{
|
||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||
}
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
containsPairs = re.IsPaired();
|
||||
containsFrags = !re.IsPaired();
|
||||
}
|
||||
}
|
||||
if (vpCache.size() > 1 && containsFrags)
|
||||
{
|
||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 对找到的pairend read end添加一些信息 */
|
||||
static inline void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds)
|
||||
{
|
||||
auto &pairedEnds = *pPairedEnds;
|
||||
int64_t bamIdx = fragEnd.read1IndexInFile;
|
||||
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
||||
const int matesCoordinate = fragEnd.read1Coordinate;
|
||||
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
|
||||
// before updating the orientation later.
|
||||
if (fragEnd.read1FirstOfPair)
|
||||
{
|
||||
pairedEnds.orientationForOpticalDuplicates =
|
||||
ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
|
||||
}
|
||||
else
|
||||
{
|
||||
pairedEnds.orientationForOpticalDuplicates =
|
||||
ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
|
||||
}
|
||||
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
|
||||
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
|
||||
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
|
||||
{
|
||||
pairedEnds.read2ReferenceIndex = matesRefIndex;
|
||||
pairedEnds.read2Coordinate = matesCoordinate;
|
||||
pairedEnds.read2IndexInFile = bamIdx;
|
||||
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
|
||||
fragEnd.IsNegativeStrand());
|
||||
|
||||
// if the two read ends are in the same position, pointing in opposite directions,
|
||||
// the orientation is undefined and the procedure above
|
||||
// will depend on the order of the reads in the file.
|
||||
// To avoid this, we set it explicitly (to FR):
|
||||
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
||||
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
||||
pairedEnds.orientation == ReadEnds::RF)
|
||||
{
|
||||
pairedEnds.orientation = ReadEnds::FR;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
||||
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
||||
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
|
||||
pairedEnds.read1ReferenceIndex = matesRefIndex;
|
||||
pairedEnds.read1Coordinate = matesCoordinate;
|
||||
pairedEnds.read1IndexInFile = bamIdx;
|
||||
pairedEnds.orientation = ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(),
|
||||
pairedEnds.orientation == ReadEnds::R);
|
||||
pairedEnds.posKey = fragEnd.posKey;
|
||||
}
|
||||
pairedEnds.score += fragEnd.score;
|
||||
}
|
||||
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe);
|
||||
|
|
@ -0,0 +1,783 @@
|
|||
#include "serial_md.h"
|
||||
|
||||
#include <common/hts/bam_buf.h>
|
||||
#include <common/utils/debug.h>
|
||||
#include <common/utils/global_arg.h>
|
||||
#include <common/utils/profiling.h>
|
||||
#include <common/utils/timer.h>
|
||||
#include <common/utils/util.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "markdups_arg.h"
|
||||
#include "md_funcs.h"
|
||||
#include "shared_args.h"
|
||||
#include "dup_metrics.h"
|
||||
|
||||
using std::cout;
|
||||
using std::set;
|
||||
using std::vector;
|
||||
|
||||
/* 查找 */
|
||||
// template<class Itr, class T>
|
||||
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
|
||||
// {
|
||||
// first = std::lower_bound(first, last, val);
|
||||
// return (first != last && *first == val) ? first : last;
|
||||
// }
|
||||
|
||||
/* 排序 */
|
||||
static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
|
||||
size_t blockSize = 64 * 1024;
|
||||
if (arr.size() < blockSize) {
|
||||
std::sort(arr.begin(), arr.end());
|
||||
return;
|
||||
}
|
||||
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
|
||||
size_t crossNum = 1024;
|
||||
size_t start, end, i, left, right;
|
||||
std::sort(arr.begin(), arr.begin() + blockSize);
|
||||
for (i = 1; i < blockNum; ++i) {
|
||||
start = i * blockSize;
|
||||
end = min(start + blockSize, arr.size());
|
||||
std::sort(arr.begin() + start, arr.begin() + end);
|
||||
left = crossNum;
|
||||
while (!(arr[start - left] < arr[start])) {
|
||||
left = left << 1;
|
||||
if (left >= blockSize) {
|
||||
std::sort(arr.begin(), arr.end()); // 退化到普通排序
|
||||
return;
|
||||
}
|
||||
}
|
||||
right = min(crossNum, end - start - 1);
|
||||
|
||||
while (!(arr[start - 1] < arr[start + right])) {
|
||||
right = min(right << 1, end - start - 1);
|
||||
if (right == end - start - 1)
|
||||
break;
|
||||
}
|
||||
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
|
||||
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr) {
|
||||
if (vpRe.size() < 2) {
|
||||
if (vpRe.size() == 1) {
|
||||
// addSingletonToCount(libraryIdGenerator);
|
||||
// 这个统计可能会有误差,因为当前位点可能还有没匹配上的read,导致当前位点的read(paired)数量为1
|
||||
// 可以通过后续的补充计算来解决这个问题,有必要么?
|
||||
gMetrics.AddSingletonToCount();
|
||||
}
|
||||
return;
|
||||
}
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) { // 找分数最高的readend
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (notDupIdx != nullptr) {
|
||||
notDupIdx->insert(pBest->read1IndexInFile);
|
||||
notDupIdx->insert(pBest->read2IndexInFile);
|
||||
}
|
||||
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
|
||||
// trackOpticalDuplicates
|
||||
trackOpticalDuplicates(vpRe, pBest);
|
||||
}
|
||||
for (auto pe : vpRe) // 对非best read标记冗余
|
||||
{
|
||||
if (pe != pBest) // 非best
|
||||
{
|
||||
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
||||
}
|
||||
}
|
||||
// 在输出的bam文件中添加tag
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
|
||||
// addRepresentativeReadIndex(vpRe); // 每次都更新就行,用最新的覆盖之前的(如果之前有)
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理一组非paired的readends,标记冗余 */
|
||||
static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs, set<int64_t> *dupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr) {
|
||||
if (containsPairs) {
|
||||
for (auto pe : vpRe) {
|
||||
if (!pe->IsPaired()) {
|
||||
dupIdx->insert(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
for (auto pe : vpRe) {
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (notDupIdx != nullptr) {
|
||||
notDupIdx->insert(pBest->read1IndexInFile);
|
||||
}
|
||||
for (auto pe : vpRe) {
|
||||
if (pe != pBest) {
|
||||
dupIdx->insert(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 找到与readend pos相等的所有readend */
|
||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
||||
auto range = std::equal_range(src.begin(), src.end(), re,
|
||||
ReadEnds::PairsLittleThan); // 只比对位点
|
||||
dst->insert(dst->end(), range.first, range.second);
|
||||
}
|
||||
|
||||
/* 单线程生成readends (第一步)*/
|
||||
static void generateReadEnds(SerailMarkDupArg *arg) {
|
||||
auto &p = *arg;
|
||||
auto &rnParser = g_vRnParser[0];
|
||||
|
||||
p.pairs.clear();
|
||||
p.frags.clear();
|
||||
p.unpairedDic.clear();
|
||||
p.unpairedPosArr.clear();
|
||||
|
||||
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
||||
set<ReadEnds> reSet;
|
||||
|
||||
ReadEnds lastRe;
|
||||
|
||||
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
|
||||
{
|
||||
BamWrap *bw = p.bams[i];
|
||||
const int64_t bamIdx = p.bamStartIdx + i;
|
||||
if (bw->GetReadUnmappedFlag()) {
|
||||
if (bw->b->core.tid == -1)
|
||||
// When we hit the unmapped reads with no coordinate, no reason
|
||||
// to continue (only in coordinate sort).
|
||||
break;
|
||||
} else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
||||
{
|
||||
ReadEnds fragEnd;
|
||||
tm_arr[8].acc_start();
|
||||
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
||||
tm_arr[8].acc_end();
|
||||
p.frags.push_back(fragEnd); // 添加进frag集合
|
||||
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
||||
{
|
||||
string key = bw->query_name();
|
||||
if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
|
||||
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
||||
} else // 找到了pairend
|
||||
{
|
||||
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
||||
modifyPairedEnds(fragEnd, &pairedEnds);
|
||||
p.pairs.push_back(pairedEnds);
|
||||
p.unpairedDic.erase(key); // 删除找到的pairend
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tm_arr[9].acc_start();
|
||||
sortReadEndsArr(p.frags);
|
||||
// sort(p.frags.begin(), p.frags.end());
|
||||
tm_arr[9].acc_end();
|
||||
// cout << "sort pairs" << endl;
|
||||
tm_arr[10].acc_start();
|
||||
sort(p.pairs.begin(), p.pairs.end());
|
||||
tm_arr[10].acc_end();
|
||||
// 记录位点上的未匹配的read个数
|
||||
for (auto &e : p.unpairedDic) {
|
||||
auto posKey = e.second.unpairedRE.posKey;
|
||||
auto &unpairArrInfo = p.unpairedPosArr[posKey];
|
||||
unpairArrInfo.unpairedNum++;
|
||||
unpairArrInfo.taskSeq = p.taskSeq;
|
||||
unpairArrInfo.readNameSet.insert(e.first);
|
||||
}
|
||||
// cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() <<
|
||||
// endl;
|
||||
}
|
||||
|
||||
/* 处理pairs */
|
||||
static void processPairs(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr) {
|
||||
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds) {
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||
else {
|
||||
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
}
|
||||
}
|
||||
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
|
||||
}
|
||||
|
||||
/* 处理frags */
|
||||
static void processFrags(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *notDupIdx = nullptr) {
|
||||
bool containsPairs = false;
|
||||
bool containsFrags = false;
|
||||
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds) {
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
|
||||
vpCache.push_back(&re);
|
||||
containsPairs = containsPairs || re.IsPaired();
|
||||
containsFrags = containsFrags || !re.IsPaired();
|
||||
} else {
|
||||
if (vpCache.size() > 1 && containsFrags) {
|
||||
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
||||
}
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
containsPairs = re.IsPaired();
|
||||
containsFrags = !re.IsPaired();
|
||||
}
|
||||
}
|
||||
if (vpCache.size() > 1 && containsFrags) {
|
||||
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 单线程markdup (第二步)*/
|
||||
static void markdups(SerailMarkDupArg *arg) {
|
||||
auto &p = *arg;
|
||||
p.pairDupIdx.clear();
|
||||
p.pairOpticalDupIdx.clear();
|
||||
p.fragDupIdx.clear();
|
||||
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
||||
// 先处理 pair
|
||||
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
|
||||
|
||||
// 再处理frag
|
||||
processFrags(p.frags, &p.fragDupIdx);
|
||||
}
|
||||
|
||||
/* 获取交叉部分的数据 */
|
||||
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
|
||||
bool isPairCmp = false) {
|
||||
if (leftArr.empty() || rightArr.empty()) {
|
||||
cout << "bad size: " << leftArr.size() << '\t' << rightArr.size() << endl;
|
||||
return;
|
||||
}
|
||||
const size_t leftEndIdx = leftArr.size() - 1;
|
||||
const size_t rightStartIdx = 0;
|
||||
size_t leftSpan = 0;
|
||||
size_t rightSpan = 0;
|
||||
|
||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
|
||||
leftSpan += 1;
|
||||
if (leftSpan > leftEndIdx) {
|
||||
leftSpan = leftArr.size() - 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
|
||||
rightSpan += 1;
|
||||
if (rightSpan == rightArr.size() - 1)
|
||||
break;
|
||||
}
|
||||
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
||||
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
||||
std::sort(dst->begin(), dst->end());
|
||||
}
|
||||
|
||||
/* 将frags重叠部分的dup idx放进数据中 */
|
||||
static inline void refreshFragDupIdx(set<int64_t> &dupIdx, set<int64_t> ¬DupIdx, SerailMarkDupArg *lastArg,
|
||||
SerailMarkDupArg *curArg) {
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
for (auto idx : dupIdx) {
|
||||
lp.fragDupIdx.insert(idx);
|
||||
p.fragDupIdx.erase(idx);
|
||||
}
|
||||
for (auto idx : notDupIdx) {
|
||||
lp.fragDupIdx.erase(idx);
|
||||
p.fragDupIdx.erase(idx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 将pairs重叠部分的dup idx放进数据中 */
|
||||
static inline void refreshPairDupIdx(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> ¬DupIdx,
|
||||
SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
for (auto idx : dupIdx) {
|
||||
lp.pairDupIdx.insert(idx);
|
||||
p.pairDupIdx.erase(idx);
|
||||
}
|
||||
for (auto idx : opticalDupIdx) {
|
||||
lp.pairOpticalDupIdx.insert(idx);
|
||||
p.pairOpticalDupIdx.erase(idx);
|
||||
}
|
||||
for (auto idx : notDupIdx) {
|
||||
lp.pairDupIdx.erase(idx);
|
||||
lp.pairOpticalDupIdx.erase(idx);
|
||||
p.pairDupIdx.erase(idx);
|
||||
p.pairOpticalDupIdx.erase(idx);
|
||||
}
|
||||
}
|
||||
|
||||
// 用来分别处理dup和optical dup
|
||||
static void refeshTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> ¬DupIdx,
|
||||
set<int64_t> &latterDupIdx, set<int64_t> &latterOpticalDupIdx,
|
||||
set<int64_t> &latterNotDupIdx) {
|
||||
for (auto idx : dupIdx) latterDupIdx.insert(idx);
|
||||
for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx);
|
||||
for (auto idx : notDupIdx) latterNotDupIdx.insert(idx);
|
||||
}
|
||||
|
||||
/* 最后合并数据并排序 */
|
||||
static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> ¬DupIdx, vector<int64_t> &dupArr) {
|
||||
vector<int64_t> midArr;
|
||||
|
||||
auto ai = dupArr.begin();
|
||||
auto bi = dupIdx.begin();
|
||||
auto ae = dupArr.end();
|
||||
auto be = dupIdx.end();
|
||||
|
||||
int64_t val = 0;
|
||||
while (ai != ae && bi != be) {
|
||||
if (*ai < *bi) {
|
||||
val = *ai++;
|
||||
} else if (*bi < *ai) {
|
||||
val = *bi++;
|
||||
} else {
|
||||
val = *ai++;
|
||||
bi++;
|
||||
}
|
||||
if (notDupIdx.find(val) == notDupIdx.end()) {
|
||||
midArr.push_back(val);
|
||||
}
|
||||
}
|
||||
while (ai != ae) {
|
||||
val = *ai++;
|
||||
if (notDupIdx.find(val) == notDupIdx.end()) {
|
||||
midArr.push_back(val);
|
||||
}
|
||||
}
|
||||
while (bi != be) {
|
||||
val = *bi++;
|
||||
if (notDupIdx.find(val) == notDupIdx.end()) {
|
||||
midArr.push_back(val);
|
||||
}
|
||||
}
|
||||
dupArr = midArr;
|
||||
}
|
||||
|
||||
/* 处理相邻的两个任务,有相交叉的数据 */
|
||||
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) {
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
auto &g = *gDataArg;
|
||||
|
||||
vector<ReadEnds> reArr;
|
||||
set<int64_t> dupIdx;
|
||||
set<int64_t> notDupIdx;
|
||||
// 先处理重叠的frags
|
||||
getIntersectData(lp.frags, p.frags, &reArr);
|
||||
processFrags(reArr, &dupIdx, ¬DupIdx);
|
||||
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
|
||||
|
||||
// 再处理重叠的pairs
|
||||
reArr.clear();
|
||||
dupIdx.clear();
|
||||
notDupIdx.clear();
|
||||
set<int64_t> opticalDupIdx;
|
||||
getIntersectData(lp.pairs, p.pairs, &reArr, true);
|
||||
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
||||
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
||||
|
||||
// 处理之前未匹配的部分
|
||||
map<CalcKey, int64_t> recalcPos;
|
||||
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
|
||||
set<int64_t> addToGlobal;
|
||||
int64_t prevLastPos = 0, nextFirstPos = 0;
|
||||
if (lp.frags.size() > 0)
|
||||
prevLastPos = lp.frags.back().posKey;
|
||||
if (p.frags.size() > 0)
|
||||
nextFirstPos = p.frags[0].posKey;
|
||||
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
|
||||
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
|
||||
auto &readName = prevUnpair.first;
|
||||
auto &prevPosInfo = prevUnpair.second;
|
||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||
|
||||
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) { // 在当前这个任务里找到了这个未匹配的read
|
||||
auto &nextPosInfo = p.unpairedDic[readName];
|
||||
auto &nextFragEnd = nextPosInfo.unpairedRE;
|
||||
int64_t prevPosKey = prevFragEnd.posKey;
|
||||
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
|
||||
CalcKey ck = {prevPosKey, nextPosKey};
|
||||
UnpairedPosInfo *prevUnpairInfoP = nullptr;
|
||||
UnpairedPosInfo *nextUnpairInfoP = nullptr;
|
||||
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
|
||||
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
|
||||
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
|
||||
|
||||
// pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定
|
||||
// 1.
|
||||
// prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
||||
// 2.
|
||||
// prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read
|
||||
// pair进行重新计算
|
||||
// 复杂情况1.
|
||||
// g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||
// 3.
|
||||
// prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read
|
||||
// pair进行重新计算
|
||||
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
||||
// 4.
|
||||
// prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read
|
||||
// pair进行重新计算
|
||||
|
||||
bool addDataToPos = true;
|
||||
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
|
||||
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
||||
} else
|
||||
alreadyAdd.insert(ck);
|
||||
|
||||
if (prevPosKey < nextFirstPos) { // prevpos在交叉部分之前
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (nextPosKey <= prevLastPos && addDataToPos) { // 第二种情况
|
||||
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||
}
|
||||
// 第一种情况,第二种情况下都会出现,复杂情况一
|
||||
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
|
||||
if (gPosInfo != g.unpairedPosArr.end()) { // 可能g和p有匹配的,刚好和该位点一致
|
||||
auto &gUnpairInfo = gPosInfo->second;
|
||||
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
|
||||
if (pPosInfo != p.unpairedPosArr.end()) {
|
||||
auto &pUnpairInfo = pPosInfo->second;
|
||||
for (auto &rn : gUnpairInfo.readNameSet) { // 遍历每一个readname,看是否有匹配的
|
||||
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) {
|
||||
auto pe = g.unpairedDic[rn].unpairedRE;
|
||||
auto fe = p.unpairedDic[rn].unpairedRE;
|
||||
modifyPairedEnds(fe, &pe);
|
||||
prevPairArr.push_back(pe);
|
||||
g.unpairedDic.erase(rn);
|
||||
p.unpairedDic.erase(rn);
|
||||
// cout << "找到了!" << rn << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
} else { // prevpos在交叉部分
|
||||
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
|
||||
if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
||||
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
||||
nextPairArr.push_back(prevFragEnd);
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (addDataToPos) {
|
||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||
}
|
||||
// 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
||||
recalcPos[ck] = nextPosInfo.taskSeq;
|
||||
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
} else { // next task在该位点没有unpair,那就把数据放到prev task里
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (addDataToPos) // 第二种情况
|
||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
}
|
||||
} else { // 第四种情况
|
||||
if (prevUnpairInfoP == nullptr) {
|
||||
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||
prevUnpairInfoP->taskSeq = lp.taskSeq;
|
||||
}
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (addDataToPos) {
|
||||
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||
}
|
||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
}
|
||||
}
|
||||
p.unpairedDic.erase(readName); // 在next task里删除该read
|
||||
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
||||
auto &remainPosInfo = g.unpairedDic[readName];
|
||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||
int64_t remainPosKey = remainFragEnd.posKey;
|
||||
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||
auto &remainPairArr = remainUnpairInfo.pairArr;
|
||||
remainPairArr.push_back(remainFragEnd);
|
||||
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
|
||||
recalcPos[ck] = remainPosInfo.taskSeq;
|
||||
std::sort(remainPairArr.begin(), remainPairArr.end());
|
||||
|
||||
g.unpairedDic.erase(readName);
|
||||
} else { // 都没找到,那就保存到遗留数据里
|
||||
int64_t prevPosKey = prevFragEnd.posKey;
|
||||
g.unpairedDic.insert(prevUnpair);
|
||||
addToGlobal.insert(prevPosKey);
|
||||
}
|
||||
}
|
||||
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
set<int64_t> posProcessed;
|
||||
for (auto &e : recalcPos) {
|
||||
auto posKey = e.first.read1Pos;
|
||||
if (posProcessed.find(posKey) != posProcessed.end())
|
||||
continue;
|
||||
posProcessed.insert(posKey);
|
||||
auto taskSeq = e.second;
|
||||
auto &t = taskChanged[taskSeq];
|
||||
// 在对应的任务包含的dup idx里修改结果数据
|
||||
vector<ReadEnds> *pairArrP = nullptr;
|
||||
if (taskSeq < lp.taskSeq)
|
||||
pairArrP = &g.unpairedPosArr[posKey].pairArr;
|
||||
else
|
||||
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
|
||||
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||
if (taskSeq < lp.taskSeq)
|
||||
g.unpairedPosArr.erase(posKey);
|
||||
}
|
||||
// 更新结果
|
||||
|
||||
for (auto &e : taskChanged) {
|
||||
auto taskSeq = e.first;
|
||||
auto &t = e.second;
|
||||
if (taskSeq < lp.taskSeq) {
|
||||
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
|
||||
g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||
} else if (taskSeq == lp.taskSeq) {
|
||||
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
|
||||
} else {
|
||||
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
|
||||
}
|
||||
}
|
||||
|
||||
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' <<
|
||||
// g.unpairedPosArr.size() << endl; cout << "calc g time: " <<
|
||||
// t.seconds_elapsed() << " s" << endl; 将dupidx放进全局数据
|
||||
g.latterDupIdxArr.push_back(set<int64_t>());
|
||||
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
|
||||
g.latterNotDupIdxArr.push_back(set<int64_t>());
|
||||
|
||||
g.dupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vIdx = g.dupIdxArr.back();
|
||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
||||
|
||||
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
||||
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
||||
}
|
||||
|
||||
/* 当所有任务结束后,global data里还有未处理的数据 */
|
||||
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
||||
auto &lp = *task;
|
||||
auto &g = *gDataArg;
|
||||
// 遗留的未匹配的pair
|
||||
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
|
||||
auto &readName = prevUnpair.first;
|
||||
auto &prevPosInfo = prevUnpair.second;
|
||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||
|
||||
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
||||
auto &remainPosInfo = g.unpairedDic[readName];
|
||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||
int64_t remainPosKey = remainFragEnd.posKey;
|
||||
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||
|
||||
remainUnpairInfo.pairArr.push_back(remainFragEnd);
|
||||
g.unpairedDic.erase(readName);
|
||||
}
|
||||
}
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
for (auto &e : g.unpairedPosArr) {
|
||||
auto posKey = e.first;
|
||||
auto taskSeq = e.second.taskSeq;
|
||||
auto &t = taskChanged[taskSeq];
|
||||
auto &arr = g.unpairedPosArr[posKey].pairArr;
|
||||
|
||||
if (arr.size() > 1) {
|
||||
std::sort(arr.begin(), arr.end());
|
||||
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||
}
|
||||
}
|
||||
// 更新结果
|
||||
vector<int64_t> addDup;
|
||||
map<int64_t, int64_t> ndPosVal;
|
||||
for (auto &e : taskChanged) {
|
||||
auto taskSeq = e.first;
|
||||
auto &t = e.second;
|
||||
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
|
||||
g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||
}
|
||||
|
||||
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||
g.unpairedPosArr.clear();
|
||||
g.unpairedDic.clear();
|
||||
|
||||
// 将dupidx放进全局数据
|
||||
for (int i = 0; i < (int)g.dupIdxArr.size() - 1; ++i)
|
||||
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
|
||||
for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i)
|
||||
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
|
||||
|
||||
g.dupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vIdx = g.dupIdxArr.back();
|
||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
||||
|
||||
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
||||
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
||||
}
|
||||
|
||||
/* 串行处理数据,标记冗余 */
|
||||
void serialMarkDups() {
|
||||
tm_arr[5].acc_start();
|
||||
Timer::log_time("serial start");
|
||||
// 读取缓存初始化
|
||||
BamBufType inBamBuf(g_gArg.use_asyncio);
|
||||
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||
// BamBufType inBamBuf(false);
|
||||
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
|
||||
int64_t processedBamNum = 0;
|
||||
|
||||
SerailMarkDupArg smdArg1, smdArg2;
|
||||
SerailMarkDupArg *lastArgP = &smdArg1;
|
||||
SerailMarkDupArg *curArgP = &smdArg2;
|
||||
|
||||
bool isFirstRound = true;
|
||||
int roundNum = 0;
|
||||
int64_t readNumSum = 0;
|
||||
while (inBamBuf.ReadStat() >= 0) {
|
||||
Timer t_round;
|
||||
// 读取bam文件中的read
|
||||
tm_arr[4].acc_start();
|
||||
size_t readNum = inBamBuf.ReadBam();
|
||||
readNumSum += readNum;
|
||||
tm_arr[4].acc_end();
|
||||
cout << "read num: " << readNum << '\t' << roundNum << endl;
|
||||
// lastArgP = curArgP;
|
||||
tm_arr[6].acc_start();
|
||||
curArgP->taskSeq = roundNum;
|
||||
curArgP->bamStartIdx = processedBamNum;
|
||||
curArgP->bams = inBamBuf.GetBamArr();
|
||||
tm_arr[6].acc_end();
|
||||
|
||||
tm_arr[0].acc_start();
|
||||
Timer t1;
|
||||
generateReadEnds(curArgP);
|
||||
// cout << "calc read end time: " << t1.seconds_elapsed() << " s" <<
|
||||
// endl;
|
||||
tm_arr[0].acc_end();
|
||||
|
||||
tm_arr[1].acc_start();
|
||||
t1.reinit();
|
||||
markdups(curArgP);
|
||||
// cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
||||
tm_arr[1].acc_end();
|
||||
|
||||
if (!isFirstRound) {
|
||||
tm_arr[2].acc_start();
|
||||
t1.reinit();
|
||||
handleIntersectData(lastArgP, curArgP, &gData);
|
||||
// cout << "intersect time: " << t1.seconds_elapsed() << " s" <<
|
||||
// endl;
|
||||
// addTaskIdxToSet(lastArgP, &gData);
|
||||
tm_arr[2].acc_end();
|
||||
} else {
|
||||
isFirstRound = false;
|
||||
}
|
||||
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
||||
processedBamNum += readNum;
|
||||
|
||||
// 交换
|
||||
auto tmp = lastArgP;
|
||||
lastArgP = curArgP;
|
||||
curArgP = tmp;
|
||||
// cout << "round time: " << t_round.seconds_elapsed() << endl;
|
||||
roundNum++;
|
||||
if (roundNum % 100 == 0) {
|
||||
cout << "read sum: " << readNumSum << endl;
|
||||
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
|
||||
}
|
||||
}
|
||||
// cout << "here" << endl;
|
||||
tm_arr[3].acc_start();
|
||||
// 处理剩下的全局数据
|
||||
handleLastTask(lastArgP, &gData);
|
||||
// cout << "here 2" << endl;
|
||||
tm_arr[3].acc_end();
|
||||
|
||||
tm_arr[5].acc_end();
|
||||
// 统计所有冗余index数量
|
||||
int64_t dupNum = 0;
|
||||
map<int64_t, int> dup;
|
||||
|
||||
int taskSeq = 0;
|
||||
for (auto &arr : gData.dupIdxArr) {
|
||||
for (auto idx : arr) {
|
||||
if (dup.find(idx) != dup.end()) {
|
||||
// cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
|
||||
// << idx << endl;
|
||||
}
|
||||
dup[idx] = taskSeq;
|
||||
}
|
||||
taskSeq++;
|
||||
}
|
||||
// #include <fstream>
|
||||
// ofstream out("tumor_dup.txt");
|
||||
// for (auto idx : dup)
|
||||
// {
|
||||
// out << idx << endl;
|
||||
// }
|
||||
// out.close();
|
||||
|
||||
for (auto &arr : gData.dupIdxArr) dupNum += arr.size();
|
||||
|
||||
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
|
||||
|
||||
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
||||
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
|
||||
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
|
||||
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
|
||||
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
||||
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
||||
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
||||
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||
|
||||
Timer::log_time("serial end ");
|
||||
|
||||
// for (auto i : gData.dupArr)
|
||||
// cout << i << endl;
|
||||
}
|
||||
|
|
@ -1,20 +1,28 @@
|
|||
#include <algorithm>
|
||||
#pragma once
|
||||
|
||||
#include <common/hts/bam_buf.h>
|
||||
#include <robin-map/include/tsl/robin_map.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using std::set;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
/* 存放未匹配readend相同位点的所有readend */
|
||||
struct UnpairedREInfo
|
||||
{
|
||||
struct UnpairedREInfo {
|
||||
int64_t taskSeq;
|
||||
ReadEnds unpairedRE;
|
||||
};
|
||||
|
||||
/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */
|
||||
struct CalcKey
|
||||
{
|
||||
struct CalcKey {
|
||||
int64_t read1Pos;
|
||||
int64_t read2Pos;
|
||||
bool operator<(const CalcKey &o) const
|
||||
{
|
||||
bool operator<(const CalcKey &o) const {
|
||||
int comp = (int)(read1Pos - o.read1Pos);
|
||||
if (comp == 0)
|
||||
comp = (int)(read2Pos - o.read2Pos);
|
||||
|
|
@ -23,16 +31,14 @@ struct CalcKey
|
|||
};
|
||||
|
||||
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
||||
struct TaskSeqDupInfo
|
||||
{
|
||||
struct TaskSeqDupInfo {
|
||||
set<int64_t> dupIdx;
|
||||
set<int64_t> opticalDupIdx;
|
||||
set<int64_t> notDupIdx;
|
||||
};
|
||||
|
||||
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
||||
struct UnpairedPosInfo
|
||||
{
|
||||
struct UnpairedPosInfo {
|
||||
int unpairedNum = 0;
|
||||
int64_t taskSeq;
|
||||
vector<ReadEnds> pairArr;
|
||||
|
|
@ -41,28 +47,26 @@ struct UnpairedPosInfo
|
|||
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
||||
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
|
||||
|
||||
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
||||
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||||
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
||||
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||||
|
||||
/* 单线程处理冗余参数结构体 */
|
||||
struct SerailMarkDupArg
|
||||
{
|
||||
int64_t taskSeq;
|
||||
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||||
vector<BamWrap *> bams; // 存放待处理的bam read
|
||||
vector<ReadEnds> pairs;
|
||||
vector<ReadEnds> frags;
|
||||
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
||||
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
||||
set<int64_t> fragDupIdx; // frag的冗余read的索引
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
||||
struct SerailMarkDupArg {
|
||||
int64_t taskSeq; // 任务序号
|
||||
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||||
vector<BamWrap *> bams; // 存放待处理的bam read
|
||||
vector<ReadEnds> pairs; // 成对的reads
|
||||
vector<ReadEnds> frags; // 暂未找到配对的reads
|
||||
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
||||
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
||||
set<int64_t> fragDupIdx; // frag的冗余read的索引
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
||||
};
|
||||
|
||||
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
||||
struct GlobalDataArg
|
||||
{
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
struct GlobalDataArg {
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr;
|
||||
|
||||
// 每个task对应一个vector
|
||||
|
|
@ -75,869 +79,5 @@ struct GlobalDataArg
|
|||
vector<set<int64_t>> latterNotDupIdxArr;
|
||||
};
|
||||
|
||||
static GlobalDataArg gData;
|
||||
|
||||
|
||||
/* 查找 */
|
||||
// template<class Itr, class T>
|
||||
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
|
||||
// {
|
||||
// first = std::lower_bound(first, last, val);
|
||||
// return (first != last && *first == val) ? first : last;
|
||||
// }
|
||||
|
||||
/* 排序 */
|
||||
static inline void sortReadEndsArr(vector<ReadEnds> &arr)
|
||||
{
|
||||
size_t blockSize = 64 * 1024;
|
||||
blockSize = min(blockSize, arr.size());
|
||||
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
|
||||
size_t crossNum = 1024;
|
||||
size_t start, end, i, left, right;
|
||||
std::sort(arr.begin(), arr.begin() + blockSize);
|
||||
for (i = 1; i < blockNum; ++i)
|
||||
{
|
||||
start = i * blockSize;
|
||||
end = min(start + blockSize, arr.size());
|
||||
std::sort(arr.begin() + start, arr.begin() + end);
|
||||
left = crossNum;
|
||||
while (!(arr[start - left] < arr[start]))
|
||||
{
|
||||
left = left << 1;
|
||||
if (left >= blockSize)
|
||||
{
|
||||
std::sort(arr.begin(), arr.end()); // 退化到普通排序
|
||||
return;
|
||||
}
|
||||
}
|
||||
right = min(crossNum, end - start - 1);
|
||||
|
||||
while (!(arr[start - 1] < arr[start + right]))
|
||||
{
|
||||
right = min(right << 1, end - start - 1);
|
||||
if (right == end - start - 1)
|
||||
break;
|
||||
}
|
||||
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理一组pairend的readends,标记冗余 */
|
||||
static void markDupsForPairs(vector<const ReadEnds *> &vpRe,
|
||||
set<int64_t> *dupIdx,
|
||||
set<int64_t> *opticalDupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr)
|
||||
{
|
||||
if (vpRe.size() < 2)
|
||||
{
|
||||
if (vpRe.size() == 1)
|
||||
{
|
||||
// addSingletonToCount(libraryIdGenerator);
|
||||
}
|
||||
return;
|
||||
}
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) // 找分数最高的readend
|
||||
{
|
||||
if (pe->score > maxScore || pBest == nullptr)
|
||||
{
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (notDupIdx != nullptr)
|
||||
{
|
||||
notDupIdx->insert(pBest->read1IndexInFile);
|
||||
notDupIdx->insert(pBest->read2IndexInFile);
|
||||
}
|
||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
||||
{
|
||||
// trackOpticalDuplicates
|
||||
}
|
||||
for (auto pe : vpRe) // 对非best read标记冗余
|
||||
{
|
||||
if (pe != pBest) // 非best
|
||||
{
|
||||
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
||||
}
|
||||
}
|
||||
|
||||
// if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
||||
// {
|
||||
// addRepresentativeReadIndex(vpRe);
|
||||
// }
|
||||
}
|
||||
|
||||
/* 处理一组非paired的readends,标记冗余 */
|
||||
static void markDupsForFrags(vector<const ReadEnds *> &vpRe,
|
||||
bool containsPairs,
|
||||
set<int64_t> *dupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr)
|
||||
{
|
||||
if (containsPairs)
|
||||
{
|
||||
for (auto pe : vpRe)
|
||||
{
|
||||
if (!pe->IsPaired())
|
||||
{
|
||||
dupIdx->insert(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
for (auto pe : vpRe)
|
||||
{
|
||||
if (pe->score > maxScore || pBest == nullptr)
|
||||
{
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (notDupIdx != nullptr)
|
||||
{
|
||||
notDupIdx->insert(pBest->read1IndexInFile);
|
||||
}
|
||||
for (auto pe : vpRe)
|
||||
{
|
||||
if (pe != pBest)
|
||||
{
|
||||
dupIdx->insert(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 找到与readend pos相等的所有readend */
|
||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
|
||||
{
|
||||
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsLittleThan); // 只比对位点
|
||||
dst->insert(dst->end(), range.first, range.second);
|
||||
}
|
||||
|
||||
/* 单线程生成readends (第一步)*/
|
||||
static void generateReadEnds(SerailMarkDupArg *arg)
|
||||
{
|
||||
auto &p = *arg;
|
||||
auto &rnParser = g_vRnParser[0];
|
||||
|
||||
p.pairs.clear();
|
||||
p.frags.clear();
|
||||
p.unpairedDic.clear();
|
||||
p.unpairedPosArr.clear();
|
||||
|
||||
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
||||
set<ReadEnds> reSet;
|
||||
|
||||
ReadEnds lastRe;
|
||||
|
||||
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
|
||||
{
|
||||
BamWrap *bw = p.bams[i];
|
||||
const int64_t bamIdx = p.bamStartIdx + i;
|
||||
if (bw->GetReadUnmappedFlag())
|
||||
{
|
||||
if (bw->b->core.tid == -1)
|
||||
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
||||
break;
|
||||
}
|
||||
else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
||||
{
|
||||
ReadEnds fragEnd;
|
||||
tm_arr[8].acc_start();
|
||||
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
||||
tm_arr[8].acc_end();
|
||||
p.frags.push_back(fragEnd); // 添加进frag集合
|
||||
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
||||
{
|
||||
string key = bw->query_name();
|
||||
if (p.unpairedDic.find(key) == p.unpairedDic.end())
|
||||
{
|
||||
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
||||
}
|
||||
else // 找到了pairend
|
||||
{
|
||||
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
||||
modifyPairedEnds(fragEnd, &pairedEnds);
|
||||
p.pairs.push_back(pairedEnds);
|
||||
p.unpairedDic.erase(key); // 删除找到的pairend
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tm_arr[9].acc_start();
|
||||
sortReadEndsArr(p.frags);
|
||||
// sort(p.frags.begin(), p.frags.end());
|
||||
tm_arr[9].acc_end();
|
||||
// cout << "sort pairs" << endl;
|
||||
tm_arr[10].acc_start();
|
||||
sort(p.pairs.begin(), p.pairs.end());
|
||||
tm_arr[10].acc_end();
|
||||
// 记录位点上的未匹配的read个数
|
||||
for (auto &e : p.unpairedDic) {
|
||||
auto posKey = e.second.unpairedRE.posKey;
|
||||
auto &unpairArrInfo = p.unpairedPosArr[posKey];
|
||||
unpairArrInfo.unpairedNum++;
|
||||
unpairArrInfo.taskSeq = p.taskSeq;
|
||||
unpairArrInfo.readNameSet.insert(e.first);
|
||||
}
|
||||
cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << endl;
|
||||
}
|
||||
|
||||
/* 处理pairs */
|
||||
static void processPairs(vector<ReadEnds> &readEnds,
|
||||
set<int64_t> *dupIdx,
|
||||
set<int64_t> *opticalDupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr)
|
||||
{
|
||||
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds)
|
||||
{
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||
else
|
||||
{
|
||||
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
}
|
||||
}
|
||||
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
|
||||
}
|
||||
|
||||
/* 处理frags */
|
||||
static void processFrags(vector<ReadEnds> &readEnds,
|
||||
set<int64_t> *dupIdx,
|
||||
set<int64_t> *notDupIdx = nullptr)
|
||||
{
|
||||
bool containsPairs = false;
|
||||
bool containsFrags = false;
|
||||
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds)
|
||||
{
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
||||
{
|
||||
vpCache.push_back(&re);
|
||||
containsPairs = containsPairs || re.IsPaired();
|
||||
containsFrags = containsFrags || !re.IsPaired();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (vpCache.size() > 1 && containsFrags)
|
||||
{
|
||||
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
||||
}
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
containsPairs = re.IsPaired();
|
||||
containsFrags = !re.IsPaired();
|
||||
}
|
||||
}
|
||||
if (vpCache.size() > 1 && containsFrags)
|
||||
{
|
||||
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 单线程markdup (第二步)*/
|
||||
static void markdups(SerailMarkDupArg *arg)
|
||||
{
|
||||
auto &p = *arg;
|
||||
p.pairDupIdx.clear();
|
||||
p.pairOpticalDupIdx.clear();
|
||||
p.fragDupIdx.clear();
|
||||
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
||||
// 先处理 pair
|
||||
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
|
||||
|
||||
// 再处理frag
|
||||
processFrags(p.frags, &p.fragDupIdx);
|
||||
}
|
||||
|
||||
/* 获取交叉部分的数据 */
|
||||
static inline void getIntersectData(vector<ReadEnds> &leftArr,
|
||||
vector<ReadEnds> &rightArr,
|
||||
vector<ReadEnds> *dst,
|
||||
bool isPairCmp = false)
|
||||
{
|
||||
const size_t leftEndIdx = leftArr.size() - 1;
|
||||
const size_t rightStartIdx = 0;
|
||||
size_t leftSpan = 0;
|
||||
size_t rightSpan = 0;
|
||||
|
||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp))
|
||||
{
|
||||
leftSpan += 1;
|
||||
if (leftSpan > leftEndIdx)
|
||||
{
|
||||
leftSpan = leftArr.size() - 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp))
|
||||
{
|
||||
rightSpan += 1;
|
||||
if (rightSpan == rightArr.size() - 1)
|
||||
break;
|
||||
}
|
||||
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
||||
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
||||
std::sort(dst->begin(), dst->end());
|
||||
}
|
||||
|
||||
/* 将frags重叠部分的dup idx放进数据中 */
|
||||
static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
|
||||
set<int64_t> ¬DupIdx,
|
||||
SerailMarkDupArg * lastArg,
|
||||
SerailMarkDupArg *curArg)
|
||||
{
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
for (auto idx : dupIdx)
|
||||
{
|
||||
lp.fragDupIdx.insert(idx);
|
||||
p.fragDupIdx.erase(idx);
|
||||
}
|
||||
for (auto idx : notDupIdx)
|
||||
{
|
||||
lp.fragDupIdx.erase(idx);
|
||||
p.fragDupIdx.erase(idx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 将pairs重叠部分的dup idx放进数据中 */
|
||||
static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
|
||||
set<int64_t> &opticalDupIdx,
|
||||
set<int64_t> ¬DupIdx,
|
||||
SerailMarkDupArg *lastArg,
|
||||
SerailMarkDupArg *curArg)
|
||||
{
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
for (auto idx : dupIdx)
|
||||
{
|
||||
lp.pairDupIdx.insert(idx);
|
||||
p.pairDupIdx.erase(idx);
|
||||
}
|
||||
for (auto idx : opticalDupIdx)
|
||||
{
|
||||
lp.pairOpticalDupIdx.insert(idx);
|
||||
p.pairOpticalDupIdx.erase(idx);
|
||||
}
|
||||
for (auto idx : notDupIdx)
|
||||
{
|
||||
lp.pairDupIdx.erase(idx);
|
||||
lp.pairOpticalDupIdx.erase(idx);
|
||||
p.pairDupIdx.erase(idx);
|
||||
p.pairOpticalDupIdx.erase(idx);
|
||||
}
|
||||
}
|
||||
|
||||
// 用来分别处理dup和optical dup
|
||||
static void refeshTaskDupInfo(set<int64_t> &dupIdx,
|
||||
set<int64_t> &opticalDupIdx,
|
||||
set<int64_t> ¬DupIdx,
|
||||
set<int64_t> &latterDupIdx,
|
||||
set<int64_t> &latterOpticalDupIdx,
|
||||
set<int64_t> &latterNotDupIdx)
|
||||
{
|
||||
for (auto idx : dupIdx)
|
||||
latterDupIdx.insert(idx);
|
||||
for (auto idx : opticalDupIdx)
|
||||
latterOpticalDupIdx.insert(idx);
|
||||
for (auto idx : notDupIdx)
|
||||
latterNotDupIdx.insert(idx);
|
||||
}
|
||||
|
||||
/* 最后合并数据并排序 */
|
||||
static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx,
|
||||
set<int64_t> ¬DupIdx,
|
||||
vector<int64_t> &dupArr)
|
||||
{
|
||||
vector<int64_t> midArr;
|
||||
|
||||
auto ai = dupArr.begin();
|
||||
auto bi = dupIdx.begin();
|
||||
auto ae = dupArr.end();
|
||||
auto be = dupIdx.end();
|
||||
|
||||
int64_t val = 0;
|
||||
while (ai != ae && bi != be)
|
||||
{
|
||||
if (*ai < *bi)
|
||||
{
|
||||
val = *ai++;
|
||||
}
|
||||
else if (*bi < *ai)
|
||||
{
|
||||
val = *bi++;
|
||||
}
|
||||
else
|
||||
{
|
||||
val = *ai++;
|
||||
bi++;
|
||||
}
|
||||
if (notDupIdx.find(val) == notDupIdx.end())
|
||||
{
|
||||
midArr.push_back(val);
|
||||
}
|
||||
}
|
||||
while (ai != ae)
|
||||
{
|
||||
val = *ai++;
|
||||
if (notDupIdx.find(val) == notDupIdx.end())
|
||||
{
|
||||
midArr.push_back(val);
|
||||
}
|
||||
}
|
||||
while (bi != be)
|
||||
{
|
||||
val = *bi++;
|
||||
if (notDupIdx.find(val) == notDupIdx.end())
|
||||
{
|
||||
midArr.push_back(val);
|
||||
}
|
||||
}
|
||||
dupArr = midArr;
|
||||
}
|
||||
|
||||
/* 处理相邻的两个任务,有相交叉的数据 */
|
||||
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg)
|
||||
{
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
auto &g = *gDataArg;
|
||||
|
||||
vector<ReadEnds> reArr;
|
||||
set<int64_t> dupIdx;
|
||||
set<int64_t> notDupIdx;
|
||||
// 先处理重叠的frags
|
||||
getIntersectData(lp.frags, p.frags, &reArr);
|
||||
processFrags(reArr, &dupIdx, ¬DupIdx);
|
||||
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
|
||||
|
||||
// 再处理重叠的pairs
|
||||
reArr.clear();
|
||||
dupIdx.clear();
|
||||
notDupIdx.clear();
|
||||
set<int64_t> opticalDupIdx;
|
||||
getIntersectData(lp.pairs, p.pairs, &reArr, true);
|
||||
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
||||
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
||||
|
||||
// 处理之前未匹配的部分
|
||||
map<CalcKey, int64_t> recalcPos;
|
||||
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
|
||||
set<int64_t> addToGlobal;
|
||||
int64_t prevLastPos = 0, nextFirstPos = 0;
|
||||
if (lp.frags.size() > 0)
|
||||
prevLastPos = lp.frags.back().posKey;
|
||||
if (p.frags.size() > 0)
|
||||
nextFirstPos = p.frags[0].posKey;
|
||||
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
|
||||
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
|
||||
{
|
||||
auto &readName = prevUnpair.first;
|
||||
auto &prevPosInfo = prevUnpair.second;
|
||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||
|
||||
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) // 在当前这个任务里找到了这个未匹配的read
|
||||
{
|
||||
auto &nextPosInfo = p.unpairedDic[readName];
|
||||
auto &nextFragEnd = nextPosInfo.unpairedRE;
|
||||
int64_t prevPosKey = prevFragEnd.posKey;
|
||||
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
|
||||
CalcKey ck = {prevPosKey, nextPosKey};
|
||||
UnpairedPosInfo *prevUnpairInfoP = nullptr;
|
||||
UnpairedPosInfo *nextUnpairInfoP = nullptr;
|
||||
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
|
||||
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
|
||||
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
|
||||
|
||||
// pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定
|
||||
// 1. prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
||||
// 2. prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算
|
||||
// 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||
// 3. prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算
|
||||
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
||||
// 4. prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算
|
||||
|
||||
bool addDataToPos = true;
|
||||
if (alreadyAdd.find(ck) != alreadyAdd.end())
|
||||
{
|
||||
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
||||
}
|
||||
else
|
||||
alreadyAdd.insert(ck);
|
||||
|
||||
if (prevPosKey < nextFirstPos) // prevpos在交叉部分之前
|
||||
{
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (nextPosKey <= prevLastPos && addDataToPos) // 第二种情况
|
||||
{
|
||||
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||
}
|
||||
// 第一种情况,第二种情况下都会出现,复杂情况一
|
||||
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
|
||||
if (gPosInfo != g.unpairedPosArr.end()) // 可能g和p有匹配的,刚好和该位点一致
|
||||
{
|
||||
auto &gUnpairInfo = gPosInfo->second;
|
||||
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
|
||||
if (pPosInfo != p.unpairedPosArr.end())
|
||||
{
|
||||
auto &pUnpairInfo = pPosInfo->second;
|
||||
for (auto &rn : gUnpairInfo.readNameSet) // 遍历每一个readname,看是否有匹配的
|
||||
{
|
||||
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end())
|
||||
{
|
||||
auto pe = g.unpairedDic[rn].unpairedRE;
|
||||
auto fe = p.unpairedDic[rn].unpairedRE;
|
||||
modifyPairedEnds(fe, &pe);
|
||||
prevPairArr.push_back(pe);
|
||||
g.unpairedDic.erase(rn);
|
||||
p.unpairedDic.erase(rn);
|
||||
// cout << "找到了!" << rn << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
}
|
||||
else // prevpos在交叉部分
|
||||
{
|
||||
if (nextPosKey > prevLastPos) // nextpos在交叉部分之后
|
||||
{ // 第三种情况
|
||||
if (nextUnpairInfoP != nullptr) // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
||||
{
|
||||
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
||||
nextPairArr.push_back(prevFragEnd);
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (addDataToPos)
|
||||
{
|
||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||
}
|
||||
recalcPos[ck] = nextPosInfo.taskSeq; // 将数据放到next task里, (这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
}
|
||||
else // next task在该位点没有unpair,那就把数据放到prev task里
|
||||
{
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (addDataToPos) // 第二种情况
|
||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
}
|
||||
}
|
||||
else
|
||||
{ // 第四种情况
|
||||
if (prevUnpairInfoP == nullptr) {
|
||||
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||
prevUnpairInfoP->taskSeq = lp.taskSeq;
|
||||
}
|
||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||
prevPairArr.push_back(prevFragEnd);
|
||||
if (addDataToPos)
|
||||
{
|
||||
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||
}
|
||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||
}
|
||||
}
|
||||
p.unpairedDic.erase(readName); // 在next task里删除该read
|
||||
}
|
||||
else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
|
||||
{
|
||||
auto &remainPosInfo = g.unpairedDic[readName];
|
||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||
int64_t remainPosKey = remainFragEnd.posKey;
|
||||
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||
auto &remainPairArr = remainUnpairInfo.pairArr;
|
||||
remainPairArr.push_back(remainFragEnd);
|
||||
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
|
||||
recalcPos[ck] = remainPosInfo.taskSeq;
|
||||
std::sort(remainPairArr.begin(), remainPairArr.end());
|
||||
|
||||
g.unpairedDic.erase(readName);
|
||||
}
|
||||
else // 都没找到,那就保存到遗留数据里
|
||||
{
|
||||
int64_t prevPosKey = prevFragEnd.posKey;
|
||||
g.unpairedDic.insert(prevUnpair);
|
||||
addToGlobal.insert(prevPosKey);
|
||||
}
|
||||
}
|
||||
for (auto posKey : addToGlobal) // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||
g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
set<int64_t> posProcessed;
|
||||
for (auto &e : recalcPos)
|
||||
{
|
||||
auto posKey = e.first.read1Pos;
|
||||
if (posProcessed.find(posKey) != posProcessed.end())
|
||||
continue;
|
||||
posProcessed.insert(posKey);
|
||||
auto taskSeq = e.second;
|
||||
auto &t = taskChanged[taskSeq];
|
||||
// 在对应的任务包含的dup idx里修改结果数据
|
||||
vector<ReadEnds> *pairArrP = nullptr;
|
||||
if (taskSeq < lp.taskSeq)
|
||||
pairArrP = &g.unpairedPosArr[posKey].pairArr;
|
||||
else
|
||||
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
|
||||
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||
if (taskSeq < lp.taskSeq)
|
||||
g.unpairedPosArr.erase(posKey);
|
||||
}
|
||||
// 更新结果
|
||||
|
||||
for (auto &e : taskChanged)
|
||||
{
|
||||
auto taskSeq = e.first;
|
||||
auto &t = e.second;
|
||||
if (taskSeq < lp.taskSeq)
|
||||
{
|
||||
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
|
||||
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||
}
|
||||
else if (taskSeq == lp.taskSeq)
|
||||
{
|
||||
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
|
||||
}
|
||||
else
|
||||
{
|
||||
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
|
||||
}
|
||||
}
|
||||
|
||||
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||
// cout << "calc g time: " << t.seconds_elapsed() << " s" << endl;
|
||||
// 将dupidx放进全局数据
|
||||
g.latterDupIdxArr.push_back(set<int64_t>());
|
||||
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
|
||||
g.latterNotDupIdxArr.push_back(set<int64_t>());
|
||||
|
||||
g.dupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vIdx = g.dupIdxArr.back();
|
||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
||||
|
||||
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
||||
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
||||
}
|
||||
|
||||
/* 当所有任务结束后,global data里还有未处理的数据 */
|
||||
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg)
|
||||
{
|
||||
auto &lp = *task;
|
||||
auto &g = *gDataArg;
|
||||
// 遗留的未匹配的pair
|
||||
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
|
||||
{
|
||||
auto &readName = prevUnpair.first;
|
||||
auto &prevPosInfo = prevUnpair.second;
|
||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||
|
||||
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
|
||||
{
|
||||
auto &remainPosInfo = g.unpairedDic[readName];
|
||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||
int64_t remainPosKey = remainFragEnd.posKey;
|
||||
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||
|
||||
remainUnpairInfo.pairArr.push_back(remainFragEnd);
|
||||
g.unpairedDic.erase(readName);
|
||||
}
|
||||
}
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
for (auto &e : g.unpairedPosArr)
|
||||
{
|
||||
auto posKey = e.first;
|
||||
auto taskSeq = e.second.taskSeq;
|
||||
auto &t = taskChanged[taskSeq];
|
||||
auto &arr = g.unpairedPosArr[posKey].pairArr;
|
||||
|
||||
if (arr.size() > 1)
|
||||
{
|
||||
std::sort(arr.begin(), arr.end());
|
||||
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||
}
|
||||
}
|
||||
// 更新结果
|
||||
vector<int64_t> addDup;
|
||||
map<int64_t, int64_t> ndPosVal;
|
||||
for (auto &e : taskChanged)
|
||||
{
|
||||
auto taskSeq = e.first;
|
||||
auto &t = e.second;
|
||||
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
|
||||
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||
}
|
||||
|
||||
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||
g.unpairedPosArr.clear();
|
||||
g.unpairedDic.clear();
|
||||
|
||||
// 将dupidx放进全局数据
|
||||
for (int i = 0; i < g.dupIdxArr.size() - 1; ++i)
|
||||
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
|
||||
for (int i = 0; i < g.opticalDupIdxArr.size() - 1; ++i)
|
||||
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
|
||||
|
||||
g.dupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vIdx = g.dupIdxArr.back();
|
||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
||||
|
||||
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
||||
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
||||
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
||||
}
|
||||
|
||||
/* 串行处理数据,标记冗余 */
|
||||
static void serialMarkDups()
|
||||
{
|
||||
tm_arr[5].acc_start();
|
||||
Timer::log_time("serial start");
|
||||
// 读取缓存初始化
|
||||
BamBufType inBamBuf(g_gArg.use_asyncio);
|
||||
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||
// BamBufType inBamBuf(false);
|
||||
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
|
||||
int64_t processedBamNum = 0;
|
||||
|
||||
SerailMarkDupArg smdArg1, smdArg2;
|
||||
SerailMarkDupArg *lastArgP = &smdArg1;
|
||||
SerailMarkDupArg *curArgP = &smdArg2;
|
||||
|
||||
bool isFirstRound = true;
|
||||
int roundNum = 0;
|
||||
while (inBamBuf.ReadStat() >= 0)
|
||||
{
|
||||
Timer t_round;
|
||||
// 读取bam文件中的read
|
||||
tm_arr[4].acc_start();
|
||||
size_t readNum = inBamBuf.ReadBam();
|
||||
tm_arr[4].acc_end();
|
||||
cout << "read num: " << readNum << endl;
|
||||
// lastArgP = curArgP;
|
||||
tm_arr[6].acc_start();
|
||||
curArgP->taskSeq = roundNum;
|
||||
curArgP->bamStartIdx = processedBamNum;
|
||||
curArgP->bams = inBamBuf.GetBamArr();
|
||||
tm_arr[6].acc_end();
|
||||
|
||||
tm_arr[0].acc_start();
|
||||
Timer t1;
|
||||
generateReadEnds(curArgP);
|
||||
//cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
|
||||
tm_arr[0].acc_end();
|
||||
|
||||
tm_arr[1].acc_start();
|
||||
t1.reinit();
|
||||
markdups(curArgP);
|
||||
//cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
||||
tm_arr[1].acc_end();
|
||||
|
||||
if (!isFirstRound)
|
||||
{
|
||||
tm_arr[2].acc_start();
|
||||
t1.reinit();
|
||||
handleIntersectData(lastArgP, curArgP, &gData);
|
||||
//cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
|
||||
// addTaskIdxToSet(lastArgP, &gData);
|
||||
tm_arr[2].acc_end();
|
||||
}
|
||||
else
|
||||
{
|
||||
isFirstRound = false;
|
||||
}
|
||||
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
||||
processedBamNum += readNum;
|
||||
|
||||
// 交换
|
||||
auto tmp = lastArgP;
|
||||
lastArgP = curArgP;
|
||||
curArgP = tmp;
|
||||
cout << "round time: " << t_round.seconds_elapsed() << endl;
|
||||
roundNum++;
|
||||
if (roundNum > 9){
|
||||
// break;
|
||||
}
|
||||
}
|
||||
// cout << "here" << endl;
|
||||
tm_arr[3].acc_start();
|
||||
// 处理剩下的全局数据
|
||||
handleLastTask(lastArgP, &gData);
|
||||
// cout << "here 2" << endl;
|
||||
tm_arr[3].acc_end();
|
||||
|
||||
tm_arr[5].acc_end();
|
||||
// 统计所有冗余index数量
|
||||
int64_t dupNum = 0;
|
||||
set<int64_t> dup;
|
||||
|
||||
// int taskSeq = 0;
|
||||
// for (auto &arr : gData.dupIdxArr)
|
||||
// {
|
||||
// for (auto idx : arr) {
|
||||
// if (dup.find(idx) != dup.end())
|
||||
// {
|
||||
// cout << "dup index: " << taskSeq << '\t' << idx << endl;
|
||||
// }
|
||||
// dup.insert(idx);
|
||||
// }
|
||||
// taskSeq++;
|
||||
// }
|
||||
// #include <fstream>
|
||||
// ofstream out("tumor_dup.txt");
|
||||
// for (auto idx : dup)
|
||||
// {
|
||||
// out << idx << endl;
|
||||
// }
|
||||
// out.close();
|
||||
|
||||
for (auto &arr : gData.dupIdxArr)
|
||||
dupNum += arr.size();
|
||||
|
||||
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
|
||||
|
||||
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
||||
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
|
||||
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
|
||||
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
|
||||
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
||||
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
||||
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
||||
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||
|
||||
Timer::log_time("serial end ");
|
||||
|
||||
//for (auto i : gData.dupArr)
|
||||
// cout << i << endl;
|
||||
}
|
||||
// 串行运行mark duplicate
|
||||
void serialMarkDups();
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
#pragma once
|
||||
|
||||
#include <common/utils/timer.h>
|
||||
#include <common/utils/util.h>
|
||||
#include <htslib/sam.h>
|
||||
#include <htslib/thread_pool.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
#include <sam/utils/read_name_parser.h>
|
||||
|
||||
extern Timer tm_arr[20]; // 用来测试性能
|
||||
/* 全局本地变量 */
|
||||
extern vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||
extern samFile *g_inBamFp; // 输入的bam文件
|
||||
extern sam_hdr_t *g_inBamHeader; // 输入的bam文件头信息
|
||||
extern samFile *g_outBamFp; // 输出文件, sam或者bam格式
|
||||
extern sam_hdr_t *g_outBamHeader; // 输出文件的header
|
||||
|
||||
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
||||
class GlobalArg;
|
||||
extern GlobalArg &g_gArg;
|
||||
class MarkDupsArg;
|
||||
extern MarkDupsArg &g_mdArg;
|
||||
class GlobalDataArg;
|
||||
extern GlobalDataArg &gData;
|
||||
class DuplicationMetrics;
|
||||
extern DuplicationMetrics &gMetrics;
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
Description: read ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等
|
||||
Description: read
|
||||
ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
|
|
@ -13,18 +14,20 @@ Date : 2023/11/3
|
|||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* Small interface that provides access to the physical location information about a cluster.
|
||||
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
|
||||
* non-zero positive integers, x and y coordinates may be negative.
|
||||
* Small interface that provides access to the physical location information
|
||||
* about a cluster. All values should be defaulted to -1 if unavailable.
|
||||
* ReadGroup and Tile should only allow non-zero positive integers, x and y
|
||||
* coordinates may be negative.
|
||||
*/
|
||||
struct PhysicalLocation
|
||||
{
|
||||
struct PhysicalLocation {
|
||||
static const int NO_VALUE = -1;
|
||||
/**
|
||||
* Small class that provides access to the physical location information about a cluster.
|
||||
* All values should be defaulted to -1 if unavailable. Tile should only allow
|
||||
* non-zero positive integers, x and y coordinates must be non-negative.
|
||||
* This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
|
||||
* thus, they do not overflow within a HiSeqX tile.
|
||||
* Small class that provides access to the physical location information
|
||||
* about a cluster. All values should be defaulted to -1 if unavailable.
|
||||
* Tile should only allow non-zero positive integers, x and y coordinates
|
||||
* must be non-negative. This is different from PhysicalLocationShort in
|
||||
* that the x and y positions are ints, not shorts thus, they do not
|
||||
* overflow within a HiSeqX tile.
|
||||
*/
|
||||
int16_t tile = -1;
|
||||
int32_t x = -1;
|
||||
|
|
@ -32,28 +35,33 @@ struct PhysicalLocation
|
|||
};
|
||||
|
||||
/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/
|
||||
struct ReadEnds : PhysicalLocation
|
||||
{
|
||||
struct ReadEnds : PhysicalLocation {
|
||||
static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
|
||||
/* 保留一些bam记录中的数据 */
|
||||
bool read1FirstOfPair = true;
|
||||
/* ReadEnds中的成员变量 */
|
||||
/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
|
||||
/** Little struct-like class to hold read pair (and fragment) end data for
|
||||
* duplicate marking. */
|
||||
// int16_t libraryId; // 没用,不考虑多样本
|
||||
int8_t orientation = -1;
|
||||
int32_t read1ReferenceIndex = -1;
|
||||
int32_t read1Coordinate = -1;
|
||||
int32_t read2ReferenceIndex = -1;
|
||||
int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
|
||||
// This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
|
||||
int32_t read2Coordinate = -1;
|
||||
/* Additional information used to detect optical dupes */
|
||||
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read group,normal或者tumor
|
||||
/** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
|
||||
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read
|
||||
// group,normal或者tumor
|
||||
/** For optical duplicate detection the orientation matters regard to 1st or
|
||||
* 2nd end of a mate */
|
||||
int8_t orientationForOpticalDuplicates = -1;
|
||||
/** A *transient* flag marking this read end as being an optical duplicate. */
|
||||
/** A *transient* flag marking this read end as being an optical duplicate.
|
||||
*/
|
||||
bool isOpticalDuplicate = false;
|
||||
|
||||
/* ReadEndsForMarkDuplicates中的成员变量 */
|
||||
/** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
|
||||
/** Little struct-like class to hold read pair (and fragment) end data for
|
||||
* MarkDuplicatesWithMateCigar **/
|
||||
int16_t score = 0;
|
||||
int64_t read1IndexInFile = -1;
|
||||
int64_t read2IndexInFile = -1;
|
||||
|
|
@ -62,23 +70,22 @@ struct ReadEnds : PhysicalLocation
|
|||
/* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */
|
||||
// int32_t barcode = 0; // primary barcode for this read (and pair)
|
||||
// int32_t readOneBarcode = 0; // read one barcode, 0 if not present
|
||||
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
|
||||
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not
|
||||
// paired
|
||||
|
||||
/* zzh增加的成员变量 */
|
||||
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
||||
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
|
||||
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
||||
|
||||
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
||||
static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand)
|
||||
{
|
||||
if (read1NegativeStrand)
|
||||
{
|
||||
static int8_t GetOrientationByte(bool read1NegativeStrand,
|
||||
bool read2NegativeStrand) {
|
||||
if (read1NegativeStrand) {
|
||||
if (read2NegativeStrand)
|
||||
return RR;
|
||||
else
|
||||
return RF;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
if (read2NegativeStrand)
|
||||
return FR;
|
||||
else
|
||||
|
|
@ -87,47 +94,38 @@ struct ReadEnds : PhysicalLocation
|
|||
}
|
||||
|
||||
/* 比较两个readends是否一样(有个冗余) */
|
||||
static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2)
|
||||
{
|
||||
static bool AreComparableForDuplicates(const ReadEnds &lhs,
|
||||
const ReadEnds &rhs,
|
||||
bool compareRead2) {
|
||||
bool areComparable = true;
|
||||
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
|
||||
lhs.read1Coordinate == rhs.read1Coordinate &&
|
||||
lhs.orientation == rhs.orientation;
|
||||
if (areComparable && compareRead2)
|
||||
{
|
||||
areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
|
||||
lhs.read2Coordinate == rhs.read2Coordinate;
|
||||
if (areComparable && compareRead2) {
|
||||
areComparable =
|
||||
lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
|
||||
lhs.read2Coordinate == rhs.read2Coordinate;
|
||||
}
|
||||
return areComparable;
|
||||
}
|
||||
|
||||
/* 比对方向是否正向 */
|
||||
bool IsPositiveStrand() const
|
||||
{
|
||||
return orientation == F;
|
||||
}
|
||||
bool IsPositiveStrand() const { return orientation == F; }
|
||||
|
||||
/* pairend是否合适的比对上了 */
|
||||
bool IsPaired() const
|
||||
{
|
||||
return read2ReferenceIndex != -1;
|
||||
}
|
||||
bool IsPaired() const { return read2ReferenceIndex != -1; }
|
||||
|
||||
bool IsNegativeStrand() const
|
||||
{
|
||||
return orientation == R;
|
||||
}
|
||||
bool IsNegativeStrand() const { return orientation == R; }
|
||||
|
||||
// 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来
|
||||
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false)
|
||||
{
|
||||
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b,
|
||||
bool compareRead2 = false) {
|
||||
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||
if (comp == 0)
|
||||
comp = a.read1Coordinate - b.read1Coordinate;
|
||||
if (comp == 0)
|
||||
comp = a.orientation - b.orientation;
|
||||
if (compareRead2)
|
||||
{
|
||||
if (compareRead2) {
|
||||
if (comp == 0)
|
||||
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||
if (comp == 0)
|
||||
|
|
@ -137,14 +135,12 @@ struct ReadEnds : PhysicalLocation
|
|||
}
|
||||
|
||||
// 找某一个位置的所有readend时需要
|
||||
static bool pairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs)
|
||||
{
|
||||
static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) {
|
||||
return ReadLittleThan(lhs, rhs, true);
|
||||
}
|
||||
|
||||
// 比较函数
|
||||
bool operator < (const ReadEnds &o) const
|
||||
{
|
||||
bool operator<(const ReadEnds &o) const {
|
||||
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
||||
if (comp == 0)
|
||||
comp = read1Coordinate - o.read1Coordinate;
|
||||
|
|
|
|||
|
|
@ -9,11 +9,12 @@ Date : 2023/11/6
|
|||
#ifndef READ_NAME_PARSER_H_
|
||||
#define READ_NAME_PARSER_H_
|
||||
|
||||
#include "read_ends.h"
|
||||
#include <common/utils/util.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "read_ends.h"
|
||||
// #include <regex>
|
||||
#include <boost/regex.hpp>
|
||||
|
||||
|
|
@ -24,103 +25,107 @@ using std::string;
|
|||
|
||||
/**
|
||||
* Provides access to the physical location information about a cluster.
|
||||
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
|
||||
* non-zero positive integers, x and y coordinates may be negative.
|
||||
* 非线程安全
|
||||
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile
|
||||
* should only allow non-zero positive integers, x and y coordinates may be
|
||||
* negative. 非线程安全
|
||||
*/
|
||||
struct ReadNameParser
|
||||
{
|
||||
struct ReadNameParser {
|
||||
/**
|
||||
* The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
|
||||
* and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
|
||||
* The read name regular expression (regex) is used to extract three pieces
|
||||
* of information from the read name: tile, x location, and y location. Any
|
||||
* read name regex should parse the read name to produce these and only
|
||||
* these values. An example regex is:
|
||||
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
||||
* which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
|
||||
* ignoring any trailing non-digit characters.
|
||||
* which assumes that fields in the read name are delimited by ':' and the
|
||||
* last three fields correspond to the tile, x and y locations, ignoring any
|
||||
* trailing non-digit characters.
|
||||
*
|
||||
* The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
|
||||
* three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
|
||||
* where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
|
||||
* Illumina technology.
|
||||
* The default regex is optimized for fast parsing (see {@link
|
||||
* #getLastThreeFields(String, char, int[])}) by searching for the last
|
||||
* three fields, ignoring any trailing non-digit characters, assuming the
|
||||
* delimiter ':'. This should consider correctly read names where we have 5
|
||||
* or 7 field with the last three fields being tile/x/y, as is the case for
|
||||
* the majority of read names produced by Illumina technology.
|
||||
*/
|
||||
const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
||||
const string DEFAULT_READ_NAME_REGEX =
|
||||
"(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
||||
|
||||
string readNameStored = "";
|
||||
PhysicalLocation physicalLocationStored;
|
||||
int tmpLocationFields[3]; // for optimization of addLocationInformation
|
||||
bool useOptimizedDefaultParsing = true; // was the regex default?
|
||||
int tmpLocationFields[3]; // for optimization of addLocationInformation
|
||||
bool useOptimizedDefaultParsing = true; // was the regex default?
|
||||
string readNameRegex = DEFAULT_READ_NAME_REGEX;
|
||||
regex readNamePattern;
|
||||
bool warnedAboutRegexNotMatching = true;
|
||||
|
||||
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
||||
ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
|
||||
ReadNameParser(const string &strReadNameRegex, bool isWarn)
|
||||
{
|
||||
ReadNameParser(const string &strReadNameRegex)
|
||||
: ReadNameParser(strReadNameRegex, true) {}
|
||||
ReadNameParser(const string &strReadNameRegex, bool isWarn) {
|
||||
readNameRegex = strReadNameRegex;
|
||||
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||||
useOptimizedDefaultParsing = true;
|
||||
else
|
||||
useOptimizedDefaultParsing = false;
|
||||
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||
readNamePattern =
|
||||
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||
warnedAboutRegexNotMatching = isWarn;
|
||||
}
|
||||
|
||||
/* 重新设置readNameRegex */
|
||||
void SetReadNameRegex(const string &strReadNameRegex)
|
||||
{
|
||||
void SetReadNameRegex(const string &strReadNameRegex) {
|
||||
readNameRegex = strReadNameRegex;
|
||||
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||||
useOptimizedDefaultParsing = true;
|
||||
else
|
||||
useOptimizedDefaultParsing = false;
|
||||
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||
readNamePattern =
|
||||
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||
// readNamePattern = strReadNameRegex;
|
||||
}
|
||||
|
||||
/* 添加测序时候的tile x y 信息 */
|
||||
bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
|
||||
{
|
||||
if (!(readName == readNameStored))
|
||||
{
|
||||
if (ReadLocationInformation(readName, loc))
|
||||
{
|
||||
bool AddLocationInformation(const string &readName, PhysicalLocation *loc) {
|
||||
if (!(readName == readNameStored)) {
|
||||
if (ReadLocationInformation(readName, loc)) {
|
||||
readNameStored = readName;
|
||||
physicalLocationStored = *loc;
|
||||
return true;
|
||||
}
|
||||
// return false if read name cannot be parsed
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
*loc = physicalLocationStored;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
|
||||
* can be used later to determine optical duplication
|
||||
* Method used to extract tile/x/y from the read name and add it to the
|
||||
* PhysicalLocationShort so that it can be used later to determine optical
|
||||
* duplication
|
||||
*
|
||||
* @param readName the name of the read/cluster
|
||||
* @param loc the object to add tile/x/y to
|
||||
* @return true if the read name contained the information in parsable form, false otherwise
|
||||
* @return true if the read name contained the information in parsable form,
|
||||
* false otherwise
|
||||
*/
|
||||
bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
|
||||
{
|
||||
bool ReadLocationInformation(const string &readName,
|
||||
PhysicalLocation *loc) {
|
||||
try {
|
||||
// Optimized version if using the default read name regex (== used on purpose):
|
||||
if (useOptimizedDefaultParsing)
|
||||
{
|
||||
// Optimized version if using the default read name regex (== used
|
||||
// on purpose):
|
||||
if (useOptimizedDefaultParsing) {
|
||||
const int fields = getLastThreeFields(readName, ':');
|
||||
if (!(fields == 5 || fields == 7))
|
||||
{
|
||||
if (warnedAboutRegexNotMatching)
|
||||
{
|
||||
if (!(fields == 5 || fields == 7)) {
|
||||
if (warnedAboutRegexNotMatching) {
|
||||
Warn(
|
||||
"Default READ_NAME_REGEX '%s' did not match read name '%s'."
|
||||
"You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
|
||||
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
||||
"Default READ_NAME_REGEX '%s' did not match read "
|
||||
"name '%s'."
|
||||
"You may need to specify a READ_NAME_REGEX in "
|
||||
"order to correctly identify optical duplicates. "
|
||||
"Note that this message will not be emitted again "
|
||||
"even if other read names do not match the regex.",
|
||||
readNameRegex.c_str(), readName.c_str());
|
||||
warnedAboutRegexNotMatching = false;
|
||||
}
|
||||
|
|
@ -130,13 +135,9 @@ struct ReadNameParser
|
|||
loc->x = tmpLocationFields[1];
|
||||
loc->y = tmpLocationFields[2];
|
||||
return true;
|
||||
}
|
||||
else if (readNameRegex.empty())
|
||||
{
|
||||
} else if (readNameRegex.empty()) {
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
// Standard version that will use the regex
|
||||
cmatch m;
|
||||
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
||||
|
|
@ -144,28 +145,25 @@ struct ReadNameParser
|
|||
loc->x = std::stoi(m[2].str());
|
||||
loc->y = std::stoi(m[3].str());
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (warnedAboutRegexNotMatching)
|
||||
{
|
||||
} else {
|
||||
if (warnedAboutRegexNotMatching) {
|
||||
Warn(
|
||||
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
||||
"Your regex may not be correct. "
|
||||
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
||||
"Note that this message will not be emitted again "
|
||||
"even if other read names do not match the regex.",
|
||||
readNameRegex.c_str(), readName.c_str());
|
||||
warnedAboutRegexNotMatching = false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (const std::runtime_error &e)
|
||||
{
|
||||
if (warnedAboutRegexNotMatching)
|
||||
{
|
||||
} catch (const std::runtime_error &e) {
|
||||
if (warnedAboutRegexNotMatching) {
|
||||
Warn(
|
||||
"A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
|
||||
"A field parsed out of a read name was expected to contain "
|
||||
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
||||
"%s; Error Msg: %s",
|
||||
readNameRegex.c_str(), readName.c_str(), e.what());
|
||||
warnedAboutRegexNotMatching = false;
|
||||
}
|
||||
|
|
@ -175,39 +173,39 @@ struct ReadNameParser
|
|||
}
|
||||
|
||||
/**
|
||||
* Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
|
||||
* considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
|
||||
* Given a string, splits the string by the delimiter, and returns the the
|
||||
* last three fields parsed as integers. Parsing a field considers only a
|
||||
* sequence of digits up until the first non-digit character. The three
|
||||
* values are stored in the passed-in array.
|
||||
*
|
||||
* @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
|
||||
* @throws NumberFormatException if any of the tokens that should contain
|
||||
* numbers do not start with parsable numbers
|
||||
*/
|
||||
int getLastThreeFields(const string &readName, char delim)
|
||||
{
|
||||
int tokensIdx = 2; // start at the last token
|
||||
int getLastThreeFields(const string &readName, char delim) {
|
||||
int tokensIdx = 2; // start at the last token
|
||||
int numFields = 0;
|
||||
int i, endIdx;
|
||||
endIdx = readName.size();
|
||||
// find the last three tokens only
|
||||
for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
|
||||
{
|
||||
if (readName.at(i) == delim || 0 == i)
|
||||
{
|
||||
for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) {
|
||||
if (readName.at(i) == delim || 0 == i) {
|
||||
numFields++;
|
||||
const int startIdx = (0 == i) ? 0 : (i + 1);
|
||||
tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||||
tmpLocationFields[tokensIdx] =
|
||||
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||||
tokensIdx--;
|
||||
endIdx = i;
|
||||
}
|
||||
}
|
||||
// continue to find the # of fields
|
||||
while (0 <= i)
|
||||
{
|
||||
while (0 <= i) {
|
||||
if (readName.at(i) == delim || 0 == i)
|
||||
numFields++;
|
||||
i--;
|
||||
}
|
||||
if (numFields < 3)
|
||||
{
|
||||
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
|
||||
if (numFields < 3) {
|
||||
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] =
|
||||
-1;
|
||||
numFields = -1;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue