并行处理框架搭建完成,基本完成了工作线程处理逻辑
This commit is contained in:
parent
a3a0b64ef2
commit
97d35a42e9
|
|
@ -13,7 +13,7 @@
|
||||||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||||
"args": [
|
"args": [
|
||||||
"MarkDuplicates",
|
"MarkDuplicates",
|
||||||
"--INPUT", "test.bam",
|
"--INPUT", "/mnt/d/data/100w.bam",
|
||||||
"--OUTPUT", "out.bam",
|
"--OUTPUT", "out.bam",
|
||||||
"--METRICS_FILE", "metrics.txt",
|
"--METRICS_FILE", "metrics.txt",
|
||||||
"--num_threads", "12",
|
"--num_threads", "12",
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,90 @@
|
||||||
"vector": "cpp",
|
"vector": "cpp",
|
||||||
"random": "cpp",
|
"random": "cpp",
|
||||||
"ostream": "cpp",
|
"ostream": "cpp",
|
||||||
"*.tcc": "cpp"
|
"*.tcc": "cpp",
|
||||||
|
"new": "cpp",
|
||||||
|
"iostream": "cpp",
|
||||||
|
"mutex": "cpp",
|
||||||
|
"shared_mutex": "cpp",
|
||||||
|
"syncstream": "cpp",
|
||||||
|
"condition_variable": "cpp",
|
||||||
|
"future": "cpp",
|
||||||
|
"*.ipp": "cpp",
|
||||||
|
"cctype": "cpp",
|
||||||
|
"clocale": "cpp",
|
||||||
|
"cmath": "cpp",
|
||||||
|
"csignal": "cpp",
|
||||||
|
"cstdarg": "cpp",
|
||||||
|
"cstddef": "cpp",
|
||||||
|
"cstdio": "cpp",
|
||||||
|
"cstdlib": "cpp",
|
||||||
|
"ctime": "cpp",
|
||||||
|
"cwchar": "cpp",
|
||||||
|
"cwctype": "cpp",
|
||||||
|
"any": "cpp",
|
||||||
|
"array": "cpp",
|
||||||
|
"atomic": "cpp",
|
||||||
|
"strstream": "cpp",
|
||||||
|
"barrier": "cpp",
|
||||||
|
"bit": "cpp",
|
||||||
|
"bitset": "cpp",
|
||||||
|
"cfenv": "cpp",
|
||||||
|
"charconv": "cpp",
|
||||||
|
"chrono": "cpp",
|
||||||
|
"cinttypes": "cpp",
|
||||||
|
"codecvt": "cpp",
|
||||||
|
"compare": "cpp",
|
||||||
|
"complex": "cpp",
|
||||||
|
"concepts": "cpp",
|
||||||
|
"coroutine": "cpp",
|
||||||
|
"csetjmp": "cpp",
|
||||||
|
"cstdint": "cpp",
|
||||||
|
"deque": "cpp",
|
||||||
|
"forward_list": "cpp",
|
||||||
|
"list": "cpp",
|
||||||
|
"map": "cpp",
|
||||||
|
"set": "cpp",
|
||||||
|
"string": "cpp",
|
||||||
|
"unordered_map": "cpp",
|
||||||
|
"unordered_set": "cpp",
|
||||||
|
"exception": "cpp",
|
||||||
|
"algorithm": "cpp",
|
||||||
|
"functional": "cpp",
|
||||||
|
"iterator": "cpp",
|
||||||
|
"memory": "cpp",
|
||||||
|
"memory_resource": "cpp",
|
||||||
|
"numeric": "cpp",
|
||||||
|
"optional": "cpp",
|
||||||
|
"ratio": "cpp",
|
||||||
|
"regex": "cpp",
|
||||||
|
"source_location": "cpp",
|
||||||
|
"string_view": "cpp",
|
||||||
|
"system_error": "cpp",
|
||||||
|
"tuple": "cpp",
|
||||||
|
"type_traits": "cpp",
|
||||||
|
"utility": "cpp",
|
||||||
|
"rope": "cpp",
|
||||||
|
"slist": "cpp",
|
||||||
|
"fstream": "cpp",
|
||||||
|
"initializer_list": "cpp",
|
||||||
|
"iomanip": "cpp",
|
||||||
|
"iosfwd": "cpp",
|
||||||
|
"istream": "cpp",
|
||||||
|
"latch": "cpp",
|
||||||
|
"limits": "cpp",
|
||||||
|
"numbers": "cpp",
|
||||||
|
"ranges": "cpp",
|
||||||
|
"scoped_allocator": "cpp",
|
||||||
|
"semaphore": "cpp",
|
||||||
|
"span": "cpp",
|
||||||
|
"sstream": "cpp",
|
||||||
|
"stdexcept": "cpp",
|
||||||
|
"stop_token": "cpp",
|
||||||
|
"streambuf": "cpp",
|
||||||
|
"thread": "cpp",
|
||||||
|
"typeindex": "cpp",
|
||||||
|
"typeinfo": "cpp",
|
||||||
|
"valarray": "cpp",
|
||||||
|
"variant": "cpp"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
7
run.sh
7
run.sh
|
|
@ -1,11 +1,12 @@
|
||||||
/home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
/home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
||||||
MarkDuplicates \
|
MarkDuplicates \
|
||||||
--INPUT /mnt/d/data/zy_normal.bam \
|
--INPUT /mnt/d/data/zy_tumor.bam \
|
||||||
--OUTPUT out.bam \
|
--OUTPUT out.bam \
|
||||||
--num_threads 12 \
|
--num_threads 16 \
|
||||||
--max_mem 4G \
|
--max_mem 4G \
|
||||||
--verbosity DEBUG \
|
--verbosity DEBUG \
|
||||||
--asyncio true
|
--asyncio true #\
|
||||||
|
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||||
|
|
||||||
|
|
||||||
# --INPUT /mnt/d/data/100w.bam \
|
# --INPUT /mnt/d/data/100w.bam \
|
||||||
|
|
|
||||||
|
|
@ -13,9 +13,11 @@ AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam/markdups SAM_MARKDUPS_SRC)
|
||||||
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/src")
|
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/src")
|
||||||
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib")
|
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib")
|
||||||
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
|
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
|
||||||
|
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/boost/include")
|
||||||
|
|
||||||
# 链接库位置
|
# 链接库位置
|
||||||
LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
|
LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
|
||||||
|
LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/boost/lib")
|
||||||
|
|
||||||
# 编译生成的程序名称
|
# 编译生成的程序名称
|
||||||
set(PG_NAME "picard_cpp")
|
set(PG_NAME "picard_cpp")
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,6 @@
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
#include <htslib/sam.h>
|
||||||
|
|
||||||
#include "interval.h"
|
|
||||||
#include "bam_wrap.h"
|
#include "bam_wrap.h"
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
@ -138,6 +137,15 @@ struct AsyncIoBamBuf
|
||||||
{
|
{
|
||||||
return bam_arr_[pos];
|
return bam_arr_[pos];
|
||||||
}
|
}
|
||||||
|
// 获取某一段reads
|
||||||
|
inline vector<BamWrap*> Slice(size_t startIdx, size_t endIdx)
|
||||||
|
{
|
||||||
|
if (endIdx > startIdx) {
|
||||||
|
auto begItr = bam_arr_.begin();
|
||||||
|
return std::move(vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
|
||||||
|
}
|
||||||
|
return std::move(vector<BamWrap *>());
|
||||||
|
}
|
||||||
|
|
||||||
// 同步读取
|
// 同步读取
|
||||||
int sync_read_bam();
|
int sync_read_bam();
|
||||||
|
|
|
||||||
|
|
@ -119,11 +119,11 @@ struct BamWrap
|
||||||
char base = base_to_char[bam_seqi(seq, i)];
|
char base = base_to_char[bam_seqi(seq, i)];
|
||||||
oss << base;
|
oss << base;
|
||||||
}
|
}
|
||||||
return oss.str();
|
return std::move(oss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取名字
|
// 获取名字
|
||||||
inline std::string query_name()
|
inline const char *query_name()
|
||||||
{
|
{
|
||||||
return bam_get_qname(b);
|
return bam_get_qname(b);
|
||||||
}
|
}
|
||||||
|
|
@ -139,7 +139,7 @@ struct BamWrap
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
oss << len << c;
|
oss << len << c;
|
||||||
}
|
}
|
||||||
return oss.str();
|
return std::move(oss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 占用的内存大小
|
// 占用的内存大小
|
||||||
|
|
@ -309,6 +309,125 @@ struct BamWrap
|
||||||
return end_pos;
|
return end_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* 获取碱基质量分数的加和 */
|
||||||
|
/** Calculates a score for the read which is the sum of scores over Q15. */
|
||||||
|
inline int GetSumOfBaseQualities()
|
||||||
|
{
|
||||||
|
int score = 0;
|
||||||
|
uint8_t *qual = bam_get_qual(b);
|
||||||
|
for (int i = 0; i < b->core.l_qseq; ++i)
|
||||||
|
{
|
||||||
|
if (qual[i] >= 15)
|
||||||
|
score += qual[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 与flag相关的检测 */
|
||||||
|
|
||||||
|
/* 没有比对上 unmapped */
|
||||||
|
inline bool GetReadUnmappedFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FUNMAP;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Template having multiple segments in sequencing */
|
||||||
|
inline bool GetReadPairedFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FPAIRED;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the read fails platform/vendor quality checks.
|
||||||
|
*/
|
||||||
|
inline bool GetReadFailsVendorQualityCheckFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FQCFAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the mate is unmapped.
|
||||||
|
*/
|
||||||
|
bool GetMateUnmappedFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FMUNMAP;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether the alignment is secondary (an alternative alignment of the read).
|
||||||
|
*/
|
||||||
|
bool IsSecondaryAlignment()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FSECONDARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether the alignment is supplementary (a split alignment such as a chimeric alignment).
|
||||||
|
*/
|
||||||
|
bool GetSupplementaryAlignmentFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FSUPPLEMENTARY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Tests if this record is either a secondary and/or supplementary alignment;
|
||||||
|
*/
|
||||||
|
bool IsSecondaryOrSupplementary()
|
||||||
|
{
|
||||||
|
return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the read is the first read in a pair.
|
||||||
|
*/
|
||||||
|
bool GetFirstOfPairFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FREAD1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* strand of the query (false for forward; true for reverse strand).
|
||||||
|
*/
|
||||||
|
bool GetReadNegativeStrandFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FREVERSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* strand of the mate (false for forward; true for reverse strand).
|
||||||
|
*/
|
||||||
|
bool GetMateNegativeStrandFlag()
|
||||||
|
{
|
||||||
|
return b->core.flag & BAM_FMREVERSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 其他的一些信息 */
|
||||||
|
inline int GetReferenceLength()
|
||||||
|
{
|
||||||
|
int length = 0;
|
||||||
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
const bam1_core_t &bc = b->core;
|
||||||
|
for (int i = 0; i < bc.n_cigar; ++i)
|
||||||
|
{
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case 'M':
|
||||||
|
case 'D':
|
||||||
|
case 'N':
|
||||||
|
case '=':
|
||||||
|
case 'X':
|
||||||
|
length += len;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
// 计算bam的全局位置,算上染色体序号和比对位置
|
// 计算bam的全局位置,算上染色体序号和比对位置
|
||||||
static inline int64_t bam_global_pos(bam1_t *b)
|
static inline int64_t bam_global_pos(bam1_t *b)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -1,295 +0,0 @@
|
||||||
/*
|
|
||||||
Description: 处理intervals
|
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
|
||||||
|
|
||||||
Author : Zhang Zhonghai
|
|
||||||
Date : 2019/11/24
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "interval.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <sstream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <string>
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
|
||||||
|
|
||||||
#include "../utils/util.h"
|
|
||||||
#include "bam_wrap.h"
|
|
||||||
|
|
||||||
using std::min;
|
|
||||||
using std::max;
|
|
||||||
using std::string;
|
|
||||||
using std::ifstream;
|
|
||||||
using std::stringstream;
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
// 构造函数
|
|
||||||
Interval::Interval() : Interval(0, 0) {}
|
|
||||||
Interval::Interval(int64_t l, int64_t r) : left(l), right(r) {}
|
|
||||||
|
|
||||||
// 比较函数
|
|
||||||
bool Interval::operator<(const Interval& other) {
|
|
||||||
if (left == other.left) {
|
|
||||||
return right < other.right;
|
|
||||||
}
|
|
||||||
return left < other.left;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 是否有重叠
|
|
||||||
bool Interval::overlaps(const Interval &other) {
|
|
||||||
return left <= other.right && right >= other.left;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 两个interval的合并
|
|
||||||
Interval& Interval::spanWith(const Interval &other) {
|
|
||||||
left = min(left, other.left);
|
|
||||||
right = max(right, other.right);
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 返回两个interval的交集,不改变当前interval
|
|
||||||
Interval Interval::intersect(const Interval &that) const {
|
|
||||||
Interval val;
|
|
||||||
val.left = max(left, that.left);
|
|
||||||
val.right = min(right, that.right);
|
|
||||||
return val;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 合并两个interval arr,取相交区域的交集, interval arr都是排序后的
|
|
||||||
*/
|
|
||||||
void Interval::IntersectIntervals(const IntervalArr &a_arr,
|
|
||||||
const IntervalArr &b_arr,
|
|
||||||
IntervalArr *r_arr) {
|
|
||||||
if (a_arr.size() < 1 || b_arr.size() < 1) return;
|
|
||||||
int ai=0, bi=0;
|
|
||||||
const Interval *last, *cur;
|
|
||||||
if (a_arr[ai].left < b_arr[bi].left) last = &a_arr[ai++];
|
|
||||||
else last = &b_arr[bi++];
|
|
||||||
while (ai < a_arr.size() && bi < b_arr.size()) {
|
|
||||||
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
|
|
||||||
else cur = &b_arr[bi++];
|
|
||||||
if (last->right < cur->left) {
|
|
||||||
last = cur; continue;
|
|
||||||
} else if (last->right > cur->right) {
|
|
||||||
r_arr->push_back(*cur);
|
|
||||||
} else {
|
|
||||||
r_arr->push_back(Interval(cur->left, last->right));
|
|
||||||
last = cur;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const IntervalArr *arrp;
|
|
||||||
int ii;
|
|
||||||
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai;}
|
|
||||||
else { arrp = &b_arr; ii = bi; }
|
|
||||||
const IntervalArr &arr = *arrp;
|
|
||||||
while(ii < arr.size()) {
|
|
||||||
cur = &arr[ii++];
|
|
||||||
if (last->right < cur->left) {
|
|
||||||
break;
|
|
||||||
} else if (last->right > cur->right) {
|
|
||||||
r_arr->push_back(*cur);
|
|
||||||
} else {
|
|
||||||
r_arr->push_back(Interval(cur->left, last->right));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 合并两个interval arr,取并集
|
|
||||||
*/
|
|
||||||
void Interval::UnionIntervals(const IntervalArr &a_arr,
|
|
||||||
const IntervalArr &b_arr,
|
|
||||||
IntervalArr *r_arr) {
|
|
||||||
Interval tmp;
|
|
||||||
const Interval *cur;
|
|
||||||
Interval *last;
|
|
||||||
int ai=0, bi=0;
|
|
||||||
if (a_arr.size() < 1) { *r_arr = b_arr; return; }
|
|
||||||
if (b_arr.size() < 1) { *r_arr = a_arr; return; }
|
|
||||||
r_arr->clear();
|
|
||||||
|
|
||||||
if (a_arr[ai].left < b_arr[bi].left) tmp = a_arr[ai++];
|
|
||||||
else tmp = b_arr[bi++];
|
|
||||||
last = &tmp;
|
|
||||||
while(ai < a_arr.size() && bi < b_arr.size()) {
|
|
||||||
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
|
|
||||||
else cur = &b_arr[bi++];
|
|
||||||
if (last->right < cur->left) {
|
|
||||||
r_arr->push_back(*last);
|
|
||||||
*last = *cur;
|
|
||||||
} else {
|
|
||||||
last->right = max(last->right, cur->right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const IntervalArr *arrp;
|
|
||||||
int ii;
|
|
||||||
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai; }
|
|
||||||
else { arrp = &b_arr; ii = bi; }
|
|
||||||
const IntervalArr &arr = *arrp;
|
|
||||||
|
|
||||||
while(ii < arr.size()) {
|
|
||||||
cur = &arr[ii++];
|
|
||||||
if (last->right < cur->left) {
|
|
||||||
r_arr->push_back(*last);
|
|
||||||
*last = *cur;
|
|
||||||
} else {
|
|
||||||
last->right = max(last->right, cur->right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
r_arr->push_back(*last);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集
|
|
||||||
*/
|
|
||||||
int64_t Interval::MergeIntervals(const IntervalArr &n_arr,
|
|
||||||
const IntervalArr &t_arr,
|
|
||||||
IntervalArr &in_arr,
|
|
||||||
int64_t start_loc, // 闭区间
|
|
||||||
int64_t *end_loc, // 开区间
|
|
||||||
IntervalArr *r_arr) {
|
|
||||||
IntervalArr tmp_arr;
|
|
||||||
const int64_t end_loc_val = *end_loc;
|
|
||||||
if (in_arr.size() < 1) { // 如果输入的interval为空,则使用tumor normal覆盖的interval
|
|
||||||
UnionIntervals(n_arr, t_arr, &tmp_arr);
|
|
||||||
} else {
|
|
||||||
IntervalArr mid_arr;
|
|
||||||
UnionIntervals(n_arr, t_arr, &mid_arr);
|
|
||||||
IntersectIntervals(mid_arr, in_arr, &tmp_arr);
|
|
||||||
}
|
|
||||||
for(int i=tmp_arr.size()-1; i>=0; --i) {
|
|
||||||
if (tmp_arr[i].left >= end_loc_val) {
|
|
||||||
tmp_arr.pop_back(); // 删除该元素
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
tmp_arr[i].right = min(tmp_arr[i].right, end_loc_val - 1); // end_loc是开区间
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
for (int i=0; i<tmp_arr.size(); ++i) {
|
|
||||||
if (tmp_arr[i].right < start_loc) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (tmp_arr[i].left < start_loc) {
|
|
||||||
r_arr->push_back(Interval(start_loc, tmp_arr[i].right));
|
|
||||||
} else {
|
|
||||||
r_arr->push_back(tmp_arr[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int next_i = 0;
|
|
||||||
while(next_i < in_arr.size() && in_arr[next_i].right < end_loc_val) ++next_i;
|
|
||||||
if (next_i < in_arr.size()) {
|
|
||||||
if (end_loc_val < in_arr[next_i].left) {
|
|
||||||
*end_loc = in_arr[next_i].left; // 更新本次处理的终点
|
|
||||||
} else {
|
|
||||||
in_arr[next_i].left = end_loc_val; // 更新panel
|
|
||||||
}
|
|
||||||
int i=0, j=next_i;
|
|
||||||
for (; j<in_arr.size(); ++i, ++j) {
|
|
||||||
in_arr[i] = in_arr[j];
|
|
||||||
}
|
|
||||||
in_arr.resize(i);
|
|
||||||
} else {
|
|
||||||
in_arr.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t locus_num = 0;
|
|
||||||
for (int i=0; i<r_arr->size(); ++i) {
|
|
||||||
locus_num += (*r_arr)[i].right - (*r_arr)[i].left + 1;
|
|
||||||
}
|
|
||||||
return locus_num;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 读取interval文件
|
|
||||||
*/
|
|
||||||
void Interval::ReadInterval(const string &interval_fn,
|
|
||||||
bam_hdr_t* header,
|
|
||||||
int interval_padding,
|
|
||||||
IntervalArr *r_arr) {
|
|
||||||
ifstream interval_fs(interval_fn);
|
|
||||||
string one_line;
|
|
||||||
IntervalArr intervals;
|
|
||||||
getline(interval_fs, one_line);
|
|
||||||
while (!interval_fs.eof()) {
|
|
||||||
if (one_line[0] == '@') {
|
|
||||||
getline(interval_fs, one_line);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
stringstream ss_line(one_line);
|
|
||||||
string contig_name;
|
|
||||||
ss_line >> contig_name;
|
|
||||||
int itid = sam_hdr_name2tid(header, contig_name.c_str());
|
|
||||||
if (itid < 0) Error("[%s] interval file has unknown contig name [%s]\n", __func__, contig_name.c_str());
|
|
||||||
int64_t tid = (int64_t)itid;
|
|
||||||
tid <<= CONTIG_SHIFT;
|
|
||||||
int64_t start, stop;
|
|
||||||
ss_line >> start >> stop;
|
|
||||||
// interval文件是1-based,所以这里要减去1
|
|
||||||
intervals.push_back(Interval(tid + start - 1, tid + stop -1));
|
|
||||||
getline(interval_fs, one_line);
|
|
||||||
}
|
|
||||||
sort(intervals.begin(), intervals.end());
|
|
||||||
if (intervals.size() > 0) {
|
|
||||||
Interval new_span(intervals[0].left-interval_padding, intervals[0].right+interval_padding);
|
|
||||||
for (int i=1; i<intervals.size(); ++i) {
|
|
||||||
if (intervals[i].left - interval_padding > new_span.right) {
|
|
||||||
r_arr->push_back(new_span);
|
|
||||||
new_span.left = intervals[i].left - interval_padding;
|
|
||||||
new_span.right = intervals[i].right + interval_padding;
|
|
||||||
} else {
|
|
||||||
new_span.right = max(new_span.right, intervals[i].right + interval_padding);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
r_arr->push_back(new_span);
|
|
||||||
}
|
|
||||||
interval_fs.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 将interval相连的区域合并
|
|
||||||
*/
|
|
||||||
void Interval::ShrinkInterval(IntervalArr *ivap) {
|
|
||||||
if (ivap->size() < 1) return;
|
|
||||||
IntervalArr &iva = *ivap;
|
|
||||||
IntervalArr tiva = iva;
|
|
||||||
iva.clear();
|
|
||||||
Interval iv;
|
|
||||||
iv.left = tiva[0].left;
|
|
||||||
iv.right = tiva[0].right;
|
|
||||||
for (int i=1; i<tiva.size(); ++i) {
|
|
||||||
if (iv.right+1 < tiva[i].left) {
|
|
||||||
iva.push_back(iv);
|
|
||||||
iv.left = tiva[i].left;
|
|
||||||
}
|
|
||||||
iv.right = tiva[i].right;
|
|
||||||
}
|
|
||||||
iva.push_back(iv);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 根据header信息,扩展interval
|
|
||||||
*/
|
|
||||||
Interval Interval::ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header) {
|
|
||||||
Interval result;
|
|
||||||
result.left = start;
|
|
||||||
result.right = end;
|
|
||||||
|
|
||||||
int64_t ext_left = start - expandVal;
|
|
||||||
int64_t ext_right = end + expandVal;
|
|
||||||
int tid = BamWrap::bam_tid(start);
|
|
||||||
uint32_t contig_len = header->target_len[tid];
|
|
||||||
result.left = max(BamWrap::bam_global_pos(tid, 0), ext_left);
|
|
||||||
result.right = min(ext_right, contig_len - 1 + BamWrap::bam_global_pos(tid, 0));
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
@ -1,101 +0,0 @@
|
||||||
/*
|
|
||||||
Description: 处理intervals
|
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
|
||||||
|
|
||||||
Author : Zhang Zhonghai
|
|
||||||
Date : 2019/11/24
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef INTERVAL_H_
|
|
||||||
#define INTERVAL_H_
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
|
||||||
|
|
||||||
#include "bam_wrap.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
// 前向声明
|
|
||||||
class Interval;
|
|
||||||
typedef std::vector<Interval> IntervalArr;
|
|
||||||
/*
|
|
||||||
* 闭区间
|
|
||||||
*/
|
|
||||||
struct Interval {
|
|
||||||
// const常量
|
|
||||||
const static int CONTIG_SHIFT = 30;
|
|
||||||
|
|
||||||
// 类变量
|
|
||||||
int64_t left;
|
|
||||||
int64_t right;
|
|
||||||
|
|
||||||
// 构造函数
|
|
||||||
Interval();
|
|
||||||
explicit Interval(int64_t l, int64_t r);
|
|
||||||
// 比较函数
|
|
||||||
bool operator<(const Interval &other);
|
|
||||||
// 是否有重叠
|
|
||||||
bool overlaps(const Interval &other);
|
|
||||||
// 两个interval的合并, 会改变当前interval
|
|
||||||
Interval& spanWith(const Interval &other);
|
|
||||||
// 返回两个interval的交集,不改变当前interval
|
|
||||||
Interval intersect(const Interval &that) const;
|
|
||||||
|
|
||||||
// for debug
|
|
||||||
string toString() const {
|
|
||||||
ostringstream oss;
|
|
||||||
oss << BamWrap::bam_tid(left) + 1 << ":"
|
|
||||||
<< BamWrap::bam_pos(left) + 1 << "-"
|
|
||||||
<< BamWrap::bam_pos(right) + 1;
|
|
||||||
|
|
||||||
return oss.str();
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* 合并两个interval arr,取相交区域的交集, interval arr都是排序后的
|
|
||||||
*/
|
|
||||||
static void IntersectIntervals(const IntervalArr &a_arr,
|
|
||||||
const IntervalArr &b_arr,
|
|
||||||
IntervalArr *r_arr);
|
|
||||||
/*
|
|
||||||
* 合并两个interval arr,相交的区域取并集
|
|
||||||
*/
|
|
||||||
static void UnionIntervals(const IntervalArr &a_arr,
|
|
||||||
const IntervalArr &b_arr,
|
|
||||||
IntervalArr *r_arr);
|
|
||||||
/*
|
|
||||||
* 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集
|
|
||||||
*/
|
|
||||||
static int64_t MergeIntervals(const IntervalArr &n_arr,
|
|
||||||
const IntervalArr &t_arr,
|
|
||||||
IntervalArr &in_arr, // 会更改
|
|
||||||
int64_t start_loc, // 闭区间
|
|
||||||
int64_t *end_loc, // 开区间, 会更改
|
|
||||||
IntervalArr *r_arr);
|
|
||||||
/*
|
|
||||||
* 读取interval文件
|
|
||||||
*/
|
|
||||||
static void ReadInterval(const std::string &interval_fn,
|
|
||||||
bam_hdr_t* header,
|
|
||||||
int interval_padding,
|
|
||||||
IntervalArr *r_arr);
|
|
||||||
/*
|
|
||||||
* 将interval相连的区域合并
|
|
||||||
*/
|
|
||||||
static void ShrinkInterval(IntervalArr *iva);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 根据header信息,扩展interval
|
|
||||||
*/
|
|
||||||
static Interval ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,59 +0,0 @@
|
||||||
/*
|
|
||||||
Description: read ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等
|
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
|
||||||
|
|
||||||
Author : Zhang Zhonghai
|
|
||||||
Date : 2023/11/3
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef READ_ENDS_H_
|
|
||||||
#define READ_ENDS_H_
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
/*
 * Holds all read-end information used for duplicate marking, similar to
 * ReadEndsForMarkDuplicates in Picard. Every member has a default value, so
 * a default-constructed ReadEnds never contains indeterminate data.
 */
struct ReadEnds
{
    /* Members taken from PhysicalLocationInt */
    /**
     * Small class that provides access to the physical location information about a cluster.
     * All values should be defaulted to -1 if unavailable. Tile should only allow
     * non-zero positive integers, x and y coordinates must be non-negative.
     * This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
     * thus, they do not overflow within a HiSeqX tile.
     */
    int16_t tile = -1;
    int32_t x = -1;
    int32_t y = -1;

    /* Members taken from ReadEnds */
    /** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
    static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
    int16_t libraryId = 0;  // previously left uninitialised
    int8_t orientation = 0; // previously left uninitialised; 0 equals F
    int32_t read1ReferenceIndex = -1;
    int32_t read1Coordinate = -1;
    int32_t read2ReferenceIndex = -1;
    int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
    /* Additional information used to detect optical dupes */
    int16_t readGroup = -1;
    /** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
    int8_t orientationForOpticalDuplicates = -1;
    /** A *transient* flag marking this read end as being an optical duplicate. */
    bool isOpticalDuplicate = false;

    /* Members taken from ReadEndsForMarkDuplicates */
    /** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
    int16_t score = 0;
    int64_t read1IndexInFile = -1;
    int64_t read2IndexInFile = -1;
    int64_t duplicateSetSize = -1;

    /* Members taken from ReadEndsForMarkDuplicatesWithBarcodes (apparently unused) */
    int32_t barcode = 0;        // primary barcode for this read (and pair)
    int32_t readOneBarcode = 0; // read one barcode, 0 if not present
    int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
};
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
@ -90,7 +90,7 @@ struct GlobalArg
|
||||||
void printArgValue() {
|
void printArgValue() {
|
||||||
printf("--INPUT = %s\n", in_fn.c_str());
|
printf("--INPUT = %s\n", in_fn.c_str());
|
||||||
printf("--OUTPUT = %s\n", out_fn.c_str());
|
printf("--OUTPUT = %s\n", out_fn.c_str());
|
||||||
printf("--num_threads = %d\n",num_threads);
|
printf("--num_threads = %d\n", num_threads);
|
||||||
printf("--max_mem = %ld\n", max_mem);
|
printf("--max_mem = %ld\n", max_mem);
|
||||||
printf("--verbosity = %d\n", verbosity);
|
printf("--verbosity = %d\n", verbosity);
|
||||||
printf("--asyncio = %d\n", use_asyncio);
|
printf("--asyncio = %d\n", use_asyncio);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,90 @@
|
||||||
|
/*
|
||||||
|
Description: Murmur哈希
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2023/11/6
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
/**
 * Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness
 * and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling.
 *
 * All mixing is done on 32-bit unsigned values, so multiplications wrap
 * around instead of overflowing a signed int, and the mixing steps use true
 * bit rotations as Murmur3 requires. The public helpers keep their int
 * signatures and simply reinterpret the 32-bit pattern.
 */
struct Murmur3
{
    static_assert(sizeof(unsigned int) == 4, "Murmur3 assumes a 32-bit unsigned int");

    /** Seed mixed into every hash; set from std::random_device when the singleton is created. */
    int seed_ = 0;

    /**
     * Hashes a character stream to an int using Murmur3.
     * Characters are consumed two at a time (each taken as an unsigned byte
     * value); a trailing odd character is mixed in separately.
     */
    int HashUnencodedChars(const std::string &input) const
    {
        unsigned int h1 = static_cast<unsigned int>(seed_);

        // step through the input 2 chars at a time
        const std::string::size_type length = input.size();
        for (std::string::size_type i = 1; i < length; i += 2)
        {
            // Go through unsigned char so bytes >= 0x80 are not sign-extended.
            const unsigned int lo = static_cast<unsigned char>(input[i - 1]);
            const unsigned int hi = static_cast<unsigned char>(input[i]);
            h1 = MixH1Bits(h1, MixK1Bits(lo | (hi << 16)));
        }

        // deal with any remaining character
        if ((length & 1) == 1)
        {
            h1 ^= MixK1Bits(static_cast<unsigned char>(input[length - 1]));
        }

        return ToSigned(FmixBits(h1, static_cast<unsigned int>(2 * length)));
    }

    /** Process-wide instance; its seed is chosen on first use. */
    static Murmur3 &Instance()
    {
        static Murmur3 instance;
        return instance;
    }

    /** Murmur3 block mix: multiply by c1, rotate left by 15, multiply by c2. */
    static int mixK1(int k1)
    {
        return ToSigned(MixK1Bits(static_cast<unsigned int>(k1)));
    }

    /** Folds a mixed block k1 into the running hash h1. */
    static int mixH1(int h1, int k1)
    {
        return ToSigned(MixH1Bits(static_cast<unsigned int>(h1), static_cast<unsigned int>(k1)));
    }

    // Finalization mix - force all bits of a hash block to avalanche
    static int fmix(int h1, int length)
    {
        return ToSigned(FmixBits(static_cast<unsigned int>(h1), static_cast<unsigned int>(length)));
    }

private:
    /** 32-bit rotate left; n must be in 1..31. */
    static unsigned int RotateLeft(unsigned int v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    /** Reinterprets a 32-bit pattern as int (modular conversion on the supported compilers). */
    static int ToSigned(unsigned int v)
    {
        return static_cast<int>(v);
    }

    static unsigned int MixK1Bits(unsigned int k1)
    {
        k1 *= 0xcc9e2d51u;
        k1 = RotateLeft(k1, 15);
        k1 *= 0x1b873593u;
        return k1;
    }

    static unsigned int MixH1Bits(unsigned int h1, unsigned int k1)
    {
        h1 ^= k1;
        h1 = RotateLeft(h1, 13);
        return h1 * 5u + 0xe6546b64u;
    }

    static unsigned int FmixBits(unsigned int h1, unsigned int length)
    {
        h1 ^= length;
        h1 ^= h1 >> 16;
        h1 *= 0x85ebca6bu;
        h1 ^= h1 >> 13;
        h1 *= 0xc2b2ae35u;
        h1 ^= h1 >> 16;
        return h1;
    }

    Murmur3()
    {
        seed_ = static_cast<int>(std::random_device{}());
    }
};
|
||||||
|
|
@ -58,7 +58,7 @@ typedef struct bsem
|
||||||
typedef struct job
|
typedef struct job
|
||||||
{
|
{
|
||||||
struct job *prev; /* pointer to previous job */
|
struct job *prev; /* pointer to previous job */
|
||||||
void (*function)(void *arg); /* function pointer */
|
function_t function; /* function pointer */
|
||||||
void *arg; /* function's argument */
|
void *arg; /* function's argument */
|
||||||
} job;
|
} job;
|
||||||
|
|
||||||
|
|
@ -175,7 +175,7 @@ struct thpool_ *thpool_init(int num_threads)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Add work to the thread pool */
|
/* Add work to the thread pool */
|
||||||
int thpool_add_work(thpool_ *thpool_p, void (*function_p)(void *), void *arg_p)
|
int thpool_add_work(thpool_ *thpool_p, function_t function_p, void *arg_p)
|
||||||
{
|
{
|
||||||
job *newjob;
|
job *newjob;
|
||||||
|
|
||||||
|
|
@ -368,14 +368,14 @@ static void *thread_do(struct thread *thread_p)
|
||||||
pthread_mutex_unlock(&thpool_p->thcount_lock);
|
pthread_mutex_unlock(&thpool_p->thcount_lock);
|
||||||
|
|
||||||
/* Read job from queue and execute it */
|
/* Read job from queue and execute it */
|
||||||
void (*func_buff)(void *);
|
function_t func_buff;
|
||||||
void *arg_buff;
|
void *arg_buff;
|
||||||
job *job_p = jobqueue_pull(&thpool_p->jobqueue);
|
job *job_p = jobqueue_pull(&thpool_p->jobqueue);
|
||||||
if (job_p)
|
if (job_p)
|
||||||
{
|
{
|
||||||
func_buff = job_p->function;
|
func_buff = job_p->function;
|
||||||
arg_buff = job_p->arg;
|
arg_buff = job_p->arg;
|
||||||
func_buff(arg_buff);
|
func_buff(arg_buff, thread_p->id);
|
||||||
free(job_p);
|
free(job_p);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,8 @@ extern "C"
|
||||||
{
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
typedef void (*function_t)(void *, int);
|
||||||
|
|
||||||
/* =================================== API ======================================= */
|
/* =================================== API ======================================= */
|
||||||
|
|
||||||
typedef struct thpool_ *threadpool;
|
typedef struct thpool_ *threadpool;
|
||||||
|
|
@ -62,7 +64,7 @@ extern "C"
|
||||||
* @param arg_p pointer to an argument
|
* @param arg_p pointer to an argument
|
||||||
* @return 0 on success, -1 otherwise.
|
* @return 0 on success, -1 otherwise.
|
||||||
*/
|
*/
|
||||||
int thpool_add_work(threadpool, void (*function_p)(void *), void *arg_p);
|
int thpool_add_work(threadpool, function_t function_p, void *arg_p);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Wait for all queued jobs to finish
|
* @brief Wait for all queued jobs to finish
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@ using std::for_each;
|
||||||
va_start(ap, format); \
|
va_start(ap, format); \
|
||||||
vfprintf(stderr, format, ap); \
|
vfprintf(stderr, format, ap); \
|
||||||
va_end(ap); \
|
va_end(ap); \
|
||||||
|
fprintf(stderr, "\n"); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
||||||
|
|
@ -121,8 +121,8 @@ struct lock_s {
|
||||||
long value;
|
long value;
|
||||||
};
|
};
|
||||||
|
|
||||||
lock *new_lock_(long initial, char const *file, long line) {
|
lock_t *new_lock_(long initial, char const *file, long line) {
|
||||||
lock *bolt = (lock *)my_malloc(sizeof(struct lock_s), file, line);
|
lock_t *bolt = (lock_t *)my_malloc(sizeof(struct lock_s), file, line);
|
||||||
int ret = pthread_mutex_init(&(bolt->mutex), NULL);
|
int ret = pthread_mutex_init(&(bolt->mutex), NULL);
|
||||||
if (ret)
|
if (ret)
|
||||||
fail(ret, file, line, "mutex_init");
|
fail(ret, file, line, "mutex_init");
|
||||||
|
|
@ -133,19 +133,19 @@ lock *new_lock_(long initial, char const *file, long line) {
|
||||||
return bolt;
|
return bolt;
|
||||||
}
|
}
|
||||||
|
|
||||||
void possess_(lock *bolt, char const *file, long line) {
|
void possess_(lock_t *bolt, char const *file, long line) {
|
||||||
int ret = pthread_mutex_lock(&(bolt->mutex));
|
int ret = pthread_mutex_lock(&(bolt->mutex));
|
||||||
if (ret)
|
if (ret)
|
||||||
fail(ret, file, line, "mutex_lock");
|
fail(ret, file, line, "mutex_lock");
|
||||||
}
|
}
|
||||||
|
|
||||||
void release_(lock *bolt, char const *file, long line) {
|
void release_(lock_t *bolt, char const *file, long line) {
|
||||||
int ret = pthread_mutex_unlock(&(bolt->mutex));
|
int ret = pthread_mutex_unlock(&(bolt->mutex));
|
||||||
if (ret)
|
if (ret)
|
||||||
fail(ret, file, line, "mutex_unlock");
|
fail(ret, file, line, "mutex_unlock");
|
||||||
}
|
}
|
||||||
|
|
||||||
void twist_(lock *bolt, enum twist_op op, long val,
|
void twist_(lock_t *bolt, enum twist_op op, long val,
|
||||||
char const *file, long line) {
|
char const *file, long line) {
|
||||||
if (op == TO)
|
if (op == TO)
|
||||||
bolt->value = val;
|
bolt->value = val;
|
||||||
|
|
@ -161,7 +161,7 @@ void twist_(lock *bolt, enum twist_op op, long val,
|
||||||
|
|
||||||
#define until(a) while(!(a))
|
#define until(a) while(!(a))
|
||||||
|
|
||||||
void wait_for_(lock *bolt, enum wait_op op, long val,
|
void wait_for_(lock_t *bolt, enum wait_op op, long val,
|
||||||
char const *file, long line) {
|
char const *file, long line) {
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case TO_BE:
|
case TO_BE:
|
||||||
|
|
@ -194,11 +194,11 @@ void wait_for_(lock *bolt, enum wait_op op, long val,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
long peek_lock(lock *bolt) {
|
long peek_lock(lock_t *bolt) {
|
||||||
return bolt->value;
|
return bolt->value;
|
||||||
}
|
}
|
||||||
|
|
||||||
void free_lock_(lock *bolt, char const *file, long line) {
|
void free_lock_(lock_t *bolt, char const *file, long line) {
|
||||||
if (bolt == NULL)
|
if (bolt == NULL)
|
||||||
return;
|
return;
|
||||||
int ret = pthread_cond_destroy(&(bolt->cond));
|
int ret = pthread_cond_destroy(&(bolt->cond));
|
||||||
|
|
@ -210,7 +210,7 @@ void free_lock_(lock *bolt, char const *file, long line) {
|
||||||
my_free(bolt);
|
my_free(bolt);
|
||||||
}
|
}
|
||||||
|
|
||||||
// -- Thread functions (uses the lock functions above) --
|
// -- Thread functions (uses the lock functions above) --
|
||||||
|
|
||||||
struct thread_s {
|
struct thread_s {
|
||||||
pthread_t id;
|
pthread_t id;
|
||||||
|
|
@ -220,7 +220,7 @@ struct thread_s {
|
||||||
|
|
||||||
// List of threads launched but not joined, count of threads exited but not
|
// List of threads launched but not joined, count of threads exited but not
|
||||||
// joined (incremented by ignition() just before exiting).
|
// joined (incremented by ignition() just before exiting).
|
||||||
local lock threads_lock = {
|
local lock_t threads_lock = {
|
||||||
PTHREAD_MUTEX_INITIALIZER,
|
PTHREAD_MUTEX_INITIALIZER,
|
||||||
PTHREAD_COND_INITIALIZER,
|
PTHREAD_COND_INITIALIZER,
|
||||||
0 // number of threads exited but not joined
|
0 // number of threads exited but not joined
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,7 @@
|
||||||
|
|
||||||
These functions allow the simple launching and joining of threads, and the
|
These functions allow the simple launching and joining of threads, and the
|
||||||
locking of objects and synchronization of changes of objects. The latter is
|
locking of objects and synchronization of changes of objects. The latter is
|
||||||
implemented with a single lock type that contains an integer value. The
|
implemented with a single lock_t type that contains an integer value. The
|
||||||
value can be ignored for simple exclusive access to an object, or the value
|
value can be ignored for simple exclusive access to an object, or the value
|
||||||
can be used to signal and wait for changes to an object.
|
can be used to signal and wait for changes to an object.
|
||||||
|
|
||||||
|
|
@ -45,10 +45,10 @@
|
||||||
thread *thread; identifier for launched thread, used by join
|
thread *thread; identifier for launched thread, used by join
|
||||||
void probe(void *); pointer to function "probe", run when thread starts
|
void probe(void *); pointer to function "probe", run when thread starts
|
||||||
void *payload; single argument passed to the probe function
|
void *payload; single argument passed to the probe function
|
||||||
lock *lock; a lock with a value -- used for exclusive access to
|
lock_t *lock; a lock with a value -- used for exclusive access to
|
||||||
an object and to synchronize threads waiting for
|
an object and to synchronize threads waiting for
|
||||||
changes to an object
|
changes to an object
|
||||||
long val; value to set lock, increment lock, or wait for
|
long val; value to set lock, increment lock, or wait for
|
||||||
int n; number of threads joined
|
int n; number of threads joined
|
||||||
|
|
||||||
-- Thread functions --
|
-- Thread functions --
|
||||||
|
|
@ -66,25 +66,25 @@
|
||||||
|
|
||||||
-- Lock functions --
|
-- Lock functions --
|
||||||
|
|
||||||
lock = new_lock(val) - create a new lock with initial value val (lock is
|
lock = NEW_LOCK(val) - create a new lock with initial value val (lock is
|
||||||
created in the released state)
|
created in the released state)
|
||||||
possess(lock) - acquire exclusive possession of a lock, waiting if necessary
|
POSSESS(lock) - acquire exclusive possession of a lock, waiting if necessary
|
||||||
twist(lock, [TO | BY], val) - set lock to or increment lock by val, signal
|
TWIST(lock, [TO | BY], val) - set lock to or increment lock by val, signal
|
||||||
all threads waiting on this lock and then release the lock -- must
|
all threads waiting on this lock and then release the lock -- must
|
||||||
possess the lock before calling (twist releases, so don't do a
|
possess the lock before calling (TWIST releases, so don't do a
|
||||||
release() after a twist() on the same lock)
|
RELEASE() after a TWIST() on the same lock)
|
||||||
wait_for(lock, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val)
|
WAIT_FOR(lock, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val)
|
||||||
- wait on lock value to be, not to be, be greater than, or be less than
|
- wait on lock value to be, not to be, be greater than, or be less than
|
||||||
val -- must possess the lock before calling, will possess the lock on
|
val -- must possess the lock before calling, will possess the lock on
|
||||||
return but the lock is released while waiting to permit other threads
|
return but the lock is released while waiting to permit other threads
|
||||||
to use twist() to change the value and signal the change (so make sure
|
to use TWIST() to change the value and signal the change (so make sure
|
||||||
that the object is in a usable state when waiting)
|
that the object is in a usable state when waiting)
|
||||||
release(lock) - release a possessed lock (do not try to release a lock that
|
RELEASE(lock) - release a possessed lock (do not try to release a lock that
|
||||||
the current thread does not possess)
|
the current thread does not possess)
|
||||||
val = peek_lock(lock) - return the value of the lock (assumes that lock is
|
val = peek_lock(lock) - return the value of the lock (assumes that lock is
|
||||||
already possessed, no possess or release is done by peek_lock())
|
already possessed, no possess or release is done by peek_lock())
|
||||||
free_lock(lock) - free the resources allocated by new_lock() (application
|
FREE_LOCK(lock) - free the resources allocated by NEW_LOCK() (application
|
||||||
must assure that the lock is released before calling free_lock())
|
must assure that the lock is released before calling FREE_LOCK())
|
||||||
|
|
||||||
-- Memory allocation ---
|
-- Memory allocation ---
|
||||||
|
|
||||||
|
|
@ -112,27 +112,28 @@ void yarn_mem(void *(*)(size_t), void (*)(void *));
|
||||||
|
|
||||||
typedef struct thread_s thread;
|
typedef struct thread_s thread;
|
||||||
thread *launch_(void (*)(void *), void *, char const *, long);
|
thread *launch_(void (*)(void *), void *, char const *, long);
|
||||||
#define launch(a, b) launch_(a, b, __FILE__, __LINE__)
|
#define LAUNCH(a, b) launch_(a, b, __FILE__, __LINE__)
|
||||||
void join_(thread *, char const *, long);
|
void join_(thread *, char const *, long);
|
||||||
#define join(a) join_(a, __FILE__, __LINE__)
|
#define JOIN(a) join_(a, __FILE__, __LINE__)
|
||||||
int join_all_(char const *, long);
|
int join_all_(char const *, long);
|
||||||
#define join_all() join_all_(__FILE__, __LINE__)
|
#define JOIN_ALL() join_all_(__FILE__, __LINE__)
|
||||||
|
|
||||||
typedef struct lock_s lock;
|
typedef struct lock_s lock_t;
|
||||||
lock *new_lock_(long, char const *, long);
|
lock_t *new_lock_(long, char const *, long);
|
||||||
#define new_lock(a) new_lock_(a, __FILE__, __LINE__)
|
#define NEW_LOCK(a) new_lock_(a, __FILE__, __LINE__)
|
||||||
void possess_(lock *, char const *, long);
|
void possess_(lock_t *, char const *, long);
|
||||||
#define possess(a) possess_(a, __FILE__, __LINE__)
|
#define POSSESS(a) possess_(a, __FILE__, __LINE__)
|
||||||
void release_(lock *, char const *, long);
|
void release_(lock_t *, char const *, long);
|
||||||
#define release(a) release_(a, __FILE__, __LINE__)
|
// #define release(a) release_(a, __FILE__, __LINE__)
|
||||||
|
#define RELEASE(a) release_(a, __FILE__, __LINE__)
|
||||||
enum twist_op { TO, BY };
|
enum twist_op { TO, BY };
|
||||||
void twist_(lock *, enum twist_op, long, char const *, long);
|
void twist_(lock_t *, enum twist_op, long, char const *, long);
|
||||||
#define twist(a, b, c) twist_(a, b, c, __FILE__, __LINE__)
|
#define TWIST(a, b, c) twist_(a, b, c, __FILE__, __LINE__)
|
||||||
enum wait_op {
|
enum wait_op {
|
||||||
TO_BE, /* or */ NOT_TO_BE, /* that is the question */
|
TO_BE, /* or */ NOT_TO_BE, /* that is the question */
|
||||||
TO_BE_MORE_THAN, TO_BE_LESS_THAN };
|
TO_BE_MORE_THAN, TO_BE_LESS_THAN };
|
||||||
void wait_for_(lock *, enum wait_op, long, char const *, long);
|
void wait_for_(lock_t *, enum wait_op, long, char const *, long);
|
||||||
#define wait_for(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__)
|
#define WAIT_FOR(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__)
|
||||||
long peek_lock(lock *);
|
long peek_lock(lock_t *);
|
||||||
void free_lock_(lock *, char const *, long);
|
void free_lock_(lock_t *, char const *, long);
|
||||||
#define free_lock(a) free_lock_(a, __FILE__, __LINE__)
|
#define FREE_LOCK(a) free_lock_(a, __FILE__, __LINE__)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
Description: 标记bam文件中的冗余信息
|
Description: 标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
|
@ -7,64 +7,395 @@ Author : Zhang Zhonghai
|
||||||
Date : 2023/10/23
|
Date : 2023/10/23
|
||||||
*/
|
*/
|
||||||
#include "markdups_arg.h"
|
#include "markdups_arg.h"
|
||||||
|
// 有太多define冲突,放到最后include
|
||||||
|
|
||||||
|
|
||||||
|
#include <common/hts/bam_buf.h>
|
||||||
#include <common/utils/global_arg.h>
|
#include <common/utils/global_arg.h>
|
||||||
#include <common/utils/thpool.h>
|
#include <common/utils/thpool.h>
|
||||||
#include <common/utils/timer.h>
|
#include <common/utils/timer.h>
|
||||||
#include <common/utils/util.h>
|
#include <common/utils/util.h>
|
||||||
#include <common/hts/bam_buf.h>
|
#include <common/utils/murmur3.h>
|
||||||
#include <common/hts/read_ends.h>
|
|
||||||
#include <common/utils/yarn.h>
|
#include <common/utils/yarn.h>
|
||||||
|
#include <sam/utils/read_ends.h>
|
||||||
|
#include <sam/utils/read_name_parser.h>
|
||||||
|
|
||||||
#include <htslib/sam.h>
|
#include <htslib/sam.h>
|
||||||
#include "htslib/thread_pool.h"
|
#include <htslib/thread_pool.h>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <queue>
|
#include <queue>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
using std::cout;
|
||||||
|
|
||||||
|
#define SMA_TAG_PG "PG"
|
||||||
|
|
||||||
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
|
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
|
||||||
|
#define NO_SUCH_INDEX INT64_MAX
|
||||||
|
|
||||||
|
static Timer tm_arr[10]; // 用来测试性能
|
||||||
|
|
||||||
/* 前向声明 */
|
/* 前向声明 */
|
||||||
class ThMarkDupArg;
|
class ThMarkDupArg;
|
||||||
|
|
||||||
/* 全局本地变量 */
|
/* 全局本地变量 */
|
||||||
static queue<ThMarkDupArg *> qpThMarkDupArg; // 存放线程变量的队列
|
static queue<ThMarkDupArg *> g_qpThMarkDupArg; // 存放线程变量的队列
|
||||||
static lock *queueFirstLock = new_lock(-1); // 队列的第一个任务是否完成
|
static lock_t *g_queueFirstLock = NEW_LOCK(-1); // 队列的第一个任务是否完成
|
||||||
|
static lock_t *g_readyToReadLock = NEW_LOCK(-1); // 通知主线程是否可以进行下一次读取
|
||||||
|
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||||
|
static int g_numDuplicateIndices = 0; // 找到的冗余read总数
|
||||||
|
static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
||||||
|
static sam_hdr_t *g_outBamHeader; // 输出文件的header
|
||||||
|
static int g_maxJobNum = 0; // 每次读取新的数据后,新增的任务数量
|
||||||
|
static int g_jobNumForRead = 0; // 任务数量降到当前值时开始下一轮读取
|
||||||
|
static volatile int64_t g_bamLoadedNum = 0; // 已经读入的read总数
|
||||||
|
static volatile int64_t g_bamWritenNum = 0; // 已经处理完,写入输出文件的read总数
|
||||||
|
static vector<int64_t> g_vDupIdx; // 线程内部计算得出的
|
||||||
|
static vector<int64_t> g_vOpticalDupIdx;
|
||||||
|
static set<int64_t> g_sDupIdxLatter;
|
||||||
|
static set<int64_t> g_sOpticalDupIdxLatter;
|
||||||
|
|
||||||
|
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
||||||
|
static GlobalArg &g_gArg = GlobalArg::Instance();
|
||||||
|
static MarkDupsArg g_mdArg;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 计算read的分数
|
||||||
|
*/
|
||||||
|
static int16_t computeDuplicateScore(BamWrap &bw)
|
||||||
|
{
|
||||||
|
int16_t score = 0;
|
||||||
|
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
|
||||||
|
{
|
||||||
|
case ns_md::SUM_OF_BASE_QUALITIES:
|
||||||
|
// two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
|
||||||
|
// and risk overflow.
|
||||||
|
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
||||||
|
break;
|
||||||
|
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
|
||||||
|
if (!bw.GetReadUnmappedFlag())
|
||||||
|
// no need to remember the score since this scoring mechanism is symmetric
|
||||||
|
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
||||||
|
break;
|
||||||
|
case ns_md::RANDOM:
|
||||||
|
// The RANDOM score gives the same score to both reads so that they get filtered together.
|
||||||
|
// it's not critical to use the readName since the scores from both ends get added, but it seems
|
||||||
|
// to be clearer this way.
|
||||||
|
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
||||||
|
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
||||||
|
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
|
||||||
|
// not passing filters. We need to stay far from overflow so that when we add the two
|
||||||
|
// scores from the two read mates we do not overflow since that could cause us to choose a
|
||||||
|
// failing read-pair instead of a passing one.
|
||||||
|
score -= INT16_MIN / 4;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
|
||||||
|
// for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
|
||||||
|
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
||||||
|
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Builds a read ends object that represents a single read. 用来表示一个read的特征结构
|
||||||
|
*/
|
||||||
|
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
|
||||||
|
{
|
||||||
|
auto &k = *pKey;
|
||||||
|
auto &bc = bw.b->core;
|
||||||
|
k.read1ReferenceIndex = bc.tid;
|
||||||
|
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
||||||
|
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
||||||
|
k.read1IndexInFile = index;
|
||||||
|
k.score = computeDuplicateScore(bw);
|
||||||
|
// Doing this lets the ends object know that it's part of a pair
|
||||||
|
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
|
||||||
|
{
|
||||||
|
k.read2ReferenceIndex = bc.mtid;
|
||||||
|
}
|
||||||
|
// Fill in the location information for optical duplicates
|
||||||
|
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
||||||
|
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
||||||
|
// 计算位置key
|
||||||
|
k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
|
||||||
|
* quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
|
||||||
|
* read to the records of the first and second in a pair. This value is used for
|
||||||
|
* the 'DI' tag.
|
||||||
|
*/
|
||||||
|
static void addRepresentativeReadIndex(vector<ReadEnds *> &vpRe)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理一组pairend的readends,标记冗余 */
|
||||||
|
static void markDuplicatePairs(vector<ReadEnds *> &vpRe, set<int64_t> *psDupIdx, set<int64_t> *psOpticalDupIdx)
|
||||||
|
{
|
||||||
|
if (vpRe.size() < 2) {
|
||||||
|
if (vpRe.size() == 1)
|
||||||
|
{
|
||||||
|
// addSingletonToCount(libraryIdGenerator);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxScore = 0;
|
||||||
|
ReadEnds *pBestRe = nullptr;
|
||||||
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||||
|
for (auto pe: vpRe) // 找分数最高的readend
|
||||||
|
{
|
||||||
|
if (pe->score > maxScore || pBestRe == nullptr)
|
||||||
|
{
|
||||||
|
maxScore = pe->score;
|
||||||
|
pBestRe = pe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
||||||
|
{
|
||||||
|
// trackOpticalDuplicates
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto pe: vpRe) // 对非best read标记冗余
|
||||||
|
{
|
||||||
|
if (pe != pBestRe) // 非best
|
||||||
|
{
|
||||||
|
psDupIdx->insert(pe->read1IndexInFile); // 添加read1
|
||||||
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||||
|
psDupIdx->insert(pe->read2IndexInFile); // 添加read2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
||||||
|
{
|
||||||
|
addRepresentativeReadIndex(vpRe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 处理一组非paired的readends,标记冗余 */
|
||||||
|
static void markDuplicateFragments(vector<ReadEnds *> &vpRe,
|
||||||
|
bool containsPairs,
|
||||||
|
set<int64_t> *psDupIdx,
|
||||||
|
set<int64_t> *psOpticalDupIdx)
|
||||||
|
{
|
||||||
|
if (containsPairs)
|
||||||
|
{
|
||||||
|
for (auto pe: vpRe)
|
||||||
|
{
|
||||||
|
if (!pe->IsPaired())
|
||||||
|
{
|
||||||
|
psDupIdx->insert(pe->read1IndexInFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int maxScore = 0;
|
||||||
|
ReadEnds *pBest = nullptr;
|
||||||
|
for (auto pe : vpRe)
|
||||||
|
{
|
||||||
|
if (pe->score > maxScore || pBest == nullptr)
|
||||||
|
{
|
||||||
|
maxScore = pe->score;
|
||||||
|
pBest = pe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto pe : vpRe)
|
||||||
|
{
|
||||||
|
if (pe != pBest)
|
||||||
|
{
|
||||||
|
psDupIdx->insert(pe->read1IndexInFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* 多线程处理冗余参数结构体 */
|
/* 多线程处理冗余参数结构体 */
|
||||||
struct ThMarkDupArg
|
struct ThMarkDupArg
|
||||||
{
|
{
|
||||||
vector<BamWrap *> *pvBam;
|
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||||||
int startIdx; // 闭区间
|
long seq; // 当前任务在所有任务的排序
|
||||||
int endIdx; // 开区间
|
bool more; // 后面还有任务
|
||||||
long seq; // 当前任务在所有任务的排序
|
volatile bool finish; // 当前任务有没有处理完
|
||||||
bool more; // 后面还有任务
|
vector<BamWrap *> vBam; // 存放待处理的bam read
|
||||||
volatile bool finish; // 当前任务有没有处理完
|
map<int64_t, vector<ReadEnds>> mvPair; // 以冗余位置为索引,保存所有pairend reads
|
||||||
set<int> sDupIdx; // 冗余read的索引
|
map<int64_t, vector<ReadEnds>> mvFrag; // 保存所有reads,包括pairend
|
||||||
|
map<int64_t, set<int64_t>> msDupIdx; // 冗余read的索引
|
||||||
|
map<int64_t, set<int64_t>> msOpticalDupIdx; // optical冗余read的索引
|
||||||
|
unordered_map<string, ReadEnds> umReadEnds; // 用来寻找pair end
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 多线程查找和标记冗余函数
|
* 多线程查找和标记冗余函数
|
||||||
*/
|
*/
|
||||||
void thread_markdups(void *arg)
|
void thread_markdups(void *arg, int tid)
|
||||||
{
|
{
|
||||||
auto &p = *(ThMarkDupArg *)arg;
|
auto &p = *(ThMarkDupArg *)arg;
|
||||||
|
|
||||||
p.sDupIdx.insert(1);
|
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
||||||
/* 处理数据 */
|
for (int i = 0; i < p.vBam.size(); ++i) // 循环处理每个read
|
||||||
|
{
|
||||||
|
BamWrap *bw = p.vBam[i];
|
||||||
|
const int64_t bamIdx = p.bamStartIdx + i;
|
||||||
|
if (bw->GetReadUnmappedFlag())
|
||||||
|
{
|
||||||
|
if (bw->b->core.tid == -1)
|
||||||
|
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
||||||
|
{
|
||||||
|
ReadEnds fragEnd;
|
||||||
|
buildReadEnds(*bw, bamIdx, g_vRnParser[tid], &fragEnd);
|
||||||
|
p.mvFrag[fragEnd.posKey].push_back(fragEnd); // 添加进frag集合
|
||||||
|
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
||||||
|
{
|
||||||
|
string key = bw->query_name();
|
||||||
|
if (p.umReadEnds.find(key) == p.umReadEnds.end())
|
||||||
|
{
|
||||||
|
p.umReadEnds[key] = fragEnd;
|
||||||
|
}
|
||||||
|
else // 找到了pairend
|
||||||
|
{
|
||||||
|
auto pairedEnds = p.umReadEnds.at(key);
|
||||||
|
p.umReadEnds.erase(key); // 删除找到的pairend
|
||||||
|
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
||||||
|
const int matesCoordinate = fragEnd.read1Coordinate;
|
||||||
|
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
|
||||||
|
// before updating the orientation later.
|
||||||
|
if (bw->GetFirstOfPairFlag())
|
||||||
|
{
|
||||||
|
pairedEnds.orientationForOpticalDuplicates =
|
||||||
|
ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds::R);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pairedEnds.orientationForOpticalDuplicates =
|
||||||
|
ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, bw->GetReadNegativeStrandFlag());
|
||||||
|
}
|
||||||
|
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
|
||||||
|
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
|
||||||
|
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
|
||||||
|
{
|
||||||
|
pairedEnds.read2ReferenceIndex = matesRefIndex;
|
||||||
|
pairedEnds.read2Coordinate = matesCoordinate;
|
||||||
|
pairedEnds.read2IndexInFile = bamIdx;
|
||||||
|
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
|
||||||
|
bw->GetReadNegativeStrandFlag());
|
||||||
|
|
||||||
|
// if the two read ends are in the same position, pointing in opposite directions,
|
||||||
|
// the orientation is undefined and the procedure above
|
||||||
|
// will depend on the order of the reads in the file.
|
||||||
|
// To avoid this, we set it explicitly (to FR):
|
||||||
|
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
||||||
|
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
||||||
|
pairedEnds.orientation == ReadEnds::RF)
|
||||||
|
{
|
||||||
|
pairedEnds.orientation = ReadEnds::FR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
||||||
|
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
||||||
|
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
|
||||||
|
pairedEnds.read1ReferenceIndex = matesRefIndex;
|
||||||
|
pairedEnds.read1Coordinate = matesCoordinate;
|
||||||
|
pairedEnds.read1IndexInFile = bamIdx;
|
||||||
|
pairedEnds.orientation = ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(),
|
||||||
|
pairedEnds.orientation == ReadEnds::R);
|
||||||
|
}
|
||||||
|
|
||||||
|
pairedEnds.score += computeDuplicateScore(*bw);
|
||||||
|
p.mvPair[pairedEnds.posKey].push_back(pairedEnds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
||||||
|
// 先处理 pair
|
||||||
|
int dupNum = 0;
|
||||||
|
vector<ReadEnds *> vRePotentialDup; // 有可能是冗余的reads
|
||||||
|
for (auto &e : p.mvPair) // 按比对的位置先后进行遍历
|
||||||
|
{
|
||||||
|
if (e.second.size() > 1) // 有潜在的冗余
|
||||||
|
{
|
||||||
|
vRePotentialDup.clear();
|
||||||
|
ReadEnds *pReadEnd = nullptr;
|
||||||
|
for (auto &re : e.second)
|
||||||
|
{
|
||||||
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))
|
||||||
|
vRePotentialDup.push_back(&re);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
||||||
|
vRePotentialDup.clear();
|
||||||
|
vRePotentialDup.push_back(&re);
|
||||||
|
pReadEnd = &re;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 再处理frag
|
||||||
|
bool containsPairs = false;
|
||||||
|
bool containsFrags = false;
|
||||||
|
for (auto &e : p.mvFrag)
|
||||||
|
{
|
||||||
|
if (e.second.size() > 1) // 有潜在的冗余
|
||||||
|
{
|
||||||
|
vRePotentialDup.clear();
|
||||||
|
ReadEnds *pReadEnd = nullptr;
|
||||||
|
for (auto &re : e.second)
|
||||||
|
{
|
||||||
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
||||||
|
{
|
||||||
|
vRePotentialDup.push_back(&re);
|
||||||
|
containsPairs = containsPairs || re.IsPaired();
|
||||||
|
containsFrags = containsFrags || !re.IsPaired();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (vRePotentialDup.size() > 1 && containsFrags)
|
||||||
|
{
|
||||||
|
markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
||||||
|
}
|
||||||
|
vRePotentialDup.clear();
|
||||||
|
vRePotentialDup.push_back(&re);
|
||||||
|
pReadEnd = &re;
|
||||||
|
containsPairs = re.IsPaired();
|
||||||
|
containsFrags = !re.IsPaired();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (vRePotentialDup.size() > 1 && containsFrags) {
|
||||||
|
markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cout << tid << '\t' << "dup: " << dupNum << endl;
|
||||||
|
// cout << tid << " all: no: " << p.vBam.size() << '\t' << p.umReadEnds.size() << endl;
|
||||||
/* 本段数据处理完成,告诉输出线程 */
|
/* 本段数据处理完成,告诉输出线程 */
|
||||||
possess(queueFirstLock);
|
POSSESS(g_queueFirstLock);
|
||||||
p.finish = true;
|
p.finish = true;
|
||||||
cout << "process: " << p.seq << endl;
|
// cout << tid << ": process: " << p.seq << endl;
|
||||||
auto front = qpThMarkDupArg.front();
|
auto front = g_qpThMarkDupArg.front();
|
||||||
if (front->finish)
|
if (front->finish)
|
||||||
{
|
{
|
||||||
twist(queueFirstLock, TO, front->seq);
|
TWIST(g_queueFirstLock, TO, front->seq); // 通知写线程,当前队列头部完成的任务
|
||||||
} else {
|
} else {
|
||||||
release(queueFirstLock);
|
RELEASE(g_queueFirstLock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -75,58 +406,73 @@ void thread_write(void *)
|
||||||
{
|
{
|
||||||
bool more = false;
|
bool more = false;
|
||||||
long seq = 0;
|
long seq = 0;
|
||||||
possess(queueFirstLock);
|
long unPairedNum = 0;
|
||||||
wait_for(queueFirstLock, TO_BE, seq++); // 等待首个任务完成
|
POSSESS(g_queueFirstLock);
|
||||||
auto lastP = qpThMarkDupArg.front(); // 取队首的数据
|
WAIT_FOR(g_queueFirstLock, TO_BE, seq++); // 等待首个任务完成
|
||||||
qpThMarkDupArg.pop(); // 删除队首
|
auto lastP = g_qpThMarkDupArg.front(); // 取队首的数据
|
||||||
twist(queueFirstLock, TO, seq);
|
auto umUnpairedReadEnds = lastP->umReadEnds; // 还未找到pair的read
|
||||||
more = lastP->more;
|
auto p = lastP;
|
||||||
|
g_qpThMarkDupArg.pop(); // 删除队首
|
||||||
|
TWIST(g_queueFirstLock, TO, seq); // 解锁
|
||||||
|
more = lastP->more; // 是否还有下一个任务
|
||||||
while (more) // 循环处理,将结果写入文件
|
while (more) // 循环处理,将结果写入文件
|
||||||
{
|
{
|
||||||
possess(queueFirstLock);
|
POSSESS(g_queueFirstLock);
|
||||||
if (qpThMarkDupArg.empty()) // 有可能新任务没来得及添加进队列
|
if (g_qpThMarkDupArg.empty()) // 有可能新任务没来得及添加进队列
|
||||||
{
|
{
|
||||||
release(queueFirstLock);
|
RELEASE(g_queueFirstLock);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
wait_for(queueFirstLock, TO_BE, seq); // 等待任务完成
|
WAIT_FOR(g_queueFirstLock, TO_BE, seq); // 等待任务完成
|
||||||
auto p = qpThMarkDupArg.front();
|
p = g_qpThMarkDupArg.front();
|
||||||
if (!p->finish) // 有可能这个任务没有完成,是下边那个twist导致进到这里,因为这一段代码可能运行比较快
|
if (!p->finish) // 有可能这个任务没有完成,是下边那个TWIST导致进到这里,因为这一段代码可能运行比较快
|
||||||
{
|
{
|
||||||
twist(queueFirstLock, TO, -1); // 此时队首任务没完成,-1可以让锁无法进入到这里,避免无效获得锁
|
TWIST(g_queueFirstLock, TO, -1); // 此时队首任务没完成,-1可以让锁无法进入到这里,避免无效获得锁
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
qpThMarkDupArg.pop();
|
g_qpThMarkDupArg.pop();
|
||||||
twist(queueFirstLock, TO, seq + 1);
|
TWIST(g_queueFirstLock, TO, seq + 1);
|
||||||
|
/* 处理结果数据 */
|
||||||
|
// cout << "finish: " << seq - 1 << '\t' << "lastIdx: " << p->bamStartIdx+p->vBam.size() << endl;
|
||||||
|
|
||||||
/* 处理结果数据 */
|
for (auto &e : p->umReadEnds) // 在当前任务中找有没有与上一个任务中没匹配的read,相匹配的pair
|
||||||
cout << "finish: " << seq - 1 << endl;
|
{
|
||||||
|
if (umUnpairedReadEnds.find(e.first) != umUnpairedReadEnds.end())
|
||||||
|
umUnpairedReadEnds.erase(e.first); // 找到了pair
|
||||||
|
else
|
||||||
|
umUnpairedReadEnds.insert(e); // 没有pair,则添加
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 更新写入read数量和状态 */
|
||||||
|
POSSESS(g_readyToReadLock);
|
||||||
|
g_bamWritenNum += lastP->vBam.size();
|
||||||
|
// cout << "write: " << g_qpThMarkDupArg.size() << endl;
|
||||||
|
if (g_qpThMarkDupArg.size() <= g_jobNumForRead)
|
||||||
|
{
|
||||||
|
TWIST(g_readyToReadLock, TO, 1);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
RELEASE(g_readyToReadLock);
|
||||||
|
}
|
||||||
/* 准备下一轮循环 */
|
/* 准备下一轮循环 */
|
||||||
delete lastP;
|
delete lastP;
|
||||||
more = p->more;
|
more = p->more;
|
||||||
lastP = p;
|
lastP = p;
|
||||||
seq++;
|
seq++;
|
||||||
}
|
}
|
||||||
|
unPairedNum = umUnpairedReadEnds.size();
|
||||||
|
|
||||||
|
cout << "Finally unpaired read num: " << unPairedNum << endl;
|
||||||
|
|
||||||
// 处理最后一个数据
|
// 处理最后一个数据
|
||||||
cout << "finish: " << seq - 1 << endl;
|
POSSESS(g_readyToReadLock);
|
||||||
|
g_bamWritenNum += lastP->vBam.size();
|
||||||
|
TWIST(g_readyToReadLock, TO, 1);
|
||||||
|
// cout << "last finish: " << seq - 1 << endl;
|
||||||
pthread_exit(0);
|
pthread_exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Builds a read ends object that represents a single read.
|
|
||||||
*/
|
|
||||||
static void buildReadEnds(BamWrap &bw, int64_t index, ReadEnds *pKey)
|
|
||||||
{
|
|
||||||
auto &k = *pKey;
|
|
||||||
auto &bc = bw.b->core;
|
|
||||||
k.read1ReferenceIndex = bc.tid;
|
|
||||||
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
|
||||||
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
|
||||||
k.read1IndexInFile = index;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
* mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||||
*/
|
*/
|
||||||
|
|
@ -134,108 +480,160 @@ int MarkDuplicates(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
Timer::log_time("程序开始");
|
Timer::log_time("程序开始");
|
||||||
Timer time_all;
|
Timer time_all;
|
||||||
/* 初始化参数 */
|
|
||||||
GlobalArg &gArg = GlobalArg::Instance();
|
/* 读取命令行参数 */
|
||||||
MarkDupsArg mdArg;
|
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
||||||
vector<AuxVar> vAuxVar;
|
if (g_gArg.num_threads < 1) // 线程数不能小于1
|
||||||
mdArg.parseArgument(argc, argv, &gArg); // 解析命令行参数
|
g_gArg.num_threads = 1;
|
||||||
|
|
||||||
// if (gArg.num_threads > 1) // 多线程处理
|
/* 初始化一些参数和变量*/
|
||||||
if (false)
|
g_vRnParser.resize(g_gArg.num_threads);
|
||||||
|
for (auto &parser : g_vRnParser)
|
||||||
|
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息
|
||||||
|
|
||||||
|
/* 打开输入bam文件 */
|
||||||
|
sam_hdr_t *inBamHeader;
|
||||||
|
samFile *inBamFp;
|
||||||
|
inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
||||||
|
if (!inBamFp)
|
||||||
{
|
{
|
||||||
threadpool thpool = thpool_init(gArg.num_threads); // 创建mark dup所需的线程池
|
Error("[%s] load sam/bam file failed.\n", __func__);
|
||||||
thread *writeth = launch(thread_write, nullptr); // 启动处理结果的的线程
|
return -1;
|
||||||
for (int i = 0; i < 40; ++i)
|
|
||||||
{
|
|
||||||
ThMarkDupArg *thArg = new ThMarkDupArg({nullptr, i, i * 10, i, true, false});
|
|
||||||
if (i == 39)
|
|
||||||
thArg->more = false;
|
|
||||||
possess(queueFirstLock); // 加锁
|
|
||||||
qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
|
|
||||||
release(queueFirstLock); // 解锁
|
|
||||||
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 同步所有线程 */
|
|
||||||
thpool_wait(thpool);
|
|
||||||
thpool_destroy(thpool);
|
|
||||||
join(writeth);
|
|
||||||
} else { // 单线程串行处理
|
|
||||||
/* 打开输入bam文件 */
|
|
||||||
sam_hdr_t *inBamHeader;
|
|
||||||
samFile *inBamFp;
|
|
||||||
inBamFp = sam_open_format(gArg.in_fn.c_str(), "r", nullptr);
|
|
||||||
if (! inBamFp) {
|
|
||||||
Error("[%s] load sam/bam file failed.\n", __func__);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
||||||
inBamHeader = sam_hdr_read(inBamFp);
|
|
||||||
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
|
||||||
htsThreadPool htsPoolWrite = {NULL, 0};
|
|
||||||
htsPoolRead.pool = hts_tpool_init(gArg.num_threads);
|
|
||||||
htsPoolWrite.pool = hts_tpool_init(gArg.num_threads);
|
|
||||||
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
|
||||||
{
|
|
||||||
Error("[%d] failed to set up thread pool", __LINE__);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
|
||||||
|
|
||||||
/* 创建输出文件 */
|
|
||||||
samFile *outBamFp;
|
|
||||||
htsFormat outFormat = {};
|
|
||||||
hts_parse_format(&outFormat, "bam");
|
|
||||||
outBamFp = sam_open_format(gArg.out_fn.c_str(), "wb", &outFormat);
|
|
||||||
hts_set_opt(outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
||||||
hts_set_opt(outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
|
|
||||||
|
|
||||||
// /* 读取缓存初始化 */
|
|
||||||
BamBufType inBamBuf(gArg.use_asyncio);
|
|
||||||
inBamBuf.Init(inBamFp, inBamHeader, gArg.max_mem);
|
|
||||||
|
|
||||||
/* 循环读入信息,并处理 */
|
|
||||||
while (inBamBuf.ReadStat() >= 0)
|
|
||||||
{
|
|
||||||
int readNum = inBamBuf.ReadBam();
|
|
||||||
cout << readNum << endl;
|
|
||||||
// inBamBuf.ClearAll();
|
|
||||||
// cout << inBamBuf.Size() << endl;
|
|
||||||
inBamBuf.ClearBeforeIdx(inBamBuf.Size());
|
|
||||||
// break;
|
|
||||||
for (int i = 0; i < inBamBuf.Size(); ++i) {
|
|
||||||
if (sam_write1(outBamFp, inBamHeader, inBamBuf[i]->b) < 0)
|
|
||||||
{
|
|
||||||
Error("failed writing to \"%s\"", gArg.out_fn.c_str());
|
|
||||||
sam_close(outBamFp);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (readNum == 0)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// int res = -1;
|
|
||||||
// bam1_t *b = bam_init1();
|
|
||||||
// size_t num = 0;
|
|
||||||
// while ((res = sam_read1(inBamFp, inBamHeader, b)) >= 0)
|
|
||||||
// {
|
|
||||||
// ++num;
|
|
||||||
// }
|
|
||||||
// cout << num << endl;
|
|
||||||
|
|
||||||
/* 为每个read创建ReadEnd信息 */
|
|
||||||
|
|
||||||
/* 标记冗余, 将处理后的结果写入文件 */
|
|
||||||
|
|
||||||
/* 关闭文件,收尾清理 */
|
|
||||||
sam_close(outBamFp);
|
|
||||||
sam_close(inBamFp);
|
|
||||||
}
|
}
|
||||||
|
hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
|
inBamHeader = sam_hdr_read(inBamFp); // 读取header
|
||||||
|
|
||||||
// cout << "read ends size: " << sizeof(ReadEnds) << endl;
|
/* 利用线程池对输入输出文件进行读写 */
|
||||||
|
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
||||||
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||||
|
htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||||
|
htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||||
|
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
||||||
|
{
|
||||||
|
Error("[%d] failed to set up thread pool", __LINE__);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||||
|
|
||||||
cout << "总时间: " << time_all.seconds_elapsed() << endl;
|
/* 初始化输出文件 */
|
||||||
|
char modeout[12] = "wb";
|
||||||
|
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
||||||
|
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
||||||
|
g_outBamHeader = sam_hdr_dup(inBamHeader);
|
||||||
|
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
|
||||||
|
{
|
||||||
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||||
|
sam_close(g_outBamFp);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
|
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
|
||||||
|
|
||||||
|
// /* 读取缓存初始化 */
|
||||||
|
BamBufType inBamBuf(g_gArg.use_asyncio);
|
||||||
|
inBamBuf.Init(inBamFp, inBamHeader, g_gArg.max_mem);
|
||||||
|
|
||||||
|
/* 循环读入信息,并处理 */
|
||||||
|
g_maxJobNum = g_gArg.num_threads * 10;
|
||||||
|
// g_maxJobNum = g_gArg.num_threads * 3;
|
||||||
|
g_jobNumForRead = g_gArg.num_threads * 2;
|
||||||
|
|
||||||
|
int64_t x_all = 0; // for test
|
||||||
|
int64_t jobSeq = 0;
|
||||||
|
int64_t processedBamNum = 0; // 记录每个轮次累计处理的reads数量,用来计算每个read在整个文件中的索引位置
|
||||||
|
threadpool thpool = thpool_init(g_gArg.num_threads); // 创建mark dup所需的线程池
|
||||||
|
thread *writeth = LAUNCH(thread_write, nullptr); // 启动处理结果的的线程
|
||||||
|
int bamRemainSize = 0; // 上一轮还剩下的bam数量,包含已经在任务里的和没有放进任务的
|
||||||
|
int numReadsForEachJob = 0; // 每个线程处理的read数量,第一次读取的时候进行设置
|
||||||
|
int lastRoundUnProcessed = 0; // 上一轮没有放进任务里的read数量
|
||||||
|
int curRoundProcessed = 0; // 这一轮放进任务的read数量
|
||||||
|
while (inBamBuf.ReadStat() >= 0)
|
||||||
|
{
|
||||||
|
/* 读取bam文件中的read */
|
||||||
|
int readNum = inBamBuf.ReadBam();
|
||||||
|
if (numReadsForEachJob == 0)
|
||||||
|
numReadsForEachJob = readNum / g_maxJobNum; // 第一次读取bam的时候进行设置
|
||||||
|
g_bamLoadedNum += readNum;
|
||||||
|
|
||||||
|
cout << readNum << endl; // 这一轮读取的bam数量
|
||||||
|
|
||||||
|
/* 多线程处理 任务数是线程数的10倍 */
|
||||||
|
tm_arr[0].acc_start();
|
||||||
|
curRoundProcessed = 0; // 当前轮次已经处理的reads数量
|
||||||
|
int numNeedToProcess = inBamBuf.Size() - bamRemainSize + lastRoundUnProcessed; // 当前需要处理的bam数量
|
||||||
|
for (int i = 0; numNeedToProcess >= numReadsForEachJob; ++i) // 只有待处理的reads数量大于一次任务的数量时,新建任务
|
||||||
|
{
|
||||||
|
int startIdx = i * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
|
||||||
|
int endIdx = (i + 1) * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
|
||||||
|
|
||||||
|
ThMarkDupArg *thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed,
|
||||||
|
jobSeq++,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
inBamBuf.Slice(startIdx, endIdx)});
|
||||||
|
POSSESS(g_queueFirstLock); // 加锁
|
||||||
|
g_qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
|
||||||
|
RELEASE(g_queueFirstLock); // 解锁
|
||||||
|
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
|
||||||
|
curRoundProcessed += endIdx - startIdx;
|
||||||
|
numNeedToProcess -= numReadsForEachJob;
|
||||||
|
}
|
||||||
|
processedBamNum += curRoundProcessed;
|
||||||
|
lastRoundUnProcessed = numNeedToProcess;
|
||||||
|
|
||||||
|
/* 等待可以继续读取的信号 */
|
||||||
|
POSSESS(g_readyToReadLock);
|
||||||
|
WAIT_FOR(g_readyToReadLock, TO_BE, 1);
|
||||||
|
bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
|
||||||
|
|
||||||
|
while (bamRemainSize >= inBamBuf.Size() / 2)
|
||||||
|
{ // 要保留的多于现在有的bam数量的一半,那就等待write线程继续处理
|
||||||
|
TWIST(g_readyToReadLock, TO, 0);
|
||||||
|
POSSESS(g_readyToReadLock);
|
||||||
|
WAIT_FOR(g_readyToReadLock, TO_BE, 1);
|
||||||
|
bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
|
||||||
|
}
|
||||||
|
inBamBuf.ClearBeforeIdx(inBamBuf.Size() - bamRemainSize); // 清理掉已经处理完的reads
|
||||||
|
// cout << g_bamLoadedNum << '\t' << g_bamWritenNum << '\t' << bamRemainSize << '\t' << inBamBuf.Size() << endl;
|
||||||
|
TWIST(g_readyToReadLock, TO, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
/* 数据读完了,放一个空的任务,好让write thread停下来 */
|
||||||
|
ThMarkDupArg *thArg = nullptr;
|
||||||
|
if (lastRoundUnProcessed > 0) // 最后一轮还有没有添加进任务的read数据
|
||||||
|
{
|
||||||
|
thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed, jobSeq++, false, false,
|
||||||
|
inBamBuf.Slice(inBamBuf.Size() - lastRoundUnProcessed, inBamBuf.Size())});
|
||||||
|
processedBamNum += lastRoundUnProcessed;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
thArg = new ThMarkDupArg({0, jobSeq++, false, false});
|
||||||
|
}
|
||||||
|
POSSESS(g_queueFirstLock); // 加锁
|
||||||
|
g_qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
|
||||||
|
RELEASE(g_queueFirstLock); // 解锁
|
||||||
|
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
|
||||||
|
|
||||||
|
/* 同步所有线程 */
|
||||||
|
thpool_wait(thpool);
|
||||||
|
thpool_destroy(thpool);
|
||||||
|
JOIN(writeth);
|
||||||
|
|
||||||
|
cout <<"x_all: " << x_all << endl;
|
||||||
|
cout << "loaded: " << g_bamLoadedNum << endl;
|
||||||
|
cout << "writen: " << g_bamWritenNum << endl;
|
||||||
|
cout << "processedBamNum: " << processedBamNum << endl;
|
||||||
|
/* 标记冗余, 将处理后的结果写入文件 */
|
||||||
|
|
||||||
|
/* 关闭文件,收尾清理 */
|
||||||
|
sam_close(g_outBamFp);
|
||||||
|
sam_close(inBamFp);
|
||||||
|
|
||||||
|
cout << "read ends size: " << sizeof(ReadEnds) << endl;
|
||||||
|
|
||||||
|
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
|
||||||
|
cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||||
Timer::log_time("程序结束");
|
Timer::log_time("程序结束");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -229,6 +229,13 @@ void MarkDupsArg::parseArgument(int argc,
|
||||||
}
|
}
|
||||||
|
|
||||||
gArg.printArgValue();
|
gArg.printArgValue();
|
||||||
|
printArgValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Print the MarkDuplicates-specific argument values to stdout. */
|
||||||
|
void MarkDupsArg::printArgValue()
|
||||||
|
{
|
||||||
|
printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str()); // READ_NAME_REGEX is the only value echoed here
|
||||||
}
|
}
|
||||||
|
|
||||||
// 打印版本信息
|
// 打印版本信息
|
||||||
|
|
@ -239,7 +246,6 @@ void MarkDupsArg::PrintVersion()
|
||||||
|
|
||||||
// 释放资源,关闭文件等
|
// 释放资源,关闭文件等
|
||||||
void MarkDupsArg::Finalize(MarkDupsArg *pMdArg,
|
void MarkDupsArg::Finalize(MarkDupsArg *pMdArg,
|
||||||
vector<AuxVar> *pvAuxVar,
|
|
||||||
GlobalArg *pGArg)
|
GlobalArg *pGArg)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
@ -256,8 +262,8 @@ void MarkDupsArg::PrintHelp()
|
||||||
"\n"
|
"\n"
|
||||||
"Required Arguments:\n"
|
"Required Arguments:\n"
|
||||||
"\n"
|
"\n"
|
||||||
"--INPUT <String> One or more input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n"
|
"--INPUT <String> One input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n"
|
||||||
" argument must be specified at least once.Required.\n"
|
" argument must be specified at least once. Required.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"--METRICS_FILE <File> File to write duplication metrics to Required.\n"
|
"--METRICS_FILE <File> File to write duplication metrics to Required.\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,9 @@ Author : Zhang Zhonghai
|
||||||
Date : 2023/10/23
|
Date : 2023/10/23
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifndef MARKDUPS_ARG_H_
|
||||||
|
#define MARKDUPS_ARG_H_
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|
@ -104,15 +107,6 @@ namespace ns_md {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Auxiliary variables used inside a worker thread.
struct AuxVar
{
    static const int MIN_QSUM_QSCORE = 13;
    static const int REF_CONTEXT_PAD = 3;
    static const int REFERENCE_HALF_WINDOW_LENGTH = 150;

    // Given a defined starting value so that a default-constructed AuxVar
    // never exposes an indeterminate double.
    double contaminantAlternateFraction = 0.0;
};
|
|
||||||
|
|
||||||
/* markduplicate 需要的参数*/
|
/* markduplicate 需要的参数*/
|
||||||
struct MarkDupsArg
|
struct MarkDupsArg
|
||||||
{
|
{
|
||||||
|
|
@ -303,12 +297,15 @@ struct MarkDupsArg
|
||||||
char **argv,
|
char **argv,
|
||||||
GlobalArg *pGArg);
|
GlobalArg *pGArg);
|
||||||
|
|
||||||
|
void printArgValue();
|
||||||
|
|
||||||
static void PrintHelp();
|
static void PrintHelp();
|
||||||
|
|
||||||
static void PrintVersion();
|
static void PrintVersion();
|
||||||
|
|
||||||
// 释放资源,关闭文件等
|
// 释放资源,关闭文件等
|
||||||
static void Finalize(MarkDupsArg *pMdArg,
|
static void Finalize(MarkDupsArg *pMdArg,
|
||||||
vector<AuxVar> *pvAuxVar,
|
|
||||||
GlobalArg *pGArg);
|
GlobalArg *pGArg);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -0,0 +1,115 @@
|
||||||
|
/*
|
||||||
|
Description: read ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2023/11/3
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef READ_ENDS_H_
|
||||||
|
#define READ_ENDS_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
/**
 * Physical position of a sequencing cluster: the tile it belongs to and its
 * x / y coordinates.
 *
 * Every field starts at -1, which stands for "not available". The x and y
 * coordinates are 32-bit on purpose (rather than 16-bit) so that they do not
 * overflow within a HiSeqX tile.
 */
struct PhysicalLocation
{
    int16_t tile{-1}; // tile number, -1 when unknown
    int32_t x{-1};    // x coordinate, -1 when unknown
    int32_t y{-1};    // y coordinate, -1 when unknown
};
|
||||||
|
|
||||||
|
/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/
|
||||||
|
struct ReadEnds : PhysicalLocation
|
||||||
|
{
|
||||||
|
/* ReadEnds中的成员变量 */
|
||||||
|
/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
|
||||||
|
static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
|
||||||
|
// int16_t libraryId; // 没用,不考虑多样本
|
||||||
|
int8_t orientation;
|
||||||
|
int32_t read1ReferenceIndex = -1;
|
||||||
|
int32_t read1Coordinate = -1;
|
||||||
|
int32_t read2ReferenceIndex = -1;
|
||||||
|
int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
|
||||||
|
/* Additional information used to detect optical dupes */
|
||||||
|
// int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read group,normal或者tumor
|
||||||
|
/** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
|
||||||
|
int8_t orientationForOpticalDuplicates = -1;
|
||||||
|
/** A *transient* flag marking this read end as being an optical duplicate. */
|
||||||
|
bool isOpticalDuplicate = false;
|
||||||
|
|
||||||
|
/* ReadEndsForMarkDuplicates中的成员变量 */
|
||||||
|
/** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
|
||||||
|
int16_t score = 0;
|
||||||
|
int64_t read1IndexInFile = -1;
|
||||||
|
int64_t read2IndexInFile = -1;
|
||||||
|
int64_t duplicateSetSize = -1;
|
||||||
|
|
||||||
|
/* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */
|
||||||
|
// int32_t barcode = 0; // primary barcode for this read (and pair)
|
||||||
|
// int32_t readOneBarcode = 0; // read one barcode, 0 if not present
|
||||||
|
// int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
|
||||||
|
|
||||||
|
/* zzh增加的成员变量 */
|
||||||
|
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
||||||
|
|
||||||
|
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
||||||
|
static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand)
|
||||||
|
{
|
||||||
|
if (read1NegativeStrand)
|
||||||
|
{
|
||||||
|
if (read2NegativeStrand)
|
||||||
|
return RR;
|
||||||
|
else
|
||||||
|
return RF;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (read2NegativeStrand)
|
||||||
|
return FR;
|
||||||
|
else
|
||||||
|
return FF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 比较两个readends是否一样(有个冗余) */
|
||||||
|
static bool AreComparableForDuplicates(ReadEnds &lhs, ReadEnds &rhs, bool compareRead2)
|
||||||
|
{
|
||||||
|
bool areComparable = true;
|
||||||
|
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
|
||||||
|
lhs.read1Coordinate == rhs.read1Coordinate &&
|
||||||
|
lhs.orientation == rhs.orientation;
|
||||||
|
if (areComparable && compareRead2)
|
||||||
|
{
|
||||||
|
areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
|
||||||
|
lhs.read2Coordinate == rhs.read2Coordinate;
|
||||||
|
}
|
||||||
|
return areComparable;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 比对方向是否正向 */
|
||||||
|
bool IsForwardStrand()
|
||||||
|
{
|
||||||
|
return orientation == F;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* pairend是否合适的比对上了 */
|
||||||
|
bool IsPaired()
|
||||||
|
{
|
||||||
|
return read2ReferenceIndex != -1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -0,0 +1,218 @@
|
||||||
|
/*
|
||||||
|
Description: 解析read的name中的信息,比如tile, x, y等
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2023/11/6
|
||||||
|
*/
|
||||||
|
#ifndef READ_NAME_PARSER_H_
|
||||||
|
#define READ_NAME_PARSER_H_
|
||||||
|
|
||||||
|
#include "read_ends.h"
|
||||||
|
#include <common/utils/util.h>
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string>
|
||||||
|
// #include <regex>
|
||||||
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
|
// using std::regex;
|
||||||
|
using boost::cmatch;
|
||||||
|
using boost::regex;
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides access to the physical location information about a cluster.
|
||||||
|
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
|
||||||
|
* non-zero positive integers, x and y coordinates may be negative.
|
||||||
|
* 非线程安全
|
||||||
|
*/
|
||||||
|
struct ReadNameParser
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
|
||||||
|
* and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
|
||||||
|
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
||||||
|
* which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
|
||||||
|
* ignoring any trailing non-digit characters.
|
||||||
|
*
|
||||||
|
* The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
|
||||||
|
* three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
|
||||||
|
* where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
|
||||||
|
* Illumina technology.
|
||||||
|
*/
|
||||||
|
const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
||||||
|
|
||||||
|
string readNameStored = "";
|
||||||
|
PhysicalLocation physicalLocationStored;
|
||||||
|
int tmpLocationFields[3]; // for optimization of addLocationInformation
|
||||||
|
bool useOptimizedDefaultParsing = true; // was the regex default?
|
||||||
|
string readNameRegex = DEFAULT_READ_NAME_REGEX;
|
||||||
|
regex readNamePattern;
|
||||||
|
bool warnedAboutRegexNotMatching = true;
|
||||||
|
|
||||||
|
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
||||||
|
ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
|
||||||
|
ReadNameParser(const string &strReadNameRegex, bool isWarn)
|
||||||
|
{
|
||||||
|
readNameRegex = strReadNameRegex;
|
||||||
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||||||
|
useOptimizedDefaultParsing = true;
|
||||||
|
else
|
||||||
|
useOptimizedDefaultParsing = false;
|
||||||
|
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||||
|
warnedAboutRegexNotMatching = isWarn;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 重新设置readNameRegex */
|
||||||
|
void SetReadNameRegex(const string &strReadNameRegex)
|
||||||
|
{
|
||||||
|
readNameRegex = strReadNameRegex;
|
||||||
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||||||
|
useOptimizedDefaultParsing = true;
|
||||||
|
else
|
||||||
|
useOptimizedDefaultParsing = false;
|
||||||
|
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||||||
|
// readNamePattern = strReadNameRegex;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 添加测序时候的tile x y 信息 */
|
||||||
|
bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
|
||||||
|
{
|
||||||
|
if (!(readName == readNameStored))
|
||||||
|
{
|
||||||
|
if (ReadLocationInformation(readName, loc))
|
||||||
|
{
|
||||||
|
readNameStored = readName;
|
||||||
|
physicalLocationStored = *loc;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// return false if read name cannot be parsed
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*loc = physicalLocationStored;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
|
||||||
|
* can be used later to determine optical duplication
|
||||||
|
*
|
||||||
|
* @param readName the name of the read/cluster
|
||||||
|
* @param loc the object to add tile/x/y to
|
||||||
|
* @return true if the read name contained the information in parsable form, false otherwise
|
||||||
|
*/
|
||||||
|
bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
// Optimized version if using the default read name regex (== used on purpose):
|
||||||
|
if (useOptimizedDefaultParsing)
|
||||||
|
{
|
||||||
|
const int fields = getLastThreeFields(readName, ':');
|
||||||
|
if (!(fields == 5 || fields == 7))
|
||||||
|
{
|
||||||
|
if (warnedAboutRegexNotMatching)
|
||||||
|
{
|
||||||
|
Warn(
|
||||||
|
"Default READ_NAME_REGEX '%s' did not match read name '%s'."
|
||||||
|
"You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
|
||||||
|
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
||||||
|
readNameRegex.c_str(), readName.c_str());
|
||||||
|
warnedAboutRegexNotMatching = false;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
loc->tile = (int16_t)tmpLocationFields[0];
|
||||||
|
loc->x = tmpLocationFields[1];
|
||||||
|
loc->y = tmpLocationFields[2];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else if (readNameRegex.empty())
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Standard version that will use the regex
|
||||||
|
cmatch m;
|
||||||
|
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
||||||
|
loc->tile = std::stoi(m[1].str());
|
||||||
|
loc->x = std::stoi(m[2].str());
|
||||||
|
loc->y = std::stoi(m[3].str());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (warnedAboutRegexNotMatching)
|
||||||
|
{
|
||||||
|
Warn(
|
||||||
|
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
||||||
|
"Your regex may not be correct. "
|
||||||
|
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
||||||
|
readNameRegex.c_str(), readName.c_str());
|
||||||
|
warnedAboutRegexNotMatching = false;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (const std::runtime_error &e)
|
||||||
|
{
|
||||||
|
if (warnedAboutRegexNotMatching)
|
||||||
|
{
|
||||||
|
Warn(
|
||||||
|
"A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
|
||||||
|
readNameRegex.c_str(), readName.c_str(), e.what());
|
||||||
|
warnedAboutRegexNotMatching = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
|
||||||
|
* considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
|
||||||
|
*
|
||||||
|
* @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
|
||||||
|
*/
|
||||||
|
int getLastThreeFields(const string &readName, char delim)
|
||||||
|
{
|
||||||
|
int tokensIdx = 2; // start at the last token
|
||||||
|
int numFields = 0;
|
||||||
|
int i, endIdx;
|
||||||
|
endIdx = readName.size();
|
||||||
|
// find the last three tokens only
|
||||||
|
for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
|
||||||
|
{
|
||||||
|
if (readName.at(i) == delim || 0 == i)
|
||||||
|
{
|
||||||
|
numFields++;
|
||||||
|
const int startIdx = (0 == i) ? 0 : (i + 1);
|
||||||
|
tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||||||
|
tokensIdx--;
|
||||||
|
endIdx = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// continue to find the # of fields
|
||||||
|
while (0 <= i)
|
||||||
|
{
|
||||||
|
if (readName.at(i) == delim || 0 == i)
|
||||||
|
numFields++;
|
||||||
|
i--;
|
||||||
|
}
|
||||||
|
if (numFields < 3)
|
||||||
|
{
|
||||||
|
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
|
||||||
|
numFields = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return numFields;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
Loading…
Reference in New Issue