并行处理框架搭建完成,基本完成了工作线程处理逻辑

This commit is contained in:
zzh 2023-11-09 21:07:58 +08:00
parent a3a0b64ef2
commit 97d35a42e9
22 changed files with 1264 additions and 677 deletions

2
.vscode/launch.json vendored
View File

@ -13,7 +13,7 @@
"program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [
"MarkDuplicates",
"--INPUT", "test.bam",
"--INPUT", "/mnt/d/data/100w.bam",
"--OUTPUT", "out.bam",
"--METRICS_FILE", "metrics.txt",
"--num_threads", "12",

86
.vscode/settings.json vendored
View File

@ -4,6 +4,90 @@
"vector": "cpp",
"random": "cpp",
"ostream": "cpp",
"*.tcc": "cpp"
"*.tcc": "cpp",
"new": "cpp",
"iostream": "cpp",
"mutex": "cpp",
"shared_mutex": "cpp",
"syncstream": "cpp",
"condition_variable": "cpp",
"future": "cpp",
"*.ipp": "cpp",
"cctype": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"csignal": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"any": "cpp",
"array": "cpp",
"atomic": "cpp",
"strstream": "cpp",
"barrier": "cpp",
"bit": "cpp",
"bitset": "cpp",
"cfenv": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"cinttypes": "cpp",
"codecvt": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"coroutine": "cpp",
"csetjmp": "cpp",
"cstdint": "cpp",
"deque": "cpp",
"forward_list": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"unordered_set": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"ratio": "cpp",
"regex": "cpp",
"source_location": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"rope": "cpp",
"slist": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"istream": "cpp",
"latch": "cpp",
"limits": "cpp",
"numbers": "cpp",
"ranges": "cpp",
"scoped_allocator": "cpp",
"semaphore": "cpp",
"span": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"typeindex": "cpp",
"typeinfo": "cpp",
"valarray": "cpp",
"variant": "cpp"
}
}

BIN
out.bam

Binary file not shown.

7
run.sh
View File

@ -1,11 +1,12 @@
/home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \
--INPUT /mnt/d/data/zy_normal.bam \
--INPUT /mnt/d/data/zy_tumor.bam \
--OUTPUT out.bam \
--num_threads 12 \
--num_threads 16 \
--max_mem 4G \
--verbosity DEBUG \
--asyncio true
--asyncio true #\
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
# --INPUT /mnt/d/data/100w.bam \

View File

@ -13,9 +13,11 @@ AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam/markdups SAM_MARKDUPS_SRC)
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/src")
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib")
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/boost/include")
#
LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/boost/lib")
#
set(PG_NAME "picard_cpp")

View File

@ -24,7 +24,6 @@
#include <htslib/sam.h>
#include "interval.h"
#include "bam_wrap.h"
using std::vector;
@ -138,6 +137,15 @@ struct AsyncIoBamBuf
{
return bam_arr_[pos];
}
// Return a copy of the read pointers stored in bam_arr_ for the half-open
// index range [startIdx, endIdx). An empty vector is returned when
// endIdx <= startIdx.
// NOTE(review): the indices are not checked against the size of bam_arr_;
// callers are presumably expected to pass in-range values -- confirm.
inline vector<BamWrap*> Slice(size_t startIdx, size_t endIdx)
{
    if (endIdx <= startIdx) {
        return vector<BamWrap *>();
    }
    auto begItr = bam_arr_.begin();
    // Return the temporary directly: wrapping it in std::move() would turn a
    // guaranteed-elidable return into a forced move construction.
    return vector<BamWrap *>(begItr + startIdx, begItr + endIdx);
}
// 同步读取
int sync_read_bam();

View File

@ -119,11 +119,11 @@ struct BamWrap
char base = base_to_char[bam_seqi(seq, i)];
oss << base;
}
return oss.str();
return std::move(oss.str());
}
// 获取名字
inline std::string query_name()
inline const char *query_name()
{
return bam_get_qname(b);
}
@ -139,7 +139,7 @@ struct BamWrap
const int len = bam_cigar_oplen(cigar[i]);
oss << len << c;
}
return oss.str();
return std::move(oss.str());
}
// 占用的内存大小
@ -309,6 +309,125 @@ struct BamWrap
return end_pos;
}
/**
 * Score of the read: the sum of its base qualities, counting only bases
 * whose quality is at least 15.
 */
inline int GetSumOfBaseQualities()
{
    const uint8_t *quals = bam_get_qual(b);
    int total = 0;
    for (int idx = 0; idx < b->core.l_qseq; ++idx)
    {
        const uint8_t q = quals[idx];
        if (q < 15)
            continue; // low-quality bases do not contribute
        total += q;
    }
    return total;
}
/* Flag-based predicates */
/* True when the read itself is unmapped (BAM_FUNMAP is set). */
inline bool GetReadUnmappedFlag()
{
    return (b->core.flag & BAM_FUNMAP) != 0;
}
/* True when the template has multiple segments in sequencing (BAM_FPAIRED is set). */
inline bool GetReadPairedFlag()
{
    return (b->core.flag & BAM_FPAIRED) != 0;
}
/**
 * True when the read fails platform/vendor quality checks (BAM_FQCFAIL is set).
 */
inline bool GetReadFailsVendorQualityCheckFlag()
{
    return (b->core.flag & BAM_FQCFAIL) != 0;
}
/**
 * True when the mate is unmapped (BAM_FMUNMAP is set).
 */
bool GetMateUnmappedFlag()
{
    return (b->core.flag & BAM_FMUNMAP) != 0;
}
/**
 * @return whether the alignment is secondary (an alternative alignment of
 *         the read), i.e. BAM_FSECONDARY is set.
 */
bool IsSecondaryAlignment()
{
    return (b->core.flag & BAM_FSECONDARY) != 0;
}
/**
 * @return whether the alignment is supplementary (a split alignment such as
 *         a chimeric alignment), i.e. BAM_FSUPPLEMENTARY is set.
 */
bool GetSupplementaryAlignmentFlag()
{
    return (b->core.flag & BAM_FSUPPLEMENTARY) != 0;
}
/*
* Tests if this record is either a secondary and/or supplementary alignment;
*/
bool IsSecondaryOrSupplementary()
{
return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag();
}
/**
 * True when the read is the first read in a pair (BAM_FREAD1 is set).
 */
bool GetFirstOfPairFlag()
{
    return (b->core.flag & BAM_FREAD1) != 0;
}
/**
 * Strand of the query: false for forward, true for reverse (BAM_FREVERSE).
 */
bool GetReadNegativeStrandFlag()
{
    return (b->core.flag & BAM_FREVERSE) != 0;
}
/**
 * Strand of the mate: false for forward, true for reverse (BAM_FMREVERSE).
 */
bool GetMateNegativeStrandFlag()
{
    return (b->core.flag & BAM_FMREVERSE) != 0;
}
/* 其他的一些信息 */
inline int GetReferenceLength()
{
int length = 0;
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
for (int i = 0; i < bc.n_cigar; ++i)
{
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
switch (c)
{
case 'M':
case 'D':
case 'N':
case '=':
case 'X':
length += len;
break;
default:
break;
}
}
return length;
}
// 计算bam的全局位置算上染色体序号和比对位置
static inline int64_t bam_global_pos(bam1_t *b)
{

View File

@ -1,295 +0,0 @@
/*
Description: intervals
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/24
*/
#include "interval.h"
#include <algorithm>
#include <sstream>
#include <fstream>
#include <string>
#include <iostream>
#include <htslib/sam.h>
#include "../utils/util.h"
#include "bam_wrap.h"
using std::min;
using std::max;
using std::string;
using std::ifstream;
using std::stringstream;
using namespace std;
// Constructors
// Default constructor: delegates to Interval(0, 0), so left == right == 0.
Interval::Interval() : Interval(0, 0) {}
// Build an interval from its left bound l and right bound r (stored as given, no ordering check).
Interval::Interval(int64_t l, int64_t r) : left(l), right(r) {}
// Ordering: compare by left bound first; equal left bounds are ordered by
// right bound.
bool Interval::operator<(const Interval& other) {
    return (left != other.left) ? (left < other.left)
                                : (right < other.right);
}
// Overlap test with both bounds treated as inclusive: the intervals overlap
// unless one of them lies entirely to one side of the other.
bool Interval::overlaps(const Interval &other) {
    const bool disjoint = left > other.right || right < other.left;
    return !disjoint;
}
// Grow this interval in place so that it also covers `other`
// (smallest left bound, largest right bound). Returns *this.
Interval& Interval::spanWith(const Interval &other) {
    if (other.left < left) {
        left = other.left;
    }
    if (other.right > right) {
        right = other.right;
    }
    return *this;
}
// Intersection of this interval with `that`, returned as a new interval;
// this interval is not modified. No check is made that the two overlap, so
// the result has left > right when they do not.
Interval Interval::intersect(const Interval &that) const {
    return Interval(max(left, that.left), min(right, that.right));
}
/*
 * Append to r_arr the overlapping parts of the intervals in a_arr and b_arr.
 *
 * The two arrays are consumed together in order of increasing left bound.
 * Returns immediately, leaving r_arr untouched, when either input is empty.
 * r_arr is only appended to; it is never cleared here.
 * NOTE(review): the sweep presumably requires each input to be sorted by
 * left bound and free of internal overlaps -- confirm with callers.
 */
void Interval::IntersectIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr) {
if (a_arr.size() < 1 || b_arr.size() < 1) return;
int ai=0, bi=0;
// `last` is the interval currently being compared against; `cur` is the next one taken from either array.
const Interval *last, *cur;
// Seed `last` with whichever array starts further left.
if (a_arr[ai].left < b_arr[bi].left) last = &a_arr[ai++];
else last = &b_arr[bi++];
// Main sweep: runs while both arrays still have unread intervals.
while (ai < a_arr.size() && bi < b_arr.size()) {
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
else cur = &b_arr[bi++];
if (last->right < cur->left) {
// No overlap: cur becomes the new reference interval.
last = cur; continue;
} else if (last->right > cur->right) {
// cur ends before last does: all of cur is emitted, last is kept.
r_arr->push_back(*cur);
} else {
// Partial overlap: emit [cur->left, last->right], then continue from cur.
r_arr->push_back(Interval(cur->left, last->right));
last = cur;
}
}
// One array is exhausted; compare the remainder of the other against `last`.
const IntervalArr *arrp;
int ii;
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai;}
else { arrp = &b_arr; ii = bi; }
const IntervalArr &arr = *arrp;
while(ii < arr.size()) {
cur = &arr[ii++];
if (last->right < cur->left) {
// Remaining intervals start after `last` ends: stop.
break;
} else if (last->right > cur->right) {
r_arr->push_back(*cur);
} else {
// cur reaches to or past the end of `last`: emit the overlap and stop.
r_arr->push_back(Interval(cur->left, last->right));
break;
}
}
}
/*
 * Store in r_arr the union of the intervals in a_arr and b_arr.
 *
 * The previous content of r_arr is replaced. When one input is empty,
 * r_arr becomes a plain copy of the other input (no merging is done).
 * Otherwise both arrays are consumed in order of increasing left bound and
 * intervals are merged whenever the next one starts at or before the right
 * bound of the interval being accumulated.
 * NOTE(review): presumably both inputs must already be sorted by left
 * bound -- confirm with callers.
 */
void Interval::UnionIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr) {
// `tmp` holds the merged interval under construction; `last` points at it.
Interval tmp;
const Interval *cur;
Interval *last;
int ai=0, bi=0;
if (a_arr.size() < 1) { *r_arr = b_arr; return; }
if (b_arr.size() < 1) { *r_arr = a_arr; return; }
r_arr->clear();
// Start from whichever array begins further left.
if (a_arr[ai].left < b_arr[bi].left) tmp = a_arr[ai++];
else tmp = b_arr[bi++];
last = &tmp;
// Main sweep: runs while both arrays still have unread intervals.
while(ai < a_arr.size() && bi < b_arr.size()) {
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
else cur = &b_arr[bi++];
if (last->right < cur->left) {
// Gap: the accumulated interval is complete, start a new one from cur.
r_arr->push_back(*last);
*last = *cur;
} else {
// Overlap or contact: extend the accumulated interval.
last->right = max(last->right, cur->right);
}
}
// Fold in the remainder of whichever array is not yet exhausted.
const IntervalArr *arrp;
int ii;
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai; }
else { arrp = &b_arr; ii = bi; }
const IntervalArr &arr = *arrp;
while(ii < arr.size()) {
cur = &arr[ii++];
if (last->right < cur->left) {
r_arr->push_back(*last);
*last = *cur;
} else {
last->right = max(last->right, cur->right);
}
}
// Emit the interval still being accumulated.
r_arr->push_back(*last);
}
/*
 * Compute the intervals to process in the window [start_loc, *end_loc).
 *
 * Steps, as implemented below:
 *  1. Take the union of n_arr and t_arr; when in_arr is non-empty, intersect
 *     that union with in_arr.
 *  2. Clip the result to the window and append it to r_arr (r_arr is not
 *     cleared first).
 *  3. Drop from in_arr every interval that ends before the window end. If
 *     the first remaining interval starts after the window end, *end_loc is
 *     moved forward to that start; otherwise that interval's left bound is
 *     moved to the window end. in_arr is emptied when nothing remains.
 *
 * @param n_arr     first interval array (read-only)
 * @param t_arr     second interval array (read-only)
 * @param in_arr    restricting intervals; modified as described in step 3
 * @param start_loc window start, inclusive
 * @param end_loc   window end, exclusive; may be advanced (step 3)
 * @param r_arr     output; clipped intervals are appended
 * @return total number of positions (right - left + 1) summed over every
 *         interval in r_arr, including any present before the call
 */
int64_t Interval::MergeIntervals(const IntervalArr &n_arr,
const IntervalArr &t_arr,
IntervalArr &in_arr,
int64_t start_loc, // inclusive
int64_t *end_loc, // exclusive
IntervalArr *r_arr) {
IntervalArr tmp_arr;
// Snapshot of the window end on entry; *end_loc may be rewritten below.
const int64_t end_loc_val = *end_loc;
if (in_arr.size() < 1) { // no restricting intervals: use the union of n_arr and t_arr as is
UnionIntervals(n_arr, t_arr, &tmp_arr);
} else {
IntervalArr mid_arr;
UnionIntervals(n_arr, t_arr, &mid_arr);
IntersectIntervals(mid_arr, in_arr, &tmp_arr);
}
// Clip at the window end: walk from the back, dropping intervals that start
// at or after the end, then truncate the first one that starts before it.
for(int i=tmp_arr.size()-1; i>=0; --i) {
if (tmp_arr[i].left >= end_loc_val) {
tmp_arr.pop_back(); // i is the last index here, so this removes tmp_arr[i]
continue;
}
tmp_arr[i].right = min(tmp_arr[i].right, end_loc_val - 1); // end_loc is exclusive
break;
}
// Clip at the window start and append the survivors to r_arr.
for (int i=0; i<tmp_arr.size(); ++i) {
if (tmp_arr[i].right < start_loc) {
continue;
}
if (tmp_arr[i].left < start_loc) {
r_arr->push_back(Interval(start_loc, tmp_arr[i].right));
} else {
r_arr->push_back(tmp_arr[i]);
}
}
// Find the first restricting interval that is not entirely before the window end.
int next_i = 0;
while(next_i < in_arr.size() && in_arr[next_i].right < end_loc_val) ++next_i;
if (next_i < in_arr.size()) {
if (end_loc_val < in_arr[next_i].left) {
*end_loc = in_arr[next_i].left; // advance the end of this round to the next restricting interval
} else {
in_arr[next_i].left = end_loc_val; // trim the part of this restricting interval covered by the window
}
// Shift the remaining restricting intervals to the front and drop the rest.
int i=0, j=next_i;
for (; j<in_arr.size(); ++i, ++j) {
in_arr[i] = in_arr[j];
}
in_arr.resize(i);
} else {
in_arr.clear();
}
// Count the positions covered by everything now in r_arr (bounds inclusive).
int64_t locus_num = 0;
for (int i=0; i<r_arr->size(); ++i) {
locus_num += (*r_arr)[i].right - (*r_arr)[i].left + 1;
}
return locus_num;
}
/*
* interval
*/
void Interval::ReadInterval(const string &interval_fn,
bam_hdr_t* header,
int interval_padding,
IntervalArr *r_arr) {
ifstream interval_fs(interval_fn);
string one_line;
IntervalArr intervals;
getline(interval_fs, one_line);
while (!interval_fs.eof()) {
if (one_line[0] == '@') {
getline(interval_fs, one_line);
continue;
}
stringstream ss_line(one_line);
string contig_name;
ss_line >> contig_name;
int itid = sam_hdr_name2tid(header, contig_name.c_str());
if (itid < 0) Error("[%s] interval file has unknown contig name [%s]\n", __func__, contig_name.c_str());
int64_t tid = (int64_t)itid;
tid <<= CONTIG_SHIFT;
int64_t start, stop;
ss_line >> start >> stop;
// interval文件是1-based所以这里要减去1
intervals.push_back(Interval(tid + start - 1, tid + stop -1));
getline(interval_fs, one_line);
}
sort(intervals.begin(), intervals.end());
if (intervals.size() > 0) {
Interval new_span(intervals[0].left-interval_padding, intervals[0].right+interval_padding);
for (int i=1; i<intervals.size(); ++i) {
if (intervals[i].left - interval_padding > new_span.right) {
r_arr->push_back(new_span);
new_span.left = intervals[i].left - interval_padding;
new_span.right = intervals[i].right + interval_padding;
} else {
new_span.right = max(new_span.right, intervals[i].right + interval_padding);
}
}
r_arr->push_back(new_span);
}
interval_fs.close();
}
/*
 * Compact an interval array in place by merging neighbours.
 *
 * Consecutive intervals are merged when the next one starts no later than
 * one position past the right bound of the interval being accumulated
 * (i.e. overlapping or directly adjacent intervals are joined). An empty
 * array is left untouched.
 *
 * The merged right bound is the maximum of the right bounds seen, so an
 * interval nested inside an earlier, longer one can no longer shorten the
 * merged result.
 * NOTE(review): the input is presumably sorted by left bound -- confirm.
 */
void Interval::ShrinkInterval(IntervalArr *ivap) {
    if (ivap->size() < 1) return;
    IntervalArr &iva = *ivap;
    // iva[out] is the merged interval under construction; everything before
    // it is final. Compacting in place avoids copying the whole array.
    size_t out = 0;
    for (size_t i = 1; i < iva.size(); ++i) {
        if (iva[out].right + 1 < iva[i].left) {
            // Real gap: start a new merged interval.
            ++out;
            iva[out] = iva[i];
        } else {
            iva[out].right = max(iva[out].right, iva[i].right);
        }
    }
    iva.erase(iva.begin() + out + 1, iva.end());
}
/*
 * Widen [start, end] by expandVal on both sides and clip the result to the
 * contig that `start` belongs to, using the contig length from the header.
 * Returns the widened interval; the inputs are global coordinates as
 * decoded by BamWrap::bam_tid / BamWrap::bam_global_pos.
 */
Interval Interval::ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header) {
    const int64_t padded_left = start - expandVal;
    const int64_t padded_right = end + expandVal;
    int tid = BamWrap::bam_tid(start);
    uint32_t contig_len = header->target_len[tid];

    Interval clipped;
    // Lower limit: global coordinate of the first position of the contig.
    clipped.left = max(BamWrap::bam_global_pos(tid, 0), padded_left);
    // Upper limit: global coordinate of the last position of the contig.
    clipped.right = min(padded_right, contig_len - 1 + BamWrap::bam_global_pos(tid, 0));
    return clipped;
}

View File

@ -1,101 +0,0 @@
/*
Description: intervals
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/24
*/
#ifndef INTERVAL_H_
#define INTERVAL_H_
#include <stdint.h>
#include <vector>
#include <string>
#include <sstream>
#include <htslib/sam.h>
#include "bam_wrap.h"
using namespace std;
// 前向声明
class Interval;
typedef std::vector<Interval> IntervalArr;
/*
 * An interval described by a left and a right bound, plus static helpers
 * that operate on arrays of intervals (IntervalArr).
 * The overlap test and the position counting in the implementation treat
 * both bounds as inclusive.
 */
struct Interval {
// Constants
// Number of bits the contig index is shifted left by before the position is
// added, when ReadInterval builds a global coordinate.
const static int CONTIG_SHIFT = 30;
// Data members
int64_t left;
int64_t right;
// Constructors
Interval();
explicit Interval(int64_t l, int64_t r);
// Ordering: by left bound, then by right bound
bool operator<(const Interval &other);
// Whether this interval overlaps `other`
bool overlaps(const Interval &other);
// Extend this interval so that it also covers `other`; modifies this interval
Interval& spanWith(const Interval &other);
// Return the intersection of the two intervals; this interval is not modified
Interval intersect(const Interval &that) const;
// for debug: formats as "<tid+1>:<pos(left)+1>-<pos(right)+1>" using BamWrap's decoding helpers
string toString() const {
ostringstream oss;
oss << BamWrap::bam_tid(left) + 1 << ":"
<< BamWrap::bam_pos(left) + 1 << "-"
<< BamWrap::bam_pos(right) + 1;
return oss.str();
}
/*
 * Intersect two interval arrays; the overlapping parts are appended to r_arr
 */
static void IntersectIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr);
/*
 * Union of two interval arrays; the result replaces the content of r_arr
 */
static void UnionIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr);
/*
 * Combine n_arr and t_arr (restricted by in_arr when it is non-empty),
 * clip to [start_loc, *end_loc) and append to r_arr; returns the number of
 * positions covered by r_arr
 */
static int64_t MergeIntervals(const IntervalArr &n_arr,
const IntervalArr &t_arr,
IntervalArr &in_arr, // modified by the call
int64_t start_loc, // inclusive
int64_t *end_loc, // exclusive, may be modified by the call
IntervalArr *r_arr);
/*
 * Read intervals from a file, pad them by interval_padding and append the
 * merged result to r_arr
 */
static void ReadInterval(const std::string &interval_fn,
bam_hdr_t* header,
int interval_padding,
IntervalArr *r_arr);
/*
 * Merge overlapping or adjacent neighbours of an interval array in place
 */
static void ShrinkInterval(IntervalArr *iva);
/*
 * Widen [start, end] by expandVal on both sides, clipped to the contig
 * bounds taken from the header
 */
static Interval ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header);
};
#endif

View File

@ -1,59 +0,0 @@
/*
Description: read ends
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/11/3
*/
#ifndef READ_ENDS_H_
#define READ_ENDS_H_
#include <stdint.h>
/* Holds all read-ends information, like ReadEndsForMarkDuplicates in Picard. */
struct ReadEnds
{
    /* Members taken from PhysicalLocationInt */
    /**
     * Small class that provides access to the physical location information about a cluster.
     * All values should be defaulted to -1 if unavailable. Tile should only allow
     * non-zero positive integers, x and y coordinates must be non-negative.
     * This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
     * thus, they do not overflow within a HiSeqX tile.
     */
    int16_t tile = -1;
    int32_t x = -1;
    int32_t y = -1;

    /* Members taken from ReadEnds */
    /** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
    static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
    // Both fields below used to have no initializer, so a default-constructed
    // ReadEnds carried indeterminate values in them. They now start at 0.
    // NOTE(review): 0 is chosen to mirror the zero default of the Java
    // original; note that 0 equals F for `orientation` -- confirm.
    int16_t libraryId = 0;
    int8_t orientation = 0;
    int32_t read1ReferenceIndex = -1;
    int32_t read1Coordinate = -1;
    int32_t read2ReferenceIndex = -1;
    int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)

    /* Additional information used to detect optical dupes */
    int16_t readGroup = -1;
    /** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
    int8_t orientationForOpticalDuplicates = -1;
    /** A *transient* flag marking this read end as being an optical duplicate. */
    bool isOpticalDuplicate = false;

    /* Members taken from ReadEndsForMarkDuplicates */
    /** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
    int16_t score = 0;
    int64_t read1IndexInFile = -1;
    int64_t read2IndexInFile = -1;
    int64_t duplicateSetSize = -1;

    /* Members taken from ReadEndsForMarkDuplicatesWithBarcodes (apparently unused so far) */
    int32_t barcode = 0; // primary barcode for this read (and pair)
    int32_t readOneBarcode = 0; // read one barcode, 0 if not present
    int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
};
#endif

View File

@ -90,7 +90,7 @@ struct GlobalArg
void printArgValue() {
printf("--INPUT = %s\n", in_fn.c_str());
printf("--OUTPUT = %s\n", out_fn.c_str());
printf("--num_threads = %d\n",num_threads);
printf("--num_threads = %d\n", num_threads);
printf("--max_mem = %ld\n", max_mem);
printf("--verbosity = %d\n", verbosity);
printf("--asyncio = %d\n", use_asyncio);

View File

@ -0,0 +1,90 @@
/*
Description: Murmur
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/11/6
*/
#include <string>
#include <random>
using std::string;
/**
 * Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness
 * and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling.
 *
 * All mixing is done on unsigned 32-bit values: the algorithm relies on
 * wrap-around multiplication and on bit rotation, neither of which is well
 * defined on signed int. Results are converted back to int at the API
 * boundary so the public signatures are unchanged.
 */
struct Murmur3
{
    static_assert(sizeof(unsigned int) == 4, "Murmur3 assumes a 32-bit unsigned int");

    /** Seed mixed into every hash; set from std::random_device by the constructor. */
    int seed_ = 0;

    /**
     * Hashes a character stream to an int using Murmur3.
     * Every char is treated as one unsigned 16-bit code unit (two chars are
     * packed into each 32-bit block), so bytes >= 0x80 are not sign-extended.
     */
    int HashUnencodedChars(const std::string &input)
    {
        int h1 = this->seed_;
        // step through the sequence 2 chars at a time
        const int length = input.size();
        for (int i = 1; i < length; i += 2)
        {
            const unsigned int lo = static_cast<unsigned char>(input[i - 1]);
            const unsigned int hi = static_cast<unsigned char>(input[i]);
            const int k1 = mixK1(static_cast<int>(lo | (hi << 16)));
            h1 = mixH1(h1, k1);
        }
        // deal with any remaining character
        if ((length & 1) == 1)
        {
            int k1 = static_cast<unsigned char>(input[length - 1]);
            k1 = mixK1(k1);
            h1 ^= k1;
        }
        return fmix(h1, 2 * length);
    }

    /** Process-wide instance (constructed on first use). */
    static Murmur3 &Instance()
    {
        static Murmur3 instance;
        return instance;
    }

    /** Block pre-mix: multiply by c1, rotate left by 15, multiply by c2. */
    static int mixK1(int k1)
    {
        unsigned int k = static_cast<unsigned int>(k1);
        k *= 0xcc9e2d51u;
        k = rotl32(k, 15); // a rotate, not a shift: the high bits must wrap around
        k *= 0x1b873593u;
        return static_cast<int>(k);
    }

    /** Fold a mixed block into the running hash: xor, rotate left by 13, h * 5 + 0xe6546b64. */
    static int mixH1(int h1, int k1)
    {
        unsigned int h = static_cast<unsigned int>(h1) ^ static_cast<unsigned int>(k1);
        h = rotl32(h, 13);
        h = h * 5u + 0xe6546b64u;
        return static_cast<int>(h);
    }

    // Finalization mix - force all bits of a hash block to avalanche
    static int fmix(int h1, int length)
    {
        unsigned int h = static_cast<unsigned int>(h1) ^ static_cast<unsigned int>(length);
        h ^= h >> 16;
        h *= 0x85ebca6bu;
        h ^= h >> 13;
        h *= 0xc2b2ae35u;
        h ^= h >> 16;
        return static_cast<int>(h);
    }

private:
    /** Rotate a 32-bit value left by r bits (0 < r < 32). */
    static unsigned int rotl32(unsigned int x, int r)
    {
        return (x << r) | (x >> (32 - r));
    }

    Murmur3()
    {
        auto &&rd = std::random_device{};
        seed_ = rd();
    }
};

View File

@ -58,7 +58,7 @@ typedef struct bsem
typedef struct job
{
struct job *prev; /* pointer to previous job */
void (*function)(void *arg); /* function pointer */
function_t function; /* function pointer */
void *arg; /* function's argument */
} job;
@ -175,7 +175,7 @@ struct thpool_ *thpool_init(int num_threads)
}
/* Add work to the thread pool */
int thpool_add_work(thpool_ *thpool_p, void (*function_p)(void *), void *arg_p)
int thpool_add_work(thpool_ *thpool_p, function_t function_p, void *arg_p)
{
job *newjob;
@ -368,14 +368,14 @@ static void *thread_do(struct thread *thread_p)
pthread_mutex_unlock(&thpool_p->thcount_lock);
/* Read job from queue and execute it */
void (*func_buff)(void *);
function_t func_buff;
void *arg_buff;
job *job_p = jobqueue_pull(&thpool_p->jobqueue);
if (job_p)
{
func_buff = job_p->function;
arg_buff = job_p->arg;
func_buff(arg_buff);
func_buff(arg_buff, thread_p->id);
free(job_p);
}

View File

@ -12,6 +12,8 @@ extern "C"
{
#endif
typedef void (*function_t)(void *, int);
/* =================================== API ======================================= */
typedef struct thpool_ *threadpool;
@ -62,7 +64,7 @@ extern "C"
* @param arg_p pointer to an argument
* @return 0 on success, -1 otherwise.
*/
int thpool_add_work(threadpool, void (*function_p)(void *), void *arg_p);
int thpool_add_work(threadpool, function_t function_p, void *arg_p);
/**
* @brief Wait for all queued jobs to finish

View File

@ -29,6 +29,7 @@ using std::for_each;
va_start(ap, format); \
vfprintf(stderr, format, ap); \
va_end(ap); \
fprintf(stderr, "\n"); \
} while (0)
/*

View File

@ -121,8 +121,8 @@ struct lock_s {
long value;
};
lock *new_lock_(long initial, char const *file, long line) {
lock *bolt = (lock *)my_malloc(sizeof(struct lock_s), file, line);
lock_t *new_lock_(long initial, char const *file, long line) {
lock_t *bolt = (lock_t *)my_malloc(sizeof(struct lock_s), file, line);
int ret = pthread_mutex_init(&(bolt->mutex), NULL);
if (ret)
fail(ret, file, line, "mutex_init");
@ -133,19 +133,19 @@ lock *new_lock_(long initial, char const *file, long line) {
return bolt;
}
void possess_(lock *bolt, char const *file, long line) {
void possess_(lock_t *bolt, char const *file, long line) {
int ret = pthread_mutex_lock(&(bolt->mutex));
if (ret)
fail(ret, file, line, "mutex_lock");
}
void release_(lock *bolt, char const *file, long line) {
void release_(lock_t *bolt, char const *file, long line) {
int ret = pthread_mutex_unlock(&(bolt->mutex));
if (ret)
fail(ret, file, line, "mutex_unlock");
}
void twist_(lock *bolt, enum twist_op op, long val,
void twist_(lock_t *bolt, enum twist_op op, long val,
char const *file, long line) {
if (op == TO)
bolt->value = val;
@ -161,7 +161,7 @@ void twist_(lock *bolt, enum twist_op op, long val,
#define until(a) while(!(a))
void wait_for_(lock *bolt, enum wait_op op, long val,
void wait_for_(lock_t *bolt, enum wait_op op, long val,
char const *file, long line) {
switch (op) {
case TO_BE:
@ -194,11 +194,11 @@ void wait_for_(lock *bolt, enum wait_op op, long val,
}
}
long peek_lock(lock *bolt) {
long peek_lock(lock_t *bolt) {
return bolt->value;
}
void free_lock_(lock *bolt, char const *file, long line) {
void free_lock_(lock_t *bolt, char const *file, long line) {
if (bolt == NULL)
return;
int ret = pthread_cond_destroy(&(bolt->cond));
@ -210,7 +210,7 @@ void free_lock_(lock *bolt, char const *file, long line) {
my_free(bolt);
}
// -- Thread functions (uses the lock functions above) --
// -- Thread functions (uses the lock_t functions above) --
struct thread_s {
pthread_t id;
@ -220,7 +220,7 @@ struct thread_s {
// List of threads launched but not joined, count of threads exited but not
// joined (incremented by ignition() just before exiting).
local lock threads_lock = {
local lock_t threads_lock = {
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER,
0 // number of threads exited but not joined

View File

@ -36,7 +36,7 @@
These functions allow the simple launching and joining of threads, and the
locking of objects and synchronization of changes of objects. The latter is
implemented with a single lock type that contains an integer value. The
implemented with a single lock_t type that contains an integer value. The
value can be ignored for simple exclusive access to an object, or the value
can be used to signal and wait for changes to an object.
@ -45,10 +45,10 @@
thread *thread; identifier for launched thread, used by join
void probe(void *); pointer to function "probe", run when thread starts
void *payload; single argument passed to the probe function
lock *lock; a lock with a value -- used for exclusive access to
lock_t *lock_t; a lock_t with a value -- used for exclusive access to
an object and to synchronize threads waiting for
changes to an object
long val; value to set lock, increment lock, or wait for
long val; value to set lock_t, increment lock_t, or wait for
int n; number of threads joined
-- Thread functions --
@ -66,25 +66,25 @@
-- Lock functions --
lock = new_lock(val) - create a new lock with initial value val (lock is
lock_t = new_lock(val) - create a new lock_t with initial value val (lock_t is
created in the released state)
possess(lock) - acquire exclusive possession of a lock, waiting if necessary
twist(lock, [TO | BY], val) - set lock to or increment lock by val, signal
all threads waiting on this lock and then release the lock -- must
possess the lock before calling (twist releases, so don't do a
release() after a twist() on the same lock)
wait_for(lock, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val)
- wait on lock value to be, not to be, be greater than, or be less than
val -- must possess the lock before calling, will possess the lock on
return but the lock is released while waiting to permit other threads
possess(lock_t) - acquire exclusive possession of a lock_t, waiting if necessary
twist(lock_t, [TO | BY], val) - set lock_t to or increment lock_t by val, signal
all threads waiting on this lock_t and then release the lock_t -- must
possess the lock_t before calling (twist releases, so don't do a
release() after a twist() on the same lock_t)
wait_for(lock_t, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val)
- wait on lock_t value to be, not to be, be greater than, or be less than
val -- must possess the lock_t before calling, will possess the lock_t on
return but the lock_t is released while waiting to permit other threads
to use twist() to change the value and signal the change (so make sure
that the object is in a usable state when waiting)
release(lock) - release a possessed lock (do not try to release a lock that
release(lock_t) - release a possessed lock_t (do not try to release a lock_t that
the current thread does not possess)
val = peek_lock(lock) - return the value of the lock (assumes that lock is
val = peek_lock(lock_t) - return the value of the lock_t (assumes that lock_t is
already possessed, no possess or release is done by peek_lock())
free_lock(lock) - free the resources allocated by new_lock() (application
must assure that the lock is released before calling free_lock())
free_lock(lock_t) - free the resources allocated by new_lock() (application
must assure that the lock_t is released before calling free_lock())
-- Memory allocation ---
@ -112,27 +112,28 @@ void yarn_mem(void *(*)(size_t), void (*)(void *));
typedef struct thread_s thread;
thread *launch_(void (*)(void *), void *, char const *, long);
#define launch(a, b) launch_(a, b, __FILE__, __LINE__)
#define LAUNCH(a, b) launch_(a, b, __FILE__, __LINE__)
void join_(thread *, char const *, long);
#define join(a) join_(a, __FILE__, __LINE__)
#define JOIN(a) join_(a, __FILE__, __LINE__)
int join_all_(char const *, long);
#define join_all() join_all_(__FILE__, __LINE__)
#define JOIN_ALL() join_all_(__FILE__, __LINE__)
typedef struct lock_s lock;
lock *new_lock_(long, char const *, long);
#define new_lock(a) new_lock_(a, __FILE__, __LINE__)
void possess_(lock *, char const *, long);
#define possess(a) possess_(a, __FILE__, __LINE__)
void release_(lock *, char const *, long);
#define release(a) release_(a, __FILE__, __LINE__)
typedef struct lock_s lock_t;
lock_t *new_lock_(long, char const *, long);
#define NEW_LOCK(a) new_lock_(a, __FILE__, __LINE__)
void possess_(lock_t *, char const *, long);
#define POSSESS(a) possess_(a, __FILE__, __LINE__)
void release_(lock_t *, char const *, long);
// #define release(a) release_(a, __FILE__, __LINE__)
#define RELEASE(a) release_(a, __FILE__, __LINE__)
enum twist_op { TO, BY };
void twist_(lock *, enum twist_op, long, char const *, long);
#define twist(a, b, c) twist_(a, b, c, __FILE__, __LINE__)
void twist_(lock_t *, enum twist_op, long, char const *, long);
#define TWIST(a, b, c) twist_(a, b, c, __FILE__, __LINE__)
enum wait_op {
TO_BE, /* or */ NOT_TO_BE, /* that is the question */
TO_BE_MORE_THAN, TO_BE_LESS_THAN };
void wait_for_(lock *, enum wait_op, long, char const *, long);
#define wait_for(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__)
long peek_lock(lock *);
void free_lock_(lock *, char const *, long);
#define free_lock(a) free_lock_(a, __FILE__, __LINE__)
void wait_for_(lock_t *, enum wait_op, long, char const *, long);
#define WAIT_FOR(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__)
long peek_lock(lock_t *);
void free_lock_(lock_t *, char const *, long);
#define FREE_LOCK(a) free_lock_(a, __FILE__, __LINE__)

View File

@ -1,5 +1,5 @@
/*
Description: bam
Description: bambambam
Copyright : All right reserved by ICT
@ -7,64 +7,395 @@ Author : Zhang Zhonghai
Date : 2023/10/23
*/
#include "markdups_arg.h"
// 有太多define冲突放到最后include
#include <common/hts/bam_buf.h>
#include <common/utils/global_arg.h>
#include <common/utils/thpool.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <common/hts/bam_buf.h>
#include <common/hts/read_ends.h>
#include <common/utils/murmur3.h>
#include <common/utils/yarn.h>
#include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h>
#include <htslib/sam.h>
#include "htslib/thread_pool.h"
#include <htslib/thread_pool.h>
#include <iostream>
#include <vector>
#include <set>
#include <queue>
#include <unordered_map>
using namespace std;
using std::cout;
#define SMA_TAG_PG "PG"
// 2 * 1024 * 1024. Parenthesized so the macro expands as a single value
// inside larger expressions (e.g. `x / BAM_BLOCK_SIZE`).
#define BAM_BLOCK_SIZE (2 * 1024 * 1024)
#define NO_SUCH_INDEX INT64_MAX
static Timer tm_arr[10]; // 用来测试性能
/* 前向声明 */
class ThMarkDupArg;
/* 全局本地变量 */
static queue<ThMarkDupArg *> qpThMarkDupArg; // 存放线程变量的队列
static lock *queueFirstLock = new_lock(-1); // 队列的第一个任务是否完成
static queue<ThMarkDupArg *> g_qpThMarkDupArg; // queue holding the per-task thread arguments
static lock_t *g_queueFirstLock = NEW_LOCK(-1); // signals whether the first task in the queue has finished
static lock_t *g_readyToReadLock = NEW_LOCK(-1); // tells the main thread whether the next read round may start
static vector<ReadNameParser> g_vRnParser; // one read name parser per thread
static int g_numDuplicateIndices = 0; // total number of duplicate reads found
static samFile *g_outBamFp = nullptr; // output file, SAM or BAM format
static sam_hdr_t *g_outBamHeader; // header of the output file
static int g_maxJobNum = 0; // number of tasks added after each new batch of data is read
static int g_jobNumForRead = 0; // the next read round starts once the task count drops to this value
// NOTE(review): `volatile` does not make these counters safe to share between threads; confirm how they are synchronized.
static volatile int64_t g_bamLoadedNum = 0; // total number of reads loaded so far
static volatile int64_t g_bamWritenNum = 0; // total number of reads processed and written to the output file
static vector<int64_t> g_vDupIdx; // computed inside the worker threads
static vector<int64_t> g_vOpticalDupIdx;
static set<int64_t> g_sDupIdxLatter;
static set<int64_t> g_sOpticalDupIdxLatter;
/* Argument objects are kept global so they need not be passed to every function */
static GlobalArg &g_gArg = GlobalArg::Instance();
static MarkDupsArg g_mdArg;
/*
 * Compute the score of a single read for duplicate marking, according to
 * g_mdArg.DUPLICATE_SCORING_STRATEGY. Reads that fail the vendor quality
 * check receive a large negative offset.
 */
static int16_t computeDuplicateScore(BamWrap &bw)
{
    // Per-read cap: half of the int16_t range, so that the scores of two mates can be added.
    const int kPerReadCap = INT16_MAX / 2;
    int16_t result = 0;

    switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
    {
    case ns_md::SUM_OF_BASE_QUALITIES:
    {
        // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
        // and risk overflow.
        result += (int16_t)min(bw.GetSumOfBaseQualities(), kPerReadCap);
        break;
    }
    case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
    {
        if (!bw.GetReadUnmappedFlag())
        {
            // no need to remember the score since this scoring mechanism is symmetric
            result = (int16_t)min(bw.GetReferenceLength(), kPerReadCap);
        }
        break;
    }
    case ns_md::RANDOM:
    {
        // The RANDOM score gives the same score to both reads so that they get filtered together.
        // it's not critical do use the readName since the scores from both ends get added, but it seem
        // to be clearer this way.
        const int nameHash = Murmur3::Instance().HashUnencodedChars(bw.query_name());
        result += (short)(nameHash & 0b11111111111111);
        // subtract Short.MIN_VALUE/4 from it to end up with a number between
        // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
        // not passing filters. We need to stay far from overflow so that when we add the two
        // scores from the two read mates we do not overflow since that could cause us to chose a
        // failing read-pair instead of a passing one.
        result -= INT16_MIN / 4;
        break;
    }
    default:
        break;
    }

    // make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
    // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
    if (bw.GetReadFailsVendorQualityCheckFlag())
    {
        result += (int16_t)(INT16_MIN / 2);
    }
    return result;
}
/*
 * Builds a read ends object that represents a single read.
 *
 * Fills *pKey from the alignment record bw: reference index, unclipped
 * coordinate, orientation, index in file, score, mate reference index
 * (only for paired reads whose mate is mapped), the location information
 * parsed from the read name, and the position key.
 */
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
{
    auto &ends = *pKey;
    auto &core = bw.b->core;

    ends.read1ReferenceIndex = core.tid;
    // Reverse-strand reads are keyed by their unclipped end, forward reads by their unclipped start.
    if (core.flag & BAM_FREVERSE)
        ends.read1Coordinate = bw.GetUnclippedEnd();
    else
        ends.read1Coordinate = bw.GetUnclippedStart();
    if (core.flag & BAM_FREVERSE)
        ends.orientation = ReadEnds::R;
    else
        ends.orientation = ReadEnds::F;
    ends.read1IndexInFile = index;
    ends.score = computeDuplicateScore(bw);

    // Doing this lets the ends object know that it's part of a pair
    const bool mateIsMapped = bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag();
    if (mateIsMapped)
    {
        ends.read2ReferenceIndex = core.mtid;
    }

    // Fill in the location information for optical duplicates
    rnParser.AddLocationInformation(bw.query_name(), pKey);

    // Position key: global position built from reference index and coordinate
    // (the orientation is currently not folded into the key).
    ends.posKey = BamWrap::bam_global_pos(ends.read1ReferenceIndex, ends.read1Coordinate);
}
/**
 * Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
 * quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
 * read to the records of the first and second in a pair. This value becomes is used for
 * the 'DI' tag.
 *
 * NOTE(review): the body is currently empty, so the behaviour described above is not
 * implemented yet; calling this function has no effect.
 */
static void addRepresentativeReadIndex(vector<ReadEnds *> &vpRe)
{
}
/*
 * Mark duplicates inside one group of paired-end ReadEnds.
 *
 * The first entry holding the highest score is kept as the representative; the file
 * indexes of read1 (and of read2 when it differs) of every other entry are inserted
 * into *psDupIdx. psOpticalDupIdx is not used yet.
 */
static void markDuplicatePairs(vector<ReadEnds *> &vpRe, set<int64_t> *psDupIdx, set<int64_t> *psOpticalDupIdx)
{
    // Fewer than two candidates: nothing can be a duplicate.
    // (A single entry is where addSingletonToCount(libraryIdGenerator) would go.)
    if (vpRe.size() < 2)
        return;

    /** All read ends should have orientation FF, FR, RF, or RR **/
    // Pick the best-scoring read end; ties keep the earliest entry
    ReadEnds *pKeeper = vpRe.front();
    for (auto it = vpRe.begin() + 1; it != vpRe.end(); ++it)
    {
        if ((*it)->score > pKeeper->score)
            pKeeper = *it;
    }

    if (!g_mdArg.READ_NAME_REGEX.empty()) // optical duplicate check
    {
        // trackOpticalDuplicates
    }

    // Everything except the representative is a duplicate
    for (ReadEnds *pCandidate : vpRe)
    {
        if (pCandidate == pKeeper)
            continue;
        psDupIdx->insert(pCandidate->read1IndexInFile); // read1
        if (pCandidate->read2IndexInFile != pCandidate->read1IndexInFile)
            psDupIdx->insert(pCandidate->read2IndexInFile); // read2
    }

    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
        addRepresentativeReadIndex(vpRe);
}
/*
 * Mark duplicates inside one group of ReadEnds compared at fragment level.
 *
 * containsPairs == true : every entry that is not part of a pair is a duplicate.
 * containsPairs == false: the first entry holding the highest score is kept and all
 *                         other entries are duplicates.
 * Duplicate read1 file indexes are inserted into *psDupIdx. psOpticalDupIdx is not used.
 */
static void markDuplicateFragments(vector<ReadEnds *> &vpRe,
                                   bool containsPairs,
                                   set<int64_t> *psDupIdx,
                                   set<int64_t> *psOpticalDupIdx)
{
    if (containsPairs)
    {
        for (ReadEnds *pEnd : vpRe)
        {
            if (pEnd->IsPaired())
                continue;
            psDupIdx->insert(pEnd->read1IndexInFile);
        }
        return;
    }

    // Fragments only: find the best-scoring entry (ties keep the earliest one)
    ReadEnds *pKeeper = nullptr;
    for (ReadEnds *pEnd : vpRe)
    {
        if (pKeeper == nullptr || pEnd->score > pKeeper->score)
            pKeeper = pEnd;
    }
    for (ReadEnds *pEnd : vpRe)
    {
        if (pEnd != pKeeper)
            psDupIdx->insert(pEnd->read1IndexInFile);
    }
}
/* Argument structure handed to each multi-threaded duplicate-marking task */
// NOTE(review): seq, more and finish are each declared twice below; this span looks like
// the removed and the added lines of a diff interleaved - confirm which members belong
// to the current version before relying on this layout.
struct ThMarkDupArg
{
vector<BamWrap *> *pvBam;
int startIdx; // start index, inclusive
int endIdx; // end index, exclusive
long seq; // position of this task in the order of all tasks
bool more; // more tasks follow this one
volatile bool finish; // whether this task has finished processing
set<int> sDupIdx; // indexes of duplicate reads
int64_t bamStartIdx; // position in the whole bam of the first bam record of this task's vBam
long seq; // position of this task in the order of all tasks
bool more; // more tasks follow this one
volatile bool finish; // whether this task has finished processing
vector<BamWrap *> vBam; // bam reads to be processed
map<int64_t, vector<ReadEnds>> mvPair; // all paired-end reads, keyed by duplicate position
map<int64_t, vector<ReadEnds>> mvFrag; // all reads, including paired-end ones
map<int64_t, set<int64_t>> msDupIdx; // indexes of duplicate reads
map<int64_t, set<int64_t>> msOpticalDupIdx; // indexes of optical duplicate reads
unordered_map<string, ReadEnds> umReadEnds; // used to find the mate of a paired-end read
};
/*
 * Worker-thread task: builds ReadEnds for the reads in one ThMarkDupArg, groups them by
 * position key, marks duplicates per group and finally signals that the task finished.
 *
 * NOTE(review): this span contains two signatures and several paired lower-case /
 * upper-case lock calls (possess/POSSESS, twist/TWIST, release/RELEASE); it looks like
 * the removed and the added lines of a diff interleaved - confirm against the real file.
 */
void thread_markdups(void *arg)
void thread_markdups(void *arg, int tid)
{
auto &p = *(ThMarkDupArg *)arg;
p.sDupIdx.insert(1);
/* Process the data */
/* For every read, build its ReadEnds and put it into the frag and pair maps */
for (int i = 0; i < p.vBam.size(); ++i) // loop over every read
{
BamWrap *bw = p.vBam[i];
const int64_t bamIdx = p.bamStartIdx + i;
if (bw->GetReadUnmappedFlag())
{
if (bw->b->core.tid == -1)
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
break;
}
else if (!bw->IsSecondaryOrSupplementary()) // primary alignment
{
ReadEnds fragEnd;
buildReadEnds(*bw, bamIdx, g_vRnParser[tid], &fragEnd);
p.mvFrag[fragEnd.posKey].push_back(fragEnd); // add to the frag collection
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // paired-end read whose mate is mapped as well
{
string key = bw->query_name();
if (p.umReadEnds.find(key) == p.umReadEnds.end())
{
p.umReadEnds[key] = fragEnd;
}
else // the other end of the pair was already seen
{
auto pairedEnds = p.umReadEnds.at(key);
p.umReadEnds.erase(key); // remove the entry that has now been paired
const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
// before updating the orientation later.
if (bw->GetFirstOfPairFlag())
{
pairedEnds.orientationForOpticalDuplicates =
ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds::R);
}
else
{
pairedEnds.orientationForOpticalDuplicates =
ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, bw->GetReadNegativeStrandFlag());
}
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
{
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
bw->GetReadNegativeStrandFlag());
// if the two read ends are in the same position, pointing in opposite directions,
// the orientation is undefined and the procedure above
// will depend on the order of the reads in the file.
// To avoid this, we set it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds::RF)
{
pairedEnds.orientation = ReadEnds::FR;
}
}
else
{
// Flip: the stored end becomes read2 and the current read becomes read1.
// NOTE(review): posKey is not touched here, so it still holds the key of the
// previously stored read1 - confirm whether it should be recomputed.
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(),
pairedEnds.orientation == ReadEnds::R);
}
// The pair score is the sum of the scores of both ends
pairedEnds.score += computeDuplicateScore(*bw);
p.mvPair[pairedEnds.posKey].push_back(pairedEnds);
}
}
}
}
/* generateDuplicateIndexes: compute the index, among all reads, of every duplicate read */
// Pairs first
int dupNum = 0;
vector<ReadEnds *> vRePotentialDup; // reads that may be duplicates of each other
for (auto &e : p.mvPair) // iterate in order of alignment position
{
if (e.second.size() > 1) // potential duplicates
{
vRePotentialDup.clear();
ReadEnds *pReadEnd = nullptr;
for (auto &re : e.second)
{
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))
vRePotentialDup.push_back(&re);
else
{
markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
vRePotentialDup.clear();
vRePotentialDup.push_back(&re);
pReadEnd = &re;
}
}
markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
}
}
// Then the fragments
bool containsPairs = false;
bool containsFrags = false;
for (auto &e : p.mvFrag)
{
if (e.second.size() > 1) // potential duplicates
{
vRePotentialDup.clear();
ReadEnds *pReadEnd = nullptr;
for (auto &re : e.second)
{
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
{
vRePotentialDup.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
}
else
{
if (vRePotentialDup.size() > 1 && containsFrags)
{
markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
}
vRePotentialDup.clear();
vRePotentialDup.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vRePotentialDup.size() > 1 && containsFrags) {
markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
}
}
}
// cout << tid << '\t' << "dup: " << dupNum << endl;
// cout << tid << " all: no: " << p.vBam.size() << '\t' << p.umReadEnds.size() << endl;
/* This chunk of data has been processed; tell the output thread */
possess(queueFirstLock);
POSSESS(g_queueFirstLock);
p.finish = true;
cout << "process: " << p.seq << endl;
auto front = qpThMarkDupArg.front();
// cout << tid << ": process: " << p.seq << endl;
auto front = g_qpThMarkDupArg.front();
if (front->finish)
{
twist(queueFirstLock, TO, front->seq);
TWIST(g_queueFirstLock, TO, front->seq); // notify the writer thread that the task at the head of the queue is finished
} else {
release(queueFirstLock);
RELEASE(g_queueFirstLock);
}
}
@ -75,58 +406,73 @@ void thread_write(void *)
{
bool more = false;
long seq = 0;
possess(queueFirstLock);
wait_for(queueFirstLock, TO_BE, seq++); // 等待首个任务完成
auto lastP = qpThMarkDupArg.front(); // 取队首的数据
qpThMarkDupArg.pop(); // 删除队首
twist(queueFirstLock, TO, seq);
more = lastP->more;
long unPairedNum = 0;
POSSESS(g_queueFirstLock);
WAIT_FOR(g_queueFirstLock, TO_BE, seq++); // 等待首个任务完成
auto lastP = g_qpThMarkDupArg.front(); // 取队首的数据
auto umUnpairedReadEnds = lastP->umReadEnds; // 还未找到pair的read
auto p = lastP;
g_qpThMarkDupArg.pop(); // 删除队首
TWIST(g_queueFirstLock, TO, seq); // 解锁
more = lastP->more; // 是否还有下一个任务
while (more) // 循环处理,将结果写入文件
{
possess(queueFirstLock);
if (qpThMarkDupArg.empty()) // 有可能新任务没来得及添加进队列
POSSESS(g_queueFirstLock);
if (g_qpThMarkDupArg.empty()) // 有可能新任务没来得及添加进队列
{
release(queueFirstLock);
RELEASE(g_queueFirstLock);
continue;
}
wait_for(queueFirstLock, TO_BE, seq); // 等待任务完成
auto p = qpThMarkDupArg.front();
if (!p->finish) // 有可能这个任务没有完成,是下边那个twist导致进到这里,因为这一段代码可能运行比较快
WAIT_FOR(g_queueFirstLock, TO_BE, seq); // 等待任务完成
p = g_qpThMarkDupArg.front();
if (!p->finish) // 有可能这个任务没有完成,是下边那个TWIST导致进到这里,因为这一段代码可能运行比较快
{
twist(queueFirstLock, TO, -1); // 此时队首任务没完成,-1可以让锁无法进入到这里避免无效获得锁
TWIST(g_queueFirstLock, TO, -1); // 此时队首任务没完成,-1可以让锁无法进入到这里避免无效获得锁
continue;
}
qpThMarkDupArg.pop();
twist(queueFirstLock, TO, seq + 1);
g_qpThMarkDupArg.pop();
TWIST(g_queueFirstLock, TO, seq + 1);
/* 处理结果数据 */
// cout << "finish: " << seq - 1 << '\t' << "lastIdx: " << p->bamStartIdx+p->vBam.size() << endl;
/* 处理结果数据 */
cout << "finish: " << seq - 1 << endl;
for (auto &e : p->umReadEnds) // 在当前任务中找有没有与上一个任务中没匹配的read相匹配的pair
{
if (umUnpairedReadEnds.find(e.first) != umUnpairedReadEnds.end())
umUnpairedReadEnds.erase(e.first); // 找到了pair
else
umUnpairedReadEnds.insert(e); // 没有pair则添加
}
/* 更新写入read数量和状态 */
POSSESS(g_readyToReadLock);
g_bamWritenNum += lastP->vBam.size();
// cout << "write: " << g_qpThMarkDupArg.size() << endl;
if (g_qpThMarkDupArg.size() <= g_jobNumForRead)
{
TWIST(g_readyToReadLock, TO, 1);
}
else
{
RELEASE(g_readyToReadLock);
}
/* 准备下一轮循环 */
delete lastP;
more = p->more;
lastP = p;
seq++;
}
unPairedNum = umUnpairedReadEnds.size();
cout << "Finally unpaired read num: " << unPairedNum << endl;
// 处理最后一个数据
cout << "finish: " << seq - 1 << endl;
POSSESS(g_readyToReadLock);
g_bamWritenNum += lastP->vBam.size();
TWIST(g_readyToReadLock, TO, 1);
// cout << "last finish: " << seq - 1 << endl;
pthread_exit(0);
}
/*
 * Fill *pKey with the read1 information (reference index, coordinate, orientation and
 * index in file) of a single read.
 */
static void buildReadEnds(BamWrap &bw, int64_t index, ReadEnds *pKey)
{
    ReadEnds &ends = *pKey;
    auto &core = bw.b->core;
    const bool onReverseStrand = (core.flag & BAM_FREVERSE) != 0;

    ends.read1ReferenceIndex = core.tid;
    // A reverse-strand read is anchored at its unclipped end, a forward read at its unclipped start
    ends.read1Coordinate = onReverseStrand ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
    ends.orientation = onReverseStrand ? ReadEnds::R : ReadEnds::F;
    ends.read1IndexInFile = index;
}
/*
* mark duplicate bambarcode
*/
@ -134,108 +480,160 @@ int MarkDuplicates(int argc, char *argv[])
{
Timer::log_time("程序开始");
Timer time_all;
/* 初始化参数 */
GlobalArg &gArg = GlobalArg::Instance();
MarkDupsArg mdArg;
vector<AuxVar> vAuxVar;
mdArg.parseArgument(argc, argv, &gArg); // 解析命令行参数
/* 读取命令行参数 */
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
if (g_gArg.num_threads < 1) // 线程数不能小于1
g_gArg.num_threads = 1;
// if (gArg.num_threads > 1) // 多线程处理
if (false)
/* 初始化一些参数和变量*/
g_vRnParser.resize(g_gArg.num_threads);
for (auto &parser : g_vRnParser)
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tilexy信息
/* 打开输入bam文件 */
sam_hdr_t *inBamHeader;
samFile *inBamFp;
inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
if (!inBamFp)
{
threadpool thpool = thpool_init(gArg.num_threads); // 创建mark dup所需的线程池
thread *writeth = launch(thread_write, nullptr); // 启动处理结果的的线程
for (int i = 0; i < 40; ++i)
{
ThMarkDupArg *thArg = new ThMarkDupArg({nullptr, i, i * 10, i, true, false});
if (i == 39)
thArg->more = false;
possess(queueFirstLock); // 加锁
qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
release(queueFirstLock); // 解锁
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
}
/* 同步所有线程 */
thpool_wait(thpool);
thpool_destroy(thpool);
join(writeth);
} else { // 单线程串行处理
/* 打开输入bam文件 */
sam_hdr_t *inBamHeader;
samFile *inBamFp;
inBamFp = sam_open_format(gArg.in_fn.c_str(), "r", nullptr);
if (! inBamFp) {
Error("[%s] load sam/bam file failed.\n", __func__);
return -1;
}
hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
inBamHeader = sam_hdr_read(inBamFp);
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0};
htsPoolRead.pool = hts_tpool_init(gArg.num_threads);
htsPoolWrite.pool = hts_tpool_init(gArg.num_threads);
if (!htsPoolRead.pool || !htsPoolWrite.pool)
{
Error("[%d] failed to set up thread pool", __LINE__);
return -1;
}
hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
/* 创建输出文件 */
samFile *outBamFp;
htsFormat outFormat = {};
hts_parse_format(&outFormat, "bam");
outBamFp = sam_open_format(gArg.out_fn.c_str(), "wb", &outFormat);
hts_set_opt(outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hts_set_opt(outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
// /* 读取缓存初始化 */
BamBufType inBamBuf(gArg.use_asyncio);
inBamBuf.Init(inBamFp, inBamHeader, gArg.max_mem);
/* 循环读入信息,并处理 */
while (inBamBuf.ReadStat() >= 0)
{
int readNum = inBamBuf.ReadBam();
cout << readNum << endl;
// inBamBuf.ClearAll();
// cout << inBamBuf.Size() << endl;
inBamBuf.ClearBeforeIdx(inBamBuf.Size());
// break;
for (int i = 0; i < inBamBuf.Size(); ++i) {
if (sam_write1(outBamFp, inBamHeader, inBamBuf[i]->b) < 0)
{
Error("failed writing to \"%s\"", gArg.out_fn.c_str());
sam_close(outBamFp);
return -1;
}
}
if (readNum == 0)
break;
}
// int res = -1;
// bam1_t *b = bam_init1();
// size_t num = 0;
// while ((res = sam_read1(inBamFp, inBamHeader, b)) >= 0)
// {
// ++num;
// }
// cout << num << endl;
/* 为每个read创建ReadEnd信息 */
/* 标记冗余, 将处理后的结果写入文件 */
/* 关闭文件,收尾清理 */
sam_close(outBamFp);
sam_close(inBamFp);
Error("[%s] load sam/bam file failed.\n", __func__);
return -1;
}
hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
inBamHeader = sam_hdr_read(inBamFp); // 读取header
// cout << "read ends size: " << sizeof(ReadEnds) << endl;
/* 利用线程池对输入输出文件进行读写 */
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
if (!htsPoolRead.pool || !htsPoolWrite.pool)
{
Error("[%d] failed to set up thread pool", __LINE__);
return -1;
}
hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
cout << "总时间: " << time_all.seconds_elapsed() << endl;
/* 初始化输出文件 */
char modeout[12] = "wb";
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
g_outBamHeader = sam_hdr_dup(inBamHeader);
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
{
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
sam_close(g_outBamFp);
return -1;
}
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
// /* 读取缓存初始化 */
BamBufType inBamBuf(g_gArg.use_asyncio);
inBamBuf.Init(inBamFp, inBamHeader, g_gArg.max_mem);
/* 循环读入信息,并处理 */
g_maxJobNum = g_gArg.num_threads * 10;
// g_maxJobNum = g_gArg.num_threads * 3;
g_jobNumForRead = g_gArg.num_threads * 2;
int64_t x_all = 0; // for test
int64_t jobSeq = 0;
int64_t processedBamNum = 0; // 记录每个轮次累计处理的reads数量用来计算每个read在整个文件中的索引位置
threadpool thpool = thpool_init(g_gArg.num_threads); // 创建mark dup所需的线程池
thread *writeth = LAUNCH(thread_write, nullptr); // 启动处理结果的的线程
int bamRemainSize = 0; // 上一轮还剩下的bam数量包含已经在任务里的和没有放进任务的
int numReadsForEachJob = 0; // 每个线程处理的read数量第一次读取的时候进行设置
int lastRoundUnProcessed = 0; // 上一轮没有放进任务里的read数量
int curRoundProcessed = 0; // 这一轮放进任务的read数量
while (inBamBuf.ReadStat() >= 0)
{
/* 读取bam文件中的read */
int readNum = inBamBuf.ReadBam();
if (numReadsForEachJob == 0)
numReadsForEachJob = readNum / g_maxJobNum; // 第一次读取bam的时候进行设置
g_bamLoadedNum += readNum;
cout << readNum << endl; // 这一轮读取的bam数量
/* 多线程处理 任务数是线程数的10倍 */
tm_arr[0].acc_start();
curRoundProcessed = 0; // 当前轮次已经处理的reads数量
int numNeedToProcess = inBamBuf.Size() - bamRemainSize + lastRoundUnProcessed; // 当前需要处理的bam数量
for (int i = 0; numNeedToProcess >= numReadsForEachJob; ++i) // 只有待处理的reads数量大于一次任务的数量时新建任务
{
int startIdx = i * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
int endIdx = (i + 1) * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
ThMarkDupArg *thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed,
jobSeq++,
true,
false,
inBamBuf.Slice(startIdx, endIdx)});
POSSESS(g_queueFirstLock); // 加锁
g_qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
RELEASE(g_queueFirstLock); // 解锁
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
curRoundProcessed += endIdx - startIdx;
numNeedToProcess -= numReadsForEachJob;
}
processedBamNum += curRoundProcessed;
lastRoundUnProcessed = numNeedToProcess;
/* 等待可以继续读取的信号 */
POSSESS(g_readyToReadLock);
WAIT_FOR(g_readyToReadLock, TO_BE, 1);
bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
while (bamRemainSize >= inBamBuf.Size() / 2)
{ // 要保留的多于现在有的bam数量的一半那就等待write线程继续处理
TWIST(g_readyToReadLock, TO, 0);
POSSESS(g_readyToReadLock);
WAIT_FOR(g_readyToReadLock, TO_BE, 1);
bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
}
inBamBuf.ClearBeforeIdx(inBamBuf.Size() - bamRemainSize); // 清理掉已经处理完的reads
// cout << g_bamLoadedNum << '\t' << g_bamWritenNum << '\t' << bamRemainSize << '\t' << inBamBuf.Size() << endl;
TWIST(g_readyToReadLock, TO, 0);
}
/* 数据读完了放一个空的任务好让write thread停下来 */
ThMarkDupArg *thArg = nullptr;
if (lastRoundUnProcessed > 0) // 最后一轮还有没有添加进任务的read数据
{
thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed, jobSeq++, false, false,
inBamBuf.Slice(inBamBuf.Size() - lastRoundUnProcessed, inBamBuf.Size())});
processedBamNum += lastRoundUnProcessed;
}
else
{
thArg = new ThMarkDupArg({0, jobSeq++, false, false});
}
POSSESS(g_queueFirstLock); // 加锁
g_qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
RELEASE(g_queueFirstLock); // 解锁
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
/* 同步所有线程 */
thpool_wait(thpool);
thpool_destroy(thpool);
JOIN(writeth);
cout <<"x_all: " << x_all << endl;
cout << "loaded: " << g_bamLoadedNum << endl;
cout << "writen: " << g_bamWritenNum << endl;
cout << "processedBamNum: " << processedBamNum << endl;
/* 标记冗余, 将处理后的结果写入文件 */
/* 关闭文件,收尾清理 */
sam_close(g_outBamFp);
sam_close(inBamFp);
cout << "read ends size: " << sizeof(ReadEnds) << endl;
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
Timer::log_time("程序结束");
return 0;
}

View File

@ -229,6 +229,13 @@ void MarkDupsArg::parseArgument(int argc,
}
gArg.printArgValue();
printArgValue();
}
/* Print the values of the MarkDuplicates arguments (currently only READ_NAME_REGEX) */
void MarkDupsArg::printArgValue()
{
    const char *readNameRegex = this->READ_NAME_REGEX.c_str();
    printf("--READ_NAME_REGEX = %s\n", readNameRegex);
}
// 打印版本信息
@ -239,7 +246,6 @@ void MarkDupsArg::PrintVersion()
// Release resources, close files, etc.
// NOTE(review): the body is empty, so nothing is released here at the moment.
void MarkDupsArg::Finalize(MarkDupsArg *pMdArg,
vector<AuxVar> *pvAuxVar,
GlobalArg *pGArg)
{
}
@ -256,8 +262,8 @@ void MarkDupsArg::PrintHelp()
"\n"
"Required Arguments:\n"
"\n"
"--INPUT <String> One or more input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n"
" argument must be specified at least once.Required.\n"
"--INPUT <String> One input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n"
" argument must be specified at least once. Required.\n"
"\n"
"--METRICS_FILE <File> File to write duplication metrics to Required.\n"
"\n"

View File

@ -7,6 +7,9 @@ Author : Zhang Zhonghai
Date : 2023/10/23
*/
#ifndef MARKDUPS_ARG_H_
#define MARKDUPS_ARG_H_
#include <string>
#include <vector>
@ -104,15 +107,6 @@ namespace ns_md {
};
}
// Assorted variables used inside a thread
struct AuxVar {
const static int MIN_QSUM_QSCORE = 13;
const static int REF_CONTEXT_PAD = 3;
const static int REFERENCE_HALF_WINDOW_LENGTH = 150;
double contaminantAlternateFraction;
};
/* markduplicate 需要的参数*/
struct MarkDupsArg
{
@ -303,12 +297,15 @@ struct MarkDupsArg
char **argv,
GlobalArg *pGArg);
void printArgValue();
static void PrintHelp();
static void PrintVersion();
// 释放资源,关闭文件等
static void Finalize(MarkDupsArg *pMdArg,
vector<AuxVar> *pvAuxVar,
GlobalArg *pGArg);
};
};
#endif

View File

@ -0,0 +1,115 @@
/*
Description: read ends
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/11/3
*/
#ifndef READ_ENDS_H_
#define READ_ENDS_H_
#include <stdint.h>
/**
 * Small interface that provides access to the physical location information about a cluster.
 * All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
 * non-zero positive integers, x and y coordinates may be negative.
 */
struct PhysicalLocation
{
    /**
     * Small class that provides access to the physical location information about a cluster.
     * All values should be defaulted to -1 if unavailable. Tile should only allow
     * non-zero positive integers, x and y coordinates must be non-negative.
     * This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
     * thus, they do not overflow within a HiSeqX tile.
     */
    int16_t tile = -1;
    int32_t x = -1;
    int32_t y = -1;
};

/* Holds all read-ends information, like ReadEndsForMarkDuplicates in picard */
struct ReadEnds : PhysicalLocation
{
    /* Members taken from ReadEnds */
    /** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
    static constexpr int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
    // int16_t libraryId; // unused, multiple samples are not considered
    /** One of F, R, FF, FR, RR, RF; -1 until it has been set (was left uninitialized before). */
    int8_t orientation = -1;
    int32_t read1ReferenceIndex = -1;
    int32_t read1Coordinate = -1;
    int32_t read2ReferenceIndex = -1;
    int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
    /* Additional information used to detect optical dupes */
    // int16_t readGroup = -1; after alignment a bam file usually has a single read group (normal or tumor)
    /** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
    int8_t orientationForOpticalDuplicates = -1;
    /** A *transient* flag marking this read end as being an optical duplicate. */
    bool isOpticalDuplicate = false;

    /* Members taken from ReadEndsForMarkDuplicates */
    /** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
    int16_t score = 0;
    int64_t read1IndexInFile = -1;
    int64_t read2IndexInFile = -1;
    int64_t duplicateSetSize = -1;

    /* Members of ReadEndsForMarkDuplicatesWithBarcodes (apparently not needed) */
    // int32_t barcode = 0; // primary barcode for this read (and pair)
    // int32_t readOneBarcode = 0; // read one barcode, 0 if not present
    // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired

    /* Member added by zzh */
    int64_t posKey = -1; // key generated from the position: (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos

    /**
     * Combine the strands of the two reads of a pair into the pair orientation.
     *
     * @param read1NegativeStrand  true if read1 is on the reverse strand
     * @param read2NegativeStrand  true if read2 is on the reverse strand
     * @return one of FF, FR, RF, RR
     */
    static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand)
    {
        if (read1NegativeStrand)
            return read2NegativeStrand ? RR : RF;
        return read2NegativeStrand ? FR : FF;
    }

    /**
     * Tell whether two read ends can be duplicates of each other: same read1 reference
     * index, read1 coordinate and orientation, and, when compareRead2 is set, also the
     * same read2 reference index and read2 coordinate.
     */
    static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2)
    {
        bool areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
                             lhs.read1Coordinate == rhs.read1Coordinate &&
                             lhs.orientation == rhs.orientation;
        if (areComparable && compareRead2)
        {
            areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
                            lhs.read2Coordinate == rhs.read2Coordinate;
        }
        return areComparable;
    }

    /* True when the orientation is F (single read on the forward strand) */
    bool IsForwardStrand() const
    {
        return orientation == F;
    }

    /* True when read2ReferenceIndex has been set, i.e. this object is part of a pair */
    bool IsPaired() const
    {
        return read2ReferenceIndex != -1;
    }
};
#endif

View File

@ -0,0 +1,218 @@
/*
Description: parse information out of the read name, such as tile, x, y
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/11/6
*/
#ifndef READ_NAME_PARSER_H_
#define READ_NAME_PARSER_H_
#include "read_ends.h"
#include <common/utils/util.h>
#include <stdint.h>
#include <string>
// #include <regex>
#include <boost/regex.hpp>
// using std::regex;
using boost::cmatch;
using boost::regex;
using std::string;
/**
 * Parses the physical location (tile, x, y) of a cluster out of a read name.
 * All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
 * non-zero positive integers, x and y coordinates may be negative.
 *
 * A parser keeps per-instance state (the last parsed read name, its location and a
 * scratch array), so a single instance must not be used by several threads at once.
 */
struct ReadNameParser
{
    /**
     * The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
     * and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
     * (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
     * which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
     * ignoring any trailing non-digit characters.
     *
     * The default regex is optimized for fast parsing (see getLastThreeFields) by searching for the last
     * three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
     * where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
     * Illumina technology.
     */
    const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";

    string readNameStored = "";               // last read name that was parsed successfully
    PhysicalLocation physicalLocationStored;  // location that belongs to readNameStored
    int tmpLocationFields[3] = {-1, -1, -1};  // for optimization of addLocationInformation
    bool useOptimizedDefaultParsing = true;   // was the regex default?
    string readNameRegex = DEFAULT_READ_NAME_REGEX;
    regex readNamePattern;
    // NOTE: despite its name, `true` means "a warning may still be emitted"; it is set to
    // false once the first warning has been printed so the message appears only once.
    bool warnedAboutRegexNotMatching = true;

    ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
    ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
    /**
     * @param strReadNameRegex  regex used to extract tile/x/y from a read name
     * @param isWarn            whether a (single) warning is printed when a name cannot be parsed
     */
    ReadNameParser(const string &strReadNameRegex, bool isWarn)
    {
        SetReadNameRegex(strReadNameRegex);
        warnedAboutRegexNotMatching = isWarn;
    }

    /* Replace the read name regex; the optimized parser is used only for the default regex */
    void SetReadNameRegex(const string &strReadNameRegex)
    {
        readNameRegex = strReadNameRegex;
        useOptimizedDefaultParsing = (strReadNameRegex == DEFAULT_READ_NAME_REGEX);
        readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
    }

    /**
     * Add the sequencing tile/x/y information of a read to *loc. The result of the last
     * successfully parsed name is cached, so a repeated name is not parsed again.
     *
     * @return true if *loc was filled, false if the read name cannot be parsed
     */
    bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
    {
        if (readName == readNameStored)
        {
            *loc = physicalLocationStored;
            return true;
        }
        if (!ReadLocationInformation(readName, loc))
            return false; // read name cannot be parsed
        readNameStored = readName;
        physicalLocationStored = *loc;
        return true;
    }

    /**
     * Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
     * can be used later to determine optical duplication
     *
     * @param readName the name of the read/cluster
     * @param loc the object to add tile/x/y to
     * @return true if the read name contained the information in parsable form, false otherwise
     */
    bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
    {
        try
        {
            // Optimized version if using the default read name regex (== used on purpose):
            if (useOptimizedDefaultParsing)
            {
                const int fields = getLastThreeFields(readName, ':');
                if (!(fields == 5 || fields == 7))
                {
                    if (warnedAboutRegexNotMatching)
                    {
                        Warn(
                            "Default READ_NAME_REGEX '%s' did not match read name '%s'."
                            "You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
                            "Note that this message will not be emitted again even if other read names do not match the regex.",
                            readNameRegex.c_str(), readName.c_str());
                        warnedAboutRegexNotMatching = false;
                    }
                    return false;
                }
                loc->tile = (int16_t)tmpLocationFields[0];
                loc->x = tmpLocationFields[1];
                loc->y = tmpLocationFields[2];
                return true;
            }
            if (readNameRegex.empty())
                return false;

            // Standard version that will use the regex
            cmatch m;
            if (!boost::regex_match(readName.c_str(), m, readNamePattern))
            {
                if (warnedAboutRegexNotMatching)
                {
                    Warn(
                        "READ_NAME_REGEX '%s' did not match read name '%s'."
                        "Your regex may not be correct. "
                        "Note that this message will not be emitted again even if other read names do not match the regex.",
                        readNameRegex.c_str(), readName.c_str());
                    warnedAboutRegexNotMatching = false;
                }
                return false;
            }
            // Convert all three fields before touching *loc, so a failing conversion
            // cannot leave it half-filled.
            const int tile = std::stoi(m[1].str());
            const int x = std::stoi(m[2].str());
            const int y = std::stoi(m[3].str());
            loc->tile = (int16_t)tile;
            loc->x = x;
            loc->y = y;
            return true;
        }
        // std::stoi reports failures with std::invalid_argument / std::out_of_range, which
        // derive from std::logic_error and were NOT caught by the former
        // catch (std::runtime_error); catching std::exception covers those too.
        catch (const std::exception &e)
        {
            if (warnedAboutRegexNotMatching)
            {
                Warn(
                    "A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
                    readNameRegex.c_str(), readName.c_str(), e.what());
                warnedAboutRegexNotMatching = false;
            }
        }
        // Only reached after an exception: the name was not parsable
        return false;
    }

    /**
     * Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
     * considers only a sequence of digits up until the first non-digit character. The three values are stored in tmpLocationFields.
     *
     * @return the number of fields in the read name, or -1 (with tmpLocationFields set to -1) when there are fewer than three
     * @throws std::invalid_argument / std::out_of_range (from std::stoi) if any of the tokens that should
     *         contain numbers do not start with parsable numbers
     */
    int getLastThreeFields(const string &readName, char delim)
    {
        int tokensIdx = 2; // start at the last token
        int numFields = 0;
        int i, endIdx;
        endIdx = readName.size();
        // find the last three tokens only
        for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
        {
            if (readName.at(i) == delim || 0 == i)
            {
                numFields++;
                const int startIdx = (0 == i) ? 0 : (i + 1);
                tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
                tokensIdx--;
                endIdx = i;
            }
        }
        // continue to find the # of fields
        while (0 <= i)
        {
            if (readName.at(i) == delim || 0 == i)
                numFields++;
            i--;
        }
        if (numFields < 3)
        {
            tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
            numFields = -1;
        }
        return numFields;
    }
};
#endif