2023-10-23 23:07:00 +08:00
|
|
|
|
/*
Description: Marks duplicate reads in a BAM file. Only coordinate-sorted BAM input is handled, and the BAM must contain data from a single sample.

Copyright : All rights reserved by ICT

Author : Zhang Zhonghai
Date : 2023/10/23
*/
|
2023-11-01 10:48:02 +08:00
|
|
|
|
#include "markdups_arg.h"
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// 有太多define冲突,放到最后include
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
#include <common/hts/bam_buf.h>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
#include <common/utils/global_arg.h>
|
|
|
|
|
|
#include <common/utils/thpool.h>
|
|
|
|
|
|
#include <common/utils/timer.h>
|
|
|
|
|
|
#include <common/utils/util.h>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <common/utils/murmur3.h>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
#include <common/utils/yarn.h>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <sam/utils/read_ends.h>
|
|
|
|
|
|
#include <sam/utils/read_name_parser.h>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
|
|
|
|
|
#include <htslib/sam.h>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <htslib/thread_pool.h>
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
|
|
|
|
|
#include <iostream>
|
2023-11-06 12:38:30 +08:00
|
|
|
|
#include <vector>
|
|
|
|
|
|
#include <set>
|
|
|
|
|
|
#include <queue>
|
2023-11-09 21:07:58 +08:00
|
|
|
|
#include <unordered_map>
|
|
|
|
|
|
|
|
|
|
|
|
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
|
|
|
|
|
using namespace std;
using std::cout;

// Two-letter tag "PG" (presumably the SAM "@PG" header record type; not used
// in the visible part of this file -- TODO confirm).
#define SMA_TAG_PG "PG"

// Block size (2 MiB) handed to htslib through HTS_OPT_BLOCK_SIZE.
// Parenthesized so the macro expands as a single value inside any larger
// expression (e.g. `n / BAM_BLOCK_SIZE`).
#define BAM_BLOCK_SIZE (2 * 1024 * 1024)

// Sentinel value meaning "no such index".
#define NO_SUCH_INDEX INT64_MAX
|
|
|
|
|
|
|
|
|
|
|
|
static Timer tm_arr[10]; // timers used for performance measurements

/* Forward declarations */
// Declared with `struct` so the class-key matches the definition further down
// in this file.
struct ThMarkDupArg;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
2023-11-06 12:38:30 +08:00
|
|
|
|
/* 全局本地变量 */
|
2023-11-09 21:07:58 +08:00
|
|
|
|
static queue<ThMarkDupArg *> g_qpThMarkDupArg; // 存放线程变量的队列
|
|
|
|
|
|
static lock_t *g_queueFirstLock = NEW_LOCK(-1); // 队列的第一个任务是否完成
|
|
|
|
|
|
static lock_t *g_readyToReadLock = NEW_LOCK(-1); // 通知主线程是否可以进行下一次读取
|
|
|
|
|
|
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
|
|
|
|
|
static int g_numDuplicateIndices = 0; // 找到的冗余read总数
|
|
|
|
|
|
static samFile *g_outBamFp = nullptr; // 输出文件, sam或者bam格式
|
|
|
|
|
|
static sam_hdr_t *g_outBamHeader; // 输出文件的header
|
|
|
|
|
|
static int g_maxJobNum = 0; // 每次读取新的数据后,新增的任务数量
|
|
|
|
|
|
static int g_jobNumForRead = 0; // 任务数量降到当前值时开始下一轮读取
|
|
|
|
|
|
static volatile int64_t g_bamLoadedNum = 0; // 已经读入的read总数
|
|
|
|
|
|
static volatile int64_t g_bamWritenNum = 0; // 已经处理完,写入输出文件的read总数
|
|
|
|
|
|
static vector<int64_t> g_vDupIdx; // 线程内部计算得出的
|
|
|
|
|
|
static vector<int64_t> g_vOpticalDupIdx;
|
|
|
|
|
|
static set<int64_t> g_sDupIdxLatter;
|
|
|
|
|
|
static set<int64_t> g_sOpticalDupIdxLatter;
|
|
|
|
|
|
|
|
|
|
|
|
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
|
|
|
|
|
static GlobalArg &g_gArg = GlobalArg::Instance();
|
|
|
|
|
|
static MarkDupsArg g_mdArg;
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* 计算read的分数
|
|
|
|
|
|
*/
|
|
|
|
|
|
static int16_t computeDuplicateScore(BamWrap &bw)
|
|
|
|
|
|
{
|
|
|
|
|
|
int16_t score = 0;
|
|
|
|
|
|
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
|
|
|
|
|
|
{
|
|
|
|
|
|
case ns_md::SUM_OF_BASE_QUALITIES:
|
|
|
|
|
|
// two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
|
|
|
|
|
|
// and risk overflow.
|
|
|
|
|
|
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
|
|
|
|
|
|
if (!bw.GetReadUnmappedFlag())
|
|
|
|
|
|
// no need to remember the score since this scoring mechanism is symmetric
|
|
|
|
|
|
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case ns_md::RANDOM:
|
|
|
|
|
|
// The RANDOM score gives the same score to both reads so that they get filtered together.
|
|
|
|
|
|
// it's not critical do use the readName since the scores from both ends get added, but it seem
|
|
|
|
|
|
// to be clearer this way.
|
|
|
|
|
|
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
|
|
|
|
|
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
|
|
|
|
|
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
|
|
|
|
|
|
// not passing filters. We need to stay far from overflow so that when we add the two
|
|
|
|
|
|
// scores from the two read mates we do not overflow since that could cause us to chose a
|
|
|
|
|
|
// failing read-pair instead of a passing one.
|
|
|
|
|
|
score -= INT16_MIN / 4;
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
// make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
|
|
|
|
|
|
// for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
|
|
|
|
|
|
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
|
|
|
|
|
|
|
|
|
|
|
return score;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* Builds a read ends object that represents a single read. 用来表示一个read的特征结构
|
|
|
|
|
|
*/
|
|
|
|
|
|
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto &k = *pKey;
|
|
|
|
|
|
auto &bc = bw.b->core;
|
|
|
|
|
|
k.read1ReferenceIndex = bc.tid;
|
|
|
|
|
|
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
|
|
|
|
|
|
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
|
|
|
|
|
|
k.read1IndexInFile = index;
|
|
|
|
|
|
k.score = computeDuplicateScore(bw);
|
|
|
|
|
|
// Doing this lets the ends object know that it's part of a pair
|
|
|
|
|
|
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
|
|
|
|
|
|
{
|
|
|
|
|
|
k.read2ReferenceIndex = bc.mtid;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Fill in the location information for optical duplicates
|
|
|
|
|
|
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
|
|
|
|
|
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
|
|
|
|
|
// 计算位置key
|
|
|
|
|
|
k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
|
|
|
|
|
|
* quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
|
|
|
|
|
|
* read to the records of the first and second in a pair. This value becomes is used for
|
|
|
|
|
|
* the 'DI' tag.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static void addRepresentativeReadIndex(vector<ReadEnds *> &vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 处理一组pairend的readends,标记冗余 */
|
|
|
|
|
|
static void markDuplicatePairs(vector<ReadEnds *> &vpRe, set<int64_t> *psDupIdx, set<int64_t> *psOpticalDupIdx)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (vpRe.size() < 2) {
|
|
|
|
|
|
if (vpRe.size() == 1)
|
|
|
|
|
|
{
|
|
|
|
|
|
// addSingletonToCount(libraryIdGenerator);
|
|
|
|
|
|
}
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int maxScore = 0;
|
|
|
|
|
|
ReadEnds *pBestRe = nullptr;
|
|
|
|
|
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
|
|
|
|
|
for (auto pe: vpRe) // 找分数最高的readend
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe->score > maxScore || pBestRe == nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
maxScore = pe->score;
|
|
|
|
|
|
pBestRe = pe;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
|
|
|
|
|
{
|
|
|
|
|
|
// trackOpticalDuplicates
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for (auto pe: vpRe) // 对非best read标记冗余
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe != pBestRe) // 非best
|
|
|
|
|
|
{
|
|
|
|
|
|
psDupIdx->insert(pe->read1IndexInFile); // 添加read1
|
|
|
|
|
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
|
|
|
|
|
psDupIdx->insert(pe->read2IndexInFile); // 添加read2
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
|
|
|
|
|
{
|
|
|
|
|
|
addRepresentativeReadIndex(vpRe);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 处理一组非paired的readends,标记冗余 */
|
|
|
|
|
|
static void markDuplicateFragments(vector<ReadEnds *> &vpRe,
|
|
|
|
|
|
bool containsPairs,
|
|
|
|
|
|
set<int64_t> *psDupIdx,
|
|
|
|
|
|
set<int64_t> *psOpticalDupIdx)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (containsPairs)
|
|
|
|
|
|
{
|
|
|
|
|
|
for (auto pe: vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!pe->IsPaired())
|
|
|
|
|
|
{
|
|
|
|
|
|
psDupIdx->insert(pe->read1IndexInFile);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
int maxScore = 0;
|
|
|
|
|
|
ReadEnds *pBest = nullptr;
|
|
|
|
|
|
for (auto pe : vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe->score > maxScore || pBest == nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
maxScore = pe->score;
|
|
|
|
|
|
pBest = pe;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for (auto pe : vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe != pBest)
|
|
|
|
|
|
{
|
|
|
|
|
|
psDupIdx->insert(pe->read1IndexInFile);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
|
|
|
|
|
/* 多线程处理冗余参数结构体 */
|
|
|
|
|
|
struct ThMarkDupArg
|
|
|
|
|
|
{
|
2023-11-09 21:07:58 +08:00
|
|
|
|
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
|
|
|
|
|
long seq; // 当前任务在所有任务的排序
|
|
|
|
|
|
bool more; // 后面还有任务
|
|
|
|
|
|
volatile bool finish; // 当前任务有没有处理完
|
|
|
|
|
|
vector<BamWrap *> vBam; // 存放待处理的bam read
|
|
|
|
|
|
map<int64_t, vector<ReadEnds>> mvPair; // 以冗余位置为索引,保存所有pairend reads
|
|
|
|
|
|
map<int64_t, vector<ReadEnds>> mvFrag; // 保存所有reads,包括pairend
|
|
|
|
|
|
map<int64_t, set<int64_t>> msDupIdx; // 冗余read的索引
|
|
|
|
|
|
map<int64_t, set<int64_t>> msOpticalDupIdx; // optical冗余read的索引
|
|
|
|
|
|
unordered_map<string, ReadEnds> umReadEnds; // 用来寻找pair end
|
2023-11-06 12:38:30 +08:00
|
|
|
|
};
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
2023-10-23 23:07:00 +08:00
|
|
|
|
/*
|
2023-11-06 12:38:30 +08:00
|
|
|
|
* 多线程查找和标记冗余函数
|
2023-10-23 23:07:00 +08:00
|
|
|
|
*/
|
2023-11-09 21:07:58 +08:00
|
|
|
|
void thread_markdups(void *arg, int tid)
|
2023-11-06 12:38:30 +08:00
|
|
|
|
{
|
|
|
|
|
|
auto &p = *(ThMarkDupArg *)arg;
|
|
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
|
|
|
|
|
for (int i = 0; i < p.vBam.size(); ++i) // 循环处理每个read
|
|
|
|
|
|
{
|
|
|
|
|
|
BamWrap *bw = p.vBam[i];
|
|
|
|
|
|
const int64_t bamIdx = p.bamStartIdx + i;
|
|
|
|
|
|
if (bw->GetReadUnmappedFlag())
|
|
|
|
|
|
{
|
|
|
|
|
|
if (bw->b->core.tid == -1)
|
|
|
|
|
|
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
|
|
|
|
|
{
|
|
|
|
|
|
ReadEnds fragEnd;
|
|
|
|
|
|
buildReadEnds(*bw, bamIdx, g_vRnParser[tid], &fragEnd);
|
|
|
|
|
|
p.mvFrag[fragEnd.posKey].push_back(fragEnd); // 添加进frag集合
|
|
|
|
|
|
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
|
|
|
|
|
{
|
|
|
|
|
|
string key = bw->query_name();
|
|
|
|
|
|
if (p.umReadEnds.find(key) == p.umReadEnds.end())
|
|
|
|
|
|
{
|
|
|
|
|
|
p.umReadEnds[key] = fragEnd;
|
|
|
|
|
|
}
|
|
|
|
|
|
else // 找到了pairend
|
|
|
|
|
|
{
|
|
|
|
|
|
auto pairedEnds = p.umReadEnds.at(key);
|
|
|
|
|
|
p.umReadEnds.erase(key); // 删除找到的pairend
|
|
|
|
|
|
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
|
|
|
|
|
const int matesCoordinate = fragEnd.read1Coordinate;
|
|
|
|
|
|
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
|
|
|
|
|
|
// before updating the orientation later.
|
|
|
|
|
|
if (bw->GetFirstOfPairFlag())
|
|
|
|
|
|
{
|
|
|
|
|
|
pairedEnds.orientationForOpticalDuplicates =
|
|
|
|
|
|
ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds::R);
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
pairedEnds.orientationForOpticalDuplicates =
|
|
|
|
|
|
ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, bw->GetReadNegativeStrandFlag());
|
|
|
|
|
|
}
|
|
|
|
|
|
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
|
|
|
|
|
|
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
|
|
|
|
|
|
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
|
|
|
|
|
|
{
|
|
|
|
|
|
pairedEnds.read2ReferenceIndex = matesRefIndex;
|
|
|
|
|
|
pairedEnds.read2Coordinate = matesCoordinate;
|
|
|
|
|
|
pairedEnds.read2IndexInFile = bamIdx;
|
|
|
|
|
|
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
|
|
|
|
|
|
bw->GetReadNegativeStrandFlag());
|
|
|
|
|
|
|
|
|
|
|
|
// if the two read ends are in the same position, pointing in opposite directions,
|
|
|
|
|
|
// the orientation is undefined and the procedure above
|
|
|
|
|
|
// will depend on the order of the reads in the file.
|
|
|
|
|
|
// To avoid this, we set it explicitly (to FR):
|
|
|
|
|
|
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
|
|
|
|
|
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
|
|
|
|
|
pairedEnds.orientation == ReadEnds::RF)
|
|
|
|
|
|
{
|
|
|
|
|
|
pairedEnds.orientation = ReadEnds::FR;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
|
|
|
|
|
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
|
|
|
|
|
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
|
|
|
|
|
|
pairedEnds.read1ReferenceIndex = matesRefIndex;
|
|
|
|
|
|
pairedEnds.read1Coordinate = matesCoordinate;
|
|
|
|
|
|
pairedEnds.read1IndexInFile = bamIdx;
|
|
|
|
|
|
pairedEnds.orientation = ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(),
|
|
|
|
|
|
pairedEnds.orientation == ReadEnds::R);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
pairedEnds.score += computeDuplicateScore(*bw);
|
|
|
|
|
|
p.mvPair[pairedEnds.posKey].push_back(pairedEnds);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
|
|
|
|
|
// 先处理 pair
|
|
|
|
|
|
int dupNum = 0;
|
|
|
|
|
|
vector<ReadEnds *> vRePotentialDup; // 有可能是冗余的reads
|
|
|
|
|
|
for (auto &e : p.mvPair) // 按比对的位置先后进行遍历
|
|
|
|
|
|
{
|
|
|
|
|
|
if (e.second.size() > 1) // 有潜在的冗余
|
|
|
|
|
|
{
|
|
|
|
|
|
vRePotentialDup.clear();
|
|
|
|
|
|
ReadEnds *pReadEnd = nullptr;
|
|
|
|
|
|
for (auto &re : e.second)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))
|
|
|
|
|
|
vRePotentialDup.push_back(&re);
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
|
|
|
|
|
vRePotentialDup.clear();
|
|
|
|
|
|
vRePotentialDup.push_back(&re);
|
|
|
|
|
|
pReadEnd = &re;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// 再处理frag
|
|
|
|
|
|
bool containsPairs = false;
|
|
|
|
|
|
bool containsFrags = false;
|
|
|
|
|
|
for (auto &e : p.mvFrag)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (e.second.size() > 1) // 有潜在的冗余
|
|
|
|
|
|
{
|
|
|
|
|
|
vRePotentialDup.clear();
|
|
|
|
|
|
ReadEnds *pReadEnd = nullptr;
|
|
|
|
|
|
for (auto &re : e.second)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
|
|
|
|
|
{
|
|
|
|
|
|
vRePotentialDup.push_back(&re);
|
|
|
|
|
|
containsPairs = containsPairs || re.IsPaired();
|
|
|
|
|
|
containsFrags = containsFrags || !re.IsPaired();
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
if (vRePotentialDup.size() > 1 && containsFrags)
|
|
|
|
|
|
{
|
|
|
|
|
|
markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
|
|
|
|
|
}
|
|
|
|
|
|
vRePotentialDup.clear();
|
|
|
|
|
|
vRePotentialDup.push_back(&re);
|
|
|
|
|
|
pReadEnd = &re;
|
|
|
|
|
|
containsPairs = re.IsPaired();
|
|
|
|
|
|
containsFrags = !re.IsPaired();
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (vRePotentialDup.size() > 1 && containsFrags) {
|
|
|
|
|
|
markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// cout << tid << '\t' << "dup: " << dupNum << endl;
|
|
|
|
|
|
// cout << tid << " all: no: " << p.vBam.size() << '\t' << p.umReadEnds.size() << endl;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
/* 本段数据处理完成,告诉输出线程 */
|
2023-11-09 21:07:58 +08:00
|
|
|
|
POSSESS(g_queueFirstLock);
|
2023-11-06 12:38:30 +08:00
|
|
|
|
p.finish = true;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// cout << tid << ": process: " << p.seq << endl;
|
|
|
|
|
|
auto front = g_qpThMarkDupArg.front();
|
2023-11-06 12:38:30 +08:00
|
|
|
|
if (front->finish)
|
|
|
|
|
|
{
|
2023-11-09 21:07:58 +08:00
|
|
|
|
TWIST(g_queueFirstLock, TO, front->seq); // 通知写线程,当前队列头部完成的任务
|
2023-11-06 12:38:30 +08:00
|
|
|
|
} else {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
RELEASE(g_queueFirstLock);
|
2023-11-06 12:38:30 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* 多线程将结果写入文件,写之前需要合并相邻线程的未处理的结果
|
|
|
|
|
|
*/
|
|
|
|
|
|
void thread_write(void *)
|
2023-10-23 23:07:00 +08:00
|
|
|
|
{
|
2023-11-06 12:38:30 +08:00
|
|
|
|
bool more = false;
|
|
|
|
|
|
long seq = 0;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
long unPairedNum = 0;
|
|
|
|
|
|
POSSESS(g_queueFirstLock);
|
|
|
|
|
|
WAIT_FOR(g_queueFirstLock, TO_BE, seq++); // 等待首个任务完成
|
|
|
|
|
|
auto lastP = g_qpThMarkDupArg.front(); // 取队首的数据
|
|
|
|
|
|
auto umUnpairedReadEnds = lastP->umReadEnds; // 还未找到pair的read
|
|
|
|
|
|
auto p = lastP;
|
|
|
|
|
|
g_qpThMarkDupArg.pop(); // 删除队首
|
|
|
|
|
|
TWIST(g_queueFirstLock, TO, seq); // 解锁
|
|
|
|
|
|
more = lastP->more; // 是否还有下一个任务
|
2023-11-06 12:38:30 +08:00
|
|
|
|
while (more) // 循环处理,将结果写入文件
|
|
|
|
|
|
{
|
2023-11-09 21:07:58 +08:00
|
|
|
|
POSSESS(g_queueFirstLock);
|
|
|
|
|
|
if (g_qpThMarkDupArg.empty()) // 有可能新任务没来得及添加进队列
|
2023-11-06 12:38:30 +08:00
|
|
|
|
{
|
2023-11-09 21:07:58 +08:00
|
|
|
|
RELEASE(g_queueFirstLock);
|
2023-11-06 12:38:30 +08:00
|
|
|
|
continue;
|
|
|
|
|
|
}
|
2023-11-09 21:07:58 +08:00
|
|
|
|
WAIT_FOR(g_queueFirstLock, TO_BE, seq); // 等待任务完成
|
|
|
|
|
|
p = g_qpThMarkDupArg.front();
|
|
|
|
|
|
if (!p->finish) // 有可能这个任务没有完成,是下边那个TWIST导致进到这里,因为这一段代码可能运行比较快
|
2023-11-06 12:38:30 +08:00
|
|
|
|
{
|
2023-11-09 21:07:58 +08:00
|
|
|
|
TWIST(g_queueFirstLock, TO, -1); // 此时队首任务没完成,-1可以让锁无法进入到这里,避免无效获得锁
|
2023-11-06 12:38:30 +08:00
|
|
|
|
continue;
|
|
|
|
|
|
}
|
2023-11-09 21:07:58 +08:00
|
|
|
|
g_qpThMarkDupArg.pop();
|
|
|
|
|
|
TWIST(g_queueFirstLock, TO, seq + 1);
|
|
|
|
|
|
/* 处理结果数据 */
|
|
|
|
|
|
// cout << "finish: " << seq - 1 << '\t' << "lastIdx: " << p->bamStartIdx+p->vBam.size() << endl;
|
2023-11-01 10:48:02 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
for (auto &e : p->umReadEnds) // 在当前任务中找有没有与上一个任务中没匹配的read,相匹配的pair
|
|
|
|
|
|
{
|
|
|
|
|
|
if (umUnpairedReadEnds.find(e.first) != umUnpairedReadEnds.end())
|
|
|
|
|
|
umUnpairedReadEnds.erase(e.first); // 找到了pair
|
|
|
|
|
|
else
|
|
|
|
|
|
umUnpairedReadEnds.insert(e); // 没有pair,则添加
|
|
|
|
|
|
}
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
/* 更新写入read数量和状态 */
|
|
|
|
|
|
POSSESS(g_readyToReadLock);
|
|
|
|
|
|
g_bamWritenNum += lastP->vBam.size();
|
|
|
|
|
|
// cout << "write: " << g_qpThMarkDupArg.size() << endl;
|
|
|
|
|
|
if (g_qpThMarkDupArg.size() <= g_jobNumForRead)
|
|
|
|
|
|
{
|
|
|
|
|
|
TWIST(g_readyToReadLock, TO, 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
RELEASE(g_readyToReadLock);
|
|
|
|
|
|
}
|
2023-11-06 12:38:30 +08:00
|
|
|
|
/* 准备下一轮循环 */
|
|
|
|
|
|
delete lastP;
|
|
|
|
|
|
more = p->more;
|
|
|
|
|
|
lastP = p;
|
|
|
|
|
|
seq++;
|
|
|
|
|
|
}
|
2023-11-09 21:07:58 +08:00
|
|
|
|
unPairedNum = umUnpairedReadEnds.size();
|
|
|
|
|
|
|
|
|
|
|
|
cout << "Finally unpaired read num: " << unPairedNum << endl;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
|
|
|
|
|
// 处理最后一个数据
|
2023-11-09 21:07:58 +08:00
|
|
|
|
POSSESS(g_readyToReadLock);
|
|
|
|
|
|
g_bamWritenNum += lastP->vBam.size();
|
|
|
|
|
|
TWIST(g_readyToReadLock, TO, 1);
|
|
|
|
|
|
// cout << "last finish: " << seq - 1 << endl;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
pthread_exit(0);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* mark duplicate 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
|
|
|
|
|
*/
|
|
|
|
|
|
int MarkDuplicates(int argc, char *argv[])
|
|
|
|
|
|
{
|
|
|
|
|
|
Timer::log_time("程序开始");
|
|
|
|
|
|
Timer time_all;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
/* 读取命令行参数 */
|
|
|
|
|
|
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
|
|
|
|
|
if (g_gArg.num_threads < 1) // 线程数不能小于1
|
|
|
|
|
|
g_gArg.num_threads = 1;
|
|
|
|
|
|
|
|
|
|
|
|
/* 初始化一些参数和变量*/
|
|
|
|
|
|
g_vRnParser.resize(g_gArg.num_threads);
|
|
|
|
|
|
for (auto &parser : g_vRnParser)
|
|
|
|
|
|
parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息
|
|
|
|
|
|
|
|
|
|
|
|
/* 打开输入bam文件 */
|
|
|
|
|
|
sam_hdr_t *inBamHeader;
|
|
|
|
|
|
samFile *inBamFp;
|
|
|
|
|
|
inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
|
|
|
|
|
|
if (!inBamFp)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("[%s] load sam/bam file failed.\n", __func__);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
inBamHeader = sam_hdr_read(inBamFp); // 读取header
|
|
|
|
|
|
|
|
|
|
|
|
/* 利用线程池对输入输出文件进行读写 */
|
|
|
|
|
|
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
|
|
|
|
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
|
|
|
|
|
htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
|
|
|
|
|
htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
|
|
|
|
|
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("[%d] failed to set up thread pool", __LINE__);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
|
|
|
|
|
|
|
|
|
|
|
/* 初始化输出文件 */
|
|
|
|
|
|
char modeout[12] = "wb";
|
|
|
|
|
|
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
|
|
|
|
|
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
|
|
|
|
|
g_outBamHeader = sam_hdr_dup(inBamHeader);
|
|
|
|
|
|
if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
|
|
|
|
|
|
{
|
|
|
|
|
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
|
|
|
|
|
sam_close(g_outBamFp);
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
|
|
|
|
|
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
|
|
|
|
|
|
|
|
|
|
|
|
// /* 读取缓存初始化 */
|
|
|
|
|
|
BamBufType inBamBuf(g_gArg.use_asyncio);
|
|
|
|
|
|
inBamBuf.Init(inBamFp, inBamHeader, g_gArg.max_mem);
|
|
|
|
|
|
|
|
|
|
|
|
/* 循环读入信息,并处理 */
|
|
|
|
|
|
g_maxJobNum = g_gArg.num_threads * 10;
|
|
|
|
|
|
// g_maxJobNum = g_gArg.num_threads * 3;
|
|
|
|
|
|
g_jobNumForRead = g_gArg.num_threads * 2;
|
|
|
|
|
|
|
|
|
|
|
|
int64_t x_all = 0; // for test
|
|
|
|
|
|
int64_t jobSeq = 0;
|
|
|
|
|
|
int64_t processedBamNum = 0; // 记录每个轮次累计处理的reads数量,用来计算每个read在整个文件中的索引位置
|
|
|
|
|
|
threadpool thpool = thpool_init(g_gArg.num_threads); // 创建mark dup所需的线程池
|
|
|
|
|
|
thread *writeth = LAUNCH(thread_write, nullptr); // 启动处理结果的的线程
|
|
|
|
|
|
int bamRemainSize = 0; // 上一轮还剩下的bam数量,包含已经在任务里的和没有放进任务的
|
|
|
|
|
|
int numReadsForEachJob = 0; // 每个线程处理的read数量,第一次读取的时候进行设置
|
|
|
|
|
|
int lastRoundUnProcessed = 0; // 上一轮没有放进任务里的read数量
|
|
|
|
|
|
int curRoundProcessed = 0; // 这一轮放进任务的read数量
|
|
|
|
|
|
while (inBamBuf.ReadStat() >= 0)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* 读取bam文件中的read */
|
|
|
|
|
|
int readNum = inBamBuf.ReadBam();
|
|
|
|
|
|
if (numReadsForEachJob == 0)
|
|
|
|
|
|
numReadsForEachJob = readNum / g_maxJobNum; // 第一次读取bam的时候进行设置
|
|
|
|
|
|
g_bamLoadedNum += readNum;
|
|
|
|
|
|
|
|
|
|
|
|
cout << readNum << endl; // 这一轮读取的bam数量
|
|
|
|
|
|
|
|
|
|
|
|
/* 多线程处理 任务数是线程数的10倍 */
|
|
|
|
|
|
tm_arr[0].acc_start();
|
|
|
|
|
|
curRoundProcessed = 0; // 当前轮次已经处理的reads数量
|
|
|
|
|
|
int numNeedToProcess = inBamBuf.Size() - bamRemainSize + lastRoundUnProcessed; // 当前需要处理的bam数量
|
|
|
|
|
|
for (int i = 0; numNeedToProcess >= numReadsForEachJob; ++i) // 只有待处理的reads数量大于一次任务的数量时,新建任务
|
|
|
|
|
|
{
|
|
|
|
|
|
int startIdx = i * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
|
|
|
|
|
|
int endIdx = (i + 1) * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
|
|
|
|
|
|
|
|
|
|
|
|
ThMarkDupArg *thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed,
|
|
|
|
|
|
jobSeq++,
|
|
|
|
|
|
true,
|
|
|
|
|
|
false,
|
|
|
|
|
|
inBamBuf.Slice(startIdx, endIdx)});
|
|
|
|
|
|
POSSESS(g_queueFirstLock); // 加锁
|
|
|
|
|
|
g_qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
|
|
|
|
|
|
RELEASE(g_queueFirstLock); // 解锁
|
2023-11-06 12:38:30 +08:00
|
|
|
|
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
|
2023-11-09 21:07:58 +08:00
|
|
|
|
curRoundProcessed += endIdx - startIdx;
|
|
|
|
|
|
numNeedToProcess -= numReadsForEachJob;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
}
|
2023-11-09 21:07:58 +08:00
|
|
|
|
processedBamNum += curRoundProcessed;
|
|
|
|
|
|
lastRoundUnProcessed = numNeedToProcess;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
/* 等待可以继续读取的信号 */
|
|
|
|
|
|
POSSESS(g_readyToReadLock);
|
|
|
|
|
|
WAIT_FOR(g_readyToReadLock, TO_BE, 1);
|
|
|
|
|
|
bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
|
|
|
|
|
|
|
|
|
|
|
|
while (bamRemainSize >= inBamBuf.Size() / 2)
|
|
|
|
|
|
{ // 要保留的多于现在有的bam数量的一半,那就等待write线程继续处理
|
|
|
|
|
|
TWIST(g_readyToReadLock, TO, 0);
|
|
|
|
|
|
POSSESS(g_readyToReadLock);
|
|
|
|
|
|
WAIT_FOR(g_readyToReadLock, TO_BE, 1);
|
|
|
|
|
|
bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
}
|
2023-11-09 21:07:58 +08:00
|
|
|
|
inBamBuf.ClearBeforeIdx(inBamBuf.Size() - bamRemainSize); // 清理掉已经处理完的reads
|
|
|
|
|
|
// cout << g_bamLoadedNum << '\t' << g_bamWritenNum << '\t' << bamRemainSize << '\t' << inBamBuf.Size() << endl;
|
|
|
|
|
|
TWIST(g_readyToReadLock, TO, 0);
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
}
|
|
|
|
|
|
/* 数据读完了,放一个空的任务,好让write thread停下来 */
|
|
|
|
|
|
ThMarkDupArg *thArg = nullptr;
|
|
|
|
|
|
if (lastRoundUnProcessed > 0) // 最后一轮还有没有添加进任务的read数据
|
|
|
|
|
|
{
|
|
|
|
|
|
thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed, jobSeq++, false, false,
|
|
|
|
|
|
inBamBuf.Slice(inBamBuf.Size() - lastRoundUnProcessed, inBamBuf.Size())});
|
|
|
|
|
|
processedBamNum += lastRoundUnProcessed;
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
thArg = new ThMarkDupArg({0, jobSeq++, false, false});
|
|
|
|
|
|
}
|
|
|
|
|
|
POSSESS(g_queueFirstLock); // 加锁
|
|
|
|
|
|
g_qpThMarkDupArg.push(thArg); // 将新任务需要的参数添加到队列
|
|
|
|
|
|
RELEASE(g_queueFirstLock); // 解锁
|
|
|
|
|
|
thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
/* 同步所有线程 */
|
|
|
|
|
|
thpool_wait(thpool);
|
|
|
|
|
|
thpool_destroy(thpool);
|
|
|
|
|
|
JOIN(writeth);
|
2023-11-06 12:38:30 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
cout <<"x_all: " << x_all << endl;
|
|
|
|
|
|
cout << "loaded: " << g_bamLoadedNum << endl;
|
|
|
|
|
|
cout << "writen: " << g_bamWritenNum << endl;
|
|
|
|
|
|
cout << "processedBamNum: " << processedBamNum << endl;
|
|
|
|
|
|
/* 标记冗余, 将处理后的结果写入文件 */
|
|
|
|
|
|
|
|
|
|
|
|
/* 关闭文件,收尾清理 */
|
|
|
|
|
|
sam_close(g_outBamFp);
|
|
|
|
|
|
sam_close(inBamFp);
|
2023-11-01 10:48:02 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
cout << "read ends size: " << sizeof(ReadEnds) << endl;
|
2023-10-23 23:07:00 +08:00
|
|
|
|
|
2023-11-09 21:07:58 +08:00
|
|
|
|
cout << " 总时间: " << time_all.seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
2023-11-06 12:38:30 +08:00
|
|
|
|
Timer::log_time("程序结束");
|
2023-10-23 23:07:00 +08:00
|
|
|
|
return 0;
|
|
|
|
|
|
}
|