在结果中加入了TAG
This commit is contained in:
parent
899d40cbda
commit
4fd267c4db
|
|
@ -1,4 +1,5 @@
|
|||
# for fast-markdup
|
||||
core.*
|
||||
*.sam
|
||||
*.bam
|
||||
*.log
|
||||
|
|
|
|||
|
|
@ -13,8 +13,8 @@
|
|||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||
"args": [
|
||||
"MarkDuplicates",
|
||||
"--INPUT", "~/data/bam/zy_tumor.bam",
|
||||
"--OUTPUT", "/dev/null",
|
||||
"--INPUT", "~/data/bam/1k.sam",
|
||||
"--OUTPUT", "./out.sam",
|
||||
"--METRICS_FILE", "metrics.txt",
|
||||
"--num_threads", "1",
|
||||
"--max_mem", "2G",
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@
|
|||
"queue": "cpp",
|
||||
"stack": "cpp",
|
||||
"__bit_reference": "cpp",
|
||||
"__string": "cpp"
|
||||
"__string": "cpp",
|
||||
"filesystem": "cpp"
|
||||
}
|
||||
}
|
||||
10
run.sh
10
run.sh
|
|
@ -1,18 +1,22 @@
|
|||
#input=~/data/bam/zy_normal.bam
|
||||
#input=~/data/bam/zy_tumor.bam
|
||||
#input=~/data/bam/100w.bam
|
||||
input=~/data/bam/1kw.sam
|
||||
input=~/data/bam/1k.sam
|
||||
#input=~/data/bam/1kw.sam
|
||||
#input=~/data/bam/1kw.bam
|
||||
#input=~/data/bam/n1kw.sam
|
||||
|
||||
cd ./build/ && make -j 8 && cd ..
|
||||
#./out.sam \
|
||||
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
||||
MarkDuplicates \
|
||||
--INPUT $input \
|
||||
--OUTPUT /dev/null \
|
||||
--OUTPUT ./out.sam \
|
||||
--INDEX_FORMAT BAI \
|
||||
--num_threads 1 \
|
||||
--max_mem 2G \
|
||||
--verbosity DEBUG \
|
||||
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null
|
||||
#--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ""
|
||||
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||
#--READ_NAME_REGEX ""
|
||||
|
|
|
|||
|
|
@ -355,6 +355,14 @@ struct BamWrap {
|
|||
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
|
||||
return (int32_t)(global_pos & mask);
|
||||
}
|
||||
|
||||
// 设置是否冗余的标记
|
||||
void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }
|
||||
|
||||
void setFlag(bool flag, int bit) {
|
||||
if (flag) this->b->core.flag |= bit;
|
||||
else this->b->core.flag &= ~bit;
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::map<const std::string, std::vector<BamWrap *>> SampleBamMap;
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
*/
|
||||
|
||||
#include "global_arg.h"
|
||||
#include "util.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
|
@ -101,3 +102,23 @@ void GlobalArg::parseArgument(int argNum) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
string GlobalArg::GetArgValueString() {
|
||||
printf("--INPUT = %s\n", in_fn.c_str());
|
||||
printf("--OUTPUT = %s\n", out_fn.c_str());
|
||||
printf("--num_threads = %d\n", num_threads);
|
||||
printf("--max_mem = %ld\n", max_mem);
|
||||
printf("--verbosity = %d\n", verbosity);
|
||||
printf("--asyncio = %d\n", use_asyncio);
|
||||
|
||||
vector<string> argVals;
|
||||
argVals.push_back("--INPUT"); argVals.push_back(in_fn);
|
||||
argVals.push_back("--OUTPUT"); argVals.push_back(out_fn);
|
||||
argVals.push_back("--num_threads"); argVals.push_back(StringUtil::Format(num_threads));
|
||||
argVals.push_back("--asyncio"); argVals.push_back(use_asyncio ? "true" : "false");
|
||||
|
||||
string argValStr;
|
||||
StringUtil::Join(argVals, argValStr, ' ');
|
||||
cout << argValStr << endl;
|
||||
return argValStr;
|
||||
}
|
||||
|
|
@ -89,6 +89,8 @@ struct GlobalArg {
|
|||
printf("--asyncio = %d\n", use_asyncio);
|
||||
}
|
||||
|
||||
string GetArgValueString();
|
||||
|
||||
private:
|
||||
GlobalArg() { initGlobalOptions(); };
|
||||
};
|
||||
|
|
|
|||
|
|
@ -34,6 +34,13 @@ void Warn(const char *format, ...);
|
|||
void Info(const char *format, ...);
|
||||
void Debug(const char *format, ...);
|
||||
|
||||
// 字节缓冲区
|
||||
struct ByteBuf {
|
||||
uint8_t *buf = nullptr;
|
||||
int size = 0; // 当前使用量
|
||||
int capacity = 0; // 最大容量
|
||||
};
|
||||
|
||||
/*
|
||||
* StringUtil 用来合并字符串等操作
|
||||
*/
|
||||
|
|
@ -74,6 +81,17 @@ struct StringUtil {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//static void StringToBytes(const string &str, ByteBuf &buf) {
|
||||
// if (str.size() > buf.capacity) {
|
||||
// buf.capacity = str.size() * 2;
|
||||
// buf.buf = (uint8_t*)realloc(buf.buf, buf.capacity);
|
||||
// }
|
||||
// buf.size = str.size();
|
||||
// for (int i = 0; i < str.size(); ++i) {
|
||||
// buf.buf[i] = (uint8_t)str[i] + 64;
|
||||
// }
|
||||
//}
|
||||
};
|
||||
|
||||
// 二进制读写相关
|
||||
|
|
|
|||
|
|
@ -2,8 +2,11 @@
|
|||
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include "serial_md.h"
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
/*
|
||||
* 标记冗余过程中的一些数据统计
|
||||
|
|
@ -54,17 +57,11 @@ struct DuplicationMetrics {
|
|||
|
||||
// 其他的统计数据
|
||||
|
||||
// addSingletonToCount需要记录的数据
|
||||
uint64_t DuplicateCountHist = 0;
|
||||
uint64_t NonOpticalDuplicateCountHist = 0;
|
||||
// 比如在该位置,有4个冗余,那么所有4个冗余的位置数量
|
||||
MDMap DuplicateCountHist;
|
||||
MDMap NonOpticalDuplicateCountHist;
|
||||
MDMap OpticalDuplicatesCountHist;
|
||||
|
||||
// track optical duplicates 需要记录的数据
|
||||
uint64_t OpticalDuplicatesCountHist = 0;
|
||||
uint64_t OpticalDuplicatesByLibraryId = 0;
|
||||
|
||||
// 统计相关的函数
|
||||
void AddSingletonToCount() {
|
||||
++this->DuplicateCountHist;
|
||||
++this->NonOpticalDuplicateCountHist;
|
||||
}
|
||||
// 没有冗余的位置总数量
|
||||
MDSet<int64_t> singletonReads;
|
||||
};
|
||||
|
|
@ -25,6 +25,7 @@ Date : 2023/10/23
|
|||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "dup_metrics.h"
|
||||
#include "markdups_arg.h"
|
||||
|
|
@ -35,6 +36,7 @@ Date : 2023/10/23
|
|||
|
||||
using namespace std;
|
||||
using std::cout;
|
||||
using std::string;
|
||||
|
||||
#define SMA_TAG_PG "PG"
|
||||
|
||||
|
|
@ -56,11 +58,67 @@ MarkDupsArg &g_mdArg = g_mdArg_;
|
|||
static GlobalDataArg gData_;
|
||||
GlobalDataArg &gData = gData_;
|
||||
DuplicationMetrics gMetrics_;
|
||||
DuplicationMetrics &gMetrics = gMetrics_;
|
||||
DuplicationMetrics &gMetrics = gMetrics_; // 记录统计信息
|
||||
|
||||
/* 程序版本等信息 */
|
||||
const char *PROGRAM_GROUP_VERSION = "3.2.0";
|
||||
|
||||
// 调试信息
|
||||
int zzhtestnum = 0;
|
||||
set<int64_t> zzhopticalSet;
|
||||
vector<int64_t> zzhopticalArr;
|
||||
int64_t bam_num1=0, bam_num2=0;
|
||||
/**
|
||||
* Estimates the size of a library based on the number of paired end molecules observed
|
||||
* and the number of unique pairs observed.
|
||||
* <p>
|
||||
* Based on the Lander-Waterman equation that states:
|
||||
* C/X = 1 - exp( -N/X )
|
||||
* where
|
||||
* X = number of distinct molecules in library
|
||||
* N = number of read pairs
|
||||
* C = number of distinct fragments observed in read pairs
|
||||
*/
|
||||
static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) {
|
||||
int64_t librarySize = 0;
|
||||
int64_t readPairDuplicates = readPairs - uniqueReadPairs;
|
||||
auto f = [](double x, double c, double n) { return c / x - 1 + exp(-n / x); };
|
||||
if (readPairs > 0 && readPairDuplicates > 0) {
|
||||
double m = 1.0;
|
||||
double M = 100.0;
|
||||
if (uniqueReadPairs >= readPairs || f(m * uniqueReadPairs, uniqueReadPairs, readPairs) < 0) {
|
||||
cerr << "Invalid values for pairs and unique pairs: " << readPairs << ", " << uniqueReadPairs << endl;
|
||||
return 0;
|
||||
}
|
||||
// find value of M, large enough to act as other side for bisection method
|
||||
while (f(M * uniqueReadPairs, uniqueReadPairs, readPairs) > 0) {
|
||||
M *= 10.0;
|
||||
}
|
||||
// use bisection method (no more than 40 times) to find solution
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
double r = (m + M) / 2.0;
|
||||
double u = f(r * uniqueReadPairs, uniqueReadPairs, readPairs);
|
||||
if (u == 0)
|
||||
break;
|
||||
else if (u > 0)
|
||||
m = r;
|
||||
else if (u < 0)
|
||||
M = r;
|
||||
}
|
||||
return uniqueReadPairs * (m + M) / 2.0;
|
||||
}
|
||||
return librarySize;
|
||||
}
|
||||
|
||||
/*
|
||||
* 获取文件名后缀
|
||||
*/
|
||||
string getFileExtension(const string &filename) {
|
||||
auto last_dot = filename.find_last_of('.');
|
||||
if (last_dot == string::npos) {
|
||||
return "";
|
||||
}
|
||||
return filename.substr(last_dot + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* mark duplicate
|
||||
* 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||
|
|
@ -109,6 +167,10 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
|
||||
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
|
||||
g_outBamHeader = sam_hdr_dup(g_inBamHeader);
|
||||
// 修改输出文件的header
|
||||
sam_hdr_add_line(g_outBamHeader, "PG", "ID", g_mdArg.PROGRAM_RECORD_ID.c_str(), "VN", PROGRAM_GROUP_VERSION, "CL",
|
||||
(g_mdArg.PROGRAM_RECORD_ID + " " + g_gArg.GetArgValueString() + " " + g_mdArg.GetArgValueString()).c_str(), NULL);
|
||||
|
||||
// 用同样的线程池处理输出文件
|
||||
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||
|
|
@ -139,6 +201,11 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
// 输出index文件
|
||||
// string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
|
||||
string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式
|
||||
if ("sam" == getFileExtension(g_gArg.out_fn)) {
|
||||
indexFn = "";
|
||||
}
|
||||
|
||||
if (!indexFn.empty()) {
|
||||
int index_min_shift = 0;
|
||||
if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) {
|
||||
indexFn = g_gArg.out_fn + ".csi";
|
||||
|
|
@ -150,33 +217,122 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
// 读取输入文件
|
||||
// BamBufType inBuf(false);
|
||||
BamBufType inBuf(g_gArg.use_asyncio);
|
||||
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||
DupIdxQueue<DupInfo> dupIdxQue, repIdxQue;
|
||||
DupIdxQueue<int64_t> opticalIdxQue;
|
||||
dupIdxQue.Init(&gData.dupIdxArr);
|
||||
repIdxQue.Init(&gData.repIdxArr);
|
||||
opticalIdxQue.Init(&gData.opticalDupIdxArr);
|
||||
Timer tw;
|
||||
cout << "dupsize: " << dupIdxQue.Size() << endl;
|
||||
uint64_t bamIdx = 0;
|
||||
DupInfo dupIdx = dupIdxQue.Pop();
|
||||
DupInfo repIdx = repIdxQue.Pop();
|
||||
exit(0);
|
||||
uint64_t opticalIdx = opticalIdxQue.Pop();
|
||||
ByteBuf bytes;
|
||||
bam1_t *bp = bam_init1();
|
||||
bool isDup = false;
|
||||
bool isOpticalDup = false;
|
||||
bool isInDuplicateSet = false;
|
||||
uint32_t representativeReadIndexInFile = 0;
|
||||
uint32_t duplicateSetSize = 0;
|
||||
// exit(0);
|
||||
while (inBuf.ReadStat() >= 0) {
|
||||
Timer tw1;
|
||||
size_t readNum = inBuf.ReadBam();
|
||||
// cout << "read: " << readNum << endl;
|
||||
for (size_t i = 0; i < inBuf.Size(); ++i) {
|
||||
BamWrap *bw = inBuf[i];
|
||||
if (bam_copy1(bp, bw->b) == nullptr) {
|
||||
Error("Can not copy sam record!");
|
||||
return -1;
|
||||
}
|
||||
bw->b = bp;
|
||||
isDup = false;
|
||||
isOpticalDup = false;
|
||||
isInDuplicateSet = false;
|
||||
// 删除原来的duplicate tag
|
||||
if (g_mdArg.CLEAR_DT) {
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str());
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
}
|
||||
++bam_num2;
|
||||
// 统计信息
|
||||
if (bw->GetReadUnmappedFlag()) {
|
||||
++gMetrics.UNMAPPED_READS;
|
||||
} else if (bw->IsSecondaryOrSupplementary()) {
|
||||
++gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS;
|
||||
} else if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) {
|
||||
++gMetrics.UNPAIRED_READS_EXAMINED;
|
||||
} else {
|
||||
++gMetrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
|
||||
}
|
||||
|
||||
/* 判断是否冗余 */
|
||||
if (bamIdx == dupIdx) {
|
||||
// if (dupIdx.dupSet != 0)
|
||||
isDup = true;
|
||||
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) {
|
||||
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
|
||||
isInDuplicateSet = true;
|
||||
representativeReadIndexInFile = dupIdx.repIdx;
|
||||
duplicateSetSize = dupIdx.dupSet;
|
||||
}
|
||||
// 为了防止小内存运行的时候,有重复的dupidx,这时候dup的repIdx和dupSetSize可能会有不同
|
||||
while ((dupIdx = dupIdxQue.Pop()) == bamIdx);
|
||||
if (opticalIdx == bamIdx)
|
||||
isOpticalDup = true;
|
||||
else if (opticalIdx < bamIdx) {
|
||||
while ((opticalIdx = opticalIdxQue.Pop()) < bamIdx);
|
||||
if (opticalIdx == bamIdx)
|
||||
isOpticalDup = true;
|
||||
}
|
||||
// 添加冗余标识
|
||||
bw->SetDuplicateReadFlag(true);
|
||||
// if (isOpticalDup)
|
||||
// cout << bamIdx << " optical" << endl;
|
||||
// else
|
||||
// cout << bamIdx << " not opt" << endl;
|
||||
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str());
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
if (isOpticalDup)
|
||||
bam_aux_append(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str(), 'Z',
|
||||
g_mdArg.DUPLICATE_TYPE_SEQUENCING.size() + 1,
|
||||
(const uint8_t *)g_mdArg.DUPLICATE_TYPE_SEQUENCING.c_str());
|
||||
else
|
||||
bam_aux_append(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str(), 'Z',
|
||||
g_mdArg.DUPLICATE_TYPE_LIBRARY.size() + 1,
|
||||
(const uint8_t *)g_mdArg.DUPLICATE_TYPE_LIBRARY.c_str());
|
||||
|
||||
// cout << bamIdx << endl;
|
||||
// int ival = 2;
|
||||
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
|
||||
// if (bam_aux_append(bw->b, "DT", 'Z', 3, (const uint8_t*)"LB") < 0) Error("add tag error!");
|
||||
// // if (bam_aux_append(bw->b, "DT", 'i', sizeof(ival), (uint8_t *)&ival) < 0) Error("add tag error!");
|
||||
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
|
||||
|
||||
// 计算统计信息
|
||||
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
|
||||
// Update the duplication metrics
|
||||
if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) {
|
||||
++gMetrics.UNPAIRED_READ_DUPLICATES;
|
||||
} else {
|
||||
++gMetrics.READ_PAIR_DUPLICATES; // will need to be divided by 2 at the end
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bw->SetDuplicateReadFlag(false);
|
||||
}
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent
|
||||
// cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl;
|
||||
isInDuplicateSet = true;
|
||||
representativeReadIndexInFile = repIdx.repIdx;
|
||||
duplicateSetSize = repIdx.dupSet;
|
||||
while (repIdx == bamIdx || repIdx.dupSet == 0) {
|
||||
if (repIdxQue.Size() > 0)
|
||||
repIdx = repIdxQue.Pop();
|
||||
|
|
@ -186,19 +342,82 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
|
||||
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) {
|
||||
// cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str());
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str(), 'i',
|
||||
sizeof(representativeReadIndexInFile), (uint8_t *)&representativeReadIndexInFile);
|
||||
oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str());
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize),
|
||||
(const uint8_t *)&duplicateSetSize);
|
||||
}
|
||||
// 每个read都要写到output,只是添加标识
|
||||
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
|
||||
// if (bamIdx == 922) break;
|
||||
++bamIdx;
|
||||
if (isDup && g_mdArg.REMOVE_DUPLICATES) {
|
||||
continue;
|
||||
}
|
||||
if (isOpticalDup && g_mdArg.REMOVE_SEQUENCING_DUPLICATES) {
|
||||
continue;
|
||||
}
|
||||
if (!g_mdArg.PROGRAM_RECORD_ID.empty() && g_mdArg.ADD_PG_TAG_TO_READS) {
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, "PG");
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
bam_aux_append(bw->b, "PG", 'Z', g_mdArg.PROGRAM_RECORD_ID.size() + 1,
|
||||
(const uint8_t *)g_mdArg.PROGRAM_RECORD_ID.c_str());
|
||||
}
|
||||
if (sam_write1(g_outBamFp, g_outBamHeader, bw->b) < 0) {
|
||||
Error("failed writing sam record to \"%s\"", g_gArg.out_fn.c_str());
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
return -1;
|
||||
}
|
||||
++bamIdx;
|
||||
}
|
||||
inBuf.ClearAll();
|
||||
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
|
||||
}
|
||||
cout << "dupsize: " << dupIdxQue.Size() << endl;
|
||||
if (sam_idx_save(g_outBamFp) < 0) {
|
||||
bam_destroy1(bp);
|
||||
cout << "bam_Num: " << bam_num1 << " : " << bam_num2 << endl;
|
||||
|
||||
// 计算统计信息
|
||||
gMetrics.READ_PAIRS_EXAMINED /= 2;
|
||||
gMetrics.READ_PAIR_DUPLICATES /= 2;
|
||||
for (auto &arr : gData.opticalDupIdxArr) gMetrics.READ_PAIR_OPTICAL_DUPLICATES += arr.size();
|
||||
gMetrics.READ_PAIR_OPTICAL_DUPLICATES = gMetrics.READ_PAIR_OPTICAL_DUPLICATES / 2;
|
||||
gMetrics.ESTIMATED_LIBRARY_SIZE =
|
||||
estimateLibrarySize(gMetrics.READ_PAIRS_EXAMINED - gMetrics.READ_PAIR_OPTICAL_DUPLICATES,
|
||||
gMetrics.READ_PAIRS_EXAMINED - gMetrics.READ_PAIR_DUPLICATES);
|
||||
if (gMetrics.UNPAIRED_READS_EXAMINED + gMetrics.READ_PAIRS_EXAMINED != 0) {
|
||||
gMetrics.PERCENT_DUPLICATION = (gMetrics.UNPAIRED_READ_DUPLICATES + gMetrics.READ_PAIR_DUPLICATES * 2) /
|
||||
(double)(gMetrics.UNPAIRED_READS_EXAMINED + gMetrics.READ_PAIRS_EXAMINED * 2);
|
||||
} else {
|
||||
gMetrics.PERCENT_DUPLICATION = 0;
|
||||
}
|
||||
|
||||
cout << "metrics: \n"
|
||||
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
|
||||
<< "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n"
|
||||
<< "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n"
|
||||
<< "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n"
|
||||
<< "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n"
|
||||
<< "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n"
|
||||
<< "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n"
|
||||
<< "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n"
|
||||
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
|
||||
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
|
||||
|
||||
cout << "singleton pos: " << gMetrics.singletonReads.size() << endl;
|
||||
for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl;
|
||||
for (auto &itr : gMetrics.NonOpticalDuplicateCountHist)
|
||||
cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl;
|
||||
for (auto &itr : gMetrics.OpticalDuplicatesCountHist)
|
||||
cout << "optical dup counts: " << itr.first << ": " << itr.second << endl;
|
||||
cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl;
|
||||
if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) {
|
||||
Error("writing index failed");
|
||||
sam_close(g_outBamFp);
|
||||
sam_close(g_inBamFp);
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ Date : 2023/10/27
|
|||
|
||||
#include "markdups_arg.h"
|
||||
#include "common/utils/global_arg.h"
|
||||
#include "common/utils/util.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
|
@ -65,7 +66,8 @@ const static struct option kMdOpts[] = {
|
|||
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
|
||||
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
|
||||
{"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT},
|
||||
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
|
||||
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE},
|
||||
{"ADD_PG_TAG_TO_READS", required_argument, NULL, ADD_PG_TAG_TO_READS}};
|
||||
|
||||
// 判断bool类型的参数
|
||||
void setBoolArg(bool *arg) {
|
||||
|
|
@ -193,7 +195,8 @@ void MarkDupsArg::parseArgument(int argc,
|
|||
COMMENT.push_back(optarg);
|
||||
break;
|
||||
case ns_md::READ_NAME_REGEX:
|
||||
READ_NAME_REGEX = optarg;
|
||||
if (strcmp("null", optarg) == 0) READ_NAME_REGEX="";
|
||||
else READ_NAME_REGEX = optarg;
|
||||
break;
|
||||
case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE:
|
||||
OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg);
|
||||
|
|
@ -229,6 +232,9 @@ void MarkDupsArg::parseArgument(int argc,
|
|||
case ns_md::CREATE_MD5_FILE:
|
||||
setBoolArg(&CREATE_MD5_FILE);
|
||||
break;
|
||||
case ns_md::ADD_PG_TAG_TO_READS:
|
||||
setBoolArg(&ADD_PG_TAG_TO_READS);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
@ -245,6 +251,18 @@ void MarkDupsArg::printArgValue()
|
|||
printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
|
||||
}
|
||||
|
||||
string MarkDupsArg::GetArgValueString() {
|
||||
vector<string> argVals;
|
||||
argVals.push_back("--READ_NAME_REGEX");
|
||||
if (this->READ_NAME_REGEX.empty()) argVals.push_back("null");
|
||||
else argVals.push_back(this->READ_NAME_REGEX.c_str());
|
||||
argVals.push_back("--INDEX_FORMAT"); argVals.push_back(this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
|
||||
string argValStr;
|
||||
StringUtil::Join(argVals, argValStr, ' ');
|
||||
cout << argValStr << endl;
|
||||
return argValStr;
|
||||
}
|
||||
|
||||
// 打印版本信息
|
||||
void MarkDupsArg::PrintVersion()
|
||||
{
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ enum MarkDupsArgEnum {
|
|||
CREATE_INDEX,
|
||||
INDEX_FORMAT,
|
||||
CREATE_MD5_FILE,
|
||||
ADD_PG_TAG_TO_READS,
|
||||
_END_NUM
|
||||
};
|
||||
|
||||
|
|
@ -294,6 +295,9 @@ struct MarkDupsArg
|
|||
/* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */
|
||||
bool CREATE_MD5_FILE = false;
|
||||
|
||||
/* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
|
||||
bool ADD_PG_TAG_TO_READS = true;
|
||||
|
||||
// mark duplicate参数个数
|
||||
const static int ARG_COUNT = ns_md::MarkDupsArgEnum::_END_NUM - ns_md::MarkDupsArgEnum::_START_NUM - 1;
|
||||
// 解析参数
|
||||
|
|
@ -302,6 +306,7 @@ struct MarkDupsArg
|
|||
GlobalArg *pGArg);
|
||||
|
||||
void printArgValue();
|
||||
string GetArgValueString();
|
||||
|
||||
static void PrintHelp();
|
||||
|
||||
|
|
|
|||
|
|
@ -461,18 +461,9 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
|
|||
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
||||
if (opticalDuplicateFlags[i]) {
|
||||
++opticalDuplicates;
|
||||
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
|
||||
// cout << "val: " << pRe->isOpticalDuplicate << endl;
|
||||
// }
|
||||
pRe->isOpticalDuplicate = true;
|
||||
zzhopticalSet.insert(pRe->read1IndexInFile);
|
||||
zzhopticalSet.insert(pRe->read2IndexInFile);
|
||||
zzhopticalArr.push_back(pRe->read1IndexInFile);
|
||||
zzhopticalArr.push_back(pRe->read2IndexInFile);
|
||||
} else {
|
||||
pRe->isOpticalDuplicate = false;
|
||||
zzhopticalSet.erase(pRe->read1IndexInFile);
|
||||
zzhopticalSet.erase(pRe->read2IndexInFile);
|
||||
}
|
||||
}
|
||||
return opticalDuplicates;
|
||||
|
|
@ -483,11 +474,8 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
|
|||
*/
|
||||
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||
bool hasFR = false, hasRF = false;
|
||||
int prevOpticalDupNum = 0;
|
||||
// Check to see if we have a mixture of FR/RF
|
||||
for (auto pRe : readEndsArr) {
|
||||
if (pRe->isOpticalDuplicate)
|
||||
++prevOpticalDupNum;
|
||||
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||
hasFR = true;
|
||||
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||
|
|
@ -517,17 +505,8 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
|
|||
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
|
||||
}
|
||||
|
||||
// trackDuplicateCounts
|
||||
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
|
||||
if (readEndsArr.size() > nOpticalDup)
|
||||
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
|
||||
if (nOpticalDup)
|
||||
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
|
||||
|
||||
|
||||
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
|
||||
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
|
||||
|
||||
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
|
||||
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
|
||||
// 统计信息,trackDuplicateCounts
|
||||
++gMetrics.DuplicateCountHist[readEndsArr.size()];
|
||||
if (readEndsArr.size() > nOpticalDup) ++gMetrics.NonOpticalDuplicateCountHist[readEndsArr.size() - nOpticalDup];
|
||||
if (nOpticalDup) ++gMetrics.OpticalDuplicatesCountHist[nOpticalDup + 1];
|
||||
}
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
#pragma once
|
||||
|
||||
#include <common/hts/bam_buf.h>
|
||||
#include <robin-map/include/tsl/robin_map.h>
|
||||
#include <robin-map/include/tsl/robin_set.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
using std::set;
|
||||
using std::string;
|
||||
using std::unordered_set;
|
||||
using std::vector;
|
||||
|
||||
/* 存放未匹配readend相同位点的所有readend */
|
||||
struct UnpairedREInfo {
|
||||
int64_t taskSeq;
|
||||
ReadEnds unpairedRE;
|
||||
};
|
||||
|
||||
/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */
|
||||
struct CalcKey {
|
||||
int64_t read1Pos;
|
||||
int64_t read2Pos;
|
||||
bool operator<(const CalcKey &o) const {
|
||||
int comp = (int)(read1Pos - o.read1Pos);
|
||||
if (comp == 0)
|
||||
comp = (int)(read2Pos - o.read2Pos);
|
||||
return comp < 0;
|
||||
}
|
||||
bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }
|
||||
std::size_t operator()(const CalcKey &o) const {
|
||||
return std::hash<int64_t>()(read1Pos) ^ std::hash<int64_t>()(read2Pos);
|
||||
}
|
||||
};
|
||||
|
||||
struct CalcKeyHash {
|
||||
std::size_t operator()(const CalcKey &o) const {
|
||||
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
|
||||
}
|
||||
};
|
||||
|
||||
/* 用来记录冗余索引相关的信息 */
|
||||
struct DupInfo {
|
||||
int64_t idx;
|
||||
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
|
||||
int16_t dupSet = 0; // dup set size
|
||||
|
||||
DupInfo() : DupInfo(-1, 0, 0) {}
|
||||
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}
|
||||
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
|
||||
bool operator<(const DupInfo &o) const { return idx < o.idx; }
|
||||
bool operator>(const DupInfo &o) const { return idx > o.idx; }
|
||||
operator int64_t() const { return idx; }
|
||||
};
|
||||
|
||||
struct DupInfoHash {
|
||||
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
|
||||
};
|
||||
|
||||
struct DupInfoEqual {
|
||||
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
|
||||
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
|
||||
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
|
||||
};
|
||||
|
||||
using MDMap = tsl::robin_map<int64_t, int64_t>;
|
||||
|
||||
template <typename T>
|
||||
// using MDSet = set<T>;
|
||||
// using MDSet = unordered_set<T>;
|
||||
using MDSet = tsl::robin_set<T>;
|
||||
|
||||
template <typename T>
|
||||
// using DPSet = set<T>;
|
||||
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
|
||||
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
|
||||
|
||||
template <typename T>
|
||||
// using CalcSet = set<T>;
|
||||
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
|
||||
|
||||
using ReadEndsSet = tsl::robin_set<ReadEnds, ReadEndsHash, ReadEndsEqual>;
|
||||
|
||||
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
||||
struct TaskSeqDupInfo {
|
||||
DPSet<DupInfo> dupIdx;
|
||||
MDSet<int64_t> opticalDupIdx;
|
||||
DPSet<DupInfo> repIdx;
|
||||
MDSet<int64_t> notDupIdx;
|
||||
MDSet<int64_t> notOpticalDupIdx;
|
||||
MDSet<int64_t> notRepIdx;
|
||||
};
|
||||
|
||||
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
||||
struct UnpairedPosInfo {
|
||||
int unpairedNum = 0;
|
||||
int64_t taskSeq;
|
||||
vector<ReadEnds> pairArr;
|
||||
MDSet<string> readNameSet;
|
||||
};
|
||||
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
||||
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
|
||||
|
||||
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
||||
typedef tsl::robin_map<int64_t, UnpairedPosInfo>
|
||||
UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||||
|
||||
/* 单线程处理冗余参数结构体 */
|
||||
struct MarkDupDataArg {
|
||||
int64_t taskSeq; // 任务序号
|
||||
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||||
vector<BamWrap *> bams; // 存放待处理的bam read
|
||||
vector<ReadEnds> pairs; // 成对的reads
|
||||
vector<ReadEnds> frags; // 暂未找到配对的reads
|
||||
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
|
||||
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
||||
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
|
||||
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
|
||||
MDSet<int64_t> pairSingletonIdxArr; // 某位置只有一对read的所有read pair个数
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
||||
};
|
||||
|
||||
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
||||
struct GlobalDataArg {
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr;
|
||||
|
||||
// 每个task对应一个vector
|
||||
vector<vector<DupInfo>> dupIdxArr;
|
||||
vector<vector<int64_t>> opticalDupIdxArr;
|
||||
vector<vector<DupInfo>> repIdxArr;
|
||||
vector<vector<int64_t>> singletonIdxArr;
|
||||
|
||||
// 用来存放后续计算的数据
|
||||
vector<DPSet<DupInfo>> latterDupIdxArr;
|
||||
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
|
||||
vector<DPSet<DupInfo>> latterRepIdxArr;
|
||||
vector<MDSet<int64_t>> latterSingletonIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotDupIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotRepIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotSingletonIdxArr;
|
||||
};
|
||||
|
|
@ -68,13 +68,13 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
|
|||
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) {
|
||||
if (vpRe.size() < 2) {
|
||||
if (vpRe.size() == 1) {
|
||||
// addSingletonToCount(libraryIdGenerator);
|
||||
// 这个统计可能会有误差,因为当前位点可能还有没匹配上的read,导致当前位点的read(paired)数量为1
|
||||
// 可以通过后续的补充计算来解决这个问题,有必要么?
|
||||
gMetrics.AddSingletonToCount();
|
||||
// 可以通过后续的补充计算来解决这个问题,有必要么?好像没必要
|
||||
gMetrics.singletonReads.insert(vpRe[0]->read1IndexInFile);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
// bool print = false;
|
||||
|
|
@ -109,6 +109,7 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
|
|||
notDupIdx->insert(pBest->read2IndexInFile);
|
||||
}
|
||||
// if (false) {
|
||||
// if (true) {
|
||||
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
|
||||
// trackOpticalDuplicates
|
||||
vector<const ReadEnds *> prevOpticalRe;
|
||||
|
|
@ -196,7 +197,7 @@ static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnd
|
|||
}
|
||||
|
||||
/* 单线程生成readends (第一步)*/
|
||||
static void generateReadEnds(SerailMarkDupArg *arg) {
|
||||
static void generateReadEnds(MarkDupDataArg *arg) {
|
||||
auto &p = *arg;
|
||||
auto &rnParser = g_vRnParser[0];
|
||||
|
||||
|
|
@ -205,12 +206,14 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
|
|||
p.unpairedDic.clear();
|
||||
p.unpairedPosArr.clear();
|
||||
|
||||
// bam_num1 += p.bams.size();
|
||||
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
||||
// MDSet<ReadEnds> reSet;
|
||||
// ReadEnds lastRe;
|
||||
|
||||
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
|
||||
for (size_t i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
|
||||
BamWrap *bw = p.bams[i];
|
||||
// ++bam_num1;
|
||||
const int64_t bamIdx = p.bamStartIdx + i;
|
||||
if (bw->GetReadUnmappedFlag()) {
|
||||
if (bw->b->core.tid == -1)
|
||||
|
|
@ -308,7 +311,7 @@ static void processFrags(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
|
|||
}
|
||||
|
||||
/* 单线程markdup (第二步)*/
|
||||
static void markdups(SerailMarkDupArg *arg) {
|
||||
static void markdups(MarkDupDataArg *arg) {
|
||||
auto &p = *arg;
|
||||
p.pairDupIdx.clear();
|
||||
p.pairOpticalDupIdx.clear();
|
||||
|
|
@ -354,8 +357,8 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
|
|||
}
|
||||
|
||||
/* 将frags重叠部分的dup idx放进数据中 */
|
||||
static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> ¬DupIdx, SerailMarkDupArg *lastArg,
|
||||
SerailMarkDupArg *curArg) {
|
||||
static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> ¬DupIdx, MarkDupDataArg *lastArg,
|
||||
MarkDupDataArg *curArg) {
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
for (auto idx : dupIdx) {
|
||||
|
|
@ -371,7 +374,7 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> ¬
|
|||
/* 将pairs重叠部分的dup idx放进数据中 */
|
||||
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
|
||||
MDSet<int64_t> ¬DupIdx, MDSet<int64_t> ¬OpticalDupIdx,
|
||||
MDSet<int64_t> ¬RepIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
|
||||
MDSet<int64_t> ¬RepIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) {
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
for (auto idx : dupIdx) {
|
||||
|
|
@ -466,7 +469,7 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> ¬DupI
|
|||
}
|
||||
|
||||
/* 处理相邻的两个任务,有相交叉的数据 */
|
||||
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) {
|
||||
static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg, GlobalDataArg *gDataArg) {
|
||||
auto &lp = *lastArg;
|
||||
auto &p = *curArg;
|
||||
auto &g = *gDataArg;
|
||||
|
|
@ -745,7 +748,7 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
|||
}
|
||||
|
||||
/* 当所有任务结束后,global data里还有未处理的数据 */
|
||||
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
||||
static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) {
|
||||
// cout << "last task start" << endl;
|
||||
auto &lp = *task;
|
||||
auto &g = *gDataArg;
|
||||
|
|
@ -768,6 +771,7 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
|||
g.unpairedDic.insert(prevUnpair); // 用来记录没有匹配的read个数
|
||||
}
|
||||
}
|
||||
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
for (auto &e : g.unpairedPosArr) {
|
||||
auto posKey = e.first;
|
||||
|
|
@ -779,6 +783,8 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
|||
std::sort(arr.begin(), arr.end());
|
||||
// cout << "last task before mark pair" << endl;
|
||||
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notRepIdx);
|
||||
} else if (arr.size() == 1) {
|
||||
gMetrics.singletonReads.insert(arr[0].read1IndexInFile);
|
||||
}
|
||||
}
|
||||
// 更新结果
|
||||
|
|
@ -796,7 +802,10 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
|||
|
||||
// cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||
g.unpairedPosArr.clear();
|
||||
g.unpairedDic.clear();
|
||||
|
||||
// for (auto &rn : g.unpairedDic) {
|
||||
// cout << rn.first << endl;
|
||||
// }
|
||||
|
||||
/*
|
||||
int taskSeq = 0;
|
||||
|
|
@ -888,12 +897,15 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
|||
std::sort(vRepIdx.begin(), vRepIdx.end());
|
||||
}
|
||||
|
||||
void calculateMetrics(SerailMarkDupArg &lp, SerailMarkDupArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) {
|
||||
void calculateMetrics(MarkDupDataArg &lp, MarkDupDataArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) {
|
||||
DuplicationMetrics &gMetrics = *pgMetrics;
|
||||
|
||||
// gMetrics.READ_PAIRS_EXAMINED /= 2;
|
||||
|
||||
cout << "calculateMetrics start: " << endl;
|
||||
|
||||
cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl;
|
||||
|
||||
cout << "all: " << (lp.unpairedDic.size() + p.unpairedDic.size() + g.unpairedDic.size()) << endl;
|
||||
cout << "calculateMetrics end" << endl;
|
||||
}
|
||||
|
||||
|
|
@ -908,9 +920,9 @@ void serialMarkDups() {
|
|||
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
|
||||
int64_t processedBamNum = 0;
|
||||
|
||||
SerailMarkDupArg smdArg1, smdArg2;
|
||||
SerailMarkDupArg *lastArgP = &smdArg1;
|
||||
SerailMarkDupArg *curArgP = &smdArg2;
|
||||
MarkDupDataArg smdArg1, smdArg2;
|
||||
MarkDupDataArg *lastArgP = &smdArg1;
|
||||
MarkDupDataArg *curArgP = &smdArg2;
|
||||
|
||||
bool isFirstRound = true;
|
||||
int roundNum = 0;
|
||||
|
|
@ -921,6 +933,7 @@ void serialMarkDups() {
|
|||
tm_arr[4].acc_start();
|
||||
size_t readNum = inBamBuf.ReadBam();
|
||||
readNumSum += readNum;
|
||||
// readNumSum += inBamBuf.GetBamArr().size();
|
||||
tm_arr[4].acc_end();
|
||||
cout << "read num: " << readNum << '\t' << roundNum << endl;
|
||||
// lastArgP = curArgP;
|
||||
|
|
@ -974,7 +987,7 @@ void serialMarkDups() {
|
|||
handleLastTask(lastArgP, &gData);
|
||||
|
||||
// 计算各种统计指标
|
||||
calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics);
|
||||
// calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics);
|
||||
|
||||
// cout << "here 2" << endl;
|
||||
tm_arr[3].acc_end();
|
||||
|
|
@ -987,7 +1000,7 @@ void serialMarkDups() {
|
|||
map<int64_t, int> dup;
|
||||
|
||||
int taskSeq = 0;
|
||||
for (auto &arr : gData.dupIdxArr) {
|
||||
/* for (auto &arr : gData.dupIdxArr) {
|
||||
for (auto idx : arr) {
|
||||
if (dup.find(idx.idx) != dup.end()) {
|
||||
// cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
|
||||
|
|
@ -998,7 +1011,7 @@ void serialMarkDups() {
|
|||
// cout << taskSeq << "\t" << arr.size() << endl;
|
||||
taskSeq++;
|
||||
}
|
||||
|
||||
*/
|
||||
// #include <fstream>
|
||||
// ofstream out("tumor_dup.txt");
|
||||
// for (auto idx : dup)
|
||||
|
|
@ -1023,23 +1036,11 @@ void serialMarkDups() {
|
|||
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||
cout << "optical dup: " << zzhopticalSet.size() << endl;
|
||||
cout << "optical arr dup: " << zzhopticalArr.size() << endl;
|
||||
cout << "optical size: " << opticalDupNum << endl;
|
||||
|
||||
cout << "metrics: \n"
|
||||
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
|
||||
<< "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n"
|
||||
<< "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n"
|
||||
<< "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n"
|
||||
<< "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n"
|
||||
<< "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n"
|
||||
<< "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n"
|
||||
<< "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n"
|
||||
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
|
||||
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
|
||||
cout << "singleton size: " << gMetrics.singletonReads.size() << endl;
|
||||
|
||||
Timer::log_time("serial end ");
|
||||
cout << "read num sum: " << readNumSum << endl;
|
||||
|
||||
// for (auto i : gData.dupArr)
|
||||
// cout << i << endl;
|
||||
|
|
|
|||
|
|
@ -1,143 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <common/hts/bam_buf.h>
|
||||
#include <robin-map/include/tsl/robin_map.h>
|
||||
#include <robin-map/include/tsl/robin_set.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
using std::set;
|
||||
using std::string;
|
||||
using std::unordered_set;
|
||||
using std::vector;
|
||||
|
||||
/* 存放未匹配readend相同位点的所有readend */
|
||||
struct UnpairedREInfo {
|
||||
int64_t taskSeq;
|
||||
ReadEnds unpairedRE;
|
||||
};
|
||||
|
||||
/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */
|
||||
struct CalcKey {
|
||||
int64_t read1Pos;
|
||||
int64_t read2Pos;
|
||||
bool operator<(const CalcKey &o) const {
|
||||
int comp = (int)(read1Pos - o.read1Pos);
|
||||
if (comp == 0)
|
||||
comp = (int)(read2Pos - o.read2Pos);
|
||||
return comp < 0;
|
||||
}
|
||||
bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }
|
||||
std::size_t operator()(const CalcKey &o) const {
|
||||
return std::hash<int64_t>()(read1Pos) ^ std::hash<int64_t>()(read2Pos);
|
||||
}
|
||||
};
|
||||
|
||||
struct CalcKeyHash {
|
||||
std::size_t operator()(const CalcKey &o) const {
|
||||
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
|
||||
}
|
||||
};
|
||||
|
||||
/* 用来记录冗余索引相关的信息 */
|
||||
struct DupInfo {
|
||||
int64_t idx;
|
||||
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
|
||||
int16_t dupSet = 0; // dup set size
|
||||
|
||||
DupInfo() : DupInfo(-1, 0, 0) {}
|
||||
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}
|
||||
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
|
||||
bool operator<(const DupInfo &o) const { return idx < o.idx; }
|
||||
bool operator>(const DupInfo &o) const { return idx > o.idx; }
|
||||
operator int64_t() const { return idx; }
|
||||
};
|
||||
|
||||
struct DupInfoHash {
|
||||
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
|
||||
};
|
||||
|
||||
struct DupInfoEqual {
|
||||
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
|
||||
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
|
||||
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
// using MDSet = set<T>;
|
||||
// using MDSet = unordered_set<T>;
|
||||
using MDSet = tsl::robin_set<T>;
|
||||
|
||||
template <typename T>
|
||||
// using DPSet = set<T>;
|
||||
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
|
||||
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
|
||||
|
||||
template <typename T>
|
||||
// using CalcSet = set<T>;
|
||||
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
|
||||
|
||||
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
||||
struct TaskSeqDupInfo {
|
||||
DPSet<DupInfo> dupIdx;
|
||||
MDSet<int64_t> opticalDupIdx;
|
||||
DPSet<DupInfo> repIdx;
|
||||
MDSet<int64_t> notDupIdx;
|
||||
MDSet<int64_t> notOpticalDupIdx;
|
||||
MDSet<int64_t> notRepIdx;
|
||||
};
|
||||
|
||||
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
||||
struct UnpairedPosInfo {
|
||||
int unpairedNum = 0;
|
||||
int64_t taskSeq;
|
||||
vector<ReadEnds> pairArr;
|
||||
MDSet<string> readNameSet;
|
||||
};
|
||||
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
||||
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
|
||||
|
||||
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
||||
typedef tsl::robin_map<int64_t, UnpairedPosInfo>
|
||||
UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||||
|
||||
/* 单线程处理冗余参数结构体 */
|
||||
struct SerailMarkDupArg {
|
||||
int64_t taskSeq; // 任务序号
|
||||
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||||
vector<BamWrap *> bams; // 存放待处理的bam read
|
||||
vector<ReadEnds> pairs; // 成对的reads
|
||||
vector<ReadEnds> frags; // 暂未找到配对的reads
|
||||
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
|
||||
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
||||
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
|
||||
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
||||
};
|
||||
|
||||
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
||||
struct GlobalDataArg {
|
||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||
UnpairedPositionMap unpairedPosArr;
|
||||
|
||||
// 每个task对应一个vector
|
||||
vector<vector<DupInfo>> dupIdxArr;
|
||||
vector<vector<int64_t>> opticalDupIdxArr;
|
||||
vector<vector<DupInfo>> repIdxArr;
|
||||
|
||||
// 用来存放后续计算的数据
|
||||
vector<DPSet<DupInfo>> latterDupIdxArr;
|
||||
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
|
||||
vector<DPSet<DupInfo>> latterRepIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotDupIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
|
||||
vector<MDSet<int64_t>> latterNotRepIdxArr;
|
||||
};
|
||||
#include "md_types.h"
|
||||
|
||||
// 串行运行mark duplicate
|
||||
void serialMarkDups();
|
||||
|
|
@ -6,6 +6,8 @@
|
|||
#include <htslib/thread_pool.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
#include <sam/utils/read_name_parser.h>
|
||||
//#include "md_types.h"
|
||||
|
||||
#include <set>
|
||||
using std::set;
|
||||
|
||||
|
|
@ -19,14 +21,15 @@ extern sam_hdr_t *g_outBamHeader; // 输出文件的header
|
|||
|
||||
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */
|
||||
class GlobalArg;
|
||||
extern GlobalArg &g_gArg;
|
||||
extern GlobalArg &g_gArg; // 全局参数
|
||||
class MarkDupsArg;
|
||||
extern MarkDupsArg &g_mdArg;
|
||||
extern MarkDupsArg &g_mdArg; // markduplicate程序相关的参数
|
||||
// 上边加下划线表示所有模块共享,不加下划线表示markduplicate计算时候内部使用
|
||||
|
||||
class GlobalDataArg;
|
||||
extern GlobalDataArg &gData;
|
||||
extern GlobalDataArg &gData; // 计算冗余过程中保存在全局的数据
|
||||
class DuplicationMetrics;
|
||||
extern DuplicationMetrics &gMetrics;
|
||||
extern DuplicationMetrics &gMetrics; // 用来计算统计信息
|
||||
|
||||
extern int zzhtestnum;
|
||||
extern set<int64_t> zzhopticalSet;
|
||||
extern vector<int64_t> zzhopticalArr;
|
||||
extern int64_t bam_num1, bam_num2;
|
||||
|
|
@ -167,4 +167,12 @@ struct ReadEnds : PhysicalLocation {
|
|||
}
|
||||
};
|
||||
|
||||
struct ReadEndsHash {
|
||||
std::size_t operator()(const ReadEnds &o) const { return std::hash<int64_t>()(o.read1IndexInFile); }
|
||||
};
|
||||
|
||||
struct ReadEndsEqual {
|
||||
bool operator()(const ReadEnds &o1, const ReadEnds &o2) const { return o1.read1IndexInFile == o2.read1IndexInFile; }
|
||||
};
|
||||
|
||||
#endif
|
||||
Loading…
Reference in New Issue