在结果中加入了TAG

This commit is contained in:
zzh 2024-11-11 18:10:17 +08:00
parent 899d40cbda
commit 4fd267c4db
18 changed files with 539 additions and 243 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
# for fast-markdup # for fast-markdup
core.*
*.sam *.sam
*.bam *.bam
*.log *.log

4
.vscode/launch.json vendored
View File

@ -13,8 +13,8 @@
"program": "${workspaceRoot}/build/bin/picard_cpp", "program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [ "args": [
"MarkDuplicates", "MarkDuplicates",
"--INPUT", "~/data/bam/zy_tumor.bam", "--INPUT", "~/data/bam/1k.sam",
"--OUTPUT", "/dev/null", "--OUTPUT", "./out.sam",
"--METRICS_FILE", "metrics.txt", "--METRICS_FILE", "metrics.txt",
"--num_threads", "1", "--num_threads", "1",
"--max_mem", "2G", "--max_mem", "2G",

View File

@ -101,6 +101,7 @@
"queue": "cpp", "queue": "cpp",
"stack": "cpp", "stack": "cpp",
"__bit_reference": "cpp", "__bit_reference": "cpp",
"__string": "cpp" "__string": "cpp",
"filesystem": "cpp"
} }
} }

10
run.sh
View File

@ -1,18 +1,22 @@
#input=~/data/bam/zy_normal.bam #input=~/data/bam/zy_normal.bam
#input=~/data/bam/zy_tumor.bam #input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam #input=~/data/bam/100w.bam
input=~/data/bam/1kw.sam input=~/data/bam/1k.sam
#input=~/data/bam/1kw.sam
#input=~/data/bam/1kw.bam
#input=~/data/bam/n1kw.sam #input=~/data/bam/n1kw.sam
cd ./build/ && make -j 8 && cd ..
#./out.sam \
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \ MarkDuplicates \
--INPUT $input \ --INPUT $input \
--OUTPUT /dev/null \ --OUTPUT ./out.sam \
--INDEX_FORMAT BAI \ --INDEX_FORMAT BAI \
--num_threads 1 \ --num_threads 1 \
--max_mem 2G \ --max_mem 2G \
--verbosity DEBUG \ --verbosity DEBUG \
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" --asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null
#--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX "" #--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ""
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
#--READ_NAME_REGEX "" #--READ_NAME_REGEX ""

View File

@ -355,6 +355,14 @@ struct BamWrap {
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
return (int32_t)(global_pos & mask); return (int32_t)(global_pos & mask);
} }
// 设置是否冗余的标记
void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }
void setFlag(bool flag, int bit) {
if (flag) this->b->core.flag |= bit;
else this->b->core.flag &= ~bit;
}
}; };
typedef std::map<const std::string, std::vector<BamWrap *>> SampleBamMap; typedef std::map<const std::string, std::vector<BamWrap *>> SampleBamMap;

View File

@ -8,6 +8,7 @@
*/ */
#include "global_arg.h" #include "global_arg.h"
#include "util.h"
#include <cstring> #include <cstring>
#include <vector> #include <vector>
@ -100,4 +101,24 @@ void GlobalArg::parseArgument(int argNum) {
default: default:
break; break;
} }
}
string GlobalArg::GetArgValueString() {
printf("--INPUT = %s\n", in_fn.c_str());
printf("--OUTPUT = %s\n", out_fn.c_str());
printf("--num_threads = %d\n", num_threads);
printf("--max_mem = %ld\n", max_mem);
printf("--verbosity = %d\n", verbosity);
printf("--asyncio = %d\n", use_asyncio);
vector<string> argVals;
argVals.push_back("--INPUT"); argVals.push_back(in_fn);
argVals.push_back("--OUTPUT"); argVals.push_back(out_fn);
argVals.push_back("--num_threads"); argVals.push_back(StringUtil::Format(num_threads));
argVals.push_back("--asyncio"); argVals.push_back(use_asyncio ? "true" : "false");
string argValStr;
StringUtil::Join(argVals, argValStr, ' ');
cout << argValStr << endl;
return argValStr;
} }

View File

@ -89,6 +89,8 @@ struct GlobalArg {
printf("--asyncio = %d\n", use_asyncio); printf("--asyncio = %d\n", use_asyncio);
} }
string GetArgValueString();
private: private:
GlobalArg() { initGlobalOptions(); }; GlobalArg() { initGlobalOptions(); };
}; };

View File

@ -34,6 +34,13 @@ void Warn(const char *format, ...);
void Info(const char *format, ...); void Info(const char *format, ...);
void Debug(const char *format, ...); void Debug(const char *format, ...);
// 字节缓冲区
struct ByteBuf {
uint8_t *buf = nullptr;
int size = 0; // 当前使用量
int capacity = 0; // 最大容量
};
/* /*
* StringUtil * StringUtil
*/ */
@ -74,6 +81,17 @@ struct StringUtil {
} }
return true; return true;
} }
//static void StringToBytes(const string &str, ByteBuf &buf) {
// if (str.size() > buf.capacity) {
// buf.capacity = str.size() * 2;
// buf.buf = (uint8_t*)realloc(buf.buf, buf.capacity);
// }
// buf.size = str.size();
// for (int i = 0; i < str.size(); ++i) {
// buf.buf[i] = (uint8_t)str[i] + 64;
// }
//}
}; };
// 二进制读写相关 // 二进制读写相关

View File

@ -2,8 +2,11 @@
#include <string> #include <string>
#include <stdint.h> #include <stdint.h>
#include <vector>
#include "serial_md.h"
using std::string; using std::string;
using std::vector;
/* /*
* *
@ -54,17 +57,11 @@ struct DuplicationMetrics {
// 其他的统计数据 // 其他的统计数据
// addSingletonToCount需要记录的数据 // 比如在该位置有4个冗余那么所有4个冗余的位置数量
uint64_t DuplicateCountHist = 0; MDMap DuplicateCountHist;
uint64_t NonOpticalDuplicateCountHist = 0; MDMap NonOpticalDuplicateCountHist;
MDMap OpticalDuplicatesCountHist;
// track optical duplicates 需要记录的数据 // 没有冗余的位置总数量
uint64_t OpticalDuplicatesCountHist = 0; MDSet<int64_t> singletonReads;
uint64_t OpticalDuplicatesByLibraryId = 0;
// 统计相关的函数
void AddSingletonToCount() {
++this->DuplicateCountHist;
++this->NonOpticalDuplicateCountHist;
}
}; };

View File

@ -25,6 +25,7 @@ Date : 2023/10/23
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include <string>
#include "dup_metrics.h" #include "dup_metrics.h"
#include "markdups_arg.h" #include "markdups_arg.h"
@ -35,6 +36,7 @@ Date : 2023/10/23
using namespace std; using namespace std;
using std::cout; using std::cout;
using std::string;
#define SMA_TAG_PG "PG" #define SMA_TAG_PG "PG"
@ -56,11 +58,67 @@ MarkDupsArg &g_mdArg = g_mdArg_;
static GlobalDataArg gData_; static GlobalDataArg gData_;
GlobalDataArg &gData = gData_; GlobalDataArg &gData = gData_;
DuplicationMetrics gMetrics_; DuplicationMetrics gMetrics_;
DuplicationMetrics &gMetrics = gMetrics_; DuplicationMetrics &gMetrics = gMetrics_; // 记录统计信息
/* 程序版本等信息 */
const char *PROGRAM_GROUP_VERSION = "3.2.0";
// 调试信息
int zzhtestnum = 0; int zzhtestnum = 0;
set<int64_t> zzhopticalSet; int64_t bam_num1=0, bam_num2=0;
vector<int64_t> zzhopticalArr; /**
* Estimates the size of a library based on the number of paired end molecules observed
* and the number of unique pairs observed.
* <p>
* Based on the Lander-Waterman equation that states:
* C/X = 1 - exp( -N/X )
* where
* X = number of distinct molecules in library
* N = number of read pairs
* C = number of distinct fragments observed in read pairs
*/
static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) {
int64_t librarySize = 0;
int64_t readPairDuplicates = readPairs - uniqueReadPairs;
auto f = [](double x, double c, double n) { return c / x - 1 + exp(-n / x); };
if (readPairs > 0 && readPairDuplicates > 0) {
double m = 1.0;
double M = 100.0;
if (uniqueReadPairs >= readPairs || f(m * uniqueReadPairs, uniqueReadPairs, readPairs) < 0) {
cerr << "Invalid values for pairs and unique pairs: " << readPairs << ", " << uniqueReadPairs << endl;
return 0;
}
// find value of M, large enough to act as other side for bisection method
while (f(M * uniqueReadPairs, uniqueReadPairs, readPairs) > 0) {
M *= 10.0;
}
// use bisection method (no more than 40 times) to find solution
for (int i = 0; i < 40; ++i) {
double r = (m + M) / 2.0;
double u = f(r * uniqueReadPairs, uniqueReadPairs, readPairs);
if (u == 0)
break;
else if (u > 0)
m = r;
else if (u < 0)
M = r;
}
return uniqueReadPairs * (m + M) / 2.0;
}
return librarySize;
}
/*
*
*/
string getFileExtension(const string &filename) {
auto last_dot = filename.find_last_of('.');
if (last_dot == string::npos) {
return "";
}
return filename.substr(last_dot + 1);
}
/* /*
* mark duplicate * mark duplicate
* bambarcode * bambarcode
@ -109,6 +167,10 @@ int MarkDuplicates(int argc, char *argv[]) {
sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL); sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout); g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
g_outBamHeader = sam_hdr_dup(g_inBamHeader); g_outBamHeader = sam_hdr_dup(g_inBamHeader);
// 修改输出文件的header
sam_hdr_add_line(g_outBamHeader, "PG", "ID", g_mdArg.PROGRAM_RECORD_ID.c_str(), "VN", PROGRAM_GROUP_VERSION, "CL",
(g_mdArg.PROGRAM_RECORD_ID + " " + g_gArg.GetArgValueString() + " " + g_mdArg.GetArgValueString()).c_str(), NULL);
// 用同样的线程池处理输出文件 // 用同样的线程池处理输出文件
hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
@ -139,44 +201,138 @@ int MarkDuplicates(int argc, char *argv[]) {
// 输出index文件 // 输出index文件
// string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的 // string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式 string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式
int index_min_shift = 0; if ("sam" == getFileExtension(g_gArg.out_fn)) {
if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) { indexFn = "";
indexFn = g_gArg.out_fn + ".csi";
index_min_shift = 14;
} }
if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
Error("failed to open index \"%s\" for writing", indexFn.c_str()); if (!indexFn.empty()) {
sam_close(g_outBamFp); int index_min_shift = 0;
sam_close(g_inBamFp); if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) {
return -1; indexFn = g_gArg.out_fn + ".csi";
index_min_shift = 14;
}
if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
Error("failed to open index \"%s\" for writing", indexFn.c_str());
sam_close(g_outBamFp);
sam_close(g_inBamFp);
return -1;
}
} }
// 读取输入文件 // 读取输入文件
// BamBufType inBuf(false); // BamBufType inBuf(false);
BamBufType inBuf(g_gArg.use_asyncio); BamBufType inBuf(g_gArg.use_asyncio);
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem); inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
DupIdxQueue<DupInfo> dupIdxQue, repIdxQue; DupIdxQueue<DupInfo> dupIdxQue, repIdxQue;
DupIdxQueue<int64_t> opticalIdxQue;
dupIdxQue.Init(&gData.dupIdxArr); dupIdxQue.Init(&gData.dupIdxArr);
repIdxQue.Init(&gData.repIdxArr); repIdxQue.Init(&gData.repIdxArr);
opticalIdxQue.Init(&gData.opticalDupIdxArr);
Timer tw; Timer tw;
cout << "dupsize: " << dupIdxQue.Size() << endl; cout << "dupsize: " << dupIdxQue.Size() << endl;
uint64_t bamIdx = 0; uint64_t bamIdx = 0;
DupInfo dupIdx = dupIdxQue.Pop(); DupInfo dupIdx = dupIdxQue.Pop();
DupInfo repIdx = repIdxQue.Pop(); DupInfo repIdx = repIdxQue.Pop();
exit(0); uint64_t opticalIdx = opticalIdxQue.Pop();
ByteBuf bytes;
bam1_t *bp = bam_init1();
bool isDup = false;
bool isOpticalDup = false;
bool isInDuplicateSet = false;
uint32_t representativeReadIndexInFile = 0;
uint32_t duplicateSetSize = 0;
// exit(0);
while (inBuf.ReadStat() >= 0) { while (inBuf.ReadStat() >= 0) {
Timer tw1; Timer tw1;
size_t readNum = inBuf.ReadBam(); size_t readNum = inBuf.ReadBam();
// cout << "read: " << readNum << endl; // cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i) { for (size_t i = 0; i < inBuf.Size(); ++i) {
BamWrap *bw = inBuf[i];
if (bam_copy1(bp, bw->b) == nullptr) {
Error("Can not copy sam record!");
return -1;
}
bw->b = bp;
isDup = false;
isOpticalDup = false;
isInDuplicateSet = false;
// 删除原来的duplicate tag
if (g_mdArg.CLEAR_DT) {
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str());
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
}
++bam_num2;
// 统计信息
if (bw->GetReadUnmappedFlag()) {
++gMetrics.UNMAPPED_READS;
} else if (bw->IsSecondaryOrSupplementary()) {
++gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS;
} else if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) {
++gMetrics.UNPAIRED_READS_EXAMINED;
} else {
++gMetrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
}
/* 判断是否冗余 */ /* 判断是否冗余 */
if (bamIdx == dupIdx) { if (bamIdx == dupIdx) {
// if (dupIdx.dupSet != 0) isDup = true;
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) {
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
isInDuplicateSet = true;
representativeReadIndexInFile = dupIdx.repIdx;
duplicateSetSize = dupIdx.dupSet;
}
// 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同 // 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同
while ((dupIdx = dupIdxQue.Pop()) == bamIdx); while ((dupIdx = dupIdxQue.Pop()) == bamIdx);
if (opticalIdx == bamIdx)
isOpticalDup = true;
else if (opticalIdx < bamIdx) {
while ((opticalIdx = opticalIdxQue.Pop()) < bamIdx);
if (opticalIdx == bamIdx)
isOpticalDup = true;
}
// 添加冗余标识
bw->SetDuplicateReadFlag(true);
// if (isOpticalDup)
// cout << bamIdx << " optical" << endl;
// else
// cout << bamIdx << " not opt" << endl;
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str());
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
if (isOpticalDup)
bam_aux_append(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str(), 'Z',
g_mdArg.DUPLICATE_TYPE_SEQUENCING.size() + 1,
(const uint8_t *)g_mdArg.DUPLICATE_TYPE_SEQUENCING.c_str());
else
bam_aux_append(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str(), 'Z',
g_mdArg.DUPLICATE_TYPE_LIBRARY.size() + 1,
(const uint8_t *)g_mdArg.DUPLICATE_TYPE_LIBRARY.c_str());
// cout << bamIdx << endl;
// int ival = 2;
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
// if (bam_aux_append(bw->b, "DT", 'Z', 3, (const uint8_t*)"LB") < 0) Error("add tag error!");
// // if (bam_aux_append(bw->b, "DT", 'i', sizeof(ival), (uint8_t *)&ival) < 0) Error("add tag error!");
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
// 计算统计信息
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
// Update the duplication metrics
if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) {
++gMetrics.UNPAIRED_READ_DUPLICATES;
} else {
++gMetrics.READ_PAIR_DUPLICATES; // will need to be divided by 2 at the end
}
}
} else {
bw->SetDuplicateReadFlag(false);
} }
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent
// cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl; // cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl;
isInDuplicateSet = true;
representativeReadIndexInFile = repIdx.repIdx;
duplicateSetSize = repIdx.dupSet;
while (repIdx == bamIdx || repIdx.dupSet == 0) { while (repIdx == bamIdx || repIdx.dupSet == 0) {
if (repIdxQue.Size() > 0) if (repIdxQue.Size() > 0)
repIdx = repIdxQue.Pop(); repIdx = repIdxQue.Pop();
@ -186,19 +342,82 @@ int MarkDuplicates(int argc, char *argv[]) {
} }
} }
} }
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str()); if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) {
// cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str());
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str(), 'i',
sizeof(representativeReadIndexInFile), (uint8_t *)&representativeReadIndexInFile);
oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str());
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize),
(const uint8_t *)&duplicateSetSize);
}
// 每个read都要写到output只是添加标识
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
// if (bamIdx == 922) break;
++bamIdx;
if (isDup && g_mdArg.REMOVE_DUPLICATES) {
continue;
}
if (isOpticalDup && g_mdArg.REMOVE_SEQUENCING_DUPLICATES) {
continue;
}
if (!g_mdArg.PROGRAM_RECORD_ID.empty() && g_mdArg.ADD_PG_TAG_TO_READS) {
uint8_t *oldTagVal = bam_aux_get(bw->b, "PG");
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
bam_aux_append(bw->b, "PG", 'Z', g_mdArg.PROGRAM_RECORD_ID.size() + 1,
(const uint8_t *)g_mdArg.PROGRAM_RECORD_ID.c_str());
}
if (sam_write1(g_outBamFp, g_outBamHeader, bw->b) < 0) {
Error("failed writing sam record to \"%s\"", g_gArg.out_fn.c_str());
sam_close(g_outBamFp); sam_close(g_outBamFp);
sam_close(g_inBamFp); sam_close(g_inBamFp);
return -1; return -1;
} }
++bamIdx;
} }
inBuf.ClearAll(); inBuf.ClearAll();
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl; cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
} }
cout << "dupsize: " << dupIdxQue.Size() << endl; bam_destroy1(bp);
if (sam_idx_save(g_outBamFp) < 0) { cout << "bam_Num: " << bam_num1 << " : " << bam_num2 << endl;
// 计算统计信息
gMetrics.READ_PAIRS_EXAMINED /= 2;
gMetrics.READ_PAIR_DUPLICATES /= 2;
for (auto &arr : gData.opticalDupIdxArr) gMetrics.READ_PAIR_OPTICAL_DUPLICATES += arr.size();
gMetrics.READ_PAIR_OPTICAL_DUPLICATES = gMetrics.READ_PAIR_OPTICAL_DUPLICATES / 2;
gMetrics.ESTIMATED_LIBRARY_SIZE =
estimateLibrarySize(gMetrics.READ_PAIRS_EXAMINED - gMetrics.READ_PAIR_OPTICAL_DUPLICATES,
gMetrics.READ_PAIRS_EXAMINED - gMetrics.READ_PAIR_DUPLICATES);
if (gMetrics.UNPAIRED_READS_EXAMINED + gMetrics.READ_PAIRS_EXAMINED != 0) {
gMetrics.PERCENT_DUPLICATION = (gMetrics.UNPAIRED_READ_DUPLICATES + gMetrics.READ_PAIR_DUPLICATES * 2) /
(double)(gMetrics.UNPAIRED_READS_EXAMINED + gMetrics.READ_PAIRS_EXAMINED * 2);
} else {
gMetrics.PERCENT_DUPLICATION = 0;
}
cout << "metrics: \n"
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
<< "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n"
<< "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n"
<< "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n"
<< "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n"
<< "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n"
<< "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n"
<< "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n"
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
cout << "singleton pos: " << gMetrics.singletonReads.size() << endl;
for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl;
for (auto &itr : gMetrics.NonOpticalDuplicateCountHist)
cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl;
for (auto &itr : gMetrics.OpticalDuplicatesCountHist)
cout << "optical dup counts: " << itr.first << ": " << itr.second << endl;
cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl;
if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) {
Error("writing index failed"); Error("writing index failed");
sam_close(g_outBamFp); sam_close(g_outBamFp);
sam_close(g_inBamFp); sam_close(g_inBamFp);

View File

@ -9,6 +9,7 @@ Date : 2023/10/27
#include "markdups_arg.h" #include "markdups_arg.h"
#include "common/utils/global_arg.h" #include "common/utils/global_arg.h"
#include "common/utils/util.h"
#include <cstring> #include <cstring>
#include <vector> #include <vector>
@ -65,7 +66,8 @@ const static struct option kMdOpts[] = {
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM}, {"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX}, {"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
{"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT}, {"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT},
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}}; {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE},
{"ADD_PG_TAG_TO_READS", required_argument, NULL, ADD_PG_TAG_TO_READS}};
// 判断bool类型的参数 // 判断bool类型的参数
void setBoolArg(bool *arg) { void setBoolArg(bool *arg) {
@ -193,7 +195,8 @@ void MarkDupsArg::parseArgument(int argc,
COMMENT.push_back(optarg); COMMENT.push_back(optarg);
break; break;
case ns_md::READ_NAME_REGEX: case ns_md::READ_NAME_REGEX:
READ_NAME_REGEX = optarg; if (strcmp("null", optarg) == 0) READ_NAME_REGEX="";
else READ_NAME_REGEX = optarg;
break; break;
case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE: case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE:
OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg); OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg);
@ -229,6 +232,9 @@ void MarkDupsArg::parseArgument(int argc,
case ns_md::CREATE_MD5_FILE: case ns_md::CREATE_MD5_FILE:
setBoolArg(&CREATE_MD5_FILE); setBoolArg(&CREATE_MD5_FILE);
break; break;
case ns_md::ADD_PG_TAG_TO_READS:
setBoolArg(&ADD_PG_TAG_TO_READS);
break;
default: default:
break; break;
} }
@ -245,6 +251,18 @@ void MarkDupsArg::printArgValue()
printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi"); printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
} }
string MarkDupsArg::GetArgValueString() {
vector<string> argVals;
argVals.push_back("--READ_NAME_REGEX");
if (this->READ_NAME_REGEX.empty()) argVals.push_back("null");
else argVals.push_back(this->READ_NAME_REGEX.c_str());
argVals.push_back("--INDEX_FORMAT"); argVals.push_back(this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
string argValStr;
StringUtil::Join(argVals, argValStr, ' ');
cout << argValStr << endl;
return argValStr;
}
// 打印版本信息 // 打印版本信息
void MarkDupsArg::PrintVersion() void MarkDupsArg::PrintVersion()
{ {

View File

@ -56,6 +56,7 @@ enum MarkDupsArgEnum {
CREATE_INDEX, CREATE_INDEX,
INDEX_FORMAT, INDEX_FORMAT,
CREATE_MD5_FILE, CREATE_MD5_FILE,
ADD_PG_TAG_TO_READS,
_END_NUM _END_NUM
}; };
@ -294,6 +295,9 @@ struct MarkDupsArg
/* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */ /* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */
bool CREATE_MD5_FILE = false; bool CREATE_MD5_FILE = false;
/* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
bool ADD_PG_TAG_TO_READS = true;
// mark duplicate参数个数 // mark duplicate参数个数
const static int ARG_COUNT = ns_md::MarkDupsArgEnum::_END_NUM - ns_md::MarkDupsArgEnum::_START_NUM - 1; const static int ARG_COUNT = ns_md::MarkDupsArgEnum::_END_NUM - ns_md::MarkDupsArgEnum::_START_NUM - 1;
// 解析参数 // 解析参数
@ -302,6 +306,7 @@ struct MarkDupsArg
GlobalArg *pGArg); GlobalArg *pGArg);
void printArgValue(); void printArgValue();
string GetArgValueString();
static void PrintHelp(); static void PrintHelp();

View File

@ -461,18 +461,9 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]); ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
if (opticalDuplicateFlags[i]) { if (opticalDuplicateFlags[i]) {
++opticalDuplicates; ++opticalDuplicates;
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
// cout << "val: " << pRe->isOpticalDuplicate << endl;
// }
pRe->isOpticalDuplicate = true; pRe->isOpticalDuplicate = true;
zzhopticalSet.insert(pRe->read1IndexInFile);
zzhopticalSet.insert(pRe->read2IndexInFile);
zzhopticalArr.push_back(pRe->read1IndexInFile);
zzhopticalArr.push_back(pRe->read2IndexInFile);
} else { } else {
pRe->isOpticalDuplicate = false; pRe->isOpticalDuplicate = false;
zzhopticalSet.erase(pRe->read1IndexInFile);
zzhopticalSet.erase(pRe->read2IndexInFile);
} }
} }
return opticalDuplicates; return opticalDuplicates;
@ -483,11 +474,8 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
*/ */
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) { void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false; bool hasFR = false, hasRF = false;
int prevOpticalDupNum = 0;
// Check to see if we have a mixture of FR/RF // Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) { for (auto pRe : readEndsArr) {
if (pRe->isOpticalDuplicate)
++prevOpticalDupNum;
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true; hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
@ -517,17 +505,8 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe); nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
} }
// trackDuplicateCounts // 统计信息trackDuplicateCounts
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup; ++gMetrics.DuplicateCountHist[readEndsArr.size()];
if (readEndsArr.size() > nOpticalDup) if (readEndsArr.size() > nOpticalDup) ++gMetrics.NonOpticalDuplicateCountHist[readEndsArr.size() - nOpticalDup];
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup; if (nOpticalDup) ++gMetrics.OpticalDuplicatesCountHist[nOpticalDup + 1];
if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
} }

View File

@ -0,0 +1,148 @@
#pragma once
#include <common/hts/bam_buf.h>
#include <robin-map/include/tsl/robin_map.h>
#include <robin-map/include/tsl/robin_set.h>
#include <sam/utils/read_ends.h>
#include <set>
#include <string>
#include <unordered_set>
#include <vector>
using std::set;
using std::string;
using std::unordered_set;
using std::vector;
/* 存放未匹配readend相同位点的所有readend */
struct UnpairedREInfo {
int64_t taskSeq;
ReadEnds unpairedRE;
};
/* 对于一个pair数据一个完整的计算点包含read1的比对位置和read2的比对位置 */
struct CalcKey {
int64_t read1Pos;
int64_t read2Pos;
bool operator<(const CalcKey &o) const {
int comp = (int)(read1Pos - o.read1Pos);
if (comp == 0)
comp = (int)(read2Pos - o.read2Pos);
return comp < 0;
}
bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(read1Pos) ^ std::hash<int64_t>()(read2Pos);
}
};
struct CalcKeyHash {
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
}
};
/* 用来记录冗余索引相关的信息 */
struct DupInfo {
int64_t idx;
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
int16_t dupSet = 0; // dup set size
DupInfo() : DupInfo(-1, 0, 0) {}
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
bool operator<(const DupInfo &o) const { return idx < o.idx; }
bool operator>(const DupInfo &o) const { return idx > o.idx; }
operator int64_t() const { return idx; }
};
struct DupInfoHash {
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
};
struct DupInfoEqual {
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
};
using MDMap = tsl::robin_map<int64_t, int64_t>;
template <typename T>
// using MDSet = set<T>;
// using MDSet = unordered_set<T>;
using MDSet = tsl::robin_set<T>;
template <typename T>
// using DPSet = set<T>;
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
template <typename T>
// using CalcSet = set<T>;
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
using ReadEndsSet = tsl::robin_set<ReadEnds, ReadEndsHash, ReadEndsEqual>;
/* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */
struct TaskSeqDupInfo {
DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx;
MDSet<int64_t> notDupIdx;
MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notRepIdx;
};
/* 保存有未匹配pair位点的信息包括read end数组和有几个未匹配的read end */
struct UnpairedPosInfo {
int unpairedNum = 0;
int64_t taskSeq;
vector<ReadEnds> pairArr;
MDSet<string> readNameSet;
};
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo>
UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
/* 单线程处理冗余参数结构体 */
struct MarkDupDataArg {
int64_t taskSeq; // 任务序号
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs; // 成对的reads
vector<ReadEnds> frags; // 暂未找到配对的reads
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
MDSet<int64_t> pairSingletonIdxArr; // 某位置只有一对read的所有read pair个数
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
};
/* 全局保留的数据因为有些paired数据比对到了不同的染色体相距甚远 */
struct GlobalDataArg {
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr;
// 每个task对应一个vector
vector<vector<DupInfo>> dupIdxArr;
vector<vector<int64_t>> opticalDupIdxArr;
vector<vector<DupInfo>> repIdxArr;
vector<vector<int64_t>> singletonIdxArr;
// 用来存放后续计算的数据
vector<DPSet<DupInfo>> latterDupIdxArr;
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
vector<DPSet<DupInfo>> latterRepIdxArr;
vector<MDSet<int64_t>> latterSingletonIdxArr;
vector<MDSet<int64_t>> latterNotDupIdxArr;
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
vector<MDSet<int64_t>> latterNotRepIdxArr;
vector<MDSet<int64_t>> latterNotSingletonIdxArr;
};

View File

@ -68,13 +68,13 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) { MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) {
if (vpRe.size() < 2) { if (vpRe.size() < 2) {
if (vpRe.size() == 1) { if (vpRe.size() == 1) {
// addSingletonToCount(libraryIdGenerator);
// 这个统计可能会有误差因为当前位点可能还有没匹配上的read导致当前位点的readpaired数量为1 // 这个统计可能会有误差因为当前位点可能还有没匹配上的read导致当前位点的readpaired数量为1
// 可以通过后续的补充计算来解决这个问题,有必要么? // 可以通过后续的补充计算来解决这个问题,有必要么?好像没必要
gMetrics.AddSingletonToCount(); gMetrics.singletonReads.insert(vpRe[0]->read1IndexInFile);
} }
return; return;
} }
int maxScore = 0; int maxScore = 0;
const ReadEnds *pBest = nullptr; const ReadEnds *pBest = nullptr;
// bool print = false; // bool print = false;
@ -109,6 +109,7 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
notDupIdx->insert(pBest->read2IndexInFile); notDupIdx->insert(pBest->read2IndexInFile);
} }
// if (false) { // if (false) {
// if (true) {
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
// trackOpticalDuplicates // trackOpticalDuplicates
vector<const ReadEnds *> prevOpticalRe; vector<const ReadEnds *> prevOpticalRe;
@ -196,7 +197,7 @@ static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnd
} }
/* 单线程生成readends (第一步)*/ /* 单线程生成readends (第一步)*/
static void generateReadEnds(SerailMarkDupArg *arg) { static void generateReadEnds(MarkDupDataArg *arg) {
auto &p = *arg; auto &p = *arg;
auto &rnParser = g_vRnParser[0]; auto &rnParser = g_vRnParser[0];
@ -205,12 +206,14 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
p.unpairedDic.clear(); p.unpairedDic.clear();
p.unpairedPosArr.clear(); p.unpairedPosArr.clear();
// bam_num1 += p.bams.size();
/* 处理每个read创建ReadEnd并放入frag和pair中 */ /* 处理每个read创建ReadEnd并放入frag和pair中 */
// MDSet<ReadEnds> reSet; // MDSet<ReadEnds> reSet;
// ReadEnds lastRe; // ReadEnds lastRe;
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read for (size_t i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
BamWrap *bw = p.bams[i]; BamWrap *bw = p.bams[i];
// ++bam_num1;
const int64_t bamIdx = p.bamStartIdx + i; const int64_t bamIdx = p.bamStartIdx + i;
if (bw->GetReadUnmappedFlag()) { if (bw->GetReadUnmappedFlag()) {
if (bw->b->core.tid == -1) if (bw->b->core.tid == -1)
@ -308,7 +311,7 @@ static void processFrags(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
} }
/* 单线程markdup (第二步)*/ /* 单线程markdup (第二步)*/
static void markdups(SerailMarkDupArg *arg) { static void markdups(MarkDupDataArg *arg) {
auto &p = *arg; auto &p = *arg;
p.pairDupIdx.clear(); p.pairDupIdx.clear();
p.pairOpticalDupIdx.clear(); p.pairOpticalDupIdx.clear();
@ -354,8 +357,8 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
} }
/* 将frags重叠部分的dup idx放进数据中 */ /* 将frags重叠部分的dup idx放进数据中 */
static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &notDupIdx, SerailMarkDupArg *lastArg, static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &notDupIdx, MarkDupDataArg *lastArg,
SerailMarkDupArg *curArg) { MarkDupDataArg *curArg) {
auto &lp = *lastArg; auto &lp = *lastArg;
auto &p = *curArg; auto &p = *curArg;
for (auto idx : dupIdx) { for (auto idx : dupIdx) {
@ -371,7 +374,7 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
/* 将pairs重叠部分的dup idx放进数据中 */ /* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx,
MDSet<int64_t> &notRepIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { MDSet<int64_t> &notRepIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) {
auto &lp = *lastArg; auto &lp = *lastArg;
auto &p = *curArg; auto &p = *curArg;
for (auto idx : dupIdx) { for (auto idx : dupIdx) {
@ -466,7 +469,7 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupI
} }
/* 处理相邻的两个任务,有相交叉的数据 */ /* 处理相邻的两个任务,有相交叉的数据 */
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) { static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg, GlobalDataArg *gDataArg) {
auto &lp = *lastArg; auto &lp = *lastArg;
auto &p = *curArg; auto &p = *curArg;
auto &g = *gDataArg; auto &g = *gDataArg;
@ -745,7 +748,7 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
} }
/* 当所有任务结束后global data里还有未处理的数据 */ /* 当所有任务结束后global data里还有未处理的数据 */
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) {
// cout << "last task start" << endl; // cout << "last task start" << endl;
auto &lp = *task; auto &lp = *task;
auto &g = *gDataArg; auto &g = *gDataArg;
@ -768,6 +771,7 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
g.unpairedDic.insert(prevUnpair); // 用来记录没有匹配的read个数 g.unpairedDic.insert(prevUnpair); // 用来记录没有匹配的read个数
} }
} }
map<int64_t, TaskSeqDupInfo> taskChanged; map<int64_t, TaskSeqDupInfo> taskChanged;
for (auto &e : g.unpairedPosArr) { for (auto &e : g.unpairedPosArr) {
auto posKey = e.first; auto posKey = e.first;
@ -779,6 +783,8 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
std::sort(arr.begin(), arr.end()); std::sort(arr.begin(), arr.end());
// cout << "last task before mark pair" << endl; // cout << "last task before mark pair" << endl;
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notRepIdx); processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notRepIdx);
} else if (arr.size() == 1) {
gMetrics.singletonReads.insert(arr[0].read1IndexInFile);
} }
} }
// 更新结果 // 更新结果
@ -796,7 +802,10 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
// cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; // cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
g.unpairedPosArr.clear(); g.unpairedPosArr.clear();
g.unpairedDic.clear();
// for (auto &rn : g.unpairedDic) {
// cout << rn.first << endl;
// }
/* /*
int taskSeq = 0; int taskSeq = 0;
@ -888,12 +897,15 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
std::sort(vRepIdx.begin(), vRepIdx.end()); std::sort(vRepIdx.begin(), vRepIdx.end());
} }
void calculateMetrics(SerailMarkDupArg &lp, SerailMarkDupArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) { void calculateMetrics(MarkDupDataArg &lp, MarkDupDataArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) {
DuplicationMetrics &gMetrics = *pgMetrics; DuplicationMetrics &gMetrics = *pgMetrics;
// gMetrics.READ_PAIRS_EXAMINED /= 2;
cout << "calculateMetrics start: " << endl; cout << "calculateMetrics start: " << endl;
cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl; cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl;
cout << "all: " << (lp.unpairedDic.size() + p.unpairedDic.size() + g.unpairedDic.size()) << endl;
cout << "calculateMetrics end" << endl; cout << "calculateMetrics end" << endl;
} }
@ -908,9 +920,9 @@ void serialMarkDups() {
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024); // inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
int64_t processedBamNum = 0; int64_t processedBamNum = 0;
SerailMarkDupArg smdArg1, smdArg2; MarkDupDataArg smdArg1, smdArg2;
SerailMarkDupArg *lastArgP = &smdArg1; MarkDupDataArg *lastArgP = &smdArg1;
SerailMarkDupArg *curArgP = &smdArg2; MarkDupDataArg *curArgP = &smdArg2;
bool isFirstRound = true; bool isFirstRound = true;
int roundNum = 0; int roundNum = 0;
@ -921,6 +933,7 @@ void serialMarkDups() {
tm_arr[4].acc_start(); tm_arr[4].acc_start();
size_t readNum = inBamBuf.ReadBam(); size_t readNum = inBamBuf.ReadBam();
readNumSum += readNum; readNumSum += readNum;
// readNumSum += inBamBuf.GetBamArr().size();
tm_arr[4].acc_end(); tm_arr[4].acc_end();
cout << "read num: " << readNum << '\t' << roundNum << endl; cout << "read num: " << readNum << '\t' << roundNum << endl;
// lastArgP = curArgP; // lastArgP = curArgP;
@ -974,7 +987,7 @@ void serialMarkDups() {
handleLastTask(lastArgP, &gData); handleLastTask(lastArgP, &gData);
// 计算各种统计指标 // 计算各种统计指标
calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics); // calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics);
// cout << "here 2" << endl; // cout << "here 2" << endl;
tm_arr[3].acc_end(); tm_arr[3].acc_end();
@ -987,7 +1000,7 @@ void serialMarkDups() {
map<int64_t, int> dup; map<int64_t, int> dup;
int taskSeq = 0; int taskSeq = 0;
for (auto &arr : gData.dupIdxArr) { /* for (auto &arr : gData.dupIdxArr) {
for (auto idx : arr) { for (auto idx : arr) {
if (dup.find(idx.idx) != dup.end()) { if (dup.find(idx.idx) != dup.end()) {
// cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t' // cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
@ -998,7 +1011,7 @@ void serialMarkDups() {
// cout << taskSeq << "\t" << arr.size() << endl; // cout << taskSeq << "\t" << arr.size() << endl;
taskSeq++; taskSeq++;
} }
*/
// #include <fstream> // #include <fstream>
// ofstream out("tumor_dup.txt"); // ofstream out("tumor_dup.txt");
// for (auto idx : dup) // for (auto idx : dup)
@ -1023,23 +1036,11 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "optical dup: " << zzhopticalSet.size() << endl;
cout << "optical arr dup: " << zzhopticalArr.size() << endl;
cout << "optical size: " << opticalDupNum << endl; cout << "optical size: " << opticalDupNum << endl;
cout << "singleton size: " << gMetrics.singletonReads.size() << endl;
cout << "metrics: \n"
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
<< "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n"
<< "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n"
<< "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n"
<< "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n"
<< "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n"
<< "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n"
<< "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n"
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
Timer::log_time("serial end "); Timer::log_time("serial end ");
cout << "read num sum: " << readNumSum << endl;
// for (auto i : gData.dupArr) // for (auto i : gData.dupArr)
// cout << i << endl; // cout << i << endl;

View File

@ -1,143 +1,6 @@
#pragma once #pragma once
#include <common/hts/bam_buf.h> #include "md_types.h"
#include <robin-map/include/tsl/robin_map.h>
#include <robin-map/include/tsl/robin_set.h>
#include <sam/utils/read_ends.h>
#include <set>
#include <string>
#include <unordered_set>
#include <vector>
using std::set;
using std::string;
using std::unordered_set;
using std::vector;
/* 存放未匹配readend相同位点的所有readend */
struct UnpairedREInfo {
int64_t taskSeq;
ReadEnds unpairedRE;
};
/* 对于一个pair数据一个完整的计算点包含read1的比对位置和read2的比对位置 */
struct CalcKey {
int64_t read1Pos;
int64_t read2Pos;
bool operator<(const CalcKey &o) const {
int comp = (int)(read1Pos - o.read1Pos);
if (comp == 0)
comp = (int)(read2Pos - o.read2Pos);
return comp < 0;
}
bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(read1Pos) ^ std::hash<int64_t>()(read2Pos);
}
};
struct CalcKeyHash {
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
}
};
/* 用来记录冗余索引相关的信息 */
struct DupInfo {
int64_t idx;
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
int16_t dupSet = 0; // dup set size
DupInfo() : DupInfo(-1, 0, 0) {}
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
bool operator<(const DupInfo &o) const { return idx < o.idx; }
bool operator>(const DupInfo &o) const { return idx > o.idx; }
operator int64_t() const { return idx; }
};
struct DupInfoHash {
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
};
struct DupInfoEqual {
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
};
template <typename T>
// using MDSet = set<T>;
// using MDSet = unordered_set<T>;
using MDSet = tsl::robin_set<T>;
template <typename T>
// using DPSet = set<T>;
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
template <typename T>
// using CalcSet = set<T>;
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
/* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */
struct TaskSeqDupInfo {
DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx;
MDSet<int64_t> notDupIdx;
MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notRepIdx;
};
/* 保存有未匹配pair位点的信息包括read end数组和有几个未匹配的read end */
struct UnpairedPosInfo {
int unpairedNum = 0;
int64_t taskSeq;
vector<ReadEnds> pairArr;
MDSet<string> readNameSet;
};
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo>
UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
/* 单线程处理冗余参数结构体 */
struct SerailMarkDupArg {
int64_t taskSeq; // 任务序号
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs; // 成对的reads
vector<ReadEnds> frags; // 暂未找到配对的reads
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
};
/* 全局保留的数据因为有些paired数据比对到了不同的染色体相距甚远 */
struct GlobalDataArg {
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr;
// 每个task对应一个vector
vector<vector<DupInfo>> dupIdxArr;
vector<vector<int64_t>> opticalDupIdxArr;
vector<vector<DupInfo>> repIdxArr;
// 用来存放后续计算的数据
vector<DPSet<DupInfo>> latterDupIdxArr;
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
vector<DPSet<DupInfo>> latterRepIdxArr;
vector<MDSet<int64_t>> latterNotDupIdxArr;
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
vector<MDSet<int64_t>> latterNotRepIdxArr;
};
// 串行运行mark duplicate // 串行运行mark duplicate
void serialMarkDups(); void serialMarkDups();

View File

@ -6,6 +6,8 @@
#include <htslib/thread_pool.h> #include <htslib/thread_pool.h>
#include <sam/utils/read_ends.h> #include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h> #include <sam/utils/read_name_parser.h>
//#include "md_types.h"
#include <set> #include <set>
using std::set; using std::set;
@ -19,14 +21,15 @@ extern sam_hdr_t *g_outBamHeader; // 输出文件的header
/* 参数对象作为全局对象,免得多次作为参数传入函数中 */ /* 参数对象作为全局对象,免得多次作为参数传入函数中 */
class GlobalArg; class GlobalArg;
extern GlobalArg &g_gArg; extern GlobalArg &g_gArg; // 全局参数
class MarkDupsArg; class MarkDupsArg;
extern MarkDupsArg &g_mdArg; extern MarkDupsArg &g_mdArg; // markduplicate程序相关的参数
// 上边加下划线表示所有模块共享不加下划线表示markduplicate计算时候内部使用
class GlobalDataArg; class GlobalDataArg;
extern GlobalDataArg &gData; extern GlobalDataArg &gData; // 计算冗余过程中保存在全局的数据
class DuplicationMetrics; class DuplicationMetrics;
extern DuplicationMetrics &gMetrics; extern DuplicationMetrics &gMetrics; // 用来计算统计信息
extern int zzhtestnum; extern int zzhtestnum;
extern set<int64_t> zzhopticalSet; extern int64_t bam_num1, bam_num2;
extern vector<int64_t> zzhopticalArr;

View File

@ -167,4 +167,12 @@ struct ReadEnds : PhysicalLocation {
} }
}; };
struct ReadEndsHash {
std::size_t operator()(const ReadEnds &o) const { return std::hash<int64_t>()(o.read1IndexInFile); }
};
struct ReadEndsEqual {
bool operator()(const ReadEnds &o1, const ReadEnds &o2) const { return o1.read1IndexInFile == o2.read1IndexInFile; }
};
#endif #endif