统计信息写入文件,完整版1.0

This commit is contained in:
zzh 2024-11-27 15:37:34 +08:00
parent 0ee22ca331
commit 223d16f2eb
12 changed files with 119 additions and 32 deletions

1
.gitignore vendored
View File

@ -4,6 +4,7 @@ core.*
*.sam
*.bam
*.log
log/
# ---> C++
# Prerequisites
*.d

View File

@ -103,6 +103,7 @@
"__bit_reference": "cpp",
"__string": "cpp",
"filesystem": "cpp",
"ios": "cpp"
"ios": "cpp",
"locale": "cpp"
}
}

17
run.sh
View File

@ -6,11 +6,12 @@
nthread=32
#nthread=64
#nthread=128
#input=/home/zzh/data/bam/zy_normal.bam
#input=/home/zzh/data/bam/normal_all.sam
input=/home/zzh/data/bam/zy_normal.bam
#input=/home/zzh/data/bam/normal_all.sam
#input=/home/zzh/data/bam/zy_tumor.bam
#input=/home/zzh/data/wgs_na12878.bam
input=~/data/bam/100w.bam
#input=~/data/bam/100w.bam
#input=~/data/bam/t100w.sam
#input=~/data/bam/1k.sam
#input=~/data/bam/1kw.sam
@ -19,7 +20,9 @@ input=~/data/bam/100w.bam
#input=~/data/bam/tumor_small.bam
#input=~/data/bam/normal_small.bam
#input=~/data/bam/normal_30.bam
output=/home/zzh/data1/na12878_out.bam
#output=/home/zzh/data1/na12878_out.bam
output=./out.bam
metrics=./metrics.txt
cd ./build/ && make -j 8 && cd ..
#./out.sam \
@ -27,11 +30,11 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \
--INPUT $input \
--OUTPUT $output \
--INDEX_FORMAT BAI \
--num_threads $nthread \
--max_mem 2G \
--METRICS_FILE $metrics \
--INDEX_FORMAT CSI \
--num_threads $nthread \
--verbosity DEBUG \
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true --READ_NAME_REGEX null #
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null #
#--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ""
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
#--READ_NAME_REGEX ""

View File

@ -29,7 +29,7 @@ void GlobalArg::initGlobalOptions() {
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
v.push_back({"num_threads", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_NUM_THREADS});
v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
// v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
v.push_back({"verbosity", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_LOG_LEVEL});
v.push_back({"asyncio", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_ASYNCIO});
v.push_back({"version", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_VERSION});
@ -43,7 +43,7 @@ void GlobalArg::initGlobalOptions() {
vArgInfo.push_back("--INPUT Input file path (bam, vcf ...)\n");
vArgInfo.push_back("--OUTPUT Output file path \n");
vArgInfo.push_back("--num_threads <num_threads> Number of threads to allocate to this analysis [1]\n");
vArgInfo.push_back("--max_mem <max_mem> Set maximum memory; suffix K/M/G recognized [2G]\n");
// vArgInfo.push_back("--max_mem <max_mem> Set maximum memory; suffix K/M/G recognized [2G]\n");
vArgInfo.push_back("--verbosity <log level> Control verbosity of logging. error/warning/info/debug [info]\n");
vArgInfo.push_back("--asyncio Use async io [true]\n");
vArgInfo.push_back("--version Output version information\n");
@ -73,8 +73,8 @@ void GlobalArg::parseArgument(int argNum) {
else if (*q == 'g' || *q == 'G')
mem_arg <<= 30;
//if (mem_arg >= max_mem)
if (true)
max_mem = mem_arg;
//if (true)
// max_mem = mem_arg;
else {
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
}
@ -104,10 +104,10 @@ void GlobalArg::parseArgument(int argNum) {
}
string GlobalArg::GetArgValueString() {
printf("--INPUT = %s\n", in_fn.c_str());
printf("--INPUT = %s\n", in_fn.c_str());
printf("--OUTPUT = %s\n", out_fn.c_str());
printf("--num_threads = %d\n", num_threads);
printf("--max_mem = %ld\n", max_mem);
// printf("--max_mem = %ld\n", max_mem);
printf("--verbosity = %d\n", verbosity);
printf("--asyncio = %d\n", use_asyncio);

View File

@ -48,7 +48,8 @@ struct GlobalArg {
string in_fn; // input bam filename
string out_fn; // output bam filename
int num_threads = 1; // 线程个数
size_t max_mem = ((size_t)1) << 30; // 最小1G
// size_t max_mem = ((size_t)3) << 29; // 最小1.5G
size_t max_mem = ((size_t)1) << 31; // 最小2G
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
bool use_asyncio = true; // 是否使用异步io

View File

@ -64,6 +64,16 @@ void Timer::print_current_time() {
puts(now);
}
string Timer::get_current_time_str() {
time_t time_val;
struct tm *at;
char now[80];
time(&time_val);
at = localtime(&time_val);
strftime(now, 79, "%B %d, %Y at %I:%M:%S %p %Z", at);
return string(now);
}
double Timer::get_mseconds() {
struct timeval tv;
gettimeofday(&tv, NULL);

View File

@ -10,6 +10,9 @@
#ifndef TIMER_H_
#define TIMER_H_
#include <string>
using std::string;
/*
* @brief: Record the run time of this program
*/
@ -27,6 +30,7 @@ public:
static void log_time(const char *desc);
static void print_current_time();
static string get_current_time_str();
static double get_mseconds();
static double get_seconds();

View File

@ -58,6 +58,7 @@ struct DuplicationMetrics {
uint64_t ESTIMATED_LIBRARY_SIZE = 0;
// 其他的统计数据
vector<double> CoverageMult;
// 比如在该位置有4个冗余那么所有4个冗余的位置数量
MDMap DuplicateCountHist;

View File

@ -19,6 +19,7 @@ Date : 2023/10/23
#include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h>
#include <iomanip>
#include <iostream>
#include <queue>
#include <set>
@ -39,7 +40,8 @@ using std::string;
#define SMA_TAG_PG "PG"
#define BAM_BLOCK_SIZE 32L * 1024 * 1024
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
// #define BAM_BLOCK_SIZE 32L * 1024 * 1024
#define NO_SUCH_INDEX INT64_MAX
Timer tm_arr[50]; // 用来测试性能
@ -107,6 +109,34 @@ static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) {
return librarySize;
}
/**
* Estimates the ROI (return on investment) that one would see if a library was sequenced to
* x higher coverage than the observed coverage.
*
* @param estimatedLibrarySize the estimated number of molecules in the library
* @param x the multiple of sequencing to be simulated (i.e. how many X sequencing)
* @param pairs the number of pairs observed in the actual sequencing
* @param uniquePairs the number of unique pairs observed in the actual sequencing
* @return a number z <= x that estimates if you had pairs*x as your sequencing then you
* would observe uniquePairs*z unique pairs.
*/
static double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) {
return estimatedLibrarySize * (1 - exp(-(x * pairs) / estimatedLibrarySize)) / uniquePairs;
}
/**
* Calculates a histogram using the estimateRoi method to estimate the effective yield
* doing x sequencing for x=1..10.
*/
static void calculateRoiHistogram(DuplicationMetrics &metrics) {
int64_t uniquePairs = metrics.READ_PAIRS_EXAMINED - metrics.READ_PAIR_DUPLICATES;
metrics.CoverageMult.resize(101);
for (int x = 1; x <= 100; ++x) {
metrics.CoverageMult[x] =
estimateRoi(metrics.ESTIMATED_LIBRARY_SIZE, x, metrics.READ_PAIRS_EXAMINED, uniquePairs);
}
}
/*
*
*/
@ -127,7 +157,7 @@ int MarkDuplicates(int argc, char *argv[]) {
Timer time_all;
/* 读取命令行参数 */
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
if (g_gArg.num_threads < 1)
g_gArg.num_threads = 1; // 线程数不能小于1
@ -150,10 +180,10 @@ int MarkDuplicates(int argc, char *argv[]) {
/* 利用线程池对输入输出文件进行读写 */
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolRead.pool = hts_tpool_init(32);
htsPoolWrite.pool = hts_tpool_init(32);
htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
//htsPoolRead.pool = hts_tpool_init(16);
//htsPoolWrite.pool = hts_tpool_init(32);
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
Error("[%d] failed to set up thread pool", __LINE__);
sam_close(g_inBamFp);
@ -237,7 +267,7 @@ int MarkDuplicates(int argc, char *argv[]) {
uint32_t duplicateSetSize = 0;
int64_t realDupSize = 0;
// exit(0);
// exit(0);
while (inBuf.ReadStat() >= 0) {
Timer tw1;
size_t readNum = inBuf.ReadBam();
@ -335,7 +365,7 @@ int MarkDuplicates(int argc, char *argv[]) {
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) {
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
// cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str());
if (oldTagVal != NULL)
bam_aux_del(bw->b, oldTagVal);
@ -391,6 +421,7 @@ int MarkDuplicates(int argc, char *argv[]) {
} else {
gMetrics.PERCENT_DUPLICATION = 0;
}
calculateRoiHistogram(gMetrics);
cout << "metrics: \n"
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
@ -404,12 +435,46 @@ int MarkDuplicates(int argc, char *argv[]) {
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
// 将统计信息写到文件里
if (!g_mdArg.METRICS_FILE.empty()) {
ofstream ofsM(g_mdArg.METRICS_FILE);
ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl;
ofsM << "# " << g_mdArg.PROGRAM_RECORD_ID << " " << g_gArg.GetArgValueString() << " " << g_mdArg.GetArgValueString() << endl;
ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl;
ofsM << "# Started on: " << Timer::get_current_time_str() << endl << endl;
ofsM << "## METRICS CLASS\torg.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics" << endl;
ofsM << "LIBRARY\tUNPAIRED_READS_EXAMINED\tREAD_PAIRS_EXAMINED\tSECONDARY_OR_SUPPLEMENTARY_RDS\tUNMAPPED_"
"READS\tUNPAIRED_READ_DUPLICATES\tREAD_PAIR_DUPLICATES\tREAD_PAIR_OPTICAL_DUPLICATES\tPERCENT_"
"DUPLICATION\tESTIMATED_LIBRARY_SIZE"
<< endl;
ofsM << gMetrics.LIBRARY << "\t" << gMetrics.UNPAIRED_READS_EXAMINED << "\t" << gMetrics.READ_PAIRS_EXAMINED
<< "\t" << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\t" << gMetrics.UNMAPPED_READS << "\t"
<< gMetrics.UNPAIRED_READ_DUPLICATES << "\t" << gMetrics.READ_PAIR_DUPLICATES << "\t"
<< gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\t" << gMetrics.PERCENT_DUPLICATION << "\t"
<< gMetrics.ESTIMATED_LIBRARY_SIZE << endl << endl;
ofsM << "## HISTOGRAM\tjava.lang.Double" << endl;
ofsM << "BIN CoverageMult" << endl;
for (int i = 1; i <= 100; ++i) {
ofsM << i << "\t" << std::fixed << std::setprecision(6) << gMetrics.CoverageMult[i] << endl;
}
ofsM.close();
}
#if 0
cout << "CoverageMult:" << endl;
for (int i = 1; i <= 100; ++i) {
cout << i << "\t" << gMetrics.CoverageMult[i] << endl;
}
#endif
#if 0
cout << "singleton pos: " << gMetrics.singletonReads.size() << endl;
for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl;
for (auto &itr : gMetrics.NonOpticalDuplicateCountHist)
cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl;
for (auto &itr : gMetrics.OpticalDuplicatesCountHist)
cout << "optical dup counts: " << itr.first << ": " << itr.second << endl;
#endif
cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl;
if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) {
Error("writing index failed");

View File

@ -253,6 +253,7 @@ void MarkDupsArg::printArgValue()
string MarkDupsArg::GetArgValueString() {
vector<string> argVals;
if (!this->METRICS_FILE.empty()) { argVals.push_back("--METRICS_FILE");argVals.push_back(this->METRICS_FILE); }
argVals.push_back("--READ_NAME_REGEX");
if (this->READ_NAME_REGEX.empty()) argVals.push_back("null");
else argVals.push_back(this->READ_NAME_REGEX.c_str());

View File

@ -660,7 +660,6 @@ static void doIntersect(PipelineArg &pipeArg) {
addToGlobal.insert(prevPosKey);
}
}
// return;
map<int64_t, TaskSeqDupInfo> taskChanged;
MDSet<int64_t> posProcessed;
for (auto &e : recalcPos) {
@ -683,10 +682,11 @@ static void doIntersect(PipelineArg &pipeArg) {
}
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
// 放在这里因为lp中的unpairedPosArr中的readends可能会被修改比如optical duplicate
// for (auto posKey : addToGlobal) {
// g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
// }
// return;
#if 0
for (auto posKey : addToGlobal) {
g.unpairedPosArr[posKey] = lpSM.unpairedPosArr[posKey];
}
#endif
/* 不需要把p中找不到lp的unpair放到global中否则最后找到pair后还要再执行一次冗余检测造成重复的冗余索引 */
// 更新结果

View File

@ -30,10 +30,10 @@ struct PhysicalLocation {
* overflow within a HiSeqX tile.
*/
int16_t tile = -1;
int32_t x = -1;
int32_t y = -1;
//int16_t x = -1;
//int16_t y = -1;
//int32_t x = -1;
//int32_t y = -1;
int16_t x = -1;
int16_t y = -1;
};
/* 包含了所有read ends信息如picard里边的 ReadEndsForMarkDuplicates*/