diff --git a/.gitignore b/.gitignore index 9dcbadc..0ef2599 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ core.* *.sam *.bam *.log +log/ # ---> C++ # Prerequisites *.d diff --git a/.vscode/settings.json b/.vscode/settings.json index bc40fd8..33cfbeb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -103,6 +103,7 @@ "__bit_reference": "cpp", "__string": "cpp", "filesystem": "cpp", - "ios": "cpp" + "ios": "cpp", + "locale": "cpp" } } \ No newline at end of file diff --git a/run.sh b/run.sh index 847a717..def40b3 100755 --- a/run.sh +++ b/run.sh @@ -6,11 +6,12 @@ nthread=32 #nthread=64 #nthread=128 -#input=/home/zzh/data/bam/zy_normal.bam +#input=/home/zzh/data/bam/normal_all.sam +input=/home/zzh/data/bam/zy_normal.bam #input=/home/zzh/data/bam/normal_all.sam #input=/home/zzh/data/bam/zy_tumor.bam #input=/home/zzh/data/wgs_na12878.bam -input=~/data/bam/100w.bam +#input=~/data/bam/100w.bam #input=~/data/bam/t100w.sam #input=~/data/bam/1k.sam #input=~/data/bam/1kw.sam @@ -19,7 +20,9 @@ input=~/data/bam/100w.bam #input=~/data/bam/tumor_small.bam #input=~/data/bam/normal_small.bam #input=~/data/bam/normal_30.bam -output=/home/zzh/data1/na12878_out.bam +#output=/home/zzh/data1/na12878_out.bam +output=./out.bam +metrics=./metrics.txt cd ./build/ && make -j 8 && cd .. #./out.sam \ @@ -27,11 +30,11 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ MarkDuplicates \ --INPUT $input \ --OUTPUT $output \ - --INDEX_FORMAT BAI \ ---num_threads $nthread \ - --max_mem 2G \ + --METRICS_FILE $metrics \ + --INDEX_FORMAT CSI \ + --num_threads $nthread \ --verbosity DEBUG \ - --asyncio true -TAG_DUPLICATE_SET_MEMBERS true --READ_NAME_REGEX null # + --asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null # #--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX "" #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" #--READ_NAME_REGEX "" diff --git a/src/common/utils/global_arg.cpp b/src/common/utils/global_arg.cpp index e14d9e3..d94f9a4 100644 --- a/src/common/utils/global_arg.cpp +++ b/src/common/utils/global_arg.cpp @@ -29,7 +29,7 @@ void GlobalArg::initGlobalOptions() { v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件 v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件 v.push_back({"num_threads", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_NUM_THREADS}); - v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM}); +// v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM}); v.push_back({"verbosity", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_LOG_LEVEL}); v.push_back({"asyncio", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_ASYNCIO}); v.push_back({"version", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_VERSION}); @@ -43,7 +43,7 @@ void GlobalArg::initGlobalOptions() { vArgInfo.push_back("--INPUT Input file path (bam, vcf ...)\n"); vArgInfo.push_back("--OUTPUT Output file path \n"); vArgInfo.push_back("--num_threads Number of threads to allocate to this analysis [1]\n"); - vArgInfo.push_back("--max_mem Set maximum memory; suffix K/M/G recognized [2G]\n"); +// vArgInfo.push_back("--max_mem Set maximum memory; suffix K/M/G recognized [2G]\n"); vArgInfo.push_back("--verbosity Control verbosity of logging. error/warning/info/debug [info]\n"); vArgInfo.push_back("--asyncio Use async io [true]\n"); vArgInfo.push_back("--version Output version information\n"); @@ -73,8 +73,8 @@ void GlobalArg::parseArgument(int argNum) { else if (*q == 'g' || *q == 'G') mem_arg <<= 30; //if (mem_arg >= max_mem) - if (true) - max_mem = mem_arg; + //if (true) + // max_mem = mem_arg; else { std::cerr << "[Warn] Too small mem size, use default" << std::endl; } @@ -104,10 +104,10 @@ void GlobalArg::parseArgument(int argNum) { } string GlobalArg::GetArgValueString() { - printf("--INPUT = %s\n", in_fn.c_str()); + printf("--INPUT = %s\n", in_fn.c_str()); printf("--OUTPUT = %s\n", out_fn.c_str()); printf("--num_threads = %d\n", num_threads); - printf("--max_mem = %ld\n", max_mem); + // printf("--max_mem = %ld\n", max_mem); printf("--verbosity = %d\n", verbosity); printf("--asyncio = %d\n", use_asyncio); diff --git a/src/common/utils/global_arg.h b/src/common/utils/global_arg.h index 98bec6c..84da566 100644 --- a/src/common/utils/global_arg.h +++ b/src/common/utils/global_arg.h @@ -48,7 +48,8 @@ struct GlobalArg { string in_fn; // input bam filename string out_fn; // output bam filename int num_threads = 1; // 线程个数 - size_t max_mem = ((size_t)1) << 30; // 最小1G + // size_t max_mem = ((size_t)3) << 29; // 最小1.5G + size_t max_mem = ((size_t)1) << 31; // 最小2G ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别 bool use_asyncio = true; // 是否使用异步io diff --git a/src/common/utils/timer.cpp b/src/common/utils/timer.cpp index 99c2677..dc5d11e 100755 --- a/src/common/utils/timer.cpp +++ b/src/common/utils/timer.cpp @@ -64,6 +64,16 @@ void Timer::print_current_time() { puts(now); } +string Timer::get_current_time_str() { + time_t time_val; + struct tm *at; + char now[80]; + time(&time_val); + at = localtime(&time_val); + strftime(now, 79, "%B %d, %Y at %I:%M:%S %p %Z", at); + return string(now); +} + double Timer::get_mseconds() { struct timeval tv; gettimeofday(&tv, NULL); diff --git a/src/common/utils/timer.h b/src/common/utils/timer.h index 88bdcde..594bde1 100755 --- a/src/common/utils/timer.h +++ b/src/common/utils/timer.h @@ -10,6 +10,9 @@ #ifndef TIMER_H_ #define TIMER_H_ +#include +using std::string; + /* * @brief: Record the run time of this program */ @@ -27,6 +30,7 @@ public: static void log_time(const char *desc); static void print_current_time(); + static string get_current_time_str(); static double get_mseconds(); static double get_seconds(); diff --git a/src/sam/markdups/dup_metrics.h b/src/sam/markdups/dup_metrics.h index c609cd9..f5bf81f 100644 --- a/src/sam/markdups/dup_metrics.h +++ b/src/sam/markdups/dup_metrics.h @@ -58,6 +58,7 @@ struct DuplicationMetrics { uint64_t ESTIMATED_LIBRARY_SIZE = 0; // 其他的统计数据 + vector CoverageMult; // 比如在该位置,有4个冗余,那么所有4个冗余的位置数量 MDMap DuplicateCountHist; diff --git a/src/sam/markdups/markdups.cpp b/src/sam/markdups/markdups.cpp index 09c6809..7fc881b 100644 --- a/src/sam/markdups/markdups.cpp +++ b/src/sam/markdups/markdups.cpp @@ -19,6 +19,7 @@ Date : 2023/10/23 #include #include +#include #include #include #include @@ -39,7 +40,8 @@ using std::string; #define SMA_TAG_PG "PG" -#define BAM_BLOCK_SIZE 32L * 1024 * 1024 +#define BAM_BLOCK_SIZE 16L * 1024 * 1024 +// #define BAM_BLOCK_SIZE 32L * 1024 * 1024 #define NO_SUCH_INDEX INT64_MAX Timer tm_arr[50]; // 用来测试性能 @@ -107,6 +109,34 @@ static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) { return librarySize; } +/** + * Estimates the ROI (return on investment) that one would see if a library was sequenced to + * x higher coverage than the observed coverage. + * + * @param estimatedLibrarySize the estimated number of molecules in the library + * @param x the multiple of sequencing to be simulated (i.e. how many X sequencing) + * @param pairs the number of pairs observed in the actual sequencing + * @param uniquePairs the number of unique pairs observed in the actual sequencing + * @return a number z <= x that estimates if you had pairs*x as your sequencing then you + * would observe uniquePairs*z unique pairs. + */ +static double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) { + return estimatedLibrarySize * (1 - exp(-(x * pairs) / estimatedLibrarySize)) / uniquePairs; +} + +/** + * Calculates a histogram using the estimateRoi method to estimate the effective yield + * doing x sequencing for x=1..10. + */ +static void calculateRoiHistogram(DuplicationMetrics &metrics) { + int64_t uniquePairs = metrics.READ_PAIRS_EXAMINED - metrics.READ_PAIR_DUPLICATES; + metrics.CoverageMult.resize(101); + for (int x = 1; x <= 100; ++x) { + metrics.CoverageMult[x] = + estimateRoi(metrics.ESTIMATED_LIBRARY_SIZE, x, metrics.READ_PAIRS_EXAMINED, uniquePairs); + } +} + /* * 获取文件名后缀 */ @@ -127,7 +157,7 @@ int MarkDuplicates(int argc, char *argv[]) { Timer time_all; /* 读取命令行参数 */ - g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数 + g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数 if (g_gArg.num_threads < 1) g_gArg.num_threads = 1; // 线程数不能小于1 @@ -150,10 +180,10 @@ int MarkDuplicates(int argc, char *argv[]) { /* 利用线程池对输入输出文件进行读写 */ htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 - // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); - // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); - htsPoolRead.pool = hts_tpool_init(32); - htsPoolWrite.pool = hts_tpool_init(32); + htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); + htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); + //htsPoolRead.pool = hts_tpool_init(16); + //htsPoolWrite.pool = hts_tpool_init(32); if (!htsPoolRead.pool || !htsPoolWrite.pool) { Error("[%d] failed to set up thread pool", __LINE__); sam_close(g_inBamFp); @@ -237,7 +267,7 @@ int MarkDuplicates(int argc, char *argv[]) { uint32_t duplicateSetSize = 0; int64_t realDupSize = 0; - // exit(0); +// exit(0); while (inBuf.ReadStat() >= 0) { Timer tw1; size_t readNum = inBuf.ReadBam(); @@ -335,7 +365,7 @@ int MarkDuplicates(int argc, char *argv[]) { if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) { if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) { - cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl; + // cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl; uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str()); if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); @@ -391,6 +421,7 @@ int MarkDuplicates(int argc, char *argv[]) { } else { gMetrics.PERCENT_DUPLICATION = 0; } + calculateRoiHistogram(gMetrics); cout << "metrics: \n" << "LIBRARY: " << gMetrics.LIBRARY << "\n" @@ -404,12 +435,46 @@ int MarkDuplicates(int argc, char *argv[]) { << "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n" << "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl; + // 将统计信息写到文件里 + if (!g_mdArg.METRICS_FILE.empty()) { + ofstream ofsM(g_mdArg.METRICS_FILE); + ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl; + ofsM << "# " << g_mdArg.PROGRAM_RECORD_ID << " " << g_gArg.GetArgValueString() << " " << g_mdArg.GetArgValueString() << endl; + ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl; + ofsM << "# Started on: " << Timer::get_current_time_str() << endl << endl; + ofsM << "## METRICS CLASS\torg.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics" << endl; + ofsM << "LIBRARY\tUNPAIRED_READS_EXAMINED\tREAD_PAIRS_EXAMINED\tSECONDARY_OR_SUPPLEMENTARY_RDS\tUNMAPPED_" + "READS\tUNPAIRED_READ_DUPLICATES\tREAD_PAIR_DUPLICATES\tREAD_PAIR_OPTICAL_DUPLICATES\tPERCENT_" + "DUPLICATION\tESTIMATED_LIBRARY_SIZE" + << endl; + ofsM << gMetrics.LIBRARY << "\t" << gMetrics.UNPAIRED_READS_EXAMINED << "\t" << gMetrics.READ_PAIRS_EXAMINED + << "\t" << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\t" << gMetrics.UNMAPPED_READS << "\t" + << gMetrics.UNPAIRED_READ_DUPLICATES << "\t" << gMetrics.READ_PAIR_DUPLICATES << "\t" + << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\t" << gMetrics.PERCENT_DUPLICATION << "\t" + << gMetrics.ESTIMATED_LIBRARY_SIZE << endl << endl; + ofsM << "## HISTOGRAM\tjava.lang.Double" << endl; + ofsM << "BIN CoverageMult" << endl; + for (int i = 1; i <= 100; ++i) { + ofsM << i << "\t" << std::fixed << std::setprecision(6) << gMetrics.CoverageMult[i] << endl; + } + ofsM.close(); + } + +#if 0 + cout << "CoverageMult:" << endl; + for (int i = 1; i <= 100; ++i) { + cout << i << "\t" << gMetrics.CoverageMult[i] << endl; + } +#endif + +#if 0 cout << "singleton pos: " << gMetrics.singletonReads.size() << endl; for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl; for (auto &itr : gMetrics.NonOpticalDuplicateCountHist) cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl; for (auto &itr : gMetrics.OpticalDuplicatesCountHist) cout << "optical dup counts: " << itr.first << ": " << itr.second << endl; +#endif cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl; if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) { Error("writing index failed"); diff --git a/src/sam/markdups/markdups_arg.cpp b/src/sam/markdups/markdups_arg.cpp index e6d4652..6dd4d25 100644 --- a/src/sam/markdups/markdups_arg.cpp +++ b/src/sam/markdups/markdups_arg.cpp @@ -253,6 +253,7 @@ void MarkDupsArg::printArgValue() string MarkDupsArg::GetArgValueString() { vector argVals; + if (!this->METRICS_FILE.empty()) { argVals.push_back("--METRICS_FILE");argVals.push_back(this->METRICS_FILE); } argVals.push_back("--READ_NAME_REGEX"); if (this->READ_NAME_REGEX.empty()) argVals.push_back("null"); else argVals.push_back(this->READ_NAME_REGEX.c_str()); diff --git a/src/sam/markdups/pipeline_md.cpp b/src/sam/markdups/pipeline_md.cpp index e12009f..c5c8fe5 100644 --- a/src/sam/markdups/pipeline_md.cpp +++ b/src/sam/markdups/pipeline_md.cpp @@ -660,7 +660,6 @@ static void doIntersect(PipelineArg &pipeArg) { addToGlobal.insert(prevPosKey); } } - // return; map taskChanged; MDSet posProcessed; for (auto &e : recalcPos) { @@ -683,10 +682,11 @@ static void doIntersect(PipelineArg &pipeArg) { } // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据 // 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改(比如optical duplicate) - // for (auto posKey : addToGlobal) { - // g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; - // } - // return; +#if 0 + for (auto posKey : addToGlobal) { + g.unpairedPosArr[posKey] = lpSM.unpairedPosArr[posKey]; + } +#endif /* 不需要把p中找不到lp的unpair,放到global中,否则最后找到pair后,还要再执行一次冗余检测,造成重复的冗余索引 */ // 更新结果 diff --git a/src/sam/utils/read_ends.h b/src/sam/utils/read_ends.h index c3a49ad..dc34dfe 100644 --- a/src/sam/utils/read_ends.h +++ b/src/sam/utils/read_ends.h @@ -30,10 +30,10 @@ struct PhysicalLocation { * overflow within a HiSeqX tile. */ int16_t tile = -1; - int32_t x = -1; - int32_t y = -1; - //int16_t x = -1; - //int16_t y = -1; + //int32_t x = -1; + //int32_t y = -1; + int16_t x = -1; + int16_t y = -1; }; /* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/