统计信息写入文件，完整版1.0

2024-11-27 15:37:34 +08:00 · 2024-11-27 15:37:34 +08:00 · 223d16f2eb
parent 0ee22ca331
commit 223d16f2eb
12 changed files with 119 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,6 +4,7 @@ core.*
 *.sam
 *.bam
 *.log
+log/
 # ---> C++
 # Prerequisites
 *.d
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -103,6 +103,7 @@
        "__bit_reference": "cpp",
        "__string": "cpp",
        "filesystem": "cpp",
-        "ios": "cpp"
+        "ios": "cpp",
+        "locale": "cpp"
    }
 }
--- a/run.sh
+++ b/run.sh
@ -6,11 +6,12 @@
 nthread=32
 #nthread=64
 #nthread=128
-#input=/home/zzh/data/bam/zy_normal.bam
+#input=/home/zzh/data/bam/normal_all.sam
+input=/home/zzh/data/bam/zy_normal.bam
 #input=/home/zzh/data/bam/normal_all.sam
 #input=/home/zzh/data/bam/zy_tumor.bam
 #input=/home/zzh/data/wgs_na12878.bam
-input=~/data/bam/100w.bam
+#input=~/data/bam/100w.bam
 #input=~/data/bam/t100w.sam
 #input=~/data/bam/1k.sam
 #input=~/data/bam/1kw.sam
@ -19,7 +20,9 @@ input=~/data/bam/100w.bam
 #input=~/data/bam/tumor_small.bam
 #input=~/data/bam/normal_small.bam
 #input=~/data/bam/normal_30.bam
-output=/home/zzh/data1/na12878_out.bam
+#output=/home/zzh/data1/na12878_out.bam
+output=./out.bam
+metrics=./metrics.txt

 cd ./build/ && make -j 8 && cd ..
 #./out.sam \
@ -27,11 +30,11 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
    MarkDuplicates \
    --INPUT $input \
    --OUTPUT $output \
-    --INDEX_FORMAT BAI \
--num_threads $nthread \
-    --max_mem 2G \
+    --METRICS_FILE $metrics \
+    --INDEX_FORMAT CSI \
+    --num_threads $nthread \
    --verbosity DEBUG \
-    --asyncio true -TAG_DUPLICATE_SET_MEMBERS true --READ_NAME_REGEX null #
+    --asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null #
    #--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ""
    #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
    #--READ_NAME_REGEX ""
--- a/src/common/utils/global_arg.cpp
+++ b/src/common/utils/global_arg.cpp
@ -29,7 +29,7 @@ void GlobalArg::initGlobalOptions() {
    v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
    v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
    v.push_back({"num_threads", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_NUM_THREADS});
-    v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
+//    v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
    v.push_back({"verbosity", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_LOG_LEVEL});
    v.push_back({"asyncio", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_ASYNCIO});
    v.push_back({"version", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_VERSION});
@ -43,7 +43,7 @@ void GlobalArg::initGlobalOptions() {
    vArgInfo.push_back("--INPUT                       Input file path (bam, vcf ...)\n");
    vArgInfo.push_back("--OUTPUT                      Output file path \n");
    vArgInfo.push_back("--num_threads <num_threads>   Number of threads to allocate to this analysis [1]\n");
-    vArgInfo.push_back("--max_mem <max_mem>           Set maximum memory; suffix K/M/G recognized [2G]\n");
+//    vArgInfo.push_back("--max_mem <max_mem>           Set maximum memory; suffix K/M/G recognized [2G]\n");
    vArgInfo.push_back("--verbosity <log level>       Control verbosity of logging. error/warning/info/debug [info]\n");
    vArgInfo.push_back("--asyncio                     Use async io [true]\n");
    vArgInfo.push_back("--version                     Output version information\n");
@ -73,8 +73,8 @@ void GlobalArg::parseArgument(int argNum) {
        else if (*q == 'g' || *q == 'G')
            mem_arg <<= 30;
        //if (mem_arg >= max_mem)
-        if (true)
-            max_mem = mem_arg;
+        //if (true)
+        //    max_mem = mem_arg;
        else {
            std::cerr << "[Warn] Too small mem size, use default" << std::endl;
        }
@ -104,10 +104,10 @@ void GlobalArg::parseArgument(int argNum) {
 }

 string GlobalArg::GetArgValueString() {
-            printf("--INPUT       = %s\n", in_fn.c_str());
+        printf("--INPUT       = %s\n", in_fn.c_str());
        printf("--OUTPUT      = %s\n", out_fn.c_str());
        printf("--num_threads = %d\n", num_threads);
-        printf("--max_mem     = %ld\n", max_mem);
+        // printf("--max_mem     = %ld\n", max_mem);
        printf("--verbosity   = %d\n", verbosity);
        printf("--asyncio     = %d\n", use_asyncio);

--- a/src/common/utils/global_arg.h
+++ b/src/common/utils/global_arg.h
@ -48,7 +48,8 @@ struct GlobalArg {
    string in_fn;                                 // input bam filename
    string out_fn;                                // output bam filename
    int num_threads = 1;                          // 线程个数
-    size_t max_mem = ((size_t)1) << 30;           // 最小1G
+    // size_t max_mem = ((size_t)3) << 29;           // 最小1.5G
+    size_t max_mem = ((size_t)1) << 31;           // 最小2G
    ns_ga::LogLevelEnum verbosity = ns_ga::INFO;  // 打印信息级别
    bool use_asyncio = true;                      // 是否使用异步io

--- a/src/common/utils/timer.cpp
+++ b/src/common/utils/timer.cpp
@ -64,6 +64,16 @@ void Timer::print_current_time() {
    puts(now);
 }

+string Timer::get_current_time_str() {
+    time_t time_val;
+    struct tm *at;
+    char now[80];
+    time(&time_val);
+    at = localtime(&time_val);
+    strftime(now, 79, "%B %d, %Y at %I:%M:%S %p %Z", at);
+    return string(now);
+}
+
 double Timer::get_mseconds() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
--- a/src/common/utils/timer.h
+++ b/src/common/utils/timer.h
@ -10,6 +10,9 @@
 #ifndef TIMER_H_
 #define TIMER_H_

+#include <string>
+using std::string;
+
 /*
 * @brief: Record the run time of this program
 */
@ -27,6 +30,7 @@ public:

    static void log_time(const char *desc);
    static void print_current_time();
+    static string get_current_time_str();
    static double get_mseconds();
    static double get_seconds();

--- a/src/sam/markdups/dup_metrics.h
+++ b/src/sam/markdups/dup_metrics.h
@ -58,6 +58,7 @@ struct DuplicationMetrics {
    uint64_t ESTIMATED_LIBRARY_SIZE = 0;

    // 其他的统计数据
+    vector<double> CoverageMult;

    // 比如在该位置，有4个冗余，那么所有4个冗余的位置数量
    MDMap DuplicateCountHist;
--- a/src/sam/markdups/markdups.cpp
+++ b/src/sam/markdups/markdups.cpp
@ -19,6 +19,7 @@ Date : 2023/10/23
 #include <sam/utils/read_ends.h>
 #include <sam/utils/read_name_parser.h>

+#include <iomanip>
 #include <iostream>
 #include <queue>
 #include <set>
@ -39,7 +40,8 @@ using std::string;

 #define SMA_TAG_PG "PG"

-#define BAM_BLOCK_SIZE 32L * 1024 * 1024
+#define BAM_BLOCK_SIZE 16L * 1024 * 1024
+// #define BAM_BLOCK_SIZE 32L * 1024 * 1024
 #define NO_SUCH_INDEX INT64_MAX

 Timer tm_arr[50];  // 用来测试性能
@ -107,6 +109,34 @@ static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) {
    return librarySize;
 }

+/**
+ * Estimates the ROI (return on investment) that one would see if a library was sequenced to
+ * x higher coverage than the observed coverage.
+ *
+ * @param estimatedLibrarySize the estimated number of molecules in the library
+ * @param x                    the multiple of sequencing to be simulated (i.e. how many X sequencing)
+ * @param pairs                the number of pairs observed in the actual sequencing
+ * @param uniquePairs          the number of unique pairs observed in the actual sequencing
+ * @return a number z <= x that estimates if you had pairs*x as your sequencing then you
+ * would observe uniquePairs*z unique pairs.
+ */
+static double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) {
+    return estimatedLibrarySize * (1 - exp(-(x * pairs) / estimatedLibrarySize)) / uniquePairs;
+}
+
+/**
+ * Calculates a histogram using the estimateRoi method to estimate the effective yield
+ * doing x sequencing for x=1..10.
+ */
+static void calculateRoiHistogram(DuplicationMetrics &metrics) {
+    int64_t uniquePairs = metrics.READ_PAIRS_EXAMINED - metrics.READ_PAIR_DUPLICATES;
+    metrics.CoverageMult.resize(101);
+    for (int x = 1; x <= 100; ++x) {
+        metrics.CoverageMult[x] =
+            estimateRoi(metrics.ESTIMATED_LIBRARY_SIZE, x, metrics.READ_PAIRS_EXAMINED, uniquePairs);
+    }
+}
+
 /*
 * 获取文件名后缀
 */
@ -127,7 +157,7 @@ int MarkDuplicates(int argc, char *argv[]) {
    Timer time_all;

    /* 读取命令行参数 */
-                                                 g_mdArg.parseArgument(argc, argv, &g_gArg);  // 解析命令行参数
+    g_mdArg.parseArgument(argc, argv, &g_gArg);  // 解析命令行参数
    if (g_gArg.num_threads < 1)
        g_gArg.num_threads = 1;  // 线程数不能小于1

@ -150,10 +180,10 @@ int MarkDuplicates(int argc, char *argv[]) {
    /* 利用线程池对输入输出文件进行读写 */
    htsThreadPool htsPoolRead = {NULL, 0};   // 多线程读取，创建线程池
    htsThreadPool htsPoolWrite = {NULL, 0};  // 读写用不同的线程池
-    // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
-    // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
-    htsPoolRead.pool = hts_tpool_init(32);
-    htsPoolWrite.pool = hts_tpool_init(32);
+    htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
+    htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
+    //htsPoolRead.pool = hts_tpool_init(16);
+    //htsPoolWrite.pool = hts_tpool_init(32);
    if (!htsPoolRead.pool || !htsPoolWrite.pool) {
        Error("[%d] failed to set up thread pool", __LINE__);
        sam_close(g_inBamFp);
@ -237,7 +267,7 @@ int MarkDuplicates(int argc, char *argv[]) {
    uint32_t duplicateSetSize = 0;

    int64_t realDupSize = 0;
-    // exit(0);
+//    exit(0);
    while (inBuf.ReadStat() >= 0) {
        Timer tw1;
        size_t readNum = inBuf.ReadBam();
@ -335,7 +365,7 @@ int MarkDuplicates(int argc, char *argv[]) {

            if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) {
                if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
-                    cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
+                    // cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
                    uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str());
                    if (oldTagVal != NULL)
                        bam_aux_del(bw->b, oldTagVal);
@ -391,6 +421,7 @@ int MarkDuplicates(int argc, char *argv[]) {
    } else {
        gMetrics.PERCENT_DUPLICATION = 0;
    }
+    calculateRoiHistogram(gMetrics);

    cout << "metrics: \n"
         << "LIBRARY:                        " << gMetrics.LIBRARY << "\n"
@ -404,12 +435,46 @@ int MarkDuplicates(int argc, char *argv[]) {
         << "PERCENT_DUPLICATION:            " << gMetrics.PERCENT_DUPLICATION << "\n"
         << "ESTIMATED_LIBRARY_SIZE:         " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;

+    // 将统计信息写到文件里
+    if (!g_mdArg.METRICS_FILE.empty()) {
+        ofstream ofsM(g_mdArg.METRICS_FILE);
+        ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl;
+        ofsM << "# " << g_mdArg.PROGRAM_RECORD_ID << " " << g_gArg.GetArgValueString() << " " << g_mdArg.GetArgValueString() << endl;
+        ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl;
+        ofsM << "# Started on: " << Timer::get_current_time_str() << endl << endl;
+        ofsM << "## METRICS CLASS\torg.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics" << endl;
+        ofsM << "LIBRARY\tUNPAIRED_READS_EXAMINED\tREAD_PAIRS_EXAMINED\tSECONDARY_OR_SUPPLEMENTARY_RDS\tUNMAPPED_"
+                "READS\tUNPAIRED_READ_DUPLICATES\tREAD_PAIR_DUPLICATES\tREAD_PAIR_OPTICAL_DUPLICATES\tPERCENT_"
+                "DUPLICATION\tESTIMATED_LIBRARY_SIZE"
+             << endl;
+        ofsM << gMetrics.LIBRARY << "\t" << gMetrics.UNPAIRED_READS_EXAMINED << "\t" << gMetrics.READ_PAIRS_EXAMINED
+             << "\t" << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\t" << gMetrics.UNMAPPED_READS << "\t"
+             << gMetrics.UNPAIRED_READ_DUPLICATES << "\t" << gMetrics.READ_PAIR_DUPLICATES << "\t"
+             << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\t" << gMetrics.PERCENT_DUPLICATION << "\t"
+             << gMetrics.ESTIMATED_LIBRARY_SIZE << endl << endl;
+        ofsM << "## HISTOGRAM\tjava.lang.Double" << endl;
+        ofsM << "BIN CoverageMult" << endl;
+        for (int i = 1; i <= 100; ++i) {
+            ofsM << i << "\t" << std::fixed << std::setprecision(6) << gMetrics.CoverageMult[i] << endl;
+        }
+        ofsM.close();
+    }
+
+#if 0
+    cout << "CoverageMult:" << endl;
+    for (int i = 1; i <= 100; ++i) {
+        cout << i << "\t" << gMetrics.CoverageMult[i] << endl;
+    }
+#endif
+
+#if 0
    cout << "singleton pos: " << gMetrics.singletonReads.size() << endl;
    for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl;
    for (auto &itr : gMetrics.NonOpticalDuplicateCountHist)
        cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl;
    for (auto &itr : gMetrics.OpticalDuplicatesCountHist)
        cout << "optical dup counts: " << itr.first << ": " << itr.second << endl;
+#endif
    cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl;
    if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) {
        Error("writing index failed");
--- a/src/sam/markdups/markdups_arg.cpp
+++ b/src/sam/markdups/markdups_arg.cpp
@ -253,6 +253,7 @@ void MarkDupsArg::printArgValue()

 string MarkDupsArg::GetArgValueString() {
    vector<string> argVals;
+    if (!this->METRICS_FILE.empty()) { argVals.push_back("--METRICS_FILE");argVals.push_back(this->METRICS_FILE); }
    argVals.push_back("--READ_NAME_REGEX");
    if (this->READ_NAME_REGEX.empty()) argVals.push_back("null");
    else argVals.push_back(this->READ_NAME_REGEX.c_str());
--- a/src/sam/markdups/pipeline_md.cpp
+++ b/src/sam/markdups/pipeline_md.cpp
@ -660,7 +660,6 @@ static void doIntersect(PipelineArg &pipeArg) {
            addToGlobal.insert(prevPosKey);
        }
    }
-    // return;
    map<int64_t, TaskSeqDupInfo> taskChanged;
    MDSet<int64_t> posProcessed;
    for (auto &e : recalcPos) {
@ -683,10 +682,11 @@ static void doIntersect(PipelineArg &pipeArg) {
    }
    // 最后再添加，以防开始赋值，后来这个位置要是又添加了新的数据
    // 放在这里，因为lp中的unpairedPosArr中的readends可能会被修改（比如optical duplicate）
-    // for (auto posKey : addToGlobal) {
-    //     g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
-    // }
-    // return;
+#if 0
+    for (auto posKey : addToGlobal) {
+        g.unpairedPosArr[posKey] = lpSM.unpairedPosArr[posKey];
+    }
+#endif
    /* 不需要把p中找不到lp的unpair，放到global中，否则最后找到pair后，还要再执行一次冗余检测，造成重复的冗余索引 */

    // 更新结果
--- a/src/sam/utils/read_ends.h
+++ b/src/sam/utils/read_ends.h
@ -30,10 +30,10 @@ struct PhysicalLocation {
     * overflow within a HiSeqX tile.
     */
    int16_t tile = -1;
-    int32_t x = -1;
-    int32_t y = -1;
-    //int16_t x = -1;
-    //int16_t y = -1;
+    //int32_t x = -1;
+    //int32_t y = -1;
+    int16_t x = -1;
+    int16_t y = -1;
 };

 /* 包含了所有read ends信息，如picard里边的 ReadEndsForMarkDuplicates*/