diff --git a/.vscode/launch.json b/.vscode/launch.json index 9b52ffc..ca8917a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,7 +12,14 @@ "request": "launch", "program": "${workspaceRoot}/build/bin/picard_cpp", "args": [ - "MarkDuplicates" + "MarkDuplicates", + "--INPUT", "test.bam", + "--OUTPUT", "out.bam", + "--METRICS_FILE", "metrics.txt", + "--num_threads", "12", + "--max_mem", "4G", + "--verbosity", "DEBUG", + "--asyncio", "true", ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 } diff --git a/.vscode/settings.json b/.vscode/settings.json index f66631f..6c73617 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { "files.associations": { - "cstring": "cpp" + "cstring": "cpp", + "vector": "cpp", + "random": "cpp" } } \ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..963fc94 --- /dev/null +++ b/build.sh @@ -0,0 +1,8 @@ +#!/bin/bash +dir="/home/zzh/work/GeneKit/picard_cpp/build" +#[ -d "$dir" ] && rm -rf "$dir" +#mkdir "$dir" +cd "$dir" +cmake .. -DCMAKE_BUILD_TYPE=Debug +#cmake .. 
-DCMAKE_BUILD_TYPE=Release +make -j 8 diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..ede0c45 --- /dev/null +++ b/run.sh @@ -0,0 +1,8 @@ +/home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \ + MarkDuplicates \ + --INPUT test.bam \ + --OUTPUT out.bam \ + --num_threads 12 \ + --max_mem 4G \ + --verbosity DEBUG \ + --asyncio true diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35908d1..5286867 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,7 +3,7 @@ SET(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin") # 源码目录 AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src MAIN_SRC) -# AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/common COMMON) +AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/common COMMON) AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam SAM_SRC) AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam/markdups SAM_MARKDUPS_SRC) @@ -19,7 +19,7 @@ LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib") set(PG_NAME "picard_cpp") # 为程序添加依赖关系 -ADD_EXECUTABLE(${PG_NAME} ${MAIN_SRC} ${SAM_SRC} ${SAM_MARKDUPS_SRC}) +ADD_EXECUTABLE(${PG_NAME} ${MAIN_SRC} ${COMMON} ${SAM_SRC} ${SAM_MARKDUPS_SRC}) # 链接库 TARGET_LINK_LIBRARIES(${PG_NAME} libhts.a) diff --git a/src/common/global_arg.cpp b/src/common/global_arg.cpp new file mode 100644 index 0000000..1588147 --- /dev/null +++ b/src/common/global_arg.cpp @@ -0,0 +1,109 @@ +/* + Description: 全局参数,所有模块都可能用到的参数 + + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2023/10/23 +*/ + +#include "global_arg.h" + +#include +#include +#include +#include + +using std::vector; + +/* + * GlobalArg 类 + */ + +struct option *GlobalArg::GLOBAL_OPT = nullptr; + +// 初始化参数 +void GlobalArg::initGlobalOptions() +{ + vector v; + v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件 + v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件 + v.push_back({"num_threads", required_argument, NULL, 
ns_ga::GlobalOptEnum::OPT_NUM_THREADS}); + v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM}); + v.push_back({"verbosity", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_LOG_LEVEL}); + v.push_back({"asyncio", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_ASYNCIO}); + v.push_back({"version", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_VERSION}); + v.push_back({"help", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_HELP}); + v.push_back({0, 0, 0, 0}); + + GLOBAL_OPT = new struct option[GLOBAL_ARG_CNT]; + memcpy(GLOBAL_OPT, v.data(), v.size() * sizeof(struct option)); + + /* 添加帮助信息, 按arg enum顺序进行添加信息 */ + vArgInfo.push_back("--INPUT Input file path (bam, vcf ...)\n"); + vArgInfo.push_back("--OUTPUT Output file path \n"); + vArgInfo.push_back("--num_threads Number of threads to allocate to this analysis [1]\n"); + vArgInfo.push_back("--max_mem Set maximum memory; suffix K/M/G recognized [2G]\n"); + vArgInfo.push_back("--verbosity Control verbosity of logging. 
error/warning/info/debug [info]\n"); + vArgInfo.push_back("--asyncio Use async io [true]\n"); + vArgInfo.push_back("--version Output version information\n"); + vArgInfo.push_back("--help Generate the help message\n"); +} + +// 解析参数 +void GlobalArg::parseArgument(int argNum) +{ + using namespace ns_ga; + switch (argNum) + { + case OPT_INPUT: + in_fn = optarg; + break; + case OPT_OUTPUT: + out_fn = optarg; + break; + case OPT_NUM_THREADS: + num_threads = std::stoi(optarg); + break; + case OPT_MAX_MEM: + { + char *q; + size_t mem_arg = strtol(optarg, &q, 0); + if (*q == 'k' || *q == 'K') + mem_arg <<= 10; + else if (*q == 'm' || *q == 'M') + mem_arg <<= 20; + else if (*q == 'g' || *q == 'G') + mem_arg <<= 30; + if (mem_arg >= max_mem) + max_mem = mem_arg; + else + { + std::cerr << "[Warn] Too small mem size, use default" << std::endl; + } + break; + } + case OPT_LOG_LEVEL: + { + if (strcmp("ERROR", optarg) == 0) + verbosity = ns_ga::ERROR; + else if (strcmp("WARNING", optarg) == 0) + verbosity = ns_ga::WARNING; + else if (strcmp("INFO", optarg) == 0) + verbosity = ns_ga::INFO; + else if (strcmp("DEBUG", optarg) == 0) + verbosity = ns_ga::DEBUG; + break; + } + case OPT_ASYNCIO: + { + if (strcmp("true", optarg) == 0) + use_asyncio = true; + else if (strcmp("false", optarg) == 0) + use_asyncio = false; + break; + } + default: + break; + } +} \ No newline at end of file diff --git a/src/common/global_arg.h b/src/common/global_arg.h new file mode 100644 index 0000000..4ae614e --- /dev/null +++ b/src/common/global_arg.h @@ -0,0 +1,105 @@ +/* +Description: picard_cpp共享的一些参数 + +Copyright : All right reserved by NCIC.ICT + +Author : Zhang Zhonghai +Date : 2023/10/23 +*/ +#ifndef GLOBAL_ARG_H_ +#define GLOBAL_ARG_H_ + +#include +#include +#include +#include +#include + +using std::map; +using std::string; +using std::vector; + +namespace ns_ga { + enum GlobalOptEnum + { + _START_NUM = 1, + OPT_INPUT, + OPT_OUTPUT, + OPT_NUM_THREADS, + OPT_MAX_MEM, + OPT_LOG_LEVEL, + 
OPT_ASYNCIO, + OPT_VERSION, + OPT_HELP, + _END_NUM + }; + + // log level + enum LogLevelEnum + { + ERROR, + WARNING, + INFO, + DEBUG + }; +} + +/* 全局共享的一些参数 */ +struct GlobalArg +{ + const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1 + static struct option *GLOBAL_OPT; + + string in_fn; // input bam filename + string out_fn; // output bam filename + int num_threads = 1; // 线程个数 + size_t max_mem = ((size_t)2) << 30; // 最小2G + ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别 + bool use_asyncio = true; // 是否使用异步io + + vector vArgInfo; // 每个参数的帮助信息 + + // 单例模式 + GlobalArg(const GlobalArg &) = delete; + GlobalArg &operator=(const GlobalArg &) = delete; + + // 获取单例 + static GlobalArg &Instance() + { + static GlobalArg instance; + return instance; + } + // 初始化参数 + void initGlobalOptions(); + + // 解析参数 + void parseArgument(int argNum); + + // 获取对应参数在数组(option和help info)中的索引 + int getArgIndx(ns_ga::GlobalOptEnum opt) + { + return opt - ns_ga::GlobalOptEnum::OPT_INPUT; + } + + // 打印某个参数的帮助信息 + void printArgInfo(ns_ga::GlobalOptEnum arg) { + int idx = getArgIndx(arg); + fprintf(stdout, "%s\n", vArgInfo[idx].c_str()); + } + + void printArgValue() { + printf("--INPUT = %s\n", in_fn.c_str()); + printf("--OUTPUT = %s\n", out_fn.c_str()); + printf("--num_threads = %d\n",num_threads); + printf("--max_mem = %ld\n", max_mem); + printf("--verbosity = %d\n", verbosity); + printf("--asyncio = %d\n", use_asyncio); + } +private : + GlobalArg() + { + initGlobalOptions(); + }; +}; + +#endif \ No newline at end of file diff --git a/src/sam/markdups/markdups.cpp b/src/sam/markdups/markdups.cpp index 48dbad6..f33e97c 100644 --- a/src/sam/markdups/markdups.cpp +++ b/src/sam/markdups/markdups.cpp @@ -6,6 +6,8 @@ Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2023/10/23 */ +#include "markdups_arg.h" +#include #include @@ -16,11 +18,18 @@ using namespace std; */ int MarkDuplicates(int argc, char *argv[]) { - 
cout << argc << endl; - for (int i = 0; i < argc; ++i) { - cout << argv[i] << '\t'; - } - cout << endl; + // cout << argc << endl; + // for (int i = 0; i < argc; ++i) { + // cout << argv[i] << '\t'; + // } + // cout << endl; + + GlobalArg &gArg = GlobalArg::Instance(); + MarkDupsArg mdArg; + vector vAuxVar; + mdArg.parseArgument(argc, argv, &vAuxVar, &gArg); + + // cout << ns_md::ValidationStringency::DEFAULT_STRINGENCY << '\t' << ns_md::ValidationStringency::SILENT << endl; return 0; } \ No newline at end of file diff --git a/src/sam/markdups/markdups_arg.cpp b/src/sam/markdups/markdups_arg.cpp index e69de29..97b7c31 100644 --- a/src/sam/markdups/markdups_arg.cpp +++ b/src/sam/markdups/markdups_arg.cpp @@ -0,0 +1,488 @@ +/* +Description: Markduplicate需要用到的一些参数,读取命令行给的参数,并做一些初始化 + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2023/10/27 +*/ + +#include "markdups_arg.h" +#include "common/global_arg.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +using std::cout, std::endl; + +using std::ostringstream; +using std::stod; +using std::stoi; +using std::stol; +using std::string; +using std::vector; + +using namespace ns_md; + +/* + * mutect参数 + */ +const static struct option kMdOpts[] = { + {"MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP", required_argument, NULL, MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP}, + {"MAX_FILE_HANDLES_FOR_READ_ENDS_MAP", required_argument, NULL, MAX_FILE_HANDLES_FOR_READ_ENDS_MAP}, + {"SORTING_COLLECTION_SIZE_RATIO", required_argument, NULL, SORTING_COLLECTION_SIZE_RATIO}, + {"BARCODE_TAG", required_argument, NULL, BARCODE_TAG}, + {"READ_ONE_BARCODE_TAG", required_argument, NULL, READ_ONE_BARCODE_TAG}, + {"READ_TWO_BARCODE_TAG", required_argument, NULL, READ_TWO_BARCODE_TAG}, + {"TAG_DUPLICATE_SET_MEMBERS", required_argument, NULL, TAG_DUPLICATE_SET_MEMBERS}, + {"REMOVE_SEQUENCING_DUPLICATES", required_argument, NULL, REMOVE_SEQUENCING_DUPLICATES}, + {"TAGGING_POLICY", required_argument, 
NULL, TAGGING_POLICY}, + {"CLEAR_DT", required_argument, NULL, CLEAR_DT}, + {"DUPLEX_UMI", required_argument, NULL, DUPLEX_UMI}, + {"MOLECULAR_IDENTIFIER_TAG", required_argument, NULL, MOLECULAR_IDENTIFIER_TAG}, + {"METRICS_FILE", required_argument, NULL, METRICS_FILE}, + {"REMOVE_DUPLICATES", required_argument, NULL, REMOVE_DUPLICATES}, + {"ASSUME_SORTED", required_argument, NULL, ASSUME_SORTED}, + {"ASSUME_SORT_ORDER", required_argument, NULL, ASSUME_SORT_ORDER}, + {"DUPLICATE_SCORING_STRATEGY", required_argument, NULL, DUPLICATE_SCORING_STRATEGY}, + {"PROGRAM_RECORD_ID", required_argument, NULL, PROGRAM_RECORD_ID}, + {"PROGRAM_GROUP_VERSION", required_argument, NULL, PROGRAM_GROUP_VERSION}, + {"PROGRAM_GROUP_COMMAND_LINE", required_argument, NULL, PROGRAM_GROUP_COMMAND_LINE}, + {"PROGRAM_GROUP_NAME", required_argument, NULL, PROGRAM_GROUP_NAME}, + {"COMMENT", required_argument, NULL, COMMENT}, + {"READ_NAME_REGEX", required_argument, NULL, READ_NAME_REGEX}, + {"OPTICAL_DUPLICATE_PIXEL_DISTANCE", required_argument, NULL, OPTICAL_DUPLICATE_PIXEL_DISTANCE}, + {"MAX_OPTICAL_DUPLICATE_SET_SIZE", required_argument, NULL, MAX_OPTICAL_DUPLICATE_SET_SIZE}, + {"QUIET", required_argument, NULL, QUIET}, + {"VALIDATION_STRINGENCY", required_argument, NULL, VALIDATION_STRINGENCY}, + {"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL}, + {"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM}, + {"CREATE_INDEX", required_argument, NULL, CREATE_INDEX}, + {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}}; + +// 判断bool类型的参数 +void setBoolArg(bool *arg) { + if (strcmp("true", optarg) == 0) + *arg = true; + else if (strcmp("false", optarg) == 0) + *arg = false; +} + +// 解析参数 +void MarkDupsArg::parseArgument(int argc, + char **argv, + vector *pvAuxVar, + GlobalArg *pGArg) +{ + auto &vAuxVar = *pvAuxVar; + auto &gArg = *pGArg; + + struct option allOpt[MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT]; + + memcpy(allOpt, kMdOpts, 
MarkDupsArg::ARG_COUNT * sizeof(struct option)); + memcpy(&allOpt[MarkDupsArg::ARG_COUNT], GlobalArg::GLOBAL_OPT, GlobalArg::GLOBAL_ARG_CNT * sizeof(struct option)); + + // int cnt = MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT; + // cout << cnt << endl; + // for (int i = 0; i < cnt; ++i) + // { + // cout << i << '\t' << allOpt[i].name << endl; + // } + + int c; + while ((c = getopt_long_only(argc, argv, "", allOpt, NULL)) >= 0) + { + + gArg.parseArgument(c); + switch (c) + { + case ns_ga::OPT_VERSION: + PrintVersion(); + exit(0); + case ns_ga::OPT_HELP: + PrintHelp(); + exit(0); + case ns_md::MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP: + MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = stoi(optarg); + break; + case ns_md::MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = stoi(optarg); + break; + case ns_md::SORTING_COLLECTION_SIZE_RATIO: + SORTING_COLLECTION_SIZE_RATIO = stod(optarg); + break; + case ns_md::BARCODE_TAG: + BARCODE_TAG = optarg; + break; + case ns_md::READ_ONE_BARCODE_TAG: + READ_ONE_BARCODE_TAG = optarg; + break; + case ns_md::READ_TWO_BARCODE_TAG: + READ_TWO_BARCODE_TAG = optarg; + break; + case ns_md::TAG_DUPLICATE_SET_MEMBERS: + setBoolArg(&TAG_DUPLICATE_SET_MEMBERS); + break; + case ns_md::REMOVE_SEQUENCING_DUPLICATES: + setBoolArg(&REMOVE_SEQUENCING_DUPLICATES); + break; + case ns_md::TAGGING_POLICY: + if (strcmp("DontTag", optarg) == 0) + TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag; + else if (strcmp("OpticalOnly", optarg) == 0) + TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::OpticalOnly; + else if (strcmp("All", optarg) == 0) + TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::All; + break; + case ns_md::CLEAR_DT: + setBoolArg(&CLEAR_DT); + break; + case ns_md::DUPLEX_UMI: + setBoolArg(&DUPLEX_UMI); + break; + case ns_md::MOLECULAR_IDENTIFIER_TAG: + MOLECULAR_IDENTIFIER_TAG = optarg; + break; + case ns_md::METRICS_FILE: + METRICS_FILE = optarg; + break; + case ns_md::REMOVE_DUPLICATES: + 
setBoolArg(&REMOVE_DUPLICATES); + break; + case ns_md::ASSUME_SORTED: + setBoolArg(&ASSUME_SORTED); + break; + case ns_md::ASSUME_SORT_ORDER: + if (strcmp("unsorted", optarg) == 0) + ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted; + else if (strcmp("queryname", optarg) == 0) + ASSUME_SORT_ORDER = ns_md::SortOrder::queryname; + else if (strcmp("coordinate", optarg) == 0) + ASSUME_SORT_ORDER = ns_md::SortOrder::coordinate; + else if (strcmp("duplicate", optarg) == 0) + ASSUME_SORT_ORDER = ns_md::SortOrder::duplicate; + else if (strcmp("unknown", optarg) == 0) + ASSUME_SORT_ORDER = ns_md::SortOrder::unknown; + break; + case ns_md::DUPLICATE_SCORING_STRATEGY: + if (strcmp("SUM_OF_BASE_QUALITIES", optarg) == 0) + DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES; + else if (strcmp("TOTAL_MAPPED_REFERENCE_LENGTH", optarg) == 0) + DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH; + else if (strcmp("RANDOM", optarg) == 0) + DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::RANDOM; + break; + case ns_md::PROGRAM_RECORD_ID: + PROGRAM_RECORD_ID = optarg; + break; + case ns_md::PROGRAM_GROUP_VERSION: + PROGRAM_GROUP_VERSION = optarg; + break; + case ns_md::PROGRAM_GROUP_COMMAND_LINE: + PROGRAM_GROUP_COMMAND_LINE = optarg; + break; + case ns_md::PROGRAM_GROUP_NAME: + PROGRAM_GROUP_NAME = optarg; + break; + case ns_md::COMMENT: + COMMENT.push_back(optarg); + break; + case ns_md::READ_NAME_REGEX: + READ_NAME_REGEX = optarg; + break; + case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE: + OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg); + break; + case ns_md::MAX_OPTICAL_DUPLICATE_SET_SIZE: + MAX_OPTICAL_DUPLICATE_SET_SIZE = stol(optarg); + break; + case ns_md::QUIET: + setBoolArg(&QUIET); + break; + case ns_md::VALIDATION_STRINGENCY: + if (strcmp("STRICT", optarg) == 0) + VALIDATION_STRINGENCY = ns_md::ValidationStringency::STRICT; + else if (strcmp("LENIENT", optarg) == 0) + VALIDATION_STRINGENCY = 
ns_md::ValidationStringency::LENIENT; + else if (strcmp("SILENT", optarg) == 0) + VALIDATION_STRINGENCY = ns_md::ValidationStringency::SILENT; + break; + case ns_md::COMPRESSION_LEVEL: + COMPRESSION_LEVEL = stoi(optarg); + break; + case ns_md::MAX_RECORDS_IN_RAM: + MAX_RECORDS_IN_RAM = stoi(optarg); + break; + case ns_md::CREATE_INDEX: + setBoolArg(&CREATE_INDEX); + break; + case ns_md::CREATE_MD5_FILE: + setBoolArg(&CREATE_MD5_FILE); + break; + default: + break; + } + } + + gArg.printArgValue(); +} + +// 打印版本信息 +void MarkDupsArg::PrintVersion() +{ + fprintf(stdout, "\n MarkDuplicate Version: %s\n", MARKDUPLICATE_VERSION); +} + +// 释放资源,关闭文件等 +void MarkDupsArg::Finalize(MarkDupsArg *pMdArg, + vector *pvAuxVar, + GlobalArg *pGArg) +{ +} + +// 打印帮助信息 +void MarkDupsArg::PrintHelp() +{ + FILE *fp = stdout; + fprintf(fp, + "Usage: MarkDuplicates [arguments]\n" + "\n" + "Example:\n" + " ./picard_cpp MarkDuplicates --num_thread 4 --INPUT input.bam --OUTPUT marked_duplicates.bam --METRICS_FILE marked_dup_metrics.txt\n" + "\n" + "Required Arguments:\n" + "\n" + "--INPUT,-I One or more input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n" + " argument must be specified at least once.Required.\n" + "\n" + "--METRICS_FILE,-M File to write duplication metrics to Required.\n" + "\n" + "--OUTPUT,-O The output file to write marked records to Required.\n" + "\n" + "\n" + "Optional Arguments:\n" + "\n" + "--ADD_PG_TAG_TO_READS \n" + " Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n" + " false}\n" + "\n" + "--arguments_file read one or more arguments files and add them to the command line This argument may be\n" + " specified 0 or more times. Default value: null.\n" + "\n" + "--ASSUME_SORT_ORDER,-ASO \n" + " If not null, assume that the input file has this order even if the header says otherwise.\n" + " Default value: null. 
Possible values: {unsorted, queryname, coordinate, duplicate,\n" + " unknown} Cannot be used in conjunction with argument(s) ASSUME_SORTED (AS)\n" + "\n" + "\n" + "--ASSUME_SORTED,-AS If true, assume that the input file is coordinate sorted even if the header says\n" + " otherwise. Deprecated, used ASSUME_SORT_ORDER=coordinate instead. Default value: false.\n" + " Possible values: {true, false} Cannot be used in conjunction with argument(s)\n" + " ASSUME_SORT_ORDER (ASO)\n" + "\n" + "--BARCODE_TAG Barcode SAM tag (ex. BC for 10X Genomics) Default value: null.\n" + "\n" + "--CLEAR_DT Clear DT tag from input SAM records. Should be set to false if input SAM doesn't have this\n" + " tag. Default true Default value: true. Possible values: {true, false}\n" + "\n" + "--COMMENT,-CO Comment(s) to include in the output file's header. This argument may be specified 0 or\n" + " more times. Default value: null.\n" + "\n" + "--COMPRESSION_LEVEL Compression level for all compressed files created (e.g. BAM and VCF). Default value: 5.\n" + "\n" + "--CREATE_INDEX Whether to create an index when writing VCF or coordinate sorted BAM output. Default\n" + " value: false. Possible values: {true, false}\n" + "\n" + "--CREATE_MD5_FILE Whether to create an MD5 digest for any BAM or FASTQ files created. Default value:\n" + " false. Possible values: {true, false}\n" + "\n" + "--DUPLEX_UMI Treat UMIs as being duplex stranded. This option requires that the UMI consist of two\n" + " equal length strings that are separated by a hyphen (e.g. 'ATC-GTC'). Reads are considered\n" + " duplicates if, in addition to standard definition, have identical normalized UMIs. A UMI\n" + " from the 'bottom' strand is normalized by swapping its content around the hyphen (eg.\n" + " ATC-GTC becomes GTC-ATC). 
A UMI from the 'top' strand is already normalized as it is.\n" + " Both reads from a read pair considered top strand if the read 1 unclipped 5' coordinate is\n" + " less than the read 2 unclipped 5' coordinate. All chimeric reads and read fragments are\n" + " treated as having come from the top strand. With this option is it required that the\n" + " BARCODE_TAG hold non-normalized UMIs. Default false. Default value: false. Possible\n" + " values: {true, false}\n" + "\n" + "--DUPLICATE_SCORING_STRATEGY,-DS \n" + " The scoring strategy for choosing the non-duplicate among candidates. Default value:\n" + " SUM_OF_BASE_QUALITIES. Possible values: {SUM_OF_BASE_QUALITIES,\n" + " TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM}\n" + "\n" + "--FLOW_EFFECTIVE_QUALITY_THRESHOLD \n" + " Threshold for considering a quality value high enough to be included when calculating\n" + " FLOW_QUALITY_SUM_STRATEGY calculation. Default value: 15.\n" + "\n" + "--FLOW_MODE enable parameters and behavior specific to flow based reads. Default value: false.\n" + " Possible values: {true, false}\n" + "\n" + "--FLOW_Q_IS_KNOWN_END \n" + " Treat position of read trimming based on quality as the known end (relevant for flow based\n" + " reads). Default false - if the read is trimmed on quality its end is not defined and the\n" + " read is duplicate of any read starting at the same place. Default value: false. Possible\n" + " values: {true, false}\n" + "\n" + "--FLOW_QUALITY_SUM_STRATEGY \n" + " Use specific quality summing strategy for flow based reads. The strategy ensures that the\n" + " same (and correct) quality value is used for all bases of the same homopolymer. Default\n" + " value: false. Possible values: {true, false}\n" + "\n" + "--FLOW_SKIP_FIRST_N_FLOWS \n" + " Skip first N flows, starting from the read's start, when considering duplicates. Useful\n" + " for flow based reads where sometimes there is noise in the first flows (for this argument,\n" + " \" read start \" means 5' end). 
Default value: 0.\n" + "\n" + "--help,-h display the help message Default value: false. Possible values: {true, false}\n" + "\n" + "--MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,-MAX_FILE_HANDLES \n" + " Maximum number of file handles to keep open when spilling read ends to disk. Set this\n" + " number a little lower than the per-process maximum number of file that may be open. This\n" + " number can be found by executing the 'ulimit -n' command on a Unix system. Default value:\n" + " 8000.\n" + "\n" + "--MAX_OPTICAL_DUPLICATE_SET_SIZE \n" + " This number is the maximum size of a set of duplicate reads for which we will attempt to\n" + " determine which are optical duplicates. Please be aware that if you raise this value too\n" + " high and do encounter a very large set of duplicate reads, it will severely affect the\n" + " runtime of this tool. To completely disable this check, set the value to -1. Default\n" + " value: 300000.\n" + "\n" + "--MAX_RECORDS_IN_RAM When writing files that need to be sorted, this will specify the number of records stored\n" + " in RAM before spilling to disk. Increasing this number reduces the number of file handles\n" + " needed to sort the file, and increases the amount of RAM needed. Default value: 500000.\n" + "\n" + "--MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,-MAX_SEQS \n" + " This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000.\n" + "\n" + "--MOLECULAR_IDENTIFIER_TAG \n" + " SAM tag to uniquely identify the molecule from which a read was derived. Use of this\n" + " option requires that the BARCODE_TAG option be set to a non null value. Default null.\n" + " Default value: null.\n" + "\n" + "--OPTICAL_DUPLICATE_PIXEL_DISTANCE \n" + " The maximum offset between two duplicate clusters in order to consider them optical\n" + " duplicates. The default is appropriate for unpatterned versions of the Illumina platform.\n" + " For the patterned flowcell models, 2500 is moreappropriate. 
For other platforms and\n" + " models, users should experiment to find what works best. Default value: 100.\n" + "\n" + "--PROGRAM_GROUP_COMMAND_LINE,-PG_COMMAND \n" + " Value of CL tag of PG record to be created. If not supplied the command line will be\n" + " detected automatically. Default value: null.\n" + "\n" + "--PROGRAM_GROUP_NAME,-PG_NAME \n" + " Value of PN tag of PG record to be created. Default value: MarkDuplicates.\n" + "\n" + "--PROGRAM_GROUP_VERSION,-PG_VERSION \n" + " Value of VN tag of PG record to be created. If not specified, the version will be detected\n" + " automatically. Default value: null.\n" + "\n" + "--PROGRAM_RECORD_ID,-PG \n" + " The program record ID for the @PG record(s) created by this program. Set to null to\n" + " disable PG record creation. This string may have a suffix appended to avoid collision\n" + " with other program record IDs. Default value: MarkDuplicates.\n" + "\n" + "--QUIET Whether to suppress job-summary info on System.err. Default value: false. Possible\n" + " values: {true, false}\n" + "\n" + "--READ_NAME_REGEX MarkDuplicates can use the tile and cluster positions to estimate the rate of optical\n" + " duplication in addition to the dominant source of duplication, PCR, to provide a more\n" + " accurate estimation of library size. By default (with no READ_NAME_REGEX specified),\n" + " MarkDuplicates will attempt to extract coordinates using a split on ':' (see Note below).\n" + " Set READ_NAME_REGEX to 'null' to disable optical duplicate detection. Note that without\n" + " optical duplicate counts, library size estimation will be less accurate. If the read name\n" + " does not follow a standard Illumina colon-separation convention, but does contain tile and\n" + " x,y coordinates, a regular expression can be specified to extract three variables:\n" + " tile/region, x coordinate and y coordinate from a read name. The regular expression must\n" + " contain three capture groups for the three variables, in order. 
It must match the entire\n" + " read name. e.g. if field names were separated by semi-colon (';') this example regex\n" + " could be specified (?:.*;)?([0-9]+)[^;]*;([0-9]+)[^;]*;([0-9]+)[^;]*$ Note that if no\n" + " READ_NAME_REGEX is specified, the read name is split on ':'. For 5 element names, the\n" + " 3rd, 4th and 5th elements are assumed to be tile, x and y values. For 7 element names\n" + " (CASAVA 1.8), the 5th, 6th, and 7th elements are assumed to be tile, x and y values.\n" + " Default value: .\n" + "\n" + "--READ_ONE_BARCODE_TAG \n" + " Read one barcode SAM tag (ex. BX for 10X Genomics) Default value: null.\n" + "\n" + "--READ_TWO_BARCODE_TAG \n" + " Read two barcode SAM tag (ex. BX for 10X Genomics) Default value: null.\n" + "\n" + "--REFERENCE_SEQUENCE,-R Reference sequence file. Default value: null.\n" + "\n" + "--REMOVE_DUPLICATES If true do not write duplicates to the output file instead of writing them with\n" + " appropriate flags set. Default value: false. Possible values: {true, false}\n" + "\n" + "--REMOVE_SEQUENCING_DUPLICATES \n" + " If true remove 'optical' duplicates and other duplicates that appear to have arisen from\n" + " the sequencing process instead of the library preparation process, even if\n" + " REMOVE_DUPLICATES is false. If REMOVE_DUPLICATES is true, all duplicates are removed and\n" + " this option is ignored. Default value: false. Possible values: {true, false}\n" + "\n" + "--SORTING_COLLECTION_SIZE_RATIO \n" + " This number, plus the maximum RAM available to the JVM, determine the memory footprint\n" + " used by some of the sorting collections. If you are running out of memory, try reducing\n" + " this number. Default value: 0.25.\n" + "\n" + "--TAG_DUPLICATE_SET_MEMBERS \n" + " If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG\n" + " (DS), indicates the size of the duplicate set. 
The smallest possible DS value is 2 which\n" + " occurs when two reads map to the same portion of the reference only one of which is marked\n" + " as duplicate. The second tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier\n" + " for the duplicate set to which the record belongs. This identifier is the index-in-file of\n" + " the representative read that was selected out of the duplicate set. Default value: false.\n" + " Possible values: {true, false}\n" + "\n" + "--TAGGING_POLICY \n" + " Determines how duplicate types are recorded in the DT optional attribute. Default value:\n" + " DontTag. Possible values: {DontTag, OpticalOnly, All}\n" + "\n" + "--TMP_DIR One or more directories with space available to be used by this program for temporary\n" + " storage of working files This argument may be specified 0 or more times. Default value:\n" + " null.\n" + "\n" + "--UNPAIRED_END_UNCERTAINTY \n" + " Maximal difference of the read end position that counted as equal. Useful for flow based\n" + " reads where the end position might vary due to sequencing errors. (for this argument,\n" + " \" read end \" means 3' end) Default value: 0.\n" + "\n" + "--USE_END_IN_UNPAIRED_READS \n" + " Make the end location of single end read be significant when considering duplicates, in\n" + " addition to the start location, which is always significant (i.e. require single-ended\n" + " reads to start andend on the same position to be considered duplicate) (for this argument,\n" + " \" read end \" means 3' end). Default value: false. Possible values: {true, false}\n" + "\n" + "--USE_JDK_DEFLATER,-use_jdk_deflater \n" + " Use the JDK Deflater instead of the Intel Deflater for writing compressed output Default\n" + " value: false. Possible values: {true, false}\n" + "\n" + "--USE_JDK_INFLATER,-use_jdk_inflater \n" + " Use the JDK Inflater instead of the Intel Inflater for reading compressed input Default\n" + " value: false. 
/*
  Description: Parameters used by the MarkDuplicates module. Mirrors the
               @Argument set of Picard's MarkDuplicates tool; the long
               descriptions below are condensed from the Picard docs.

  Copyright : All right reserved by ICT

  Author : Zhang Zhonghai
  Date : 2023/10/23
*/

#pragma once  // was missing: this header is included from multiple TUs

#include <string>
#include <vector>
// NOTE(review): the original include targets were lost in extraction;
// <string> and <vector> are required by the declarations below — confirm
// against the repository whether more were listed.

// NOTE(review): using-declarations at header scope leak into every includer;
// kept as-is because the member declarations below rely on the bare names.
using std::string;
using std::vector;

#define MARKDUPLICATE_VERSION "v0.1"

class GlobalArg;  // forward declaration; defined in src/common/global_arg.h

namespace ns_md {
    /* Option identifiers for the MarkDuplicates module. These are the
       getopt_long `val` codes; _START_NUM/_END_NUM only delimit the range
       (see MarkDupsArg::ARG_COUNT) and are not options themselves. */
    enum MarkDupsArgEnum
    {
        _START_NUM = 100,
        MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
        MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
        SORTING_COLLECTION_SIZE_RATIO,
        BARCODE_TAG,
        READ_ONE_BARCODE_TAG,
        READ_TWO_BARCODE_TAG,
        TAG_DUPLICATE_SET_MEMBERS,
        REMOVE_SEQUENCING_DUPLICATES,
        TAGGING_POLICY,
        CLEAR_DT,
        DUPLEX_UMI,
        MOLECULAR_IDENTIFIER_TAG,
        METRICS_FILE,
        REMOVE_DUPLICATES,
        ASSUME_SORTED,
        ASSUME_SORT_ORDER,
        DUPLICATE_SCORING_STRATEGY,
        PROGRAM_RECORD_ID,
        PROGRAM_GROUP_VERSION,
        PROGRAM_GROUP_COMMAND_LINE,
        PROGRAM_GROUP_NAME,
        COMMENT,
        READ_NAME_REGEX,
        OPTICAL_DUPLICATE_PIXEL_DISTANCE,
        MAX_OPTICAL_DUPLICATE_SET_SIZE,
        QUIET,
        VALIDATION_STRINGENCY,
        COMPRESSION_LEVEL,
        MAX_RECORDS_IN_RAM,
        CREATE_INDEX,
        CREATE_MD5_FILE,
        _END_NUM
    };

    /* How strict to be when reading a SAM or BAM, beyond bare minimum
       validation. */
    enum ValidationStringency
    {
        STRICT,   // throw on anything that looks wrong
        LENIENT,  // emit warnings but keep going if possible
        SILENT,   // like LENIENT, but without the warning messages

        DEFAULT_STRINGENCY = SILENT  // aliases SILENT (same numeric value)
    };

    /* Controls how duplicates are recorded in the DT optional tag on each
       read. */
    enum DuplicateTaggingPolicy
    {
        DontTag,      // leave DT untouched
        OpticalOnly,  // tag only sequencing ("optical") duplicates
        All           // tag every duplicate
    };

    /* Sort order of a SAM/BAM file. */
    enum SortOrder
    {
        unsorted,
        queryname,
        coordinate,
        duplicate,  // NB: this is not in the SAM spec!
        unknown
    };

    /* Strategy for scoring reads when choosing the representative
       (non-duplicate) read of a duplicate set. */
    enum ScoringStrategy
    {
        SUM_OF_BASE_QUALITIES,
        TOTAL_MAPPED_REFERENCE_LENGTH,
        RANDOM
    };
}

/* Per-thread working variables. */
struct AuxVar {
    const static int MIN_QSUM_QSCORE = 13;
    const static int REF_CONTEXT_PAD = 3;
    const static int REFERENCE_HALF_WINDOW_LENGTH = 150;

    double contaminantAlternateFraction;
};

/* All command-line parameters consumed by MarkDuplicates. Field defaults
   match Picard's defaults; UPPER_SNAKE names intentionally mirror the
   Picard option names one-to-one. */
struct MarkDupsArg
{
    /* SAM/BAM/CRAM optional attribute used to store the duplicate type. */
    string DUPLICATE_TYPE_TAG = "DT";
    /* DT value for duplicate type: library. */
    string DUPLICATE_TYPE_LIBRARY = "LB";
    /* DT value for duplicate type: sequencing (optical & pad-hopping, or
       "co-localized"). */
    string DUPLICATE_TYPE_SEQUENCING = "SQ";
    /* Attribute storing which read was selected as representative out of a
       duplicate set. */
    string DUPLICATE_SET_INDEX_TAG = "DI";
    /* Attribute storing the size of a duplicate set. */
    string DUPLICATE_SET_SIZE_TAG = "DS";

    /* OpticalDuplicateFinder defaults. */
    int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
    int DEFAULT_BIG_DUPLICATE_SET_SIZE = 1000;
    // larger than this generates over 100 billion comparisons in the n^2
    // optical-duplicate algorithm
    int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;

    /* Obsolete: ReadEnds will always be spilled to disk. */
    int MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = 50000;

    /* Maximum number of file handles kept open when spilling read ends to
       disk. Set a little lower than the per-process open-file limit
       (`ulimit -n`). */
    int MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = 8000;

    /* This number, together with the maximum available RAM, determines the
       memory footprint of the sorting collections; reduce it when running
       out of memory. */
    double SORTING_COLLECTION_SIZE_RATIO = 0.25;

    /* Barcode SAM tag (e.g. BC for 10X Genomics). Optional. */
    string BARCODE_TAG = "";
    /* Read-one barcode SAM tag (e.g. BX for 10X Genomics). Optional. */
    string READ_ONE_BARCODE_TAG = "";
    /* Read-two barcode SAM tag (e.g. BX for 10X Genomics). Optional. */
    string READ_TWO_BARCODE_TAG = "";

    /* If true, every read in a duplicate set gets two extra tags:
       DS (set size, minimum 2) and DI (index-in-file of the representative
       read, unique per set). Optional. */
    bool TAG_DUPLICATE_SET_MEMBERS = false;

    /* If true, remove 'optical' and other sequencing-process duplicates even
       when REMOVE_DUPLICATES is false (ignored when REMOVE_DUPLICATES is
       true, since then everything is removed). */
    bool REMOVE_SEQUENCING_DUPLICATES = false;

    /* How duplicate types are recorded in the DT optional attribute. */
    ns_md::DuplicateTaggingPolicy TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag;

    /* Clear pre-existing DT tags from input records. Set to false if the
       input SAM does not carry this tag. */
    bool CLEAR_DT = true;

    /* Treat UMIs as duplex stranded: the UMI must be two equal-length
       strings separated by a hyphen (e.g. 'ATC-GTC'). Bottom-strand UMIs are
       normalized by swapping around the hyphen before comparison; requires
       BARCODE_TAG to hold non-normalized UMIs. */
    bool DUPLEX_UMI = false;

    /* SAM tag uniquely identifying the source molecule of a read; requires
       BARCODE_TAG to be set. Optional. */
    string MOLECULAR_IDENTIFIER_TAG = "";

    /* Inherited from AbstractMarkDuplicatesCommandLineProgram: */

    /* File to write duplication metrics to. */
    string METRICS_FILE;

    /* If true, drop duplicates from the output instead of writing them with
       the duplicate flag set. */
    bool REMOVE_DUPLICATES = false;

    /* Assume coordinate-sorted input regardless of the header. Deprecated;
       use ASSUME_SORT_ORDER=coordinate. Mutually exclusive with
       ASSUME_SORT_ORDER. */
    bool ASSUME_SORTED = false;

    /* If set, assume the input has this order even if the header disagrees.
       Mutually exclusive with ASSUME_SORTED. */
    ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;

    /* Scoring strategy for choosing the non-duplicate among candidates. */
    ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;

    /* Program record ID for the @PG record(s) created by this program; a
       suffix may be appended to avoid collisions. Optional. */
    string PROGRAM_RECORD_ID = "MarkDuplicates";
    /* VN tag of the created PG record; auto-detected when empty. Optional. */
    string PROGRAM_GROUP_VERSION;
    /* CL tag of the created PG record; auto-detected when empty. Optional. */
    string PROGRAM_GROUP_COMMAND_LINE;
    /* PN tag of the created PG record. */
    string PROGRAM_GROUP_NAME = "MarkDuplicates";

    /* Comment(s) to include in the output file's header. Optional. */
    // NOTE(review): template argument was lost in extraction; <string> is
    // implied by the option semantics — confirm against the repository.
    vector<string> COMMENT;

    /* Inherited from AbstractOpticalDuplicateFinderCommandLineProgram: */

    /* Regex with three capture groups extracting tile/region, x and y from a
       read name, used to estimate the optical duplication rate. Set to
       'null' to disable optical duplicate detection; when unset, the name is
       split on ':' (5-element names: fields 3-5; 7-element CASAVA 1.8 names:
       fields 5-7). Optional. */
    string READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";

    /* Maximum pixel offset between two clusters for them to count as optical
       duplicates. Default suits unpatterned Illumina flowcells; patterned
       models usually want 2500. */
    int OPTICAL_DUPLICATE_PIXEL_DISTANCE = DEFAULT_OPTICAL_DUPLICATE_DISTANCE;

    /* Maximum duplicate-set size for which optical-duplicate detection is
       attempted (the check is quadratic); -1 disables the limit. */
    long MAX_OPTICAL_DUPLICATE_SET_SIZE = DEFAULT_MAX_DUPLICATE_SET_SIZE;

    /* Inherited from CommandLineProgram: */

    /* Suppress job-summary info on stderr. */
    bool QUIET = false;

    /* Validation stringency for all SAM files read by this program; SILENT
       can improve performance when variable-length data (read, qualities,
       tags) need not be decoded. */
    ns_md::ValidationStringency VALIDATION_STRINGENCY = ns_md::ValidationStringency::DEFAULT_STRINGENCY;

    /* Compression level for all compressed files created (e.g. BAM, VCF). */
    int COMPRESSION_LEVEL = 5;

    /* Records held in RAM before spilling to disk when sorting: higher needs
       more RAM but fewer file handles. */
    int MAX_RECORDS_IN_RAM = 500000;

    /* Create an index when writing VCF or coordinate-sorted BAM output. */
    bool CREATE_INDEX = false;

    /* Create an MD5 digest for any BAM or FASTQ files created. */
    bool CREATE_MD5_FILE = false;

    /* Number of real MarkDuplicates options (range delimiters excluded). */
    const static int ARG_COUNT = ns_md::MarkDupsArgEnum::_END_NUM - ns_md::MarkDupsArgEnum::_START_NUM - 1;

    // Parse the command line into this struct (and the global arguments).
    // NOTE(review): the vector's element type was lost in extraction;
    // AuxVar is inferred from the parameter name — confirm against the .cpp.
    void parseArgument(int argc,
                       char **argv,
                       vector<AuxVar> *pvAuxVar,
                       GlobalArg *pGArg);

    // Print usage text for the MarkDuplicates module.
    static void PrintHelp();

    // Print MARKDUPLICATE_VERSION.
    static void PrintVersion();

    // Release resources, close files, etc.
    static void Finalize(MarkDupsArg *pMdArg,
                         vector<AuxVar> *pvAuxVar,
                         GlobalArg *pGArg);
};