/*
Description: Parameters needed by MarkDuplicates. Reads the arguments given on
             the command line and performs some initialization.

Copyright : All right reserved by ICT

Author : Zhang Zhonghai
Date : 2023/10/27
*/
#include "markdups_arg.h"

#include "common/global_arg.h"

// NOTE(review): the names of the system headers were lost from the copy of
// this file under review (only bare `#include` tokens remained). The list
// below is reconstructed from the identifiers this file actually uses;
// confirm against the upstream file. <exception> is new: it is needed for the
// diagnostic catch block in parseArgument().
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <exception>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using std::cout, std::endl;
using std::ostringstream;
using std::stod;
using std::stoi;
using std::stol;
using std::string;
using std::vector;

using namespace ns_md;

/*
 * Long options understood by MarkDuplicates, in getopt_long() format.
 * Every option takes a mandatory value; the last field is the code that
 * getopt_long_only() returns when the option is matched.
 */
const static struct option kMdOpts[] = {
    {"MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP", required_argument, NULL, MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP},
    {"MAX_FILE_HANDLES_FOR_READ_ENDS_MAP", required_argument, NULL, MAX_FILE_HANDLES_FOR_READ_ENDS_MAP},
    {"SORTING_COLLECTION_SIZE_RATIO", required_argument, NULL, SORTING_COLLECTION_SIZE_RATIO},
    {"BARCODE_TAG", required_argument, NULL, BARCODE_TAG},
    {"READ_ONE_BARCODE_TAG", required_argument, NULL, READ_ONE_BARCODE_TAG},
    {"READ_TWO_BARCODE_TAG", required_argument, NULL, READ_TWO_BARCODE_TAG},
    {"TAG_DUPLICATE_SET_MEMBERS", required_argument, NULL, TAG_DUPLICATE_SET_MEMBERS},
    {"REMOVE_SEQUENCING_DUPLICATES", required_argument, NULL, REMOVE_SEQUENCING_DUPLICATES},
    {"TAGGING_POLICY", required_argument, NULL, TAGGING_POLICY},
    {"CLEAR_DT", required_argument, NULL, CLEAR_DT},
    {"DUPLEX_UMI", required_argument, NULL, DUPLEX_UMI},
    {"MOLECULAR_IDENTIFIER_TAG", required_argument, NULL, MOLECULAR_IDENTIFIER_TAG},
    {"METRICS_FILE", required_argument, NULL, METRICS_FILE},
    {"REMOVE_DUPLICATES", required_argument, NULL, REMOVE_DUPLICATES},
    {"ASSUME_SORTED", required_argument, NULL, ASSUME_SORTED},
    {"ASSUME_SORT_ORDER", required_argument, NULL, ASSUME_SORT_ORDER},
    {"DUPLICATE_SCORING_STRATEGY", required_argument, NULL, DUPLICATE_SCORING_STRATEGY},
    {"PROGRAM_RECORD_ID", required_argument, NULL, PROGRAM_RECORD_ID},
    {"PROGRAM_GROUP_VERSION", required_argument, NULL, PROGRAM_GROUP_VERSION},
    {"PROGRAM_GROUP_COMMAND_LINE", required_argument, NULL, PROGRAM_GROUP_COMMAND_LINE},
    {"PROGRAM_GROUP_NAME", required_argument, NULL, PROGRAM_GROUP_NAME},
    {"COMMENT", required_argument, NULL, COMMENT},
    {"READ_NAME_REGEX", required_argument, NULL, READ_NAME_REGEX},
    {"OPTICAL_DUPLICATE_PIXEL_DISTANCE", required_argument, NULL, OPTICAL_DUPLICATE_PIXEL_DISTANCE},
    {"MAX_OPTICAL_DUPLICATE_SET_SIZE", required_argument, NULL, MAX_OPTICAL_DUPLICATE_SET_SIZE},
    {"QUIET", required_argument, NULL, QUIET},
    {"VALIDATION_STRINGENCY", required_argument, NULL, VALIDATION_STRINGENCY},
    {"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
    {"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
    {"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
    {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};

// Report an option value that matches none of the accepted spellings.
// The option keeps whatever value it had before.
static void warnInvalidValue(const char *optName, const char *expected)
{
    fprintf(stderr, "[MarkDuplicates] ignoring invalid value '%s' for option %s (expected one of: %s)\n",
            optarg, optName, expected);
}

// Interpret the current getopt value (the global `optarg`) as a boolean.
//   arg: destination; set to true for "true" and to false for "false".
// Any other text leaves *arg untouched and prints a warning on stderr
// (previously such values were ignored silently).
void setBoolArg(bool *arg)
{
    if (strcmp("true", optarg) == 0)
        *arg = true;
    else if (strcmp("false", optarg) == 0)
        *arg = false;
    else
        fprintf(stderr, "[MarkDuplicates] ignoring invalid boolean value '%s' (expected 'true' or 'false')\n", optarg);
}

// Parse the command line.
//   argc/argv: forwarded unchanged to getopt_long_only().
//   pvAuxVar : not used by this function.
//   pGArg    : global-argument object; it is shown every option code via
//              parseArgument(c) and asked to print its values at the end.
// The version and help options print their text and terminate the process
// with exit(0). Exceptions thrown while converting an option value (for
// example by stoi/stod/stol on non-numeric text) still propagate to the
// caller, but the offending option and value are printed to stderr first.
//
// NOTE(review): the element type of the vector parameter was lost from the
// copy of this file under review; `AuxVar` is inferred from the parameter
// name and must be checked against the declaration in markdups_arg.h.
void MarkDupsArg::parseArgument(int argc, char **argv, vector<AuxVar> *pvAuxVar, GlobalArg *pGArg)
{
    (void)pvAuxVar;
    auto &gArg = *pGArg;

    // getopt_long() requires the option array to end with an all-zero entry.
    // One extra slot is reserved for it so the table is terminated no matter
    // what GlobalArg::GLOBAL_OPT contains (a second terminator is harmless).
    struct option allOpt[MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT + 1];
    memcpy(allOpt, kMdOpts, MarkDupsArg::ARG_COUNT * sizeof(struct option));
    memcpy(&allOpt[MarkDupsArg::ARG_COUNT], GlobalArg::GLOBAL_OPT, GlobalArg::GLOBAL_ARG_CNT * sizeof(struct option));
    memset(&allOpt[MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT], 0, sizeof(struct option));

    // Debug aid: dump the merged option table.
    // int cnt = MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT;
    // cout << cnt << endl;
    // for (int i = 0; i < cnt; ++i)
    // {
    //     cout << i << '\t' << allOpt[i].name << endl;
    // }

    for (;;)
    {
        int optIdx = -1; // index of the matched long option; stays -1 when none matched
        const int c = getopt_long_only(argc, argv, "", allOpt, &optIdx);
        if (c < 0)
            break;

        const char *optName = (optIdx >= 0) ? allOpt[optIdx].name : "?";

        gArg.parseArgument(c);

        try
        {
            switch (c)
            {
            case ns_ga::OPT_VERSION:
                PrintVersion();
                exit(0);
            case ns_ga::OPT_HELP:
                PrintHelp();
                exit(0);
            case ns_md::MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP:
                MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = stoi(optarg);
                break;
            case ns_md::MAX_FILE_HANDLES_FOR_READ_ENDS_MAP:
                MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = stoi(optarg);
                break;
            case ns_md::SORTING_COLLECTION_SIZE_RATIO:
                SORTING_COLLECTION_SIZE_RATIO = stod(optarg);
                break;
            case ns_md::BARCODE_TAG:
                BARCODE_TAG = optarg;
                break;
            case ns_md::READ_ONE_BARCODE_TAG:
                READ_ONE_BARCODE_TAG = optarg;
                break;
            case ns_md::READ_TWO_BARCODE_TAG:
                READ_TWO_BARCODE_TAG = optarg;
                break;
            case ns_md::TAG_DUPLICATE_SET_MEMBERS:
                setBoolArg(&TAG_DUPLICATE_SET_MEMBERS);
                break;
            case ns_md::REMOVE_SEQUENCING_DUPLICATES:
                setBoolArg(&REMOVE_SEQUENCING_DUPLICATES);
                break;
            case ns_md::TAGGING_POLICY:
                if (strcmp("DontTag", optarg) == 0)
                    TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag;
                else if (strcmp("OpticalOnly", optarg) == 0)
                    TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::OpticalOnly;
                else if (strcmp("All", optarg) == 0)
                    TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::All;
                else
                    warnInvalidValue(optName, "DontTag, OpticalOnly, All");
                break;
            case ns_md::CLEAR_DT:
                setBoolArg(&CLEAR_DT);
                break;
            case ns_md::DUPLEX_UMI:
                setBoolArg(&DUPLEX_UMI);
                break;
            case ns_md::MOLECULAR_IDENTIFIER_TAG:
                MOLECULAR_IDENTIFIER_TAG = optarg;
                break;
            case ns_md::METRICS_FILE:
                METRICS_FILE = optarg;
                break;
            case ns_md::REMOVE_DUPLICATES:
                setBoolArg(&REMOVE_DUPLICATES);
                break;
            case ns_md::ASSUME_SORTED:
                setBoolArg(&ASSUME_SORTED);
                break;
            case ns_md::ASSUME_SORT_ORDER:
                if (strcmp("unsorted", optarg) == 0)
                    ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
                else if (strcmp("queryname", optarg) == 0)
                    ASSUME_SORT_ORDER = ns_md::SortOrder::queryname;
                else if (strcmp("coordinate", optarg) == 0)
                    ASSUME_SORT_ORDER = ns_md::SortOrder::coordinate;
                else if (strcmp("duplicate", optarg) == 0)
                    ASSUME_SORT_ORDER = ns_md::SortOrder::duplicate;
                else if (strcmp("unknown", optarg) == 0)
                    ASSUME_SORT_ORDER = ns_md::SortOrder::unknown;
                else
                    warnInvalidValue(optName, "unsorted, queryname, coordinate, duplicate, unknown");
                break;
            case ns_md::DUPLICATE_SCORING_STRATEGY:
                if (strcmp("SUM_OF_BASE_QUALITIES", optarg) == 0)
                    DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
                else if (strcmp("TOTAL_MAPPED_REFERENCE_LENGTH", optarg) == 0)
                    DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
                else if (strcmp("RANDOM", optarg) == 0)
                    DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::RANDOM;
                else
                    warnInvalidValue(optName, "SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM");
                break;
            case ns_md::PROGRAM_RECORD_ID:
                PROGRAM_RECORD_ID = optarg;
                break;
            case ns_md::PROGRAM_GROUP_VERSION:
                PROGRAM_GROUP_VERSION = optarg;
                break;
            case ns_md::PROGRAM_GROUP_COMMAND_LINE:
                PROGRAM_GROUP_COMMAND_LINE = optarg;
                break;
            case ns_md::PROGRAM_GROUP_NAME:
                PROGRAM_GROUP_NAME = optarg;
                break;
            case ns_md::COMMENT:
                COMMENT.push_back(optarg);
                break;
            case ns_md::READ_NAME_REGEX:
                READ_NAME_REGEX = optarg;
                break;
            case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE:
                OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg);
                break;
            case ns_md::MAX_OPTICAL_DUPLICATE_SET_SIZE:
                MAX_OPTICAL_DUPLICATE_SET_SIZE = stol(optarg);
                break;
            case ns_md::QUIET:
                setBoolArg(&QUIET);
                break;
            case ns_md::VALIDATION_STRINGENCY:
                if (strcmp("STRICT", optarg) == 0)
                    VALIDATION_STRINGENCY = ns_md::ValidationStringency::STRICT;
                else if (strcmp("LENIENT", optarg) == 0)
                    VALIDATION_STRINGENCY = ns_md::ValidationStringency::LENIENT;
                else if (strcmp("SILENT", optarg) == 0)
                    VALIDATION_STRINGENCY = ns_md::ValidationStringency::SILENT;
                else
                    warnInvalidValue(optName, "STRICT, LENIENT, SILENT");
                break;
            case ns_md::COMPRESSION_LEVEL:
                COMPRESSION_LEVEL = stoi(optarg);
                break;
            case ns_md::MAX_RECORDS_IN_RAM:
                MAX_RECORDS_IN_RAM = stoi(optarg);
                break;
            case ns_md::CREATE_INDEX:
                setBoolArg(&CREATE_INDEX);
                break;
            case ns_md::CREATE_MD5_FILE:
                setBoolArg(&CREATE_MD5_FILE);
                break;
            default:
                break;
            }
        }
        catch (const std::exception &e)
        {
            // Name the offending option before letting the exception continue
            // on its original path; callers see the same exception as before.
            fprintf(stderr, "[MarkDuplicates] failed to process value '%s' for option %s: %s\n",
                    optarg ? optarg : "", optName, e.what());
            throw;
        }
    }

    gArg.printArgValue();
}

// Print the version string (MARKDUPLICATE_VERSION) to stdout.
void MarkDupsArg::PrintVersion()
{
    fprintf(stdout, "\n MarkDuplicate Version: %s\n", MARKDUPLICATE_VERSION);
}

// Release resources, close files, etc. Currently there is nothing to release,
// so this is a no-op.
// NOTE(review): vector element type inferred, see parseArgument().
void MarkDupsArg::Finalize(MarkDupsArg *pMdArg, vector<AuxVar> *pvAuxVar, GlobalArg *pGArg)
{
}

// Print the usage/help text to stdout.
// fputs is used so the text is never interpreted as a printf format string.
void MarkDupsArg::PrintHelp()
{
    FILE *fp = stdout;
    fputs(
        "Usage: MarkDuplicates [arguments]\n"
        "\n"
        "Example:\n"
        " ./picard_cpp MarkDuplicates --num_thread 4 --INPUT input.bam --OUTPUT marked_duplicates.bam --METRICS_FILE marked_dup_metrics.txt\n"
        "\n"
        "Required Arguments:\n"
        "\n"
        "--INPUT,-I One or more input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n"
        " argument must be specified at least once.Required.\n"
        "\n"
        "--METRICS_FILE,-M File to write duplication metrics to Required.\n"
        "\n"
        "--OUTPUT,-O The output file to write marked records to Required.\n"
        "\n"
        "\n"
        "Optional Arguments:\n"
        "\n"
        "--ADD_PG_TAG_TO_READS \n"
        " Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n"
        " false}\n"
        "\n"
        "--arguments_file read one or more arguments files and add them to the command line This argument may be\n"
        " specified 0 or more times. Default value: null.\n"
        "\n"
        "--ASSUME_SORT_ORDER,-ASO \n"
        " If not null, assume that the input file has this order even if the header says otherwise.\n"
        " Default value: null. Possible values: {unsorted, queryname, coordinate, duplicate,\n"
        " unknown} Cannot be used in conjunction with argument(s) ASSUME_SORTED (AS)\n"
        "\n"
        "\n"
        "--ASSUME_SORTED,-AS If true, assume that the input file is coordinate sorted even if the header says\n"
        " otherwise. Deprecated, used ASSUME_SORT_ORDER=coordinate instead. Default value: false.\n"
        " Possible values: {true, false} Cannot be used in conjunction with argument(s)\n"
        " ASSUME_SORT_ORDER (ASO)\n"
        "\n"
        "--BARCODE_TAG Barcode SAM tag (ex. BC for 10X Genomics) Default value: null.\n"
        "\n"
        "--CLEAR_DT Clear DT tag from input SAM records. Should be set to false if input SAM doesn't have this\n"
        " tag. Default true Default value: true. Possible values: {true, false}\n"
        "\n"
        "--COMMENT,-CO Comment(s) to include in the output file's header. This argument may be specified 0 or\n"
        " more times. Default value: null.\n"
        "\n"
        "--COMPRESSION_LEVEL Compression level for all compressed files created (e.g. BAM and VCF). Default value: 5.\n"
        "\n"
        "--CREATE_INDEX Whether to create an index when writing VCF or coordinate sorted BAM output. Default\n"
        " value: false. Possible values: {true, false}\n"
        "\n"
        "--CREATE_MD5_FILE Whether to create an MD5 digest for any BAM or FASTQ files created. Default value:\n"
        " false. Possible values: {true, false}\n"
        "\n"
        "--DUPLEX_UMI Treat UMIs as being duplex stranded. This option requires that the UMI consist of two\n"
        " equal length strings that are separated by a hyphen (e.g. 'ATC-GTC'). Reads are considered\n"
        " duplicates if, in addition to standard definition, have identical normalized UMIs. A UMI\n"
        " from the 'bottom' strand is normalized by swapping its content around the hyphen (eg.\n"
        " ATC-GTC becomes GTC-ATC). A UMI from the 'top' strand is already normalized as it is.\n"
        " Both reads from a read pair considered top strand if the read 1 unclipped 5' coordinate is\n"
        " less than the read 2 unclipped 5' coordinate. All chimeric reads and read fragments are\n"
        " treated as having come from the top strand. With this option is it required that the\n"
        " BARCODE_TAG hold non-normalized UMIs. Default false. Default value: false. Possible\n"
        " values: {true, false}\n"
        "\n"
        "--DUPLICATE_SCORING_STRATEGY,-DS \n"
        " The scoring strategy for choosing the non-duplicate among candidates. Default value:\n"
        " SUM_OF_BASE_QUALITIES. Possible values: {SUM_OF_BASE_QUALITIES,\n"
        " TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM}\n"
        "\n"
        "--FLOW_EFFECTIVE_QUALITY_THRESHOLD \n"
        " Threshold for considering a quality value high enough to be included when calculating\n"
        " FLOW_QUALITY_SUM_STRATEGY calculation. Default value: 15.\n"
        "\n"
        "--FLOW_MODE enable parameters and behavior specific to flow based reads. Default value: false.\n"
        " Possible values: {true, false}\n"
        "\n"
        "--FLOW_Q_IS_KNOWN_END \n"
        " Treat position of read trimming based on quality as the known end (relevant for flow based\n"
        " reads). Default false - if the read is trimmed on quality its end is not defined and the\n"
        " read is duplicate of any read starting at the same place. Default value: false. Possible\n"
        " values: {true, false}\n"
        "\n"
        "--FLOW_QUALITY_SUM_STRATEGY \n"
        " Use specific quality summing strategy for flow based reads. The strategy ensures that the\n"
        " same (and correct) quality value is used for all bases of the same homopolymer. Default\n"
        " value: false. Possible values: {true, false}\n"
        "\n"
        "--FLOW_SKIP_FIRST_N_FLOWS \n"
        " Skip first N flows, starting from the read's start, when considering duplicates. Useful\n"
        " for flow based reads where sometimes there is noise in the first flows (for this argument,\n"
        " \" read start \" means 5' end). Default value: 0.\n"
        "\n"
        "--help,-h display the help message Default value: false. Possible values: {true, false}\n"
        "\n"
        "--MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,-MAX_FILE_HANDLES \n"
        " Maximum number of file handles to keep open when spilling read ends to disk. Set this\n"
        " number a little lower than the per-process maximum number of file that may be open. This\n"
        " number can be found by executing the 'ulimit -n' command on a Unix system. Default value:\n"
        " 8000.\n"
        "\n"
        "--MAX_OPTICAL_DUPLICATE_SET_SIZE \n"
        " This number is the maximum size of a set of duplicate reads for which we will attempt to\n"
        " determine which are optical duplicates. Please be aware that if you raise this value too\n"
        " high and do encounter a very large set of duplicate reads, it will severely affect the\n"
        " runtime of this tool. To completely disable this check, set the value to -1. Default\n"
        " value: 300000.\n"
        "\n"
        "--MAX_RECORDS_IN_RAM When writing files that need to be sorted, this will specify the number of records stored\n"
        " in RAM before spilling to disk. Increasing this number reduces the number of file handles\n"
        " needed to sort the file, and increases the amount of RAM needed. Default value: 500000.\n"
        "\n"
        "--MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,-MAX_SEQS \n"
        " This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000.\n"
        "\n"
        "--MOLECULAR_IDENTIFIER_TAG \n"
        " SAM tag to uniquely identify the molecule from which a read was derived. Use of this\n"
        " option requires that the BARCODE_TAG option be set to a non null value. Default null.\n"
        " Default value: null.\n"
        "\n"
        "--OPTICAL_DUPLICATE_PIXEL_DISTANCE \n"
        " The maximum offset between two duplicate clusters in order to consider them optical\n"
        " duplicates. The default is appropriate for unpatterned versions of the Illumina platform.\n"
        " For the patterned flowcell models, 2500 is moreappropriate. For other platforms and\n"
        " models, users should experiment to find what works best. Default value: 100.\n"
        "\n"
        "--PROGRAM_GROUP_COMMAND_LINE,-PG_COMMAND \n"
        " Value of CL tag of PG record to be created. If not supplied the command line will be\n"
        " detected automatically. Default value: null.\n"
        "\n"
        "--PROGRAM_GROUP_NAME,-PG_NAME \n"
        " Value of PN tag of PG record to be created. Default value: MarkDuplicates.\n"
        "\n"
        "--PROGRAM_GROUP_VERSION,-PG_VERSION \n"
        " Value of VN tag of PG record to be created. If not specified, the version will be detected\n"
        " automatically. Default value: null.\n"
        "\n"
        "--PROGRAM_RECORD_ID,-PG \n"
        " The program record ID for the @PG record(s) created by this program. Set to null to\n"
        " disable PG record creation. This string may have a suffix appended to avoid collision\n"
        " with other program record IDs. Default value: MarkDuplicates.\n"
        "\n"
        "--QUIET Whether to suppress job-summary info on System.err. Default value: false. Possible\n"
        " values: {true, false}\n"
        "\n"
        "--READ_NAME_REGEX MarkDuplicates can use the tile and cluster positions to estimate the rate of optical\n"
        " duplication in addition to the dominant source of duplication, PCR, to provide a more\n"
        " accurate estimation of library size. By default (with no READ_NAME_REGEX specified),\n"
        " MarkDuplicates will attempt to extract coordinates using a split on ':' (see Note below).\n"
        " Set READ_NAME_REGEX to 'null' to disable optical duplicate detection. Note that without\n"
        " optical duplicate counts, library size estimation will be less accurate. If the read name\n"
        " does not follow a standard Illumina colon-separation convention, but does contain tile and\n"
        " x,y coordinates, a regular expression can be specified to extract three variables:\n"
        " tile/region, x coordinate and y coordinate from a read name. The regular expression must\n"
        " contain three capture groups for the three variables, in order. It must match the entire\n"
        " read name. e.g. if field names were separated by semi-colon (';') this example regex\n"
        " could be specified (?:.*;)?([0-9]+)[^;]*;([0-9]+)[^;]*;([0-9]+)[^;]*$ Note that if no\n"
        " READ_NAME_REGEX is specified, the read name is split on ':'. For 5 element names, the\n"
        " 3rd, 4th and 5th elements are assumed to be tile, x and y values. For 7 element names\n"
        " (CASAVA 1.8), the 5th, 6th, and 7th elements are assumed to be tile, x and y values.\n"
        " Default value: .\n"
        "\n"
        "--READ_ONE_BARCODE_TAG \n"
        " Read one barcode SAM tag (ex. BX for 10X Genomics) Default value: null.\n"
        "\n"
        "--READ_TWO_BARCODE_TAG \n"
        " Read two barcode SAM tag (ex. BX for 10X Genomics) Default value: null.\n"
        "\n"
        "--REFERENCE_SEQUENCE,-R Reference sequence file. Default value: null.\n"
        "\n"
        "--REMOVE_DUPLICATES If true do not write duplicates to the output file instead of writing them with\n"
        " appropriate flags set. Default value: false. Possible values: {true, false}\n"
        "\n"
        "--REMOVE_SEQUENCING_DUPLICATES \n"
        " If true remove 'optical' duplicates and other duplicates that appear to have arisen from\n"
        " the sequencing process instead of the library preparation process, even if\n"
        " REMOVE_DUPLICATES is false. If REMOVE_DUPLICATES is true, all duplicates are removed and\n"
        " this option is ignored. Default value: false. Possible values: {true, false}\n"
        "\n"
        "--SORTING_COLLECTION_SIZE_RATIO \n"
        " This number, plus the maximum RAM available to the JVM, determine the memory footprint\n"
        " used by some of the sorting collections. If you are running out of memory, try reducing\n"
        " this number. Default value: 0.25.\n"
        "\n"
        "--TAG_DUPLICATE_SET_MEMBERS \n"
        " If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG\n"
        " (DS), indicates the size of the duplicate set. The smallest possible DS value is 2 which\n"
        " occurs when two reads map to the same portion of the reference only one of which is marked\n"
        " as duplicate. The second tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier\n"
        " for the duplicate set to which the record belongs. This identifier is the index-in-file of\n"
        " the representative read that was selected out of the duplicate set. Default value: false.\n"
        " Possible values: {true, false}\n"
        "\n"
        "--TAGGING_POLICY \n"
        " Determines how duplicate types are recorded in the DT optional attribute. Default value:\n"
        " DontTag. Possible values: {DontTag, OpticalOnly, All}\n"
        "\n"
        "--TMP_DIR One or more directories with space available to be used by this program for temporary\n"
        " storage of working files This argument may be specified 0 or more times. Default value:\n"
        " null.\n"
        "\n"
        "--UNPAIRED_END_UNCERTAINTY \n"
        " Maximal difference of the read end position that counted as equal. Useful for flow based\n"
        " reads where the end position might vary due to sequencing errors. (for this argument,\n"
        " \" read end \" means 3' end) Default value: 0.\n"
        "\n"
        "--USE_END_IN_UNPAIRED_READS \n"
        " Make the end location of single end read be significant when considering duplicates, in\n"
        " addition to the start location, which is always significant (i.e. require single-ended\n"
        " reads to start andend on the same position to be considered duplicate) (for this argument,\n"
        " \" read end \" means 3' end). Default value: false. Possible values: {true, false}\n"
        "\n"
        "--USE_JDK_DEFLATER,-use_jdk_deflater \n"
        " Use the JDK Deflater instead of the Intel Deflater for writing compressed output Default\n"
        " value: false. Possible values: {true, false}\n"
        "\n"
        "--USE_JDK_INFLATER,-use_jdk_inflater \n"
        " Use the JDK Inflater instead of the Intel Inflater for reading compressed input Default\n"
        " value: false. Possible values: {true, false}\n"
        "\n"
        "--USE_UNPAIRED_CLIPPED_END \n"
        " Use position of the clipping as the end position, when considering duplicates (or use the\n"
        " unclipped end position) (for this argument, \" read end \" means 3' end). Default value:\n"
        " false. Possible values: {true, false}\n"
        "\n"
        "--VALIDATION_STRINGENCY \n"
        " Validation stringency for all SAM files read by this program. Setting stringency to\n"
        " SILENT can improve performance when processing a BAM file in which variable-length data\n"
        " (read, qualities, tags) do not otherwise need to be decoded. Default value: STRICT.\n"
        " Possible values: {STRICT, LENIENT, SILENT}\n"
        "\n"
        "--VERBOSITY Control verbosity of logging. Default value: INFO. Possible values: {ERROR, WARNING,\n"
        " INFO, DEBUG}\n"
        "\n"
        "--version display the version number for this tool Default value: false. Possible values: {true,\n"
        " false}\n"
        "\n",
        fp);
}