picard_cpp/src/sam/markdups/markdups_arg.cpp

/*
Description: Markduplicate需要用到的一些参数，读取命令行给的参数，并做一些初始化

Copyright : All right reserved by ICT

Author : Zhang Zhonghai
Date : 2023/10/27
*/

#include "markdups_arg.h"
#include "common/utils/global_arg.h"

#include <cstring>
#include <vector>
#include <string>
#include <sstream>
#include <getopt.h>
#include <stddef.h>
#include <stdio.h>

#include <iostream>
using std::cout, std::endl;

using std::ostringstream;
using std::stod;
using std::stoi;
using std::stol;
using std::string;
using std::vector;

using namespace ns_md;

/*
 * Long-option table for the MarkDuplicates-specific command-line arguments.
 *
 * Every entry has a null `flag`, so getopt_long_only() returns the entry's
 * `val` (the matching ns_md option id) when the option is seen. All options,
 * including the boolean ones, are declared with a required argument.
 *
 * The table itself carries no all-zero terminator entry; parseArgument()
 * copies it by element count into a combined option array.
 */
static const struct option kMdOpts[] = {
    {"MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP", required_argument, nullptr, MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP},
    {"MAX_FILE_HANDLES_FOR_READ_ENDS_MAP", required_argument, nullptr, MAX_FILE_HANDLES_FOR_READ_ENDS_MAP},
    {"SORTING_COLLECTION_SIZE_RATIO", required_argument, nullptr, SORTING_COLLECTION_SIZE_RATIO},
    {"BARCODE_TAG", required_argument, nullptr, BARCODE_TAG},
    {"READ_ONE_BARCODE_TAG", required_argument, nullptr, READ_ONE_BARCODE_TAG},
    {"READ_TWO_BARCODE_TAG", required_argument, nullptr, READ_TWO_BARCODE_TAG},
    {"TAG_DUPLICATE_SET_MEMBERS", required_argument, nullptr, TAG_DUPLICATE_SET_MEMBERS},
    {"REMOVE_SEQUENCING_DUPLICATES", required_argument, nullptr, REMOVE_SEQUENCING_DUPLICATES},
    {"TAGGING_POLICY", required_argument, nullptr, TAGGING_POLICY},
    {"CLEAR_DT", required_argument, nullptr, CLEAR_DT},
    {"DUPLEX_UMI", required_argument, nullptr, DUPLEX_UMI},
    {"MOLECULAR_IDENTIFIER_TAG", required_argument, nullptr, MOLECULAR_IDENTIFIER_TAG},
    {"METRICS_FILE", required_argument, nullptr, METRICS_FILE},
    {"REMOVE_DUPLICATES", required_argument, nullptr, REMOVE_DUPLICATES},
    {"ASSUME_SORTED", required_argument, nullptr, ASSUME_SORTED},
    {"ASSUME_SORT_ORDER", required_argument, nullptr, ASSUME_SORT_ORDER},
    {"DUPLICATE_SCORING_STRATEGY", required_argument, nullptr, DUPLICATE_SCORING_STRATEGY},
    {"PROGRAM_RECORD_ID", required_argument, nullptr, PROGRAM_RECORD_ID},
    {"PROGRAM_GROUP_VERSION", required_argument, nullptr, PROGRAM_GROUP_VERSION},
    {"PROGRAM_GROUP_COMMAND_LINE", required_argument, nullptr, PROGRAM_GROUP_COMMAND_LINE},
    {"PROGRAM_GROUP_NAME", required_argument, nullptr, PROGRAM_GROUP_NAME},
    {"COMMENT", required_argument, nullptr, COMMENT},
    {"READ_NAME_REGEX", required_argument, nullptr, READ_NAME_REGEX},
    {"OPTICAL_DUPLICATE_PIXEL_DISTANCE", required_argument, nullptr, OPTICAL_DUPLICATE_PIXEL_DISTANCE},
    {"MAX_OPTICAL_DUPLICATE_SET_SIZE", required_argument, nullptr, MAX_OPTICAL_DUPLICATE_SET_SIZE},
    {"QUIET", required_argument, nullptr, QUIET},
    {"VALIDATION_STRINGENCY", required_argument, nullptr, VALIDATION_STRINGENCY},
    {"COMPRESSION_LEVEL", required_argument, nullptr, COMPRESSION_LEVEL},
    {"MAX_RECORDS_IN_RAM", required_argument, nullptr, MAX_RECORDS_IN_RAM},
    {"CREATE_INDEX", required_argument, nullptr, CREATE_INDEX},
    {"INDEX_FORMAT", required_argument, nullptr, INDEX_FORMAT},
    {"CREATE_MD5_FILE", required_argument, nullptr, CREATE_MD5_FILE},
};

// Set a boolean option from the current getopt argument (the global `optarg`).
//
// @param arg  Destination flag. Set to true when optarg is exactly "true",
//             to false when it is exactly "false" (case-sensitive).
//
// Any other value leaves *arg unchanged and prints a warning to stderr, so a
// mistyped value (e.g. "TRUE" or "1") is no longer ignored silently.
// A null `arg` or a null `optarg` is treated as "nothing to do".
void setBoolArg(bool *arg) {
    if (arg == nullptr || optarg == nullptr)
        return;

    if (strcmp("true", optarg) == 0) {
        *arg = true;
    } else if (strcmp("false", optarg) == 0) {
        *arg = false;
    } else {
        fprintf(stderr,
                "Warning: invalid boolean value '%s' (expected 'true' or 'false'); keeping previous value\n",
                optarg);
    }
}

// Parse the command line.
//
// Builds one option table from the MarkDuplicates options (kMdOpts) followed
// by the global options (GlobalArg::GLOBAL_OPT), then walks the command line
// with getopt_long_only(). Every returned option id is first handed to
// pGArg->parseArgument() and then matched against the MarkDuplicates options
// here. After parsing, the values of both argument objects are printed.
//
// @param argc   Argument count as passed to main().
// @param argv   Argument vector as passed to main().
// @param pGArg  Global argument object; must not be null (it is dereferenced).
//
// Notes:
//  - The version and help options print their text and terminate the process
//    with exit(0).
//  - Numeric options are converted with stoi/stol/stod; the exceptions those
//    throw on malformed or out-of-range input are not caught here.
//  - An unrecognized value for an enum-like option leaves the current value
//    unchanged, except INDEX_FORMAT, where anything other than "CSI" selects BAI.
void MarkDupsArg::parseArgument(int argc,
                                char **argv,
                                GlobalArg *pGArg)
{
    auto &gArg = *pGArg;

    // One extra slot for the all-zero terminator that getopt_long_only() requires.
    // NOTE(review): it is not visible here whether GLOBAL_OPT already ends with
    // a zero entry; appending one is harmless either way.
    struct option allOpt[MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT + 1];

    memcpy(allOpt, kMdOpts, MarkDupsArg::ARG_COUNT * sizeof(struct option));
    memcpy(&allOpt[MarkDupsArg::ARG_COUNT], GlobalArg::GLOBAL_OPT, GlobalArg::GLOBAL_ARG_CNT * sizeof(struct option));
    memset(&allOpt[MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT], 0, sizeof(struct option));

    int c;
    while ((c = getopt_long_only(argc, argv, "", allOpt, NULL)) >= 0)
    {
        // Let the global argument object see every option id first.
        gArg.parseArgument(c);

        switch (c)
        {
        case ns_ga::OPT_VERSION:
            PrintVersion();
            exit(0);
        case ns_ga::OPT_HELP:
            PrintHelp();
            exit(0);
        case ns_md::MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP:
            MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = stoi(optarg);
            break;
        case ns_md::MAX_FILE_HANDLES_FOR_READ_ENDS_MAP:
            MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = stoi(optarg);
            break;
        case ns_md::SORTING_COLLECTION_SIZE_RATIO:
            SORTING_COLLECTION_SIZE_RATIO = stod(optarg);
            break;
        case ns_md::BARCODE_TAG:
            BARCODE_TAG = optarg;
            break;
        case ns_md::READ_ONE_BARCODE_TAG:
            READ_ONE_BARCODE_TAG = optarg;
            break;
        case ns_md::READ_TWO_BARCODE_TAG:
            READ_TWO_BARCODE_TAG = optarg;
            break;
        case ns_md::TAG_DUPLICATE_SET_MEMBERS:
            setBoolArg(&TAG_DUPLICATE_SET_MEMBERS);
            break;
        case ns_md::REMOVE_SEQUENCING_DUPLICATES:
            setBoolArg(&REMOVE_SEQUENCING_DUPLICATES);
            break;
        case ns_md::TAGGING_POLICY:
            if (strcmp("DontTag", optarg) == 0)
                TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag;
            else if (strcmp("OpticalOnly", optarg) == 0)
                TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::OpticalOnly;
            else if (strcmp("All", optarg) == 0)
                TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::All;
            break;
        case ns_md::CLEAR_DT:
            setBoolArg(&CLEAR_DT);
            break;
        case ns_md::DUPLEX_UMI:
            setBoolArg(&DUPLEX_UMI);
            break;
        case ns_md::MOLECULAR_IDENTIFIER_TAG:
            MOLECULAR_IDENTIFIER_TAG = optarg;
            break;
        case ns_md::METRICS_FILE:
            METRICS_FILE = optarg;
            break;
        case ns_md::REMOVE_DUPLICATES:
            setBoolArg(&REMOVE_DUPLICATES);
            break;
        case ns_md::ASSUME_SORTED:
            setBoolArg(&ASSUME_SORTED);
            break;
        case ns_md::ASSUME_SORT_ORDER:
            if (strcmp("unsorted", optarg) == 0)
                ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
            else if (strcmp("queryname", optarg) == 0)
                ASSUME_SORT_ORDER = ns_md::SortOrder::queryname;
            else if (strcmp("coordinate", optarg) == 0)
                ASSUME_SORT_ORDER = ns_md::SortOrder::coordinate;
            else if (strcmp("duplicate", optarg) == 0)
                ASSUME_SORT_ORDER = ns_md::SortOrder::duplicate;
            else if (strcmp("unknown", optarg) == 0)
                ASSUME_SORT_ORDER = ns_md::SortOrder::unknown;
            break;
        case ns_md::DUPLICATE_SCORING_STRATEGY:
            if (strcmp("SUM_OF_BASE_QUALITIES", optarg) == 0)
                DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
            else if (strcmp("TOTAL_MAPPED_REFERENCE_LENGTH", optarg) == 0)
                DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
            else if (strcmp("RANDOM", optarg) == 0)
                DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::RANDOM;
            break;
        case ns_md::PROGRAM_RECORD_ID:
            PROGRAM_RECORD_ID = optarg;
            break;
        case ns_md::PROGRAM_GROUP_VERSION:
            PROGRAM_GROUP_VERSION = optarg;
            break;
        case ns_md::PROGRAM_GROUP_COMMAND_LINE:
            PROGRAM_GROUP_COMMAND_LINE = optarg;
            break;
        case ns_md::PROGRAM_GROUP_NAME:
            PROGRAM_GROUP_NAME = optarg;
            break;
        case ns_md::COMMENT:
            // May be given several times; each occurrence is appended.
            COMMENT.push_back(optarg);
            break;
        case ns_md::READ_NAME_REGEX:
            READ_NAME_REGEX = optarg;
            break;
        case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE:
            OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg);
            break;
        case ns_md::MAX_OPTICAL_DUPLICATE_SET_SIZE:
            MAX_OPTICAL_DUPLICATE_SET_SIZE = stol(optarg);
            break;
        case ns_md::QUIET:
            setBoolArg(&QUIET);
            break;
        case ns_md::VALIDATION_STRINGENCY:
            if (strcmp("STRICT", optarg) == 0)
                VALIDATION_STRINGENCY = ns_md::ValidationStringency::STRICT;
            else if (strcmp("LENIENT", optarg) == 0)
                VALIDATION_STRINGENCY = ns_md::ValidationStringency::LENIENT;
            else if (strcmp("SILENT", optarg) == 0)
                VALIDATION_STRINGENCY = ns_md::ValidationStringency::SILENT;
            break;
        case ns_md::COMPRESSION_LEVEL:
            COMPRESSION_LEVEL = stoi(optarg);
            break;
        case ns_md::MAX_RECORDS_IN_RAM:
            MAX_RECORDS_IN_RAM = stoi(optarg);
            break;
        case ns_md::CREATE_INDEX:
            setBoolArg(&CREATE_INDEX);
            break;
        case ns_md::INDEX_FORMAT:
            if (strcmp("CSI", optarg) == 0)
                INDEX_FORMAT = ns_md::IndexFormat::CSI;
            else
                INDEX_FORMAT = ns_md::IndexFormat::BAI;
            // This break was missing: control used to fall through into
            // CREATE_MD5_FILE and interpret the index format as a boolean.
            break;
        case ns_md::CREATE_MD5_FILE:
            setBoolArg(&CREATE_MD5_FILE);
            break;
        default:
            // Not a MarkDuplicates option (e.g. a global option already seen
            // by gArg, or '?' for an unrecognized one): nothing to do here.
            break;
        }
    }

    gArg.printArgValue();
    printArgValue();
}

/* 打印参数信息 */
void MarkDupsArg::printArgValue()
{
    printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
    printf("--INDEX_FORMAT    = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
}

// Print the tool's version string (MARKDUPLICATE_VERSION) to stdout.
void MarkDupsArg::PrintVersion()
{
    printf("\n MarkDuplicate Version: %s\n", MARKDUPLICATE_VERSION);
}

// Release resources, close files, etc.
// The body is currently empty: neither argument object is touched.
void MarkDupsArg::Finalize(MarkDupsArg *pMdArg, GlobalArg *pGArg)
{
    (void)pMdArg; // unused
    (void)pGArg;  // unused
}

// Print the usage / help text to stdout.
// The text contains no printf conversions, so it is written verbatim.
void MarkDupsArg::PrintHelp()
{
    static const char *const kHelpText =
        "Usage: MarkDuplicates [arguments]\n"
        "\n"
        "Example:\n"
        "  ./picard_cpp MarkDuplicates --num_thread 4 --INPUT input.bam --OUTPUT marked_duplicates.bam --METRICS_FILE marked_dup_metrics.txt\n"
        "\n"
        "Required Arguments:\n"
        "\n"
        "--INPUT <String>              One input SAM, BAM or CRAM files to analyze. Must be coordinate sorted.  This\n"
        "                              argument must be specified at least once. Required.\n"
        "\n"
        "--METRICS_FILE <File>         File to write duplication metrics to  Required.\n"
        "\n"
        "--OUTPUT <File>               The output file to write marked records to  Required.\n"
        "\n"
        "\n"
        "Optional Arguments:\n"
        "\n"
        "--INDEX_FORMAT <FORMAT>       Format for bam index file. Possible values: {BAI, CSI}\n"
        "\n"
        "--ADD_PG_TAG_TO_READS <Boolean>\n"
        "                              Add PG tag to each read in a SAM or BAM  Default value: true. Possible values: {true,\n"
        "                              false}\n"
        "\n"
        "--arguments_file <File>       read one or more arguments files and add them to the command line  This argument may be\n"
        "                              specified 0 or more times. Default value: null.\n"
        "\n"
        "--ASSUME_SORT_ORDER <SortOrder>\n"
        "                              If not null, assume that the input file has this order even if the header says otherwise.\n"
        "                              Default value: null. Possible values: {unsorted, queryname, coordinate, duplicate,\n"
        "                              unknown}  Cannot be used in conjunction with argument(s) ASSUME_SORTED (AS)\n"
        "\n"
        "\n"
        "--ASSUME_SORTED <Boolean>     If true, assume that the input file is coordinate sorted even if the header says\n"
        "                              otherwise. Deprecated, used ASSUME_SORT_ORDER=coordinate instead.  Default value: false.\n"
        "                              Possible values: {true, false}  Cannot be used in conjunction with argument(s)\n"
        "                              ASSUME_SORT_ORDER (ASO)\n"
        "\n"
        "--BARCODE_TAG <String>        Barcode SAM tag (ex. BC for 10X Genomics)  Default value: null.\n"
        "\n"
        "--CLEAR_DT <Boolean>          Clear DT tag from input SAM records. Should be set to false if input SAM doesn't have this\n"
        "                              tag.  Default true  Default value: true. Possible values: {true, false}\n"
        "\n"
        "--COMMENT <String>            Comment(s) to include in the output file's header.  This argument may be specified 0 or\n"
        "                              more times. Default value: null.\n"
        "\n"
        "--COMPRESSION_LEVEL <Integer> Compression level for all compressed files created (e.g. BAM and VCF).  Default value: 5.\n"
        "\n"
        "--CREATE_INDEX <Boolean>      Whether to create an index when writing VCF or coordinate sorted BAM output.  Default\n"
        "                              value: false. Possible values: {true, false}\n"
        "\n"
        "--CREATE_MD5_FILE <Boolean>   Whether to create an MD5 digest for any BAM or FASTQ files created.    Default value:\n"
        "                              false. Possible values: {true, false}\n"
        "\n"
        "--DUPLEX_UMI <Boolean>        Treat UMIs as being duplex stranded.  This option requires that the UMI consist of two\n"
        "                              equal length strings that are separated by a hyphen (e.g. 'ATC-GTC'). Reads are considered\n"
        "                              duplicates if, in addition to standard definition, have identical normalized UMIs.  A UMI\n"
        "                              from the 'bottom' strand is normalized by swapping its content around the hyphen (eg.\n"
        "                              ATC-GTC becomes GTC-ATC).  A UMI from the 'top' strand is already normalized as it is.\n"
        "                              Both reads from a read pair considered top strand if the read 1 unclipped 5' coordinate is\n"
        "                              less than the read 2 unclipped 5' coordinate. All chimeric reads and read fragments are\n"
        "                              treated as having come from the top strand. With this option is it required that the\n"
        "                              BARCODE_TAG hold non-normalized UMIs. Default false.  Default value: false. Possible\n"
        "                              values: {true, false}\n"
        "\n"
        "--DUPLICATE_SCORING_STRATEGY <ScoringStrategy>\n"
        "                              The scoring strategy for choosing the non-duplicate among candidates.  Default value:\n"
        "                              SUM_OF_BASE_QUALITIES. Possible values: {SUM_OF_BASE_QUALITIES,\n"
        "                              TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM}\n"
        "\n"
        "--FLOW_EFFECTIVE_QUALITY_THRESHOLD <Integer>\n"
        "                              Threshold for considering a quality value high enough to be included when calculating\n"
        "                              FLOW_QUALITY_SUM_STRATEGY calculation.  Default value: 15.\n"
        "\n"
        "--FLOW_MODE <Boolean>         enable parameters and behavior specific to flow based reads.  Default value: false.\n"
        "                              Possible values: {true, false}\n"
        "\n"
        "--FLOW_Q_IS_KNOWN_END <Boolean>\n"
        "                              Treat position of read trimming based on quality as the known end (relevant for flow based\n"
        "                              reads). Default false - if the read is trimmed on quality its end is not defined and the\n"
        "                              read is duplicate of any read starting at the same place.  Default value: false. Possible\n"
        "                              values: {true, false}\n"
        "\n"
        "--FLOW_QUALITY_SUM_STRATEGY <Boolean>\n"
        "                              Use specific quality summing strategy for flow based reads. The strategy ensures that the\n"
        "                              same (and correct) quality value is used for all bases of the same homopolymer.  Default\n"
        "                              value: false. Possible values: {true, false}\n"
        "\n"
        "--FLOW_SKIP_FIRST_N_FLOWS <Integer>\n"
        "                              Skip first N flows, starting from the read's start, when considering duplicates. Useful\n"
        "                              for flow based reads where sometimes there is noise in the first flows (for this argument,\n"
        "                              \" read start \" means 5' end).  Default value: 0.\n"
        "\n"
        "--help,-h <Boolean>           display the help message  Default value: false. Possible values: {true, false}\n"
        "\n"
        "--MAX_FILE_HANDLES_FOR_READ_ENDS_MAP <Integer>\n"
        "                              Maximum number of file handles to keep open when spilling read ends to disk. Set this\n"
        "                              number a little lower than the per-process maximum number of file that may be open. This\n"
        "                              number can be found by executing the 'ulimit -n' command on a Unix system.  Default value:\n"
        "                              8000.\n"
        "\n"
        "--MAX_OPTICAL_DUPLICATE_SET_SIZE <Long>\n"
        "                              This number is the maximum size of a set of duplicate reads for which we will attempt to\n"
        "                              determine which are optical duplicates.  Please be aware that if you raise this value too\n"
        "                              high and do encounter a very large set of duplicate reads, it will severely affect the\n"
        "                              runtime of this tool.  To completely disable this check, set the value to -1.  Default\n"
        "                              value: 300000.\n"
        "\n"
        "--MAX_RECORDS_IN_RAM <Integer>When writing files that need to be sorted, this will specify the number of records stored\n"
        "                              in RAM before spilling to disk. Increasing this number reduces the number of file handles\n"
        "                              needed to sort the file, and increases the amount of RAM needed.  Default value: 500000.\n"
        "\n"
        "--MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP <Integer>\n"
        "                              This option is obsolete. ReadEnds will always be spilled to disk.  Default value: 50000.\n"
        "\n"
        "--MOLECULAR_IDENTIFIER_TAG <String>\n"
        "                              SAM tag to uniquely identify the molecule from which a read was derived.  Use of this\n"
        "                              option requires that the BARCODE_TAG option be set to a non null value.  Default null.\n"
        "                              Default value: null.\n"
        "\n"
        "--OPTICAL_DUPLICATE_PIXEL_DISTANCE <Integer>\n"
        "                              The maximum offset between two duplicate clusters in order to consider them optical\n"
        "                              duplicates. The default is appropriate for unpatterned versions of the Illumina platform.\n"
        "                              For the patterned flowcell models, 2500 is moreappropriate. For other platforms and\n"
        "                              models, users should experiment to find what works best.  Default value: 100.\n"
        "\n"
        "--PROGRAM_GROUP_COMMAND_LINE <String>\n"
        "                              Value of CL tag of PG record to be created. If not supplied the command line will be\n"
        "                              detected automatically.  Default value: null.\n"
        "\n"
        "--PROGRAM_GROUP_NAME <String>\n"
        "                              Value of PN tag of PG record to be created.  Default value: MarkDuplicates.\n"
        "\n"
        "--PROGRAM_GROUP_VERSION <String>\n"
        "                              Value of VN tag of PG record to be created. If not specified, the version will be detected\n"
        "                              automatically.  Default value: null.\n"
        "\n"
        "--PROGRAM_RECORD_ID <String>\n"
        "                              The program record ID for the @PG record(s) created by this program. Set to null to\n"
        "                              disable PG record creation.  This string may have a suffix appended to avoid collision\n"
        "                              with other program record IDs.  Default value: MarkDuplicates.\n"
        "\n"
        "--QUIET <Boolean>             Whether to suppress job-summary info on System.err.  Default value: false. Possible\n"
        "                              values: {true, false}\n"
        "\n"
        "--READ_NAME_REGEX <String>    MarkDuplicates can use the tile and cluster positions to estimate the rate of optical\n"
        "                              duplication in addition to the dominant source of duplication, PCR, to provide a more\n"
        "                              accurate estimation of library size. By default (with no READ_NAME_REGEX specified),\n"
        "                              MarkDuplicates will attempt to extract coordinates using a split on ':' (see Note below).\n"
        "                              Set READ_NAME_REGEX to 'null' to disable optical duplicate detection. Note that without\n"
        "                              optical duplicate counts, library size estimation will be less accurate. If the read name\n"
        "                              does not follow a standard Illumina colon-separation convention, but does contain tile and\n"
        "                              x,y coordinates, a regular expression can be specified to extract three variables:\n"
        "                              tile/region, x coordinate and y coordinate from a read name. The regular expression must\n"
        "                              contain three capture groups for the three variables, in order. It must match the entire\n"
        "                              read name.   e.g. if field names were separated by semi-colon (';') this example regex\n"
        "                              could be specified      (?:.*;)?([0-9]+)[^;]*;([0-9]+)[^;]*;([0-9]+)[^;]*$ Note that if no\n"
        "                              READ_NAME_REGEX is specified, the read name is split on ':'.   For 5 element names, the\n"
        "                              3rd, 4th and 5th elements are assumed to be tile, x and y values.   For 7 element names\n"
        "                              (CASAVA 1.8), the 5th, 6th, and 7th elements are assumed to be tile, x and y values.\n"
        "                              Default value: <optimized capture of last three ':' separated fields as numeric values>.\n"
        "\n"
        "--READ_ONE_BARCODE_TAG <String>\n"
        "                              Read one barcode SAM tag (ex. BX for 10X Genomics)  Default value: null.\n"
        "\n"
        "--READ_TWO_BARCODE_TAG <String>\n"
        "                              Read two barcode SAM tag (ex. BX for 10X Genomics)  Default value: null.\n"
        "\n"
        "--REFERENCE_SEQUENCE <File>   Reference sequence file.  Default value: null.\n"
        "\n"
        "--REMOVE_DUPLICATES <Boolean> If true do not write duplicates to the output file instead of writing them with\n"
        "                              appropriate flags set.  Default value: false. Possible values: {true, false}\n"
        "\n"
        "--REMOVE_SEQUENCING_DUPLICATES <Boolean>\n"
        "                              If true remove 'optical' duplicates and other duplicates that appear to have arisen from\n"
        "                              the sequencing process instead of the library preparation process, even if\n"
        "                              REMOVE_DUPLICATES is false. If REMOVE_DUPLICATES is true, all duplicates are removed and\n"
        "                              this option is ignored.  Default value: false. Possible values: {true, false}\n"
        "\n"
        "--SORTING_COLLECTION_SIZE_RATIO <Double>\n"
        "                              This number, plus the maximum RAM available to the JVM, determine the memory footprint\n"
        "                              used by some of the sorting collections.  If you are running out of memory, try reducing\n"
        "                              this number.  Default value: 0.25.\n"
        "\n"
        "--TAG_DUPLICATE_SET_MEMBERS <Boolean>\n"
        "                              If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG\n"
        "                              (DS), indicates the size of the duplicate set. The smallest possible DS value is 2 which\n"
        "                              occurs when two reads map to the same portion of the reference only one of which is marked\n"
        "                              as duplicate. The second tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier\n"
        "                              for the duplicate set to which the record belongs. This identifier is the index-in-file of\n"
        "                              the representative read that was selected out of the duplicate set.  Default value: false.\n"
        "                              Possible values: {true, false}\n"
        "\n"
        "--TAGGING_POLICY <DuplicateTaggingPolicy>\n"
        "                              Determines how duplicate types are recorded in the DT optional attribute.  Default value:\n"
        "                              DontTag. Possible values: {DontTag, OpticalOnly, All}\n"
        "\n"
        "--TMP_DIR <File>              One or more directories with space available to be used by this program for temporary\n"
        "                              storage of working files  This argument may be specified 0 or more times. Default value:\n"
        "                              null.\n"
        "\n"
        "--UNPAIRED_END_UNCERTAINTY <Integer>\n"
        "                              Maximal difference of the read end position that counted as equal. Useful for flow based\n"
        "                              reads where the end position might vary due to sequencing errors. (for this argument,\n"
        "                              \" read end \" means 3' end)  Default value: 0.\n"
        "\n"
        "--USE_END_IN_UNPAIRED_READS <Boolean>\n"
        "                              Make the end location of single end read be significant when considering duplicates, in\n"
        "                              addition to the start location, which is always significant (i.e. require single-ended\n"
        "                              reads to start andend on the same position to be considered duplicate) (for this argument,\n"
        "                              \" read end \" means 3' end).  Default value: false. Possible values: {true, false}\n"
        "\n"
        "--USE_UNPAIRED_CLIPPED_END <Boolean>\n"
        "                              Use position of the clipping as the end position, when considering duplicates (or use the\n"
        "                              unclipped end position) (for this argument, \" read end \" means 3' end).  Default value:\n"
        "                              false. Possible values: {true, false}\n"
        "\n"
        "--VALIDATION_STRINGENCY <ValidationStringency>\n"
        "                              Validation stringency for all SAM files read by this program.  Setting stringency to\n"
        "                              SILENT can improve performance when processing a BAM file in which variable-length data\n"
        "                              (read, qualities, tags) do not otherwise need to be decoded.  Default value: STRICT.\n"
        "                              Possible values: {STRICT, LENIENT, SILENT}\n"
        "\n"
        "--VERBOSITY <LogLevel>        Control verbosity of logging.  Default value: INFO. Possible values: {ERROR, WARNING,\n"
        "                              INFO, DEBUG}\n"
        "\n"
        "--version <Boolean>           display the version number for this tool  Default value: false. Possible values: {true,\n"
        "                              false}\n"
        "\n";

    fputs(kHelpText, stdout);
}