From e8b5b4405a999dac8d3a0143548819ffb4f54262 Mon Sep 17 00:00:00 2001 From: zzh Date: Sun, 15 Dec 2024 03:20:35 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E9=87=8D=E6=9E=84=E5=9F=BA?= =?UTF-8?q?=E6=9C=AC=E5=AE=8C=E6=88=90=EF=BC=8C=E8=BF=98=E5=B7=AEmarkdup?= =?UTF-8?q?=E9=87=8C=E7=9A=84=E4=B8=80=E4=BA=9B=E8=B0=83=E7=94=A8=E5=92=8C?= =?UTF-8?q?=E5=A4=84=E7=90=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + CMakeLists.txt | 5 +- src/CMakeLists.txt | 9 +- src/main.cpp | 192 ++++- src/markdup/class_static_var.cpp | 2 + src/markdup/dup_metrics.h | 72 ++ src/markdup/global_vars.h | 1 - src/markdup/markdup.cpp | 84 +++ src/markdup/md_args.cpp | 2 +- src/markdup/md_args.h | 27 +- src/markdup/md_funcs.cpp | 437 +++++++++++ src/markdup/md_funcs.h | 141 ++++ src/markdup/md_types.h | 194 +++++ src/markdup/pipeline_md.cpp | 1165 ++++++++++++++++++++++++++++++ src/markdup/pipeline_md.h | 174 +++++ src/markdup/read_ends.h | 170 +++++ src/markdup/read_name_parser.h | 226 ++++++ src/util/bam_buf.cpp | 248 +++++++ src/util/bam_buf.h | 172 +++++ src/util/bam_wrap.h | 369 ++++++++++ src/util/murmur3.h | 83 +++ src/util/profiling.cpp | 70 ++ src/util/profiling.h | 58 ++ src/util/timer.cpp | 0 src/util/timer.h | 0 src/util/yarn.cpp | 395 ++++++++++ src/util/yarn.h | 150 ++++ 27 files changed, 4430 insertions(+), 17 deletions(-) create mode 100644 src/markdup/class_static_var.cpp delete mode 100644 src/markdup/global_vars.h create mode 100644 src/util/profiling.cpp create mode 100644 src/util/profiling.h delete mode 100644 src/util/timer.cpp delete mode 100644 src/util/timer.h diff --git a/.gitignore b/.gitignore index 1b958ab..a5ccbb8 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ /.vscode /build build.sh +run.sh # Compiled Object files *.slo diff --git a/CMakeLists.txt b/CMakeLists.txt index d8ca905..cf8e1f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,9 @@ 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.0) +cmake_minimum_required(VERSION 3.0) project(FastDup) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # set(CMAKE_BUILD_TYPE Debug) # set(CMAKE_BUILD_TYPE Release) -ADD_SUBDIRECTORY(src) \ No newline at end of file +add_definitions(-DSHOW_PERF) +add_subdirectory(src) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4a3a06e..56f0d07 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -45,7 +45,7 @@ else() endif() # lib deflate -find_library(DeflateLib dlibdeflate) +find_library(DeflateLib deflate) if(DeflateLib) target_link_libraries(${PG_NAME} ${DeflateLib}) else() @@ -61,6 +61,13 @@ else() message(FATAL_ERROR "lzma is not found") endif() +# lib curl +find_package(CURL REQUIRED) +if (CURL_FOUND) + include_directories(${CURL_INCLUDE_DIR}) + target_link_libraries(${PG_NAME} ${CURL_LIBRARY}) +endif() + # lz find_library(Z_LIBRARY z) if(Z_LIBRARY) diff --git a/src/main.cpp b/src/main.cpp index 3b7b1fe..88d801f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,23 +3,207 @@ #include #include +#include +#include +#include +#include -#include "version.h" #include "markdup/md_args.h" +#include "util/profiling.h" +#include "version.h" -using namespace std; - +namespace nsgv { extern MarkDupsArg gMdArg; +}; + +int MarkDuplicates(); int main(int argc, char *argv[]) { // init log spdlog::set_default_logger(spdlog::stderr_color_st("fastdup")); spdlog::cfg::load_env_levels(); + // init arg parser - argparse::ArgumentParser cliArgs() + argparse::ArgumentParser program(nsgv::gMdArg.PROGRAM_RECORD_ID, FASTDUP_VERSION, argparse::default_arguments::none); + program.add_description( + "Identifies duplicate reads. 
This tool locates and tags duplicate reads in a coordinate ordered SAM or BAM " + "file.\nUse the same algorithm as picard MarkDuplicates and output identical results.\n" + "Use spdlog as log tool and the default level is 'info'."); + program.add_argument("--input") + .help("Input file. One coordinate ordered SAM or BAM file.") + .metavar("") + .required(); + + program.add_argument("--metrics") + .help("Metrics file. File to write duplication metrics to.") + .metavar("") + .required(); + + program.add_argument("--output") + .help("Output file. SAM or BAM file to write marked records to.") + .metavar("") + .required(); + + program.add_argument("--num-threads") + .help("Number of threads to use.") + .scan<'i', int>() + .default_value(1) + .nargs(1) + .metavar(""); + + program.add_argument("--none-duplex-io") + .help("Do not use writing-while-reading mode.") + .default_value(false) + .implicit_value(true); + + program.add_argument("--create-index") + .help("Whether to create an index when writing coordinate sorted BAM output.") + .default_value(false) + .implicit_value(true); + + program.add_argument("--index-format") + .help("Format for bam index file. Possible values: {BAI, CSI}") + .default_value(std::string("BAI")) + .choices("BAI", "CSI") + .nargs(1) + .metavar(""); + + program.add_argument("--remove-duplicates") + .help("If true do not write duplicates to the output file instead of writing them with appropriate flags set.") + .default_value(false) + .implicit_value(true); + + program.add_argument("--duplicate-scoring-strategy") + .help( + "The scoring strategy for choosing the non-duplicate among candidates. 
" + "Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM}") + .default_value(std::string("SUM_OF_BASE_QUALITIES")) + .choices("SUM_OF_BASE_QUALITIES", "TOTAL_MAPPED_REFERENCE_LENGTH", "RANDOM") + .nargs(1) + .metavar(""); + + program.add_argument("--optical-duplicate-pixel-distance") + .help( + "\nThe maximum offset between two duplicate clusters in order to consider them optical" + "duplicates. The default is appropriate for unpatterned versions of the Illumina platform." + "For the patterned flowcell models, 2500 is moreappropriate. For other platforms and" + "models, users should experiment to find what works best.") + .scan<'i', int>() + .default_value(100) + .nargs(1) + .metavar(""); + + program.add_argument("--read-name-regex") + .help( + "\nMarkDuplicates can use the tile and cluster positions to estimate the rate of optical " + "duplication in addition to the dominant source of duplication, PCR, to provide a more " + "accurate estimation of library size. By default (with no READ_NAME_REGEX specified), " + "MarkDuplicates will attempt to extract coordinates using a split on ':' (see Note below). " + "Set READ_NAME_REGEX to 'null' to disable optical duplicate detection. Note that without " + "optical duplicate counts, library size estimation will be less accurate. If the read name " + "does not follow a standard Illumina colon-separation convention, but does contain tile and " + "x,y coordinates, a regular expression can be specified to extract three variables: " + "tile/region, x coordinate and y coordinate from a read name. The regular expression must " + "contain three capture groups for the three variables, in order. It must match the entire " + "read name. e.g. if field names were separated by semi-colon (';') this example regex " + "could be specified (?:.*;)?([0-9]+)[^;]*;([0-9]+)[^;]*;([0-9]+)[^;]*$ Note that if no " + "READ_NAME_REGEX is specified, the read name is split on ':'. 
For 5 element names, the " + "3rd, 4th and 5th elements are assumed to be tile, x and y values. For 7 element names " + "(CASAVA 1.8), the 5th, 6th, and 7th elements are assumed to be tile, x and y values. " + "Default value: .") + .default_value(std::string("(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$")) + .nargs(1) + .metavar(""); + + program.add_argument("--tag-duplicate-set-members") + .help( + "\nIf a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG" + "(DS), indicates the size of the duplicate set. The smallest possible DS value is 2 which" + "occurs when two reads map to the same portion of the reference only one of which is marked" + "as duplicate. The second tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier" + "for the duplicate set to which the record belongs. This identifier is the index-in-file of" + "the representative read that was selected out of the duplicate set.") + .default_value(false) + .implicit_value(true); + + program.add_argument("--tagging-policy") + .help( + "Determines how duplicate types are recorded in the DT optional attribute. 
Possible values: {DontTag, " + "OpticalOnly, All}.") + .default_value(std::string("DontTag")) + .choices("DontTag", "OpticalOnly", "All") + .nargs(1) + .metavar(""); + + // add help and version args + program.add_argument("-h", "--help") + .action([&](const auto & /*unused*/) { + std::cout << program.help().str(); + std::exit(0); + }) + .default_value(false) + .help("shows help message and exits") + .implicit_value(true) + .nargs(0); + + program.add_argument("-v", "--version") + .action([&](const auto & /*unused*/) { + std::cout << FASTDUP_VERSION << std::endl; + std::exit(0); + }) + .default_value(false) + .help("prints version information and exits") + .implicit_value(true) + .nargs(0); + + // std::cout << program << std::endl; + + try { + program.parse_args(argc, argv); + nsgv::gMdArg.INPUT_FILE = program.get("--input"); + nsgv::gMdArg.OUTPUT_FILE = program.get("--output"); + nsgv::gMdArg.METRICS_FILE = program.get("--metrics"); + nsgv::gMdArg.NUM_THREADS = program.get("--num-threads"); + nsgv::gMdArg.DUPLEX_IO = !program.get("--none-duplex-io"); + nsgv::gMdArg.CREATE_INDEX = program.get("--create-index"); + + nsgv::gMdArg.INDEX_FORMAT = + program.get("--index-format") == "BAI" ? nsmd::IndexFormat::BAI : nsmd::IndexFormat::CSI; + nsgv::gMdArg.REMOVE_DUPLICATES = program.get("--remove-duplicates"); + + std::map scoring_strategy_args = { + {"SUM_OF_BASE_QUALITIES", nsmd::ScoringStrategy::SUM_OF_BASE_QUALITIES}, + {"TOTAL_MAPPED_REFERENCE_LENGTH", nsmd::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH}, + {"RANDOM", nsmd::ScoringStrategy::RANDOM}}; + nsgv::gMdArg.DUPLICATE_SCORING_STRATEGY = scoring_strategy_args[program.get("--duplicate-scoring-strategy")]; + + nsgv::gMdArg.OPTICAL_DUPLICATE_PIXEL_DISTANCE = program.get("--optical-duplicate-pixel-distance"); + + std::set all_nulls = {"null", "Null", "NULL"}; + nsgv::gMdArg.READ_NAME_REGEX = + all_nulls.find(program.get("--read-name-regex")) != all_nulls.end() ? 
"" : program.get("--read-name-regex"); + if (nsgv::gMdArg.READ_NAME_REGEX.empty()) + nsgv::gMdArg.CHECK_OPTICAL_DUP = false; + + nsgv::gMdArg.TAG_DUPLICATE_SET_MEMBERS = program.get("--tag-duplicate-set-members"); + + std::map tagging_policy_args = { + {"DontTag", nsmd::DuplicateTaggingPolicy::DontTag}, + {"OpticalOnly", nsmd::DuplicateTaggingPolicy::OpticalOnly}, + {"All", nsmd::DuplicateTaggingPolicy::All}}; + nsgv::gMdArg.TAGGING_POLICY = tagging_policy_args[program.get("--tagging-policy")]; + + } catch (const std::exception &err) { + spdlog::error(err.what()); + return 1; + } spdlog::info("fast markduplicates start"); + MarkDuplicates(); + spdlog::info("fast markduplicates end"); + + DisplayProfiling(1); return 0; } \ No newline at end of file diff --git a/src/markdup/class_static_var.cpp b/src/markdup/class_static_var.cpp new file mode 100644 index 0000000..fdb304d --- /dev/null +++ b/src/markdup/class_static_var.cpp @@ -0,0 +1,2 @@ +#include "read_name_parser.h" +bool ReadNameParser::sWrongNameFormat = false; \ No newline at end of file diff --git a/src/markdup/dup_metrics.h b/src/markdup/dup_metrics.h index e69de29..1fba5de 100644 --- a/src/markdup/dup_metrics.h +++ b/src/markdup/dup_metrics.h @@ -0,0 +1,72 @@ +#pragma once + +#include + +#include +#include + +#include "md_types.h" + +using std::string; +using std::vector; + +/* + * 标记冗余过程中的一些数据统计 + */ +struct DuplicationMetrics { + /** + * The library on which the duplicate marking was performed. + */ + string LIBRARY = ""; + /** + * The number of mapped reads examined which did not have a mapped mate pair, + * either because the read is unpaired, or the read is paired to an unmapped mate. + */ + uint64_t UNPAIRED_READS_EXAMINED = 0; + /** + * The number of mapped read pairs examined. 
(Primary, non-supplemental) + */ + uint64_t READ_PAIRS_EXAMINED = 0; + /** + * The number of reads that were either secondary or supplementary + */ + uint64_t SECONDARY_OR_SUPPLEMENTARY_RDS = 0; + /** + * The total number of unmapped reads examined. (Primary, non-supplemental) + */ + uint64_t UNMAPPED_READS = 0; + /** + * The number of fragments that were marked as duplicates. + */ + uint64_t UNPAIRED_READ_DUPLICATES = 0; + /** + * The number of read pairs that were marked as duplicates. + */ + uint64_t READ_PAIR_DUPLICATES = 0; + /** + * The number of read pairs duplicates that were caused by optical duplication. + * Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source. + */ + uint64_t READ_PAIR_OPTICAL_DUPLICATES = 0; + /** + * The fraction of mapped sequence that is marked as duplicate. + */ + double PERCENT_DUPLICATION = 0.0; + /** + * The estimated number of unique molecules in the library based on PE duplication. + */ + uint64_t ESTIMATED_LIBRARY_SIZE = 0; + + // 其他的统计数据 + vector CoverageMult; + + // 比如在该位置,有4个冗余,那么所有4个冗余的位置数量 + MDMap DuplicateCountHist; + MDMap NonOpticalDuplicateCountHist; + MDMap OpticalDuplicatesCountHist; + + // 没有冗余的位置总数量 for test + MDSet singletonReads; + MDSet dupReads; // for test + MDSet bestReads; +}; \ No newline at end of file diff --git a/src/markdup/global_vars.h b/src/markdup/global_vars.h deleted file mode 100644 index 7b9637e..0000000 --- a/src/markdup/global_vars.h +++ /dev/null @@ -1 +0,0 @@ -#pragma once \ No newline at end of file diff --git a/src/markdup/markdup.cpp b/src/markdup/markdup.cpp index c1af2d9..e729237 100644 --- a/src/markdup/markdup.cpp +++ b/src/markdup/markdup.cpp @@ -7,4 +7,88 @@ Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2023/10/23 */ +#include +#include +#include +#include + +#include "dup_metrics.h" +#include "md_args.h" +#include "md_funcs.h" +#include "pipeline_md.h" +#include "read_name_parser.h" +#include "util/profiling.h" +#include 
"version.h" + +#define BAM_BLOCK_SIZE 16L * 1024 * 1024 + +namespace nsgv { + +MarkDupsArg gMdArg; // 用来测试性能 +std::vector gNameParsers; // 每个线程一个read name parser +samFile *gInBamFp; // 输入的bam文件 +sam_hdr_t *gInBamHeader; // 输入的bam文件头信息 +samFile *gOutBamFp; // 输出文件, sam或者bam格式 +sam_hdr_t *gOutBamHeader; // 输出文件的header +DuplicationMetrics gMetrics; // 统计信息 +PipelineArg gPipe; + +}; + +/* + * 获取文件名后缀 + */ +static string getFileExtension(const string &filename) { + auto last_dot = filename.find_last_of('.'); + if (last_dot == string::npos) { + return ""; + } + return filename.substr(last_dot + 1); +} + +// entrance of mark duplicates +int MarkDuplicates() { + + /* 初始化一些参数和变量*/ + nsgv::gNameParsers.resize(nsgv::gMdArg.NUM_THREADS); + for (auto &parser : nsgv::gNameParsers) + parser.SetReadNameRegex(nsgv::gMdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息 + + /* 打开输入bam文件 */ + nsgv::gInBamFp = sam_open_format(nsgv::gMdArg.INPUT_FILE.c_str(), "r", nullptr); + if (!nsgv::gInBamFp) { + spdlog::error("[{}] load sam/bam file failed.\n", __func__); + return -1; + } + hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); + nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // 读取header + // 获取样本名称(libraryId) + nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0); + + /* 利用线程池对输入输出文件进行读写 */ + htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 + htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 + htsPoolRead.pool = hts_tpool_init(nsgv::gMdArg.NUM_THREADS); + htsPoolWrite.pool = hts_tpool_init(nsgv::gMdArg.NUM_THREADS); + // htsPoolRead.pool = hts_tpool_init(8); + // htsPoolWrite.pool = hts_tpool_init(32); + if (!htsPoolRead.pool || !htsPoolWrite.pool) { + spdlog::error("[{}] failed to set up thread pool", __LINE__); + sam_close(nsgv::gInBamFp); + return -1; + } + hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); + + // 测试读写速度 +#if 1 + bam1_t *bamp = bam_init1(); + while (sam_read1(nsgv::gInBamFp, nsgv::gInBamHeader, bamp) >= 0); 
+ DisplayProfiling(nsgv::gMdArg.NUM_THREADS); + exit(0); +#endif + + + + return 0; +} \ No newline at end of file diff --git a/src/markdup/md_args.cpp b/src/markdup/md_args.cpp index 736fb84..dfb4fda 100644 --- a/src/markdup/md_args.cpp +++ b/src/markdup/md_args.cpp @@ -23,4 +23,4 @@ using std::stol; using std::string; using std::vector; -using namespace ns_md; \ No newline at end of file +using namespace nsmd; \ No newline at end of file diff --git a/src/markdup/md_args.h b/src/markdup/md_args.h index 26b64d7..007dbc1 100644 --- a/src/markdup/md_args.h +++ b/src/markdup/md_args.h @@ -14,7 +14,7 @@ Date : 2023/10/23 using std::string; using std::vector; -namespace ns_md { +namespace nsmd { /* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */ enum ValidationStringency { @@ -54,10 +54,20 @@ enum ScoringStrategy { SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH, RAN /* 索引文件的格式 (bai或者csi) */ enum IndexFormat { BAI, CSI }; -} // namespace ns_md +} // namespace nsmd /* markduplicate 需要的参数*/ struct MarkDupsArg { + string INPUT_FILE; // input bam filename + + string OUTPUT_FILE; // output bam filename + + int NUM_THREADS = 1; + + size_t MAX_MEM = ((size_t)1) << 31; // 最小2G + + bool DUPLEX_IO = true; // 同时读写 + /** * The optional attribute in SAM/BAM/CRAM files used to store the duplicate type. */ @@ -125,7 +135,7 @@ struct MarkDupsArg { bool REMOVE_SEQUENCING_DUPLICATES = false; /* "Determines how duplicate types are recorded in the DT optional attribute.") */ - ns_md::DuplicateTaggingPolicy TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag; + nsmd::DuplicateTaggingPolicy TAGGING_POLICY = nsmd::DuplicateTaggingPolicy::DontTag; /* "Clear DT tag from input SAM records. Should be set to false if input SAM doesn't have this tag. 
Default true") */ @@ -159,11 +169,11 @@ struct MarkDupsArg { /* "If not null, assume that the input file has this order even if the header says otherwise.", optional = true, mutex = {"ASSUME_SORTED"} */ - ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted; + nsmd::SortOrder ASSUME_SORT_ORDER = nsmd::SortOrder::unsorted; /* "The scoring strategy for choosing the non-duplicate among candidates." */ - ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES; - // ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH; + nsmd::ScoringStrategy DUPLICATE_SCORING_STRATEGY = nsmd::ScoringStrategy::SUM_OF_BASE_QUALITIES; + // nsmd::ScoringStrategy DUPLICATE_SCORING_STRATEGY = nsmd::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH; /* "The program record ID for the @PG record(s) created by this program. Set to null to disable " + "PG record creation. This string may have a suffix appended to avoid collision with other " + @@ -203,6 +213,7 @@ struct MarkDupsArg { 3rd, 4th and 5th elements are assumed to be tile, x and y values. " + " For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements are assumed to be tile, x and y values.", optional = true */ string READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$"; + bool CHECK_OPTICAL_DUP = true; /* "The maximum offset between two duplicate clusters in order to consider them optical duplicates. The default " + "is appropriate for unpatterned versions of the Illumina platform. For the patterned flowcell @@ -224,7 +235,7 @@ struct MarkDupsArg { /* "Validation stringency for all SAM files read by this program. 
Setting stringency to SILENT " + "can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) " + "do not otherwise need to be decoded.", common=true */ - ns_md::ValidationStringency VALIDATION_STRINGENCY = ns_md::ValidationStringency::DEFAULT_STRINGENCY; + nsmd::ValidationStringency VALIDATION_STRINGENCY = nsmd::ValidationStringency::DEFAULT_STRINGENCY; /* "Compression level for all compressed files created (e.g. BAM and VCF).", common = true */ int COMPRESSION_LEVEL = 5; @@ -237,7 +248,7 @@ struct MarkDupsArg { /* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */ bool CREATE_INDEX = false; - ns_md::IndexFormat INDEX_FORMAT = ns_md::IndexFormat::BAI; + nsmd::IndexFormat INDEX_FORMAT = nsmd::IndexFormat::BAI; /* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */ bool CREATE_MD5_FILE = false; diff --git a/src/markdup/md_funcs.cpp b/src/markdup/md_funcs.cpp index e69de29..b62e2fe 100644 --- a/src/markdup/md_funcs.cpp +++ b/src/markdup/md_funcs.cpp @@ -0,0 +1,437 @@ +#include "md_funcs.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "dup_metrics.h" +#include "md_args.h" +#include "read_ends.h" +#include "read_name_parser.h" + +using std::cerr; +using std::endl; +using std::map; +using std::set; +using std::unordered_map; +using std::vector; + +namespace nsgv { +extern MarkDupsArg gMdArg; +extern DuplicationMetrics gMetrics; +}; + +/* + * 计算read的分数 + */ +int16_t computeDuplicateScore(BamWrap &bw) { + int16_t score = 0; + switch (nsgv::gMdArg.DUPLICATE_SCORING_STRATEGY) { + case nsmd::SUM_OF_BASE_QUALITIES: + // two (very) long reads worth of high-quality bases can go over + // Short.MAX_VALUE/2 and risk overflow. 
+ score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2); + break; + case nsmd::TOTAL_MAPPED_REFERENCE_LENGTH: + if (!bw.GetReadUnmappedFlag()) + // no need to remember the score since this scoring mechanism is + // symmetric + score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2); + break; + case nsmd::RANDOM: + // The RANDOM score gives the same score to both reads so that they get + // filtered together. it's not critical do use the readName since the + // scores from both ends get added, but it seem to be clearer this way. + score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111); + // subtract Short.MIN_VALUE/4 from it to end up with a number between + // 0 and Short.MAX_VALUE/2. This number can be then discounted in case + // the read is not passing filters. We need to stay far from overflow so + // that when we add the two scores from the two read mates we do not + // overflow since that could cause us to chose a failing read-pair + // instead of a passing one. + score -= INT16_MIN / 4; + default: + break; + } + // make sure that filter-failing records are heavily discounted. (the + // discount can happen twice, once for each mate, so need to make sure we do + // not subtract more than Short.MIN_VALUE overall.) + score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0; + + return score; +} + +/* + * Builds a read ends object that represents a single read. + * 用来表示一个read的特征结构 + */ +void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) { + auto &k = *pKey; + auto &bc = bw.b->core; + k.read1FirstOfPair = bw.GetFirstOfPairFlag(); + k.read1ReferenceIndex = bc.tid; + k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart(); + k.orientation = (bc.flag & BAM_FREVERSE) ? 
ReadEnds::R : ReadEnds::F; + + k.read1IndexInFile = index; + k.score = computeDuplicateScore(bw); + // Doing this lets the ends object know that it's part of a pair + if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) { + k.read2ReferenceIndex = bc.mtid; + } + // Fill in the location information for optical duplicates + if (!ReadNameParser::sWrongNameFormat) + rnParser.AddLocationInformation(bw.query_name(), pKey); + else + nsgv::gMdArg.CHECK_OPTICAL_DUP = false; + // cout << k.tile << ' ' << k.x << ' ' << k.y << endl; + // 计算位置key + k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation; +} + +/* 对找到的pairend read end添加一些信息 */ +void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { + auto &pairedEnds = *pPairedEnds; + + int64_t bamIdx = fragEnd.read1IndexInFile; + const int matesRefIndex = fragEnd.read1ReferenceIndex; + const int matesCoordinate = fragEnd.read1Coordinate; + // Set orientationForOpticalDuplicates, which always goes by the first then + // the second end for the strands. NB: must do this before updating the + // orientation later. 
+ if (fragEnd.read1FirstOfPair) { + pairedEnds.orientationForOpticalDuplicates = + ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); + } else { + pairedEnds.orientationForOpticalDuplicates = + ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand()); + } + // If the other read is actually later, simply add the other read's data as + // read2, else flip the reads + if (matesRefIndex > pairedEnds.read1ReferenceIndex || + (matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate)) { + pairedEnds.read2ReferenceIndex = matesRefIndex; + pairedEnds.read2Coordinate = matesCoordinate; + pairedEnds.read2IndexInFile = bamIdx; + pairedEnds.orientation = + ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand()); + + // if the two read ends are in the same position, pointing in opposite + // directions, the orientation is undefined and the procedure above will + // depend on the order of the reads in the file. 
To avoid this, we set + // it explicitly (to FR): + if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex && + pairedEnds.read2Coordinate == pairedEnds.read1Coordinate && pairedEnds.orientation == ReadEnds::RF) { + pairedEnds.orientation = ReadEnds::FR; + } + } else { + pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex; + pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; + pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; + pairedEnds.read1ReferenceIndex = matesRefIndex; + pairedEnds.read1Coordinate = matesCoordinate; + pairedEnds.read1IndexInFile = bamIdx; + pairedEnds.orientation = + ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); + pairedEnds.posKey = fragEnd.posKey; + } + pairedEnds.score += fragEnd.score; +} + +static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) { + return lhs != rhs && // no comparing an object to itself (checked using object identity)! + (lhs->tile != ReadEnds::NO_VALUE) && + (rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations + lhs->tile == rhs->tile && // and the same tile + abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance; +} + +static inline bool closeEnoughShort(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) { + return lhs != rhs && abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance; +} + +/** + * Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of + * one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for + * optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered + * optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the + * read appeared in the input list of physical locations. 
+ * + * @param list a list of reads that are determined to be duplicates of one another + * @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be + * annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not + * contained within the list! (always not be null!) + * @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates + */ +static void findOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe, + vector *pOpticalDuplicateFlags) { + const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100; + const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000; + + vector &opticalDuplicateFlags = *pOpticalDuplicateFlags; + // opticalDuplicateFlags.push_back(true); // for test + int len = readEndsArr.size(); + // If there is only one or zero reads passed in (so there are obviously no optical duplicates), + // or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm), + // then just return an array of all false + if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) { + return; + } + + if (len >= 4) { + /** + * Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive + * optical duplicates + * getOpticalDuplicatesFlagWithGraph + */ + // Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other, + // we will then use the union-find algorithm to cluster the graph and find optical duplicate groups + Graph opticalDistanceRelationGraph; + unordered_map> tileRGmap; + int keeperIndex = -1; + for (int i = 0; i < readEndsArr.size(); ++i) { + const ReadEnds *currentLoc = readEndsArr[i]; + if (currentLoc == pBestRe) + keeperIndex = i; + if (currentLoc->tile != ReadEnds::NO_VALUE) { + int key = currentLoc->tile; // 只处理一个样本,所以只有一个read group + tileRGmap[key].push_back(i); + } + opticalDistanceRelationGraph.addNode(i); + } + // Since 
because finding adjacent optical duplicates is an O(n^2) operation, we can subdivide the input into its + // readgroups in order to reduce the amount of redundant checks across readgroups between reads. + // fillGraphFromAGroup + for (auto &entry : tileRGmap) { + auto &groupList = entry.second; + if (groupList.size() > 1) { + for (int i = 0; i < groupList.size(); ++i) { + int iIndex = groupList[i]; + const ReadEnds *currentLoc = readEndsArr[iIndex]; + for (int j = i + 1; j < groupList.size(); ++j) { + int jIndex = groupList[j]; + const ReadEnds *other = readEndsArr[jIndex]; + if (closeEnoughShort(currentLoc, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) + opticalDistanceRelationGraph.addEdge(iIndex, jIndex); + } + } + } + } + // Keep a map of the reads and their cluster assignments + unordered_map opticalDuplicateClusterMap; + opticalDistanceRelationGraph.cluster(&opticalDuplicateClusterMap); + unordered_map clusterToRepresentativeRead; + int keeperCluster = -1; + // Specially mark the keeper as specifically not a duplicate if it exists + if (keeperIndex >= 0) { + keeperCluster = opticalDuplicateClusterMap[keeperIndex]; + clusterToRepresentativeRead[keeperCluster] = keeperIndex; + } + + for (auto &entry : opticalDuplicateClusterMap) { + int recordIndex = entry.first; + int recordAssignedCluster = entry.second; + // If its not the first read we've seen for this cluster, mark it as an optical duplicate + auto repReadItr = clusterToRepresentativeRead.find(recordAssignedCluster); + if (repReadItr != clusterToRepresentativeRead.end() && recordIndex != keeperIndex) { + const ReadEnds *representativeLoc = readEndsArr[repReadItr->second]; + const ReadEnds *currentRecordLoc = readEndsArr[recordIndex]; + // If not in the keeper cluster, then keep the minX -> minY valued duplicate (note the tile must be + // equal for reads to cluster together) + if (!(keeperIndex >= 0 && recordAssignedCluster == keeperCluster) && + (currentRecordLoc->x < representativeLoc->x || + 
(currentRecordLoc->x == representativeLoc->x && currentRecordLoc->y < representativeLoc->y))) { + // Mark the old min as an optical duplicate, and save the new min + opticalDuplicateFlags[repReadItr->second] = true; + clusterToRepresentativeRead[recordAssignedCluster] = recordIndex; + } else { + // If a smaller read has already been visited, mark the test read as an optical duplicate + opticalDuplicateFlags[recordIndex] = true; + } + } else { + clusterToRepresentativeRead[recordAssignedCluster] = recordIndex; + } + } + } else { + /** + * Compute optical duplicates quickly in the standard case where we know that there won't be any transitive + * distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a + * readgroup or 3x with the keeper present + * getOpticalDuplicatesFlagFast + */ + // First go through and compare all the reads to the keeper + for (int i = 0; i < len; ++i) { + const ReadEnds *other = readEndsArr[i]; + opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE); + } + // Now go through and do each pairwise comparison not involving the actualKeeper + for (int i = 0; i < len; ++i) { + const ReadEnds *lhs = readEndsArr[i]; + if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above + continue; + for (int j = i + 1; j < len; ++j) { + const ReadEnds *rhs = readEndsArr[j]; + if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above + continue; + if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j]) + continue; // both already marked, no need to check + if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) { + // At this point we want to mark either lhs or rhs as duplicate. Either could have been marked + // as a duplicate of the keeper (but not both - that's checked above), so be careful about which + // one to now mark as a duplicate. + int index = opticalDuplicateFlags[j] ? 
i : j; + opticalDuplicateFlags[index] = true; + } + } + } + } +} + +/** + * Looks through the set of reads and identifies how many of the duplicates are + * in fact optical duplicates, and stores the data in the instance level histogram. + * + * We expect only reads with FR or RF orientations, not a mixture of both. + * + * In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end + * of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF + * when we order orientation by the first mate sequenced (read #1 of the pair). + */ +static int checkOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { + vector opticalDuplicateFlags(readEndsArr.size(), false); + // find OpticalDuplicates + findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags); + int opticalDuplicates = 0; + for (int i = 0; i < opticalDuplicateFlags.size(); ++i) { + ReadEnds *pRe = const_cast(readEndsArr[i]); + if (opticalDuplicateFlags[i]) { + ++opticalDuplicates; + pRe->isOpticalDuplicate = true; + } else { + pRe->isOpticalDuplicate = false; + } + } + return opticalDuplicates; +} + +/** + * 记录光学原因造成的冗余 + */ +void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { + bool hasFR = false, hasRF = false; + // Check to see if we have a mixture of FR/RF + for (auto pRe : readEndsArr) { + if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) + hasFR = true; + else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) + hasRF = true; + } + + // Check if we need to partition since the orientations could have changed + int nOpticalDup; + if (hasFR && hasRF) { // need to track them independently + vector trackOpticalDuplicatesF; + vector trackOpticalDuplicatesR; + // Split into two lists: first of pairs and second of pairs, + // since they must have orientation and same starting end + for (auto pRe : readEndsArr) { + if (ReadEnds::FR == 
pRe->orientationForOpticalDuplicates) + trackOpticalDuplicatesF.push_back(pRe); + else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) + trackOpticalDuplicatesR.push_back(pRe); + else + cerr << "Found an unexpected orientation: " << pRe->orientation << endl; + } + // track the duplicates + int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe); + int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe); + nOpticalDup = nOpticalDupF + nOpticalDupR; + } else { // No need to partition + nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe); + } + + // 统计信息,trackDuplicateCounts + ++nsgv::gMetrics.DuplicateCountHist[readEndsArr.size()]; + if (readEndsArr.size() > nOpticalDup) + ++nsgv::gMetrics.NonOpticalDuplicateCountHist[readEndsArr.size() - nOpticalDup]; + if (nOpticalDup) + ++nsgv::gMetrics.OpticalDuplicatesCountHist[nOpticalDup + 1]; +} + +/** + * Estimates the size of a library based on the number of paired end molecules observed + * and the number of unique pairs observed. + *

+ * Based on the Lander-Waterman equation that states: + * C/X = 1 - exp( -N/X ) + * where + * X = number of distinct molecules in library + * N = number of read pairs + * C = number of distinct fragments observed in read pairs + */ +int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) { + int64_t librarySize = 0; + int64_t readPairDuplicates = readPairs - uniqueReadPairs; + auto f = [](double x, double c, double n) { return c / x - 1 + exp(-n / x); }; + if (readPairs > 0 && readPairDuplicates > 0) { + double m = 1.0; + double M = 100.0; + if (uniqueReadPairs >= readPairs || f(m * uniqueReadPairs, uniqueReadPairs, readPairs) < 0) { + cerr << "Invalid values for pairs and unique pairs: " << readPairs << ", " << uniqueReadPairs << endl; + return 0; + } + // find value of M, large enough to act as other side for bisection method + while (f(M * uniqueReadPairs, uniqueReadPairs, readPairs) > 0) { + M *= 10.0; + } + // use bisection method (no more than 40 times) to find solution + for (int i = 0; i < 40; ++i) { + double r = (m + M) / 2.0; + double u = f(r * uniqueReadPairs, uniqueReadPairs, readPairs); + if (u == 0) + break; + else if (u > 0) + m = r; + else if (u < 0) + M = r; + } + return uniqueReadPairs * (m + M) / 2.0; + } + return librarySize; +} + +/** + * Estimates the ROI (return on investment) that one would see if a library was sequenced to + * x higher coverage than the observed coverage. + * + * @param estimatedLibrarySize the estimated number of molecules in the library + * @param x the multiple of sequencing to be simulated (i.e. how many X sequencing) + * @param pairs the number of pairs observed in the actual sequencing + * @param uniquePairs the number of unique pairs observed in the actual sequencing + * @return a number z <= x that estimates if you had pairs*x as your sequencing then you + * would observe uniquePairs*z unique pairs. 
+ */ +double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) { + return estimatedLibrarySize * (1 - exp(-(x * pairs) / estimatedLibrarySize)) / uniquePairs; +} + +/** + * Calculates a histogram using the estimateRoi method to estimate the effective yield + * doing x sequencing for x=1..10. + */ +void calculateRoiHistogram(DuplicationMetrics &metrics) { + int64_t uniquePairs = metrics.READ_PAIRS_EXAMINED - metrics.READ_PAIR_DUPLICATES; + metrics.CoverageMult.resize(101); + for (int x = 1; x <= 100; ++x) { + metrics.CoverageMult[x] = + estimateRoi(metrics.ESTIMATED_LIBRARY_SIZE, x, metrics.READ_PAIRS_EXAMINED, uniquePairs); + } +} \ No newline at end of file diff --git a/src/markdup/md_funcs.h b/src/markdup/md_funcs.h index e69de29..c8a70cd 100644 --- a/src/markdup/md_funcs.h +++ b/src/markdup/md_funcs.h @@ -0,0 +1,141 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "dup_metrics.h" + +using std::priority_queue; +using std::unordered_map; +using std::unordered_set; +using std::vector; + +/* 前向声明 */ +class BamWrap; +class ReadEnds; +class ReadNameParser; + +/* + * 用来检测optical duplication的graph + */ +template +struct Graph { // 用set? 
+ vector nodes; // 图中的结点 + unordered_map nodeIdxMap; + unordered_map> neighbors; // 邻接列表 + + int addNode(const Node &singleton) { + int idx = -1; + if (nodeIdxMap.find(singleton) == nodeIdxMap.end()) { + nodes.push_back(singleton); + idx = nodes.size() - 1; + nodeIdxMap[singleton] = idx; + neighbors[idx].clear(); + } else + idx = nodeIdxMap[singleton]; + + return idx; + } + + /* bidirectional and public */ + void addEdge(Node &left, Node &right) { + int leftIndex = addNode(left); + if (left == right) + return; + int rightIndex = addNode(right); + addNeighbor(leftIndex, rightIndex); + addNeighbor(rightIndex, leftIndex); + } + + void addNeighbor(int fromNode, int toNode) { + unordered_set &fromNodesNeighbors = neighbors[fromNode]; + if (fromNodesNeighbors.find(toNode) == fromNodesNeighbors.end()) + fromNodesNeighbors.insert(toNode); + } + + /** + * returns the cluster map of connected components + * + * @return Nodes that point to the same integer are in the same cluster. + */ + void cluster(unordered_map *pClusterMap) { + auto &clusterMap = *pClusterMap; + vector vCluster(nodes.size(), 0); + for (int i = 0; i < vCluster.size(); ++i) vCluster[i] = i; + for (int i = 0; i < neighbors.size(); ++i) { + for (auto &j : neighbors[i]) joinNodes(j, i, &vCluster); + } + for (auto &node : nodes) { + clusterMap[node] = findRepNode(vCluster, nodeIdxMap[node]); + } + } + + // Part of Union-Find with Path Compression that joins to nodes to be part of the same cluster. + static void joinNodes(int nodeId1, int nodeId2, vector *pGrouping) { + auto &grouping = *pGrouping; + int repNode1 = findRepNode(grouping, nodeId1); + int repNode2 = findRepNode(grouping, nodeId2); + if (repNode1 == repNode2) + return; + grouping[repNode1] = repNode2; + } + + // Part of Union-Find with Path Compression to determine the duplicate set a particular UMI belongs to. 
+ static int findRepNode(vector &grouping, int nodeId) { + int representativeUmi = nodeId; // All UMIs of a duplicate set will have the same reprsentativeUmi. + while (representativeUmi != grouping[representativeUmi]) representativeUmi = grouping[representativeUmi]; + while (nodeId != representativeUmi) { + int newUmiId = grouping[nodeId]; + grouping[nodeId] = representativeUmi; + nodeId = newUmiId; + } + return representativeUmi; + } +}; + +/* + * 计算read的分数 + */ +int16_t computeDuplicateScore(BamWrap &bw); + +/* + * Builds a read ends object that represents a single read. + * 用来表示一个read的特征结构 + */ +void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey); + +/* + * 对找到的pairend read end添加一些信息 + */ +void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds); + +/** + * Looks through the set of reads and identifies how many of the duplicates are + * in fact optical duplicates, and stores the data in the instance level histogram. + * Additionally sets the transient isOpticalDuplicate flag on each read end that is + * identified as an optical duplicate. + * 记录光学原因造成的冗余 + */ +void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe); + +/* + * Estimates the size of a library based on the number of paired end molecules observed + * and the number of unique pairs observed. + */ +int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs); + +/** + * Estimates the ROI (return on investment) that one would see if a library was sequenced to + * x higher coverage than the observed coverage. + **/ +double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs); + +/** + * Calculates a histogram using the estimateRoi method to estimate the effective yield + * doing x sequencing for x=1..10. 
+ */ +void calculateRoiHistogram(DuplicationMetrics &metrics); \ No newline at end of file diff --git a/src/markdup/md_types.h b/src/markdup/md_types.h index e69de29..805843c 100644 --- a/src/markdup/md_types.h +++ b/src/markdup/md_types.h @@ -0,0 +1,194 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "read_ends.h" +#include "util/bam_buf.h" + +using std::priority_queue; +using std::set; +using std::string; +using std::unordered_set; +using std::vector; + +/* 存放未匹配readend相同位点的所有readend */ +struct UnpairedREInfo { + int64_t taskSeq; // 对应第几轮计算 + ReadEnds unpairedRE; +}; + +/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */ +struct CalcKey { + int64_t read1Pos; + int64_t read2Pos; + bool operator<(const CalcKey &o) const { + int comp = (int)(read1Pos - o.read1Pos); + if (comp == 0) + comp = (int)(read2Pos - o.read2Pos); + return comp < 0; + } + bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; } + std::size_t operator()(const CalcKey &o) const { + return std::hash()(read1Pos) ^ std::hash()(read2Pos); + } +}; + +struct CalcKeyHash { + std::size_t operator()(const CalcKey &o) const { + return std::hash()(o.read1Pos) ^ std::hash()(o.read2Pos); + } +}; + +/* 用来记录冗余索引相关的信息 */ +struct DupInfo { + int64_t idx; + int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引 + int16_t dupSet = 0; // dup set size + + DupInfo() : DupInfo(-1, 0, 0) {} + DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {} + DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {} + bool operator<(const DupInfo &o) const { return idx < o.idx; } + bool operator>(const DupInfo &o) const { return idx > o.idx; } + operator int64_t() const { return idx; } +}; + +struct DupInfoHash { + std::size_t operator()(const DupInfo &o) const { return std::hash()(o.idx); } +}; + +struct DupInfoEqual { + bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == 
o2.idx; } + bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; } + bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; } +}; + +using MDMap = tsl::robin_map; + +template +// using MDSet = set; +// using MDSet = unordered_set; +using MDSet = tsl::robin_set; + +template +// using DPSet = set; +// using DPSet = unordered_set; +using DPSet = tsl::robin_set; + +template +using CalcSet = set; +// using CalcSet = tsl::robin_set; + +using ReadEndsSet = tsl::robin_set; + +/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */ +struct TaskSeqDupInfo { + DPSet dupIdx; + MDSet opticalDupIdx; + DPSet repIdx; + MDSet notDupIdx; + MDSet notOpticalDupIdx; + MDSet notRepIdx; +}; + +/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */ +struct UnpairedPosInfo { + int unpairedNum = 0; + int64_t taskSeq; + vector pairArr; + MDSet readNameSet; +}; +// typedef unordered_map UnpairedNameMap; +// typedef unordered_map UnpairedPositionMap; + +typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read +typedef tsl::robin_map + UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 + +/* 单线程处理冗余参数结构体 */ +struct MarkDupDataArg { + int64_t taskSeq; // 任务序号 + int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 + vector bams; // 存放待处理的bam read + vector pairs; // 成对的reads + vector frags; // 暂未找到配对的reads + DPSet pairDupIdx; // pair的冗余read的索引 + MDSet pairOpticalDupIdx; // optical冗余read的索引 + DPSet fragDupIdx; // frag的冗余read的索引 + DPSet pairRepIdx; // pair的dupset代表read的索引 + MDSet pairSingletonIdx; // 某位置只有一对read的所有read pair个数 + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 +}; + +/* + * 优先队列,用最小堆来实现对所有冗余索引的排序 + */ +template +struct PairArrIdIdx { + int arrId = 0; + uint64_t arrIdx = 0; + T dupIdx = 0; +}; + +template +struct IdxGreaterThan { + bool operator()(const PairArrIdIdx &a, const PairArrIdIdx &b) { return a.dupIdx > b.dupIdx; } +}; + 
+template +struct DupIdxQueue { + // 将冗余索引和他对应的task vector对应起来 + + // 由于是多个task来查找冗余,所以每次找到的冗余index都放在一个独立的vector中,vector之间可能有重叠,所以需要用一个最小堆来维护 + vector> *dupIdx2DArr; + priority_queue, vector>, IdxGreaterThan> minHeap; + uint64_t popNum = 0; + + int Init(vector> *_dupIdx2DArr) { + dupIdx2DArr = _dupIdx2DArr; + if (dupIdx2DArr == nullptr) { + return -1; + } + for (int i = 0; i < dupIdx2DArr->size(); ++i) { + auto &v = (*dupIdx2DArr)[i]; + if (!v.empty()) { + minHeap.push({i, 1, v[0]}); + } + } + return 0; + } + + T Pop() { + T ret = -1; + if (!minHeap.empty()) { + auto idx = minHeap.top(); + minHeap.pop(); + ++popNum; + ret = idx.dupIdx; + auto &v = (*dupIdx2DArr)[idx.arrId]; + if (v.size() > idx.arrIdx) { + minHeap.push({idx.arrId, idx.arrIdx + 1, v[idx.arrIdx]}); + } + } + return ret; + } + + uint64_t Size() { + uint64_t len = 0; + if (dupIdx2DArr != nullptr) { + for (auto &v : *dupIdx2DArr) { + len += v.size(); + } + } + return len - popNum; + } +}; \ No newline at end of file diff --git a/src/markdup/pipeline_md.cpp b/src/markdup/pipeline_md.cpp index e69de29..8b50905 100644 --- a/src/markdup/pipeline_md.cpp +++ b/src/markdup/pipeline_md.cpp @@ -0,0 +1,1165 @@ +#include "pipeline_md.h" + +#include +#include + +#include +#include +#include + +#include "dup_metrics.h" +#include "md_args.h" +#include "md_funcs.h" +#include "read_ends.h" +#include "read_name_parser.h" +#include "util/bam_buf.h" +#include "util/profiling.h" +#include "util/yarn.h" + +using std::cout; +using std::vector; + +namespace nsgv { + +extern MarkDupsArg gMdArg; // 用来测试性能 +extern samFile *gInBamFp; // 输入的bam文件 +extern sam_hdr_t *gInBamHeader; // 输入的bam文件头信息 +extern DuplicationMetrics gMetrics; // 统计信息 +extern vector gNameParsers; +extern PipelineArg gPipe; + +}; // namespace nsgv + +/* 排序 */ +static inline void sortReadEndsArr(vector &arr) { + size_t blockSize = 64 * 1024; + if (arr.size() < blockSize) { + std::sort(arr.begin(), arr.end()); + return; + } + size_t blockNum = (arr.size() + 
blockSize - 1) / blockSize; + size_t crossNum = 1024; + size_t start, end, i, left, right; + std::sort(arr.begin(), arr.begin() + blockSize); + for (i = 1; i < blockNum; ++i) { + start = i * blockSize; + end = min(start + blockSize, arr.size()); + std::sort(arr.begin() + start, arr.begin() + end); + left = crossNum; + while (!(arr[start - left] < arr[start])) { + left = left << 1; + if (left >= blockSize) { + std::sort(arr.begin(), arr.end()); // 退化到普通排序 + return; + } + } + right = min(crossNum, end - start - 1); + + while (!(arr[start - 1] < arr[start + right])) { + right = min(right << 1, end - start - 1); + if (right == end - start - 1) + break; + } + std::sort(arr.begin() + start - left, arr.begin() + start + right); + } +} + +/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ +static void markDupsForPairs(vector &vpRe, + DPSet *dupIdx, + MDSet *opticalDupIdx, + DPSet *repIdx, + MDSet *notDupIdx = nullptr, + MDSet *notOpticalDupIdx = nullptr, + MDSet *notRepIdx = nullptr) { + if (vpRe.size() < 2) { + return; + } + + int maxScore = 0; + const ReadEnds *pBest = nullptr; + /** All read ends should have orientation FF, FR, RF, or RR **/ + for (auto pe : vpRe) { // 找分数最高的readend + if (pe->score > maxScore || pBest == nullptr) { + maxScore = pe->score; + pBest = pe; + } + } + + if (notDupIdx != nullptr) { + notDupIdx->insert(pBest->read1IndexInFile); + notDupIdx->insert(pBest->read2IndexInFile); + } + + if (nsgv::gMdArg.CHECK_OPTICAL_DUP) { // 检查光学冗余 + // trackOpticalDuplicates + vector prevOpticalRe; + if (notOpticalDupIdx != nullptr) { + for (auto pe : vpRe) { + if (pe->isOpticalDuplicate) { + prevOpticalRe.push_back(pe); + } + } + } + trackOpticalDuplicates(vpRe, pBest); + // 由于重叠问题,可能会更新数据 + if (notOpticalDupIdx != nullptr) { + for (auto pe : prevOpticalRe) { + if (!pe->isOpticalDuplicate) { + notOpticalDupIdx->insert(pe->read1IndexInFile); + notOpticalDupIdx->insert(pe->read2IndexInFile); + } + } + } + } + for (auto pe : vpRe) { // 对非best read标记冗余 + if (pe != 
pBest) { // 非best + dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1 + if (pe->read2IndexInFile != pe->read1IndexInFile) + dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2, + // 注意这里代表是read1的索引 + // 检查是否optical dup + if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) { + opticalDupIdx->insert(pe->read1IndexInFile); + if (pe->read2IndexInFile != pe->read1IndexInFile) + opticalDupIdx->insert(pe->read2IndexInFile); + } + } + } + // 在输出的bam文件中添加tag, best作为dupset的代表 + if (nsgv::gMdArg.TAG_DUPLICATE_SET_MEMBERS) { + repIdx->insert(DupInfo(pBest->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); + repIdx->insert(DupInfo(pBest->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); + if (notRepIdx != nullptr) { + for (auto pe : vpRe) { + if (pe != pBest) { + notRepIdx->insert(pe->read1IndexInFile); + if (pe->read2IndexInFile != pe->read1IndexInFile) + notRepIdx->insert(pe->read2IndexInFile); + } + } + } + } +} + +/* 处理一组非paired的readends,标记冗余 */ +static void markDupsForFrags(vector &vpRe, bool containsPairs, DPSet *dupIdx, + MDSet *notDupIdx = nullptr) { + if (containsPairs) { + for (auto pe : vpRe) { + if (!pe->IsPaired()) { + dupIdx->insert(pe->read1IndexInFile); + } + } + } else { + int maxScore = 0; + const ReadEnds *pBest = nullptr; + for (auto pe : vpRe) { + if (pe->score > maxScore || pBest == nullptr) { + maxScore = pe->score; + pBest = pe; + } + } + if (notDupIdx != nullptr) { + notDupIdx->insert(pBest->read1IndexInFile); + } + for (auto pe : vpRe) { + if (pe != pBest) { + dupIdx->insert(pe->read1IndexInFile); + } + } + } +} + +/* 找到与readend pos相等的所有readend */ +static void getEqualRE(const ReadEnds &re, vector &src, vector *dst) { + auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点 + dst->insert(dst->end(), range.first, range.second); +} + +#define LOWER_BOUND(tid, nthread, 
ndata) ((tid) * (ndata) / (nthread)) +#define UPPER_BOUND(tid, nthread, ndata) ((tid + 1) * (ndata) / (nthread)) + +/* 处理pairs */ +static void processPairs(vector &readEnds, DPSet *dupIdx, MDSet *opticalDupIdx, + DPSet *repIdx, MDSet *notDupIdx = nullptr, + MDSet *notOpticalDupIdx = nullptr, MDSet *notRepIdx = nullptr) { + // return; + vector vpCache; // 有可能是冗余的reads + const ReadEnds *pReadEnd = nullptr; + for (auto &re : readEnds) { + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 + vpCache.push_back(&re); // 处理一个潜在的冗余组 + else { + markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, + notRepIdx); // 不一样 + vpCache.clear(); + vpCache.push_back(&re); + pReadEnd = &re; + } + } + markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx); +} + +/* 处理frags */ +static void processFrags(vector &readEnds, DPSet *dupIdx, MDSet *notDupIdx = nullptr) { + bool containsPairs = false; + bool containsFrags = false; + vector vpCache; // 有可能是冗余的reads + const ReadEnds *pReadEnd = nullptr; + for (auto &re : readEnds) { + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) { + vpCache.push_back(&re); + containsPairs = containsPairs || re.IsPaired(); + containsFrags = containsFrags || !re.IsPaired(); + } else { + if (vpCache.size() > 1 && containsFrags) { + markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); + } + vpCache.clear(); + vpCache.push_back(&re); + pReadEnd = &re; + containsPairs = re.IsPaired(); + containsFrags = !re.IsPaired(); + } + } + if (vpCache.size() > 1 && containsFrags) { + markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); + } +} + +/* 单线程markdup (第二步)*/ +static void markdups(MarkDupDataArg *arg) { + auto &p = *arg; + p.fragDupIdx.clear(); + p.pairDupIdx.clear(); + p.pairOpticalDupIdx.clear(); + p.pairRepIdx.clear(); + p.pairSingletonIdx.clear(); + /* 
generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */ + // 先处理 pair + processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx, &p.pairRepIdx, &p.pairSingletonIdx); + // 再处理frag + processFrags(p.frags, &p.fragDupIdx); +} + +/* 获取交叉部分的数据 */ +static inline void getIntersectData(vector &leftArr, vector &rightArr, vector *dst, + bool isPairCmp = false) { + if (leftArr.empty() || rightArr.empty()) { + // cout << "bad size: " << leftArr.size() << '\t' << rightArr.size() << endl; + return; + } + const size_t leftEndIdx = leftArr.size() - 1; + const size_t rightStartIdx = 0; + size_t leftSpan = 0; + size_t rightSpan = 0; + + while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) { + leftSpan += 1; + if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了(可能会有bug,上上个也与下一个有交集呢?) + leftSpan = leftArr.size() - 1; + break; + } + } + + while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) { + rightSpan += 1; + if (rightSpan == rightArr.size() - 1) // 同上,可能会有bug + break; + } + dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); + dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan); + std::sort(dst->begin(), dst->end()); +} + +/* 将frags重叠部分的dup idx放进数据中 */ +static inline void refreshFragDupIdx(DPSet &dupIdx, MDSet ¬DupIdx, MarkDupDataArg *lastArg, + MarkDupDataArg *curArg) { + auto &lp = *lastArg; + auto &p = *curArg; + for (auto idx : dupIdx) { + lp.fragDupIdx.insert(idx); + p.fragDupIdx.erase(idx); + } + for (auto idx : notDupIdx) { + lp.fragDupIdx.erase(idx); + p.fragDupIdx.erase(idx); + } +} + +// 用来分别处理dup和optical dup +static void refeshTaskDupInfo(DPSet &dupIdx, MDSet &opticalDupIdx, DPSet &repIdx, + MDSet ¬DupIdx, MDSet ¬OpticalDupIdx, MDSet ¬RepIdx, + DPSet &latterDupIdx, MDSet &latterOpticalDupIdx, + DPSet &latterRepIdx, MDSet &latterNotDupIdx, + MDSet &latterNotOpticalDupIdx, MDSet &latterNotRepIdx) { + for (auto idx : dupIdx) latterDupIdx.insert(idx); + for (auto idx : 
opticalDupIdx) latterOpticalDupIdx.insert(idx); + for (auto idx : repIdx) latterRepIdx.insert(idx); + for (auto idx : notDupIdx) latterNotDupIdx.insert(idx); + for (auto idx : notOpticalDupIdx) latterNotOpticalDupIdx.insert(idx); + for (auto idx : notRepIdx) latterNotRepIdx.insert(idx); +} + +/* 最后合并数据并排序 */ +template +static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet ¬DupIdx, vector &dupArr, + vector &cacheDupIdx, vector &midArr) { + midArr.resize(0); + cacheDupIdx.resize(0); + cacheDupIdx.insert(cacheDupIdx.end(), dupIdx.begin(), dupIdx.end()); + std::sort(cacheDupIdx.begin(), cacheDupIdx.end()); + + auto ai = dupArr.begin(); + auto ae = dupArr.end(); + auto bi = cacheDupIdx.begin(); + auto be = cacheDupIdx.end(); + + T val = 0; + while (ai != ae && bi != be) { + if (*ai < *bi) { + val = *ai++; + } else if (*bi < *ai) { + val = *bi++; + } else { + val = *bi++; // 相等的时候取后面的作为结果 + ai++; + } + if (notDupIdx.find(val) == notDupIdx.end()) { + midArr.push_back(val); + } + } + while (ai != ae) { + val = *ai++; + if (notDupIdx.find(val) == notDupIdx.end()) { + midArr.push_back(val); + } + } + while (bi != be) { + val = *bi++; + if (notDupIdx.find(val) == notDupIdx.end()) { + midArr.push_back(val); + } + } + dupArr = midArr; +} + +// for step 2 generate read ends + +// multi-thread generate read ends +static void mtGenerateReadEnds(void *data, long idx, int tid) { + auto &p = *(PipelineArg *)data; + auto &rnParser = nsgv::gNameParsers[idx]; + int nThread = p.numThread; + auto &bams = p.readData.bams; + int64_t bamStartIdx = p.readData.bamStartIdx; + int64_t taskSeq = p.readData.taskSeq; + GenREData &genREData = p.genREData[p.genREOrder % p.GENBUFNUM]; + auto &pairs = genREData.pairsArr[idx]; + auto &frags = genREData.fragsArr[idx]; + auto &unpairedDic = genREData.unpairedDicArr[idx]; + + pairs.clear(); + frags.clear(); + unpairedDic.clear(); + + size_t start_id = LOWER_BOUND(idx, nThread, bams.size()); + size_t end_id = UPPER_BOUND(idx, nThread, bams.size()); 
+ for (size_t i = start_id; i < end_id; ++i) { // 循环处理每个read + BamWrap *bw = bams[i]; + const int64_t bamIdx = bamStartIdx + i; + if (bw->GetReadUnmappedFlag()) { + if (bw->b->core.tid == -1) + // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). + break; + } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对 + ReadEnds fragEnd; + buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); + frags.push_back(fragEnd); // 添加进frag集合 + if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了 + string key = bw->query_name(); + if (unpairedDic.find(key) == unpairedDic.end()) { + unpairedDic[key] = {taskSeq, fragEnd}; + } else { // 找到了pairend + auto &pairedEnds = unpairedDic.at(key).unpairedRE; + modifyPairedEnds(fragEnd, &pairedEnds); + pairs.push_back(pairedEnds); + unpairedDic.erase(key); // 删除找到的pairend + } + } + } + } + // sortReadEndsArr(frags); + sort(frags.begin(), frags.end()); + sort(pairs.begin(), pairs.end()); +} + +static void doGenRE(PipelineArg &pipeArg) { + // return; + GenREData &genREData = pipeArg.genREData[pipeArg.genREOrder % pipeArg.GENBUFNUM]; + ReadData &readData = pipeArg.readData; + + // generate read ends + const int numThread = pipeArg.numThread; + + kt_for(numThread, mtGenerateReadEnds, &pipeArg, numThread); + // 用来在genRE阶段计算一些Sort阶段应该计算的数据,保持每个step用时更平衡 + // 轮询每个线程中未找到匹配的read,找到匹配的 + genREData.unpairedDic.clear(); + genREData.unpairedPosArr.clear(); + vector &pairs = genREData.pairsArr[numThread]; + pairs.clear(); + for (int i = 0; i < numThread; ++i) { + for (auto &val : genREData.unpairedDicArr[i]) { + const string &key = val.first; + const ReadEnds &fragEnd = val.second.unpairedRE; + if (genREData.unpairedDic.find(key) == genREData.unpairedDic.end()) { + genREData.unpairedDic[key] = {readData.taskSeq, fragEnd}; + } else { // 找到了pairend + auto &pairedEnds = genREData.unpairedDic.at(key).unpairedRE; + modifyPairedEnds(fragEnd, &pairedEnds); + pairs.push_back(pairedEnds); 
+ genREData.unpairedDic.erase(key); // 删除找到的pairend + } + } + } + sort(pairs.begin(), pairs.end()); + + // create unpaired info + for (auto &e : genREData.unpairedDic) { + auto posKey = e.second.unpairedRE.posKey; + auto &unpairArrInfo = genREData.unpairedPosArr[posKey]; + unpairArrInfo.unpairedNum++; + unpairArrInfo.taskSeq = readData.taskSeq; + unpairArrInfo.readNameSet.insert(e.first); + } +} +// end for step 2 generate read ends + +// for step-3 sort +static void doSort(PipelineArg &pipeArg) { + // return; + GenREData &genREData = pipeArg.genREData[pipeArg.sortOrder % pipeArg.GENBUFNUM]; + SortData &sortData = pipeArg.sortData[pipeArg.sortOrder % pipeArg.SORTBUFNUM]; + SortMarkData &smd = *(SortMarkData *)sortData.dataPtr; + + smd.unpairedDic = genREData.unpairedDic; + smd.unpairedPosArr = genREData.unpairedPosArr; + + smd.pairs.clear(); + smd.frags.clear(); + const ReadEnds *pRE; + ReadEndsHeap pairsHeap, fragsHeap; + pairsHeap.Init(&genREData.pairsArr); + while ((pRE = pairsHeap.Pop()) != nullptr) { + smd.pairs.push_back(*pRE); + } + fragsHeap.Init(&genREData.fragsArr); + while ((pRE = fragsHeap.Pop()) != nullptr) { + smd.frags.push_back(*pRE); + } +} +// for step-4 sort +static void doMarkDup(PipelineArg &pipeArg) { + // return; + MarkDupData &mdData = pipeArg.markDupData[pipeArg.markDupOrder % pipeArg.MARKBUFNUM]; + SortData &sortData = pipeArg.sortData[pipeArg.markDupOrder % pipeArg.SORTBUFNUM]; + mdData.taskSeq = pipeArg.markDupOrder; + mdData.fragDupIdx.clear(); + mdData.pairDupIdx.clear(); + mdData.pairOpticalDupIdx.clear(); + mdData.pairRepIdx.clear(); + + auto tmpPtr = mdData.dataPtr; + mdData.dataPtr = sortData.dataPtr; + sortData.dataPtr = tmpPtr; + + SortMarkData &smd = *(SortMarkData *)mdData.dataPtr; + // 先处理 pair + processPairs(smd.pairs, &mdData.pairDupIdx, &mdData.pairOpticalDupIdx, &mdData.pairRepIdx); + // 再处理frag + processFrags(smd.frags, &mdData.fragDupIdx); +} + +template +static void refreshDupIdx(T &srcArr, T &insArr, T &delArr) { + for 
(auto dup : srcArr) { + insArr.insert(dup); + delArr.erase(dup); + } +} +template +static void refreshNotDupIdx(T1 &srcArr, T2 &delArr1, T2 &delArr2) { + for (auto dup : srcArr) { + delArr1.erase(dup); + delArr2.erase(dup); + } +} + +static void refreshMarkDupData(DPSet &dupIdx, MDSet &opticalDupIdx, DPSet &repIdx, + MDSet ¬DupIdx, MDSet ¬OpticalDupIdx, MDSet ¬RepIdx, + MarkDupData &lp, MarkDupData &p) { + refreshDupIdx(dupIdx, lp.pairDupIdx, p.pairDupIdx); + refreshDupIdx(opticalDupIdx, lp.pairOpticalDupIdx, p.pairOpticalDupIdx); + refreshDupIdx(repIdx, lp.pairRepIdx, p.pairRepIdx); + + refreshNotDupIdx(notDupIdx, lp.pairDupIdx, p.pairDupIdx); + refreshNotDupIdx(notOpticalDupIdx, lp.pairOpticalDupIdx, p.pairOpticalDupIdx); + refreshNotDupIdx(notRepIdx, lp.pairRepIdx, p.pairRepIdx); +} + +// for step-5 sort +static void doIntersect(PipelineArg &pipeArg) { + // return; + IntersectData &g = pipeArg.intersectData; + MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM]; + MarkDupData &p = pipeArg.markDupData[pipeArg.intersectOrder % pipeArg.MARKBUFNUM]; + SortMarkData &lpSM = *(SortMarkData *)lp.dataPtr; + SortMarkData &pSM = *(SortMarkData *)p.dataPtr; + + vector reArr; + DPSet dupIdx; + MDSet opticalDupIdx; + DPSet repIdx; + MDSet notOpticalDupIdx; + MDSet notDupIdx; + MDSet notRepIdx; + + // 先处理重叠的frags + getIntersectData(lpSM.frags, pSM.frags, &reArr); + processFrags(reArr, &dupIdx, ¬DupIdx); + refreshDupIdx(dupIdx, lp.fragDupIdx, p.fragDupIdx); + refreshNotDupIdx(notDupIdx, lp.fragDupIdx, p.fragDupIdx); + + // 再处理重叠的pairs + reArr.clear(); + dupIdx.clear(); + notDupIdx.clear(); + getIntersectData(lpSM.pairs, pSM.pairs, &reArr, true); + + processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, ¬DupIdx, ¬OpticalDupIdx, ¬RepIdx); + refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, lp, p); + + // 处理之前未匹配的部分 + map recalcPos; + CalcSet alreadyAdd; // 与该位点相同的pair都添加到数组里了 + MDSet addToGlobal; + 
int64_t prevLastPos = 0, nextFirstPos = 0; + if (lpSM.frags.size() > 0) + prevLastPos = lpSM.frags.back().posKey; + if (pSM.frags.size() > 0) + nextFirstPos = pSM.frags[0].posKey; + + for (auto &prevUnpair : lpSM.unpairedDic) { // 遍历上一个任务中的每个未匹配的read + auto &readName = prevUnpair.first; + auto &prevPosInfo = prevUnpair.second; + auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end + if (pSM.unpairedDic.find(readName) != pSM.unpairedDic.end()) { // 在当前这个任务里找到了这个未匹配的read + auto &nextPosInfo = pSM.unpairedDic[readName]; + auto &nextFragEnd = nextPosInfo.unpairedRE; + int64_t prevPosKey = prevFragEnd.posKey; + modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read + int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey); + CalcKey ck = {prevPosKey, nextPosKey}; + UnpairedPosInfo *prevUnpairInfoP = nullptr; + UnpairedPosInfo *nextUnpairInfoP = nullptr; + + if (lpSM.unpairedPosArr.find(prevPosKey) != lpSM.unpairedPosArr.end()) + prevUnpairInfoP = &lpSM.unpairedPosArr[prevPosKey]; + if (pSM.unpairedPosArr.find(prevPosKey) != pSM.unpairedPosArr.end()) + nextUnpairInfoP = &pSM.unpairedPosArr[prevPosKey]; + // pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定 + // 1. + // prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据; + // 2. + // prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算 + // 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去 + // 3. + // prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算 + // 复杂情况2. p中是否包含prevPosKey对应的unpair + // 4. 
+ // prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算 + + bool addDataToPos = true; + if (alreadyAdd.find(ck) != alreadyAdd.end()) { + // 之前已经添加过了,后面就不用再添加数据了,因为同一个位置可能找到两个及以上的unpair数据,处理之前的数据时候可能已经添加了这些数据 + addDataToPos = false; + } else + alreadyAdd.insert(ck); + + if (prevPosKey < nextFirstPos) { // prevpos在交叉部分之前 + auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr + prevPairArr.push_back(prevFragEnd); + if (nextPosKey <= prevLastPos && addDataToPos) { // 第二种情况 + getEqualRE(prevFragEnd, lpSM.pairs, &prevPairArr); + } + // 第一种情况,第二种情况下都会出现,复杂情况一 + auto gPosInfo = g.unpairedPosArr.find(prevPosKey); + if (gPosInfo != g.unpairedPosArr.end()) { // 可能g和p有匹配的,刚好和该位点一致 + auto &gUnpairInfo = gPosInfo->second; + auto pPosInfo = pSM.unpairedPosArr.find(nextPosKey); + if (pPosInfo != pSM.unpairedPosArr.end()) { + auto &pUnpairInfo = pPosInfo->second; + for (auto &rn : gUnpairInfo.readNameSet) { // 遍历每一个readname,看是否有匹配的 + if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) { + auto pe = g.unpairedDic[rn].unpairedRE; + auto fe = pSM.unpairedDic[rn].unpairedRE; + modifyPairedEnds(fe, &pe); + prevPairArr.push_back(pe); + g.unpairedDic.erase(rn); + pSM.unpairedDic.erase(rn); + } + } + } + } + recalcPos[ck] = prevPosInfo.taskSeq; + std::sort(prevPairArr.begin(), prevPairArr.end()); + } else { // prevpos在交叉部分 + if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况 + if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里 + auto &nextPairArr = nextUnpairInfoP->pairArr; + nextPairArr.push_back(prevFragEnd); + auto &prevPairArr = prevUnpairInfoP->pairArr; + prevPairArr.push_back(prevFragEnd); + if (addDataToPos) { + getEqualRE(prevFragEnd, pSM.pairs, &prevPairArr); + getEqualRE(prevFragEnd, pSM.pairs, &nextPairArr); + } + // 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除) + recalcPos[ck] = nextPosInfo.taskSeq; + + std::sort(prevPairArr.begin(), prevPairArr.end()); + std::sort(nextPairArr.begin(), 
nextPairArr.end()); + } else { // next task在该位点没有unpair,那就把数据放到prev task里 + auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr + prevPairArr.push_back(prevFragEnd); + if (addDataToPos) // 第二种情况 + getEqualRE(prevFragEnd, pSM.pairs, &prevPairArr); + recalcPos[ck] = prevPosInfo.taskSeq; + std::sort(prevPairArr.begin(), prevPairArr.end()); + } + } else { // 第四种情况 + if (prevUnpairInfoP == nullptr) { + prevUnpairInfoP = &lpSM.unpairedPosArr[prevPosKey]; + prevUnpairInfoP->taskSeq = lp.taskSeq; + } + auto &prevPairArr = prevUnpairInfoP->pairArr; + prevPairArr.push_back(prevFragEnd); + if (addDataToPos) { + getEqualRE(prevFragEnd, lpSM.pairs, &prevPairArr); + getEqualRE(prevFragEnd, pSM.pairs, &prevPairArr); + } + recalcPos[ck] = prevPosInfo.taskSeq; + std::sort(prevPairArr.begin(), prevPairArr.end()); + } + } + pSM.unpairedDic.erase(readName); // 在next task里删除该read + } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read + auto &remainPosInfo = g.unpairedDic[readName]; + auto remainFragEnd = remainPosInfo.unpairedRE; + int64_t remainPosKey = remainFragEnd.posKey; + modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read + auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey]; + auto &remainPairArr = remainUnpairInfo.pairArr; + remainPairArr.push_back(remainFragEnd); + CalcKey ck = {remainPosKey, prevFragEnd.posKey}; + recalcPos[ck] = remainPosInfo.taskSeq; + std::sort(remainPairArr.begin(), remainPairArr.end()); + + g.unpairedDic.erase(readName); + } else { // 都没找到,那就保存到遗留数据里 + int64_t prevPosKey = prevFragEnd.posKey; + g.unpairedDic.insert(prevUnpair); + addToGlobal.insert(prevPosKey); + } + } + map taskChanged; + MDSet posProcessed; + for (auto &e : recalcPos) { + auto posKey = e.first.read1Pos; + if (posProcessed.find(posKey) != posProcessed.end()) + continue; + posProcessed.insert(posKey); + auto taskSeq = e.second; + auto &t = taskChanged[taskSeq]; + // 在对应的任务包含的dup idx里修改结果数据 + vector 
*pairArrP = nullptr; + if (taskSeq < lp.taskSeq) + pairArrP = &g.unpairedPosArr[posKey].pairArr; + else + pairArrP = &lpSM.unpairedPosArr[posKey].pairArr; + processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, + &t.notRepIdx); + if (taskSeq < lp.taskSeq) + g.unpairedPosArr.erase(posKey); + } + // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据 + // 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改(比如optical duplicate) +#if 0 + for (auto posKey : addToGlobal) { + g.unpairedPosArr[posKey] = lpSM.unpairedPosArr[posKey]; + } +#endif + /* 不需要把p中找不到lp的unpair,放到global中,否则最后找到pair后,还要再执行一次冗余检测,造成重复的冗余索引 */ + + // 更新结果 + for (auto &e : taskChanged) { + auto taskSeq = e.first; + auto &t = e.second; + if (taskSeq < lp.taskSeq) { + refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, + g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], + g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], + g.latterNotRepIdxArr[taskSeq]); + } else if (taskSeq == lp.taskSeq) { + refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, + p); + } else { + refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, p, + lp); // 把结果放到p中 + } + } + + // 将dupidx放进全局数据 + g.latterDupIdxArr.push_back(DPSet()); + g.latterOpticalDupIdxArr.push_back(MDSet()); + g.latterRepIdxArr.push_back(DPSet()); + g.latterNotDupIdxArr.push_back(MDSet()); + g.latterNotOpticalDupIdxArr.push_back(MDSet()); + g.latterNotRepIdxArr.push_back(MDSet()); + + g.dupIdxArr.push_back(vector()); + auto &vIdx = g.dupIdxArr.back(); + lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); + vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); + std::sort(vIdx.begin(), vIdx.end()); + + g.opticalDupIdxArr.push_back(vector()); + auto &vOpticalIdx = g.opticalDupIdxArr.back(); + 
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); + std::sort(vOpticalIdx.begin(), vOpticalIdx.end()); + + g.repIdxArr.push_back(vector()); + auto &vRepIdx = g.repIdxArr.back(); + vRepIdx.insert(vRepIdx.end(), lp.pairRepIdx.begin(), lp.pairRepIdx.end()); + std::sort(vRepIdx.begin(), vRepIdx.end()); +} + +static void *pipeRead(void *data) { + PipelineArg &pipeArg = *(PipelineArg *)data; + BamBufType inBamBuf(nsgv::gMdArg.DUPLEX_IO); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gMdArg.MAX_MEM); + int64_t readNumSum = 0; + while (1) { + PROF_START(read_wait); + yarn::POSSESS(pipeArg.readSig); + yarn::WAIT_FOR(pipeArg.readSig, yarn::NOT_TO_BE, 1); // 只有一个坑,因为在bambuf内部支持异步读取 + PROF_END(gprof[GP_read_wait], read_wait); + size_t readNum = 0; + PROF_START(read); + if (inBamBuf.ReadStat() >= 0) + readNum = inBamBuf.ReadBam(); // 读入新一轮的数据 + PROF_END(gprof[GP_read], read); + if (readNum < 1) { + pipeArg.readFinish = 1; + yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据 + break; + } + cout << "read num: " << readNum << "\t" << readNumSum << '\t' << pipeArg.readOrder << endl; + + pipeArg.readData.bams = inBamBuf.GetBamArr(); + + pipeArg.readData.bamStartIdx = readNumSum; + pipeArg.readData.taskSeq = pipeArg.readOrder; + + readNumSum += readNum; + inBamBuf.ClearAll(); // 清理上一轮读入的数据 + pipeArg.readOrder += 1; // for next + yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据 + } + cout << "total reads: " << readNumSum << endl; + return 0; +} +static void *pipeGenRE(void *data) { + PipelineArg &pipeArg = *(PipelineArg *)data; + auto &genREData = pipeArg.genREData; + // init generate read ends data by num_thread + int genREThread = pipeArg.numThread; + // / 4; + for (int i = 0; i < pipeArg.GENBUFNUM; ++i) { + genREData[i].Init(genREThread); + } + + while (1) { + PROF_START(gen_wait); + yarn::POSSESS(pipeArg.readSig); + yarn::WAIT_FOR(pipeArg.readSig, yarn::NOT_TO_BE, 0); // 等待有数据 + yarn::POSSESS(pipeArg.genRESig); + 
PROF_END(gprof[GP_gen_wait], gen_wait); + + yarn::WAIT_FOR(pipeArg.genRESig, yarn::NOT_TO_BE, pipeArg.GENBUFNUM); // 有BUFNUM个坑 + yarn::RELEASE(pipeArg.genRESig); // 因为有不止一个位置,所以要释放 + + if (pipeArg.readFinish) { + yarn::POSSESS(pipeArg.genRESig); + pipeArg.genREFinish = 1; + yarn::TWIST(pipeArg.genRESig, yarn::BY, 1); + yarn::TWIST(pipeArg.readSig, yarn::BY, -1); + break; + } + /* 处理bam,生成readends */ + // cout << "genRE order: " << pipeArg.genREOrder << "\t" << pipeArg.readData.bamStartIdx << endl; + PROF_START(gen); + doGenRE(pipeArg); + // usleep(200000); + PROF_END(gprof[GP_gen], gen); + + yarn::POSSESS(pipeArg.genRESig); + pipeArg.genREOrder += 1; + yarn::TWIST(pipeArg.genRESig, yarn::BY, 1); + yarn::TWIST(pipeArg.readSig, yarn::BY, -1); // 使用了一次读入的数据 + } + cout << "pipe gen reads" << endl; + return 0; +} +static void *pipeSort(void *data) { + PipelineArg &pipeArg = *(PipelineArg *)data; + + while (1) { + PROF_START(sort_wait); + yarn::POSSESS(pipeArg.genRESig); + yarn::WAIT_FOR(pipeArg.genRESig, yarn::NOT_TO_BE, 0); // 等待有数据 + yarn::RELEASE(pipeArg.genRESig); + PROF_END(gprof[GP_sort_wait], sort_wait); + + yarn::POSSESS(pipeArg.sortSig); + yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, pipeArg.SORTBUFNUM); // 有BUFNUM个位置 + yarn::RELEASE(pipeArg.sortSig); + + if (pipeArg.genREFinish) { + // 处理完剩余数据 + while (pipeArg.sortOrder < pipeArg.genREOrder) { + PROF_START(sort); + doSort(pipeArg); + PROF_END(gprof[GP_sort], sort); + pipeArg.sortOrder += 1; + } + yarn::POSSESS(pipeArg.sortSig); + pipeArg.sortFinish = 1; + yarn::TWIST(pipeArg.sortSig, yarn::BY, 1); + break; + } + /* 排序 readends */ + // cout << "sort order: " << pipeArg.sortOrder << endl; + PROF_START(sort); + doSort(pipeArg); + PROF_END(gprof[GP_sort], sort); + + yarn::POSSESS(pipeArg.genRESig); + yarn::TWIST(pipeArg.genRESig, yarn::BY, -1); + + yarn::POSSESS(pipeArg.sortSig); + pipeArg.sortOrder += 1; + yarn::TWIST(pipeArg.sortSig, yarn::BY, 1); + } + cout << "end pipe sort" << endl; + return 0; +} +static 
void *pipeMarkDup(void *data) { + PipelineArg &pipeArg = *(PipelineArg *)data; + + while (1) { + PROF_START(markdup_wait); + yarn::POSSESS(pipeArg.sortSig); + yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, 0); // 等待有数据 + yarn::RELEASE(pipeArg.sortSig); + PROF_END(gprof[GP_markdup_wait], markdup_wait); + + yarn::POSSESS(pipeArg.markDupSig); + yarn::WAIT_FOR(pipeArg.markDupSig, yarn::NOT_TO_BE, pipeArg.MARKBUFNUM); + yarn::RELEASE(pipeArg.markDupSig); + + if (pipeArg.sortFinish) { + // 应该还得处理剩余的数据 + while (pipeArg.markDupOrder < pipeArg.sortOrder) { + PROF_START(markdup); + doMarkDup(pipeArg); + PROF_END(gprof[GP_markdup], markdup); + pipeArg.markDupOrder += 1; + } + yarn::POSSESS(pipeArg.markDupSig); + pipeArg.markDupFinish = 1; + yarn::TWIST(pipeArg.markDupSig, yarn::BY, 2); + break; + } + /* 冗余检测 readends */ + // cout << "markdup order: " << pipeArg.markDupOrder << endl; + PROF_START(markdup); + doMarkDup(pipeArg); + PROF_END(gprof[GP_markdup], markdup); + yarn::POSSESS(pipeArg.sortSig); + yarn::TWIST(pipeArg.sortSig, yarn::BY, -1); + + yarn::POSSESS(pipeArg.markDupSig); + pipeArg.markDupOrder += 1; + yarn::TWIST(pipeArg.markDupSig, yarn::BY, 1); + } + cout << "end pipe markdup" << endl; + return 0; +} +static void *pipeIntersect(void *data) { + PipelineArg &pipeArg = *(PipelineArg *)data; + pipeArg.intersectOrder = 1; + while (1) { + PROF_START(intersect_wait); + yarn::POSSESS(pipeArg.markDupSig); + yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, 1); // 等待有数据 + yarn::RELEASE(pipeArg.markDupSig); + PROF_END(gprof[GP_intersect_wait], intersect_wait); + + if (pipeArg.markDupFinish) { + while (pipeArg.intersectOrder < pipeArg.markDupOrder) { + PROF_START(intersect); + doIntersect(pipeArg); + PROF_END(gprof[GP_intersect], intersect); + pipeArg.intersectOrder += 1; + } + break; + } + /* 交叉数据处理 readends */ + PROF_START(intersect); + doIntersect(pipeArg); + PROF_END(gprof[GP_intersect], intersect); + + yarn::POSSESS(pipeArg.markDupSig); + 
yarn::TWIST(pipeArg.markDupSig, yarn::BY, -1); + + pipeArg.intersectOrder += 1; + } + cout << "end pipe intersect" << endl; + return 0; +} + +/* 当所有任务结束后,global data里还有未处理的数据 */ +static void mergeAllTask(PipelineArg &pipeArg) { + MarkDupData &lp = pipeArg.markDupData[(pipeArg.markDupOrder - 1) % pipeArg.MARKBUFNUM]; + IntersectData &g = pipeArg.intersectData; + SortMarkData &lpSM = *(SortMarkData *)lp.dataPtr; + // 遗留的未匹配的pair + for (auto &prevUnpair : lpSM.unpairedDic) { // 遍历上一个任务中的每个未匹配的read + auto &readName = prevUnpair.first; + auto &prevPosInfo = prevUnpair.second; + auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end + + if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read + auto &remainPosInfo = g.unpairedDic[readName]; + auto remainFragEnd = remainPosInfo.unpairedRE; + int64_t remainPosKey = remainFragEnd.posKey; + modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read + auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey]; + + remainUnpairInfo.pairArr.push_back(remainFragEnd); + g.unpairedDic.erase(readName); + } else { + g.unpairedDic.insert(prevUnpair); // 用来记录没有匹配的read个数 + } + } + + map taskChanged; + for (auto &e : g.unpairedPosArr) { + auto posKey = e.first; + auto taskSeq = e.second.taskSeq; + auto &t = taskChanged[taskSeq]; + auto &arr = g.unpairedPosArr[posKey].pairArr; + + if (arr.size() > 1) { + std::sort(arr.begin(), arr.end()); + processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx); + } + } + // 更新结果 + vector addDup; + map ndPosVal; + for (auto &e : taskChanged) { + auto taskSeq = e.first; + auto &t = e.second; + refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, + g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], + g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], + g.latterNotRepIdxArr[taskSeq]); + } + 
g.unpairedPosArr.clear(); + + // 将dupidx放进全局数据 + vector cacheDupIdx; + vector midArr; + vector intCacheDupIdx; + vector intMidArr; + for (int i = 0; i < (int)g.dupIdxArr.size() - 1; ++i) { + refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i], cacheDupIdx, midArr); + } + for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i) + refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i], + intCacheDupIdx, intMidArr); + for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i) + refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr); + + g.dupIdxArr.push_back(vector()); + auto &vIdx = g.dupIdxArr.back(); + lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); + vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); + std::sort(vIdx.begin(), vIdx.end()); + + g.opticalDupIdxArr.push_back(vector()); + auto &vOpticalIdx = g.opticalDupIdxArr.back(); + vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); + std::sort(vOpticalIdx.begin(), vOpticalIdx.end()); + + g.repIdxArr.push_back(vector()); + auto &vRepIdx = g.repIdxArr.back(); + vRepIdx.insert(vRepIdx.end(), lp.pairRepIdx.begin(), lp.pairRepIdx.end()); + std::sort(vRepIdx.begin(), vRepIdx.end()); +} + +static void parallelPipeline() { + PipelineArg &pipeArg = nsgv::gPipe; + pipeArg.numThread = nsgv::gMdArg.NUM_THREADS; + + pthread_t tidArr[5]; + pthread_create(&tidArr[0], 0, pipeRead, &pipeArg); + pthread_create(&tidArr[1], 0, pipeGenRE, &pipeArg); + pthread_create(&tidArr[2], 0, pipeSort, &pipeArg); + pthread_create(&tidArr[3], 0, pipeMarkDup, &pipeArg); + pthread_create(&tidArr[4], 0, pipeIntersect, &pipeArg); + for (int i = 0; i < 5; ++i) pthread_join(tidArr[i], 0); + PROF_START(merge_result); + mergeAllTask(pipeArg); + PROF_END(gprof[GP_merge_result], merge_result); + + int64_t dupNum = 0; + int64_t opticalDupNum = 
0; + for (auto &arr : pipeArg.intersectData.dupIdxArr) dupNum += arr.size(); + for (auto &arr : pipeArg.intersectData.opticalDupIdxArr) opticalDupNum += arr.size(); + + map dup; +#if 0 + int taskSeq = 0; + for (auto &arr : pipeArg.intersectData.dupIdxArr) { + for (auto idx : arr) { + if (dup.find(idx.idx) != dup.end()) { + //if (taskSeq - 1 > dup[idx]) + cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t' << idx << endl; + } + dup[idx.idx] = taskSeq; + } + // cout << taskSeq << "\t" << arr.size() << endl; + taskSeq++; + } +#endif + + cout << "Final read order: " << pipeArg.readOrder << endl; + cout << "Final gen order: " << pipeArg.genREOrder << endl; + cout << "Final sort order: " << pipeArg.sortOrder << endl; + cout << "Final mark order: " << pipeArg.markDupOrder << endl; + cout << "Final inter order: " << pipeArg.intersectOrder << endl; + +// cout << "w read time : " << tm_arr[10].acc_seconds_elapsed() << endl; +// cout << "w gen time : " << tm_arr[11].acc_seconds_elapsed() << endl; +// cout << "w sort time : " << tm_arr[12].acc_seconds_elapsed() << endl; +// cout << "w markdup time : " << tm_arr[13].acc_seconds_elapsed() << endl; +// cout << "w intersect time: " << tm_arr[14].acc_seconds_elapsed() << endl; +// +// cout << "w1 gen time : " << tm_arr[21].acc_seconds_elapsed() << endl; +// cout << "w1 sort time : " << tm_arr[22].acc_seconds_elapsed() << endl; +// cout << "w1 markdup time : " << tm_arr[23].acc_seconds_elapsed() << endl; +// +// cout << "read time : " << tm_arr[0].acc_seconds_elapsed() << endl; +// cout << "gen time : " << tm_arr[1].acc_seconds_elapsed() << endl; +// cout << "sort time : " << tm_arr[2].acc_seconds_elapsed() << endl; +// cout << "markdup time : " << tm_arr[3].acc_seconds_elapsed() << endl; +// cout << "intersect time: " << tm_arr[4].acc_seconds_elapsed() << endl; +// +// cout << "copy time: " << tm_arr[5].acc_seconds_elapsed() << endl; +// cout << "merge al6 time: " << tm_arr[6].acc_seconds_elapsed() << endl; +// +// cout << 
"dup num : " << dupNum << "\t" << dup.size() << endl; +// cout << "optical dup num : " << opticalDupNum / 2 << "\t" << opticalDupNum << endl; + +} + +/* 并行流水线方式处理数据,标记冗余 */ +void pipelineMarkDups() { + if (nsgv::gMdArg.NUM_THREADS > 1) + return parallelPipeline(); + + PipelineArg &pipeArg = nsgv::gPipe; + pipeArg.numThread = nsgv::gMdArg.NUM_THREADS; + BamBufType inBamBuf(nsgv::gMdArg.DUPLEX_IO); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gMdArg.MAX_MEM); + int64_t readNumSum = 0; + for (int i = 0; i < pipeArg.GENBUFNUM; ++i) { + pipeArg.genREData[i].Init(pipeArg.numThread); + } + pipeArg.intersectOrder = 1; // do intersect 从1开始 + while (1) { + size_t readNum = 0; + if (inBamBuf.ReadStat() >= 0) + readNum = inBamBuf.ReadBam(); // 读入新一轮的数据 + if (readNum < 1) { + break; + } + cout << "read num: " << readNum << "\t" << readNumSum << '\t' << pipeArg.readOrder << endl; + + pipeArg.readData.bams = inBamBuf.GetBamArr(); + pipeArg.readData.bamStartIdx = readNumSum; + pipeArg.readData.taskSeq = pipeArg.readOrder; + // 1. do generate read ends + doGenRE(pipeArg); + pipeArg.genREOrder += 1; + // 2. do sort + doSort(pipeArg); + pipeArg.sortOrder += 1; + // 3. do markduplicate + doMarkDup(pipeArg); + pipeArg.markDupOrder += 1; + // 4. 
do intersect data + if (pipeArg.markDupOrder > 1) { + doIntersect(pipeArg); + pipeArg.intersectOrder += 1; + } + + readNumSum += readNum; + inBamBuf.ClearAll(); // 清理上一轮读入的数据 + pipeArg.readOrder += 1; // for next + } + mergeAllTask(pipeArg); + + int64_t dupNum = 0; + int64_t opticalDupNum = 0; + for (auto &arr : pipeArg.intersectData.dupIdxArr) dupNum += arr.size(); + for (auto &arr : pipeArg.intersectData.opticalDupIdxArr) opticalDupNum += arr.size(); + + map dup; +#if 0 + int taskSeq = 0; + for (auto &arr : pipeArg.intersectData.dupIdxArr) { + for (auto idx : arr) { + if (dup.find(idx.idx) != dup.end()) { + cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t' << idx << endl; + } + dup[idx.idx] = taskSeq; + } + taskSeq++; + } +#endif + + cout << "total reads: " << readNumSum << endl; + cout << "Final read order: " << pipeArg.readOrder << endl; + cout << "Final gen order: " << pipeArg.genREOrder << endl; + cout << "Final sort order: " << pipeArg.sortOrder << endl; + cout << "Final mark order: " << pipeArg.markDupOrder << endl; + cout << "Final inter order: " << pipeArg.intersectOrder << endl; + cout << "dup num : " << dupNum << "\t" << dup.size() << endl; + cout << "optical dup num : " << opticalDupNum / 2 << "\t" << opticalDupNum << endl; +} \ No newline at end of file diff --git a/src/markdup/pipeline_md.h b/src/markdup/pipeline_md.h index e69de29..ba925af 100644 --- a/src/markdup/pipeline_md.h +++ b/src/markdup/pipeline_md.h @@ -0,0 +1,174 @@ +#pragma once + +#include +#include + +#include "md_types.h" + +struct ReadData { + vector bams; // read step output + int64_t bamStartIdx = 0; // 每轮读入的bam数组,起始位置在全局bam中的索引 + int64_t taskSeq = 0; // 任务序号 +}; + +struct GenREData { + vector> pairsArr; // 成对的reads + vector> fragsArr; // 暂未找到配对的reads + vector unpairedDicArr; // 用来寻找pair end + void Init(int nThread) { + for (int i = 0; i <= nThread; ++i) { // 比线程多一个,主要是pairs多一个,其他没用 + pairsArr.push_back(vector()); + fragsArr.push_back(vector()); + 
unpairedDicArr.push_back(UnpairedNameMap()); + } + } + UnpairedNameMap unpairedDic; // 代替sort step中一部分计算 + UnpairedPositionMap unpairedPosArr; // +}; + +struct SortMarkData { + vector pairs; // 成对的reads + vector frags; // 暂未找到配对的reads + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 +}; + +struct SortData { + volatile void *dataPtr; // SortMarkData pointer +}; + +struct MarkDupData { + int64_t taskSeq = 0; // 任务序号 + DPSet pairDupIdx; // pair的冗余read的索引 + MDSet pairOpticalDupIdx; // optical冗余read的索引 + DPSet fragDupIdx; // frag的冗余read的索引 + DPSet pairRepIdx; // pair的dupset代表read的索引 + + volatile void *dataPtr; // SortMarkData pointer +}; + +struct IntersectData { + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; + + // 每个task对应一个vector + vector> dupIdxArr; + vector> opticalDupIdxArr; + vector> repIdxArr; + + // 用来存放后续计算的数据 + vector> latterDupIdxArr; + vector> latterOpticalDupIdxArr; + vector> latterRepIdxArr; + vector> latterNotDupIdxArr; + vector> latterNotOpticalDupIdxArr; + vector> latterNotRepIdxArr; +}; + +// 记录流水线状态,task的序号,以及某阶段是否结束 +struct PipelineArg { + static const int GENBUFNUM = 2; + static const int SORTBUFNUM = 2; + static const int MARKBUFNUM = 3; + uint64_t readOrder = 0; + uint64_t genREOrder = 0; + uint64_t sortOrder = 0; + uint64_t markDupOrder = 0; + uint64_t intersectOrder = 0; + int numThread = 0; + + volatile int readFinish = 0; + volatile int genREFinish = 0; + volatile int sortFinish = 0; + volatile int markDupFinish = 0; + + yarn::lock_t *readSig; + yarn::lock_t *genRESig; + yarn::lock_t *sortSig; + yarn::lock_t *markDupSig; + + PipelineArg() { + readSig = yarn::NEW_LOCK(0); // 最大值1, 双buffer在bambuf中实现了,对调用透明 + genRESig = yarn::NEW_LOCK(0); // 最大值2, 双buffer + sortSig = yarn::NEW_LOCK(0); + markDupSig = yarn::NEW_LOCK(0); + for (int i = 0; i < SORTBUFNUM; ++i) { + sortData[i].dataPtr = &sortMarkData[i]; + } + for (int i = 0; i < 
MARKBUFNUM; ++i) { + markDupData[i].dataPtr = &sortMarkData[i + SORTBUFNUM]; + } + } + + SortMarkData sortMarkData[SORTBUFNUM + MARKBUFNUM]; + + // for step-1 read + ReadData readData; + // for step-2 generate readends + GenREData genREData[GENBUFNUM]; + // for step-3 sort each thread frags and pairs + SortData sortData[SORTBUFNUM]; + // for step-4 mark duplicate + MarkDupData markDupData[MARKBUFNUM]; + // for step-5 deal with intersect data + IntersectData intersectData; +}; + +struct REArrIdIdx { + int arrId = 0; // 数组索引 + uint64_t arrIdx = 0; // 数组内部当前索引 + const ReadEnds *pE = nullptr; +}; + +struct REGreaterThan { + bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return *b.pE < *a.pE; } +}; + +struct ReadEndsHeap { + // 将冗余索引和他对应的task vector对应起来 + vector> *arr2d; + priority_queue, REGreaterThan> minHeap; + uint64_t popNum = 0; + + int Init(vector> *_arr2d) { + arr2d = _arr2d; + if (arr2d == nullptr) { + return -1; + } + for (int i = 0; i < arr2d->size(); ++i) { + auto &v = (*arr2d)[i]; + if (!v.empty()) { + minHeap.push({i, 1, &v[0]}); + } + } + return 0; + } + + const ReadEnds *Pop() { + const ReadEnds *ret = nullptr; + if (!minHeap.empty()) { + auto minVal = minHeap.top(); + minHeap.pop(); + ++popNum; + ret = minVal.pE; + auto &v = (*arr2d)[minVal.arrId]; + if (v.size() > minVal.arrIdx) { + minHeap.push({minVal.arrId, minVal.arrIdx + 1, &v[minVal.arrIdx]}); + } + } + return ret; + } + + uint64_t Size() { + uint64_t len = 0; + if (arr2d != nullptr) { + for (auto &v : *arr2d) { + len += v.size(); + } + } + return len - popNum; + } +}; + +// 并行运行mark duplicate +void pipelineMarkDups(); \ No newline at end of file diff --git a/src/markdup/read_ends.h b/src/markdup/read_ends.h index e69de29..4bcf89c 100644 --- a/src/markdup/read_ends.h +++ b/src/markdup/read_ends.h @@ -0,0 +1,170 @@ +/* +Description: read +ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等 + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2023/11/3 +*/ +#pragma once + 
+#include +#include +#include + +/** + * Small interface that provides access to the physical location information + * about a cluster. All values should be defaulted to -1 if unavailable. + * ReadGroup and Tile should only allow non-zero positive integers, x and y + * coordinates may be negative. + */ +struct PhysicalLocation { + static const int NO_VALUE = -1; + /** + * Small class that provides access to the physical location information + * about a cluster. All values should be defaulted to -1 if unavailable. + * Tile should only allow non-zero positive integers, x and y coordinates + * must be non-negative. This is different from PhysicalLocationShort in + * that the x and y positions are ints, not shorts thus, they do not + * overflow within a HiSeqX tile. + */ + int16_t tile = -1; + // int32_t x = -1; + // int32_t y = -1; + int16_t x = -1; + int16_t y = -1; +}; + +/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/ +struct ReadEnds : PhysicalLocation { + static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5; + /* 保留一些bam记录中的数据 */ + bool read1FirstOfPair = true; + /* ReadEnds中的成员变量 */ + /** Little struct-like class to hold read pair (and fragment) end data for + * duplicate marking. */ + // int16_t libraryId; // 没用,不考虑多样本 + int8_t orientation = -1; + int32_t read1ReferenceIndex = -1; + int32_t read1Coordinate = -1; + int32_t read2ReferenceIndex = -1; + // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported) + int32_t read2Coordinate = -1; + /* Additional information used to detect optical dupes */ + // int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read + // group,normal或者tumor + /** For optical duplicate detection the orientation matters regard to 1st or + * 2nd end of a mate */ + int8_t orientationForOpticalDuplicates = -1; + /** A *transient* flag marking this read end as being an optical duplicate. 
+ */ + bool isOpticalDuplicate = false; + + /* ReadEndsForMarkDuplicates中的成员变量 */ + /** Little struct-like class to hold read pair (and fragment) end data for + * MarkDuplicatesWithMateCigar **/ + int16_t score = 0; + int64_t read1IndexInFile = -1; + int64_t read2IndexInFile = -1; + int64_t duplicateSetSize = -1; + + /* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */ + // int32_t barcode = 0; // primary barcode for this read (and pair) + // int32_t readOneBarcode = 0; // read one barcode, 0 if not present + // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not + // paired + + /* zzh增加的成员变量 */ + int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << + // MAX_CONTIG_LEN_SHIFT | (int64_t)pos; + + /* 用来做一些判断,因为一些readends会做多次操作,比如task之间有重叠等等 */ + int oprateTime = 0; + + /* 根据pairend read的比对方向,来确定整体的比对方向 */ + static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand) { + if (read1NegativeStrand) { + if (read2NegativeStrand) + return RR; + else + return RF; + } else { + if (read2NegativeStrand) + return FR; + else + return FF; + } + } + + /* 比较两个readends是否一样(有个冗余) */ + static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2) { + bool areComparable = true; + areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex && + lhs.read1Coordinate == rhs.read1Coordinate && lhs.orientation == rhs.orientation; + if (areComparable && compareRead2) { + areComparable = + lhs.read2ReferenceIndex == rhs.read2ReferenceIndex && lhs.read2Coordinate == rhs.read2Coordinate; + } + return areComparable; + } + + /* 比对方向是否正向 */ + bool IsPositiveStrand() const { return orientation == F; } + + /* pairend是否合适的比对上了 */ + bool IsPaired() const { return read2ReferenceIndex != -1; } + + bool IsNegativeStrand() const { return orientation == R; } + + // 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来 + static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = 
false) { + int comp = a.read1ReferenceIndex - b.read1ReferenceIndex; + if (comp == 0) + comp = a.read1Coordinate - b.read1Coordinate; + if (comp == 0) + comp = a.orientation - b.orientation; + if (compareRead2) { + if (comp == 0) + comp = a.read2ReferenceIndex - b.read2ReferenceIndex; + if (comp == 0) + comp = a.read2Coordinate - b.read2Coordinate; + } + return comp < 0; + } + + // 找某一个位置的所有readend时需要 + static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) { return ReadLittleThan(lhs, rhs, true); } + + // 比较函数 + bool operator<(const ReadEnds &o) const { + int comp = read1ReferenceIndex - o.read1ReferenceIndex; + if (comp == 0) + comp = read1Coordinate - o.read1Coordinate; + if (comp == 0) + comp = orientation - o.orientation; + if (comp == 0) + comp = read2ReferenceIndex - o.read2ReferenceIndex; + if (comp == 0) + comp = read2Coordinate - o.read2Coordinate; + if (comp == 0) + comp = tile - o.tile; + if (comp == 0) + comp = x - o.x; + if (comp == 0) + comp = y - o.y; + if (comp == 0) + comp = (int)(read1IndexInFile - o.read1IndexInFile); + if (comp == 0) + comp = (int)(read2IndexInFile - o.read2IndexInFile); + return comp < 0; + } +}; + +struct ReadEndsHash { + std::size_t operator()(const ReadEnds &o) const { return std::hash()(o.read1IndexInFile); } +}; + +struct ReadEndsEqual { + bool operator()(const ReadEnds &o1, const ReadEnds &o2) const { return o1.read1IndexInFile == o2.read1IndexInFile; } +}; \ No newline at end of file diff --git a/src/markdup/read_name_parser.h b/src/markdup/read_name_parser.h index e69de29..65d35bd 100644 --- a/src/markdup/read_name_parser.h +++ b/src/markdup/read_name_parser.h @@ -0,0 +1,226 @@ +/* +Description: 解析read的name中的信息,比如tile, x, y等 + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2023/11/6 +*/ +#ifndef READ_NAME_PARSER_H_ +#define READ_NAME_PARSER_H_ + +#include +#include + +#include +#include + +#include "read_ends.h" + +// using std::regex; +using std::cmatch; +using std::regex; 
+using std::string; + + + +/** + * Provides access to the physical location information about a cluster. + * All values should be defaulted to -1 if unavailable. ReadGroup and Tile + * should only allow non-zero positive integers, x and y coordinates may be + * negative. 非线程安全 + */ +struct ReadNameParser { + /** + * The read name regular expression (regex) is used to extract three pieces + * of information from the read name: tile, x location, and y location. Any + * read name regex should parse the read name to produce these and only + * these values. An example regex is: + * (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$ + * which assumes that fields in the read name are delimited by ':' and the + * last three fields correspond to the tile, x and y locations, ignoring any + * trailing non-digit characters. + * + * The default regex is optimized for fast parsing (see {@link + * #getLastThreeFields(String, char, int[])}) by searching for the last + * three fields, ignoring any trailing non-digit characters, assuming the + * delimiter ':'. This should consider correctly read names where we have 5 + * or 7 field with the last three fields being tile/x/y, as is the case for + * the majority of read names produced by Illumina technology. + */ + const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$"; + bool warnAboutRegexNotMatching = true; + + static bool sWrongNameFormat; + + string readNameStored = ""; + PhysicalLocation physicalLocationStored; + int tmpLocationFields[3]; // for optimization of addLocationInformation + bool useOptimizedDefaultParsing = true; // was the regex default? 
+ string readNameRegex = DEFAULT_READ_NAME_REGEX; + regex readNamePattern; + + ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {} + ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {} + ReadNameParser(const string &strReadNameRegex, bool isWarn) { + readNameRegex = strReadNameRegex; + if (strReadNameRegex == DEFAULT_READ_NAME_REGEX) + useOptimizedDefaultParsing = true; + else + useOptimizedDefaultParsing = false; + readNamePattern = std::regex(strReadNameRegex, std::regex_constants::optimize); + warnAboutRegexNotMatching = isWarn; + } + + /* 重新设置readNameRegex */ + void SetReadNameRegex(const string &strReadNameRegex) { + readNameRegex = strReadNameRegex; + if (strReadNameRegex == DEFAULT_READ_NAME_REGEX) + useOptimizedDefaultParsing = true; + else + useOptimizedDefaultParsing = false; + readNamePattern = std::regex(strReadNameRegex, std::regex_constants::optimize); + // readNamePattern = strReadNameRegex; + } + + /* 添加测序时候的tile x y 信息 */ + bool AddLocationInformation(const string &readName, PhysicalLocation *loc) { + if (!(readName == readNameStored)) { + if (ReadLocationInformation(readName, loc)) { + readNameStored = readName; + physicalLocationStored = *loc; + return true; + } + // return false if read name cannot be parsed + return false; + } else { + *loc = physicalLocationStored; + return true; + } + } + + /** + * Method used to extract tile/x/y from the read name and add it to the + * PhysicalLocationShort so that it can be used later to determine optical + * duplication + * + * @param readName the name of the read/cluster + * @param loc the object to add tile/x/y to + * @return true if the read name contained the information in parsable form, + * false otherwise + */ + bool ReadLocationInformation(const string &readName, PhysicalLocation *loc) { + try { + // Optimized version if using the default read name regex (== used on purpose): + if (useOptimizedDefaultParsing) { + const int fields = 
getLastThreeFields(readName, ':'); + if (!(fields == 5 || fields == 7)) { + if (warnAboutRegexNotMatching) { + spdlog::warn( + "Default READ_NAME_REGEX '{}' did not match read " + "name '{}'." + "You may need to specify a READ_NAME_REGEX in " + "order to correctly identify optical duplicates. " + "Note that this message will not be emitted again " + "even if other read names do not match the regex.", + readNameRegex.c_str(), readName.c_str()); + warnAboutRegexNotMatching = false; + sWrongNameFormat = true; + } + return false; + } + loc->tile = (int16_t)tmpLocationFields[0]; + loc->x = tmpLocationFields[1]; + loc->y = tmpLocationFields[2]; + return true; + } else if (readNameRegex.empty()) { + return false; + } else { + // Standard version that will use the regex + cmatch m; + // cout << "here1" << endl; + if (std::regex_match(readName.c_str(), m, readNamePattern)) { + loc->tile = std::stoi(m[1].str()); + loc->x = std::stoi(m[2].str()); + loc->y = std::stoi(m[3].str()); + return true; + } else { + if (warnAboutRegexNotMatching) { + spdlog::warn( + "READ_NAME_REGEX '{}' did not match read name '{}'." + "Your regex may not be correct. " + "Note that this message will not be emitted again " + "even if other read names do not match the regex.", + readNameRegex.c_str(), readName.c_str()); + warnAboutRegexNotMatching = false; + sWrongNameFormat = true; + } + return false; + } + } + } catch (const std::runtime_error &e) { + if (warnAboutRegexNotMatching) { + spdlog::warn( + "A field parsed out of a read name was expected to contain " + "an integer and did not. READ_NAME_REGEX: {}; Read name: " + "{}; Error Msg: {}", + readNameRegex.c_str(), readName.c_str(), e.what()); + warnAboutRegexNotMatching = false; + sWrongNameFormat = true; + } + } catch (...) { + if (warnAboutRegexNotMatching) { + spdlog::warn( + "A field parsed out of a read name was expected to contain " + "an integer and did not. 
READ_NAME_REGEX: {}; Read name: " + "{}", + readNameRegex.c_str(), readName.c_str()); + warnAboutRegexNotMatching = false; + sWrongNameFormat = true; + } + } + + return true; + } + + /** + * Given a string, splits the string by the delimiter, and returns the the + * last three fields parsed as integers. Parsing a field considers only a + * sequence of digits up until the first non-digit character. The three + * values are stored in the passed-in array. + * + * @throws NumberFormatException if any of the tokens that should contain + * numbers do not start with parsable numbers + */ + int getLastThreeFields(const string &readName, char delim) { + int tokensIdx = 2; // start at the last token + int numFields = 0; + int i, endIdx; + endIdx = readName.size(); + // find the last three tokens only + for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) { + if (readName.at(i) == delim || 0 == i) { + numFields++; + const int startIdx = (0 == i) ? 0 : (i + 1); + // cout << readName << endl; + tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx)); + tokensIdx--; + endIdx = i; + } + } + // continue to find the # of fields + while (0 <= i) { + if (readName.at(i) == delim || 0 == i) + numFields++; + i--; + } + if (numFields < 3) { + tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1; + numFields = -1; + } + + return numFields; + } +}; + +#endif \ No newline at end of file diff --git a/src/util/bam_buf.cpp b/src/util/bam_buf.cpp index e69de29..0473772 100644 --- a/src/util/bam_buf.cpp +++ b/src/util/bam_buf.cpp @@ -0,0 +1,248 @@ +/* + Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/27 +*/ + +#include "bam_buf.h" + +/* + * BamBuf类 + */ +// 读取数据直到读完,或者缓冲区满 +int BamBuf::ReadBam() { + int read_num = 0; + if (handle_last) { // 处理上次读入的最后一个bam + if (has_enough_space()) { // 必须调用,在边界处调整memffset + ++read_num; + append_one_bam(); + } else { + 
return read_num; // 还是没空间 + } + } + while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) { + bw->end_pos_ = BamWrap::BamEndPos(bw->b); + if (has_enough_space()) { // 还有空间 + // if (true) { // 还有空间 + append_one_bam(); + ++read_num; // 放进缓存才算读取到 + } else { + break; + } + } + if (read_stat_ >= 0) { + handle_last = true; + } else { + handle_last = false; + } + return read_num; +} + +// 初始化缓存 +void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { + this->fp = fp; + this->hdr = hdr; + this->mem_size = mem_size; + this->mem = (uint8_t *)malloc(mem_size); + this->bw = (BamWrap *)malloc(sizeof(BamWrap)); + this->bw->b = bam_init1(); + if (bw == NULL || this->mem == NULL || this->bw->b == NULL) { + fprintf(stderr, "allocate memory failed! Abort\n"); + exit(-1); + } +} + +void BamBuf::ClearBeforeIdx(size_t idxInBv) { + if (idxInBv < 1) + return; + int i = 0, j = idxInBv; + for (; j < bv.size(); ++i, ++j) { + bv[i] = bv[j]; + } + bv.resize(i); + prepare_read(); +} + +void BamBuf::ClearAll() { + bv.clear(); + prepare_read(); +} + +// 为下一次读取做准备, 计算一些边界条件 +inline void BamBuf::prepare_read() { + // 计算余留的下次计算可能用到的bam所占的位置 + if (bv.size() > 0) { + BamWrap *bw = bv[0]; + legacy_start = (int64_t)bw - (int64_t)mem; + bw = bv.back(); + legacy_end = (int64_t)bw + bw->length() - (int64_t)mem; + } else { + legacy_start = legacy_end = 0; + mem_offset = 0; // 上次没剩下,那就从头存储 + } +} + +// 检查缓存是否还有空间 +inline bool BamBuf::has_enough_space() { + const uint32_t bam_len = bw->length(); + int64_t potential_end = mem_offset + bam_len; + if (legacy_start <= legacy_end) + legacy_start += mem_size; + if (potential_end >= legacy_start) { + return false; + } + if (potential_end >= mem_size) { + mem_offset = 0; + } + int64_t virtual_offset = mem_offset; + if (virtual_offset < legacy_end) + virtual_offset += mem_size; + potential_end = virtual_offset + bam_len; + return potential_end < legacy_start; +} + +// 处理一个读取后的bam +inline void BamBuf::append_one_bam() { + BamWrap *bwp = 
(BamWrap *)(mem + mem_offset); + *bwp = *bw; + bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp)); + bam1_t *bp = bwp->b; + *bp = *bw->b; + bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t)); + memcpy(bp->data, bw->b->data, bw->b->l_data); + // 更新下次存储的位置 + mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1)); + bv.push_back(bwp); +} + +// 处理上次读入的最后一个read +inline bool BamBuf::handle_last_read() { + if (handle_last) { // 处理上次读入的最后一个bam + if (has_enough_space()) { // 必须调用,在边界处调整memffset + append_one_bam(); + handle_last = false; + return true; + } + } + return false; +} + +/* + * AsyncIoBamBuf 类 + */ +// 初始化缓存 +void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { + if (use_async_io_) { + buf1_.Init(fp, hdr, mem_size >> 1); + buf2_.Init(fp, hdr, mem_size >> 1); + pi_ = &buf1_; + po_ = &buf2_; + tid_ = (pthread_t *)malloc(sizeof(pthread_t)); + } else { + buf1_.Init(fp, hdr, mem_size); + pi_ = &buf1_; + } +} + +// 读取数据 +int AsyncIoBamBuf::ReadBam() { + if (use_async_io_) { + hasThread = true; + return async_read_bam(); + } else { + return sync_read_bam(); + } +} + +int AsyncIoBamBuf::sync_read_bam() { + int read_num = 0; + if (clear_all_) { + clear_all_ = false; + pi_->ClearAll(); + } else if (clear_before_idx_ > 0) { + pi_->ClearBeforeIdx(clear_before_idx_); + clear_before_idx_ = 0; + } + read_num = pi_->ReadBam(); + refresh_bam_arr(); + return read_num; +} + +int AsyncIoBamBuf::async_read_bam() { + int read_num = 0; + if (first_read_) { + read_num = pi_->ReadBam(); + first_read_ = false; + refresh_bam_arr(); + } else { + // join, 交换缓冲区指针 + pthread_join(*tid_, 0); + resize_buf(); + + if (need_read_) { // 需要交换指针 + BamBuf *tmp = pi_; + pi_ = po_; + po_ = tmp; + } + read_num = last_read_num_; + refresh_bam_arr(); + } + // 异步读 + pthread_create(tid_, 0, async_read, this); + return read_num; +} + +void *AsyncIoBamBuf::async_read(void *data) { + AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data; + if (ab->need_read_ && ab->ReadStat() >= 0) { // 
需要读取 + ab->last_read_num_ = ab->po_->ReadBam(); + } else { + ab->last_read_num_ = 0; + } + pthread_exit(0); +} + +// 为下一次读取做准备, +// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取 +void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; } + +// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取 +void AsyncIoBamBuf::ClearAll() { clear_all_ = true; } + +inline void AsyncIoBamBuf::resize_buf() { + if (clear_all_) { // 清理上一轮的数据 + clear_all_ = false; + po_->ClearBeforeIdx(legacy_size_); + pi_->ClearAll(); + if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 + last_read_num_ += 1; + legacy_size_ = pi_->Size(); // 应该只有一个read + need_read_ = true; + } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 + legacy_size_ = 0; + need_read_ = false; + } + } else if (clear_before_idx_ > 0) { + if (clear_before_idx_ < legacy_size_) { + po_->ClearBeforeIdx(clear_before_idx_); + legacy_size_ -= clear_before_idx_; + // 不需要交换指针,不需要读取 + need_read_ = false; + } else { + po_->ClearBeforeIdx(legacy_size_); + pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_); + if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 + last_read_num_ += 1; + legacy_size_ = pi_->Size(); // 应该只有一个read + need_read_ = true; + } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 + legacy_size_ = 0; + need_read_ = false; + } + } + clear_before_idx_ = 0; + } +} \ No newline at end of file diff --git a/src/util/bam_buf.h b/src/util/bam_buf.h index e69de29..ee6a3c8 100644 --- a/src/util/bam_buf.h +++ b/src/util/bam_buf.h @@ -0,0 +1,172 @@ +/* + Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/27 +*/ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "bam_wrap.h" + +using std::vector; +using namespace std; + +/* + * 存放读入的bam数据 + */ +struct BamBuf { + sam_hdr_t *hdr; // sam文件的header信息 + samFile *fp; // sam文件指针 + BamWrap *bw = nullptr; // 用来循环读入bam + uint8_t *mem = nullptr; // 用来存放bam的数据, + 
// 程序结束后自动释放,所以没在析构函数里释放 + int64_t mem_offset = 0; // 下一次要存放的位置 + int64_t mem_size; // 缓存大小 + int read_stat_ = 0; // 读取状态,是否读完 + vector bv; // 方便对bam数据的访问 + int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间 + int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间 + bool handle_last = false; // 上次最后读入的bam是否需要处理 + + // 初始化缓存 + void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); + // 读取数据直到读完,或者缓冲区满 + int ReadBam(); + // 为下一次读取做准备, 计算一些边界条件 + void ClearBeforeIdx(size_t idxInBv); + // 清空上一次所有读入的数据 + void ClearAll(); + inline int64_t Size() { return bv.size(); } // 包含多少个read + inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全) + ~BamBuf() { + if (this->mem != nullptr) { + free(this->mem); + } + if (this->bw != nullptr) { + bam_destroy1(bw->b); + free(bw); + } + } + void FreeMemory() // 释放开辟的内存 + { + if (this->mem != nullptr) { + free(this->mem); + } + if (this->bw != nullptr) { + bam_destroy1(bw->b); + free(bw); + } + this->mem = nullptr; + this->bw = nullptr; + } + void prepare_read(); + // 检查缓存是否还有空间 + bool has_enough_space(); + // 处理一个读取后的bam + void append_one_bam(); + // 处理上次读入的最后一个read + bool handle_last_read(); + + // 针对bv的操作 + inline BamWrap *operator[](int64_t pos) { return bv[pos]; } + inline void push_back(BamWrap *val) { bv.push_back(val); } + inline void clear() { bv.clear(); } + inline void resize(int64_t s) { bv.resize(s); } +}; + +/* + * io异步缓冲区 + */ +struct AsyncIoBamBuf { + BamBuf buf1_; + BamBuf buf2_; + BamBuf *pi_; // 当前用的buf + BamBuf *po_; // 后台在读取的buf + pthread_t *tid_ = NULL; + bool hasThread = false; + int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量 + bool first_read_ = true; + int last_read_num_ = 0; // 上一次读取了多少reads + bool need_read_ = true; + bool use_async_io_ = true; + int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads + bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads + + vector bam_arr_; // 用来访问buf中的bam + + AsyncIoBamBuf() {} + AsyncIoBamBuf(bool use_async) : 
use_async_io_(use_async) {} + // 析构 + ~AsyncIoBamBuf() { + if (tid_ != NULL) { + if (hasThread) + pthread_join(*tid_, 0); + free(tid_); + } + // 其他的内存就等程序结束自动释放 + // buf的析构函数会自动调用 + } + + // 初始化缓存 + void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); + + // 读取数据 + int ReadBam(); + // 为下一次读取做准备, 计算一些边界条件 + void ClearBeforeIdx(size_t idxInBv); + vector &GetBamArr() { return bam_arr_; } // 获取bam array + // 清空上一次所有读入的数据 + void ClearAll(); + // 包含的read数量 + inline int64_t Size() { return legacy_size_ + pi_->Size(); } + inline int ReadStat() { return pi_->read_stat_; } + inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; } + // 获取某一段reads + inline vector Slice(size_t startIdx, size_t endIdx) { + if (endIdx > startIdx) { + auto begItr = bam_arr_.begin(); + return std::move(vector(begItr + startIdx, begItr + endIdx)); + } + return std::move(vector()); + } + void FreeMemory() { + buf1_.FreeMemory(); + buf2_.FreeMemory(); + } + + // 同步读取 + int sync_read_bam(); + // 异步读取 + int async_read_bam(); + // 异步读取线程函数 + static void *async_read(void *data); + void resize_buf(); + inline void refresh_bam_arr() { + bam_arr_.resize(this->Size()); + for (int i = 0; i < bam_arr_.size(); ++i) { + if (i < legacy_size_) + bam_arr_[i] = (*po_)[i]; + else + bam_arr_[i] = (*pi_)[i - legacy_size_]; + } + } +}; + +typedef AsyncIoBamBuf BamBufType; + +typedef vector BamArray; \ No newline at end of file diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index e69de29..3e5a20f 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -0,0 +1,369 @@ +/* + Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/27 +*/ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + +/* + 这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明 + 这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。 +*/ +/* + * sam read的封装 + */ +struct BamWrap { + // 将contig左移后加上pos作为全局位置 + 
const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起 + const static int READ_MAX_LENGTH = 200; + const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系 + + // 成员变量尽量少,减少占用内存空间 + bam1_t *b; + int64_t end_pos_; // bam的全局结束位置, 闭区间 + + // 全局开始位置 + inline int64_t start_pos() { return bam_global_pos(b); } + // 全局结束位置 + inline int64_t end_pos() { return end_pos_; } + // 和reference对应的序列长度 + inline int16_t read_len() { return (end_pos_ - start_pos() + 1); } + + // 在contig内的开始位置 + inline int32_t contig_pos() { return b->core.pos; } + // 在contig内部的结束位置 + inline int32_t contig_end_pos() { return bam_pos(end_pos_); } + // 序列的长度(AGTC字母个数) + inline int16_t seq_len() { return b->core.l_qseq; } + + // 算上开头的softclip + inline int32_t softclip_start() { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + const char c = bam_cigar_opchr(cigar[0]); + const int len = bam_cigar_oplen(cigar[0]); + if (c == 'S') + return bc.pos - len; + return bc.pos; + } + + // 算上结尾的softclip + inline int32_t softclip_end() { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]); + const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]); + if (c == 'S') + return bam_pos(end_pos_) + len; + return bam_pos(end_pos_); + } + + // 算上结尾的softclip + inline int32_t right_softclip_len() { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]); + const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]); + if (c == 'S') + return len; + return 0; + } + + // 获取序列 + inline std::string sequence() { + ostringstream oss; + char *seq = (char *)bam_get_seq(b); + const bam1_core_t &bc = b->core; + const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; + for (int i = 0; i < bc.l_qseq; ++i) { + char base = base_to_char[bam_seqi(seq, i)]; + oss << base; + } + return 
std::move(oss.str()); + } + + // 获取名字 + inline const char *query_name() { return bam_get_qname(b); } + // 获取cigar 字符串 + inline string cigar_str() { + ostringstream oss; + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + oss << len << c; + } + return std::move(oss.str()); + } + + // 占用的内存大小 + inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; } + + // 获取cigar中insert的总长度 + inline int32_t insert_cigar_len() { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + int ret = 0; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'I') + ret += len; + } + return ret; + } + + // 获取cigar中delete的总长度 + inline int32_t del_cigar_len() { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + int ret = 0; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'D') + ret += len; + } + return ret; + } + + // 计算sam read的终点位置 + static inline int64_t BamEndPos(const bam1_t *b) { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + int start_offset = -1; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X') + start_offset += len; + } + return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset)); + }; + + bool HasWellDefinedFragmentSize() { + const bam1_core_t &bc = b->core; + bool hasWellDefinedFragmentSize = true; + if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) || + ((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & 
BAM_FMREVERSE))) { + hasWellDefinedFragmentSize = false; + } else if (bc.flag & BAM_FREVERSE) { + hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false; + } else { + hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false; + } + return hasWellDefinedFragmentSize; + } + + // 计算bam的adapterBoundary + int GetAdapterBoundary() { + const bam1_core_t &bc = b->core; + int adapterBoundary; + if (!HasWellDefinedFragmentSize()) + adapterBoundary = INT_MIN; + else if (bc.flag & BAM_FREVERSE) + adapterBoundary = bc.mpos - 1; + else + adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1 + return adapterBoundary; + } + + // 获取开头的I的长度 + inline int GetHeadInsertLen() { + int insLen = 0; + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'I') { + insLen = len; + break; + } else if (c != 'H' && c != 'S') + break; + } + return insLen; + } + + // 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, + // 注意开头的I要当做S?) 
+ inline int64_t GetSoftStart() { + int64_t softStart = b->core.pos; + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'S' || c == 'I') + softStart -= len; + else if (c != 'H') + break; + } + return softStart; + } + + // 获取unclipped开始位置(包括hardclip) + inline int64_t GetUnclippedStart() { + int64_t start = b->core.pos; + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'S' || c == 'H') + start -= len; + else + break; + } + return start; + } + + // 获取unclipped结束位置(包括hardclip) + inline int64_t GetUnclippedEnd() { + int64_t end_pos = bam_endpos(b); + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + for (int i = bc.n_cigar - 1; i >= 0; --i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'S' || c == 'H') + end_pos += len; + else + break; + } + return end_pos - 1; + } + + /* 获取碱基质量分数的加和 */ + /** Calculates a score for the read which is the sum of scores over Q15. */ + inline int GetSumOfBaseQualities() { + int score = 0; + uint8_t *qual = bam_get_qual(b); + for (int i = 0; i < b->core.l_qseq; ++i) { + if (qual[i] >= 15) + score += qual[i]; + } + + return score; + } + + /* 与flag相关的检测 */ + + /* 没有比对上 unmapped */ + inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; } + + /* Template having multiple segments in sequencing */ + inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; } + + /** + * the read fails platform/vendor quality checks. + */ + inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; } + + /** + * the mate is unmapped. 
+ */ + bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; } + + /** + * @return whether the alignment is secondary (an alternative alignment of + * the read). + */ + bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; } + + /** + * @return whether the alignment is supplementary (a split alignment such as + * a chimeric alignment). + */ + bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; } + + /* + * Tests if this record is either a secondary and/or supplementary + * alignment; + */ + bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); } + + /** + * the read is the first read in a pair. + */ + bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; } + + /** + * strand of the query (false for forward; true for reverse strand). + */ + bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; } + + /** + * strand of the mate (false for forward; true for reverse strand). 
+ */ + bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; } + + /* 其他的一些信息 */ + inline int GetReferenceLength() { + int length = 0; + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + switch (c) { + case 'M': + case 'D': + case 'N': + case '=': + case 'X': + length += len; + break; + default: + break; + } + } + return length; + } + + // 计算bam的全局位置,算上染色体序号和比对位置 + static inline int64_t bam_global_pos(bam1_t *b) { + return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos); + } + static inline int64_t bam_global_pos(int tid, int pos) { + return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos); + } + // 根据全局位置获取bam的染色体序号 + static inline int32_t bam_tid(int64_t global_pos) { + const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1); + const int64_t high_tid = global_pos & mask; + return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT); + } + // 根据全局位置获取bam的比对位置(染色体内) + static inline int32_t bam_pos(int64_t global_pos) { + const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; + return (int32_t)(global_pos & mask); + } + + // 设置是否冗余的标记 + void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); } + + void setFlag(bool flag, int bit) { + if (flag) + this->b->core.flag |= bit; + else + this->b->core.flag &= ~bit; + } +}; + +typedef std::map> SampleBamMap; \ No newline at end of file diff --git a/src/util/murmur3.h b/src/util/murmur3.h index e69de29..d404ecc 100644 --- a/src/util/murmur3.h +++ b/src/util/murmur3.h @@ -0,0 +1,83 @@ +/* +Description: Murmur哈希 + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2023/11/6 +*/ + +#include +#include + +using std::string; + +/** + * Provides an implementation of the Murmur3_32 hash algorithm that has + * desirable properties in terms of randomness and uniformity of the + * distribution 
of output values that make it a useful hashing algorithm for + * downsampling. + */ + +struct Murmur3 { + int seed_ = 0; + /** Hashes a character stream to an int using Murmur3. */ + int HashUnencodedChars(const string &input) { + int h1 = this->seed_; + + // step through the CharSequence 2 chars at a time + const int length = input.size(); + for (int i = 1; i < length; i += 2) { + int k1 = input.at(i - 1) | (input.at(i) << 16); + k1 = mixK1(k1); + h1 = mixH1(h1, k1); + } + + // deal with any remaining characters + if ((length & 1) == 1) { + int k1 = input.at(length - 1); + k1 = mixK1(k1); + h1 ^= k1; + } + + return fmix(h1, 2 * length); + } + + static Murmur3 &Instance() { + static Murmur3 instance; + return instance; + } + + static int mixK1(int k1) { + const int c1 = 0xcc9e2d51; + const int c2 = 0x1b873593; + k1 *= c1; + k1 = k1 << 15; + k1 *= c2; + return k1; + } + + static int mixH1(int h1, int k1) { + h1 ^= k1; + h1 = h1 << 13; + h1 = h1 * 5 + 0xe6546b64; + return h1; + } + + // Finalization mix - force all bits of a hash block to avalanche + static int fmix(int h1, int length) { + h1 ^= length; + h1 ^= (unsigned int)h1 >> 16; + h1 *= 0x85ebca6b; + h1 ^= (unsigned int)h1 >> 13; + h1 *= 0xc2b2ae35; + h1 ^= (unsigned int)h1 >> 16; + return h1; + } + +private: + Murmur3() { + auto &&rd = std::random_device{}; + seed_ = rd(); + } +}; \ No newline at end of file diff --git a/src/util/profiling.cpp b/src/util/profiling.cpp new file mode 100644 index 0000000..d3b6800 --- /dev/null +++ b/src/util/profiling.cpp @@ -0,0 +1,70 @@ +#include "profiling.h" + +#include +#include +#include +#include + +#ifdef SHOW_PERF +uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0}; +uint64_t proc_freq = 1000; +uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0}; +#endif + +uint64_t RealtimeMsec(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return (uint64_t)1000 * (tv.tv_sec + ((1e-6) * tv.tv_usec)); +} + +static int CalcThreadTime(uint64_t *a, int len, double *max, double *min, 
double *avg) { +#ifdef SHOW_PERF + int i = 0; + uint64_t umax = 0, umin = UINT64_MAX, uavg = 0; + for (i = 0; i < len; i++) { + if (a[i] > umax) + umax = a[i]; + if (a[i] < umin) + umin = a[i]; + uavg += a[i]; + } + *avg = uavg * 1.0 / len / proc_freq; + *max = umax * 1.0 / proc_freq; + *min = umin * 1.0 / proc_freq; +#endif + return 1; +} + +#define PRINT_GP(gpname) \ + fprintf(stderr, "time G %-15s: %0.2lfs\n", #gpname, gprof[GP_##gpname] * 1.0 / proc_freq); + +#define PRINT_TP(tpname, nthread) \ + { \ + double maxTime, minTime, avgTime; \ + CalcThreadTime(tprof[TP_##tpname], nthread, &maxTime, &minTime, &avgTime); \ + fprintf(stderr, "time T %-15s: avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \ + } + +int DisplayProfiling(int nthread) { + +#ifdef SHOW_PERF + fprintf(stderr, "\n"); + PRINT_GP(read_wait); + PRINT_GP(gen_wait); + PRINT_GP(sort_wait); + PRINT_GP(markdup_wait); + PRINT_GP(intersect_wait); + PRINT_GP(read); + PRINT_GP(gen); + PRINT_GP(sort); + PRINT_GP(markdup); + PRINT_GP(intersect); + PRINT_GP(merge_result); + + PRINT_TP(gen, nthread); + PRINT_TP(sort, nthread); + fprintf(stderr, "\n"); +#endif + + return 0; +} \ No newline at end of file diff --git a/src/util/profiling.h b/src/util/profiling.h new file mode 100644 index 0000000..ff5c417 --- /dev/null +++ b/src/util/profiling.h @@ -0,0 +1,58 @@ +#pragma once +#include +#include +#include + +// #define SHOW_PERF + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIM_THREAD 128 +#define LIM_THREAD_PROF_TYPE 128 +#define LIM_GLOBAL_PROF_TYPE 128 +#define LIM_THREAD_DATA_TYPE 128 +#define LIM_GLOBAL_DATA_TYPE 128 + +#ifdef SHOW_PERF +extern uint64_t proc_freq; +extern uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD]; +extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE]; +#endif + +#ifdef SHOW_PERF +#define PROF_START(tmp_time) uint64_t prof_tmp_##tmp_time = RealtimeMsec() +#define PROF_START_AGAIN(tmp_time) prof_tmp_##tmp_time = RealtimeMsec() +#define PROF_END(result, 
tmp_time) result += RealtimeMsec() - prof_tmp_##tmp_time +#else +#define PROF_START(tmp_time) +#define PROF_END(result, tmp_time) +#endif + +// GLOBAL +enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 }; +enum { + GP_read_wait = 11, + GP_gen_wait, + GP_sort_wait, + GP_markdup_wait, + GP_intersect_wait, + GP_read, + GP_gen, + GP_sort, + GP_markdup, + GP_intersect, + GP_merge_result +}; +// THREAD +enum { TP_0 = 0, TP_1, TP_2, TP_3, TP_4, TP_5, TP_6, TP_7, TP_8, TP_9, TP_10 }; +enum { TP_gen = 11, TP_sort }; + +uint64_t RealtimeMsec(void); + +int DisplayProfiling(int); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/src/util/timer.cpp b/src/util/timer.cpp deleted file mode 100644 index e69de29..0000000 diff --git a/src/util/timer.h b/src/util/timer.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/util/yarn.cpp b/src/util/yarn.cpp index e69de29..7945210 100644 --- a/src/util/yarn.cpp +++ b/src/util/yarn.cpp @@ -0,0 +1,395 @@ +/* yarn.c -- generic thread operations implemented using pthread functions + * Copyright (C) 2008, 2011, 2012, 2015, 2018, 2019, 2020 Mark Adler + * Version 1.7 12 Apr 2020 Mark Adler + * For conditions of distribution and use, see copyright notice in yarn.h + */ + +/* Basic thread operations implemented using the POSIX pthread library. All + pthread references are isolated within this module to allow alternate + implementations with other thread libraries. See yarn.h for the description + of these operations. 
*/ + +/* Version history: + 1.0 19 Oct 2008 First version + 1.1 26 Oct 2008 No need to set the stack size -- remove + Add yarn_abort() function for clean-up on error exit + 1.2 19 Dec 2011 (changes reversed in 1.3) + 1.3 13 Jan 2012 Add large file #define for consistency with pigz.c + Update thread portability #defines per IEEE 1003.1-2008 + Fix documentation in yarn.h for yarn_prefix + 1.4 19 Jan 2015 Allow yarn_abort() to avoid error message to stderr + Accept and do nothing for NULL argument to free_lock() + 1.5 8 May 2018 Remove destruct() to avoid use of pthread_cancel() + Normalize the code style + 1.6 3 Apr 2019 Add debugging information to fail() error messages + 1.7 12 Apr 2020 Fix use after free bug in ignition() + */ + +// For thread portability. +#define _XOPEN_SOURCE 700 +#define _POSIX_C_SOURCE 200809L +#define _THREAD_SAFE + +// Use large file functions if available. +#define _FILE_OFFSET_BITS 64 + +// External libraries and entities referenced. +#include // pthread_t, pthread_create(), pthread_join(), +#include // fprintf(), stderr +#include // exit(), malloc(), free(), NULL +// pthread_attr_t, pthread_attr_init(), pthread_attr_destroy(), +// PTHREAD_CREATE_JOINABLE, pthread_attr_setdetachstate(), +// pthread_self(), pthread_equal(), +// pthread_mutex_t, PTHREAD_MUTEX_INITIALIZER, pthread_mutex_init(), +// pthread_mutex_lock(), pthread_mutex_unlock(), pthread_mutex_destroy(), +// pthread_cond_t, PTHREAD_COND_INITIALIZER, pthread_cond_init(), +// pthread_cond_broadcast(), pthread_cond_wait(), pthread_cond_destroy() +#include // EPERM, ESRCH, EDEADLK, ENOMEM, EBUSY, EINVAL, EAGAIN + +// Interface definition. +#include "yarn.h" + +// Constants. +#define local static // for non-exported functions and globals + +namespace yarn { + +// Error handling external globals, resettable by application. +char *yarn_prefix = (char *)"yarn"; +void (*yarn_abort)(int) = NULL; + +// Immediately exit -- use for errors that shouldn't ever happen. 
+local void fail(int err, char const *file, long line, char const *func) { + fprintf(stderr, "%s: ", yarn_prefix); + switch (err) { + case EPERM: + fputs("already unlocked", stderr); + break; + case ESRCH: + fputs("no such thread", stderr); + break; + case EDEADLK: + fputs("resource deadlock", stderr); + break; + case ENOMEM: + fputs("out of memory", stderr); + break; + case EBUSY: + fputs("can't destroy locked resource", stderr); + break; + case EINVAL: + fputs("invalid request", stderr); + break; + case EAGAIN: + fputs("resource unavailable", stderr); + break; + default: + fprintf(stderr, "internal error %d", err); + } + fprintf(stderr, " (%s:%ld:%s)\n", file, line, func); + if (yarn_abort != NULL) + yarn_abort(err); + exit(err); +} + +// Memory handling routines provided by user. If none are provided, malloc() +// and free() are used, which are therefore assumed to be thread-safe. +typedef void *(*malloc_t)(size_t); +typedef void (*free_t)(void *); +local malloc_t my_malloc_f = malloc; +local free_t my_free = free; + +// Use user-supplied allocation routines instead of malloc() and free(). +void yarn_mem(malloc_t lease, free_t vacate) { + my_malloc_f = lease; + my_free = vacate; +} + +// Memory allocation that cannot fail (from the point of view of the caller). 
+local void *my_malloc(size_t size, char const *file, long line) { + void *block; + + if ((block = my_malloc_f(size)) == NULL) + fail(ENOMEM, file, line, "malloc"); + return block; +} + +// -- Lock functions -- + +struct lock_s { + pthread_mutex_t mutex; + pthread_cond_t cond; + long value; +}; + +lock_t *new_lock_(long initial, char const *file, long line) { + lock_t *bolt = (lock_t *)my_malloc(sizeof(struct lock_s), file, line); + int ret = pthread_mutex_init(&(bolt->mutex), NULL); + if (ret) + fail(ret, file, line, "mutex_init"); + ret = pthread_cond_init(&(bolt->cond), NULL); + if (ret) + fail(ret, file, line, "cond_init"); + bolt->value = initial; + return bolt; +} + +void possess_(lock_t *bolt, char const *file, long line) { + int ret = pthread_mutex_lock(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_lock"); +} + +void release_(lock_t *bolt, char const *file, long line) { + int ret = pthread_mutex_unlock(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_unlock"); +} + +void twist_(lock_t *bolt, enum twist_op op, long val, char const *file, long line) { + if (op == TO) + bolt->value = val; + else if (op == BY) + bolt->value += val; + int ret = pthread_cond_broadcast(&(bolt->cond)); + if (ret) + fail(ret, file, line, "cond_broadcast"); + ret = pthread_mutex_unlock(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_unlock"); +} + +#define until(a) while (!(a)) + +void wait_for_(lock_t *bolt, enum wait_op op, long val, char const *file, long line) { + switch (op) { + case TO_BE: + until(bolt->value == val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + break; + case NOT_TO_BE: + until(bolt->value != val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + break; + case TO_BE_MORE_THAN: + until(bolt->value > val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, 
file, line, "cond_wait"); + } + break; + case TO_BE_LESS_THAN: + until(bolt->value < val) { + int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex)); + if (ret) + fail(ret, file, line, "cond_wait"); + } + } +} + +long peek_lock(lock_t *bolt) { return bolt->value; } + +void free_lock_(lock_t *bolt, char const *file, long line) { + if (bolt == NULL) + return; + int ret = pthread_cond_destroy(&(bolt->cond)); + if (ret) + fail(ret, file, line, "cond_destroy"); + ret = pthread_mutex_destroy(&(bolt->mutex)); + if (ret) + fail(ret, file, line, "mutex_destroy"); + my_free(bolt); +} + +// -- Thread functions (uses the lock_t functions above) -- + +struct thread_s { + pthread_t id; + int done; // true if this thread has exited + thread *next; // for list of all launched threads +}; + +// List of threads launched but not joined, count of threads exited but not +// joined (incremented by ignition() just before exiting). +local lock_t threads_lock = { + PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, + 0 // number of threads exited but not joined +}; +local thread *threads = NULL; // list of extant threads + +// Structure in which to pass the probe and its payload to ignition(). +struct capsule { + void (*probe)(void *); + void *payload; + char const *file; + long line; +}; + +// Mark the calling thread as done and alert join_all(). 
+local void reenter(void *arg) { + struct capsule *capsule = (struct capsule *)arg; + + // find this thread in the threads list by matching the thread id + pthread_t me = pthread_self(); + possess_(&(threads_lock), capsule->file, capsule->line); + thread **prior = &(threads); + thread *match; + while ((match = *prior) != NULL) { + if (pthread_equal(match->id, me)) + break; + prior = &(match->next); + } + if (match == NULL) + fail(ESRCH, capsule->file, capsule->line, "reenter lost"); + + // mark this thread as done and move it to the head of the list + match->done = 1; + if (threads != match) { + *prior = match->next; + match->next = threads; + threads = match; + } + + // update the count of threads to be joined and alert join_all() + twist_(&(threads_lock), BY, +1, capsule->file, capsule->line); + + // free the capsule resource, even if the thread is cancelled (though yarn + // doesn't use pthread_cancel() -- you never know) + my_free(capsule); +} + +// All threads go through this routine. Just before a thread exits, it marks +// itself as done in the threads list and alerts join_all() so that the thread +// resources can be released. Use a cleanup stack so that the marking occurs +// even if the thread is cancelled. +local void *ignition(void *arg) { + struct capsule *capsule = (struct capsule *)arg; + + // run reenter() before leaving + pthread_cleanup_push(reenter, arg); + + // execute the requested function with argument + capsule->probe(capsule->payload); + + // mark this thread as done, letting join_all() know, and free capsule + pthread_cleanup_pop(1); + + // exit thread + return NULL; +} + +// Not all POSIX implementations create threads as joinable by default, so that +// is made explicit here. 
+thread *launch_(void (*probe)(void *), void *payload, char const *file, long line) { + // construct the requested call and argument for the ignition() routine + // (allocated instead of automatic so that we're sure this will still be + // there when ignition() actually starts up -- ignition() will free this + // allocation) + struct capsule *capsule = (struct capsule *)my_malloc(sizeof(struct capsule), file, line); + capsule->probe = probe; + capsule->payload = payload; + capsule->file = file; + capsule->line = line; + + // assure this thread is in the list before join_all() or ignition() looks + // for it + possess_(&(threads_lock), file, line); + + // create the thread and call ignition() from that thread + thread *th = (thread *)my_malloc(sizeof(struct thread_s), file, line); + pthread_attr_t attr; + int ret = pthread_attr_init(&attr); + if (ret) + fail(ret, file, line, "attr_init"); + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + if (ret) + fail(ret, file, line, "attr_setdetachstate"); + ret = pthread_create(&(th->id), &attr, ignition, capsule); + if (ret) + fail(ret, file, line, "create"); + ret = pthread_attr_destroy(&attr); + if (ret) + fail(ret, file, line, "attr_destroy"); + + // put the thread in the threads list for join_all() + th->done = 0; + th->next = threads; + threads = th; + release_(&(threads_lock), file, line); + return th; +} + +void join_(thread *ally, char const *file, long line) { + // wait for thread to exit and return its resources + int ret = pthread_join(ally->id, NULL); + if (ret) + fail(ret, file, line, "join"); + + // find the thread in the threads list + possess_(&(threads_lock), file, line); + thread **prior = &(threads); + thread *match; + while ((match = *prior) != NULL) { + if (match == ally) + break; + prior = &(match->next); + } + if (match == NULL) + fail(ESRCH, file, line, "join lost"); + + // remove thread from list and update exited count, free thread + if (match->done) + threads_lock.value--; + 
*prior = match->next; + release_(&(threads_lock), file, line); + my_free(ally); +} + +// This implementation of join_all() only attempts to join threads that have +// announced that they have exited (see ignition()). When there are many +// threads, this is faster than waiting for some random thread to exit while a +// bunch of other threads have already exited. +int join_all_(char const *file, long line) { + // grab the threads list and initialize the joined count + int count = 0; + possess_(&(threads_lock), file, line); + + // do until threads list is empty + while (threads != NULL) { + // wait until at least one thread has reentered + wait_for_(&(threads_lock), NOT_TO_BE, 0, file, line); + + // find the first thread marked done (should be at or near the top) + thread **prior = &(threads); + thread *match; + while ((match = *prior) != NULL) { + if (match->done) + break; + prior = &(match->next); + } + if (match == NULL) + fail(ESRCH, file, line, "join_all lost"); + + // join the thread (will be almost immediate), remove from the threads + // list, update the reenter count, and free the thread + int ret = pthread_join(match->id, NULL); + if (ret) + fail(ret, file, line, "join"); + threads_lock.value--; + *prior = match->next; + my_free(match); + count++; + } + + // let go of the threads list and return the number of threads joined + release_(&(threads_lock), file, line); + return count; +} + +}; // namespace yarn \ No newline at end of file diff --git a/src/util/yarn.h b/src/util/yarn.h index e69de29..0c401e9 100644 --- a/src/util/yarn.h +++ b/src/util/yarn.h @@ -0,0 +1,150 @@ +/* yarn.h -- generic interface for thread operations + * Copyright (C) 2008, 2011, 2012, 2015, 2018, 2019, 2020 Mark Adler + * Version 1.7 12 Apr 2020 Mark Adler + */ + +/* + This software is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. 
+ + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Mark Adler + madler@alumni.caltech.edu + */ + +/* Basic thread operations + + This interface isolates the local operating system implementation of threads + from the application in order to facilitate platform independent use of + threads. All of the implementation details are deliberately hidden. + + Assuming adequate system resources and proper use, none of these functions + can fail. As a result, any errors encountered will cause an exit() to be + executed, or the execution of your own optionally-provided abort function. + + These functions allow the simple launching and joining of threads, and the + locking of objects and synchronization of changes of objects. The latter is + implemented with a single lock_t type that contains an integer value. The + value can be ignored for simple exclusive access to an object, or the value + can be used to signal and wait for changes to an object. 
+ + -- Arguments -- + + thread *thread; identifier for launched thread, used by join + void probe(void *); pointer to function "probe", run when thread starts + void *payload; single argument passed to the probe function + lock_t *lock_t; a lock_t with a value -- used for exclusive access to + an object and to synchronize threads waiting for + changes to an object + long val; value to set lock_t, increment lock_t, or wait for + int n; number of threads joined + + -- Thread functions -- + + thread = launch(probe, payload) - launch a thread -- exit via probe() return + join(thread) - join a thread and by joining end it, waiting for the thread + to exit if it hasn't already -- will free the resources allocated by + launch() (don't try to join the same thread more than once) + n = join_all() - join all threads launched by launch() that are not joined + yet and free the resources allocated by the launches, usually to clean + up when the thread processing is done -- join_all() returns an int with + the count of the number of threads joined (join_all() should only be + called from the main thread, and should only be called after any calls + of join() have completed) + + -- Lock functions -- + + lock_t = new_lock(val) - create a new lock_t with initial value val (lock_t is + created in the released state) + possess(lock_t) - acquire exclusive possession of a lock_t, waiting if necessary + twist(lock_t, [TO | BY], val) - set lock_t to or increment lock_t by val, signal + all threads waiting on this lock_t and then release the lock_t -- must + possess the lock_t before calling (twist releases, so don't do a + release() after a twist() on the same lock_t) + wait_for(lock_t, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val) + - wait on lock_t value to be, not to be, be greater than, or be less than + val -- must possess the lock_t before calling, will possess the lock_t on + return but the lock_t is released while waiting to permit other threads + to use twist() to 
change the value and signal the change (so make sure + that the object is in a usable state when waiting) + release(lock_t) - release a possessed lock_t (do not try to release a lock_t that + the current thread does not possess) + val = peek_lock(lock_t) - return the value of the lock_t (assumes that lock_t is + already possessed, no possess or release is done by peek_lock()) + free_lock(lock_t) - free the resources allocated by new_lock() (application + must assure that the lock_t is released before calling free_lock()) + + -- Memory allocation --- + + yarn_mem(better_malloc, better_free) - set the memory allocation and free + routines for use by the yarn routines where the supplied routines have + the same interface and operation as malloc() and free(), and may be + provided in order to supply thread-safe memory allocation routines or + for any other reason -- by default malloc() and free() will be used + + -- Error control -- + + yarn_prefix - a char pointer to a string that will be the prefix for any + error messages that these routines generate before exiting -- if not + changed by the application, "yarn" will be used + yarn_abort - an external function that will be executed when there is an + internal yarn error, due to out of memory or misuse -- this function + may exit to abort the application, or if it returns, the yarn error + handler will exit (set to NULL by default for no action) + */ +#pragma once + +#include + +namespace yarn { + +extern char *yarn_prefix; +extern void (*yarn_abort)(int); + +void yarn_mem(void *(*)(size_t), void (*)(void *)); + +typedef struct thread_s thread; +thread *launch_(void (*)(void *), void *, char const *, long); +#define LAUNCH(a, b) launch_(a, b, __FILE__, __LINE__) +void join_(thread *, char const *, long); +#define JOIN(a) join_(a, __FILE__, __LINE__) +int join_all_(char const *, long); +#define JOIN_ALL() join_all_(__FILE__, __LINE__) + +typedef struct lock_s lock_t; +lock_t *new_lock_(long, char const *, long); 
+#define NEW_LOCK(a) new_lock_(a, __FILE__, __LINE__) +void possess_(lock_t *, char const *, long); +#define POSSESS(a) possess_(a, __FILE__, __LINE__) +void release_(lock_t *, char const *, long); +// #define release(a) release_(a, __FILE__, __LINE__) +#define RELEASE(a) release_(a, __FILE__, __LINE__) +enum twist_op { TO, BY }; +void twist_(lock_t *, enum twist_op, long, char const *, long); +#define TWIST(a, b, c) twist_(a, b, c, __FILE__, __LINE__) +enum wait_op { + TO_BE, + /* or */ NOT_TO_BE, /* that is the question */ + TO_BE_MORE_THAN, + TO_BE_LESS_THAN +}; +void wait_for_(lock_t *, enum wait_op, long, char const *, long); +#define WAIT_FOR(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__) +long peek_lock(lock_t *); +#define PEEK_LOCK(a) peek_lock(a) +void free_lock_(lock_t *, char const *, long); +#define FREE_LOCK(a) free_lock_(a, __FILE__, __LINE__) + +}; // namespace yarn