diff --git a/.gitignore b/.gitignore index 04edd13..6b16b43 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ build.sh run.sh test.sh +*.log # Compiled Object files *.slo diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 03199c2..35f5e43 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,7 +4,7 @@ set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin") # source codes path aux_source_directory(${PROJECT_SOURCE_DIR}/src MAIN_SRC) aux_source_directory(${PROJECT_SOURCE_DIR}/src/util UTIL_SRC) -aux_source_directory(${PROJECT_SOURCE_DIR}/src/markdup MARKDUP_SRC) +aux_source_directory(${PROJECT_SOURCE_DIR}/src/bqsr BQSR_SRC) set(KTHREAD_FILE ${PROJECT_SOURCE_DIR}/ext/klib/kthread.c) @@ -20,7 +20,7 @@ link_directories("${PROJECT_SOURCE_DIR}/ext/htslib") set(PG_NAME "fastbqsr") # dependency files -add_executable(${PG_NAME} ${MAIN_SRC} ${UTIL_SRC} ${MARKDUP_SRC} ${KTHREAD_FILE}) +add_executable(${PG_NAME} ${MAIN_SRC} ${UTIL_SRC} ${BQSR_SRC} ${KTHREAD_FILE}) # link htslib target_link_libraries(${PG_NAME} libhts.a) diff --git a/src/bqsr/apply_bqsr_entry.cpp b/src/bqsr/apply_bqsr_entry.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/bqsr/apply_bqsr_pipeline.cpp b/src/bqsr/apply_bqsr_pipeline.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/bqsr/bqsr_args.h b/src/bqsr/bqsr_args.h index 36d9d98..0265beb 100644 --- a/src/bqsr/bqsr_args.h +++ b/src/bqsr/bqsr_args.h @@ -48,6 +48,9 @@ struct BQSRArg { string PROGRAM_RECORD_ID = "FastBQSR"; + // known sites vcf files + vector KNOWN_SITES_VCFS; + // end of common parameters /** diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index e77ec36..ab3e98e 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -8,33 +8,38 @@ Author : Zhang Zhonghai Date : 2023/10/23 */ #include +#include #include #include #include #include -#include "dup_metrics.h" -#include "fastbqsr_version.h" #include "bqsr_args.h" #include "bqsr_funcs.h" #include "bqsr_pipeline.h" +#include "dup_metrics.h" +#include "fastbqsr_version.h" #include "read_name_parser.h" #include "util/profiling.h" +#include "util/utils.h" #define BAM_BLOCK_SIZE 16L * 1024 * 1024 namespace nsgv { -BQSRArg gBqsrArg; // + std::vector gNameParsers; // read name parser +DuplicationMetrics gMetrics; // +DupResult gDupRes; +PipelineArg gPipe(&gDupRes); + +BQSRArg gBqsrArg; // samFile *gInBamFp; // bam sam_hdr_t *gInBamHeader; // bam samFile *gOutBamFp; // , sambam sam_hdr_t *gOutBamHeader; // header -DuplicationMetrics gMetrics; // -DupResult gDupRes; -PipelineArg gPipe(&gDupRes); +vector gKnownSitesVcfSrs; // known sites vcf srs }; // namespace nsgv // @@ -55,14 +60,44 @@ static string getFileExtension(const string &filename) { return filename.substr(last_dot + 1); } -// entrance of mark duplicates -int MarkDuplicates() { - PROF_START(whole_process); - /* */ - nsgv::gNameParsers.resize(nsgv::gBqsrArg.NUM_THREADS); - for (auto &parser : nsgv::gNameParsers) - parser.SetReadNameRegex(nsgv::gBqsrArg.READ_NAME_REGEX); // read nametile,x,y +// 串行bqsr +int SerialBQSR() { + int round = 0; + BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM); + int64_t readNumSum = 0; + while (1) { + ++ round; + size_t readNum = 0; + if (inBamBuf.ReadStat() >= 0) + readNum = inBamBuf.ReadBam(); + if (readNum < 1) { + break; + } + spdlog::info("{} reads processed in {} round", readNum, round); + auto bams = inBamBuf.GetBamArr(); + spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end()); + // 1. 获取bams数组覆盖的region范围 + + // 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零) + + // 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识) + + // 4. 遍历bams数组中的每一条记录并进行处理 + + readNumSum += readNum; + inBamBuf.ClearAll(); // + } + spdlog::info("read count: {}", readNumSum); + + return 0; +} + +// entrance of mark duplicates +int BaseRecalibrator() { + + PROF_START(whole_process); /* bam */ nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr); if (!nsgv::gInBamFp) { @@ -71,10 +106,11 @@ int MarkDuplicates() { } hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header + // (libraryId) nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0); - /* */ + /* 并行读取bam数据 */ htsThreadPool htsPoolRead = {NULL, 0}; // , htsThreadPool htsPoolWrite = {NULL, 0}; // htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS); @@ -86,286 +122,28 @@ int MarkDuplicates() { } hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); - /* */ - PROF_START(markdup_all); - PipelineMarkDups(); - PROF_END(gprof[GP_markdup_all], markdup_all); + return SerialBQSR(); - /* , */ - char modeout[12] = "wb"; - sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL); - nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout); - if (!nsgv::gOutBamFp) { - spdlog::error("[{}] create output sam/bam file failed.\n", __func__); - return -1; - } - nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader); - // header - sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", nsgv::gBqsrArg.PROGRAM_RECORD_ID.c_str(), "VN", FASTDUP_VERSION, - "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL); +// // 读取known sites vcfs +// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { +// spdlog::info(" {}", ks); +// bcf_srs_t* srs = bcf_sr_init(); +// if (!bcf_sr_add_reader(srs, ks.c_str())) +// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum)); +// nsgv::gKnownSitesVcfSrs.push_back(srs); +// +// while (bcf_sr_next_line(srs)) { +// bcf1_t* line = srs->readers[0].buffer[0]; +// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl; +// } +// } +// +// /* 先实现串行的bqsr-phase-1 */ + - // - hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); - hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); - sam_close(nsgv::gInBamFp); // bam - nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr); - if (!nsgv::gInBamFp) { - spdlog::error("[{}] load sam/bam file failed.\n", __func__); - return -1; - } - hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); - hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); - nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); - if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) { - spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE); - sam_close(nsgv::gOutBamFp); - sam_close(nsgv::gInBamFp); - return -1; - } - // index - string indexFn = nsgv::gBqsrArg.OUTPUT_FILE + ".bai"; // min_shift = 0 bai - if ("sam" == getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) { - indexFn = ""; - } - if (!indexFn.empty()) { - int index_min_shift = 0; - if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) { - indexFn = nsgv::gBqsrArg.OUTPUT_FILE + ".csi"; - index_min_shift = 14; - } - if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) { - spdlog::error("failed to open index \"{}\" for writing", indexFn); - sam_close(nsgv::gOutBamFp); - sam_close(nsgv::gInBamFp); - return -1; - } - } - // , - BamBufType inBuf(nsgv::gBqsrArg.DUPLEX_IO); - inBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM); - DupIdxQueue dupIdxQue, repIdxQue; - DupIdxQueue opticalIdxQue; - dupIdxQue.Init(&nsgv::gDupRes.dupIdxArr); - repIdxQue.Init(&nsgv::gDupRes.repIdxArr); - opticalIdxQue.Init(&nsgv::gDupRes.opticalDupIdxArr); - spdlog::info("{} duplicate reads found", dupIdxQue.Size()); - spdlog::info("{} optical reads found", opticalIdxQue.Size()); - // spdlog::info("{} represent reads found", repIdxQue.Size()); - // dupIdxQue.RealSize("na12878.dup"); - // opticalIdxQue.RealSize("normal.odup"); - // return 0; - - uint64_t bamIdx = 0; - DupInfo dupIdx = dupIdxQue.Pop(); - DupInfo repIdx = repIdxQue.Pop(); - uint64_t opticalIdx = opticalIdxQue.Pop(); - - ByteBuf bytes; - bam1_t *bp = bam_init1(); - bool isDup = false; - bool isOpticalDup = false; - bool isInDuplicateSet = false; - uint32_t representativeReadIndexInFile = 0; - uint32_t duplicateSetSize = 0; - - int64_t realDupSize = 0; - - PROF_START(write); - while (inBuf.ReadStat() >= 0) { - PROF_START(final_read); - size_t readNum = inBuf.ReadBam(); - PROF_END(gprof[GP_final_read], final_read); - // PROF_PRINT_START(read_write); - for (size_t i = 0; i < readNum; ++i) { - BamWrap *bw = inBuf[i]; - if (bam_copy1(bp, bw->b) == nullptr) { - spdlog::error("Can not copy sam record!"); - return -1; - } - bw->b = bp; - isDup = false; - isOpticalDup = false; - isInDuplicateSet = false; - - // duplicate tag - if (nsgv::gBqsrArg.CLEAR_DT) { - uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str()); - if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); - } - - // - if (bw->GetReadUnmappedFlag()) { - ++nsgv::gMetrics.UNMAPPED_READS; - } else if (bw->IsSecondaryOrSupplementary()) { - ++nsgv::gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS; - } else if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) { - ++nsgv::gMetrics.UNPAIRED_READS_EXAMINED; - } else { - ++nsgv::gMetrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end - } - - /* */ - if (bamIdx == dupIdx) { - ++realDupSize; // for test - isDup = true; - if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) { - isInDuplicateSet = true; - representativeReadIndexInFile = dupIdx.GetRepIdx(); - duplicateSetSize = dupIdx.dupSet; - } - - // ,dupidx,duprepIdxdupSetSize - while ((dupIdx = dupIdxQue.Pop()) == bamIdx); - if (opticalIdx == bamIdx) - isOpticalDup = true; - else if (opticalIdx < bamIdx) { - while ((opticalIdx = opticalIdxQue.Pop()) < bamIdx); - if (opticalIdx == bamIdx) - isOpticalDup = true; - } - - // - bw->SetDuplicateReadFlag(true); - - uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str()); - if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); - - if (isOpticalDup) - bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str(), 'Z', - nsgv::gBqsrArg.DUPLICATE_TYPE_SEQUENCING.size() + 1, - (const uint8_t *)nsgv::gBqsrArg.DUPLICATE_TYPE_SEQUENCING.c_str()); - else - bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str(), 'Z', - nsgv::gBqsrArg.DUPLICATE_TYPE_LIBRARY.size() + 1, - (const uint8_t *)nsgv::gBqsrArg.DUPLICATE_TYPE_LIBRARY.c_str()); - - // - if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) { - // Update the duplication metrics - if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) { - ++nsgv::gMetrics.UNPAIRED_READ_DUPLICATES; - } else { - ++nsgv::gMetrics.READ_PAIR_DUPLICATES; // will need to be divided by 2 at the end - } - } - } else { - bw->SetDuplicateReadFlag(false); - } - if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent - isInDuplicateSet = true; - representativeReadIndexInFile = repIdx.GetRepIdx(); - duplicateSetSize = repIdx.dupSet; - while (repIdx == bamIdx || repIdx.dupSet == 0) { - if (repIdxQue.Size() > 0) - repIdx = repIdxQue.Pop(); - else { - repIdx = -1; - break; - } - } - } - - if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) { - if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) { - uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_INDEX_TAG.c_str()); - if (oldTagVal != NULL) - bam_aux_del(bw->b, oldTagVal); - bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_INDEX_TAG.c_str(), 'i', - sizeof(representativeReadIndexInFile), (uint8_t *)&representativeReadIndexInFile); - oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_SIZE_TAG.c_str()); - if (oldTagVal != NULL) - bam_aux_del(bw->b, oldTagVal); - bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize), - (const uint8_t *)&duplicateSetSize); - } - } - // readoutput,,record - ++bamIdx; - if (isDup && nsgv::gBqsrArg.REMOVE_DUPLICATES) { - continue; - } - if (isOpticalDup && nsgv::gBqsrArg.REMOVE_SEQUENCING_DUPLICATES) { - continue; - } - if (!nsgv::gBqsrArg.PROGRAM_RECORD_ID.empty() && nsgv::gBqsrArg.ADD_PG_TAG_TO_READS) { - uint8_t *oldTagVal = bam_aux_get(bw->b, "PG"); - if (oldTagVal != NULL) - bam_aux_del(bw->b, oldTagVal); - bam_aux_append(bw->b, "PG", 'Z', nsgv::gBqsrArg.PROGRAM_RECORD_ID.size() + 1, - (const uint8_t *)nsgv::gBqsrArg.PROGRAM_RECORD_ID.c_str()); - } -#if 1 - if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) { - spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str()); - sam_close(nsgv::gOutBamFp); - sam_close(nsgv::gInBamFp); - return -1; - } -#endif - } - inBuf.ClearAll(); - // PROF_PRINT_END(read_write); - spdlog::info("write {} reads to output", readNum); - } - bam_destroy1(bp); - - // - nsgv::gMetrics.READ_PAIRS_EXAMINED /= 2; - nsgv::gMetrics.READ_PAIR_DUPLICATES /= 2; - for (auto &arr : nsgv::gDupRes.opticalDupIdxArr) nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES += arr.size(); - nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES = nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES / 2; - nsgv::gMetrics.ESTIMATED_LIBRARY_SIZE = - estimateLibrarySize(nsgv::gMetrics.READ_PAIRS_EXAMINED - nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES, - nsgv::gMetrics.READ_PAIRS_EXAMINED - nsgv::gMetrics.READ_PAIR_DUPLICATES); - if (nsgv::gMetrics.UNPAIRED_READS_EXAMINED + nsgv::gMetrics.READ_PAIRS_EXAMINED != 0) { - nsgv::gMetrics.PERCENT_DUPLICATION = - (nsgv::gMetrics.UNPAIRED_READ_DUPLICATES + nsgv::gMetrics.READ_PAIR_DUPLICATES * 2) / - (double)(nsgv::gMetrics.UNPAIRED_READS_EXAMINED + nsgv::gMetrics.READ_PAIRS_EXAMINED * 2); - } else { - nsgv::gMetrics.PERCENT_DUPLICATION = 0; - } - calculateRoiHistogram(nsgv::gMetrics); - - // - if (!nsgv::gBqsrArg.METRICS_FILE.empty()) { - ofstream ofsM(nsgv::gBqsrArg.METRICS_FILE); - ofsM << "## StringHeader" << endl; - ofsM << "# " << nsgv::gBqsrArg.CLI_STR << endl; - ofsM << "## StringHeader" << endl; - ofsM << "# Started on: " << nsgv::gBqsrArg.START_TIME << endl << endl; - ofsM << "## METRICS" - << endl; - ofsM << "LIBRARY\tUNPAIRED_READS_EXAMINED\tREAD_PAIRS_EXAMINED\tSECONDARY_OR_SUPPLEMENTARY_RDS\tUNMAPPED_" - "READS\tUNPAIRED_READ_DUPLICATES\tREAD_PAIR_DUPLICATES\tREAD_PAIR_OPTICAL_DUPLICATES\tPERCENT_" - "DUPLICATION\tESTIMATED_LIBRARY_SIZE" - << endl; - ofsM << nsgv::gMetrics.LIBRARY << "\t" << nsgv::gMetrics.UNPAIRED_READS_EXAMINED << "\t" << nsgv::gMetrics.READ_PAIRS_EXAMINED - << "\t" << nsgv::gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\t" << nsgv::gMetrics.UNMAPPED_READS << "\t" - << nsgv::gMetrics.UNPAIRED_READ_DUPLICATES << "\t" << nsgv::gMetrics.READ_PAIR_DUPLICATES << "\t" - << nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\t" << nsgv::gMetrics.PERCENT_DUPLICATION << "\t" - << nsgv::gMetrics.ESTIMATED_LIBRARY_SIZE << endl - << endl; - ofsM << "## HISTOGRAM\tDouble" << endl; - ofsM << "BIN CoverageMult" << endl; - for (int i = 1; i <= 100; ++i) { - ofsM << i << "\t" << std::fixed << std::setprecision(6) << nsgv::gMetrics.CoverageMult[i] << endl; - } - ofsM.close(); - } - PROF_END(gprof[GP_write], write); - - if (!indexFn.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) { - spdlog::error("writing index failed"); - sam_close(nsgv::gOutBamFp); - sam_close(nsgv::gInBamFp); - return -1; - } - - /* , */ - sam_close(nsgv::gOutBamFp); sam_close(nsgv::gInBamFp); PROF_END(gprof[GP_whole_process], whole_process); diff --git a/src/bqsr/bqsr_funcs.cpp b/src/bqsr/bqsr_funcs.cpp index 901b5ed..6b2db1f 100644 --- a/src/bqsr/bqsr_funcs.cpp +++ b/src/bqsr/bqsr_funcs.cpp @@ -1,4 +1,4 @@ -#include "md_funcs.h" +#include "bqsr_funcs.h" #include #include @@ -13,7 +13,7 @@ #include #include "dup_metrics.h" -#include "md_args.h" +#include "bqsr_args.h" #include "read_ends.h" #include "read_name_parser.h" @@ -25,7 +25,7 @@ using std::unordered_map; using std::vector; namespace nsgv { -extern MarkDupsArg gBqsrArg; +extern BQSRArg gBqsrArg; extern DuplicationMetrics gMetrics; }; @@ -33,40 +33,8 @@ extern DuplicationMetrics gMetrics; * read */ int16_t computeDuplicateScore(BamWrap &bw) { - int16_t score = 0; - switch (nsgv::gBqsrArg.DUPLICATE_SCORING_STRATEGY) { - case nsbqsr::SUM_OF_BASE_QUALITIES: - // two (very) long reads worth of high-quality bases can go over - // Short.MAX_VALUE/2 and risk overflow. - score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2); - break; - case nsbqsr::TOTAL_MAPPED_REFERENCE_LENGTH: - if (!bw.GetReadUnmappedFlag()) - // no need to remember the score since this scoring mechanism is - // symmetric - score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2); - break; - case nsbqsr::RANDOM: - // The RANDOM score gives the same score to both reads so that they get - // filtered together. it's not critical do use the readName since the - // scores from both ends get added, but it seem to be clearer this way. - score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111); - // subtract Short.MIN_VALUE/4 from it to end up with a number between - // 0 and Short.MAX_VALUE/2. This number can be then discounted in case - // the read is not passing filters. We need to stay far from overflow so - // that when we add the two scores from the two read mates we do not - // overflow since that could cause us to chose a failing read-pair - // instead of a passing one. - score -= INT16_MIN / 4; - default: - break; - } - // make sure that filter-failing records are heavily discounted. (the - // discount can happen twice, once for each mate, so need to make sure we do - // not subtract more than Short.MIN_VALUE overall.) - score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0; - return score; + return 1; } /* @@ -90,8 +58,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd // Fill in the location information for optical duplicates if (!ReadNameParser::sWrongNameFormat) rnParser.AddLocationInformation(bw.query_name(), pKey); - else - nsgv::gBqsrArg.CHECK_OPTICAL_DUP = false; // cout << k.tile << ' ' << k.x << ' ' << k.y << endl; // key k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation; diff --git a/src/bqsr/bqsr_pipeline.cpp b/src/bqsr/bqsr_pipeline.cpp index dd9495c..65ee25e 100644 --- a/src/bqsr/bqsr_pipeline.cpp +++ b/src/bqsr/bqsr_pipeline.cpp @@ -1,4 +1,4 @@ -#include "md_pipeline.h" +#include "bqsr_pipeline.h" #include #include @@ -9,8 +9,8 @@ #include #include "dup_metrics.h" -#include "md_args.h" -#include "md_funcs.h" +#include "bqsr_args.h" +#include "bqsr_funcs.h" #include "read_ends.h" #include "read_name_parser.h" #include "util/bam_buf.h" @@ -22,7 +22,7 @@ using namespace std; namespace nsgv { -extern MarkDupsArg gBqsrArg; // +extern BQSRArg gBqsrArg; // extern samFile *gInBamFp; // bam extern sam_hdr_t *gInBamHeader; // bam extern DuplicationMetrics gMetrics; // @@ -35,73 +35,6 @@ extern PipelineArg gPipe; static void markDupsForPairs(vector &vpRe, DPSet *dupIdx, MDSet *opticalDupIdx, DPSet *repIdx, MDSet *notDupIdx = nullptr, MDSet *notOpticalDupIdx = nullptr, MDSet *notRepIdx = nullptr) { - if (vpRe.size() < 2) { - return; - } - int maxScore = 0; - const ReadEnds *pBest = nullptr; - /** All read ends should have orientation FF, FR, RF, or RR **/ - for (auto pe : vpRe) { // readend - if (pe->score > maxScore || pBest == nullptr) { - maxScore = pe->score; - pBest = pe; - } - } - - if (notDupIdx != nullptr) { - notDupIdx->insert(pBest->read1IndexInFile); - notDupIdx->insert(pBest->read2IndexInFile); - } - - if (nsgv::gBqsrArg.CHECK_OPTICAL_DUP) { // - // trackOpticalDuplicates - vector prevOpticalRe; - if (notOpticalDupIdx != nullptr) { - for (auto pe : vpRe) { - if (pe->isOpticalDuplicate) { - prevOpticalRe.push_back(pe); - } - } - } - trackOpticalDuplicates(vpRe, pBest); - // , - if (notOpticalDupIdx != nullptr) { - for (auto pe : prevOpticalRe) { - if (!pe->isOpticalDuplicate) { - notOpticalDupIdx->insert(pe->read1IndexInFile); - notOpticalDupIdx->insert(pe->read2IndexInFile); - } - } - } - } - for (auto pe : vpRe) { // best read - if (pe != pBest) { // best - dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read1 - if (pe->read2IndexInFile != pe->read1IndexInFile) - dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2, - // read1 - // optical dup - if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) { - opticalDupIdx->insert(pe->read1IndexInFile); - if (pe->read2IndexInFile != pe->read1IndexInFile) - opticalDupIdx->insert(pe->read2IndexInFile); - } - } - } - // bamtag, bestdupset - if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS) { - repIdx->insert(DupInfo(pBest->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); - repIdx->insert(DupInfo(pBest->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); - if (notRepIdx != nullptr) { - for (auto pe : vpRe) { - if (pe != pBest) { - notRepIdx->insert(pe->read1IndexInFile); - if (pe->read2IndexInFile != pe->read1IndexInFile) - notRepIdx->insert(pe->read2IndexInFile); - } - } - } - } } /* pairedreadends, */ diff --git a/src/main.cpp b/src/main.cpp index 4f1b226..eada665 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -64,7 +64,7 @@ int main_BaseRecalibrator(int argc, char *argv[]) { "One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from " "analysis. This argument must be specified at least once.") .metavar("") - .required(); + .nargs(argparse::nargs_pattern::any); program.add_argument("--create-index") .help("Whether to create an index when writing coordinate sorted BAM output.") @@ -114,6 +114,12 @@ int main_BaseRecalibrator(int argc, char *argv[]) { nsgv::gBqsrArg.NUM_THREADS = program.get("--num-threads"); nsgv::gBqsrArg.CREATE_INDEX = program.get("--create-index"); + nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get>("--known-sites"); + // spdlog::info("known sites vcf files:"); + // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { + // spdlog::info(" {}", ks); + // } + nsgv::gBqsrArg.INDEX_FORMAT = program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI; @@ -131,14 +137,14 @@ int main_BaseRecalibrator(int argc, char *argv[]) { return 0; } -int main_ApplyBQSR(int argc, char *argv[]) {} +int main_ApplyBQSR(int argc, char* argv[]) { return 0; } int main(int argc, char *argv[]) { // init log - spdlog::set_default_logger(spdlog::stderr_color_st("fastdup")); + spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr")); spdlog::cfg::load_env_levels(); - string bqsr_prog = argv[0]; + string bqsr_prog = argv[1]; if (bqsr_prog == "BaseRecalibrator") { return main_BaseRecalibrator(argc - 1, argv + 1); } else if (bqsr_prog == "ApplyBQSR") { diff --git a/src/util/utils.cpp b/src/util/utils.cpp new file mode 100644 index 0000000..34707f0 --- /dev/null +++ b/src/util/utils.cpp @@ -0,0 +1,13 @@ +#include +#include +#include +#include +#include + +void error(const char* format, ...) { + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(-1); +} \ No newline at end of file diff --git a/src/util/utils.h b/src/util/utils.h new file mode 100644 index 0000000..844eac1 --- /dev/null +++ b/src/util/utils.h @@ -0,0 +1,4 @@ +#pragma once + +// Report an error and exit -1 +void error(const char* format, ...); \ No newline at end of file diff --git a/test/stats.table b/test/stats.table new file mode 100644 index 0000000..e69de29