搭建串行框架
This commit is contained in:
parent
14f1611ab6
commit
0fca937fab
|
|
@ -7,6 +7,7 @@
|
|||
build.sh
|
||||
run.sh
|
||||
test.sh
|
||||
*.log
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin")
|
|||
# source codes path
|
||||
aux_source_directory(${PROJECT_SOURCE_DIR}/src MAIN_SRC)
|
||||
aux_source_directory(${PROJECT_SOURCE_DIR}/src/util UTIL_SRC)
|
||||
aux_source_directory(${PROJECT_SOURCE_DIR}/src/markdup MARKDUP_SRC)
|
||||
aux_source_directory(${PROJECT_SOURCE_DIR}/src/bqsr BQSR_SRC)
|
||||
|
||||
set(KTHREAD_FILE ${PROJECT_SOURCE_DIR}/ext/klib/kthread.c)
|
||||
|
||||
|
|
@ -20,7 +20,7 @@ link_directories("${PROJECT_SOURCE_DIR}/ext/htslib")
|
|||
set(PG_NAME "fastbqsr")
|
||||
|
||||
# dependency files
|
||||
add_executable(${PG_NAME} ${MAIN_SRC} ${UTIL_SRC} ${MARKDUP_SRC} ${KTHREAD_FILE})
|
||||
add_executable(${PG_NAME} ${MAIN_SRC} ${UTIL_SRC} ${BQSR_SRC} ${KTHREAD_FILE})
|
||||
|
||||
# link htslib
|
||||
target_link_libraries(${PG_NAME} libhts.a)
|
||||
|
|
|
|||
|
|
@ -48,6 +48,9 @@ struct BQSRArg {
|
|||
|
||||
string PROGRAM_RECORD_ID = "FastBQSR";
|
||||
|
||||
// known sites vcf files
|
||||
vector<string> KNOWN_SITES_VCFS;
|
||||
|
||||
// end of common parameters
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -8,33 +8,38 @@ Author : Zhang Zhonghai
|
|||
Date : 2023/10/23
|
||||
*/
|
||||
#include <htslib/sam.h>
|
||||
#include <htslib/synced_bcf_reader.h>
|
||||
#include <htslib/thread_pool.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
|
||||
#include "dup_metrics.h"
|
||||
#include "fastbqsr_version.h"
|
||||
#include "bqsr_args.h"
|
||||
#include "bqsr_funcs.h"
|
||||
#include "bqsr_pipeline.h"
|
||||
#include "dup_metrics.h"
|
||||
#include "fastbqsr_version.h"
|
||||
#include "read_name_parser.h"
|
||||
#include "util/profiling.h"
|
||||
#include "util/utils.h"
|
||||
|
||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
||||
|
||||
namespace nsgv {
|
||||
|
||||
BQSRArg gBqsrArg; //
|
||||
|
||||
std::vector<ReadNameParser> gNameParsers; // read name parser
|
||||
DuplicationMetrics gMetrics; //
|
||||
DupResult gDupRes;
|
||||
PipelineArg gPipe(&gDupRes);
|
||||
|
||||
BQSRArg gBqsrArg; //
|
||||
samFile *gInBamFp; // bam
|
||||
sam_hdr_t *gInBamHeader; // bam
|
||||
samFile *gOutBamFp; // , sambam
|
||||
sam_hdr_t *gOutBamHeader; // header
|
||||
DuplicationMetrics gMetrics; //
|
||||
DupResult gDupRes;
|
||||
PipelineArg gPipe(&gDupRes);
|
||||
vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
|
||||
}; // namespace nsgv
|
||||
|
||||
//
|
||||
|
|
@ -55,14 +60,44 @@ static string getFileExtension(const string &filename) {
|
|||
return filename.substr(last_dot + 1);
|
||||
}
|
||||
|
||||
// entrance of mark duplicates
|
||||
int MarkDuplicates() {
|
||||
PROF_START(whole_process);
|
||||
/* */
|
||||
nsgv::gNameParsers.resize(nsgv::gBqsrArg.NUM_THREADS);
|
||||
for (auto &parser : nsgv::gNameParsers)
|
||||
parser.SetReadNameRegex(nsgv::gBqsrArg.READ_NAME_REGEX); // read nametile,x,y
|
||||
// 串行bqsr
|
||||
int SerialBQSR() {
|
||||
int round = 0;
|
||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
|
||||
int64_t readNumSum = 0;
|
||||
while (1) {
|
||||
++ round;
|
||||
size_t readNum = 0;
|
||||
if (inBamBuf.ReadStat() >= 0)
|
||||
readNum = inBamBuf.ReadBam();
|
||||
if (readNum < 1) {
|
||||
break;
|
||||
}
|
||||
spdlog::info("{} reads processed in {} round", readNum, round);
|
||||
|
||||
auto bams = inBamBuf.GetBamArr();
|
||||
spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end());
|
||||
// 1. 获取bams数组覆盖的region范围
|
||||
|
||||
// 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零)
|
||||
|
||||
// 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识)
|
||||
|
||||
// 4. 遍历bams数组中的每一条记录并进行处理
|
||||
|
||||
readNumSum += readNum;
|
||||
inBamBuf.ClearAll(); //
|
||||
}
|
||||
spdlog::info("read count: {}", readNumSum);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// entrance of mark duplicates
|
||||
int BaseRecalibrator() {
|
||||
|
||||
PROF_START(whole_process);
|
||||
/* bam */
|
||||
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
|
||||
if (!nsgv::gInBamFp) {
|
||||
|
|
@ -71,10 +106,11 @@ int MarkDuplicates() {
|
|||
}
|
||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
||||
|
||||
// (libraryId)
|
||||
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
|
||||
|
||||
/* */
|
||||
/* 并行读取bam数据 */
|
||||
htsThreadPool htsPoolRead = {NULL, 0}; // ,
|
||||
htsThreadPool htsPoolWrite = {NULL, 0}; //
|
||||
htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
|
||||
|
|
@ -86,286 +122,28 @@ int MarkDuplicates() {
|
|||
}
|
||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||
|
||||
/* */
|
||||
PROF_START(markdup_all);
|
||||
PipelineMarkDups();
|
||||
PROF_END(gprof[GP_markdup_all], markdup_all);
|
||||
|
||||
/* , */
|
||||
char modeout[12] = "wb";
|
||||
sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL);
|
||||
nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout);
|
||||
if (!nsgv::gOutBamFp) {
|
||||
spdlog::error("[{}] create output sam/bam file failed.\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader);
|
||||
// header
|
||||
sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", nsgv::gBqsrArg.PROGRAM_RECORD_ID.c_str(), "VN", FASTDUP_VERSION,
|
||||
"CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL);
|
||||
return SerialBQSR();
|
||||
|
||||
// // 读取known sites vcfs
|
||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||
// spdlog::info(" {}", ks);
|
||||
// bcf_srs_t* srs = bcf_sr_init();
|
||||
// if (!bcf_sr_add_reader(srs, ks.c_str()))
|
||||
// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
|
||||
// nsgv::gKnownSitesVcfSrs.push_back(srs);
|
||||
//
|
||||
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||
sam_close(nsgv::gInBamFp); // bam
|
||||
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
|
||||
if (!nsgv::gInBamFp) {
|
||||
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp);
|
||||
if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) {
|
||||
spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE);
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
return -1;
|
||||
}
|
||||
// index
|
||||
string indexFn = nsgv::gBqsrArg.OUTPUT_FILE + ".bai"; // min_shift = 0 bai
|
||||
if ("sam" == getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) {
|
||||
indexFn = "";
|
||||
}
|
||||
if (!indexFn.empty()) {
|
||||
int index_min_shift = 0;
|
||||
if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) {
|
||||
indexFn = nsgv::gBqsrArg.OUTPUT_FILE + ".csi";
|
||||
index_min_shift = 14;
|
||||
}
|
||||
if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
|
||||
spdlog::error("failed to open index \"{}\" for writing", indexFn);
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// ,
|
||||
BamBufType inBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||
inBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
|
||||
|
||||
DupIdxQueue<DupInfo> dupIdxQue, repIdxQue;
|
||||
DupIdxQueue<int64_t> opticalIdxQue;
|
||||
dupIdxQue.Init(&nsgv::gDupRes.dupIdxArr);
|
||||
repIdxQue.Init(&nsgv::gDupRes.repIdxArr);
|
||||
opticalIdxQue.Init(&nsgv::gDupRes.opticalDupIdxArr);
|
||||
spdlog::info("{} duplicate reads found", dupIdxQue.Size());
|
||||
spdlog::info("{} optical reads found", opticalIdxQue.Size());
|
||||
// spdlog::info("{} represent reads found", repIdxQue.Size());
|
||||
// dupIdxQue.RealSize("na12878.dup");
|
||||
// opticalIdxQue.RealSize("normal.odup");
|
||||
|
||||
// return 0;
|
||||
|
||||
uint64_t bamIdx = 0;
|
||||
DupInfo dupIdx = dupIdxQue.Pop();
|
||||
DupInfo repIdx = repIdxQue.Pop();
|
||||
uint64_t opticalIdx = opticalIdxQue.Pop();
|
||||
|
||||
ByteBuf bytes;
|
||||
bam1_t *bp = bam_init1();
|
||||
bool isDup = false;
|
||||
bool isOpticalDup = false;
|
||||
bool isInDuplicateSet = false;
|
||||
uint32_t representativeReadIndexInFile = 0;
|
||||
uint32_t duplicateSetSize = 0;
|
||||
|
||||
int64_t realDupSize = 0;
|
||||
|
||||
PROF_START(write);
|
||||
while (inBuf.ReadStat() >= 0) {
|
||||
PROF_START(final_read);
|
||||
size_t readNum = inBuf.ReadBam();
|
||||
PROF_END(gprof[GP_final_read], final_read);
|
||||
// PROF_PRINT_START(read_write);
|
||||
for (size_t i = 0; i < readNum; ++i) {
|
||||
BamWrap *bw = inBuf[i];
|
||||
if (bam_copy1(bp, bw->b) == nullptr) {
|
||||
spdlog::error("Can not copy sam record!");
|
||||
return -1;
|
||||
}
|
||||
bw->b = bp;
|
||||
isDup = false;
|
||||
isOpticalDup = false;
|
||||
isInDuplicateSet = false;
|
||||
|
||||
// duplicate tag
|
||||
if (nsgv::gBqsrArg.CLEAR_DT) {
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str());
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
}
|
||||
|
||||
// while (bcf_sr_next_line(srs)) {
|
||||
// bcf1_t* line = srs->readers[0].buffer[0];
|
||||
// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
if (bw->GetReadUnmappedFlag()) {
|
||||
++nsgv::gMetrics.UNMAPPED_READS;
|
||||
} else if (bw->IsSecondaryOrSupplementary()) {
|
||||
++nsgv::gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS;
|
||||
} else if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) {
|
||||
++nsgv::gMetrics.UNPAIRED_READS_EXAMINED;
|
||||
} else {
|
||||
++nsgv::gMetrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
|
||||
}
|
||||
// /* 先实现串行的bqsr-phase-1 */
|
||||
|
||||
/* */
|
||||
if (bamIdx == dupIdx) {
|
||||
++realDupSize; // for test
|
||||
isDup = true;
|
||||
if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) {
|
||||
isInDuplicateSet = true;
|
||||
representativeReadIndexInFile = dupIdx.GetRepIdx();
|
||||
duplicateSetSize = dupIdx.dupSet;
|
||||
}
|
||||
|
||||
// ,dupidx,duprepIdxdupSetSize
|
||||
while ((dupIdx = dupIdxQue.Pop()) == bamIdx);
|
||||
if (opticalIdx == bamIdx)
|
||||
isOpticalDup = true;
|
||||
else if (opticalIdx < bamIdx) {
|
||||
while ((opticalIdx = opticalIdxQue.Pop()) < bamIdx);
|
||||
if (opticalIdx == bamIdx)
|
||||
isOpticalDup = true;
|
||||
}
|
||||
|
||||
//
|
||||
bw->SetDuplicateReadFlag(true);
|
||||
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str());
|
||||
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
|
||||
|
||||
if (isOpticalDup)
|
||||
bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str(), 'Z',
|
||||
nsgv::gBqsrArg.DUPLICATE_TYPE_SEQUENCING.size() + 1,
|
||||
(const uint8_t *)nsgv::gBqsrArg.DUPLICATE_TYPE_SEQUENCING.c_str());
|
||||
else
|
||||
bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_TYPE_TAG.c_str(), 'Z',
|
||||
nsgv::gBqsrArg.DUPLICATE_TYPE_LIBRARY.size() + 1,
|
||||
(const uint8_t *)nsgv::gBqsrArg.DUPLICATE_TYPE_LIBRARY.c_str());
|
||||
|
||||
//
|
||||
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
|
||||
// Update the duplication metrics
|
||||
if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) {
|
||||
++nsgv::gMetrics.UNPAIRED_READ_DUPLICATES;
|
||||
} else {
|
||||
++nsgv::gMetrics.READ_PAIR_DUPLICATES; // will need to be divided by 2 at the end
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bw->SetDuplicateReadFlag(false);
|
||||
}
|
||||
if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent
|
||||
isInDuplicateSet = true;
|
||||
representativeReadIndexInFile = repIdx.GetRepIdx();
|
||||
duplicateSetSize = repIdx.dupSet;
|
||||
while (repIdx == bamIdx || repIdx.dupSet == 0) {
|
||||
if (repIdxQue.Size() > 0)
|
||||
repIdx = repIdxQue.Pop();
|
||||
else {
|
||||
repIdx = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) {
|
||||
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_INDEX_TAG.c_str());
|
||||
if (oldTagVal != NULL)
|
||||
bam_aux_del(bw->b, oldTagVal);
|
||||
bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_INDEX_TAG.c_str(), 'i',
|
||||
sizeof(representativeReadIndexInFile), (uint8_t *)&representativeReadIndexInFile);
|
||||
oldTagVal = bam_aux_get(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_SIZE_TAG.c_str());
|
||||
if (oldTagVal != NULL)
|
||||
bam_aux_del(bw->b, oldTagVal);
|
||||
bam_aux_append(bw->b, nsgv::gBqsrArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize),
|
||||
(const uint8_t *)&duplicateSetSize);
|
||||
}
|
||||
}
|
||||
// readoutput,,record
|
||||
++bamIdx;
|
||||
if (isDup && nsgv::gBqsrArg.REMOVE_DUPLICATES) {
|
||||
continue;
|
||||
}
|
||||
if (isOpticalDup && nsgv::gBqsrArg.REMOVE_SEQUENCING_DUPLICATES) {
|
||||
continue;
|
||||
}
|
||||
if (!nsgv::gBqsrArg.PROGRAM_RECORD_ID.empty() && nsgv::gBqsrArg.ADD_PG_TAG_TO_READS) {
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, "PG");
|
||||
if (oldTagVal != NULL)
|
||||
bam_aux_del(bw->b, oldTagVal);
|
||||
bam_aux_append(bw->b, "PG", 'Z', nsgv::gBqsrArg.PROGRAM_RECORD_ID.size() + 1,
|
||||
(const uint8_t *)nsgv::gBqsrArg.PROGRAM_RECORD_ID.c_str());
|
||||
}
|
||||
#if 1
|
||||
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
||||
spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str());
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
inBuf.ClearAll();
|
||||
// PROF_PRINT_END(read_write);
|
||||
spdlog::info("write {} reads to output", readNum);
|
||||
}
|
||||
bam_destroy1(bp);
|
||||
|
||||
//
|
||||
nsgv::gMetrics.READ_PAIRS_EXAMINED /= 2;
|
||||
nsgv::gMetrics.READ_PAIR_DUPLICATES /= 2;
|
||||
for (auto &arr : nsgv::gDupRes.opticalDupIdxArr) nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES += arr.size();
|
||||
nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES = nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES / 2;
|
||||
nsgv::gMetrics.ESTIMATED_LIBRARY_SIZE =
|
||||
estimateLibrarySize(nsgv::gMetrics.READ_PAIRS_EXAMINED - nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES,
|
||||
nsgv::gMetrics.READ_PAIRS_EXAMINED - nsgv::gMetrics.READ_PAIR_DUPLICATES);
|
||||
if (nsgv::gMetrics.UNPAIRED_READS_EXAMINED + nsgv::gMetrics.READ_PAIRS_EXAMINED != 0) {
|
||||
nsgv::gMetrics.PERCENT_DUPLICATION =
|
||||
(nsgv::gMetrics.UNPAIRED_READ_DUPLICATES + nsgv::gMetrics.READ_PAIR_DUPLICATES * 2) /
|
||||
(double)(nsgv::gMetrics.UNPAIRED_READS_EXAMINED + nsgv::gMetrics.READ_PAIRS_EXAMINED * 2);
|
||||
} else {
|
||||
nsgv::gMetrics.PERCENT_DUPLICATION = 0;
|
||||
}
|
||||
calculateRoiHistogram(nsgv::gMetrics);
|
||||
|
||||
//
|
||||
if (!nsgv::gBqsrArg.METRICS_FILE.empty()) {
|
||||
ofstream ofsM(nsgv::gBqsrArg.METRICS_FILE);
|
||||
ofsM << "## StringHeader" << endl;
|
||||
ofsM << "# " << nsgv::gBqsrArg.CLI_STR << endl;
|
||||
ofsM << "## StringHeader" << endl;
|
||||
ofsM << "# Started on: " << nsgv::gBqsrArg.START_TIME << endl << endl;
|
||||
ofsM << "## METRICS"
|
||||
<< endl;
|
||||
ofsM << "LIBRARY\tUNPAIRED_READS_EXAMINED\tREAD_PAIRS_EXAMINED\tSECONDARY_OR_SUPPLEMENTARY_RDS\tUNMAPPED_"
|
||||
"READS\tUNPAIRED_READ_DUPLICATES\tREAD_PAIR_DUPLICATES\tREAD_PAIR_OPTICAL_DUPLICATES\tPERCENT_"
|
||||
"DUPLICATION\tESTIMATED_LIBRARY_SIZE"
|
||||
<< endl;
|
||||
ofsM << nsgv::gMetrics.LIBRARY << "\t" << nsgv::gMetrics.UNPAIRED_READS_EXAMINED << "\t" << nsgv::gMetrics.READ_PAIRS_EXAMINED
|
||||
<< "\t" << nsgv::gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\t" << nsgv::gMetrics.UNMAPPED_READS << "\t"
|
||||
<< nsgv::gMetrics.UNPAIRED_READ_DUPLICATES << "\t" << nsgv::gMetrics.READ_PAIR_DUPLICATES << "\t"
|
||||
<< nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\t" << nsgv::gMetrics.PERCENT_DUPLICATION << "\t"
|
||||
<< nsgv::gMetrics.ESTIMATED_LIBRARY_SIZE << endl
|
||||
<< endl;
|
||||
ofsM << "## HISTOGRAM\tDouble" << endl;
|
||||
ofsM << "BIN CoverageMult" << endl;
|
||||
for (int i = 1; i <= 100; ++i) {
|
||||
ofsM << i << "\t" << std::fixed << std::setprecision(6) << nsgv::gMetrics.CoverageMult[i] << endl;
|
||||
}
|
||||
ofsM.close();
|
||||
}
|
||||
PROF_END(gprof[GP_write], write);
|
||||
|
||||
if (!indexFn.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) {
|
||||
spdlog::error("writing index failed");
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* , */
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
|
||||
PROF_END(gprof[GP_whole_process], whole_process);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#include "md_funcs.h"
|
||||
#include "bqsr_funcs.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
|
|
@ -13,7 +13,7 @@
|
|||
#include <vector>
|
||||
|
||||
#include "dup_metrics.h"
|
||||
#include "md_args.h"
|
||||
#include "bqsr_args.h"
|
||||
#include "read_ends.h"
|
||||
#include "read_name_parser.h"
|
||||
|
||||
|
|
@ -25,7 +25,7 @@ using std::unordered_map;
|
|||
using std::vector;
|
||||
|
||||
namespace nsgv {
|
||||
extern MarkDupsArg gBqsrArg;
|
||||
extern BQSRArg gBqsrArg;
|
||||
extern DuplicationMetrics gMetrics;
|
||||
};
|
||||
|
||||
|
|
@ -33,40 +33,8 @@ extern DuplicationMetrics gMetrics;
|
|||
* read
|
||||
*/
|
||||
int16_t computeDuplicateScore(BamWrap &bw) {
|
||||
int16_t score = 0;
|
||||
switch (nsgv::gBqsrArg.DUPLICATE_SCORING_STRATEGY) {
|
||||
case nsbqsr::SUM_OF_BASE_QUALITIES:
|
||||
// two (very) long reads worth of high-quality bases can go over
|
||||
// Short.MAX_VALUE/2 and risk overflow.
|
||||
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
|
||||
break;
|
||||
case nsbqsr::TOTAL_MAPPED_REFERENCE_LENGTH:
|
||||
if (!bw.GetReadUnmappedFlag())
|
||||
// no need to remember the score since this scoring mechanism is
|
||||
// symmetric
|
||||
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
|
||||
break;
|
||||
case nsbqsr::RANDOM:
|
||||
// The RANDOM score gives the same score to both reads so that they get
|
||||
// filtered together. it's not critical do use the readName since the
|
||||
// scores from both ends get added, but it seem to be clearer this way.
|
||||
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
|
||||
// subtract Short.MIN_VALUE/4 from it to end up with a number between
|
||||
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case
|
||||
// the read is not passing filters. We need to stay far from overflow so
|
||||
// that when we add the two scores from the two read mates we do not
|
||||
// overflow since that could cause us to chose a failing read-pair
|
||||
// instead of a passing one.
|
||||
score -= INT16_MIN / 4;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// make sure that filter-failing records are heavily discounted. (the
|
||||
// discount can happen twice, once for each mate, so need to make sure we do
|
||||
// not subtract more than Short.MIN_VALUE overall.)
|
||||
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
|
||||
|
||||
return score;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -90,8 +58,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd
|
|||
// Fill in the location information for optical duplicates
|
||||
if (!ReadNameParser::sWrongNameFormat)
|
||||
rnParser.AddLocationInformation(bw.query_name(), pKey);
|
||||
else
|
||||
nsgv::gBqsrArg.CHECK_OPTICAL_DUP = false;
|
||||
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
|
||||
// key
|
||||
k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#include "md_pipeline.h"
|
||||
#include "bqsr_pipeline.h"
|
||||
|
||||
#include <klib/kthread.h>
|
||||
#include <pthread.h>
|
||||
|
|
@ -9,8 +9,8 @@
|
|||
#include <vector>
|
||||
|
||||
#include "dup_metrics.h"
|
||||
#include "md_args.h"
|
||||
#include "md_funcs.h"
|
||||
#include "bqsr_args.h"
|
||||
#include "bqsr_funcs.h"
|
||||
#include "read_ends.h"
|
||||
#include "read_name_parser.h"
|
||||
#include "util/bam_buf.h"
|
||||
|
|
@ -22,7 +22,7 @@ using namespace std;
|
|||
|
||||
namespace nsgv {
|
||||
|
||||
extern MarkDupsArg gBqsrArg; //
|
||||
extern BQSRArg gBqsrArg; //
|
||||
extern samFile *gInBamFp; // bam
|
||||
extern sam_hdr_t *gInBamHeader; // bam
|
||||
extern DuplicationMetrics gMetrics; //
|
||||
|
|
@ -35,73 +35,6 @@ extern PipelineArg gPipe;
|
|||
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
|
||||
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
|
||||
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) {
|
||||
if (vpRe.size() < 2) {
|
||||
return;
|
||||
}
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) { // readend
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
|
||||
if (notDupIdx != nullptr) {
|
||||
notDupIdx->insert(pBest->read1IndexInFile);
|
||||
notDupIdx->insert(pBest->read2IndexInFile);
|
||||
}
|
||||
|
||||
if (nsgv::gBqsrArg.CHECK_OPTICAL_DUP) { //
|
||||
// trackOpticalDuplicates
|
||||
vector<const ReadEnds *> prevOpticalRe;
|
||||
if (notOpticalDupIdx != nullptr) {
|
||||
for (auto pe : vpRe) {
|
||||
if (pe->isOpticalDuplicate) {
|
||||
prevOpticalRe.push_back(pe);
|
||||
}
|
||||
}
|
||||
}
|
||||
trackOpticalDuplicates(vpRe, pBest);
|
||||
// ,
|
||||
if (notOpticalDupIdx != nullptr) {
|
||||
for (auto pe : prevOpticalRe) {
|
||||
if (!pe->isOpticalDuplicate) {
|
||||
notOpticalDupIdx->insert(pe->read1IndexInFile);
|
||||
notOpticalDupIdx->insert(pe->read2IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto pe : vpRe) { // best read
|
||||
if (pe != pBest) { // best
|
||||
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
|
||||
// read1
|
||||
// optical dup
|
||||
if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) {
|
||||
opticalDupIdx->insert(pe->read1IndexInFile);
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
opticalDupIdx->insert(pe->read2IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
// bamtag, bestdupset
|
||||
if (nsgv::gBqsrArg.TAG_DUPLICATE_SET_MEMBERS) {
|
||||
repIdx->insert(DupInfo(pBest->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size()));
|
||||
repIdx->insert(DupInfo(pBest->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size()));
|
||||
if (notRepIdx != nullptr) {
|
||||
for (auto pe : vpRe) {
|
||||
if (pe != pBest) {
|
||||
notRepIdx->insert(pe->read1IndexInFile);
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
notRepIdx->insert(pe->read2IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* pairedreadends, */
|
||||
|
|
|
|||
14
src/main.cpp
14
src/main.cpp
|
|
@ -64,7 +64,7 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
|||
"One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from "
|
||||
"analysis. This argument must be specified at least once.")
|
||||
.metavar("<KnownSites>")
|
||||
.required();
|
||||
.nargs(argparse::nargs_pattern::any);
|
||||
|
||||
program.add_argument("--create-index")
|
||||
.help("Whether to create an index when writing coordinate sorted BAM output.")
|
||||
|
|
@ -114,6 +114,12 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
|||
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
||||
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
|
||||
|
||||
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
||||
// spdlog::info("known sites vcf files:");
|
||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||
// spdlog::info(" {}", ks);
|
||||
// }
|
||||
|
||||
nsgv::gBqsrArg.INDEX_FORMAT =
|
||||
program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
|
||||
|
||||
|
|
@ -131,14 +137,14 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
int main_ApplyBQSR(int argc, char *argv[]) {}
|
||||
int main_ApplyBQSR(int argc, char* argv[]) { return 0; }
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// init log
|
||||
spdlog::set_default_logger(spdlog::stderr_color_st("fastdup"));
|
||||
spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr"));
|
||||
spdlog::cfg::load_env_levels();
|
||||
|
||||
string bqsr_prog = argv[0];
|
||||
string bqsr_prog = argv[1];
|
||||
if (bqsr_prog == "BaseRecalibrator") {
|
||||
return main_BaseRecalibrator(argc - 1, argv + 1);
|
||||
} else if (bqsr_prog == "ApplyBQSR") {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,13 @@
|
|||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <strings.h>
|
||||
|
||||
void error(const char* format, ...) {
|
||||
va_list ap;
|
||||
va_start(ap, format);
|
||||
vfprintf(stderr, format, ap);
|
||||
va_end(ap);
|
||||
exit(-1);
|
||||
}
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
#pragma once
|
||||
|
||||
// Report an error and exit -1
|
||||
void error(const char* format, ...);
|
||||
Loading…
Reference in New Issue