diff --git a/src/bqsr/baq.cpp b/src/bqsr/baq.cpp index a7c7488..b56d8d6 100644 --- a/src/bqsr/baq.cpp +++ b/src/bqsr/baq.cpp @@ -6,7 +6,7 @@ vector BAQ::qual2prob(256); // 质量分数转化概率 vector>> BAQ::EPSILONS(256, vector>(256, vector(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual] // 计算baq数组,返回成功与否 -bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector& baqArray) { +bool BAQ::calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector& baqArray) { // 检测ref是否覆盖了read if (ref.size() < refOffset + ad.read_len) { spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})", diff --git a/src/bqsr/baq.h b/src/bqsr/baq.h index 247217a..a57b722 100644 --- a/src/bqsr/baq.h +++ b/src/bqsr/baq.h @@ -15,6 +15,7 @@ #include #include "util/bam_wrap.h" +#include "util/sam_data.h" using std::vector; using std::string; @@ -84,5 +85,5 @@ struct BAQ { double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; } // 计算baq数组,返回成功与否 - bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector& baqArray); + bool calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector& baqArray); }; \ No newline at end of file diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index ee073b5..b3bc42a 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -39,6 +39,10 @@ Date : 2023/10/23 #include "util/math/math_utils.h" #include "quant_info.h" #include "util/debug.h" +#include "util/stable_array.h" +#include "util/sam_data.h" +#include "util/read_transformer.h" +#include "util/base_utils.h" using std::deque; @@ -47,8 +51,6 @@ using std::deque; const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@'; -const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; - // 解析knownSites struct VCFParser { deque knownSites; // 已知的变异位点 @@ -157,222 +159,6 @@ bool bqsrReadFilterOut(const bam1_t *b) { return false; } -// 该操作符是否消耗read的碱基 -bool consumeReadBases(char cigar) { - return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; -} - -// 该操作符是否消耗参考基因组的碱基 -bool consumeRefBases(char cigar) { - return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; -} - -// 给定一个ref位置,在read内部找到对应的位置和操作符 -struct PosAndOperator { - int readPosAtRefCoord = -1; // read中的位置 - char cigarOperator = '0'; // cigar操作符 - int cigarIndex = -1; // cigar索引 - int cigarLen = 0; - int preCigarLen = 0; // 截止cigar之前的,消耗read base的长度 -}; - -/** - * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of - * the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned. - * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips. - * - * @param alignmentStart The soft start of the read on the reference - * @param cigar The read's cigar - * @param refCoord The target reference coordinate - * @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED}; - * if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate; - * if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last - * cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}. - */ -PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) { - PosAndOperator po; - if (refCoord < alignmentStart) { - return po; - } - int firstReadPosOfElement = 0; // inclusive - int firstRefPosOfElement = alignmentStart; // inclusive - int lastReadPosOfElement = 0; // exclusive - int lastRefPosOfElement = alignmentStart; // exclusive - - // advance forward through all the cigar elements until we bracket the reference coordinate - const uint32_t* cigar = bam_get_cigar(bw->b); - const bam1_core_t& bc = bw->b->core; - const int idx = bc.n_cigar - 1; - if (idx < 0) - return po; - for (int i = 0; i < bc.n_cigar; ++i) { - const char c = bam_cigar_opchr(cigar[i]); - const int len = bam_cigar_oplen(cigar[i]); - firstReadPosOfElement = lastReadPosOfElement; - firstRefPosOfElement = lastRefPosOfElement; - lastReadPosOfElement += consumeReadBases(c) ? len : 0; - lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0; - if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element - int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0); - return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement}; - } - } - return po; -} - -// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量 -void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) { - int start, stop; - // Determine the read coordinate to start and stop hard clipping - if (refStart < 0) { - if (refStop < 0) return; - PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop); - // if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here - // is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine. - stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1); - sd.left_clip = stop + 1; - sd.cigar_start = stopPosAndOperator.cigarIndex; - sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen; - } else { - if (refStop >= 0) return; - // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids - // overclipping - PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart); - start = startPosAndOperator.readPosAtRefCoord; - sd.right_clip = bw->b->core.l_qseq - start; - sd.cigar_end = startPosAndOperator.cigarIndex + 1; - sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start; - } -} - -// 计算切掉adapter之后,ref相对原始ref的偏移量 -void calculateRefOffset(BamWrap *bw, SamData &sd) { - const uint32_t* cigar = bam_get_cigar(bw->b); - const bam1_core_t& bc = bw->b->core; - int i = 0; - for (i = 0; i < sd.cigar_start; ++i) { - const char c = bam_cigar_opchr(cigar[i]); - int len = bam_cigar_oplen(cigar[i]); - if (consumeRefBases(c)) { - sd.ref_offset += len; - } - } - const char c = bam_cigar_opchr(cigar[i]); - if (consumeRefBases(c)) { - sd.ref_offset += sd.first_cigar_clip; - } -} - -// 计算clip处理之后,剩余的碱基 -void calculateReadBases(BamWrap* bw, SamData& sd) { - sd.bases.resize(sd.read_len); - sd.base_quals.resize(sd.read_len); - uint8_t* seq = bam_get_seq(bw->b); - uint8_t* quals = bam_get_qual(bw->b); - for (int i = 0; i < sd.read_len; ++i) { - sd.bases[i] = cBaseToChar[bam_seqi(seq, i + sd.left_clip)]; - sd.base_quals[i] = quals[i + sd.left_clip]; - } -} - -// 计算read两端clip之后的softstart和softend -void calculateSoftStartEnd(BamWrap* bw, SamData& sd) { - int64_t softStart = bw->b->core.pos + sd.ref_offset; - int64_t softEnd = softStart - 1; // 闭区间 - const uint32_t* cigar = bam_get_cigar(bw->b); - const bam1_core_t& bc = bw->b->core; - int cigar_start = sd.cigar_start; - int cigar_end = sd.cigar_end; - bool rightTail = false; - for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { - const char c = bam_cigar_opchr(cigar[i]); - int len = bam_cigar_oplen(cigar[i]); - if (i == sd.cigar_start) len -= sd.first_cigar_clip; - if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; - // if (c == 'S' || c == 'I') - if (c == 'S') - softStart -= len; - else if (c != 'H') - rightTail = true; - if (rightTail) { - if (consumeRefBases(c) || c == 'S') - softEnd += len; - } - } - sd.softStart() = softStart; - sd.softEnd() = softEnd; -} - -// 计算clip之后的cigar -void calculateCigar(BamWrap* bw, SamData& sd) { - sd.cigars.clear(); - const uint32_t* cigar = bam_get_cigar(bw->b); - const bam1_core_t& bc = bw->b->core; - int cigar_start = sd.cigar_start; - int cigar_end = sd.cigar_end; - for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { - char c = bam_cigar_opchr(cigar[i]); - int len = bam_cigar_oplen(cigar[i]); - if (i == sd.cigar_start) - len -= sd.first_cigar_clip; - if (i == sd.cigar_end - 1) - len -= sd.last_cigar_clip; - //if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion - if (i == sd.cigar_start && c == 'D') { // 跳过开头的deletion - c = 'H'; - // sd.ref_offset += len; - sd.start_pos += len; - } - sd.cigars.push_back({c, len}); - } - //for(auto &cigar : sd.cigars) { - // spdlog::info("op: {}, len: {}", cigar.op, cigar.len); - //} -} - -// 计算read两端softclip的碱基数量,可能会修改ad里的clip值 -void calculateSoftClip(BamWrap *bw, SamData &sd) { - const uint32_t* cigar = bam_get_cigar(bw->b); - const bam1_core_t& bc = bw->b->core; - int readIndex = sd.left_clip; - int cutLeft = -1; // first position to hard clip (inclusive) - int cutRight = -1; // first position to hard clip (inclusive) - int cigar_start = sd.cigar_start; - int cigar_end = sd.cigar_end; - bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail - - for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { - const char c = bam_cigar_opchr(cigar[i]); - int len = bam_cigar_oplen(cigar[i]); - if (i == sd.cigar_start) len -= sd.first_cigar_clip; - if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; - if (c == 'S') { - if (rightTail) { - cutRight = readIndex; - cigar_end = i; - } else { - cutLeft = readIndex + len - 1; - cigar_start = i + 1; - } - } else if (c != 'H') { - rightTail = true; - } - if (consumeReadBases(c)) { - readIndex += len; - } - } - if (cutRight >= 0) { - sd.right_clip = bw->b->core.l_qseq - cutRight; - sd.cigar_end = cigar_end; - sd.last_cigar_clip = 0; - } - if (cutLeft >= 0) { - sd.left_clip = cutLeft + 1; - sd.cigar_start = cigar_start; - sd.first_cigar_clip = 0; - } -} - // 读取给定区间的reference static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) { if (aux.ref_seq != NULL) @@ -386,22 +172,23 @@ static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interva } // 设置某个位置是indel -inline void updateIndel(vector &isIndel, int index) { +inline void updateIndel(StableArray& isIndel, int index) { if (index >=0 && index < isIndel.size()) { isIndel[index] = 1; } } // 计算该read的每个碱基位置是否是SNP或Indel -int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector &isSNP, vector &isIns, vector &isDel) { +int calculateIsSNPOrIndel(AuxVar& aux, SamData& sd, StableArray& isSNP, StableArray& isIns, StableArray& isDel) { + isSNP.resize(sd.read_len, 0); + isIns.resize(sd.read_len, 0); + isDel.resize(sd.read_len, 0); // 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤 - //Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间 Interval interval{sd.start_pos, sd.end_pos}; // 闭区间 PROF_START(ref); read_ref_base(aux, interval.left, interval); PROF_END(gprof[GP_read_ref], ref); string refBases(aux.ref_seq); - // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos()); // 2. 遍历cigar,计算每个碱基是否是SNP或Indel int readPos = 0, refPos = 0, nEvents = 0; @@ -412,7 +199,6 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector &isSNP, vector 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); } isSNP[readPos] = snpInt; nEvents += snpInt; readPos++; @@ -439,59 +225,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector &isSNP, vectorb); - // const bam1_core_t& bc = bw->b->core; - // uint8_t* seq = bam_get_seq(bw->b); - // for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { - // const char c = bam_cigar_opchr(cigar[i]); - // int len = bam_cigar_oplen(cigar[i]); - // if (i == sd.cigar_start) len -= sd.first_cigar_clip; - // if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; - // if (c == 'M' || c == '=' || c == 'X') { - // for (int j = 0; j < len; ++j) { - // // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip - // int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1; - // // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); } - // isSNP[readPos] = snpInt; - // nEvents += snpInt; - // readPos++; - // refPos++; - // } - // } else if (c == 'D') { - // // 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del - // int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1; - // updateIndel(isDel, index); - // refPos += len; - // } else if (c == 'N') { - // refPos += len; - // } else if (c == 'I') { - // // 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins - // bool forwardStrandRead = !bw->GetReadNegativeStrandFlag(); - // if (forwardStrandRead) { - // updateIndel(isIns, readPos - 1); - // } - // readPos += len; - // if (!forwardStrandRead) { - // updateIndel(isIns, readPos); - // } - // } else if (c == 'S') { - // readPos += len; - // } - // } - nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0); - // spdlog::info("nEvents: {}", nEvents); - - //spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0), - // std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents); - // exit(0); return nEvents; } // 简单计算baq数组,就是全部赋值为'@' (64) -bool flatBAQArray(SamData& sd, vector& baqArray) { +bool flatBAQArray(SamData& sd, StableArray& baqArray) { baqArray.resize(sd.read_len, (uint8_t)'@'); return true; } @@ -514,12 +254,13 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li } // 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 -void calculateKnownSites(SamData& sd, vector &vcfs, vector &knownSites) { +void calculateKnownSites(SamData& sd, vector& vcfs, StableArray& knownSites) { BamWrap* bw = sd.bw; int tid = bw->contig_id(); uint64_t startPos = bw->start_pos(); // 闭区间 uint64_t endPos = bw->end_pos(); // 闭区间 - // spdlog::info("bam {}, {}", startPos, endPos); + knownSites.resize(sd.read_len, 0); + // update vcfs for(auto &vcf : vcfs) { // 清理旧的interval @@ -533,15 +274,11 @@ void calculateKnownSites(SamData& sd, vector &vcfs, vector &kno } if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue; - // spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right); - // exit(0); - //spdlog::info("before intervals : {}", vcf.knownSites.size()); // 读取新的interval int64_t fpos, flen; endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos); Interval readIntv(startPos, endPos); vcf.index.SearchInterval(startPos, endPos, &fpos, &flen); - //spdlog::info("file index: {}, {}", fpos, flen); if (flen > 0) { vcf.inStm.seekg(fpos, ios::beg); if (flen > vcf.bufLen) { @@ -565,20 +302,11 @@ void calculateKnownSites(SamData& sd, vector &vcfs, vector &kno Interval varIntv(varStart, varStart + ref.size() - 1); if (readIntv.overlaps(varIntv)) { vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间 - //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size()); } get_line_from_buf(buf, flen, &cur, &line); } } - //spdlog::info("after intervals : {}", vcf.knownSites.size()); - //for(auto &val : vcf.knownSites) { - // spdlog::info("intv {}, {}", val.left, val.right); - //} } - //exit(0); - knownSites.resize(sd.read_len); - endPos = bw->end_pos(); - for(auto &vcf : vcfs) { for (auto &intv : vcf.knownSites) { // knownSite is outside clipping window for the read, ignore @@ -602,11 +330,10 @@ void calculateKnownSites(SamData& sd, vector &vcfs, vector &kno } } } - } // 应该是计算一段数据的平均值 -static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector& errorArr, vector& fracErrs) { +static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, StableArray& errorArr, StableArray& fracErrs) { int totalErrors = 0; for (int j = max(0, blockStartIndex - 1); j <= i; j++) { totalErrors += errorArr[j]; @@ -617,9 +344,8 @@ static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector& errorArr, vector& baqArr, vector& fracErrs) { - // for (auto val : errorArr) { if (val > 0) spdlog::info("snp err val: {}", val); } - fracErrs.resize(baqArr.size()); +void calculateFractionalErrorArray(StableArray& errorArr, StableArray& baqArr, StableArray& fracErrs) { + fracErrs.resize(baqArr.size(), 0.0); // errorArray和baqArray必须长度相同 const int BLOCK_START_UNSET = -1; bool inBlock = false; @@ -805,11 +531,8 @@ int SerialBQSR() { ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name; } - int test = 0; - - while (1) { + while (true) { ++ round; - // 一. 读取bam数据 size_t readNum = 0; if (inBamBuf.ReadStat() >= 0) @@ -818,52 +541,38 @@ int SerialBQSR() { break; } auto bams = inBamBuf.GetBamArr(); - spdlog::info("{} reads processed in {} round, {}", readNum, round, test); + spdlog::info("{} reads processed in {} round", readNum, round); // 二. 遍历每个bam(read)记录,进行处理 + SamData sd; + StableArray isSNP, isIns, isDel; // 该位置是否是SNP, indel位置,0不是,1是 + StableArray baqArray; + StableArray snpErrors, insErrors, delErrors; + StableArray skips; // 该位置是否是已知位点 for (int i = 0; i < bams.size(); ++i) { // 1. 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步 // 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理 // 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略 - // 4. 对read的两端进行检测,去除(hardclip)adapter // spdlog::info("bam idx: {}", i); BamWrap* bw = bams[i]; - SamData sd; + sd.init(); + sd.parseBasic(bw); sd.rid = i + readNumSum; - sd.bw = bw; - sd.read_len = BamWrap::BamEffectiveLength(bw->b); - sd.cigar_end = bw->b->core.n_cigar; if (sd.read_len <= 0) continue; - - int adapter_boundary = bw->GetAdapterBoundary(); - if (bw->IsAdapterInRead(adapter_boundary)) { - // adapter在read范围内 - if (bw->GetReadNegativeStrandFlag()) { // 反链 - clipByReferenceCoordinates(bw, -1, adapter_boundary, sd); - } else { // 正链 - clipByReferenceCoordinates(bw, adapter_boundary, -1, sd); - } - } - sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 + + PROF_START(clip_read); + // 4. 对read的两端进行检测,去除(hardclip)adapter + ReadTransformer::hardClipAdaptorSequence(bw, sd); + if (sd.read_len <= 0) continue; // 5. 然后再去除softclip部分 - calculateSoftClip(bw, sd); - sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 + ReadTransformer::hardClipSoftClippedBases(bw, sd); if (sd.read_len <= 0) continue; - - calculateRefOffset(bw, sd); // 计算ref_offset,就是相对比对的position,要将ref右移多少 - calculateReadBases(bw, sd); // 计算clip处理之后,剩余的碱基 - // 计算clip之后,两端的softstart和softend - calculateSoftStartEnd(bw, sd); - calculateCigar(bw, sd); - - //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq, - // sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str()); + // 应用所有的变换,计算samdata的相关信息 + sd.applyTransformations(); + PROF_END(gprof[GP_clip_read], clip_read); // 6. 更新每个read的platform信息,好像没啥用,暂时忽略 - vector isSNP(sd.read_len, 0); // 该位置是否是SNP位置,0不是,1是 - vector isIns(sd.read_len, 0); // 该位置是否是插入位置,0不是,1是 - vector isDel(sd.read_len, 0); // 该位置是否是删除位置,0不是,1是 const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel); /*fprintf(gf[0], "%d\t", sd.read_len); @@ -877,16 +586,11 @@ int SerialBQSR() { fprintf(gf[2], "\n"); */ - // spdlog::info("nErrors: {}", nErrors); - // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); } - - //exit(0); - // 7. 计算baqArray // BAQ = base alignment quality // note for efficiency reasons we don't compute the BAQ array unless we actually have // some error to marginalize over. For ILMN data ~85% of reads have no error - vector baqArray; + // vector baqArray; bool baqCalculated = false; if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) { baqCalculated = flatBAQArray(sd, baqArray); @@ -922,13 +626,7 @@ int SerialBQSR() { // } // fprintf(gf[3], "\n"); - //test = readCovariates[1][0][0] + readCovariates[2][1][3]; - //int end_pos = bw->contig_end_pos(); - //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, - // bw->GetReadNegativeStrandFlag() ? "reverse" : "forward"); - // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); } // 9. 计算这条read需要跳过的位置 - vector skips(sd.read_len, 0); PROF_START(known_sites); calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips); for (int ii = 0; ii < sd.read_len; ++ii) { @@ -936,27 +634,19 @@ int SerialBQSR() { sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; } PROF_END(gprof[GP_read_vcf], known_sites); - // fprintf(gf[0], "%d\t", sd.read_len); + // fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len); // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0); // fprintf(gf[0], "\n"); // 10. 根据BAQ进一步处理snp,indel,得到处理后的数据 - vector snpErrors, insErrors, delErrors; - // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-2: {}", val); } calculateFractionalErrorArray(isSNP, baqArray, snpErrors); calculateFractionalErrorArray(isIns, baqArray, insErrors); calculateFractionalErrorArray(isDel, baqArray, delErrors); - // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); } - //spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len); - //for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); } - // aggregate all of the info into our info object, and update the data // 11. 合并之前计算的数据,得到info,并更新bqsr table数据 ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors); - int m = 0; - // for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); } - //exit(0); + PROF_START(update_info); updateRecalTablesForRead(info); PROF_END(gprof[GP_update_info], update_info); diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index 2e4ee2c..a2c0ab9 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -99,7 +99,7 @@ static char SimpleComplement(const char base) { } } -static void ClipLowQualEndsWithN(string& bases, const FastArray &quals, uint8_t lowQTail, bool isNegativeStrand) { +static void ClipLowQualEndsWithN(string& bases, const StableArray &quals, uint8_t lowQTail, bool isNegativeStrand) { // 处理左边 int left = 0; int readLen = bases.size(); @@ -245,7 +245,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar const int originalReadLength = sd.read_len; // store the original bases and then write Ns over low quality ones - string strandedClippedBases(sd.bases); + string strandedClippedBases(sd.bases.arr.data(), sd.read_len); // GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值 GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的? // spdlog::info("bases: {}", strandedClippedBases); diff --git a/src/bqsr/covariate.h b/src/bqsr/covariate.h index d7d7f2b..2f813c8 100644 --- a/src/bqsr/covariate.h +++ b/src/bqsr/covariate.h @@ -19,6 +19,7 @@ #include "bqsr_args.h" #include "util/bam_wrap.h" +#include "util/sam_data.h" using std::map; using std::string; diff --git a/src/bqsr/read_recal_info.h b/src/bqsr/read_recal_info.h index 1915a89..2888be9 100644 --- a/src/bqsr/read_recal_info.h +++ b/src/bqsr/read_recal_info.h @@ -9,19 +9,23 @@ #pragma once -#include "util/bam_wrap.h" +#include + #include "covariate.h" +#include "util/bam_wrap.h" +#include "util/stable_array.h" +using std::vector; struct ReadRecalInfo { SamData& read; int length; PerReadCovariateMatrix& covariates; - vector& skips; - FastArray&base_quals, &ins_quals, &del_quals; - vector&snp_errs, &ins_errs, &del_errs; + StableArray& skips; + StableArray&base_quals, &ins_quals, &del_quals; + StableArray&snp_errs, &ins_errs, &del_errs; - ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, vector& _skips, vector& _snp_errs, vector& _ins_errs, - vector& _del_errs) + ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, StableArray& _skips, StableArray& _snp_errs, + StableArray& _ins_errs, StableArray& _del_errs) : read(_read), covariates(_covariates), skips(_skips), diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index 00e7feb..4bd1ba5 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -21,111 +21,6 @@ using namespace std; -struct Cigar { - char op = '0'; - int len = 0; - // 该操作符是否消耗read的碱基 - static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; } - // 该操作符是否消耗参考基因组的碱基 - static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; } -}; - -struct ReadIdxCigar { - int readIdx = -1; // 在read序列中的位置 - char cigarOp = '0'; // 当前位置对应的cigar -}; - -// 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作 -template -struct FastArray { - vector arr; - size_t idx; - void clear() { idx = 0; } - size_t size() { return idx; } - bool empty() { return idx == 0; } - void reserve(size_t _size) { arr.reserve(_size); } - void resize(size_t _size) { - arr.resize(_size); - idx = _size; - } - void push_back(const T& val) { - if (idx < arr.size()) { - arr[idx++] = val; - } else { - arr.push_back(val); - idx++; - } - } - inline T& operator[](size_t pos) { return arr[pos]; } - inline const T& operator[](size_t pos) const { return arr[pos]; } - struct iterator { - typename std::vector::iterator it; - iterator(typename std::vector::iterator _it) : it(_it) {} - iterator& operator++() { ++it; return *this;} - iterator& operator--() { --it; return *this;} - T& operator*() const { return *it; } - bool operator!=(const iterator& other) const { return it != other.it; } - bool operator==(const iterator& other) const { return it == other.it; } - }; - - iterator begin() { return arr.begin(); } - iterator end() { return arr.begin() + idx; } -}; - -// 对原始bam数据的补充,比如对两端进行hardclip等 -class BamWrap; -struct SamData { - int64_t rid = 0; // for debug - int read_len = 0; // read长度,各种clip之后的长度 - int cigar_start = 0; // cigar起始位置,闭区间 - int cigar_end = 0; // cigar结束位置,开区间 - int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧 - int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧 - int left_clip = 0; // 左侧被切掉的碱基长度,BI和BD质量分数也会用到 - int right_clip = 0; // 右侧被切掉的碱基长度 - int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量 - - // 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑 - static constexpr int READ_INDEX_NOT_FOUND = -1; - - BamWrap* bw; - int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间 - int64_t end_pos; // 同上,闭区间 - string bases; // 处理之后的read的碱基 - FastArray base_quals; // 对应的质量分数 - FastArray ins_quals; // insert质量分数, BI (大部分应该都没有) - FastArray del_quals; // delete质量分数, BD (大部分应该都没有) - - FastArray cigars; - int64_t& softStart() { return start_pos; } - int64_t& softEnd() { return end_pos; } - - // functions - ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) { - ReadIdxCigar rc; - if (refPos < start_pos) - return rc; - int firstReadPosOfElement = 0; // inclusive - int firstRefPosOfElement = start_pos; // inclusive - int lastReadPosOfElement = 0; // exclusive - int lastRefPosOfElement = start_pos; // exclusive - // advance forward through all the cigar elements until we bracket the reference coordinate - for (auto& cigar : cigars) { - firstReadPosOfElement = lastReadPosOfElement; - firstRefPosOfElement = lastRefPosOfElement; - lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0; - lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0; - if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element - int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0); - rc.cigarOp = cigar.op; - rc.readIdx = readPosAtRefCoord; - return rc; - } - } - return rc; - } -}; - /* 这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明 这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。 diff --git a/src/util/base_utils.cpp b/src/util/base_utils.cpp new file mode 100644 index 0000000..01ca770 --- /dev/null +++ b/src/util/base_utils.cpp @@ -0,0 +1,3 @@ +#include "base_utils.h" + +const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; \ No newline at end of file diff --git a/src/util/base_utils.h b/src/util/base_utils.h new file mode 100644 index 0000000..d179b82 --- /dev/null +++ b/src/util/base_utils.h @@ -0,0 +1,20 @@ +/* + Description: 碱基相关的工具函数 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/29 +*/ +#pragma once + +struct BaseUtils { + // uint8_t转碱基字符 + static const char cBaseToChar[16]; + + // 该操作符是否消耗read的碱基 + static bool consumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; } + + // 该操作符是否消耗参考基因组的碱基 + static bool consumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; } +}; \ No newline at end of file diff --git a/src/util/profiling.cpp b/src/util/profiling.cpp index 55706ba..5d456da 100644 --- a/src/util/profiling.cpp +++ b/src/util/profiling.cpp @@ -54,6 +54,7 @@ int DisplayProfiling(int nthread) { // PRINT_GP(sort_wait); // PRINT_GP(markdup_wait); // PRINT_GP(intersect_wait); + PRINT_GP(clip_read); PRINT_GP(read_ref); PRINT_GP(read_vcf); PRINT_GP(covariate); diff --git a/src/util/profiling.h b/src/util/profiling.h index 3e3e9e4..0ee68e8 100644 --- a/src/util/profiling.h +++ b/src/util/profiling.h @@ -40,6 +40,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE]; enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 }; enum { GP_read_wait = 11, + GP_clip_read, GP_covariate, GP_read_ref, GP_read_vcf, diff --git a/src/util/read_transformer.h b/src/util/read_transformer.h new file mode 100644 index 0000000..39ac2ed --- /dev/null +++ b/src/util/read_transformer.h @@ -0,0 +1,154 @@ +/* + Description: read (sam record) 相关的工具函数,比如用于clipping低质量碱基等 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/29 +*/ +#pragma once + +#include "bam_wrap.h" +#include "sam_data.h" +#include "base_utils.h" + +// 用于对read进行各种转换操作,比如clipping等。注意这里都是逻辑操作,最后需要调用SamData.applyTransformations()来真正应用这些修改 +struct ReadTransformer { + // 给定一个ref位置,在read内部找到对应的位置和操作符 + struct PosAndOperator { + int readPosAtRefCoord = -1; // read中的位置 + char cigarOperator = '0'; // cigar操作符 + int cigarIndex = -1; // cigar索引 + int cigarLen = 0; + int preCigarLen = 0; // 截止cigar之前的,消耗read base的长度 + }; + + /** + * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of + * the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned. + * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips. + * + * @param alignmentStart The soft start of the read on the reference + * @param cigar The read's cigar + * @param refCoord The target reference coordinate + * @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED}; + * if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read + * coordinate; if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last cigar element + * is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}. + */ + static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) { + PosAndOperator po; + if (refCoord < alignmentStart) { + return po; + } + int firstReadPosOfElement = 0; // inclusive + int firstRefPosOfElement = alignmentStart; // inclusive + int lastReadPosOfElement = 0; // exclusive + int lastRefPosOfElement = alignmentStart; // exclusive + + // advance forward through all the cigar elements until we bracket the reference coordinate + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + const int idx = bc.n_cigar - 1; + if (idx < 0) + return po; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + firstReadPosOfElement = lastReadPosOfElement; + firstRefPosOfElement = lastRefPosOfElement; + lastReadPosOfElement += BaseUtils::consumeReadBases(c) ? len : 0; + lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0; + if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element + int readPosAtRefCoord = firstReadPosOfElement + (BaseUtils::consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0); + return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement}; + } + } + return po; + } + + // 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量 + static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) { + int start, stop; + // Determine the read coordinate to start and stop hard clipping + if (refStart < 0) { + if (refStop < 0) + return; + PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop); + // if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here + // is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine. + stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1); + sd.left_clip = stop + 1; + sd.cigar_start = stopPosAndOperator.cigarIndex; + sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen; + } else { + if (refStop >= 0) + return; + // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion + // avoids overclipping + PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart); + start = startPosAndOperator.readPosAtRefCoord; + sd.right_clip = bw->b->core.l_qseq - start; + sd.cigar_end = startPosAndOperator.cigarIndex + 1; + sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start; + } + } + + // 切掉adaper序列,注意这里的clipping只是逻辑上的,实际并没有修改bam record + static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) { + int adapter_boundary = bw->GetAdapterBoundary(); + if (bw->IsAdapterInRead(adapter_boundary)) { + // adapter在read范围内 + if (bw->GetReadNegativeStrandFlag()) { // 反链 + clipByReferenceCoordinates(bw, -1, adapter_boundary, sd); + } else { // 正链 + clipByReferenceCoordinates(bw, adapter_boundary, -1, sd); + } + } + sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 + } + + // 计算read两端softclip的碱基数量,切掉softclip序列 + static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) { + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + int readIndex = sd.left_clip; + int cutLeft = -1; // first position to hard clip (inclusive) + int cutRight = -1; // first position to hard clip (inclusive) + int cigar_start = sd.cigar_start; + int cigar_end = sd.cigar_end; + bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail + + for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == sd.cigar_start) len -= sd.first_cigar_clip; + if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; + if (c == 'S') { + if (rightTail) { + cutRight = readIndex; + cigar_end = i; + } else { + cutLeft = readIndex + len - 1; + cigar_start = i + 1; + } + } else if (c != 'H') { + rightTail = true; + } + if (BaseUtils::consumeReadBases(c)) { + readIndex += len; + } + } + if (cutRight >= 0) { + sd.right_clip = bw->b->core.l_qseq - cutRight; + sd.cigar_end = cigar_end; + sd.last_cigar_clip = 0; + } + if (cutLeft >= 0) { + sd.left_clip = cutLeft + 1; + sd.cigar_start = cigar_start; + sd.first_cigar_clip = 0; + } + sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 + } +}; \ No newline at end of file diff --git a/src/util/read_utils.h b/src/util/read_utils.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/util/sam_data.h b/src/util/sam_data.h new file mode 100644 index 0000000..a260ee1 --- /dev/null +++ b/src/util/sam_data.h @@ -0,0 +1,178 @@ +/* + Description: 对原始bam进行部分数据解析,以更方便后续处理 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/29 +*/ +#pragma once + +#include + +#include "bam_wrap.h" +#include "stable_array.h" +#include "base_utils.h" + +// 对cigar进行简单包装 +struct Cigar { + char op = '0'; + int len = 0; + // 该操作符是否消耗read的碱基 + static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; } + // 该操作符是否消耗参考基因组的碱基 + static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; } +}; + +// 在read中某个特定位置对应的cigar操作符 +struct ReadIdxCigar { + int readIdx = -1; // 在read序列中的位置 + char cigarOp = '0'; // 当前位置对应的cigar +}; + +// 对原始bam进行部分数据解析,以更方便后续处理 +struct SamData { + int64_t rid = 0; // for debug + + // 这些数据用于最开始clipping等操作 + int read_len = 0; // read长度,各种clip之后的长度 + int cigar_start = 0; // cigar起始位置,闭区间 + int cigar_end = 0; // cigar结束位置,开区间 + int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧 + int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧 + int left_clip = 0; // 左侧被切掉的碱基长度,BI和BD质量分数也会用到 + int right_clip = 0; // 右侧被切掉的碱基长度 + int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量 + + // 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑 + static constexpr int READ_INDEX_NOT_FOUND = -1; + + BamWrap* bw; + int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间 + int64_t end_pos; // 同上,闭区间 + StableArray bases; // 处理之后的read的碱基数组形式 + StableArray base_quals; // 对应的质量分数 + StableArray ins_quals; // insert质量分数, BI (大部分应该都没有) + StableArray del_quals; // delete质量分数, BD (大部分应该都没有) + StableArray cigars; + + int64_t& softStart() { return start_pos; } + int64_t& softEnd() { return end_pos; } + + // functions + + // 初始化 + void init() { + bases.clear(); + base_quals.clear(); + ins_quals.clear(); + del_quals.clear(); + cigars.clear(); + rid = 0; + read_len = 0; + cigar_start = 0; + cigar_end = 0; + first_cigar_clip = 0; + last_cigar_clip = 0; + left_clip = 0; + right_clip = 0; + ref_offset = 0; + bw = nullptr; + start_pos = 0; + end_pos = 0; + } + + // 初步解析bam + void parseBasic(BamWrap *_bw) { + bw = _bw; + read_len = BamWrap::BamEffectiveLength(bw->b); + cigar_end = bw->b->core.n_cigar; + } + + // 应用各种转换操作,更新SamData中的数据 + void applyTransformations() { + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + int i = 0; + + // 计算ref_offset,就是相对比对的position,要将ref右移多少 + for (i = 0; i < cigar_start; ++i) { + if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) { + ref_offset += bam_cigar_oplen(cigar[i]); + } + } + if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) { + ref_offset += first_cigar_clip; + } + + // 计算clip处理之后,剩余的碱基 + bases.resize(read_len); + base_quals.resize(read_len); + uint8_t* seq = bam_get_seq(bw->b); + uint8_t* quals = bam_get_qual(bw->b); + for (i = 0; i < read_len; ++i) { + bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i + left_clip)]; + base_quals[i] = quals[i + left_clip]; + } + + // 计算read两端clip之后的softstart和softend,其实S之前都被切掉了 + int64_t softStart = bw->b->core.pos + ref_offset; + int64_t softEnd = softStart - 1; // 闭区间 + bool rightTail = false; + for (i = cigar_start; i < cigar_end; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == cigar_start) len -= first_cigar_clip; + if (i == cigar_end - 1) len -= last_cigar_clip; + // if (c == 'S' || c == 'I') + if (c == 'S') softStart -= len; + else if (c != 'H') rightTail = true; + if (rightTail && (BaseUtils::consumeRefBases(c) || c == 'S')) softEnd += len; + } + this->softStart() = softStart; // 其实这里的softStart就是start + this->softEnd() = softEnd; // 同上 + + // 计算clip之后的cigar,其实可以考虑下边的代码和上边的换一下位置 + for (i = cigar_start; i < cigar_end; ++i) { + char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == cigar_start) len -= first_cigar_clip; + if (i == cigar_end - 1) len -= last_cigar_clip; + // if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion + if (i == cigar_start && c == 'D') { // 跳过开头的deletion + c = 'H'; + ref_offset += len; + start_pos += len; // 更新起始位置 + } else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion + c = 'H'; + softEnd -= len; // 更新结束位置 + } + cigars.push_back({c, len}); + } + } + + // 给定一个ref pos,返回对应的read index和cigar操作 + ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) { + ReadIdxCigar rc; + if (refPos < start_pos) + return rc; + int firstReadPosOfElement = 0; // inclusive + int firstRefPosOfElement = start_pos; // inclusive + int lastReadPosOfElement = 0; // exclusive + int lastRefPosOfElement = start_pos; // exclusive + // advance forward through all the cigar elements until we bracket the reference coordinate + for (auto& cigar : cigars) { + firstReadPosOfElement = lastReadPosOfElement; + firstRefPosOfElement = lastRefPosOfElement; + lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0; + lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0; + if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element + int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0); + rc.cigarOp = cigar.op; + rc.readIdx = readPosAtRefCoord; + return rc; + } + } + return rc; + } +}; \ No newline at end of file diff --git a/src/util/stable_array.h b/src/util/stable_array.h new file mode 100644 index 0000000..d10800a --- /dev/null +++ b/src/util/stable_array.h @@ -0,0 +1,73 @@ +/* + Description: 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/29 +*/ +#pragma once + +#include +#include + +using std::vector; + +// 不能用于bool类型, 因为在c++中vector是特化过的 +template +struct StableArray { + vector arr; + size_t idx = 0; + void clear() { idx = 0; } + size_t size() { return idx; } + bool empty() { return idx == 0; } + void reserve(size_t _size) { + if (arr.size() < _size) + arr.reserve(_size); + } + void resize(size_t _size) { + if (arr.size() < _size) { + arr.resize(_size); + } + idx = _size; + } + + void resize(size_t _size, const T &val) { + if (arr.size() < _size) { + arr.resize(_size); + } + for (size_t i = 0; i < _size; ++i) { + arr[i] = val; + } + idx = _size; + } + void push_back(const T& val) { + if (idx < arr.size()) { + arr[idx++] = val; + } else { + arr.push_back(val); + idx++; + } + } + inline T& operator[](size_t pos) { return arr[pos]; } + //inline const T& operator[](size_t pos) { return arr[pos]; } + inline const T& operator[](size_t pos) const { return arr[pos]; } + struct iterator { + typename std::vector::iterator it; + iterator(typename std::vector::iterator _it) : it(_it) {} + iterator& operator++() { + ++it; + return *this; + } + iterator& operator--() { + --it; + return *this; + } + T& operator*() const { return *it; } + bool operator!=(const iterator& other) const { return it != other.it; } + bool operator==(const iterator& other) const { return it == other.it; } + }; + + iterator begin() { return arr.begin(); } + iterator end() { return arr.begin() + idx; } +}; \ No newline at end of file