重构了一下,大数据问题还没解决

This commit is contained in:
zzh 2025-12-29 19:36:38 +08:00
parent 1864736509
commit 1e5a291eb7
15 changed files with 483 additions and 462 deletions

View File

@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
// 计算baq数组返回成功与否
bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
bool BAQ::calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
// 检测ref是否覆盖了read
if (ref.size() < refOffset + ad.read_len) {
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",

View File

@ -15,6 +15,7 @@
#include <vector>
#include "util/bam_wrap.h"
#include "util/sam_data.h"
using std::vector;
using std::string;
@ -84,5 +85,5 @@ struct BAQ {
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
// 计算baq数组返回成功与否
bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
bool calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray);
};

View File

@ -39,6 +39,10 @@ Date : 2023/10/23
#include "util/math/math_utils.h"
#include "quant_info.h"
#include "util/debug.h"
#include "util/stable_array.h"
#include "util/sam_data.h"
#include "util/read_transformer.h"
#include "util/base_utils.h"
using std::deque;
@ -47,8 +51,6 @@ using std::deque;
const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
// 解析knownSites
struct VCFParser {
deque<Interval> knownSites; // 已知的变异位点
@ -157,222 +159,6 @@ bool bqsrReadFilterOut(const bam1_t *b) {
return false;
}
// Tells whether a CIGAR operator advances the position inside the read.
// True for 'M', '=', 'X', 'I' and 'S'; false for every other character.
bool consumeReadBases(char cigar) {
    switch (cigar) {
        case 'M':
        case '=':
        case 'X':
        case 'I':
        case 'S':
            return true;
        default:
            return false;
    }
}
// Tells whether a CIGAR operator advances the position on the reference.
// True for 'M', '=', 'X', 'D' and 'N'; false for every other character.
bool consumeRefBases(char cigar) {
    switch (cigar) {
        case 'M':
        case '=':
        case 'X':
        case 'D':
        case 'N':
            return true;
        default:
            return false;
    }
}
// Result of mapping a reference coordinate onto a read: the read index plus details of the
// CIGAR element that contains the coordinate. The defaults describe "no element found".
// NOTE: callers brace-initialise this struct positionally, so the field order matters.
struct PosAndOperator {
int readPosAtRefCoord = -1; // 0-based position within the read; -1 by default
char cigarOperator = '0'; // operator character of the containing CIGAR element; '0' by default
int cigarIndex = -1; // index of the containing element in the CIGAR array; -1 by default
int cigarLen = 0; // length of the containing CIGAR element
int preCigarLen = 0; // number of read bases consumed by the CIGAR elements before the containing one
};
/**
 * Map a 0-based reference coordinate to the 0-based index of the matching read base and report the
 * CIGAR element that contains it.
 *
 * Soft-clipped ('S') elements are walked as if they were aligned to the reference. When the coordinate
 * lies in an element that consumes no read bases (for instance a deletion), the returned read index is
 * the first read base that follows that element.
 *
 * @param bw             Wrapper around the BAM record whose CIGAR is walked
 * @param alignmentStart Reference coordinate assigned to the start of the first CIGAR element
 * @param refCoord       The target reference coordinate
 * @return A filled PosAndOperator (read index, operator, element index, element length, read bases
 *         consumed before the element) when an element brackets refCoord; otherwise a default
 *         PosAndOperator (read index -1, element index -1). That is also the result when refCoord
 *         precedes alignmentStart or the record has no CIGAR.
 */
PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
    PosAndOperator notFound;
    if (refCoord < alignmentStart) {
        return notFound;
    }

    const uint32_t* cigarOps = bam_get_cigar(bw->b);
    const bam1_core_t& core = bw->b->core;
    const int lastCigarIdx = core.n_cigar - 1;
    if (lastCigarIdx < 0) {
        return notFound;
    }

    int readCursor = 0;              // read offset just past the elements walked so far (exclusive)
    int refCursor = alignmentStart;  // reference coordinate just past the elements walked so far (exclusive)
    for (int k = 0; k < core.n_cigar; ++k) {
        const char op = bam_cigar_opchr(cigarOps[k]);
        const int opLen = bam_cigar_oplen(cigarOps[k]);
        const bool usesRead = consumeReadBases(op);

        const int elemReadStart = readCursor;  // inclusive
        const int elemRefStart = refCursor;    // inclusive
        if (usesRead) {
            readCursor += opLen;
        }
        if (consumeRefBases(op) || op == 'S') {
            refCursor += opLen;
        }

        // the target coordinate falls inside this element
        if (elemRefStart <= refCoord && refCoord < refCursor) {
            const int offsetInElem = usesRead ? (refCoord - elemRefStart) : 0;
            return PosAndOperator{elemReadStart + offsetInElem, op, k, opLen, elemReadStart};
        }
    }
    return notFound;
}
// Records a logical hard clip of one read tail, driven by a reference coordinate.
// Exactly one of refStart / refStop must be non-negative, otherwise nothing is changed:
//   refStart < 0, refStop >= 0 : clip the left tail up to and including refStop
//                                (sets sd.left_clip, sd.cigar_start, sd.first_cigar_clip);
//   refStart >= 0, refStop < 0 : clip the right tail starting at refStart
//                                (sets sd.right_clip, sd.cigar_end, sd.last_cigar_clip).
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) {
    const bool clipLeftTail = refStart < 0;
    const bool stopMissing = refStop < 0;
    // both missing or both given: nothing to do
    if (clipLeftTail == stopMissing) {
        return;
    }

    if (clipLeftTail) {
        const PosAndOperator hit = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
        // When refStop sits in an element that consumes no read bases (e.g. a deletion) the lookup
        // yields the base after it; the stop is inclusive, so step back one base to avoid
        // over-clipping. The deletion itself is therefore not clipped.
        int lastClipped = hit.readPosAtRefCoord;
        if (!consumeReadBases(hit.cigarOperator)) {
            lastClipped -= 1;
        }
        sd.left_clip = lastClipped + 1;
        sd.cigar_start = hit.cigarIndex;
        sd.first_cigar_clip = sd.left_clip - hit.preCigarLen;
        return;
    }

    // Right tail: the lookup already returns the base to the right of a deletion, which is exactly
    // where clipping has to begin.
    const PosAndOperator hit = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
    const int firstClipped = hit.readPosAtRefCoord;
    sd.right_clip = bw->b->core.l_qseq - firstClipped;
    sd.cigar_end = hit.cigarIndex + 1;
    sd.last_cigar_clip = hit.preCigarLen + hit.cigarLen - firstClipped;
}
// Accumulates into sd.ref_offset how far the reference start moves to the right because of the
// left-side clip recorded in sd:
//   * every CIGAR element before sd.cigar_start that consumes reference bases adds its full length;
//   * the element at sd.cigar_start, if it consumes reference bases, adds sd.first_cigar_clip.
// The value is added to whatever sd.ref_offset already holds.
void calculateRefOffset(BamWrap *bw, SamData &sd) {
    const uint32_t* cigar = bam_get_cigar(bw->b);
    const int nCigar = static_cast<int>(bw->b->core.n_cigar);

    // elements that were dropped completely; never walk past the end of the CIGAR array
    int i = 0;
    for (; i < sd.cigar_start && i < nCigar; ++i) {
        if (consumeRefBases(bam_cigar_opchr(cigar[i]))) {
            sd.ref_offset += bam_cigar_oplen(cigar[i]);
        }
    }

    // the partially clipped first element; skipped when sd.cigar_start is not a valid index
    // (empty CIGAR or everything clipped), which previously read past the array
    if (i < nCigar && consumeRefBases(bam_cigar_opchr(cigar[i]))) {
        sd.ref_offset += sd.first_cigar_clip;
    }
}
// Copies the bases and base qualities that survive clipping into sd.
// sd.bases / sd.base_quals are resized to sd.read_len; output index k takes its value from
// record position k + sd.left_clip. Packed 4-bit base codes are translated through cBaseToChar.
void calculateReadBases(BamWrap* bw, SamData& sd) {
    sd.bases.resize(sd.read_len);
    sd.base_quals.resize(sd.read_len);

    uint8_t* packedSeq = bam_get_seq(bw->b);
    uint8_t* rawQuals = bam_get_qual(bw->b);

    int k = 0;
    while (k < sd.read_len) {
        const auto srcIdx = k + sd.left_clip;  // position inside the unclipped record
        sd.bases[k] = cBaseToChar[bam_seqi(packedSeq, srcIdx)];
        sd.base_quals[k] = rawQuals[srcIdx];
        ++k;
    }
}
// Computes the soft start / soft end (both inclusive) of the read after the clips recorded in sd
// and stores them through sd.softStart() / sd.softEnd().
//   * The walk covers CIGAR elements [sd.cigar_start, sd.cigar_end); the first and last element are
//     shortened by sd.first_cigar_clip / sd.last_cigar_clip.
//   * Soft clips seen before the first element that is neither 'S' nor 'H' move the start to the left.
//   * From that element on, reference-consuming elements and soft clips extend the end.
// Fix: a soft clip in the right tail used to be subtracted from the start as well; only leading
// soft clips may shift the soft start.
void calculateSoftStartEnd(BamWrap* bw, SamData& sd) {
    int64_t softStart = bw->b->core.pos + sd.ref_offset;
    int64_t softEnd = softStart - 1;  // inclusive end; starts one before the alignment start
    const uint32_t* cigar = bam_get_cigar(bw->b);

    bool rightTail = false;  // becomes true at the first element that is neither 'S' nor 'H'
    for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
        const char c = bam_cigar_opchr(cigar[i]);
        int len = bam_cigar_oplen(cigar[i]);
        if (i == sd.cigar_start) len -= sd.first_cigar_clip;
        if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;

        if (c == 'S') {
            if (rightTail) {
                softEnd += len;    // trailing soft clip
            } else {
                softStart -= len;  // leading soft clip
            }
        } else if (c != 'H') {
            rightTail = true;
            if (consumeRefBases(c)) {
                softEnd += len;
            }
        }
    }

    sd.softStart() = softStart;
    sd.softEnd() = softEnd;
}
// Rebuilds sd.cigars from the record's CIGAR restricted to [sd.cigar_start, sd.cigar_end).
// The first / last element is shortened by sd.first_cigar_clip / sd.last_cigar_clip.
// A deletion that ends up as the first element is stored as 'H' instead, and its (shortened)
// length is added to sd.start_pos.
void calculateCigar(BamWrap* bw, SamData& sd) {
    sd.cigars.clear();
    const uint32_t* rawCigar = bam_get_cigar(bw->b);

    for (int k = sd.cigar_start; k < sd.cigar_end; ++k) {
        const bool isFirst = (k == sd.cigar_start);
        const bool isLast = (k == sd.cigar_end - 1);

        char op = bam_cigar_opchr(rawCigar[k]);
        int opLen = bam_cigar_oplen(rawCigar[k]);
        if (isFirst) {
            opLen -= sd.first_cigar_clip;
        }
        if (isLast) {
            opLen -= sd.last_cigar_clip;
        }

        // skip a leading deletion: keep it as a hard clip and move the start past it
        if (isFirst && op == 'D') {
            op = 'H';
            sd.start_pos += opLen;
        }
        sd.cigars.push_back({op, opLen});
    }
}
// Works out how many bases are soft-clipped at both ends of the read and folds them into the
// clip bookkeeping of sd (may overwrite the clip values already stored there).
// Walks CIGAR elements [sd.cigar_start, sd.cigar_end), with the first / last element shortened by
// sd.first_cigar_clip / sd.last_cigar_clip:
//   * an 'S' seen before any element other than 'S'/'H' marks the left cut and moves the local
//     cigar_start past it;
//   * an 'S' seen afterwards marks the right cut and moves the local cigar_end onto it.
// Only when a cut was found are sd.left_clip / sd.right_clip, sd.cigar_start / sd.cigar_end updated
// and the matching partial-element clip reset to 0.
void calculateSoftClip(BamWrap *bw, SamData &sd) {
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
int readIndex = sd.left_clip; // read position at the start of the current element, counted from the unclipped read
int cutLeft = -1; // last position of the left tail to hard clip (inclusive); -1 = nothing to cut
int cutRight = -1; // first position of the right tail to hard clip (inclusive); -1 = nothing to cut
int cigar_start = sd.cigar_start;
int cigar_end = sd.cigar_end;
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
// account for bases already removed from the boundary elements by an earlier clip
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
if (c == 'S') {
if (rightTail) {
// soft clip in the right tail: cut from the current read position onwards
cutRight = readIndex;
cigar_end = i;
} else {
// soft clip in the left tail: cut everything up to the end of this element
cutLeft = readIndex + len - 1;
cigar_start = i + 1;
}
} else if (c != 'H') {
rightTail = true;
}
// must come after the cut bookkeeping, which uses the position at the start of the element
if (consumeReadBases(c)) {
readIndex += len;
}
}
if (cutRight >= 0) {
sd.right_clip = bw->b->core.l_qseq - cutRight;
sd.cigar_end = cigar_end;
sd.last_cigar_clip = 0;
}
if (cutLeft >= 0) {
sd.left_clip = cutLeft + 1;
sd.cigar_start = cigar_start;
sd.first_cigar_clip = 0;
}
}
// 读取给定区间的reference
static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
if (aux.ref_seq != NULL)
@ -386,22 +172,23 @@ static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interva
}
// 设置某个位置是indel
inline void updateIndel(vector<int> &isIndel, int index) {
// Flags position `index` as an indel by storing 1 there.
// Indices below zero or at/after isIndel.size() are ignored.
inline void updateIndel(StableArray<int>& isIndel, int index) {
    if (index < 0) {
        return;
    }
    if (index < isIndel.size()) {
        isIndel[index] = 1;
    }
}
// 计算该read的每个碱基位置是否是SNP或Indel
int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
int calculateIsSNPOrIndel(AuxVar& aux, SamData& sd, StableArray<int>& isSNP, StableArray<int>& isIns, StableArray<int>& isDel) {
isSNP.resize(sd.read_len, 0);
isIns.resize(sd.read_len, 0);
isDel.resize(sd.read_len, 0);
// 1. 读取参考基因组先看看串行运行性能稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
//Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间
Interval interval{sd.start_pos, sd.end_pos}; // 闭区间
PROF_START(ref);
read_ref_base(aux, interval.left, interval);
PROF_END(gprof[GP_read_ref], ref);
string refBases(aux.ref_seq);
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
// 2. 遍历cigar计算每个碱基是否是SNP或Indel
int readPos = 0, refPos = 0, nEvents = 0;
@ -412,7 +199,6 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
for (int j = 0; j < len; ++j) {
// 按位置将read和ref碱基进行比较不同则是snp注意read起始位置要加上left_clip
int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1;
// if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
isSNP[readPos] = snpInt;
nEvents += snpInt;
readPos++;
@ -439,59 +225,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
readPos += len;
}
}
// const uint32_t* cigar = bam_get_cigar(bw->b);
// const bam1_core_t& bc = bw->b->core;
// uint8_t* seq = bam_get_seq(bw->b);
// for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
// const char c = bam_cigar_opchr(cigar[i]);
// int len = bam_cigar_oplen(cigar[i]);
// if (i == sd.cigar_start) len -= sd.first_cigar_clip;
// if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
// if (c == 'M' || c == '=' || c == 'X') {
// for (int j = 0; j < len; ++j) {
// // 按位置将read和ref碱基进行比较不同则是snp注意read起始位置要加上left_clip
// int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1;
// // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
// isSNP[readPos] = snpInt;
// nEvents += snpInt;
// readPos++;
// refPos++;
// }
// } else if (c == 'D') {
// // 应该是在上一个消耗碱基的cigar的最后一个位置标记Del
// int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
// updateIndel(isDel, index);
// refPos += len;
// } else if (c == 'N') {
// refPos += len;
// } else if (c == 'I') {
// // 与Del不同Ins应该是在下一个cigar开始的位置标记Ins
// bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
// if (forwardStrandRead) {
// updateIndel(isIns, readPos - 1);
// }
// readPos += len;
// if (!forwardStrandRead) {
// updateIndel(isIns, readPos);
// }
// } else if (c == 'S') {
// readPos += len;
// }
// }
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
// spdlog::info("nEvents: {}", nEvents);
//spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
// std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
// exit(0);
return nEvents;
}
// 简单计算baq数组就是全部赋值为'@' (64)
bool flatBAQArray(SamData& sd, vector<uint8_t>& baqArray) {
// Produces a "flat" BAQ array: resizes baqArray to sd.read_len with '@' (64) as the fill value.
// Always reports success.
bool flatBAQArray(SamData& sd, StableArray<uint8_t>& baqArray) {
    const uint8_t flatScore = (uint8_t)'@';
    baqArray.resize(sd.read_len, flatScore);
    return true;
}
@ -514,12 +254,13 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
}
// 计算与read有交叉的已知位点信息 应该要判断一下是按照read的范围去读取vcf还是按照一个batch read的范围去读取
void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, StableArray<uint8_t>& knownSites) {
BamWrap* bw = sd.bw;
int tid = bw->contig_id();
uint64_t startPos = bw->start_pos(); // 闭区间
uint64_t endPos = bw->end_pos(); // 闭区间
// spdlog::info("bam {}, {}", startPos, endPos);
knownSites.resize(sd.read_len, 0);
// update vcfs
for(auto &vcf : vcfs) {
// 清理旧的interval
@ -533,15 +274,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
}
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
// spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
// exit(0);
//spdlog::info("before intervals : {}", vcf.knownSites.size());
// 读取新的interval
int64_t fpos, flen;
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
Interval readIntv(startPos, endPos);
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
//spdlog::info("file index: {}, {}", fpos, flen);
if (flen > 0) {
vcf.inStm.seekg(fpos, ios::beg);
if (flen > vcf.bufLen) {
@ -565,20 +302,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
Interval varIntv(varStart, varStart + ref.size() - 1);
if (readIntv.overlaps(varIntv)) {
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
}
get_line_from_buf(buf, flen, &cur, &line);
}
}
//spdlog::info("after intervals : {}", vcf.knownSites.size());
//for(auto &val : vcf.knownSites) {
// spdlog::info("intv {}, {}", val.left, val.right);
//}
}
//exit(0);
knownSites.resize(sd.read_len);
endPos = bw->end_pos();
for(auto &vcf : vcfs) {
for (auto &intv : vcf.knownSites) {
// knownSite is outside clipping window for the read, ignore
@ -602,11 +330,10 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
}
}
}
}
// 应该是计算一段数据的平均值
static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, StableArray<int>& errorArr, StableArray<double>& fracErrs) {
int totalErrors = 0;
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
totalErrors += errorArr[j];
@ -617,9 +344,8 @@ static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<in
}
// 应该是用来处理BAQ的把不等于特定BAQ分数的碱基作为一段数据统一处理
void calculateFractionalErrorArray(vector<int>& errorArr, vector<uint8_t>& baqArr, vector<double>& fracErrs) {
// for (auto val : errorArr) { if (val > 0) spdlog::info("snp err val: {}", val); }
fracErrs.resize(baqArr.size());
void calculateFractionalErrorArray(StableArray<int>& errorArr, StableArray<uint8_t>& baqArr, StableArray<double>& fracErrs) {
fracErrs.resize(baqArr.size(), 0.0);
// errorArray和baqArray必须长度相同
const int BLOCK_START_UNSET = -1;
bool inBlock = false;
@ -805,11 +531,8 @@ int SerialBQSR() {
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
}
int test = 0;
while (1) {
while (true) {
++ round;
// 一. 读取bam数据
size_t readNum = 0;
if (inBamBuf.ReadStat() >= 0)
@ -818,52 +541,38 @@ int SerialBQSR() {
break;
}
auto bams = inBamBuf.GetBamArr();
spdlog::info("{} reads processed in {} round, {}", readNum, round, test);
spdlog::info("{} reads processed in {} round", readNum, round);
// 二. 遍历每个bamread记录进行处理
SamData sd;
StableArray<int> isSNP, isIns, isDel; // 该位置是否是SNP, indel位置0不是1是
StableArray<uint8_t> baqArray;
StableArray<double> snpErrors, insErrors, delErrors;
StableArray<uint8_t> skips; // 该位置是否是已知位点
for (int i = 0; i < bams.size(); ++i) {
// 1. 对每个read需要检查cigar是否合法即没有两个连续的相同的cigar而且需要将首尾的deletion处理掉目前看好像没啥影响我们忽略这一步
// 2. 对质量分数长度跟碱基长度不匹配的read缺少的质量分数用默认值补齐先忽略后边有需要再处理
// 3. 如果bam文件之前做过bqsrtag中包含OQoriginnal quality原始质量分数检查用户参数里是否指定用原始质量分数进行bqsr如果是则将质量分数替换为OQ否则忽略OQ先忽略
// 4. 对read的两端进行检测去除hardclipadapter
// spdlog::info("bam idx: {}", i);
BamWrap* bw = bams[i];
SamData sd;
sd.init();
sd.parseBasic(bw);
sd.rid = i + readNumSum;
sd.bw = bw;
sd.read_len = BamWrap::BamEffectiveLength(bw->b);
sd.cigar_end = bw->b->core.n_cigar;
if (sd.read_len <= 0) continue;
int adapter_boundary = bw->GetAdapterBoundary();
if (bw->IsAdapterInRead(adapter_boundary)) {
// adapter在read范围内
if (bw->GetReadNegativeStrandFlag()) { // 反链
clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
} else { // 正链
clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
}
}
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
PROF_START(clip_read);
// 4. 对read的两端进行检测去除hardclipadapter
ReadTransformer::hardClipAdaptorSequence(bw, sd);
if (sd.read_len <= 0) continue;
// 5. 然后再去除softclip部分
calculateSoftClip(bw, sd);
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
ReadTransformer::hardClipSoftClippedBases(bw, sd);
if (sd.read_len <= 0) continue;
calculateRefOffset(bw, sd); // 计算ref_offset就是相对比对的position要将ref右移多少
calculateReadBases(bw, sd); // 计算clip处理之后剩余的碱基
// 计算clip之后两端的softstart和softend
calculateSoftStartEnd(bw, sd);
calculateCigar(bw, sd);
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
// sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str());
// 应用所有的变换计算samdata的相关信息
sd.applyTransformations();
PROF_END(gprof[GP_clip_read], clip_read);
// 6. 更新每个read的platform信息好像没啥用暂时忽略
vector<int> isSNP(sd.read_len, 0); // 该位置是否是SNP位置0不是1是
vector<int> isIns(sd.read_len, 0); // 该位置是否是插入位置0不是1是
vector<int> isDel(sd.read_len, 0); // 该位置是否是删除位置0不是1是
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel);
/*fprintf(gf[0], "%d\t", sd.read_len);
@ -877,16 +586,11 @@ int SerialBQSR() {
fprintf(gf[2], "\n");
*/
// spdlog::info("nErrors: {}", nErrors);
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
//exit(0);
// 7. 计算baqArray
// BAQ = base alignment quality
// note for efficiency reasons we don't compute the BAQ array unless we actually have
// some error to marginalize over. For ILMN data ~85% of reads have no error
vector<uint8_t> baqArray;
// vector<uint8_t> baqArray;
bool baqCalculated = false;
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
baqCalculated = flatBAQArray(sd, baqArray);
@ -922,13 +626,7 @@ int SerialBQSR() {
// }
// fprintf(gf[3], "\n");
//test = readCovariates[1][0][0] + readCovariates[2][1][3];
//int end_pos = bw->contig_end_pos();
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
// 9. 计算这条read需要跳过的位置
vector<bool> skips(sd.read_len, 0);
PROF_START(known_sites);
calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips);
for (int ii = 0; ii < sd.read_len; ++ii) {
@ -936,27 +634,19 @@ int SerialBQSR() {
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
}
PROF_END(gprof[GP_read_vcf], known_sites);
// fprintf(gf[0], "%d\t", sd.read_len);
// fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len);
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
// fprintf(gf[0], "\n");
// 10. 根据BAQ进一步处理snpindel得到处理后的数据
vector<double> snpErrors, insErrors, delErrors;
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-2: {}", val); }
calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
calculateFractionalErrorArray(isIns, baqArray, insErrors);
calculateFractionalErrorArray(isDel, baqArray, delErrors);
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
//spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len);
//for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); }
// aggregate all of the info into our info object, and update the data
// 11. 合并之前计算的数据得到info并更新bqsr table数据
ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
int m = 0;
// for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); }
//exit(0);
PROF_START(update_info);
updateRecalTablesForRead(info);
PROF_END(gprof[GP_update_info], update_info);

View File

@ -99,7 +99,7 @@ static char SimpleComplement(const char base) {
}
}
static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
static void ClipLowQualEndsWithN(string& bases, const StableArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
// 处理左边
int left = 0;
int readLen = bases.size();
@ -245,7 +245,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
const int originalReadLength = sd.read_len;
// store the original bases and then write Ns over low quality ones
string strandedClippedBases(sd.bases);
string strandedClippedBases(sd.bases.arr.data(), sd.read_len);
// GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的
// spdlog::info("bases: {}", strandedClippedBases);

View File

@ -19,6 +19,7 @@
#include "bqsr_args.h"
#include "util/bam_wrap.h"
#include "util/sam_data.h"
using std::map;
using std::string;

View File

@ -9,19 +9,23 @@
#pragma once
#include "util/bam_wrap.h"
#include <vector>
#include "covariate.h"
#include "util/bam_wrap.h"
#include "util/stable_array.h"
using std::vector;
struct ReadRecalInfo {
SamData& read;
int length;
PerReadCovariateMatrix& covariates;
vector<bool>& skips;
FastArray<uint8_t>&base_quals, &ins_quals, &del_quals;
vector<double>&snp_errs, &ins_errs, &del_errs;
StableArray<uint8_t>& skips;
StableArray<uint8_t>&base_quals, &ins_quals, &del_quals;
StableArray<double>&snp_errs, &ins_errs, &del_errs;
ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, vector<bool>& _skips, vector<double>& _snp_errs, vector<double>& _ins_errs,
vector<double>& _del_errs)
ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, StableArray<uint8_t>& _skips, StableArray<double>& _snp_errs,
StableArray<double>& _ins_errs, StableArray<double>& _del_errs)
: read(_read),
covariates(_covariates),
skips(_skips),

View File

@ -21,111 +21,6 @@
using namespace std;
// One CIGAR element: operator character plus length.
struct Cigar {
    char op = '0';  // operator character; '0' until set
    int len = 0;    // element length

    // True when the operator advances the position inside the read (M, =, X, I, S).
    static bool ConsumeReadBases(char cigar) {
        switch (cigar) {
            case 'M':
            case '=':
            case 'X':
            case 'I':
            case 'S':
                return true;
            default:
                return false;
        }
    }

    // True when the operator advances the position on the reference (M, =, X, D, N).
    static bool ConsumeRefBases(char cigar) {
        switch (cigar) {
            case 'M':
            case '=':
            case 'X':
            case 'D':
            case 'N':
                return true;
            default:
                return false;
        }
    }
};
// Result of looking up a reference position inside a read: the read index and the CIGAR operator
// of the element that covers it. The defaults (-1 / '0') are what a failed lookup leaves behind.
struct ReadIdxCigar {
int readIdx = -1; // position within the read sequence
char cigarOp = '0'; // CIGAR operator at that position
};
// Array that avoids releasing its memory between uses, to cut down on repeated allocation and
// deallocation: clear() only resets the logical size and later push_back() calls overwrite the
// slots that are already allocated.
//
// `arr` is the backing storage and may hold more elements than the logical size `idx`.
// Fixes over the previous version:
//   * `idx` is now zero-initialised (it was indeterminate on a default-constructed object, so
//     size()/empty()/push_back() read garbage until clear() or resize() had been called);
//   * size() and empty() are const-qualified so they can be used through const references.
template <class T>
struct FastArray {
    std::vector<T> arr;  // backing storage, reused across clear()
    size_t idx = 0;      // logical number of elements

    // Forget all elements but keep the storage.
    void clear() { idx = 0; }
    size_t size() const { return idx; }
    bool empty() const { return idx == 0; }
    void reserve(size_t _size) { arr.reserve(_size); }
    // Set both the storage size and the logical size to _size.
    void resize(size_t _size) {
        arr.resize(_size);
        idx = _size;
    }
    // Append val, reusing an existing slot when one is available.
    void push_back(const T& val) {
        if (idx < arr.size()) {
            arr[idx] = val;
        } else {
            arr.push_back(val);
        }
        ++idx;
    }
    // Unchecked element access into the backing storage.
    inline T& operator[](size_t pos) { return arr[pos]; }
    inline const T& operator[](size_t pos) const { return arr[pos]; }

    // Thin wrapper over the vector iterator; covers the logical range [begin(), end()).
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
        iterator& operator++() { ++it; return *this; }
        iterator& operator--() { --it; return *this; }
        T& operator*() const { return *it; }
        bool operator!=(const iterator& other) const { return it != other.it; }
        bool operator==(const iterator& other) const { return it == other.it; }
    };
    iterator begin() { return arr.begin(); }
    iterator end() { return arr.begin() + idx; }  // one past the last logical element
};
// 对原始bam数据的补充比如对两端进行hardclip等
class BamWrap;
// Per-read working data that supplements the raw BAM record (logical hard clips at both ends etc.).
// Fixes over the previous version:
//   * bw, start_pos and end_pos now have initialisers (they were indeterminate on a fresh object);
//   * getReadIndexForReferenceCoordinate tracks reference positions in 64 bits instead of narrowing
//     start_pos to int.
struct SamData {
    int64_t rid = 0;           // for debugging
    int read_len = 0;          // read length after all clipping
    int cigar_start = 0;       // first CIGAR element in use (inclusive)
    int cigar_end = 0;         // end of the CIGAR elements in use (exclusive)
    int first_cigar_clip = 0;  // bases clipped from the left side of the first CIGAR element
    int last_cigar_clip = 0;   // bases clipped from the right side of the last CIGAR element
    int left_clip = 0;         // bases clipped on the left; also needed for the BI/BD qualities
    int right_clip = 0;        // bases clipped on the right
    int ref_offset = 0;        // shift relative to the record's original aligned reference position after clipping

    // Data used during the BQSR computation; intended to be precomputed later to simplify the
    // currently convoluted logic.
    static constexpr int READ_INDEX_NOT_FOUND = -1;

    BamWrap* bw = nullptr;
    int64_t start_pos = 0;  // soft clips are already removed, so this is the aligned start after clipping (inclusive)
    int64_t end_pos = 0;    // likewise for the end (inclusive)
    std::string bases;              // read bases after processing
    FastArray<uint8_t> base_quals;  // matching base qualities
    FastArray<uint8_t> ins_quals;   // insertion qualities, BI tag (usually absent)
    FastArray<uint8_t> del_quals;   // deletion qualities, BD tag (usually absent)
    FastArray<Cigar> cigars;

    int64_t& softStart() { return start_pos; }
    int64_t& softEnd() { return end_pos; }

    // Finds the read index and CIGAR operator that correspond to reference position refPos.
    // Walks `cigars` starting at start_pos; soft clips are walked as if they were aligned.
    // When refPos lies in an element that consumes no read bases, the index of the first read base
    // after that element is reported. Returns a default ReadIdxCigar (readIdx == -1) when refPos is
    // before start_pos or no element brackets it.
    ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
        ReadIdxCigar rc;
        if (refPos < start_pos)
            return rc;

        int readCursor = 0;            // read offset just past the elements walked so far (exclusive)
        int64_t refCursor = start_pos; // reference position just past the elements walked so far (exclusive)
        // advance through the cigar elements until one brackets the reference position
        for (auto& cigar : cigars) {
            const int elemReadStart = readCursor;    // inclusive
            const int64_t elemRefStart = refCursor;  // inclusive
            const bool usesRead = Cigar::ConsumeReadBases(cigar.op);
            if (usesRead)
                readCursor += cigar.len;
            if (Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S')
                refCursor += cigar.len;

            if (elemRefStart <= refPos && refPos < refCursor) {  // refPos falls within this element
                // the offset is bounded by cigar.len, so it always fits in int
                rc.cigarOp = cigar.op;
                rc.readIdx = elemReadStart + (usesRead ? static_cast<int>(refPos - elemRefStart) : 0);
                return rc;
            }
        }
        return rc;
    }
};
/*
线
bam

View File

@ -0,0 +1,3 @@
#include "base_utils.h"
const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};

View File

@ -0,0 +1,20 @@
/*
Description:
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/29
*/
#pragma once
// Small collection of base / CIGAR helpers.
struct BaseUtils {
    // Lookup table from a 4-bit base code to its base character (defined out of line).
    static const char cBaseToChar[16];

    // True when the CIGAR operator advances the position inside the read (M, =, X, I, S).
    static bool consumeReadBases(char cigar) {
        switch (cigar) {
            case 'M':
            case '=':
            case 'X':
            case 'I':
            case 'S':
                return true;
            default:
                return false;
        }
    }

    // True when the CIGAR operator advances the position on the reference (M, =, X, D, N).
    static bool consumeRefBases(char cigar) {
        switch (cigar) {
            case 'M':
            case '=':
            case 'X':
            case 'D':
            case 'N':
                return true;
            default:
                return false;
        }
    }
};

View File

@ -54,6 +54,7 @@ int DisplayProfiling(int nthread) {
// PRINT_GP(sort_wait);
// PRINT_GP(markdup_wait);
// PRINT_GP(intersect_wait);
PRINT_GP(clip_read);
PRINT_GP(read_ref);
PRINT_GP(read_vcf);
PRINT_GP(covariate);

View File

@ -40,6 +40,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
enum {
GP_read_wait = 11,
GP_clip_read,
GP_covariate,
GP_read_ref,
GP_read_vcf,

View File

@ -0,0 +1,154 @@
/*
Description: read (sam record) clipping
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/29
*/
#pragma once
#include "bam_wrap.h"
#include "sam_data.h"
#include "base_utils.h"
// 用于对read进行各种转换操作比如clipping等。注意这里都是逻辑操作最后需要调用SamData.applyTransformations()来真正应用这些修改
struct ReadTransformer {
// Result of mapping a reference coordinate onto a read: the read index plus details of the
// CIGAR element that contains the coordinate. The defaults describe "no element found".
// NOTE: callers brace-initialise this struct positionally, so the field order matters.
struct PosAndOperator {
int readPosAtRefCoord = -1; // 0-based position within the read; -1 by default
char cigarOperator = '0'; // operator character of the containing CIGAR element; '0' by default
int cigarIndex = -1; // index of the containing element in the CIGAR array; -1 by default
int cigarLen = 0; // length of the containing CIGAR element
int preCigarLen = 0; // number of read bases consumed by the CIGAR elements before the containing one
};
/**
 * Map a 0-based reference coordinate to the 0-based index of the matching read base and report the
 * CIGAR element that contains it.
 *
 * Soft-clipped ('S') elements are walked as if they were aligned to the reference. When the coordinate
 * lies in an element that consumes no read bases (for instance a deletion), the returned read index is
 * the first read base that follows that element.
 *
 * @param bw             Wrapper around the BAM record whose CIGAR is walked
 * @param alignmentStart Reference coordinate assigned to the start of the first CIGAR element
 * @param refCoord       The target reference coordinate
 * @return A filled PosAndOperator (read index, operator, element index, element length, read bases
 *         consumed before the element) when an element brackets refCoord; otherwise a default
 *         PosAndOperator (read index -1, element index -1). That is also the result when refCoord
 *         precedes alignmentStart or the record has no CIGAR.
 */
static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
    PosAndOperator miss;
    if (refCoord < alignmentStart) {
        return miss;
    }

    const uint32_t* elems = bam_get_cigar(bw->b);
    const bam1_core_t& core = bw->b->core;
    const int lastIdx = core.n_cigar - 1;
    if (lastIdx < 0) {
        return miss;
    }

    int readEnd = 0;              // exclusive read offset after the elements walked so far
    int refEnd = alignmentStart;  // exclusive reference coordinate after the elements walked so far
    int k = 0;
    while (k < core.n_cigar) {
        const char op = bam_cigar_opchr(elems[k]);
        const int opLen = bam_cigar_oplen(elems[k]);
        const bool onRead = BaseUtils::consumeReadBases(op);
        const bool onRef = BaseUtils::consumeRefBases(op) || op == 'S';

        const int readBegin = readEnd;  // inclusive
        const int refBegin = refEnd;    // inclusive
        readEnd += onRead ? opLen : 0;
        refEnd += onRef ? opLen : 0;

        // the target coordinate falls inside this element
        if (refBegin <= refCoord && refCoord < refEnd) {
            const int readPos = readBegin + (onRead ? (refCoord - refBegin) : 0);
            return PosAndOperator{readPos, op, k, opLen, readBegin};
        }
        ++k;
    }
    return miss;
}
// Records a logical hard clip of one read tail, driven by a reference coordinate.
// Exactly one of refStart / refStop must be non-negative, otherwise nothing is changed:
//   refStart < 0, refStop >= 0 : clip the left tail up to and including refStop
//                                (sets sd.left_clip, sd.cigar_start, sd.first_cigar_clip);
//   refStart >= 0, refStop < 0 : clip the right tail starting at refStart
//                                (sets sd.right_clip, sd.cigar_end, sd.last_cigar_clip).
static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
    const bool startMissing = refStart < 0;
    const bool stopMissing = refStop < 0;
    // both missing or both given: nothing to do
    if (startMissing == stopMissing) {
        return;
    }

    if (startMissing) {
        const PosAndOperator found = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
        // When refStop sits in an element that consumes no read bases (e.g. a deletion) the lookup
        // yields the base after it; the stop is inclusive, so step back one base to avoid
        // over-clipping. The deletion itself is therefore not clipped.
        const int backOff = BaseUtils::consumeReadBases(found.cigarOperator) ? 0 : 1;
        const int lastClipped = found.readPosAtRefCoord - backOff;
        sd.left_clip = lastClipped + 1;
        sd.cigar_start = found.cigarIndex;
        sd.first_cigar_clip = sd.left_clip - found.preCigarLen;
        return;
    }

    // Right tail: the lookup already returns the base to the right of a deletion, which is exactly
    // where clipping has to begin.
    const PosAndOperator found = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
    const int firstClipped = found.readPosAtRefCoord;
    sd.right_clip = bw->b->core.l_qseq - firstClipped;
    sd.cigar_end = found.cigarIndex + 1;
    sd.last_cigar_clip = found.preCigarLen + found.cigarLen - firstClipped;
}
// Clips the adapter sequence off the read. The clipping is purely logical:
// only the bookkeeping in `sd` changes, the BAM record is not modified.
static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
    const int boundary = bw->GetAdapterBoundary();
    if (bw->IsAdapterInRead(boundary)) {
        // The adapter lies inside the read. On the reverse strand the left end
        // is clipped (up to the boundary); on the forward strand the right end.
        const bool reverseStrand = bw->GetReadNegativeStrandFlag();
        const int refStart = reverseStrand ? -1 : boundary;
        const int refStop = reverseStrand ? boundary : -1;
        clipByReferenceCoordinates(bw, refStart, refStop, sd);
    }
    // Refresh the read length from the current clip amounts.
    sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;
}
// Counts the soft-clipped bases at both ends of the (already partially
// clipped) read and logically clips them away by updating `sd`.
static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
    const uint32_t* ops = bam_get_cigar(bw->b);
    const int firstIdx = sd.cigar_start;
    const int endIdx = sd.cigar_end;

    int newCigarStart = firstIdx;
    int newCigarEnd = endIdx;
    int lastLeftClipped = -1;    // last read position to clip on the left (inclusive)
    int firstRightClipped = -1;  // first read position to clip on the right (inclusive)
    int readCursor = sd.left_clip;
    bool seenAligned = false;    // once set, soft clips belong to the right tail

    for (int k = firstIdx; k < endIdx; ++k) {
        const char op = bam_cigar_opchr(ops[k]);
        int opLen = bam_cigar_oplen(ops[k]);
        // The boundary elements may already be partially clipped.
        if (k == firstIdx) {
            opLen -= sd.first_cigar_clip;
        }
        if (k == endIdx - 1) {
            opLen -= sd.last_cigar_clip;
        }

        if (op == 'S') {
            if (seenAligned) {
                firstRightClipped = readCursor;
                newCigarEnd = k;
            } else {
                lastLeftClipped = readCursor + opLen - 1;
                newCigarStart = k + 1;
            }
        } else if (op != 'H') {
            seenAligned = true;
        }

        if (BaseUtils::consumeReadBases(op)) {
            readCursor += opLen;
        }
    }

    if (lastLeftClipped >= 0) {
        sd.left_clip = lastLeftClipped + 1;
        sd.cigar_start = newCigarStart;
        sd.first_cigar_clip = 0;
    }
    if (firstRightClipped >= 0) {
        sd.right_clip = bw->b->core.l_qseq - firstRightClipped;
        sd.cigar_end = newCigarEnd;
        sd.last_cigar_clip = 0;
    }
    // Refresh the read length from the current clip amounts.
    sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;
}
};

View File

178
src/util/sam_data.h 100644
View File

@ -0,0 +1,178 @@
/*
Description: Partially parsed BAM record data, kept in a form that is convenient for later processing
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/29
*/
#pragma once
#include <cstdint>
#include "bam_wrap.h"
#include "stable_array.h"
#include "base_utils.h"
// Thin wrapper around a single CIGAR element (operator character + length).
struct Cigar {
    char op = '0';
    int len = 0;

    // True when the operator consumes bases of the read (M, =, X, I, S).
    static bool ConsumeReadBases(char cigar) {
        switch (cigar) {
            case 'M':
            case '=':
            case 'X':
            case 'I':
            case 'S':
                return true;
            default:
                return false;
        }
    }

    // True when the operator consumes bases of the reference (M, =, X, D, N).
    static bool ConsumeRefBases(char cigar) {
        switch (cigar) {
            case 'M':
            case '=':
            case 'X':
            case 'D':
            case 'N':
                return true;
            default:
                return false;
        }
    }
};
// The CIGAR operator found at one particular position of a read.
struct ReadIdxCigar {
    int readIdx{-1};    // position within the read sequence; -1 until filled in
    char cigarOp{'0'};  // CIGAR operator at that position; '0' until filled in
};
// 对原始bam进行部分数据解析以更方便后续处理
struct SamData {
int64_t rid = 0; // for debug
// 这些数据用于最开始clipping等操作
int read_len = 0; // read长度各种clip之后的长度
int cigar_start = 0; // cigar起始位置闭区间
int cigar_end = 0; // cigar结束位置开区间
int first_cigar_clip = 0; // 第一个cigar, clip的数量切左侧
int last_cigar_clip = 0; // 最后一个cigar, clip的数量切右侧
int left_clip = 0; // 左侧被切掉的碱基长度BI和BD质量分数也会用到
int right_clip = 0; // 右侧被切掉的碱基长度
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响)相对原始ref比对位置contig_pos的偏移量
// 记录一下bqsr运算过程中用到的数据回头提前计算一下修正现在的复杂逻辑
static constexpr int READ_INDEX_NOT_FOUND = -1;
BamWrap* bw;
int64_t start_pos; // 因为soft clip都被切掉了这里的softstart应该就是切掉之后的匹配位点闭区间
int64_t end_pos; // 同上,闭区间
StableArray<char> bases; // 处理之后的read的碱基数组形式
StableArray<uint8_t> base_quals; // 对应的质量分数
StableArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
StableArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
StableArray<Cigar> cigars;
int64_t& softStart() { return start_pos; }
int64_t& softEnd() { return end_pos; }
// functions
// 初始化
void init() {
bases.clear();
base_quals.clear();
ins_quals.clear();
del_quals.clear();
cigars.clear();
rid = 0;
read_len = 0;
cigar_start = 0;
cigar_end = 0;
first_cigar_clip = 0;
last_cigar_clip = 0;
left_clip = 0;
right_clip = 0;
ref_offset = 0;
bw = nullptr;
start_pos = 0;
end_pos = 0;
}
// 初步解析bam
void parseBasic(BamWrap *_bw) {
bw = _bw;
read_len = BamWrap::BamEffectiveLength(bw->b);
cigar_end = bw->b->core.n_cigar;
}
// 应用各种转换操作更新SamData中的数据
void applyTransformations() {
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
int i = 0;
// 计算ref_offset就是相对比对的position要将ref右移多少
for (i = 0; i < cigar_start; ++i) {
if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
ref_offset += bam_cigar_oplen(cigar[i]);
}
}
if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
ref_offset += first_cigar_clip;
}
// 计算clip处理之后剩余的碱基
bases.resize(read_len);
base_quals.resize(read_len);
uint8_t* seq = bam_get_seq(bw->b);
uint8_t* quals = bam_get_qual(bw->b);
for (i = 0; i < read_len; ++i) {
bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i + left_clip)];
base_quals[i] = quals[i + left_clip];
}
// 计算read两端clip之后的softstart和softend其实S之前都被切掉了
int64_t softStart = bw->b->core.pos + ref_offset;
int64_t softEnd = softStart - 1; // 闭区间
bool rightTail = false;
for (i = cigar_start; i < cigar_end; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (i == cigar_start) len -= first_cigar_clip;
if (i == cigar_end - 1) len -= last_cigar_clip;
// if (c == 'S' || c == 'I')
if (c == 'S') softStart -= len;
else if (c != 'H') rightTail = true;
if (rightTail && (BaseUtils::consumeRefBases(c) || c == 'S')) softEnd += len;
}
this->softStart() = softStart; // 其实这里的softStart就是start
this->softEnd() = softEnd; // 同上
// 计算clip之后的cigar其实可以考虑下边的代码和上边的换一下位置
for (i = cigar_start; i < cigar_end; ++i) {
char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (i == cigar_start) len -= first_cigar_clip;
if (i == cigar_end - 1) len -= last_cigar_clip;
// if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
if (i == cigar_start && c == 'D') { // 跳过开头的deletion
c = 'H';
ref_offset += len;
start_pos += len; // 更新起始位置
} else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion
c = 'H';
softEnd -= len; // 更新结束位置
}
cigars.push_back({c, len});
}
}
// 给定一个ref pos返回对应的read index和cigar操作
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
ReadIdxCigar rc;
if (refPos < start_pos)
return rc;
int firstReadPosOfElement = 0; // inclusive
int firstRefPosOfElement = start_pos; // inclusive
int lastReadPosOfElement = 0; // exclusive
int lastRefPosOfElement = start_pos; // exclusive
// advance forward through all the cigar elements until we bracket the reference coordinate
for (auto& cigar : cigars) {
firstReadPosOfElement = lastReadPosOfElement;
firstRefPosOfElement = lastRefPosOfElement;
lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element
int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
rc.cigarOp = cigar.op;
rc.readIdx = readPosAtRefCoord;
return rc;
}
}
return rc;
}
};

View File

@ -0,0 +1,73 @@
/*
Description: array that keeps its allocated storage across clear() so it can be reused without reallocating
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/29
*/
#pragma once
#include <cstdlib>
#include <vector>
using std::vector;
// Array whose backing storage is kept across clear(): clearing only resets the
// logical size, so later push_back/resize calls reuse the existing elements.
// Must not be used with bool, because std::vector<bool> is a specialization.
template <class T>
struct StableArray {
    std::vector<T> arr;  // backing storage; only ever grows
    size_t idx = 0;      // logical number of elements in use

    // Forgets the contents without touching the backing storage.
    void clear() { idx = 0; }
    // Logical size; const so it can be queried through a const reference.
    size_t size() const { return idx; }
    bool empty() const { return idx == 0; }

    // Asks the backing vector for room for at least _size elements.
    void reserve(size_t _size) {
        if (arr.size() < _size)
            arr.reserve(_size);
    }

    // Sets the logical size to _size; elements that already exist in the
    // backing storage keep whatever value they last held.
    void resize(size_t _size) {
        if (arr.size() < _size) {
            arr.resize(_size);
        }
        idx = _size;
    }

    // Sets the logical size to _size and assigns val to every element in
    // [0, _size), not only to newly added ones.
    void resize(size_t _size, const T& val) {
        if (arr.size() < _size) {
            arr.resize(_size);
        }
        for (size_t i = 0; i < _size; ++i) {
            arr[i] = val;
        }
        idx = _size;
    }

    // Appends val, overwriting a spare backing element when one is available.
    void push_back(const T& val) {
        if (idx < arr.size()) {
            arr[idx++] = val;
        } else {
            arr.push_back(val);
            idx++;
        }
    }

    // Unchecked element access.
    inline T& operator[](size_t pos) { return arr[pos]; }
    inline const T& operator[](size_t pos) const { return arr[pos]; }

    // Minimal iterator over the logical range [0, idx), enough for range-for.
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
        iterator& operator++() {
            ++it;
            return *this;
        }
        iterator& operator--() {
            --it;
            return *this;
        }
        T& operator*() const { return *it; }
        bool operator!=(const iterator& other) const { return it != other.it; }
        bool operator==(const iterator& other) const { return it == other.it; }
    };
    iterator begin() { return arr.begin(); }
    // Stops at the logical size, not at the end of the backing storage.
    iterator end() { return arr.begin() + idx; }
};