重构了一下,大数据问题还没解决
This commit is contained in:
parent
1864736509
commit
1e5a291eb7
|
|
@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
|
||||||
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
|
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
|
||||||
|
|
||||||
// 计算baq数组,返回成功与否
|
// 计算baq数组,返回成功与否
|
||||||
bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
|
bool BAQ::calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
|
||||||
// 检测ref是否覆盖了read
|
// 检测ref是否覆盖了read
|
||||||
if (ref.size() < refOffset + ad.read_len) {
|
if (ref.size() < refOffset + ad.read_len) {
|
||||||
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
|
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "util/bam_wrap.h"
|
#include "util/bam_wrap.h"
|
||||||
|
#include "util/sam_data.h"
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
@ -84,5 +85,5 @@ struct BAQ {
|
||||||
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
|
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
|
||||||
|
|
||||||
// 计算baq数组,返回成功与否
|
// 计算baq数组,返回成功与否
|
||||||
bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
|
bool calcBAQFromHMM(SamData& ad, string ref, int refOffset, vector<int>& baqArray);
|
||||||
};
|
};
|
||||||
|
|
@ -39,6 +39,10 @@ Date : 2023/10/23
|
||||||
#include "util/math/math_utils.h"
|
#include "util/math/math_utils.h"
|
||||||
#include "quant_info.h"
|
#include "quant_info.h"
|
||||||
#include "util/debug.h"
|
#include "util/debug.h"
|
||||||
|
#include "util/stable_array.h"
|
||||||
|
#include "util/sam_data.h"
|
||||||
|
#include "util/read_transformer.h"
|
||||||
|
#include "util/base_utils.h"
|
||||||
|
|
||||||
using std::deque;
|
using std::deque;
|
||||||
|
|
||||||
|
|
@ -47,8 +51,6 @@ using std::deque;
|
||||||
|
|
||||||
const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
|
const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
|
||||||
|
|
||||||
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
|
||||||
|
|
||||||
// 解析knownSites
|
// 解析knownSites
|
||||||
struct VCFParser {
|
struct VCFParser {
|
||||||
deque<Interval> knownSites; // 已知的变异位点
|
deque<Interval> knownSites; // 已知的变异位点
|
||||||
|
|
@ -157,222 +159,6 @@ bool bqsrReadFilterOut(const bam1_t *b) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 该操作符是否消耗read的碱基
|
|
||||||
bool consumeReadBases(char cigar) {
|
|
||||||
return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S';
|
|
||||||
}
|
|
||||||
|
|
||||||
// 该操作符是否消耗参考基因组的碱基
|
|
||||||
bool consumeRefBases(char cigar) {
|
|
||||||
return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N';
|
|
||||||
}
|
|
||||||
|
|
||||||
// 给定一个ref位置,在read内部找到对应的位置和操作符
|
|
||||||
struct PosAndOperator {
|
|
||||||
int readPosAtRefCoord = -1; // read中的位置
|
|
||||||
char cigarOperator = '0'; // cigar操作符
|
|
||||||
int cigarIndex = -1; // cigar索引
|
|
||||||
int cigarLen = 0;
|
|
||||||
int preCigarLen = 0; // 截止cigar之前的,消耗read base的长度
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
|
|
||||||
* the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
|
|
||||||
* Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
|
|
||||||
*
|
|
||||||
* @param alignmentStart The soft start of the read on the reference
|
|
||||||
* @param cigar The read's cigar
|
|
||||||
* @param refCoord The target reference coordinate
|
|
||||||
* @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
|
|
||||||
* if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
|
|
||||||
* if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last
|
|
||||||
* cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
|
|
||||||
*/
|
|
||||||
PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
|
|
||||||
PosAndOperator po;
|
|
||||||
if (refCoord < alignmentStart) {
|
|
||||||
return po;
|
|
||||||
}
|
|
||||||
int firstReadPosOfElement = 0; // inclusive
|
|
||||||
int firstRefPosOfElement = alignmentStart; // inclusive
|
|
||||||
int lastReadPosOfElement = 0; // exclusive
|
|
||||||
int lastRefPosOfElement = alignmentStart; // exclusive
|
|
||||||
|
|
||||||
// advance forward through all the cigar elements until we bracket the reference coordinate
|
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
|
||||||
const bam1_core_t& bc = bw->b->core;
|
|
||||||
const int idx = bc.n_cigar - 1;
|
|
||||||
if (idx < 0)
|
|
||||||
return po;
|
|
||||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
firstReadPosOfElement = lastReadPosOfElement;
|
|
||||||
firstRefPosOfElement = lastRefPosOfElement;
|
|
||||||
lastReadPosOfElement += consumeReadBases(c) ? len : 0;
|
|
||||||
lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0;
|
|
||||||
if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element
|
|
||||||
int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
|
|
||||||
return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return po;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
|
||||||
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) {
|
|
||||||
int start, stop;
|
|
||||||
// Determine the read coordinate to start and stop hard clipping
|
|
||||||
if (refStart < 0) {
|
|
||||||
if (refStop < 0) return;
|
|
||||||
PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
|
|
||||||
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
|
|
||||||
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
|
|
||||||
stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
|
|
||||||
sd.left_clip = stop + 1;
|
|
||||||
sd.cigar_start = stopPosAndOperator.cigarIndex;
|
|
||||||
sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
|
|
||||||
} else {
|
|
||||||
if (refStop >= 0) return;
|
|
||||||
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
|
|
||||||
// overclipping
|
|
||||||
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
|
|
||||||
start = startPosAndOperator.readPosAtRefCoord;
|
|
||||||
sd.right_clip = bw->b->core.l_qseq - start;
|
|
||||||
sd.cigar_end = startPosAndOperator.cigarIndex + 1;
|
|
||||||
sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 计算切掉adapter之后,ref相对原始ref的偏移量
|
|
||||||
void calculateRefOffset(BamWrap *bw, SamData &sd) {
|
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
|
||||||
const bam1_core_t& bc = bw->b->core;
|
|
||||||
int i = 0;
|
|
||||||
for (i = 0; i < sd.cigar_start; ++i) {
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
if (consumeRefBases(c)) {
|
|
||||||
sd.ref_offset += len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
if (consumeRefBases(c)) {
|
|
||||||
sd.ref_offset += sd.first_cigar_clip;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 计算clip处理之后,剩余的碱基
|
|
||||||
void calculateReadBases(BamWrap* bw, SamData& sd) {
|
|
||||||
sd.bases.resize(sd.read_len);
|
|
||||||
sd.base_quals.resize(sd.read_len);
|
|
||||||
uint8_t* seq = bam_get_seq(bw->b);
|
|
||||||
uint8_t* quals = bam_get_qual(bw->b);
|
|
||||||
for (int i = 0; i < sd.read_len; ++i) {
|
|
||||||
sd.bases[i] = cBaseToChar[bam_seqi(seq, i + sd.left_clip)];
|
|
||||||
sd.base_quals[i] = quals[i + sd.left_clip];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 计算read两端clip之后的softstart和softend
|
|
||||||
void calculateSoftStartEnd(BamWrap* bw, SamData& sd) {
|
|
||||||
int64_t softStart = bw->b->core.pos + sd.ref_offset;
|
|
||||||
int64_t softEnd = softStart - 1; // 闭区间
|
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
|
||||||
const bam1_core_t& bc = bw->b->core;
|
|
||||||
int cigar_start = sd.cigar_start;
|
|
||||||
int cigar_end = sd.cigar_end;
|
|
||||||
bool rightTail = false;
|
|
||||||
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
|
||||||
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
|
||||||
// if (c == 'S' || c == 'I')
|
|
||||||
if (c == 'S')
|
|
||||||
softStart -= len;
|
|
||||||
else if (c != 'H')
|
|
||||||
rightTail = true;
|
|
||||||
if (rightTail) {
|
|
||||||
if (consumeRefBases(c) || c == 'S')
|
|
||||||
softEnd += len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sd.softStart() = softStart;
|
|
||||||
sd.softEnd() = softEnd;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 计算clip之后的cigar
|
|
||||||
void calculateCigar(BamWrap* bw, SamData& sd) {
|
|
||||||
sd.cigars.clear();
|
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
|
||||||
const bam1_core_t& bc = bw->b->core;
|
|
||||||
int cigar_start = sd.cigar_start;
|
|
||||||
int cigar_end = sd.cigar_end;
|
|
||||||
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
|
||||||
char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
if (i == sd.cigar_start)
|
|
||||||
len -= sd.first_cigar_clip;
|
|
||||||
if (i == sd.cigar_end - 1)
|
|
||||||
len -= sd.last_cigar_clip;
|
|
||||||
//if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
|
|
||||||
if (i == sd.cigar_start && c == 'D') { // 跳过开头的deletion
|
|
||||||
c = 'H';
|
|
||||||
// sd.ref_offset += len;
|
|
||||||
sd.start_pos += len;
|
|
||||||
}
|
|
||||||
sd.cigars.push_back({c, len});
|
|
||||||
}
|
|
||||||
//for(auto &cigar : sd.cigars) {
|
|
||||||
// spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
|
|
||||||
//}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 计算read两端softclip的碱基数量,可能会修改ad里的clip值
|
|
||||||
void calculateSoftClip(BamWrap *bw, SamData &sd) {
|
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
|
||||||
const bam1_core_t& bc = bw->b->core;
|
|
||||||
int readIndex = sd.left_clip;
|
|
||||||
int cutLeft = -1; // first position to hard clip (inclusive)
|
|
||||||
int cutRight = -1; // first position to hard clip (inclusive)
|
|
||||||
int cigar_start = sd.cigar_start;
|
|
||||||
int cigar_end = sd.cigar_end;
|
|
||||||
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
|
|
||||||
|
|
||||||
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
|
||||||
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
|
||||||
if (c == 'S') {
|
|
||||||
if (rightTail) {
|
|
||||||
cutRight = readIndex;
|
|
||||||
cigar_end = i;
|
|
||||||
} else {
|
|
||||||
cutLeft = readIndex + len - 1;
|
|
||||||
cigar_start = i + 1;
|
|
||||||
}
|
|
||||||
} else if (c != 'H') {
|
|
||||||
rightTail = true;
|
|
||||||
}
|
|
||||||
if (consumeReadBases(c)) {
|
|
||||||
readIndex += len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (cutRight >= 0) {
|
|
||||||
sd.right_clip = bw->b->core.l_qseq - cutRight;
|
|
||||||
sd.cigar_end = cigar_end;
|
|
||||||
sd.last_cigar_clip = 0;
|
|
||||||
}
|
|
||||||
if (cutLeft >= 0) {
|
|
||||||
sd.left_clip = cutLeft + 1;
|
|
||||||
sd.cigar_start = cigar_start;
|
|
||||||
sd.first_cigar_clip = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 读取给定区间的reference
|
// 读取给定区间的reference
|
||||||
static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
|
static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
|
||||||
if (aux.ref_seq != NULL)
|
if (aux.ref_seq != NULL)
|
||||||
|
|
@ -386,22 +172,23 @@ static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interva
|
||||||
}
|
}
|
||||||
|
|
||||||
// 设置某个位置是indel
|
// 设置某个位置是indel
|
||||||
inline void updateIndel(vector<int> &isIndel, int index) {
|
inline void updateIndel(StableArray<int>& isIndel, int index) {
|
||||||
if (index >=0 && index < isIndel.size()) {
|
if (index >=0 && index < isIndel.size()) {
|
||||||
isIndel[index] = 1;
|
isIndel[index] = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算该read的每个碱基位置是否是SNP或Indel
|
// 计算该read的每个碱基位置是否是SNP或Indel
|
||||||
int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
|
int calculateIsSNPOrIndel(AuxVar& aux, SamData& sd, StableArray<int>& isSNP, StableArray<int>& isIns, StableArray<int>& isDel) {
|
||||||
|
isSNP.resize(sd.read_len, 0);
|
||||||
|
isIns.resize(sd.read_len, 0);
|
||||||
|
isDel.resize(sd.read_len, 0);
|
||||||
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
||||||
//Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间
|
|
||||||
Interval interval{sd.start_pos, sd.end_pos}; // 闭区间
|
Interval interval{sd.start_pos, sd.end_pos}; // 闭区间
|
||||||
PROF_START(ref);
|
PROF_START(ref);
|
||||||
read_ref_base(aux, interval.left, interval);
|
read_ref_base(aux, interval.left, interval);
|
||||||
PROF_END(gprof[GP_read_ref], ref);
|
PROF_END(gprof[GP_read_ref], ref);
|
||||||
string refBases(aux.ref_seq);
|
string refBases(aux.ref_seq);
|
||||||
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
|
|
||||||
|
|
||||||
// 2. 遍历cigar,计算每个碱基是否是SNP或Indel
|
// 2. 遍历cigar,计算每个碱基是否是SNP或Indel
|
||||||
int readPos = 0, refPos = 0, nEvents = 0;
|
int readPos = 0, refPos = 0, nEvents = 0;
|
||||||
|
|
@ -412,7 +199,6 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
|
||||||
for (int j = 0; j < len; ++j) {
|
for (int j = 0; j < len; ++j) {
|
||||||
// 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
// 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
||||||
int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1;
|
int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1;
|
||||||
// if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
|
|
||||||
isSNP[readPos] = snpInt;
|
isSNP[readPos] = snpInt;
|
||||||
nEvents += snpInt;
|
nEvents += snpInt;
|
||||||
readPos++;
|
readPos++;
|
||||||
|
|
@ -439,59 +225,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<i
|
||||||
readPos += len;
|
readPos += len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// const uint32_t* cigar = bam_get_cigar(bw->b);
|
|
||||||
// const bam1_core_t& bc = bw->b->core;
|
|
||||||
// uint8_t* seq = bam_get_seq(bw->b);
|
|
||||||
// for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
|
||||||
// const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
// int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
// if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
|
||||||
// if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
|
||||||
// if (c == 'M' || c == '=' || c == 'X') {
|
|
||||||
// for (int j = 0; j < len; ++j) {
|
|
||||||
// // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
|
||||||
// int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1;
|
|
||||||
// // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
|
|
||||||
// isSNP[readPos] = snpInt;
|
|
||||||
// nEvents += snpInt;
|
|
||||||
// readPos++;
|
|
||||||
// refPos++;
|
|
||||||
// }
|
|
||||||
// } else if (c == 'D') {
|
|
||||||
// // 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del
|
|
||||||
// int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
|
|
||||||
// updateIndel(isDel, index);
|
|
||||||
// refPos += len;
|
|
||||||
// } else if (c == 'N') {
|
|
||||||
// refPos += len;
|
|
||||||
// } else if (c == 'I') {
|
|
||||||
// // 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins
|
|
||||||
// bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
|
|
||||||
// if (forwardStrandRead) {
|
|
||||||
// updateIndel(isIns, readPos - 1);
|
|
||||||
// }
|
|
||||||
// readPos += len;
|
|
||||||
// if (!forwardStrandRead) {
|
|
||||||
// updateIndel(isIns, readPos);
|
|
||||||
// }
|
|
||||||
// } else if (c == 'S') {
|
|
||||||
// readPos += len;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
|
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
|
||||||
// spdlog::info("nEvents: {}", nEvents);
|
|
||||||
|
|
||||||
//spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
|
|
||||||
// std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
|
|
||||||
// exit(0);
|
|
||||||
|
|
||||||
return nEvents;
|
return nEvents;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 简单计算baq数组,就是全部赋值为'@' (64)
|
// 简单计算baq数组,就是全部赋值为'@' (64)
|
||||||
bool flatBAQArray(SamData& sd, vector<uint8_t>& baqArray) {
|
bool flatBAQArray(SamData& sd, StableArray<uint8_t>& baqArray) {
|
||||||
baqArray.resize(sd.read_len, (uint8_t)'@');
|
baqArray.resize(sd.read_len, (uint8_t)'@');
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
@ -514,12 +254,13 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||||
void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
|
void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, StableArray<uint8_t>& knownSites) {
|
||||||
BamWrap* bw = sd.bw;
|
BamWrap* bw = sd.bw;
|
||||||
int tid = bw->contig_id();
|
int tid = bw->contig_id();
|
||||||
uint64_t startPos = bw->start_pos(); // 闭区间
|
uint64_t startPos = bw->start_pos(); // 闭区间
|
||||||
uint64_t endPos = bw->end_pos(); // 闭区间
|
uint64_t endPos = bw->end_pos(); // 闭区间
|
||||||
// spdlog::info("bam {}, {}", startPos, endPos);
|
knownSites.resize(sd.read_len, 0);
|
||||||
|
|
||||||
// update vcfs
|
// update vcfs
|
||||||
for(auto &vcf : vcfs) {
|
for(auto &vcf : vcfs) {
|
||||||
// 清理旧的interval
|
// 清理旧的interval
|
||||||
|
|
@ -533,15 +274,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
|
||||||
}
|
}
|
||||||
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
|
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
|
||||||
|
|
||||||
// spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
|
|
||||||
// exit(0);
|
|
||||||
//spdlog::info("before intervals : {}", vcf.knownSites.size());
|
|
||||||
// 读取新的interval
|
// 读取新的interval
|
||||||
int64_t fpos, flen;
|
int64_t fpos, flen;
|
||||||
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
||||||
Interval readIntv(startPos, endPos);
|
Interval readIntv(startPos, endPos);
|
||||||
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
||||||
//spdlog::info("file index: {}, {}", fpos, flen);
|
|
||||||
if (flen > 0) {
|
if (flen > 0) {
|
||||||
vcf.inStm.seekg(fpos, ios::beg);
|
vcf.inStm.seekg(fpos, ios::beg);
|
||||||
if (flen > vcf.bufLen) {
|
if (flen > vcf.bufLen) {
|
||||||
|
|
@ -565,20 +302,11 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
|
||||||
Interval varIntv(varStart, varStart + ref.size() - 1);
|
Interval varIntv(varStart, varStart + ref.size() - 1);
|
||||||
if (readIntv.overlaps(varIntv)) {
|
if (readIntv.overlaps(varIntv)) {
|
||||||
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
|
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
|
||||||
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
|
|
||||||
}
|
}
|
||||||
get_line_from_buf(buf, flen, &cur, &line);
|
get_line_from_buf(buf, flen, &cur, &line);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//spdlog::info("after intervals : {}", vcf.knownSites.size());
|
|
||||||
//for(auto &val : vcf.knownSites) {
|
|
||||||
// spdlog::info("intv {}, {}", val.left, val.right);
|
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
//exit(0);
|
|
||||||
knownSites.resize(sd.read_len);
|
|
||||||
endPos = bw->end_pos();
|
|
||||||
|
|
||||||
for(auto &vcf : vcfs) {
|
for(auto &vcf : vcfs) {
|
||||||
for (auto &intv : vcf.knownSites) {
|
for (auto &intv : vcf.knownSites) {
|
||||||
// knownSite is outside clipping window for the read, ignore
|
// knownSite is outside clipping window for the read, ignore
|
||||||
|
|
@ -602,11 +330,10 @@ void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &kno
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 应该是计算一段数据的平均值
|
// 应该是计算一段数据的平均值
|
||||||
static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
|
static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, StableArray<int>& errorArr, StableArray<double>& fracErrs) {
|
||||||
int totalErrors = 0;
|
int totalErrors = 0;
|
||||||
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
|
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
|
||||||
totalErrors += errorArr[j];
|
totalErrors += errorArr[j];
|
||||||
|
|
@ -617,9 +344,8 @@ static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<in
|
||||||
}
|
}
|
||||||
|
|
||||||
// 应该是用来处理BAQ的,把不等于特定BAQ分数的碱基作为一段数据统一处理
|
// 应该是用来处理BAQ的,把不等于特定BAQ分数的碱基作为一段数据统一处理
|
||||||
void calculateFractionalErrorArray(vector<int>& errorArr, vector<uint8_t>& baqArr, vector<double>& fracErrs) {
|
void calculateFractionalErrorArray(StableArray<int>& errorArr, StableArray<uint8_t>& baqArr, StableArray<double>& fracErrs) {
|
||||||
// for (auto val : errorArr) { if (val > 0) spdlog::info("snp err val: {}", val); }
|
fracErrs.resize(baqArr.size(), 0.0);
|
||||||
fracErrs.resize(baqArr.size());
|
|
||||||
// errorArray和baqArray必须长度相同
|
// errorArray和baqArray必须长度相同
|
||||||
const int BLOCK_START_UNSET = -1;
|
const int BLOCK_START_UNSET = -1;
|
||||||
bool inBlock = false;
|
bool inBlock = false;
|
||||||
|
|
@ -805,11 +531,8 @@ int SerialBQSR() {
|
||||||
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
|
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
|
||||||
}
|
}
|
||||||
|
|
||||||
int test = 0;
|
while (true) {
|
||||||
|
|
||||||
while (1) {
|
|
||||||
++ round;
|
++ round;
|
||||||
|
|
||||||
// 一. 读取bam数据
|
// 一. 读取bam数据
|
||||||
size_t readNum = 0;
|
size_t readNum = 0;
|
||||||
if (inBamBuf.ReadStat() >= 0)
|
if (inBamBuf.ReadStat() >= 0)
|
||||||
|
|
@ -818,52 +541,38 @@ int SerialBQSR() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
auto bams = inBamBuf.GetBamArr();
|
auto bams = inBamBuf.GetBamArr();
|
||||||
spdlog::info("{} reads processed in {} round, {}", readNum, round, test);
|
spdlog::info("{} reads processed in {} round", readNum, round);
|
||||||
|
|
||||||
// 二. 遍历每个bam(read)记录,进行处理
|
// 二. 遍历每个bam(read)记录,进行处理
|
||||||
|
SamData sd;
|
||||||
|
StableArray<int> isSNP, isIns, isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
||||||
|
StableArray<uint8_t> baqArray;
|
||||||
|
StableArray<double> snpErrors, insErrors, delErrors;
|
||||||
|
StableArray<uint8_t> skips; // 该位置是否是已知位点
|
||||||
|
|
||||||
for (int i = 0; i < bams.size(); ++i) {
|
for (int i = 0; i < bams.size(); ++i) {
|
||||||
// 1. 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步
|
// 1. 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步
|
||||||
// 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理
|
// 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理
|
||||||
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
||||||
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
|
||||||
// spdlog::info("bam idx: {}", i);
|
// spdlog::info("bam idx: {}", i);
|
||||||
BamWrap* bw = bams[i];
|
BamWrap* bw = bams[i];
|
||||||
SamData sd;
|
sd.init();
|
||||||
|
sd.parseBasic(bw);
|
||||||
sd.rid = i + readNumSum;
|
sd.rid = i + readNumSum;
|
||||||
sd.bw = bw;
|
|
||||||
sd.read_len = BamWrap::BamEffectiveLength(bw->b);
|
|
||||||
sd.cigar_end = bw->b->core.n_cigar;
|
|
||||||
if (sd.read_len <= 0) continue;
|
if (sd.read_len <= 0) continue;
|
||||||
|
|
||||||
int adapter_boundary = bw->GetAdapterBoundary();
|
PROF_START(clip_read);
|
||||||
if (bw->IsAdapterInRead(adapter_boundary)) {
|
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
||||||
// adapter在read范围内
|
ReadTransformer::hardClipAdaptorSequence(bw, sd);
|
||||||
if (bw->GetReadNegativeStrandFlag()) { // 反链
|
if (sd.read_len <= 0) continue;
|
||||||
clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
|
|
||||||
} else { // 正链
|
|
||||||
clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
|
||||||
// 5. 然后再去除softclip部分
|
// 5. 然后再去除softclip部分
|
||||||
calculateSoftClip(bw, sd);
|
ReadTransformer::hardClipSoftClippedBases(bw, sd);
|
||||||
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
|
||||||
if (sd.read_len <= 0) continue;
|
if (sd.read_len <= 0) continue;
|
||||||
|
// 应用所有的变换,计算samdata的相关信息
|
||||||
calculateRefOffset(bw, sd); // 计算ref_offset,就是相对比对的position,要将ref右移多少
|
sd.applyTransformations();
|
||||||
calculateReadBases(bw, sd); // 计算clip处理之后,剩余的碱基
|
PROF_END(gprof[GP_clip_read], clip_read);
|
||||||
// 计算clip之后,两端的softstart和softend
|
|
||||||
calculateSoftStartEnd(bw, sd);
|
|
||||||
calculateCigar(bw, sd);
|
|
||||||
|
|
||||||
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
|
|
||||||
// sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str());
|
|
||||||
|
|
||||||
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
||||||
vector<int> isSNP(sd.read_len, 0); // 该位置是否是SNP位置,0不是,1是
|
|
||||||
vector<int> isIns(sd.read_len, 0); // 该位置是否是插入位置,0不是,1是
|
|
||||||
vector<int> isDel(sd.read_len, 0); // 该位置是否是删除位置,0不是,1是
|
|
||||||
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel);
|
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel);
|
||||||
|
|
||||||
/*fprintf(gf[0], "%d\t", sd.read_len);
|
/*fprintf(gf[0], "%d\t", sd.read_len);
|
||||||
|
|
@ -877,16 +586,11 @@ int SerialBQSR() {
|
||||||
fprintf(gf[2], "\n");
|
fprintf(gf[2], "\n");
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// spdlog::info("nErrors: {}", nErrors);
|
|
||||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
|
|
||||||
|
|
||||||
//exit(0);
|
|
||||||
|
|
||||||
// 7. 计算baqArray
|
// 7. 计算baqArray
|
||||||
// BAQ = base alignment quality
|
// BAQ = base alignment quality
|
||||||
// note for efficiency reasons we don't compute the BAQ array unless we actually have
|
// note for efficiency reasons we don't compute the BAQ array unless we actually have
|
||||||
// some error to marginalize over. For ILMN data ~85% of reads have no error
|
// some error to marginalize over. For ILMN data ~85% of reads have no error
|
||||||
vector<uint8_t> baqArray;
|
// vector<uint8_t> baqArray;
|
||||||
bool baqCalculated = false;
|
bool baqCalculated = false;
|
||||||
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
||||||
baqCalculated = flatBAQArray(sd, baqArray);
|
baqCalculated = flatBAQArray(sd, baqArray);
|
||||||
|
|
@ -922,13 +626,7 @@ int SerialBQSR() {
|
||||||
// }
|
// }
|
||||||
// fprintf(gf[3], "\n");
|
// fprintf(gf[3], "\n");
|
||||||
|
|
||||||
//test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
|
||||||
//int end_pos = bw->contig_end_pos();
|
|
||||||
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
|
||||||
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
|
|
||||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
|
|
||||||
// 9. 计算这条read需要跳过的位置
|
// 9. 计算这条read需要跳过的位置
|
||||||
vector<bool> skips(sd.read_len, 0);
|
|
||||||
PROF_START(known_sites);
|
PROF_START(known_sites);
|
||||||
calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips);
|
calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips);
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) {
|
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||||
|
|
@ -936,27 +634,19 @@ int SerialBQSR() {
|
||||||
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
}
|
}
|
||||||
PROF_END(gprof[GP_read_vcf], known_sites);
|
PROF_END(gprof[GP_read_vcf], known_sites);
|
||||||
// fprintf(gf[0], "%d\t", sd.read_len);
|
// fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len);
|
||||||
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
||||||
// fprintf(gf[0], "\n");
|
// fprintf(gf[0], "\n");
|
||||||
|
|
||||||
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
||||||
vector<double> snpErrors, insErrors, delErrors;
|
|
||||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-2: {}", val); }
|
|
||||||
calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
|
calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
|
||||||
calculateFractionalErrorArray(isIns, baqArray, insErrors);
|
calculateFractionalErrorArray(isIns, baqArray, insErrors);
|
||||||
calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
||||||
|
|
||||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
|
|
||||||
//spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len);
|
|
||||||
//for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); }
|
|
||||||
|
|
||||||
// aggregate all of the info into our info object, and update the data
|
// aggregate all of the info into our info object, and update the data
|
||||||
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
||||||
ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
|
ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
|
||||||
int m = 0;
|
|
||||||
// for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); }
|
|
||||||
//exit(0);
|
|
||||||
PROF_START(update_info);
|
PROF_START(update_info);
|
||||||
updateRecalTablesForRead(info);
|
updateRecalTablesForRead(info);
|
||||||
PROF_END(gprof[GP_update_info], update_info);
|
PROF_END(gprof[GP_update_info], update_info);
|
||||||
|
|
|
||||||
|
|
@ -99,7 +99,7 @@ static char SimpleComplement(const char base) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
|
static void ClipLowQualEndsWithN(string& bases, const StableArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
|
||||||
// 处理左边
|
// 处理左边
|
||||||
int left = 0;
|
int left = 0;
|
||||||
int readLen = bases.size();
|
int readLen = bases.size();
|
||||||
|
|
@ -245,7 +245,7 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
|
||||||
const int originalReadLength = sd.read_len;
|
const int originalReadLength = sd.read_len;
|
||||||
|
|
||||||
// store the original bases and then write Ns over low quality ones
|
// store the original bases and then write Ns over low quality ones
|
||||||
string strandedClippedBases(sd.bases);
|
string strandedClippedBases(sd.bases.arr.data(), sd.read_len);
|
||||||
// GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
|
// GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
|
||||||
GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的?
|
GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的?
|
||||||
// spdlog::info("bases: {}", strandedClippedBases);
|
// spdlog::info("bases: {}", strandedClippedBases);
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
#include "bqsr_args.h"
|
#include "bqsr_args.h"
|
||||||
#include "util/bam_wrap.h"
|
#include "util/bam_wrap.h"
|
||||||
|
#include "util/sam_data.h"
|
||||||
|
|
||||||
using std::map;
|
using std::map;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
|
||||||
|
|
@ -9,19 +9,23 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "util/bam_wrap.h"
|
#include <vector>
|
||||||
|
|
||||||
#include "covariate.h"
|
#include "covariate.h"
|
||||||
|
#include "util/bam_wrap.h"
|
||||||
|
#include "util/stable_array.h"
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
struct ReadRecalInfo {
|
struct ReadRecalInfo {
|
||||||
SamData& read;
|
SamData& read;
|
||||||
int length;
|
int length;
|
||||||
PerReadCovariateMatrix& covariates;
|
PerReadCovariateMatrix& covariates;
|
||||||
vector<bool>& skips;
|
StableArray<uint8_t>& skips;
|
||||||
FastArray<uint8_t>&base_quals, &ins_quals, &del_quals;
|
StableArray<uint8_t>&base_quals, &ins_quals, &del_quals;
|
||||||
vector<double>&snp_errs, &ins_errs, &del_errs;
|
StableArray<double>&snp_errs, &ins_errs, &del_errs;
|
||||||
|
|
||||||
ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, vector<bool>& _skips, vector<double>& _snp_errs, vector<double>& _ins_errs,
|
ReadRecalInfo(SamData& _read, PerReadCovariateMatrix& _covariates, StableArray<uint8_t>& _skips, StableArray<double>& _snp_errs,
|
||||||
vector<double>& _del_errs)
|
StableArray<double>& _ins_errs, StableArray<double>& _del_errs)
|
||||||
: read(_read),
|
: read(_read),
|
||||||
covariates(_covariates),
|
covariates(_covariates),
|
||||||
skips(_skips),
|
skips(_skips),
|
||||||
|
|
|
||||||
|
|
@ -21,111 +21,6 @@
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
struct Cigar {
|
|
||||||
char op = '0';
|
|
||||||
int len = 0;
|
|
||||||
// 该操作符是否消耗read的碱基
|
|
||||||
static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
|
|
||||||
// 该操作符是否消耗参考基因组的碱基
|
|
||||||
static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ReadIdxCigar {
|
|
||||||
int readIdx = -1; // 在read序列中的位置
|
|
||||||
char cigarOp = '0'; // 当前位置对应的cigar
|
|
||||||
};
|
|
||||||
|
|
||||||
// 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作
|
|
||||||
template <class T>
|
|
||||||
struct FastArray {
|
|
||||||
vector<T> arr;
|
|
||||||
size_t idx;
|
|
||||||
void clear() { idx = 0; }
|
|
||||||
size_t size() { return idx; }
|
|
||||||
bool empty() { return idx == 0; }
|
|
||||||
void reserve(size_t _size) { arr.reserve(_size); }
|
|
||||||
void resize(size_t _size) {
|
|
||||||
arr.resize(_size);
|
|
||||||
idx = _size;
|
|
||||||
}
|
|
||||||
void push_back(const T& val) {
|
|
||||||
if (idx < arr.size()) {
|
|
||||||
arr[idx++] = val;
|
|
||||||
} else {
|
|
||||||
arr.push_back(val);
|
|
||||||
idx++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
inline T& operator[](size_t pos) { return arr[pos]; }
|
|
||||||
inline const T& operator[](size_t pos) const { return arr[pos]; }
|
|
||||||
struct iterator {
|
|
||||||
typename std::vector<T>::iterator it;
|
|
||||||
iterator(typename std::vector<T>::iterator _it) : it(_it) {}
|
|
||||||
iterator& operator++() { ++it; return *this;}
|
|
||||||
iterator& operator--() { --it; return *this;}
|
|
||||||
T& operator*() const { return *it; }
|
|
||||||
bool operator!=(const iterator& other) const { return it != other.it; }
|
|
||||||
bool operator==(const iterator& other) const { return it == other.it; }
|
|
||||||
};
|
|
||||||
|
|
||||||
iterator begin() { return arr.begin(); }
|
|
||||||
iterator end() { return arr.begin() + idx; }
|
|
||||||
};
|
|
||||||
|
|
||||||
// 对原始bam数据的补充,比如对两端进行hardclip等
|
|
||||||
class BamWrap;
|
|
||||||
struct SamData {
|
|
||||||
int64_t rid = 0; // for debug
|
|
||||||
int read_len = 0; // read长度,各种clip之后的长度
|
|
||||||
int cigar_start = 0; // cigar起始位置,闭区间
|
|
||||||
int cigar_end = 0; // cigar结束位置,开区间
|
|
||||||
int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧
|
|
||||||
int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧
|
|
||||||
int left_clip = 0; // 左侧被切掉的碱基长度,BI和BD质量分数也会用到
|
|
||||||
int right_clip = 0; // 右侧被切掉的碱基长度
|
|
||||||
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量
|
|
||||||
|
|
||||||
// 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑
|
|
||||||
static constexpr int READ_INDEX_NOT_FOUND = -1;
|
|
||||||
|
|
||||||
BamWrap* bw;
|
|
||||||
int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间
|
|
||||||
int64_t end_pos; // 同上,闭区间
|
|
||||||
string bases; // 处理之后的read的碱基
|
|
||||||
FastArray<uint8_t> base_quals; // 对应的质量分数
|
|
||||||
FastArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
|
|
||||||
FastArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
|
|
||||||
|
|
||||||
FastArray<Cigar> cigars;
|
|
||||||
int64_t& softStart() { return start_pos; }
|
|
||||||
int64_t& softEnd() { return end_pos; }
|
|
||||||
|
|
||||||
// functions
|
|
||||||
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
|
|
||||||
ReadIdxCigar rc;
|
|
||||||
if (refPos < start_pos)
|
|
||||||
return rc;
|
|
||||||
int firstReadPosOfElement = 0; // inclusive
|
|
||||||
int firstRefPosOfElement = start_pos; // inclusive
|
|
||||||
int lastReadPosOfElement = 0; // exclusive
|
|
||||||
int lastRefPosOfElement = start_pos; // exclusive
|
|
||||||
// advance forward through all the cigar elements until we bracket the reference coordinate
|
|
||||||
for (auto& cigar : cigars) {
|
|
||||||
firstReadPosOfElement = lastReadPosOfElement;
|
|
||||||
firstRefPosOfElement = lastRefPosOfElement;
|
|
||||||
lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
|
|
||||||
lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
|
|
||||||
if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element
|
|
||||||
int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
|
|
||||||
rc.cigarOp = cigar.op;
|
|
||||||
rc.readIdx = readPosAtRefCoord;
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明
|
这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明
|
||||||
这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。
|
这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
#include "base_utils.h"
|
||||||
|
|
||||||
|
const char BaseUtils::cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
/*
|
||||||
|
Description: 碱基相关的工具函数
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2025/12/29
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
struct BaseUtils {
|
||||||
|
// uint8_t转碱基字符
|
||||||
|
static const char cBaseToChar[16];
|
||||||
|
|
||||||
|
// 该操作符是否消耗read的碱基
|
||||||
|
static bool consumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
|
||||||
|
|
||||||
|
// 该操作符是否消耗参考基因组的碱基
|
||||||
|
static bool consumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
|
||||||
|
};
|
||||||
|
|
@ -54,6 +54,7 @@ int DisplayProfiling(int nthread) {
|
||||||
// PRINT_GP(sort_wait);
|
// PRINT_GP(sort_wait);
|
||||||
// PRINT_GP(markdup_wait);
|
// PRINT_GP(markdup_wait);
|
||||||
// PRINT_GP(intersect_wait);
|
// PRINT_GP(intersect_wait);
|
||||||
|
PRINT_GP(clip_read);
|
||||||
PRINT_GP(read_ref);
|
PRINT_GP(read_ref);
|
||||||
PRINT_GP(read_vcf);
|
PRINT_GP(read_vcf);
|
||||||
PRINT_GP(covariate);
|
PRINT_GP(covariate);
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,7 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
|
||||||
enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
|
enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
|
||||||
enum {
|
enum {
|
||||||
GP_read_wait = 11,
|
GP_read_wait = 11,
|
||||||
|
GP_clip_read,
|
||||||
GP_covariate,
|
GP_covariate,
|
||||||
GP_read_ref,
|
GP_read_ref,
|
||||||
GP_read_vcf,
|
GP_read_vcf,
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,154 @@
|
||||||
|
/*
|
||||||
|
Description: read (sam record) 相关的工具函数,比如用于clipping低质量碱基等
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2025/12/29
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "bam_wrap.h"
|
||||||
|
#include "sam_data.h"
|
||||||
|
#include "base_utils.h"
|
||||||
|
|
||||||
|
// 用于对read进行各种转换操作,比如clipping等。注意这里都是逻辑操作,最后需要调用SamData.applyTransformations()来真正应用这些修改
|
||||||
|
struct ReadTransformer {
|
||||||
|
// 给定一个ref位置,在read内部找到对应的位置和操作符
|
||||||
|
struct PosAndOperator {
|
||||||
|
int readPosAtRefCoord = -1; // read中的位置
|
||||||
|
char cigarOperator = '0'; // cigar操作符
|
||||||
|
int cigarIndex = -1; // cigar索引
|
||||||
|
int cigarLen = 0;
|
||||||
|
int preCigarLen = 0; // 截止cigar之前的,消耗read base的长度
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
|
||||||
|
* the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
|
||||||
|
* Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
|
||||||
|
*
|
||||||
|
* @param alignmentStart The soft start of the read on the reference
|
||||||
|
* @param cigar The read's cigar
|
||||||
|
* @param refCoord The target reference coordinate
|
||||||
|
* @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
|
||||||
|
* if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read
|
||||||
|
* coordinate; if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last cigar element
|
||||||
|
* is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
|
||||||
|
*/
|
||||||
|
static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
|
||||||
|
PosAndOperator po;
|
||||||
|
if (refCoord < alignmentStart) {
|
||||||
|
return po;
|
||||||
|
}
|
||||||
|
int firstReadPosOfElement = 0; // inclusive
|
||||||
|
int firstRefPosOfElement = alignmentStart; // inclusive
|
||||||
|
int lastReadPosOfElement = 0; // exclusive
|
||||||
|
int lastRefPosOfElement = alignmentStart; // exclusive
|
||||||
|
|
||||||
|
// advance forward through all the cigar elements until we bracket the reference coordinate
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
const int idx = bc.n_cigar - 1;
|
||||||
|
if (idx < 0)
|
||||||
|
return po;
|
||||||
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
firstReadPosOfElement = lastReadPosOfElement;
|
||||||
|
firstRefPosOfElement = lastRefPosOfElement;
|
||||||
|
lastReadPosOfElement += BaseUtils::consumeReadBases(c) ? len : 0;
|
||||||
|
lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0;
|
||||||
|
if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element
|
||||||
|
int readPosAtRefCoord = firstReadPosOfElement + (BaseUtils::consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
|
||||||
|
return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return po;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
||||||
|
static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
|
||||||
|
int start, stop;
|
||||||
|
// Determine the read coordinate to start and stop hard clipping
|
||||||
|
if (refStart < 0) {
|
||||||
|
if (refStop < 0)
|
||||||
|
return;
|
||||||
|
PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
|
||||||
|
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
|
||||||
|
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
|
||||||
|
stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
|
||||||
|
sd.left_clip = stop + 1;
|
||||||
|
sd.cigar_start = stopPosAndOperator.cigarIndex;
|
||||||
|
sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
|
||||||
|
} else {
|
||||||
|
if (refStop >= 0)
|
||||||
|
return;
|
||||||
|
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion
|
||||||
|
// avoids overclipping
|
||||||
|
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
|
||||||
|
start = startPosAndOperator.readPosAtRefCoord;
|
||||||
|
sd.right_clip = bw->b->core.l_qseq - start;
|
||||||
|
sd.cigar_end = startPosAndOperator.cigarIndex + 1;
|
||||||
|
sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 切掉adaper序列,注意这里的clipping只是逻辑上的,实际并没有修改bam record
|
||||||
|
static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
|
||||||
|
int adapter_boundary = bw->GetAdapterBoundary();
|
||||||
|
if (bw->IsAdapterInRead(adapter_boundary)) {
|
||||||
|
// adapter在read范围内
|
||||||
|
if (bw->GetReadNegativeStrandFlag()) { // 反链
|
||||||
|
clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
|
||||||
|
} else { // 正链
|
||||||
|
clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算read两端softclip的碱基数量,切掉softclip序列
|
||||||
|
static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
int readIndex = sd.left_clip;
|
||||||
|
int cutLeft = -1; // first position to hard clip (inclusive)
|
||||||
|
int cutRight = -1; // first position to hard clip (inclusive)
|
||||||
|
int cigar_start = sd.cigar_start;
|
||||||
|
int cigar_end = sd.cigar_end;
|
||||||
|
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
|
||||||
|
|
||||||
|
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
||||||
|
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
||||||
|
if (c == 'S') {
|
||||||
|
if (rightTail) {
|
||||||
|
cutRight = readIndex;
|
||||||
|
cigar_end = i;
|
||||||
|
} else {
|
||||||
|
cutLeft = readIndex + len - 1;
|
||||||
|
cigar_start = i + 1;
|
||||||
|
}
|
||||||
|
} else if (c != 'H') {
|
||||||
|
rightTail = true;
|
||||||
|
}
|
||||||
|
if (BaseUtils::consumeReadBases(c)) {
|
||||||
|
readIndex += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cutRight >= 0) {
|
||||||
|
sd.right_clip = bw->b->core.l_qseq - cutRight;
|
||||||
|
sd.cigar_end = cigar_end;
|
||||||
|
sd.last_cigar_clip = 0;
|
||||||
|
}
|
||||||
|
if (cutLeft >= 0) {
|
||||||
|
sd.left_clip = cutLeft + 1;
|
||||||
|
sd.cigar_start = cigar_start;
|
||||||
|
sd.first_cigar_clip = 0;
|
||||||
|
}
|
||||||
|
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
@ -0,0 +1,178 @@
|
||||||
|
/*
|
||||||
|
Description: 对原始bam进行部分数据解析,以更方便后续处理
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2025/12/29
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
#include "bam_wrap.h"
|
||||||
|
#include "stable_array.h"
|
||||||
|
#include "base_utils.h"
|
||||||
|
|
||||||
|
// 对cigar进行简单包装
|
||||||
|
struct Cigar {
|
||||||
|
char op = '0';
|
||||||
|
int len = 0;
|
||||||
|
// 该操作符是否消耗read的碱基
|
||||||
|
static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
|
||||||
|
// 该操作符是否消耗参考基因组的碱基
|
||||||
|
static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
|
||||||
|
};
|
||||||
|
|
||||||
|
// 在read中某个特定位置对应的cigar操作符
|
||||||
|
struct ReadIdxCigar {
|
||||||
|
int readIdx = -1; // 在read序列中的位置
|
||||||
|
char cigarOp = '0'; // 当前位置对应的cigar
|
||||||
|
};
|
||||||
|
|
||||||
|
// 对原始bam进行部分数据解析,以更方便后续处理
|
||||||
|
struct SamData {
|
||||||
|
int64_t rid = 0; // for debug
|
||||||
|
|
||||||
|
// 这些数据用于最开始clipping等操作
|
||||||
|
int read_len = 0; // read长度,各种clip之后的长度
|
||||||
|
int cigar_start = 0; // cigar起始位置,闭区间
|
||||||
|
int cigar_end = 0; // cigar结束位置,开区间
|
||||||
|
int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧
|
||||||
|
int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧
|
||||||
|
int left_clip = 0; // 左侧被切掉的碱基长度,BI和BD质量分数也会用到
|
||||||
|
int right_clip = 0; // 右侧被切掉的碱基长度
|
||||||
|
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量
|
||||||
|
|
||||||
|
// 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑
|
||||||
|
static constexpr int READ_INDEX_NOT_FOUND = -1;
|
||||||
|
|
||||||
|
BamWrap* bw;
|
||||||
|
int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间
|
||||||
|
int64_t end_pos; // 同上,闭区间
|
||||||
|
StableArray<char> bases; // 处理之后的read的碱基数组形式
|
||||||
|
StableArray<uint8_t> base_quals; // 对应的质量分数
|
||||||
|
StableArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
|
||||||
|
StableArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
|
||||||
|
StableArray<Cigar> cigars;
|
||||||
|
|
||||||
|
int64_t& softStart() { return start_pos; }
|
||||||
|
int64_t& softEnd() { return end_pos; }
|
||||||
|
|
||||||
|
// functions
|
||||||
|
|
||||||
|
// 初始化
|
||||||
|
void init() {
|
||||||
|
bases.clear();
|
||||||
|
base_quals.clear();
|
||||||
|
ins_quals.clear();
|
||||||
|
del_quals.clear();
|
||||||
|
cigars.clear();
|
||||||
|
rid = 0;
|
||||||
|
read_len = 0;
|
||||||
|
cigar_start = 0;
|
||||||
|
cigar_end = 0;
|
||||||
|
first_cigar_clip = 0;
|
||||||
|
last_cigar_clip = 0;
|
||||||
|
left_clip = 0;
|
||||||
|
right_clip = 0;
|
||||||
|
ref_offset = 0;
|
||||||
|
bw = nullptr;
|
||||||
|
start_pos = 0;
|
||||||
|
end_pos = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 初步解析bam
|
||||||
|
void parseBasic(BamWrap *_bw) {
|
||||||
|
bw = _bw;
|
||||||
|
read_len = BamWrap::BamEffectiveLength(bw->b);
|
||||||
|
cigar_end = bw->b->core.n_cigar;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 应用各种转换操作,更新SamData中的数据
|
||||||
|
void applyTransformations() {
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
// 计算ref_offset,就是相对比对的position,要将ref右移多少
|
||||||
|
for (i = 0; i < cigar_start; ++i) {
|
||||||
|
if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
|
||||||
|
ref_offset += bam_cigar_oplen(cigar[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
|
||||||
|
ref_offset += first_cigar_clip;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算clip处理之后,剩余的碱基
|
||||||
|
bases.resize(read_len);
|
||||||
|
base_quals.resize(read_len);
|
||||||
|
uint8_t* seq = bam_get_seq(bw->b);
|
||||||
|
uint8_t* quals = bam_get_qual(bw->b);
|
||||||
|
for (i = 0; i < read_len; ++i) {
|
||||||
|
bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i + left_clip)];
|
||||||
|
base_quals[i] = quals[i + left_clip];
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算read两端clip之后的softstart和softend,其实S之前都被切掉了
|
||||||
|
int64_t softStart = bw->b->core.pos + ref_offset;
|
||||||
|
int64_t softEnd = softStart - 1; // 闭区间
|
||||||
|
bool rightTail = false;
|
||||||
|
for (i = cigar_start; i < cigar_end; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == cigar_start) len -= first_cigar_clip;
|
||||||
|
if (i == cigar_end - 1) len -= last_cigar_clip;
|
||||||
|
// if (c == 'S' || c == 'I')
|
||||||
|
if (c == 'S') softStart -= len;
|
||||||
|
else if (c != 'H') rightTail = true;
|
||||||
|
if (rightTail && (BaseUtils::consumeRefBases(c) || c == 'S')) softEnd += len;
|
||||||
|
}
|
||||||
|
this->softStart() = softStart; // 其实这里的softStart就是start
|
||||||
|
this->softEnd() = softEnd; // 同上
|
||||||
|
|
||||||
|
// 计算clip之后的cigar,其实可以考虑下边的代码和上边的换一下位置
|
||||||
|
for (i = cigar_start; i < cigar_end; ++i) {
|
||||||
|
char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == cigar_start) len -= first_cigar_clip;
|
||||||
|
if (i == cigar_end - 1) len -= last_cigar_clip;
|
||||||
|
// if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
|
||||||
|
if (i == cigar_start && c == 'D') { // 跳过开头的deletion
|
||||||
|
c = 'H';
|
||||||
|
ref_offset += len;
|
||||||
|
start_pos += len; // 更新起始位置
|
||||||
|
} else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion
|
||||||
|
c = 'H';
|
||||||
|
softEnd -= len; // 更新结束位置
|
||||||
|
}
|
||||||
|
cigars.push_back({c, len});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 给定一个ref pos,返回对应的read index和cigar操作
|
||||||
|
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
|
||||||
|
ReadIdxCigar rc;
|
||||||
|
if (refPos < start_pos)
|
||||||
|
return rc;
|
||||||
|
int firstReadPosOfElement = 0; // inclusive
|
||||||
|
int firstRefPosOfElement = start_pos; // inclusive
|
||||||
|
int lastReadPosOfElement = 0; // exclusive
|
||||||
|
int lastRefPosOfElement = start_pos; // exclusive
|
||||||
|
// advance forward through all the cigar elements until we bracket the reference coordinate
|
||||||
|
for (auto& cigar : cigars) {
|
||||||
|
firstReadPosOfElement = lastReadPosOfElement;
|
||||||
|
firstRefPosOfElement = lastRefPosOfElement;
|
||||||
|
lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
|
||||||
|
lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
|
||||||
|
if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element
|
||||||
|
int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
|
||||||
|
rc.cigarOp = cigar.op;
|
||||||
|
rc.readIdx = readPosAtRefCoord;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
@ -0,0 +1,73 @@
|
||||||
|
/*
|
||||||
|
Description: 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2025/12/29
|
||||||
|
*/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
// 不能用于bool类型, 因为在c++中vector<bool>是特化过的
|
||||||
|
template <class T>
|
||||||
|
struct StableArray {
|
||||||
|
vector<T> arr;
|
||||||
|
size_t idx = 0;
|
||||||
|
void clear() { idx = 0; }
|
||||||
|
size_t size() { return idx; }
|
||||||
|
bool empty() { return idx == 0; }
|
||||||
|
void reserve(size_t _size) {
|
||||||
|
if (arr.size() < _size)
|
||||||
|
arr.reserve(_size);
|
||||||
|
}
|
||||||
|
void resize(size_t _size) {
|
||||||
|
if (arr.size() < _size) {
|
||||||
|
arr.resize(_size);
|
||||||
|
}
|
||||||
|
idx = _size;
|
||||||
|
}
|
||||||
|
|
||||||
|
void resize(size_t _size, const T &val) {
|
||||||
|
if (arr.size() < _size) {
|
||||||
|
arr.resize(_size);
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < _size; ++i) {
|
||||||
|
arr[i] = val;
|
||||||
|
}
|
||||||
|
idx = _size;
|
||||||
|
}
|
||||||
|
void push_back(const T& val) {
|
||||||
|
if (idx < arr.size()) {
|
||||||
|
arr[idx++] = val;
|
||||||
|
} else {
|
||||||
|
arr.push_back(val);
|
||||||
|
idx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline T& operator[](size_t pos) { return arr[pos]; }
|
||||||
|
//inline const T& operator[](size_t pos) { return arr[pos]; }
|
||||||
|
inline const T& operator[](size_t pos) const { return arr[pos]; }
|
||||||
|
struct iterator {
|
||||||
|
typename std::vector<T>::iterator it;
|
||||||
|
iterator(typename std::vector<T>::iterator _it) : it(_it) {}
|
||||||
|
iterator& operator++() {
|
||||||
|
++it;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
iterator& operator--() {
|
||||||
|
--it;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
T& operator*() const { return *it; }
|
||||||
|
bool operator!=(const iterator& other) const { return it != other.it; }
|
||||||
|
bool operator==(const iterator& other) const { return it == other.it; }
|
||||||
|
};
|
||||||
|
|
||||||
|
iterator begin() { return arr.begin(); }
|
||||||
|
iterator end() { return arr.begin() + idx; }
|
||||||
|
};
|
||||||
Loading…
Reference in New Issue