串行版本还差最后一步,将信息合并到数据汇总表中
This commit is contained in:
parent
1e9b58fac1
commit
25f079b936
|
|
@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
|
||||||
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
|
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
|
||||||
|
|
||||||
// 计算baq数组,返回成功与否
|
// 计算baq数组,返回成功与否
|
||||||
bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) {
|
bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
|
||||||
// 检测ref是否覆盖了read
|
// 检测ref是否覆盖了read
|
||||||
if (ref.size() < refOffset + ad.read_len) {
|
if (ref.size() < refOffset + ad.read_len) {
|
||||||
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
|
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
|
||||||
|
|
|
||||||
|
|
@ -84,5 +84,5 @@ struct BAQ {
|
||||||
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
|
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
|
||||||
|
|
||||||
// 计算baq数组,返回成功与否
|
// 计算baq数组,返回成功与否
|
||||||
bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray);
|
bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
|
||||||
};
|
};
|
||||||
|
|
@ -36,6 +36,9 @@ Date : 2023/10/23
|
||||||
using std::deque;
|
using std::deque;
|
||||||
|
|
||||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
||||||
|
#define MAX_SITES_INTERVAL 100000
|
||||||
|
|
||||||
|
const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
|
||||||
|
|
||||||
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
||||||
|
|
||||||
|
|
@ -210,7 +213,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar
|
||||||
}
|
}
|
||||||
|
|
||||||
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
||||||
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) {
|
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) {
|
||||||
int start, stop;
|
int start, stop;
|
||||||
// Determine the read coordinate to start and stop hard clipping
|
// Determine the read coordinate to start and stop hard clipping
|
||||||
if (refStart < 0) {
|
if (refStart < 0) {
|
||||||
|
|
@ -235,7 +238,7 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAddi
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算切掉adapter之后,ref相对原始ref的偏移量
|
// 计算切掉adapter之后,ref相对原始ref的偏移量
|
||||||
void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
|
void calculateRefOffset(BamWrap *bw, SamData &ad) {
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
@ -253,16 +256,68 @@ void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算clip处理之后,剩余的碱基
|
// 计算clip处理之后,剩余的碱基
|
||||||
void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) {
|
void calculateReadBases(BamWrap* bw, SamData& ad) {
|
||||||
ad.bases.resize(ad.read_len);
|
ad.bases.resize(ad.read_len);
|
||||||
|
ad.quals.resize(ad.read_len);
|
||||||
uint8_t* seq = bam_get_seq(bw->b);
|
uint8_t* seq = bam_get_seq(bw->b);
|
||||||
|
uint8_t* quals = bam_get_qual(bw->b);
|
||||||
for (int i = 0; i < ad.read_len; ++i) {
|
for (int i = 0; i < ad.read_len; ++i) {
|
||||||
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
|
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
|
||||||
|
ad.quals[i] = quals[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 计算read两端clip之后的softstart和softend
|
||||||
|
void calculateSoftStartEnd(BamWrap* bw, SamData& ad) {
|
||||||
|
int64_t softStart = bw->b->core.pos + ad.ref_offset;
|
||||||
|
int64_t softEnd = softStart - 1; // 闭区间
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
int cigar_start = ad.cigar_start;
|
||||||
|
int cigar_end = ad.cigar_end;
|
||||||
|
bool rightTail = false;
|
||||||
|
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
|
||||||
|
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
|
||||||
|
// if (c == 'S' || c == 'I')
|
||||||
|
if (c == 'S')
|
||||||
|
softStart -= len;
|
||||||
|
else if (c != 'H')
|
||||||
|
rightTail = true;
|
||||||
|
if (rightTail) {
|
||||||
|
if (consumeRefBases(c) || c == 'S')
|
||||||
|
softEnd += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ad.softStart() = softStart;
|
||||||
|
ad.softEnd() = softEnd;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算clip之后的cigar
|
||||||
|
void calculateCigar(BamWrap* bw, SamData& ad) {
|
||||||
|
ad.cigars.clear();
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
int cigar_start = ad.cigar_start;
|
||||||
|
int cigar_end = ad.cigar_end;
|
||||||
|
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == ad.cigar_start)
|
||||||
|
len -= ad.first_cigar_clip;
|
||||||
|
if (i == ad.cigar_end - 1)
|
||||||
|
len -= ad.last_cigar_clip;
|
||||||
|
ad.cigars.push_back({c, len});
|
||||||
|
}
|
||||||
|
//for(auto &cigar : ad.cigars) {
|
||||||
|
// spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
|
||||||
|
//}
|
||||||
|
}
|
||||||
|
|
||||||
// 计算read两端softclip的碱基数量,可能会修改ad里的clip值
|
// 计算read两端softclip的碱基数量,可能会修改ad里的clip值
|
||||||
void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) {
|
void calculateSoftClip(BamWrap *bw, SamData &ad) {
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int readIndex = ad.left_clip;
|
int readIndex = ad.left_clip;
|
||||||
|
|
@ -324,10 +379,12 @@ inline void updateIndel(vector<int> &isIndel, int index) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算该read的每个碱基位置是否是SNP或Indel
|
// 计算该read的每个碱基位置是否是SNP或Indel
|
||||||
int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
|
int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
|
||||||
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
||||||
Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
|
Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
|
||||||
|
PROF_START(ref);
|
||||||
read_ref_base(aux, interval.left, interval);
|
read_ref_base(aux, interval.left, interval);
|
||||||
|
PROF_END(gprof[GP_read_ref], ref);
|
||||||
string refBases(aux.ref_seq);
|
string refBases(aux.ref_seq);
|
||||||
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
|
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
|
||||||
|
|
||||||
|
|
@ -381,13 +438,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector
|
||||||
}
|
}
|
||||||
|
|
||||||
// 简单计算baq数组,就是全部赋值为'@' (64)
|
// 简单计算baq数组,就是全部赋值为'@' (64)
|
||||||
bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
|
bool flatBAQArray(BamWrap* bw, SamData& ad, vector<int>& baqArray) {
|
||||||
baqArray.resize(ad.read_len, (int)'@');
|
baqArray.resize(ad.read_len, (int)'@');
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了
|
// 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了
|
||||||
bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
|
bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector<int>& baqArray) {
|
||||||
baqArray.resize(ad.read_len, 0);
|
baqArray.resize(ad.read_len, 0);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
@ -404,7 +461,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||||
void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) {
|
void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
|
||||||
int tid = bw->contig_id();
|
int tid = bw->contig_id();
|
||||||
uint64_t startPos = bw->start_pos(); // 闭区间
|
uint64_t startPos = bw->start_pos(); // 闭区间
|
||||||
uint64_t endPos = bw->end_pos(); // 闭区间
|
uint64_t endPos = bw->end_pos(); // 闭区间
|
||||||
|
|
@ -427,6 +484,8 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
|
||||||
//spdlog::info("before intervals : {}", vcf.knownSites.size());
|
//spdlog::info("before intervals : {}", vcf.knownSites.size());
|
||||||
// 读取新的interval
|
// 读取新的interval
|
||||||
int64_t fpos, flen;
|
int64_t fpos, flen;
|
||||||
|
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
||||||
|
Interval readIntv(startPos, endPos);
|
||||||
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
||||||
//spdlog::info("file index: {}, {}", fpos, flen);
|
//spdlog::info("file index: {}, {}", fpos, flen);
|
||||||
if (flen > 0) {
|
if (flen > 0) {
|
||||||
|
|
@ -448,7 +507,9 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
|
||||||
string id, ref;
|
string id, ref;
|
||||||
ss_line >> stid >> pos >> id >> ref;
|
ss_line >> stid >> pos >> id >> ref;
|
||||||
tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
|
tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
|
||||||
if (tid >= 0 && pos > 0) {
|
int64_t varStart = BamWrap::bam_global_pos(tid, pos);
|
||||||
|
Interval varIntv(varStart, varStart + ref.size() - 1);
|
||||||
|
if (readIntv.overlaps(varIntv)) {
|
||||||
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
|
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
|
||||||
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
|
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
|
||||||
}
|
}
|
||||||
|
|
@ -461,6 +522,74 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
|
||||||
//}
|
//}
|
||||||
}
|
}
|
||||||
//exit(0);
|
//exit(0);
|
||||||
|
knownSites.resize(ad.read_len);
|
||||||
|
endPos = bw->end_pos();
|
||||||
|
|
||||||
|
for(auto &vcf : vcfs) {
|
||||||
|
for (auto &intv : vcf.knownSites) {
|
||||||
|
// knownSite is outside clipping window for the read, ignore
|
||||||
|
if (intv.right < ad.softStart())
|
||||||
|
continue;
|
||||||
|
if (intv.left > ad.softEnd())
|
||||||
|
break;
|
||||||
|
// 计算对应的位点
|
||||||
|
ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left);
|
||||||
|
int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
|
||||||
|
if (rcStart.cigarOp == 'D') {
|
||||||
|
--featureStartOnRead;
|
||||||
|
}
|
||||||
|
ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right);
|
||||||
|
int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx;
|
||||||
|
if (featureStartOnRead > ad.read_len) {
|
||||||
|
featureStartOnRead = featureEndOnRead = ad.read_len;
|
||||||
|
}
|
||||||
|
for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) {
|
||||||
|
knownSites[i] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// 应该是计算一段数据的平均值
|
||||||
|
static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
|
||||||
|
int totalErrors = 0;
|
||||||
|
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
|
||||||
|
totalErrors += errorArr[j];
|
||||||
|
}
|
||||||
|
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
|
||||||
|
fracErrs[j] = ((double)totalErrors) / ((double)(i - max(0, blockStartIndex - 1) + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 应该是用来处理BAQ的,把不等于特定BAQ分数的碱基作为一段数据统一处理
|
||||||
|
void calculateFractionalErrorArray(vector<int>& errorArr, vector<int>& baqArr, vector<double>& fracErrs) {
|
||||||
|
fracErrs.resize(baqArr.size());
|
||||||
|
// errorArray和baqArray必须长度相同
|
||||||
|
const int BLOCK_START_UNSET = -1;
|
||||||
|
bool inBlock = false;
|
||||||
|
int blockStartIndex = BLOCK_START_UNSET;
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < fracErrs.size(); ++i) {
|
||||||
|
if (baqArr[i] == NO_BAQ_UNCERTAINTY) { // 不等于NO_BAQ_UNCERTAINTY的碱基当成一段数据来处理
|
||||||
|
if (!inBlock) {
|
||||||
|
fracErrs[i] = errorArr[i];
|
||||||
|
} else {
|
||||||
|
calculateAndStoreErrorsInBlock(i, blockStartIndex, errorArr, fracErrs);
|
||||||
|
inBlock = false; // reset state variables
|
||||||
|
blockStartIndex = BLOCK_START_UNSET; // reset state variables
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
inBlock = true;
|
||||||
|
if (blockStartIndex == BLOCK_START_UNSET) {
|
||||||
|
blockStartIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 处理最后一个block
|
||||||
|
if (inBlock) {
|
||||||
|
calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 串行bqsr
|
// 串行bqsr
|
||||||
|
|
@ -513,7 +642,7 @@ int SerialBQSR() {
|
||||||
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
||||||
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
||||||
BamWrap *bw = bams[i];
|
BamWrap *bw = bams[i];
|
||||||
ReadAdditionData ad;
|
SamData ad;
|
||||||
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
|
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
|
||||||
ad.cigar_end = bw->b->core.n_cigar;
|
ad.cigar_end = bw->b->core.n_cigar;
|
||||||
if (ad.read_len <= 0) continue;
|
if (ad.read_len <= 0) continue;
|
||||||
|
|
@ -535,6 +664,9 @@ int SerialBQSR() {
|
||||||
|
|
||||||
calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少
|
calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少
|
||||||
calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基
|
calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基
|
||||||
|
// 计算clip之后,两端的softstart和softend
|
||||||
|
calculateSoftStartEnd(bw, ad);
|
||||||
|
calculateCigar(bw, ad);
|
||||||
|
|
||||||
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
|
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
|
||||||
// ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
|
// ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
|
||||||
|
|
@ -560,7 +692,9 @@ int SerialBQSR() {
|
||||||
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
||||||
|
|
||||||
// 8. 计算这条read对应的协变量
|
// 8. 计算这条read对应的协变量
|
||||||
|
PROF_START(covariate);
|
||||||
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
|
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
|
||||||
|
PROF_END(gprof[GP_covariate], covariate);
|
||||||
test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
||||||
int end_pos = bw->contig_end_pos();
|
int end_pos = bw->contig_end_pos();
|
||||||
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
||||||
|
|
@ -568,7 +702,23 @@ int SerialBQSR() {
|
||||||
|
|
||||||
// 9. 计算这条read需要跳过的位置
|
// 9. 计算这条read需要跳过的位置
|
||||||
vector<bool> skip(ad.read_len, 0);
|
vector<bool> skip(ad.read_len, 0);
|
||||||
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr);
|
PROF_START(known_sites);
|
||||||
|
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skip);
|
||||||
|
for (int ii = 0; ii < ad.read_len; ++ii) {
|
||||||
|
skip[ii] =
|
||||||
|
skip[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || ad.quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
|
}
|
||||||
|
PROF_END(gprof[GP_read_vcf], known_sites);
|
||||||
|
|
||||||
|
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
||||||
|
vector<double> snpErrors, insErrors, delErrors;
|
||||||
|
calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
|
||||||
|
calculateFractionalErrorArray(isIns, baqArray, insErrors);
|
||||||
|
calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
||||||
|
|
||||||
|
// aggregate all of the info into our info object, and update the data
|
||||||
|
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
|
||||||
|
|
@ -22,16 +22,16 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE;
|
||||||
|
|
||||||
// for CovariateUtils
|
// for CovariateUtils
|
||||||
// 对一条read计算协变量(该协变量被上一个read用过)
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
||||||
void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||||||
bool recordIndelValues) {
|
bool recordIndelValues) {
|
||||||
//ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
//BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
//ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
//CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReadGroupCovariate 协变量的方法
|
// ReadGroupCovariate 协变量的方法
|
||||||
void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
uint8_t *rgStr = bam_aux_get(bw->b, "RG");
|
uint8_t *rgStr = bam_aux_get(bw->b, "RG");
|
||||||
char* rgVal = nullptr;
|
char* rgVal = nullptr;
|
||||||
if (rgStr) rgVal = bam_aux2Z(rgStr);
|
if (rgStr) rgVal = bam_aux2Z(rgStr);
|
||||||
|
|
@ -47,7 +47,7 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr
|
||||||
}
|
}
|
||||||
|
|
||||||
// BaseQualityCovariate 协变量的方法
|
// BaseQualityCovariate 协变量的方法
|
||||||
void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||||||
bool recordIndelValues) {
|
bool recordIndelValues) {
|
||||||
// 在前面的处理过后,quals应该和base长度一致了
|
// 在前面的处理过后,quals应该和base长度一致了
|
||||||
#define __bq_set_cov(ins, del) \
|
#define __bq_set_cov(ins, del) \
|
||||||
|
|
@ -98,7 +98,7 @@ static char SimpleComplement(const char base) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) {
|
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
|
||||||
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
|
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
|
||||||
|
|
||||||
if (bw->GetReadNegativeStrandFlag()) { // 反向互补
|
if (bw->GetReadNegativeStrandFlag()) { // 反向互补
|
||||||
|
|
@ -218,7 +218,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
const int originalReadLength = ad.read_len;
|
const int originalReadLength = ad.read_len;
|
||||||
|
|
||||||
// store the original bases and then write Ns over low quality ones
|
// store the original bases and then write Ns over low quality ones
|
||||||
|
|
@ -270,7 +270,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t
|
||||||
* @param maxCycle max value of the base to compute the key for
|
* @param maxCycle max value of the base to compute the key for
|
||||||
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
|
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
|
||||||
*/
|
*/
|
||||||
int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) {
|
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
|
||||||
const bool isNegStrand = bw->GetReadNegativeStrandFlag();
|
const bool isNegStrand = bw->GetReadNegativeStrandFlag();
|
||||||
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
|
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
|
||||||
const int readLength = ad.read_len;
|
const int readLength = ad.read_len;
|
||||||
|
|
@ -299,7 +299,7 @@ int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNu
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to pick out the covariate's value from attributes of the read
|
// Used to pick out the covariate's value from attributes of the read
|
||||||
void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
const int readLength = ad.read_len;
|
const int readLength = ad.read_len;
|
||||||
// Note: duplicate the loop to void checking recordIndelValues on every iteration
|
// Note: duplicate the loop to void checking recordIndelValues on every iteration
|
||||||
if (recordIndelValues) {
|
if (recordIndelValues) {
|
||||||
|
|
|
||||||
|
|
@ -69,7 +69,7 @@ struct CovariateUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 对一条read计算协变量(该协变量被上一个read用过)
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
||||||
static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Read group协变量
|
// Read group协变量
|
||||||
|
|
@ -78,13 +78,13 @@ struct ReadGroupCovariate {
|
||||||
static map<string, int> RgToId; // read group name到id的映射
|
static map<string, int> RgToId; // read group name到id的映射
|
||||||
static map<int, string> IdToRg; // id到read group name的映射
|
static map<int, string> IdToRg; // id到read group name的映射
|
||||||
|
|
||||||
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Base quality协变量
|
// Base quality协变量
|
||||||
struct BaseQualityCovariate {
|
struct BaseQualityCovariate {
|
||||||
static constexpr int index = 1; // 在协变量数组中的索引位置
|
static constexpr int index = 1; // 在协变量数组中的索引位置
|
||||||
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Context协变量
|
// Context协变量
|
||||||
|
|
@ -162,14 +162,14 @@ struct ContextCovariate {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail);
|
static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail);
|
||||||
// Creates a int representation of a given dna string.
|
// Creates a int representation of a given dna string.
|
||||||
static int KeyFromContext(const string& dna, const int start, const int end);
|
static int KeyFromContext(const string& dna, const int start, const int end);
|
||||||
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||||
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
|
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
|
||||||
|
|
||||||
// 设置协变量的值
|
// 设置协变量的值
|
||||||
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Cycle协变量
|
// Cycle协变量
|
||||||
|
|
@ -203,7 +203,7 @@ struct CycleCovariate {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
||||||
static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
||||||
|
|
||||||
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
@ -11,16 +11,61 @@
|
||||||
#include <htslib/sam.h>
|
#include <htslib/sam.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <queue>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
struct Cigar {
|
||||||
|
char op = '0';
|
||||||
|
int len = 0;
|
||||||
|
// 该操作符是否消耗read的碱基
|
||||||
|
static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
|
||||||
|
// 该操作符是否消耗参考基因组的碱基
|
||||||
|
static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ReadIdxCigar {
|
||||||
|
int readIdx = 0; // 在read序列中的位置
|
||||||
|
char cigarOp = '0'; // 当前位置对应的cigar
|
||||||
|
};
|
||||||
|
|
||||||
|
// 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作
|
||||||
|
template <class T>
|
||||||
|
struct FastArray {
|
||||||
|
vector<T> arr;
|
||||||
|
size_t idx;
|
||||||
|
void clear() { idx = 0; }
|
||||||
|
size_t size() { return idx; }
|
||||||
|
void push_back(const T& val) {
|
||||||
|
if (idx < arr.size()) {
|
||||||
|
arr[idx++] = val;
|
||||||
|
} else {
|
||||||
|
arr.push_back(val);
|
||||||
|
idx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
struct iterator {
|
||||||
|
typename std::vector<T>::iterator it;
|
||||||
|
iterator(typename std::vector<T>::iterator _it) : it(_it) {}
|
||||||
|
iterator& operator++() { ++it; return *this;}
|
||||||
|
iterator& operator--() { --it; return *this;}
|
||||||
|
T& operator*() const { return *it; }
|
||||||
|
bool operator!=(const iterator& other) const { return it != other.it; }
|
||||||
|
bool operator==(const iterator& other) const { return it == other.it; }
|
||||||
|
};
|
||||||
|
|
||||||
|
iterator begin() { return arr.begin(); }
|
||||||
|
iterator end() { return arr.begin() + idx; }
|
||||||
|
};
|
||||||
|
|
||||||
// 对原始bam数据的补充,比如对两端进行hardclip等
|
// 对原始bam数据的补充,比如对两端进行hardclip等
|
||||||
struct ReadAdditionData {
|
struct SamData {
|
||||||
int read_len = 0; // read长度,各种clip之后的长度
|
int read_len = 0; // read长度,各种clip之后的长度
|
||||||
int cigar_start = 0; // cigar起始位置,闭区间
|
int cigar_start = 0; // cigar起始位置,闭区间
|
||||||
int cigar_end = 0; // cigar结束位置,开区间
|
int cigar_end = 0; // cigar结束位置,开区间
|
||||||
|
|
@ -29,7 +74,41 @@ struct ReadAdditionData {
|
||||||
int left_clip = 0; // 左侧被切掉的碱基长度
|
int left_clip = 0; // 左侧被切掉的碱基长度
|
||||||
int right_clip = 0; // 右侧被切掉的碱基长度
|
int right_clip = 0; // 右侧被切掉的碱基长度
|
||||||
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量
|
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量
|
||||||
string bases; // 处理之后的read的碱基
|
|
||||||
|
// 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑
|
||||||
|
static constexpr int READ_INDEX_NOT_FOUND = -1;
|
||||||
|
string bases; // 处理之后的read的碱基
|
||||||
|
vector<uint8_t> quals; // 对应的质量分数
|
||||||
|
int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间
|
||||||
|
int64_t end_pos; // 同上,闭区间
|
||||||
|
FastArray<Cigar> cigars;
|
||||||
|
int64_t& softStart() { return start_pos; }
|
||||||
|
int64_t& softEnd() { return end_pos; }
|
||||||
|
|
||||||
|
// functions
|
||||||
|
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
|
||||||
|
ReadIdxCigar rc;
|
||||||
|
if (refPos < start_pos)
|
||||||
|
return rc;
|
||||||
|
int firstReadPosOfElement = 0; // inclusive
|
||||||
|
int firstRefPosOfElement = start_pos; // inclusive
|
||||||
|
int lastReadPosOfElement = 0; // exclusive
|
||||||
|
int lastRefPosOfElement = start_pos; // exclusive
|
||||||
|
// advance forward through all the cigar elements until we bracket the reference coordinate
|
||||||
|
for (auto& cigar : cigars) {
|
||||||
|
firstReadPosOfElement = lastReadPosOfElement;
|
||||||
|
firstRefPosOfElement = lastRefPosOfElement;
|
||||||
|
lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
|
||||||
|
lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
|
||||||
|
if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element
|
||||||
|
int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
|
||||||
|
rc.cigarOp = cigar.op;
|
||||||
|
rc.readIdx = readPosAtRefCoord;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
||||||
|
|
@ -54,11 +54,11 @@ int DisplayProfiling(int nthread) {
|
||||||
// PRINT_GP(sort_wait);
|
// PRINT_GP(sort_wait);
|
||||||
// PRINT_GP(markdup_wait);
|
// PRINT_GP(markdup_wait);
|
||||||
// PRINT_GP(intersect_wait);
|
// PRINT_GP(intersect_wait);
|
||||||
PRINT_GP(read);
|
PRINT_GP(read_ref);
|
||||||
PRINT_GP(gen);
|
PRINT_GP(read_vcf);
|
||||||
PRINT_GP(sort);
|
PRINT_GP(covariate);
|
||||||
PRINT_GP(markdup);
|
//PRINT_GP(markdup);
|
||||||
PRINT_GP(intersect);
|
//PRINT_GP(intersect);
|
||||||
// PRINT_GP(merge_result);
|
// PRINT_GP(merge_result);
|
||||||
// PRINT_GP(sort_pair);
|
// PRINT_GP(sort_pair);
|
||||||
// PRINT_GP(sort_frag);
|
// PRINT_GP(sort_frag);
|
||||||
|
|
@ -68,10 +68,10 @@ int DisplayProfiling(int nthread) {
|
||||||
// PRINT_GP(merge_markdup);
|
// PRINT_GP(merge_markdup);
|
||||||
// PRINT_GP(merge_update);
|
// PRINT_GP(merge_update);
|
||||||
// PRINT_GP(merge_add);
|
// PRINT_GP(merge_add);
|
||||||
PRINT_GP(markdup_all);
|
//PRINT_GP(markdup_all);
|
||||||
// PRINT_GP(final_read);
|
// PRINT_GP(final_read);
|
||||||
PRINT_GP(write);
|
//PRINT_GP(write);
|
||||||
PRINT_GP(whole_process);
|
// PRINT_GP(whole_process);
|
||||||
|
|
||||||
// PRINT_TP(gen, nthread);
|
// PRINT_TP(gen, nthread);
|
||||||
// PRINT_TP(sort_frag, nthread);
|
// PRINT_TP(sort_frag, nthread);
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,9 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
|
||||||
enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
|
enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
|
||||||
enum {
|
enum {
|
||||||
GP_read_wait = 11,
|
GP_read_wait = 11,
|
||||||
|
GP_covariate,
|
||||||
|
GP_read_ref,
|
||||||
|
GP_read_vcf,
|
||||||
GP_gen_wait,
|
GP_gen_wait,
|
||||||
GP_sort_wait,
|
GP_sort_wait,
|
||||||
GP_markdup_wait,
|
GP_markdup_wait,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue