串行版本还差最后一步,将信息合并到数据汇总表中

This commit is contained in:
zzh 2025-12-24 12:47:26 +08:00
parent 1e9b58fac1
commit 25f079b936
9 changed files with 274 additions and 42 deletions

View File

@ -6,7 +6,7 @@ vector<double> BAQ::qual2prob(256); // 质量分数转化概率
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual] vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
// 计算baq数组返回成功与否 // 计算baq数组返回成功与否
bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) { bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray) {
// 检测ref是否覆盖了read // 检测ref是否覆盖了read
if (ref.size() < refOffset + ad.read_len) { if (ref.size() < refOffset + ad.read_len) {
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})", spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",

View File

@ -84,5 +84,5 @@ struct BAQ {
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; } double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
// 计算baq数组返回成功与否 // 计算baq数组返回成功与否
bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray); bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector<int>& baqArray);
}; };

View File

@ -36,6 +36,9 @@ Date : 2023/10/23
using std::deque; using std::deque;
#define BAM_BLOCK_SIZE 16L * 1024 * 1024 #define BAM_BLOCK_SIZE 16L * 1024 * 1024
#define MAX_SITES_INTERVAL 100000
const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
@ -210,7 +213,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar
} }
// 根据adapter位置对read进行hardclip返回左侧或右侧减掉的base数量 // 根据adapter位置对read进行hardclip返回左侧或右侧减掉的base数量
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) { void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) {
int start, stop; int start, stop;
// Determine the read coordinate to start and stop hard clipping // Determine the read coordinate to start and stop hard clipping
if (refStart < 0) { if (refStart < 0) {
@ -235,7 +238,7 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAddi
} }
// 计算切掉adapter之后ref相对原始ref的偏移量 // 计算切掉adapter之后ref相对原始ref的偏移量
void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) { void calculateRefOffset(BamWrap *bw, SamData &ad) {
const uint32_t* cigar = bam_get_cigar(bw->b); const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core; const bam1_core_t& bc = bw->b->core;
int i = 0; int i = 0;
@ -253,16 +256,68 @@ void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
} }
// 计算clip处理之后剩余的碱基 // 计算clip处理之后剩余的碱基
void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) { void calculateReadBases(BamWrap* bw, SamData& ad) {
ad.bases.resize(ad.read_len); ad.bases.resize(ad.read_len);
ad.quals.resize(ad.read_len);
uint8_t* seq = bam_get_seq(bw->b); uint8_t* seq = bam_get_seq(bw->b);
uint8_t* quals = bam_get_qual(bw->b);
for (int i = 0; i < ad.read_len; ++i) { for (int i = 0; i < ad.read_len; ++i) {
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)]; ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
ad.quals[i] = quals[i];
} }
} }
// 计算read两端clip之后的softstart和softend
void calculateSoftStartEnd(BamWrap* bw, SamData& ad) {
int64_t softStart = bw->b->core.pos + ad.ref_offset;
int64_t softEnd = softStart - 1; // 闭区间
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
int cigar_start = ad.cigar_start;
int cigar_end = ad.cigar_end;
bool rightTail = false;
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
// if (c == 'S' || c == 'I')
if (c == 'S')
softStart -= len;
else if (c != 'H')
rightTail = true;
if (rightTail) {
if (consumeRefBases(c) || c == 'S')
softEnd += len;
}
}
ad.softStart() = softStart;
ad.softEnd() = softEnd;
}
// 计算clip之后的cigar
void calculateCigar(BamWrap* bw, SamData& ad) {
ad.cigars.clear();
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
int cigar_start = ad.cigar_start;
int cigar_end = ad.cigar_end;
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (i == ad.cigar_start)
len -= ad.first_cigar_clip;
if (i == ad.cigar_end - 1)
len -= ad.last_cigar_clip;
ad.cigars.push_back({c, len});
}
//for(auto &cigar : ad.cigars) {
// spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
//}
}
// 计算read两端softclip的碱基数量可能会修改ad里的clip值 // 计算read两端softclip的碱基数量可能会修改ad里的clip值
void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) { void calculateSoftClip(BamWrap *bw, SamData &ad) {
const uint32_t* cigar = bam_get_cigar(bw->b); const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core; const bam1_core_t& bc = bw->b->core;
int readIndex = ad.left_clip; int readIndex = ad.left_clip;
@ -324,10 +379,12 @@ inline void updateIndel(vector<int> &isIndel, int index) {
} }
// 计算该read的每个碱基位置是否是SNP或Indel // 计算该read的每个碱基位置是否是SNP或Indel
int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) { int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
// 1. 读取参考基因组先看看串行运行性能稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤 // 1. 读取参考基因组先看看串行运行性能稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间 Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
PROF_START(ref);
read_ref_base(aux, interval.left, interval); read_ref_base(aux, interval.left, interval);
PROF_END(gprof[GP_read_ref], ref);
string refBases(aux.ref_seq); string refBases(aux.ref_seq);
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos()); // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
@ -381,13 +438,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector
} }
// 简单计算baq数组就是全部赋值为'@' (64) // 简单计算baq数组就是全部赋值为'@' (64)
bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) { bool flatBAQArray(BamWrap* bw, SamData& ad, vector<int>& baqArray) {
baqArray.resize(ad.read_len, (int)'@'); baqArray.resize(ad.read_len, (int)'@');
return true; return true;
} }
// 计算真实的baq数组耗时更多好像enable-baq参数默认是关闭的那就先不实现这个了 // 计算真实的baq数组耗时更多好像enable-baq参数默认是关闭的那就先不实现这个了
bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) { bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector<int>& baqArray) {
baqArray.resize(ad.read_len, 0); baqArray.resize(ad.read_len, 0);
return true; return true;
} }
@ -404,7 +461,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
} }
// 计算与read有交叉的已知位点信息 应该要判断一下是按照read的范围去读取vcf还是按照一个batch read的范围去读取 // 计算与read有交叉的已知位点信息 应该要判断一下是按照read的范围去读取vcf还是按照一个batch read的范围去读取
void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) { void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
int tid = bw->contig_id(); int tid = bw->contig_id();
uint64_t startPos = bw->start_pos(); // 闭区间 uint64_t startPos = bw->start_pos(); // 闭区间
uint64_t endPos = bw->end_pos(); // 闭区间 uint64_t endPos = bw->end_pos(); // 闭区间
@ -427,6 +484,8 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
//spdlog::info("before intervals : {}", vcf.knownSites.size()); //spdlog::info("before intervals : {}", vcf.knownSites.size());
// 读取新的interval // 读取新的interval
int64_t fpos, flen; int64_t fpos, flen;
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
Interval readIntv(startPos, endPos);
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen); vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
//spdlog::info("file index: {}, {}", fpos, flen); //spdlog::info("file index: {}, {}", fpos, flen);
if (flen > 0) { if (flen > 0) {
@ -448,7 +507,9 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
string id, ref; string id, ref;
ss_line >> stid >> pos >> id >> ref; ss_line >> stid >> pos >> id >> ref;
tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str()); tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
if (tid >= 0 && pos > 0) { int64_t varStart = BamWrap::bam_global_pos(tid, pos);
Interval varIntv(varStart, varStart + ref.size() - 1);
if (readIntv.overlaps(varIntv)) {
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size())); vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size()); //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
} }
@ -461,6 +522,74 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &v
//} //}
} }
//exit(0); //exit(0);
knownSites.resize(ad.read_len);
endPos = bw->end_pos();
for(auto &vcf : vcfs) {
for (auto &intv : vcf.knownSites) {
// knownSite is outside clipping window for the read, ignore
if (intv.right < ad.softStart())
continue;
if (intv.left > ad.softEnd())
break;
// 计算对应的位点
ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left);
int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
if (rcStart.cigarOp == 'D') {
--featureStartOnRead;
}
ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right);
int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx;
if (featureStartOnRead > ad.read_len) {
featureStartOnRead = featureEndOnRead = ad.read_len;
}
for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) {
knownSites[i] = true;
}
}
}
}
// 应该是计算一段数据的平均值
static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector<int>& errorArr, vector<double>& fracErrs) {
int totalErrors = 0;
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
totalErrors += errorArr[j];
}
for (int j = max(0, blockStartIndex - 1); j <= i; j++) {
fracErrs[j] = ((double)totalErrors) / ((double)(i - max(0, blockStartIndex - 1) + 1));
}
}
// 应该是用来处理BAQ的把不等于特定BAQ分数的碱基作为一段数据统一处理
void calculateFractionalErrorArray(vector<int>& errorArr, vector<int>& baqArr, vector<double>& fracErrs) {
fracErrs.resize(baqArr.size());
// errorArray和baqArray必须长度相同
const int BLOCK_START_UNSET = -1;
bool inBlock = false;
int blockStartIndex = BLOCK_START_UNSET;
int i;
for (i = 0; i < fracErrs.size(); ++i) {
if (baqArr[i] == NO_BAQ_UNCERTAINTY) { // 不等于NO_BAQ_UNCERTAINTY的碱基当成一段数据来处理
if (!inBlock) {
fracErrs[i] = errorArr[i];
} else {
calculateAndStoreErrorsInBlock(i, blockStartIndex, errorArr, fracErrs);
inBlock = false; // reset state variables
blockStartIndex = BLOCK_START_UNSET; // reset state variables
}
} else {
inBlock = true;
if (blockStartIndex == BLOCK_START_UNSET) {
blockStartIndex = i;
}
}
}
// 处理最后一个block
if (inBlock) {
calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
}
} }
// 串行bqsr // 串行bqsr
@ -513,7 +642,7 @@ int SerialBQSR() {
// 3. 如果bam文件之前做过bqsrtag中包含OQoriginnal quality原始质量分数检查用户参数里是否指定用原始质量分数进行bqsr如果是则将质量分数替换为OQ否则忽略OQ先忽略 // 3. 如果bam文件之前做过bqsrtag中包含OQoriginnal quality原始质量分数检查用户参数里是否指定用原始质量分数进行bqsr如果是则将质量分数替换为OQ否则忽略OQ先忽略
// 4. 对read的两端进行检测去除hardclipadapter // 4. 对read的两端进行检测去除hardclipadapter
BamWrap *bw = bams[i]; BamWrap *bw = bams[i];
ReadAdditionData ad; SamData ad;
ad.read_len = BamWrap::BamEffectiveLength(bw->b); ad.read_len = BamWrap::BamEffectiveLength(bw->b);
ad.cigar_end = bw->b->core.n_cigar; ad.cigar_end = bw->b->core.n_cigar;
if (ad.read_len <= 0) continue; if (ad.read_len <= 0) continue;
@ -535,6 +664,9 @@ int SerialBQSR() {
calculateRefOffset(bw, ad); // 计算ref_offset就是相对比对的position要将ref右移多少 calculateRefOffset(bw, ad); // 计算ref_offset就是相对比对的position要将ref右移多少
calculateReadBases(bw, ad); // 计算clip处理之后剩余的碱基 calculateReadBases(bw, ad); // 计算clip处理之后剩余的碱基
// 计算clip之后两端的softstart和softend
calculateSoftStartEnd(bw, ad);
calculateCigar(bw, ad);
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq, //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
// ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str()); // ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
@ -560,7 +692,9 @@ int SerialBQSR() {
// 到这里基本的数据都准备好了后续就是进行bqsr的统计了 // 到这里基本的数据都准备好了后续就是进行bqsr的统计了
// 8. 计算这条read对应的协变量 // 8. 计算这条read对应的协变量
PROF_START(covariate);
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true); CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
PROF_END(gprof[GP_covariate], covariate);
test = readCovariates[1][0][0] + readCovariates[2][1][3]; test = readCovariates[1][0][0] + readCovariates[2][1][3];
int end_pos = bw->contig_end_pos(); int end_pos = bw->contig_end_pos();
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
@ -568,7 +702,23 @@ int SerialBQSR() {
// 9. 计算这条read需要跳过的位置 // 9. 计算这条read需要跳过的位置
vector<bool> skip(ad.read_len, 0); vector<bool> skip(ad.read_len, 0);
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr); PROF_START(known_sites);
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skip);
for (int ii = 0; ii < ad.read_len; ++ii) {
skip[ii] =
skip[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || ad.quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
}
PROF_END(gprof[GP_read_vcf], known_sites);
// 10. 根据BAQ进一步处理snpindel得到处理后的数据
vector<double> snpErrors, insErrors, delErrors;
calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
calculateFractionalErrorArray(isIns, baqArray, insErrors);
calculateFractionalErrorArray(isDel, baqArray, delErrors);
// aggregate all of the info into our info object, and update the data
// 11. 合并之前计算的数据得到info并更新bqsr table数据
} }
#if 0 #if 0

View File

@ -22,16 +22,16 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE;
// for CovariateUtils // for CovariateUtils
// 对一条read计算协变量该协变量被上一个read用过 // 对一条read计算协变量该协变量被上一个read用过
void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
bool recordIndelValues) { bool recordIndelValues) {
//ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues); ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
//BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues); BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
//ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues); ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
//CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues); CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
} }
// ReadGroupCovariate 协变量的方法 // ReadGroupCovariate 协变量的方法
void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
uint8_t *rgStr = bam_aux_get(bw->b, "RG"); uint8_t *rgStr = bam_aux_get(bw->b, "RG");
char* rgVal = nullptr; char* rgVal = nullptr;
if (rgStr) rgVal = bam_aux2Z(rgStr); if (rgStr) rgVal = bam_aux2Z(rgStr);
@ -47,7 +47,7 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr
} }
// BaseQualityCovariate 协变量的方法 // BaseQualityCovariate 协变量的方法
void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
bool recordIndelValues) { bool recordIndelValues) {
// 在前面的处理过后quals应该和base长度一致了 // 在前面的处理过后quals应该和base长度一致了
#define __bq_set_cov(ins, del) \ #define __bq_set_cov(ins, del) \
@ -98,7 +98,7 @@ static char SimpleComplement(const char base) {
} }
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N // 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) { void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip; uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
if (bw->GetReadNegativeStrandFlag()) { // 反向互补 if (bw->GetReadNegativeStrandFlag()) { // 反向互补
@ -218,7 +218,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i
} }
} }
void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
const int originalReadLength = ad.read_len; const int originalReadLength = ad.read_len;
// store the original bases and then write Ns over low quality ones // store the original bases and then write Ns over low quality ones
@ -270,7 +270,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t
* @param maxCycle max value of the base to compute the key for * @param maxCycle max value of the base to compute the key for
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value). * (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
*/ */
int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) { int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
const bool isNegStrand = bw->GetReadNegativeStrandFlag(); const bool isNegStrand = bw->GetReadNegativeStrandFlag();
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2); const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
const int readLength = ad.read_len; const int readLength = ad.read_len;
@ -299,7 +299,7 @@ int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNu
} }
// Used to pick out the covariate's value from attributes of the read // Used to pick out the covariate's value from attributes of the read
void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
const int readLength = ad.read_len; const int readLength = ad.read_len;
// Note: duplicate the loop to void checking recordIndelValues on every iteration // Note: duplicate the loop to void checking recordIndelValues on every iteration
if (recordIndelValues) { if (recordIndelValues) {

View File

@ -69,7 +69,7 @@ struct CovariateUtils {
} }
// 对一条read计算协变量该协变量被上一个read用过 // 对一条read计算协变量该协变量被上一个read用过
static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
}; };
// Read group协变量 // Read group协变量
@ -78,13 +78,13 @@ struct ReadGroupCovariate {
static map<string, int> RgToId; // read group name到id的映射 static map<string, int> RgToId; // read group name到id的映射
static map<int, string> IdToRg; // id到read group name的映射 static map<int, string> IdToRg; // id到read group name的映射
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
}; };
// Base quality协变量 // Base quality协变量
struct BaseQualityCovariate { struct BaseQualityCovariate {
static constexpr int index = 1; // 在协变量数组中的索引位置 static constexpr int index = 1; // 在协变量数组中的索引位置
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
}; };
// Context协变量 // Context协变量
@ -162,14 +162,14 @@ struct ContextCovariate {
} }
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N // 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail); static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail);
// Creates a int representation of a given dna string. // Creates a int representation of a given dna string.
static int KeyFromContext(const string& dna, const int start, const int end); static int KeyFromContext(const string& dna, const int start, const int end);
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys); static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
// 设置协变量的值 // 设置协变量的值
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
}; };
// Cycle协变量 // Cycle协变量
@ -203,7 +203,7 @@ struct CycleCovariate {
} }
// Computes the encoded value of CycleCovariate's key for the given position at the read. // Computes the encoded value of CycleCovariate's key for the given position at the read.
static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle); static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
}; };

View File

@ -11,16 +11,61 @@
#include <htslib/sam.h> #include <htslib/sam.h>
#include <limits.h> #include <limits.h>
#include <math.h> #include <math.h>
#include <stdlib.h>
#include <map> #include <map>
#include <queue>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
using namespace std; using namespace std;
struct Cigar {
char op = '0';
int len = 0;
// 该操作符是否消耗read的碱基
static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; }
// 该操作符是否消耗参考基因组的碱基
static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; }
};
struct ReadIdxCigar {
int readIdx = 0; // 在read序列中的位置
char cigarOp = '0'; // 当前位置对应的cigar
};
// 不用经常释放array的内存空间减少频繁的内存开辟和释放操作
template <class T>
struct FastArray {
vector<T> arr;
size_t idx;
void clear() { idx = 0; }
size_t size() { return idx; }
void push_back(const T& val) {
if (idx < arr.size()) {
arr[idx++] = val;
} else {
arr.push_back(val);
idx++;
}
}
struct iterator {
typename std::vector<T>::iterator it;
iterator(typename std::vector<T>::iterator _it) : it(_it) {}
iterator& operator++() { ++it; return *this;}
iterator& operator--() { --it; return *this;}
T& operator*() const { return *it; }
bool operator!=(const iterator& other) const { return it != other.it; }
bool operator==(const iterator& other) const { return it == other.it; }
};
iterator begin() { return arr.begin(); }
iterator end() { return arr.begin() + idx; }
};
// 对原始bam数据的补充比如对两端进行hardclip等 // 对原始bam数据的补充比如对两端进行hardclip等
struct ReadAdditionData { struct SamData {
int read_len = 0; // read长度各种clip之后的长度 int read_len = 0; // read长度各种clip之后的长度
int cigar_start = 0; // cigar起始位置闭区间 int cigar_start = 0; // cigar起始位置闭区间
int cigar_end = 0; // cigar结束位置开区间 int cigar_end = 0; // cigar结束位置开区间
@ -29,7 +74,41 @@ struct ReadAdditionData {
int left_clip = 0; // 左侧被切掉的碱基长度 int left_clip = 0; // 左侧被切掉的碱基长度
int right_clip = 0; // 右侧被切掉的碱基长度 int right_clip = 0; // 右侧被切掉的碱基长度
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响)相对原始ref比对位置contig_pos的偏移量 int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响)相对原始ref比对位置contig_pos的偏移量
string bases; // 处理之后的read的碱基
// 记录一下bqsr运算过程中用到的数据回头提前计算一下修正现在的复杂逻辑
static constexpr int READ_INDEX_NOT_FOUND = -1;
string bases; // 处理之后的read的碱基
vector<uint8_t> quals; // 对应的质量分数
int64_t start_pos; // 因为soft clip都被切掉了这里的softstart应该就是切掉之后的匹配位点闭区间
int64_t end_pos; // 同上,闭区间
FastArray<Cigar> cigars;
int64_t& softStart() { return start_pos; }
int64_t& softEnd() { return end_pos; }
// functions
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
ReadIdxCigar rc;
if (refPos < start_pos)
return rc;
int firstReadPosOfElement = 0; // inclusive
int firstRefPosOfElement = start_pos; // inclusive
int lastReadPosOfElement = 0; // exclusive
int lastRefPosOfElement = start_pos; // exclusive
// advance forward through all the cigar elements until we bracket the reference coordinate
for (auto& cigar : cigars) {
firstReadPosOfElement = lastReadPosOfElement;
firstRefPosOfElement = lastRefPosOfElement;
lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0;
lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0;
if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element
int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0);
rc.cigarOp = cigar.op;
rc.readIdx = readPosAtRefCoord;
return rc;
}
}
return rc;
}
}; };
/* /*

View File

@ -54,11 +54,11 @@ int DisplayProfiling(int nthread) {
// PRINT_GP(sort_wait); // PRINT_GP(sort_wait);
// PRINT_GP(markdup_wait); // PRINT_GP(markdup_wait);
// PRINT_GP(intersect_wait); // PRINT_GP(intersect_wait);
PRINT_GP(read); PRINT_GP(read_ref);
PRINT_GP(gen); PRINT_GP(read_vcf);
PRINT_GP(sort); PRINT_GP(covariate);
PRINT_GP(markdup); //PRINT_GP(markdup);
PRINT_GP(intersect); //PRINT_GP(intersect);
// PRINT_GP(merge_result); // PRINT_GP(merge_result);
// PRINT_GP(sort_pair); // PRINT_GP(sort_pair);
// PRINT_GP(sort_frag); // PRINT_GP(sort_frag);
@ -68,10 +68,10 @@ int DisplayProfiling(int nthread) {
// PRINT_GP(merge_markdup); // PRINT_GP(merge_markdup);
// PRINT_GP(merge_update); // PRINT_GP(merge_update);
// PRINT_GP(merge_add); // PRINT_GP(merge_add);
PRINT_GP(markdup_all); //PRINT_GP(markdup_all);
// PRINT_GP(final_read); // PRINT_GP(final_read);
PRINT_GP(write); //PRINT_GP(write);
PRINT_GP(whole_process); // PRINT_GP(whole_process);
// PRINT_TP(gen, nthread); // PRINT_TP(gen, nthread);
// PRINT_TP(sort_frag, nthread); // PRINT_TP(sort_frag, nthread);

View File

@ -40,6 +40,9 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE];
enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 }; enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 };
enum { enum {
GP_read_wait = 11, GP_read_wait = 11,
GP_covariate,
GP_read_ref,
GP_read_vcf,
GP_gen_wait, GP_gen_wait,
GP_sort_wait, GP_sort_wait,
GP_markdup_wait, GP_markdup_wait,