Compare commits
2 Commits
146055fc01
...
6662435948
| Author | SHA1 | Date |
|---|---|---|
|
|
6662435948 | |
|
|
8e3388a494 |
|
|
@ -38,6 +38,7 @@ Date : 2023/10/23
|
|||
#include "util/utils.h"
|
||||
#include "util/math/math_utils.h"
|
||||
#include "quant_info.h"
|
||||
#include "util/debug.h"
|
||||
|
||||
using std::deque;
|
||||
|
||||
|
|
@ -270,7 +271,7 @@ void calculateReadBases(BamWrap* bw, SamData& ad) {
|
|||
uint8_t* quals = bam_get_qual(bw->b);
|
||||
for (int i = 0; i < ad.read_len; ++i) {
|
||||
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
|
||||
ad.base_quals[i] = quals[i];
|
||||
ad.base_quals[i] = quals[i + ad.left_clip];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -470,7 +471,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
|
|||
}
|
||||
|
||||
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||
void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
|
||||
void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
|
||||
int tid = bw->contig_id();
|
||||
uint64_t startPos = bw->start_pos(); // 闭区间
|
||||
uint64_t endPos = bw->end_pos(); // 闭区间
|
||||
|
|
@ -519,7 +520,7 @@ void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vect
|
|||
int64_t varStart = BamWrap::bam_global_pos(tid, pos);
|
||||
Interval varIntv(varStart, varStart + ref.size() - 1);
|
||||
if (readIntv.overlaps(varIntv)) {
|
||||
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
|
||||
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
|
||||
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
|
||||
}
|
||||
get_line_from_buf(buf, flen, &cur, &line);
|
||||
|
|
@ -619,7 +620,8 @@ void updateRecalTablesForRead(ReadRecalInfo &info) {
|
|||
|
||||
int readLength = read.read_len;
|
||||
for (int offset = 0; offset < readLength; ++offset) {
|
||||
if (!info.skips[offset]) { // 不跳过当前位置
|
||||
if (!info.skips[offset]) {
|
||||
//if (true){ // 不跳过当前位置
|
||||
for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) {
|
||||
// 获取四个值,readgroup / qualityscore / context / cycle
|
||||
EventTypeValue& event = nsgv::gEventTypes[idx];
|
||||
|
|
@ -693,6 +695,7 @@ void roundTableValues(RecalTables& rt) {
|
|||
|
||||
// 串行bqsr
|
||||
int SerialBQSR() {
|
||||
open_debug_files();
|
||||
int round = 0;
|
||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||
// inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
|
||||
|
|
@ -754,6 +757,7 @@ int SerialBQSR() {
|
|||
// spdlog::info("bam idx: {}", i);
|
||||
BamWrap* bw = bams[i];
|
||||
SamData ad;
|
||||
ad.rid = i;
|
||||
ad.bw = bw;
|
||||
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
|
||||
ad.cigar_end = bw->b->core.n_cigar;
|
||||
|
|
@ -788,6 +792,18 @@ int SerialBQSR() {
|
|||
vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是
|
||||
vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是
|
||||
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
|
||||
|
||||
/*fprintf(gf[0], "%d\t", ad.read_len);
|
||||
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
|
||||
fprintf(gf[0], "\n");
|
||||
fprintf(gf[1], "%d\t", ad.read_len);
|
||||
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
|
||||
fprintf(gf[1], "\n");
|
||||
fprintf(gf[2], "%d\t", ad.read_len);
|
||||
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
|
||||
fprintf(gf[2], "\n");
|
||||
*/
|
||||
|
||||
// spdlog::info("nErrors: {}", nErrors);
|
||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
|
||||
|
||||
|
|
@ -811,8 +827,19 @@ int SerialBQSR() {
|
|||
PROF_START(covariate);
|
||||
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
|
||||
PROF_END(gprof[GP_covariate], covariate);
|
||||
test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
||||
int end_pos = bw->contig_end_pos();
|
||||
|
||||
// fprintf(gf[3], "%ld %ld %d %ld\n", ad.rid, readCovariates.size(), ad.read_len, readCovariates[0][0].size());
|
||||
// for (auto &arr1 : readCovariates) {
|
||||
// for (size_t si = 0; si < ad.read_len; ++si) {
|
||||
// for (auto &val : arr1[si]) {
|
||||
// fprintf(gf[3], "%d ", val);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// fprintf(gf[3], "\n");
|
||||
|
||||
//test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
||||
//int end_pos = bw->contig_end_pos();
|
||||
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
||||
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
|
||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
|
||||
|
|
@ -825,6 +852,9 @@ int SerialBQSR() {
|
|||
ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||
}
|
||||
PROF_END(gprof[GP_read_vcf], known_sites);
|
||||
// fprintf(gf[0], "%d\t", ad.read_len);
|
||||
// for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
||||
// fprintf(gf[0], "\n");
|
||||
|
||||
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
||||
vector<double> snpErrors, insErrors, delErrors;
|
||||
|
|
@ -911,6 +941,8 @@ int SerialBQSR() {
|
|||
}
|
||||
spdlog::info("read count: {}", readNumSum);
|
||||
|
||||
close_debug_files();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -98,6 +98,51 @@ static char SimpleComplement(const char base) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
|
||||
// 处理左边
|
||||
int left = 0;
|
||||
int readLen = bases.size();
|
||||
if (isNegativeStrand) {
|
||||
for (; left < readLen; ++left) {
|
||||
if (quals[readLen - 1 - left] <= lowQTail)
|
||||
bases[left] = 'N';
|
||||
else
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
for (; left < readLen; ++left) {
|
||||
if (quals[left] <= lowQTail)
|
||||
bases[left] = 'N';
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (left == readLen) {
|
||||
bases.clear();
|
||||
return;
|
||||
}
|
||||
// 处理右边
|
||||
int right = (int)bases.size() - 1;
|
||||
if (isNegativeStrand) {
|
||||
for (; right >= 0; --right) {
|
||||
if (quals[readLen - 1 - right] <= lowQTail)
|
||||
bases[right] = 'N';
|
||||
else
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
for (; right >= 0; --right) {
|
||||
if (quals[right] <= lowQTail)
|
||||
bases[right] = 'N';
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (right < left)
|
||||
bases.clear();
|
||||
|
||||
}
|
||||
|
||||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
|
||||
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
|
||||
|
|
@ -106,28 +151,7 @@ void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string&
|
|||
for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
|
||||
}
|
||||
|
||||
// 处理左边
|
||||
int left = 0;
|
||||
for (; left < ad.read_len; ++left) {
|
||||
if (quals[left] <= lowQTail)
|
||||
clippedBases[left] = 'N';
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (left == ad.read_len) {
|
||||
clippedBases.clear();
|
||||
return;
|
||||
}
|
||||
// 处理右边
|
||||
int right = ad.read_len - 1;
|
||||
for (; right >= 0; --right) {
|
||||
if (quals[right] <= lowQTail)
|
||||
clippedBases[right] = 'N';
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (right < left)
|
||||
clippedBases.clear();
|
||||
ClipLowQualEndsWithN(clippedBases, ad.base_quals, lowQTail, bw->GetReadNegativeStrandFlag());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -224,7 +248,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header,
|
|||
|
||||
// store the original bases and then write Ns over low quality ones
|
||||
string strandedClippedBases(ad.bases);
|
||||
GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail);
|
||||
GetStrandedClippedBytes(bw, ad, strandedClippedBases, 30); // 注意这里的lowQualTail数值
|
||||
// spdlog::info("bases: {}", strandedClippedBases);
|
||||
vector<int> nBasePairContextAtEachCycle;
|
||||
GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
|
||||
|
|
|
|||
|
|
@ -225,6 +225,8 @@ struct QualityUtils {
|
|||
static uint8_t errorProbToQual(const double errorRate, const uint8_t maxQual) {
|
||||
// Utils.validateArg(MathUtils.isValidProbability(errorRate), ()->"errorRate must be good probability but got " + errorRate);
|
||||
const double d = std::round(-10.0 * std::log10(errorRate));
|
||||
if (d > maxQual) // 注意当d超过int表示的最大值时(如inf),转变成int会变成负数
|
||||
return maxQual;
|
||||
return boundQual((int)d, maxQual);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -113,13 +113,13 @@ struct RecalUtils {
|
|||
table.addColumn({"Observations", "%d"});
|
||||
table.addColumn({"Errors", "%.2f"});
|
||||
|
||||
spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches,
|
||||
r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality);
|
||||
//spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches,
|
||||
// r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality);
|
||||
|
||||
_Foreach2DK(r.readGroupTable, datum, {
|
||||
RecalDatum &dat = const_cast<RecalDatum&>(datum);
|
||||
spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches());
|
||||
spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations());
|
||||
// spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches());
|
||||
// spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations());
|
||||
if (dat.getNumObservations() > 0) {
|
||||
table.addRowData({
|
||||
ReadGroupCovariate::IdToRg[k1],
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ struct Cigar {
|
|||
};
|
||||
|
||||
struct ReadIdxCigar {
|
||||
int readIdx = 0; // 在read序列中的位置
|
||||
int readIdx = -1; // 在read序列中的位置
|
||||
char cigarOp = '0'; // 当前位置对应的cigar
|
||||
};
|
||||
|
||||
|
|
@ -57,6 +57,7 @@ struct FastArray {
|
|||
}
|
||||
}
|
||||
inline T& operator[](size_t pos) { return arr[pos]; }
|
||||
inline const T& operator[](size_t pos) const { return arr[pos]; }
|
||||
struct iterator {
|
||||
typename std::vector<T>::iterator it;
|
||||
iterator(typename std::vector<T>::iterator _it) : it(_it) {}
|
||||
|
|
@ -74,6 +75,7 @@ struct FastArray {
|
|||
// 对原始bam数据的补充,比如对两端进行hardclip等
|
||||
class BamWrap;
|
||||
struct SamData {
|
||||
int64_t rid = 0; // for debug
|
||||
int read_len = 0; // read长度,各种clip之后的长度
|
||||
int cigar_start = 0; // cigar起始位置,闭区间
|
||||
int cigar_end = 0; // cigar结束位置,开区间
|
||||
|
|
|
|||
|
|
@ -0,0 +1,31 @@
|
|||
/*********************************************************************************************
|
||||
Description: data and files for debugging
|
||||
Copyright : All right reserved by NCIC.ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2024/04/09
|
||||
***********************************************************************************************/
|
||||
#include "debug.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
FILE* gf[4] = {0};
|
||||
|
||||
int open_debug_files() {
|
||||
char fn[1024] = {0};
|
||||
int i = 0;
|
||||
for (; i < 4; ++i) {
|
||||
sprintf(fn, "./test/fp%d.txt", i);
|
||||
gf[i] = fopen(fn, "w");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int close_debug_files() {
|
||||
int i = 0;
|
||||
for (; i < 4; ++i) {
|
||||
if (gf[i] != 0)
|
||||
fclose(gf[i]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
/*********************************************************************************************
|
||||
Description: data and files for debugging
|
||||
Copyright : All right reserved by NCIC.ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2024/04/09
|
||||
***********************************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
////////////////// for debug and test //////////////////////////
|
||||
|
||||
#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件,并记录debug信息
|
||||
// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度
|
||||
// #define GET_FULL_MATCH_READ // 获取完全匹配的reads
|
||||
// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
|
||||
// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据
|
||||
// #define GET_KSW_ALIGN_SEQ
|
||||
// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
extern FILE *gf[4];
|
||||
|
||||
int open_debug_files();
|
||||
|
||||
int close_debug_files();
|
||||
1090
test/stats.table
1090
test/stats.table
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue