改了一些问题,小数据集结果没问题,但是对于5G的稍微大一些的数据集还是错误
This commit is contained in:
parent
1cd1a72760
commit
1864736509
|
|
@ -5,6 +5,8 @@
|
||||||
/build
|
/build
|
||||||
/test
|
/test
|
||||||
build.sh
|
build.sh
|
||||||
|
debug_build.sh
|
||||||
|
r1.sh
|
||||||
run.sh
|
run.sh
|
||||||
test.sh
|
test.sh
|
||||||
*.log
|
*.log
|
||||||
|
|
|
||||||
|
|
@ -221,7 +221,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar
|
||||||
}
|
}
|
||||||
|
|
||||||
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
||||||
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) {
|
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) {
|
||||||
int start, stop;
|
int start, stop;
|
||||||
// Determine the read coordinate to start and stop hard clipping
|
// Determine the read coordinate to start and stop hard clipping
|
||||||
if (refStart < 0) {
|
if (refStart < 0) {
|
||||||
|
|
@ -230,65 +230,65 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData
|
||||||
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
|
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
|
||||||
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
|
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
|
||||||
stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
|
stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
|
||||||
ad.left_clip = stop + 1;
|
sd.left_clip = stop + 1;
|
||||||
ad.cigar_start = stopPosAndOperator.cigarIndex;
|
sd.cigar_start = stopPosAndOperator.cigarIndex;
|
||||||
ad.first_cigar_clip = ad.left_clip - stopPosAndOperator.preCigarLen;
|
sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
|
||||||
} else {
|
} else {
|
||||||
if (refStop >= 0) return;
|
if (refStop >= 0) return;
|
||||||
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
|
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
|
||||||
// overclipping
|
// overclipping
|
||||||
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
|
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
|
||||||
start = startPosAndOperator.readPosAtRefCoord;
|
start = startPosAndOperator.readPosAtRefCoord;
|
||||||
ad.right_clip = bw->b->core.l_qseq - start;
|
sd.right_clip = bw->b->core.l_qseq - start;
|
||||||
ad.cigar_end = startPosAndOperator.cigarIndex + 1;
|
sd.cigar_end = startPosAndOperator.cigarIndex + 1;
|
||||||
ad.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
|
sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算切掉adapter之后,ref相对原始ref的偏移量
|
// 计算切掉adapter之后,ref相对原始ref的偏移量
|
||||||
void calculateRefOffset(BamWrap *bw, SamData &ad) {
|
void calculateRefOffset(BamWrap *bw, SamData &sd) {
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (i = 0; i < ad.cigar_start; ++i) {
|
for (i = 0; i < sd.cigar_start; ++i) {
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (consumeRefBases(c)) {
|
if (consumeRefBases(c)) {
|
||||||
ad.ref_offset += len;
|
sd.ref_offset += len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
if (consumeRefBases(c)) {
|
if (consumeRefBases(c)) {
|
||||||
ad.ref_offset += ad.first_cigar_clip;
|
sd.ref_offset += sd.first_cigar_clip;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算clip处理之后,剩余的碱基
|
// 计算clip处理之后,剩余的碱基
|
||||||
void calculateReadBases(BamWrap* bw, SamData& ad) {
|
void calculateReadBases(BamWrap* bw, SamData& sd) {
|
||||||
ad.bases.resize(ad.read_len);
|
sd.bases.resize(sd.read_len);
|
||||||
ad.base_quals.resize(ad.read_len);
|
sd.base_quals.resize(sd.read_len);
|
||||||
uint8_t* seq = bam_get_seq(bw->b);
|
uint8_t* seq = bam_get_seq(bw->b);
|
||||||
uint8_t* quals = bam_get_qual(bw->b);
|
uint8_t* quals = bam_get_qual(bw->b);
|
||||||
for (int i = 0; i < ad.read_len; ++i) {
|
for (int i = 0; i < sd.read_len; ++i) {
|
||||||
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
|
sd.bases[i] = cBaseToChar[bam_seqi(seq, i + sd.left_clip)];
|
||||||
ad.base_quals[i] = quals[i + ad.left_clip];
|
sd.base_quals[i] = quals[i + sd.left_clip];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算read两端clip之后的softstart和softend
|
// 计算read两端clip之后的softstart和softend
|
||||||
void calculateSoftStartEnd(BamWrap* bw, SamData& ad) {
|
void calculateSoftStartEnd(BamWrap* bw, SamData& sd) {
|
||||||
int64_t softStart = bw->b->core.pos + ad.ref_offset;
|
int64_t softStart = bw->b->core.pos + sd.ref_offset;
|
||||||
int64_t softEnd = softStart - 1; // 闭区间
|
int64_t softEnd = softStart - 1; // 闭区间
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int cigar_start = ad.cigar_start;
|
int cigar_start = sd.cigar_start;
|
||||||
int cigar_end = ad.cigar_end;
|
int cigar_end = sd.cigar_end;
|
||||||
bool rightTail = false;
|
bool rightTail = false;
|
||||||
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
|
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
||||||
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
|
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
||||||
// if (c == 'S' || c == 'I')
|
// if (c == 'S' || c == 'I')
|
||||||
if (c == 'S')
|
if (c == 'S')
|
||||||
softStart -= len;
|
softStart -= len;
|
||||||
|
|
@ -299,47 +299,53 @@ void calculateSoftStartEnd(BamWrap* bw, SamData& ad) {
|
||||||
softEnd += len;
|
softEnd += len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ad.softStart() = softStart;
|
sd.softStart() = softStart;
|
||||||
ad.softEnd() = softEnd;
|
sd.softEnd() = softEnd;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算clip之后的cigar
|
// 计算clip之后的cigar
|
||||||
void calculateCigar(BamWrap* bw, SamData& ad) {
|
void calculateCigar(BamWrap* bw, SamData& sd) {
|
||||||
ad.cigars.clear();
|
sd.cigars.clear();
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int cigar_start = ad.cigar_start;
|
int cigar_start = sd.cigar_start;
|
||||||
int cigar_end = ad.cigar_end;
|
int cigar_end = sd.cigar_end;
|
||||||
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
char c = bam_cigar_opchr(cigar[i]);
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (i == ad.cigar_start)
|
if (i == sd.cigar_start)
|
||||||
len -= ad.first_cigar_clip;
|
len -= sd.first_cigar_clip;
|
||||||
if (i == ad.cigar_end - 1)
|
if (i == sd.cigar_end - 1)
|
||||||
len -= ad.last_cigar_clip;
|
len -= sd.last_cigar_clip;
|
||||||
ad.cigars.push_back({c, len});
|
//if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion
|
||||||
|
if (i == sd.cigar_start && c == 'D') { // 跳过开头的deletion
|
||||||
|
c = 'H';
|
||||||
|
// sd.ref_offset += len;
|
||||||
|
sd.start_pos += len;
|
||||||
|
}
|
||||||
|
sd.cigars.push_back({c, len});
|
||||||
}
|
}
|
||||||
//for(auto &cigar : ad.cigars) {
|
//for(auto &cigar : sd.cigars) {
|
||||||
// spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
|
// spdlog::info("op: {}, len: {}", cigar.op, cigar.len);
|
||||||
//}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算read两端softclip的碱基数量,可能会修改ad里的clip值
|
// 计算read两端softclip的碱基数量,可能会修改ad里的clip值
|
||||||
void calculateSoftClip(BamWrap *bw, SamData &ad) {
|
void calculateSoftClip(BamWrap *bw, SamData &sd) {
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int readIndex = ad.left_clip;
|
int readIndex = sd.left_clip;
|
||||||
int cutLeft = -1; // first position to hard clip (inclusive)
|
int cutLeft = -1; // first position to hard clip (inclusive)
|
||||||
int cutRight = -1; // first position to hard clip (inclusive)
|
int cutRight = -1; // first position to hard clip (inclusive)
|
||||||
int cigar_start = ad.cigar_start;
|
int cigar_start = sd.cigar_start;
|
||||||
int cigar_end = ad.cigar_end;
|
int cigar_end = sd.cigar_end;
|
||||||
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
|
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
|
||||||
|
|
||||||
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
|
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
||||||
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
|
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
||||||
if (c == 'S') {
|
if (c == 'S') {
|
||||||
if (rightTail) {
|
if (rightTail) {
|
||||||
cutRight = readIndex;
|
cutRight = readIndex;
|
||||||
|
|
@ -356,14 +362,14 @@ void calculateSoftClip(BamWrap *bw, SamData &ad) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (cutRight >= 0) {
|
if (cutRight >= 0) {
|
||||||
ad.right_clip = bw->b->core.l_qseq - cutRight;
|
sd.right_clip = bw->b->core.l_qseq - cutRight;
|
||||||
ad.cigar_end = cigar_end;
|
sd.cigar_end = cigar_end;
|
||||||
ad.last_cigar_clip = 0;
|
sd.last_cigar_clip = 0;
|
||||||
}
|
}
|
||||||
if (cutLeft >= 0) {
|
if (cutLeft >= 0) {
|
||||||
ad.left_clip = cutLeft + 1;
|
sd.left_clip = cutLeft + 1;
|
||||||
ad.cigar_start = cigar_start;
|
sd.cigar_start = cigar_start;
|
||||||
ad.first_cigar_clip = 0;
|
sd.first_cigar_clip = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -387,9 +393,10 @@ inline void updateIndel(vector<int> &isIndel, int index) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算该read的每个碱基位置是否是SNP或Indel
|
// 计算该read的每个碱基位置是否是SNP或Indel
|
||||||
int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
|
int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
|
||||||
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
||||||
Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
|
//Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间
|
||||||
|
Interval interval{sd.start_pos, sd.end_pos}; // 闭区间
|
||||||
PROF_START(ref);
|
PROF_START(ref);
|
||||||
read_ref_base(aux, interval.left, interval);
|
read_ref_base(aux, interval.left, interval);
|
||||||
PROF_END(gprof[GP_read_ref], ref);
|
PROF_END(gprof[GP_read_ref], ref);
|
||||||
|
|
@ -398,18 +405,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &is
|
||||||
|
|
||||||
// 2. 遍历cigar,计算每个碱基是否是SNP或Indel
|
// 2. 遍历cigar,计算每个碱基是否是SNP或Indel
|
||||||
int readPos = 0, refPos = 0, nEvents = 0;
|
int readPos = 0, refPos = 0, nEvents = 0;
|
||||||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
for (int i = 0; i<sd.cigars.size(); ++i) {
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const char c = sd.cigars[i].op;
|
||||||
uint8_t* seq = bam_get_seq(bw->b);
|
int len = sd.cigars[i].len;
|
||||||
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
|
||||||
int len = bam_cigar_oplen(cigar[i]);
|
|
||||||
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
|
|
||||||
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
|
|
||||||
if (c == 'M' || c == '=' || c == 'X') {
|
if (c == 'M' || c == '=' || c == 'X') {
|
||||||
for (int j = 0; j < len; ++j) {
|
for (int j = 0; j < len; ++j) {
|
||||||
// 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
// 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
||||||
int snpInt = cBaseToChar[bam_seqi(seq, readPos + ad.left_clip)] == refBases[refPos] ? 0 : 1;
|
int snpInt = sd.bases[readPos] == refBases[refPos] ? 0 : 1;
|
||||||
// if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
|
// if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
|
||||||
isSNP[readPos] = snpInt;
|
isSNP[readPos] = snpInt;
|
||||||
nEvents += snpInt;
|
nEvents += snpInt;
|
||||||
|
|
@ -418,14 +420,14 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &is
|
||||||
}
|
}
|
||||||
} else if (c == 'D') {
|
} else if (c == 'D') {
|
||||||
// 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del
|
// 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del
|
||||||
int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
|
int index = sd.bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
|
||||||
updateIndel(isDel, index);
|
updateIndel(isDel, index);
|
||||||
refPos += len;
|
refPos += len;
|
||||||
} else if (c == 'N') {
|
} else if (c == 'N') {
|
||||||
refPos += len;
|
refPos += len;
|
||||||
} else if (c == 'I') {
|
} else if (c == 'I') {
|
||||||
// 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins
|
// 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins
|
||||||
bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
|
bool forwardStrandRead = !sd.bw->GetReadNegativeStrandFlag();
|
||||||
if (forwardStrandRead) {
|
if (forwardStrandRead) {
|
||||||
updateIndel(isIns, readPos - 1);
|
updateIndel(isIns, readPos - 1);
|
||||||
}
|
}
|
||||||
|
|
@ -437,6 +439,47 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &is
|
||||||
readPos += len;
|
readPos += len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
// const bam1_core_t& bc = bw->b->core;
|
||||||
|
// uint8_t* seq = bam_get_seq(bw->b);
|
||||||
|
// for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
||||||
|
// const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
// int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
// if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
||||||
|
// if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
||||||
|
// if (c == 'M' || c == '=' || c == 'X') {
|
||||||
|
// for (int j = 0; j < len; ++j) {
|
||||||
|
// // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
||||||
|
// int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1;
|
||||||
|
// // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); }
|
||||||
|
// isSNP[readPos] = snpInt;
|
||||||
|
// nEvents += snpInt;
|
||||||
|
// readPos++;
|
||||||
|
// refPos++;
|
||||||
|
// }
|
||||||
|
// } else if (c == 'D') {
|
||||||
|
// // 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del
|
||||||
|
// int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
|
||||||
|
// updateIndel(isDel, index);
|
||||||
|
// refPos += len;
|
||||||
|
// } else if (c == 'N') {
|
||||||
|
// refPos += len;
|
||||||
|
// } else if (c == 'I') {
|
||||||
|
// // 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins
|
||||||
|
// bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
|
||||||
|
// if (forwardStrandRead) {
|
||||||
|
// updateIndel(isIns, readPos - 1);
|
||||||
|
// }
|
||||||
|
// readPos += len;
|
||||||
|
// if (!forwardStrandRead) {
|
||||||
|
// updateIndel(isIns, readPos);
|
||||||
|
// }
|
||||||
|
// } else if (c == 'S') {
|
||||||
|
// readPos += len;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
|
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
|
||||||
// spdlog::info("nEvents: {}", nEvents);
|
// spdlog::info("nEvents: {}", nEvents);
|
||||||
|
|
||||||
|
|
@ -448,14 +491,14 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector<int> &is
|
||||||
}
|
}
|
||||||
|
|
||||||
// 简单计算baq数组,就是全部赋值为'@' (64)
|
// 简单计算baq数组,就是全部赋值为'@' (64)
|
||||||
bool flatBAQArray(BamWrap* bw, SamData& ad, vector<uint8_t>& baqArray) {
|
bool flatBAQArray(SamData& sd, vector<uint8_t>& baqArray) {
|
||||||
baqArray.resize(ad.read_len, (uint8_t)'@');
|
baqArray.resize(sd.read_len, (uint8_t)'@');
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了
|
// 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了
|
||||||
bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector<uint8_t>& baqArray) {
|
bool calculateBAQArray(AuxVar& aux, BAQ& baq, SamData& sd, vector<uint8_t>& baqArray) {
|
||||||
baqArray.resize(ad.read_len, 0);
|
baqArray.resize(sd.read_len, 0);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -471,7 +514,8 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||||
void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
|
void calculateKnownSites(SamData& sd, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
|
||||||
|
BamWrap* bw = sd.bw;
|
||||||
int tid = bw->contig_id();
|
int tid = bw->contig_id();
|
||||||
uint64_t startPos = bw->start_pos(); // 闭区间
|
uint64_t startPos = bw->start_pos(); // 闭区间
|
||||||
uint64_t endPos = bw->end_pos(); // 闭区间
|
uint64_t endPos = bw->end_pos(); // 闭区间
|
||||||
|
|
@ -532,28 +576,28 @@ void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vect
|
||||||
//}
|
//}
|
||||||
}
|
}
|
||||||
//exit(0);
|
//exit(0);
|
||||||
knownSites.resize(ad.read_len);
|
knownSites.resize(sd.read_len);
|
||||||
endPos = bw->end_pos();
|
endPos = bw->end_pos();
|
||||||
|
|
||||||
for(auto &vcf : vcfs) {
|
for(auto &vcf : vcfs) {
|
||||||
for (auto &intv : vcf.knownSites) {
|
for (auto &intv : vcf.knownSites) {
|
||||||
// knownSite is outside clipping window for the read, ignore
|
// knownSite is outside clipping window for the read, ignore
|
||||||
if (intv.right < ad.softStart())
|
if (intv.right < sd.softStart())
|
||||||
continue;
|
continue;
|
||||||
if (intv.left > ad.softEnd())
|
if (intv.left > sd.softEnd())
|
||||||
break;
|
break;
|
||||||
// 计算对应的位点
|
// 计算对应的位点
|
||||||
ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left);
|
ReadIdxCigar rcStart = sd.getReadIndexForReferenceCoordinate(intv.left);
|
||||||
int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
|
int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
|
||||||
if (rcStart.cigarOp == 'D') {
|
if (rcStart.cigarOp == 'D') {
|
||||||
--featureStartOnRead;
|
--featureStartOnRead;
|
||||||
}
|
}
|
||||||
ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right);
|
ReadIdxCigar rcEnd = sd.getReadIndexForReferenceCoordinate(intv.right);
|
||||||
int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx;
|
int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? sd.read_len : rcEnd.readIdx;
|
||||||
if (featureStartOnRead > ad.read_len) {
|
if (featureStartOnRead > sd.read_len) {
|
||||||
featureStartOnRead = featureEndOnRead = ad.read_len;
|
featureStartOnRead = featureEndOnRead = sd.read_len;
|
||||||
}
|
}
|
||||||
for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) {
|
for (int i = max(0, featureStartOnRead); i < min(sd.read_len, featureEndOnRead + 1); ++i) {
|
||||||
knownSites[i] = true;
|
knownSites[i] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -620,6 +664,9 @@ void updateRecalTablesForRead(ReadRecalInfo &info) {
|
||||||
|
|
||||||
int readLength = read.read_len;
|
int readLength = read.read_len;
|
||||||
for (int offset = 0; offset < readLength; ++offset) {
|
for (int offset = 0; offset < readLength; ++offset) {
|
||||||
|
//if (read.rid == 46114) {
|
||||||
|
// fprintf(gf[3], "%d %d\n", offset, info.skips[offset] ? 1 : 0);
|
||||||
|
//}
|
||||||
if (!info.skips[offset]) {
|
if (!info.skips[offset]) {
|
||||||
//if (true){ // 不跳过当前位置
|
//if (true){ // 不跳过当前位置
|
||||||
for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) {
|
for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) {
|
||||||
|
|
@ -634,6 +681,9 @@ void updateRecalTablesForRead(ReadRecalInfo &info) {
|
||||||
|
|
||||||
// 处理base quality score协变量
|
// 处理base quality score协变量
|
||||||
// RecalUtils::IncrementDatum3keys(qualityScoreTable, qual, isError, readGroup, baseQuality, event.index);
|
// RecalUtils::IncrementDatum3keys(qualityScoreTable, qual, isError, readGroup, baseQuality, event.index);
|
||||||
|
if (read.rid == 46114) {
|
||||||
|
// fprintf(gf[3], "%d %d %f\n", offset, baseQuality, isError);
|
||||||
|
}
|
||||||
qualityScoreTable[readGroup][baseQuality][event.index].increment(1, isError, baseQuality);
|
qualityScoreTable[readGroup][baseQuality][event.index].increment(1, isError, baseQuality);
|
||||||
|
|
||||||
auto &d = qualityScoreTable[readGroup][baseQuality][event.index];
|
auto &d = qualityScoreTable[readGroup][baseQuality][event.index];
|
||||||
|
|
@ -653,6 +703,29 @@ void updateRecalTablesForRead(ReadRecalInfo &info) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
||||||
|
// for (auto& arr1 : qualityScoreTable.data) {
|
||||||
|
// for (size_t si = 0; si < arr1.size(); ++si) {
|
||||||
|
// for (auto &val : arr1[si]) {
|
||||||
|
// if (val.numObservations > 0)
|
||||||
|
// fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// fprintf(gf[3], "\n");
|
||||||
|
|
||||||
|
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
||||||
|
// for (auto& arr1 : contextTable.data) {
|
||||||
|
// for (size_t si = 0; si < arr1.size(); ++si) {
|
||||||
|
// for (auto &arr2 : arr1[si]) {
|
||||||
|
// for (auto& val : arr2) {
|
||||||
|
// if (val.numObservations > 0)
|
||||||
|
// fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// fprintf(gf[3], "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// 数据总结
|
// 数据总结
|
||||||
|
|
@ -756,51 +829,51 @@ int SerialBQSR() {
|
||||||
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
||||||
// spdlog::info("bam idx: {}", i);
|
// spdlog::info("bam idx: {}", i);
|
||||||
BamWrap* bw = bams[i];
|
BamWrap* bw = bams[i];
|
||||||
SamData ad;
|
SamData sd;
|
||||||
ad.rid = i;
|
sd.rid = i + readNumSum;
|
||||||
ad.bw = bw;
|
sd.bw = bw;
|
||||||
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
|
sd.read_len = BamWrap::BamEffectiveLength(bw->b);
|
||||||
ad.cigar_end = bw->b->core.n_cigar;
|
sd.cigar_end = bw->b->core.n_cigar;
|
||||||
if (ad.read_len <= 0) continue;
|
if (sd.read_len <= 0) continue;
|
||||||
|
|
||||||
int adapter_boundary = bw->GetAdapterBoundary();
|
int adapter_boundary = bw->GetAdapterBoundary();
|
||||||
if (bw->IsAdapterInRead(adapter_boundary)) {
|
if (bw->IsAdapterInRead(adapter_boundary)) {
|
||||||
// adapter在read范围内
|
// adapter在read范围内
|
||||||
if (bw->GetReadNegativeStrandFlag()) { // 反链
|
if (bw->GetReadNegativeStrandFlag()) { // 反链
|
||||||
clipByReferenceCoordinates(bw, -1, adapter_boundary, ad);
|
clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
|
||||||
} else { // 正链
|
} else { // 正链
|
||||||
clipByReferenceCoordinates(bw, adapter_boundary, -1, ad);
|
clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
|
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
||||||
// 5. 然后再去除softclip部分
|
// 5. 然后再去除softclip部分
|
||||||
calculateSoftClip(bw, ad);
|
calculateSoftClip(bw, sd);
|
||||||
ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
|
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
||||||
if (ad.read_len <= 0) continue;
|
if (sd.read_len <= 0) continue;
|
||||||
|
|
||||||
calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少
|
calculateRefOffset(bw, sd); // 计算ref_offset,就是相对比对的position,要将ref右移多少
|
||||||
calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基
|
calculateReadBases(bw, sd); // 计算clip处理之后,剩余的碱基
|
||||||
// 计算clip之后,两端的softstart和softend
|
// 计算clip之后,两端的softstart和softend
|
||||||
calculateSoftStartEnd(bw, ad);
|
calculateSoftStartEnd(bw, sd);
|
||||||
calculateCigar(bw, ad);
|
calculateCigar(bw, sd);
|
||||||
|
|
||||||
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
|
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
|
||||||
// ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
|
// sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str());
|
||||||
|
|
||||||
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
||||||
vector<int> isSNP(ad.read_len, 0); // 该位置是否是SNP位置,0不是,1是
|
vector<int> isSNP(sd.read_len, 0); // 该位置是否是SNP位置,0不是,1是
|
||||||
vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是
|
vector<int> isIns(sd.read_len, 0); // 该位置是否是插入位置,0不是,1是
|
||||||
vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是
|
vector<int> isDel(sd.read_len, 0); // 该位置是否是删除位置,0不是,1是
|
||||||
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
|
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel);
|
||||||
|
|
||||||
/*fprintf(gf[0], "%d\t", ad.read_len);
|
/*fprintf(gf[0], "%d\t", sd.read_len);
|
||||||
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
|
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
|
||||||
fprintf(gf[0], "\n");
|
fprintf(gf[0], "\n");
|
||||||
fprintf(gf[1], "%d\t", ad.read_len);
|
fprintf(gf[1], "%d\t", sd.read_len);
|
||||||
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
|
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
|
||||||
fprintf(gf[1], "\n");
|
fprintf(gf[1], "\n");
|
||||||
fprintf(gf[2], "%d\t", ad.read_len);
|
fprintf(gf[2], "%d\t", sd.read_len);
|
||||||
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
|
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
|
||||||
fprintf(gf[2], "\n");
|
fprintf(gf[2], "\n");
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
@ -816,21 +889,21 @@ int SerialBQSR() {
|
||||||
vector<uint8_t> baqArray;
|
vector<uint8_t> baqArray;
|
||||||
bool baqCalculated = false;
|
bool baqCalculated = false;
|
||||||
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
||||||
baqCalculated = flatBAQArray(bw, ad, baqArray);
|
baqCalculated = flatBAQArray(sd, baqArray);
|
||||||
} else {
|
} else {
|
||||||
// baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, bw, ad, baqArray);
|
// baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, sd, baqArray);
|
||||||
}
|
}
|
||||||
if (!baqCalculated) continue;
|
if (!baqCalculated) continue;
|
||||||
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
||||||
|
|
||||||
// 8. 计算这条read对应的协变量
|
// 8. 计算这条read对应的协变量
|
||||||
PROF_START(covariate);
|
PROF_START(covariate);
|
||||||
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
|
CovariateUtils::ComputeCovariates(sd, nsgv::gInBamHeader, readCovariates, true);
|
||||||
PROF_END(gprof[GP_covariate], covariate);
|
PROF_END(gprof[GP_covariate], covariate);
|
||||||
|
|
||||||
// fprintf(gf[3], "%ld %ld %d %ld\n", ad.rid, readCovariates.size(), ad.read_len, readCovariates[0][0].size());
|
// fprintf(gf[3], "%ld %ld %d %ld\n", sd.rid, readCovariates.size(), sd.read_len, readCovariates[0][0].size());
|
||||||
// for (auto &arr1 : readCovariates) {
|
// for (auto &arr1 : readCovariates) {
|
||||||
// for (size_t si = 0; si < ad.read_len; ++si) {
|
// for (size_t si = 0; si < sd.read_len; ++si) {
|
||||||
// for (auto &val : arr1[si]) {
|
// for (auto &val : arr1[si]) {
|
||||||
// fprintf(gf[3], "%d ", val);
|
// fprintf(gf[3], "%d ", val);
|
||||||
// }
|
// }
|
||||||
|
|
@ -838,22 +911,33 @@ int SerialBQSR() {
|
||||||
// }
|
// }
|
||||||
// fprintf(gf[3], "\n");
|
// fprintf(gf[3], "\n");
|
||||||
|
|
||||||
|
// fprintf(gf[3], "%ld %d\n", sd.rid, sd.read_len);
|
||||||
|
// {
|
||||||
|
// auto& arr1 = readCovariates[0];
|
||||||
|
// {
|
||||||
|
// for (int pos = 0; pos < sd.read_len; ++pos) {
|
||||||
|
// fprintf(gf[3], "%d %d\n", pos, arr1[pos][2]);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// fprintf(gf[3], "\n");
|
||||||
|
|
||||||
//test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
//test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
||||||
//int end_pos = bw->contig_end_pos();
|
//int end_pos = bw->contig_end_pos();
|
||||||
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
||||||
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
|
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
|
||||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
|
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
|
||||||
// 9. 计算这条read需要跳过的位置
|
// 9. 计算这条read需要跳过的位置
|
||||||
vector<bool> skips(ad.read_len, 0);
|
vector<bool> skips(sd.read_len, 0);
|
||||||
PROF_START(known_sites);
|
PROF_START(known_sites);
|
||||||
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skips);
|
calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips);
|
||||||
for (int ii = 0; ii < ad.read_len; ++ii) {
|
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||||
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) ||
|
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
|
||||||
ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
}
|
}
|
||||||
PROF_END(gprof[GP_read_vcf], known_sites);
|
PROF_END(gprof[GP_read_vcf], known_sites);
|
||||||
// fprintf(gf[0], "%d\t", ad.read_len);
|
// fprintf(gf[0], "%d\t", sd.read_len);
|
||||||
// for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
||||||
// fprintf(gf[0], "\n");
|
// fprintf(gf[0], "\n");
|
||||||
|
|
||||||
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
||||||
|
|
@ -864,12 +948,12 @@ int SerialBQSR() {
|
||||||
calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
||||||
|
|
||||||
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
|
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
|
||||||
//spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), ad.read_len);
|
//spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len);
|
||||||
//for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); }
|
//for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); }
|
||||||
|
|
||||||
// aggregate all of the info into our info object, and update the data
|
// aggregate all of the info into our info object, and update the data
|
||||||
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
||||||
ReadRecalInfo info(ad, readCovariates, skips, snpErrors, insErrors, delErrors);
|
ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
|
||||||
int m = 0;
|
int m = 0;
|
||||||
// for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); }
|
// for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); }
|
||||||
//exit(0);
|
//exit(0);
|
||||||
|
|
|
||||||
|
|
@ -23,17 +23,17 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE;
|
||||||
|
|
||||||
// for CovariateUtils
|
// for CovariateUtils
|
||||||
// 对一条read计算协变量(该协变量被上一个read用过)
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
||||||
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
void CovariateUtils::ComputeCovariates(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||||||
bool recordIndelValues) {
|
bool recordIndelValues) {
|
||||||
ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
ReadGroupCovariate::RecordValues(sd, header, values, recordIndelValues);
|
||||||
BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
BaseQualityCovariate::RecordValues(sd, header, values, recordIndelValues);
|
||||||
ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
ContextCovariate::RecordValues(sd, header, values, recordIndelValues);
|
||||||
CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
CycleCovariate::RecordValues(sd, header, values, recordIndelValues);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReadGroupCovariate 协变量的方法
|
// ReadGroupCovariate 协变量的方法
|
||||||
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void ReadGroupCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
uint8_t *rgStr = bam_aux_get(bw->b, "RG");
|
uint8_t *rgStr = bam_aux_get(sd.bw->b, "RG");
|
||||||
char* rgVal = nullptr;
|
char* rgVal = nullptr;
|
||||||
if (rgStr) rgVal = bam_aux2Z(rgStr);
|
if (rgStr) rgVal = bam_aux2Z(rgStr);
|
||||||
int key = 0;
|
int key = 0;
|
||||||
|
|
@ -42,34 +42,35 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* heade
|
||||||
} else {
|
} else {
|
||||||
key = RgToId[rgVal];
|
key = RgToId[rgVal];
|
||||||
}
|
}
|
||||||
for (int i = 0; i < ad.read_len; ++i) {
|
for (int i = 0; i < sd.read_len; ++i) {
|
||||||
CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
|
CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// BaseQualityCovariate 协变量的方法
|
// BaseQualityCovariate 协变量的方法
|
||||||
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
void BaseQualityCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||||||
bool recordIndelValues) {
|
bool recordIndelValues) {
|
||||||
// 在前面的处理过后,quals应该和base长度一致了
|
// 在前面的处理过后,quals应该和base长度一致了
|
||||||
#define __bq_set_cov(ins, del) \
|
#define __bq_set_cov(ins, del) \
|
||||||
do { \
|
do { \
|
||||||
for (int i = 0; i < ad.read_len; ++i) { \
|
for (int i = 0; i < sd.read_len; ++i) { \
|
||||||
CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \
|
CovariateUtils::SetCovariate(quals[i], (ins), (del), i, BaseQualityCovariate::index, values); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
const int INDEL_QUAL = 45;
|
const int INDEL_QUAL = 45;
|
||||||
uint8_t* quals = bam_get_qual(bw->b);
|
// uint8_t* quals = bam_get_qual(bw->b);
|
||||||
|
auto &quals = sd.base_quals;
|
||||||
if (recordIndelValues) {
|
if (recordIndelValues) {
|
||||||
uint8_t* insQualPtr = bam_aux_get(bw->b, "BI"); // base qualities for insertions
|
uint8_t* insQualPtr = bam_aux_get(sd.bw->b, "BI"); // base qualities for insertions
|
||||||
uint8_t* delQualPtr = bam_aux_get(bw->b, "BD"); // base qualities for deletions
|
uint8_t* delQualPtr = bam_aux_get(sd.bw->b, "BD"); // base qualities for deletions
|
||||||
if (insQualPtr == nullptr && delQualPtr == nullptr) {
|
if (insQualPtr == nullptr && delQualPtr == nullptr) {
|
||||||
__bq_set_cov(INDEL_QUAL, INDEL_QUAL);
|
__bq_set_cov(INDEL_QUAL, INDEL_QUAL);
|
||||||
} else if (insQualPtr == nullptr) {
|
} else if (insQualPtr == nullptr) {
|
||||||
uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr);
|
uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr) + sd.left_clip;
|
||||||
__bq_set_cov(INDEL_QUAL, delQuals[i]);
|
__bq_set_cov(INDEL_QUAL, delQuals[i]);
|
||||||
} else {
|
} else {
|
||||||
uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr);
|
uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr) + sd.left_clip;
|
||||||
__bq_set_cov(insQuals[i], INDEL_QUAL);
|
__bq_set_cov(insQuals[i], INDEL_QUAL);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -144,14 +145,11 @@ static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals,
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
|
void ContextCovariate::GetStrandedClippedBytes(SamData& sd, string& clippedBases, uint8_t lowQTail) {
|
||||||
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
|
if (sd.bw->GetReadNegativeStrandFlag()) { // 反向互补
|
||||||
|
for (int i = 0; i < sd.read_len; ++i) clippedBases[i] = SimpleComplement(sd.bases[sd.read_len - 1 - i]);
|
||||||
if (bw->GetReadNegativeStrandFlag()) { // 反向互补
|
|
||||||
for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
|
|
||||||
}
|
}
|
||||||
|
ClipLowQualEndsWithN(clippedBases, sd.base_quals, lowQTail, sd.bw->GetReadNegativeStrandFlag());
|
||||||
ClipLowQualEndsWithN(clippedBases, ad.base_quals, lowQTail, bw->GetReadNegativeStrandFlag());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -243,12 +241,13 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
const int originalReadLength = ad.read_len;
|
const int originalReadLength = sd.read_len;
|
||||||
|
|
||||||
// store the original bases and then write Ns over low quality ones
|
// store the original bases and then write Ns over low quality ones
|
||||||
string strandedClippedBases(ad.bases);
|
string strandedClippedBases(sd.bases);
|
||||||
GetStrandedClippedBytes(bw, ad, strandedClippedBases, 30); // 注意这里的lowQualTail数值
|
// GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值
|
||||||
|
GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的?
|
||||||
// spdlog::info("bases: {}", strandedClippedBases);
|
// spdlog::info("bases: {}", strandedClippedBases);
|
||||||
vector<int> nBasePairContextAtEachCycle;
|
vector<int> nBasePairContextAtEachCycle;
|
||||||
GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
|
GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
|
||||||
|
|
@ -266,7 +265,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool negativeStrand = bw->GetReadNegativeStrandFlag();
|
const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag();
|
||||||
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration
|
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration
|
||||||
if (recordIndelValues) {
|
if (recordIndelValues) {
|
||||||
vector<int> indelKeys;
|
vector<int> indelKeys;
|
||||||
|
|
@ -295,10 +294,10 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header,
|
||||||
* @param maxCycle max value of the base to compute the key for
|
* @param maxCycle max value of the base to compute the key for
|
||||||
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
|
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
|
||||||
*/
|
*/
|
||||||
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
|
int CycleCovariate::CycleKey(SamData& sd, const int baseNumber, const bool indel, const int maxCycle) {
|
||||||
const bool isNegStrand = bw->GetReadNegativeStrandFlag();
|
const bool isNegStrand = sd.bw->GetReadNegativeStrandFlag();
|
||||||
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
|
const bool isSecondInPair = (sd.bw->b->core.flag & BAM_FPAIRED) && (sd.bw->b->core.flag & BAM_FREAD2);
|
||||||
const int readLength = ad.read_len;
|
const int readLength = sd.read_len;
|
||||||
|
|
||||||
const int readOrderFactor = isSecondInPair ? -1 : 1;
|
const int readOrderFactor = isSecondInPair ? -1 : 1;
|
||||||
int increment;
|
int increment;
|
||||||
|
|
@ -324,18 +323,18 @@ int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, con
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to pick out the covariate's value from attributes of the read
|
// Used to pick out the covariate's value from attributes of the read
|
||||||
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
void CycleCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
const int readLength = ad.read_len;
|
const int readLength = sd.read_len;
|
||||||
// Note: duplicate the loop to void checking recordIndelValues on every iteration
|
// Note: duplicate the loop to void checking recordIndelValues on every iteration
|
||||||
if (recordIndelValues) {
|
if (recordIndelValues) {
|
||||||
for (int i = 0; i < readLength; i++) {
|
for (int i = 0; i < readLength; i++) {
|
||||||
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
|
const int substitutionKey = CycleKey(sd, i, false, MAXIMUM_CYCLE_VALUE);
|
||||||
const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE);
|
const int indelKey = CycleKey(sd, i, true, MAXIMUM_CYCLE_VALUE);
|
||||||
CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
|
CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < readLength; i++) {
|
for (int i = 0; i < readLength; i++) {
|
||||||
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
|
const int substitutionKey = CycleKey(sd, i, false, MAXIMUM_CYCLE_VALUE);
|
||||||
CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
|
CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ struct CovariateUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 对一条read计算协变量(该协变量被上一个read用过)
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
||||||
static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Read group协变量
|
// Read group协变量
|
||||||
|
|
@ -80,13 +80,13 @@ struct ReadGroupCovariate {
|
||||||
static map<string, int> RgToId; // read group name到id的映射
|
static map<string, int> RgToId; // read group name到id的映射
|
||||||
static map<int, string> IdToRg; // id到read group name的映射
|
static map<int, string> IdToRg; // id到read group name的映射
|
||||||
|
|
||||||
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Base quality协变量
|
// Base quality协变量
|
||||||
struct BaseQualityCovariate {
|
struct BaseQualityCovariate {
|
||||||
static constexpr int index = 1; // 在协变量数组中的索引位置
|
static constexpr int index = 1; // 在协变量数组中的索引位置
|
||||||
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Context协变量
|
// Context协变量
|
||||||
|
|
@ -210,14 +210,14 @@ struct ContextCovariate {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail);
|
static void GetStrandedClippedBytes(SamData& ad, string& clippedBases, uint8_t lowQTail);
|
||||||
// Creates a int representation of a given dna string.
|
// Creates a int representation of a given dna string.
|
||||||
static int KeyFromContext(const string& dna, const int start, const int end);
|
static int KeyFromContext(const string& dna, const int start, const int end);
|
||||||
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||||
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
|
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
|
||||||
|
|
||||||
// 设置协变量的值
|
// 设置协变量的值
|
||||||
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Cycle协变量
|
// Cycle协变量
|
||||||
|
|
@ -264,9 +264,9 @@ struct CycleCovariate {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
||||||
static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
||||||
|
|
||||||
static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
};
|
};
|
||||||
|
|
||||||
// 好像不需要
|
// 好像不需要
|
||||||
|
|
|
||||||
|
|
@ -81,7 +81,7 @@ struct SamData {
|
||||||
int cigar_end = 0; // cigar结束位置,开区间
|
int cigar_end = 0; // cigar结束位置,开区间
|
||||||
int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧
|
int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧
|
||||||
int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧
|
int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧
|
||||||
int left_clip = 0; // 左侧被切掉的碱基长度
|
int left_clip = 0; // 左侧被切掉的碱基长度,BI和BD质量分数也会用到
|
||||||
int right_clip = 0; // 右侧被切掉的碱基长度
|
int right_clip = 0; // 右侧被切掉的碱基长度
|
||||||
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量
|
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue