解决了两个问题:1. clip 导致两端的 cigar 可能成为 D,需要处理;2. start_pos 要加上 contig,否则会导致 knownSites 计算错误。大数据还是有点问题,得继续调试。
This commit is contained in:
parent
d56d926b6e
commit
f915461205
|
|
@ -13,6 +13,11 @@ typedef struct {
|
||||||
long i;
|
long i;
|
||||||
} ktf_worker_t;
|
} ktf_worker_t;
|
||||||
|
|
||||||
|
/* Per-thread bookkeeping for the work-stealing loop (kt_for_steal). */
typedef struct {
    struct kt_steal_for_t *t; /* shared loop descriptor this worker belongs to */
    long i;                   /* next iteration index this worker will claim */
} ktf_steal_worker_t;
|
||||||
|
|
||||||
typedef struct kt_for_t {
|
typedef struct kt_for_t {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
long n;
|
long n;
|
||||||
|
|
@ -21,6 +26,14 @@ typedef struct kt_for_t {
|
||||||
void *data;
|
void *data;
|
||||||
} kt_for_t;
|
} kt_for_t;
|
||||||
|
|
||||||
|
typedef struct kt_steal_for_t {
|
||||||
|
int n_threads;
|
||||||
|
long n;
|
||||||
|
ktf_steal_worker_t* w;
|
||||||
|
void (*func)(void*, long, int, int);
|
||||||
|
void* data;
|
||||||
|
} kt_steal_for_t;
|
||||||
|
|
||||||
static inline long steal_work(kt_for_t *t)
|
static inline long steal_work(kt_for_t *t)
|
||||||
{
|
{
|
||||||
int i, min_i = -1;
|
int i, min_i = -1;
|
||||||
|
|
@ -64,6 +77,46 @@ void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Claim one iteration on behalf of the worker whose counter is currently
 * the smallest.
 *
 * The selected worker's counter is advanced by n_threads with an atomic
 * fetch-and-add, and the value it held before the add is the claimed index.
 *
 * Returns the claimed index, or -1 when that index is >= t->n or when no
 * worker could be selected.
 */
static inline long steal_steal_work(kt_steal_for_t *t)
{
    int i, victim = -1;
    long k, lowest = LONG_MAX;

    for (i = 0; i < t->n_threads; ++i) {
        if (t->w[i].i < lowest) {
            lowest = t->w[i].i;
            victim = i;
        }
    }
    /* No worker selected (e.g. n_threads <= 0): never index w[-1]. */
    if (victim < 0)
        return -1;

    k = __sync_fetch_and_add(&t->w[victim].i, t->n_threads);
    return k >= t->n ? -1 : k;
}
|
||||||
|
|
||||||
|
static void* ktf_worker_steal(void* data) {
|
||||||
|
ktf_steal_worker_t* w = (ktf_steal_worker_t*)data;
|
||||||
|
long i;
|
||||||
|
for (;;) {
|
||||||
|
i = __sync_fetch_and_add(&w->i, w->t->n_threads);
|
||||||
|
if (i >= w->t->n)
|
||||||
|
break;
|
||||||
|
w->t->func(w->t->data, i, w - w->t->w, 0);
|
||||||
|
}
|
||||||
|
while ((i = steal_steal_work(w->t)) >= 0) w->t->func(w->t->data, i, w - w->t->w, 1);
|
||||||
|
pthread_exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void kt_for_steal(int n_threads, void (*func)(void*, long, int, int), void* data, long n) {
|
||||||
|
if (n_threads > 1) {
|
||||||
|
int i;
|
||||||
|
kt_steal_for_t t;
|
||||||
|
pthread_t* tid;
|
||||||
|
t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
|
||||||
|
t.w = (ktf_steal_worker_t*)alloca(n_threads * sizeof(ktf_steal_worker_t));
|
||||||
|
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
|
||||||
|
for (i = 0; i < n_threads; ++i) t.w[i].t = &t, t.w[i].i = i;
|
||||||
|
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker_steal, &t.w[i]);
|
||||||
|
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
|
||||||
|
} else {
|
||||||
|
long j;
|
||||||
|
for (j = 0; j < n; ++j) func(data, j, 0, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void* ktf_worker_no_steal(void* data) {
|
static void* ktf_worker_no_steal(void* data) {
|
||||||
ktf_worker_t* w = (ktf_worker_t*)data;
|
ktf_worker_t* w = (ktf_worker_t*)data;
|
||||||
long i;
|
long i;
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
|
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
|
||||||
|
void kt_for_steal(int n_threads, void (*func)(void*, long, int, int), void* data, long n);
|
||||||
void kt_for_no_steal(int n_threads, void (*func)(void*, long, int), void* data, long n);
|
void kt_for_no_steal(int n_threads, void (*func)(void*, long, int), void* data, long n);
|
||||||
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
|
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ struct AuxVar {
|
||||||
//const static int REF_CONTEXT_PAD = 3; // 需要做一些填充
|
//const static int REF_CONTEXT_PAD = 3; // 需要做一些填充
|
||||||
//const static int REFERENCE_HALF_WINDOW_LENGTH = 150; // 需要额外多取出一些ref序列,防止边界效应
|
//const static int REFERENCE_HALF_WINDOW_LENGTH = 150; // 需要额外多取出一些ref序列,防止边界效应
|
||||||
|
|
||||||
static constexpr int BAM_BLOCK_NUM = 1; // 每个线程每次处理1k个bam记录
|
static constexpr int BAM_BLOCK_NUM = 1000; // 每个线程每次处理1k个bam记录
|
||||||
static int64_t processedReads;
|
static int64_t processedReads;
|
||||||
|
|
||||||
sam_hdr_t* header = nullptr; // bam header
|
sam_hdr_t* header = nullptr; // bam header
|
||||||
|
|
|
||||||
|
|
@ -114,44 +114,41 @@ void roundTableValues(RecalTables& rt) {
|
||||||
static void printRecalTables(const RecalTables& rt) {
|
static void printRecalTables(const RecalTables& rt) {
|
||||||
_Foreach2D(rt.readGroupTable, val, {
|
_Foreach2D(rt.readGroupTable, val, {
|
||||||
if (val.numObservations > 0) {
|
if (val.numObservations > 0) {
|
||||||
fprintf(gf[0], "%ld %f %f\n", val.numObservations, val.numMismatches, val.reportedQuality);
|
fprintf(gf[0], "%ld %f %f\n", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
_Foreach3D(rt.qualityScoreTable, val, {
|
_Foreach3D(rt.qualityScoreTable, val, {
|
||||||
if (val.numObservations > 0) {
|
if (val.numObservations > 0) {
|
||||||
fprintf(gf[1], "%ld %f %f\n", val.numObservations, val.numMismatches, val.reportedQuality);
|
fprintf(gf[1], "%ld %f %f\n", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
_Foreach4D(rt.contextTable, val, {
|
_Foreach4D(rt.contextTable, val, {
|
||||||
if (val.numObservations > 0) {
|
if (val.numObservations > 0) {
|
||||||
fprintf(gf[2], "%ld %f %f\n", val.numObservations, val.numMismatches, val.reportedQuality);
|
fprintf(gf[2], "%ld %f %f\n", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
_Foreach4D(rt.cycleTable, val, {
|
_Foreach4D(rt.cycleTable, val, {
|
||||||
if (val.numObservations > 0) {
|
if (val.numObservations > 0) {
|
||||||
fprintf(gf[3], "%ld %f %f\n", val.numObservations, val.numMismatches, val.reportedQuality);
|
fprintf(gf[3], "%ld %f %f\n", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// 串行bqsr
|
// 串行bqsr
|
||||||
int SerialBQSR(AuxVar &aux1) {
|
int SerialBQSR(AuxVar &aux) {
|
||||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
||||||
int64_t readNumSum = 0;
|
int64_t readNumSum = 0;
|
||||||
int round = 0;
|
int round = 0;
|
||||||
|
|
||||||
// PerReadCovariateMatrix &readCovariates = aux.readCovariates;
|
PerReadCovariateMatrix &readCovariates = aux.readCovariates;
|
||||||
// RecalTables& recalTables = aux.recalTables;
|
RecalTables& recalTables = aux.recalTables;
|
||||||
// SamData& sd = aux.sd;
|
SamData& sd = aux.sd;
|
||||||
// StableArray<int>&isSNP = aux.isSNP, &isIns = aux.isIns, &isDel = aux.isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
StableArray<int>&isSNP = aux.isSNP, &isIns = aux.isIns, &isDel = aux.isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
||||||
// StableArray<uint8_t> &baqArray = aux.baqArray;
|
StableArray<uint8_t> &baqArray = aux.baqArray;
|
||||||
// StableArray<double> &snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors;
|
StableArray<double> &snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors;
|
||||||
// StableArray<uint8_t> &skips = aux.skips; // 该位置是否是已知位点
|
StableArray<uint8_t> &skips = aux.skips; // 该位置是否是已知位点
|
||||||
|
|
||||||
int numProcessed = 0;
|
|
||||||
int numthreads = 2;
|
|
||||||
int BLOCK_NUM = AuxVar::BAM_BLOCK_NUM;
|
|
||||||
while (true) {
|
while (true) {
|
||||||
++round;
|
++round;
|
||||||
// 一. 读取bam数据
|
// 一. 读取bam数据
|
||||||
|
|
@ -164,179 +161,141 @@ int SerialBQSR(AuxVar &aux1) {
|
||||||
auto bams = inBamBuf.GetBamArr();
|
auto bams = inBamBuf.GetBamArr();
|
||||||
spdlog::info("{} reads processed in {} round", readNum, round);
|
spdlog::info("{} reads processed in {} round", readNum, round);
|
||||||
|
|
||||||
int numBLocks = (bams.size() + BLOCK_NUM - 1) / BLOCK_NUM;
|
|
||||||
int blocksPerThread = (numBLocks + numthreads - 1) / numthreads;
|
|
||||||
int spanBlocks = numthreads * BLOCK_NUM;
|
|
||||||
|
|
||||||
// 二. 遍历每个bam(read)记录,进行处理
|
// 二. 遍历每个bam(read)记录,进行处理
|
||||||
for (int j = 0; j < numthreads; ++j) { // 模拟多线程
|
for (int i = 0; i < bams.size(); ++i) {
|
||||||
AuxVar &aux = nsgv::gAuxVars[j];
|
// 1.
|
||||||
PerReadCovariateMatrix& readCovariates = aux.readCovariates;
|
// 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步
|
||||||
RecalTables& recalTables = aux.recalTables;
|
// 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理
|
||||||
SamData& sd = aux.sd;
|
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal
|
||||||
StableArray<int>&isSNP = aux.isSNP, &isIns = aux.isIns, &isDel = aux.isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
// quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
||||||
StableArray<uint8_t>& baqArray = aux.baqArray;
|
// spdlog::info("bam idx: {}", i);
|
||||||
StableArray<double>&snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors;
|
BamWrap* bw = bams[i];
|
||||||
StableArray<uint8_t>& skips = aux.skips; // 该位置是否是已知位点
|
sd.init();
|
||||||
for (int k = 0; k < blocksPerThread; ++k)
|
sd.parseBasic(bw);
|
||||||
for (int m = 0; m < BLOCK_NUM; ++m) {
|
sd.rid = i + readNumSum;
|
||||||
int i = j * BLOCK_NUM + k * spanBlocks + m;
|
if (sd.read_len <= 0)
|
||||||
if (i >= bams.size()) break;
|
continue;
|
||||||
++numProcessed;
|
|
||||||
// for (int i = 0; i < bams.size(); ++i) {
|
|
||||||
// if (i % 100 == 0)
|
|
||||||
// spdlog::info("Processing read idx: {}", i);
|
|
||||||
// 1.
|
|
||||||
// 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步
|
|
||||||
// 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理
|
|
||||||
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal
|
|
||||||
// quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
|
||||||
// spdlog::info("bam idx: {}", i);
|
|
||||||
BamWrap* bw = bams[i];
|
|
||||||
sd.init();
|
|
||||||
sd.parseBasic(bw);
|
|
||||||
sd.rid = i + readNumSum;
|
|
||||||
if (sd.read_len <= 0)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
PROF_START(clip_read);
|
PROF_START(clip_read);
|
||||||
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
||||||
ReadTransformer::hardClipAdaptorSequence(bw, sd);
|
ReadTransformer::hardClipAdaptorSequence(bw, sd);
|
||||||
if (sd.read_len <= 0)
|
if (sd.read_len <= 0)
|
||||||
continue;
|
continue;
|
||||||
// 5. 然后再去除softclip部分
|
// 5. 然后再去除softclip部分
|
||||||
ReadTransformer::hardClipSoftClippedBases(bw, sd);
|
ReadTransformer::hardClipSoftClippedBases(bw, sd);
|
||||||
if (sd.read_len <= 0)
|
if (sd.read_len <= 0)
|
||||||
continue;
|
continue;
|
||||||
// 应用所有的变换,计算samdata的相关信息
|
// 应用所有的变换,计算samdata的相关信息
|
||||||
sd.applyTransformations();
|
sd.applyTransformations();
|
||||||
PROF_END(gprof[GP_clip_read], clip_read);
|
PROF_END(gprof[GP_clip_read], clip_read);
|
||||||
|
|
||||||
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
const char* qname = bam_get_qname(sd.bw->b);
|
||||||
const int nErrors = RecalFuncs::calculateIsSNPOrIndel(aux, sd, isSNP, isIns, isDel);
|
// fprintf(gf[4], "%ld %d %d %d\n", sd.rid, sd.read_len, 1 + BamWrap::bam_pos(sd.start_pos), 1 + BamWrap::bam_pos(sd.end_pos));
|
||||||
|
// fprintf(gf[4], "%s %d %d %d %d\n", qname, sd.bw->b->core.flag, sd.read_len, 1 + BamWrap::bam_pos(sd.start_pos), 1 + BamWrap::bam_pos(sd.end_pos));
|
||||||
|
|
||||||
/*fprintf(gf[0], "%d\t", sd.read_len);
|
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
|
const int nErrors = RecalFuncs::calculateIsSNPOrIndel(aux, sd, isSNP, isIns, isDel);
|
||||||
fprintf(gf[0], "\n");
|
|
||||||
fprintf(gf[1], "%d\t", sd.read_len);
|
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
|
|
||||||
fprintf(gf[1], "\n");
|
|
||||||
fprintf(gf[2], "%d\t", sd.read_len);
|
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
|
|
||||||
fprintf(gf[2], "\n");
|
|
||||||
*/
|
|
||||||
|
|
||||||
// 7. 计算baqArray
|
// fprintf(gf[0], "%d\t", sd.read_len);
|
||||||
// BAQ = base alignment quality
|
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
|
||||||
// note for efficiency reasons we don't compute the BAQ array unless we actually have
|
// fprintf(gf[0], "\n");
|
||||||
// some error to marginalize over. For ILMN data ~85% of reads have no error
|
// fprintf(gf[1], "%d\t", sd.read_len);
|
||||||
// vector<uint8_t> baqArray;
|
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
|
||||||
bool baqCalculated = false;
|
// fprintf(gf[1], "\n");
|
||||||
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
// fprintf(gf[2], "%d\t", sd.read_len);
|
||||||
baqCalculated = BAQ::flatBAQArray(sd, baqArray);
|
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
|
||||||
} else {
|
// fprintf(gf[2], "\n");
|
||||||
// baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, sd, baqArray);
|
|
||||||
}
|
|
||||||
if (!baqCalculated)
|
|
||||||
continue;
|
|
||||||
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
|
||||||
|
|
||||||
// 8. 计算这条read对应的协变量
|
// 7. 计算baqArray
|
||||||
PROF_START(covariate);
|
// BAQ = base alignment quality
|
||||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true);
|
// note for efficiency reasons we don't compute the BAQ array unless we actually have
|
||||||
PROF_END(gprof[GP_covariate], covariate);
|
// some error to marginalize over. For ILMN data ~85% of reads have no error
|
||||||
|
// vector<uint8_t> baqArray;
|
||||||
|
bool baqCalculated = false;
|
||||||
|
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
||||||
|
baqCalculated = BAQ::flatBAQArray(sd, baqArray);
|
||||||
|
} else {
|
||||||
|
// baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, sd, baqArray);
|
||||||
|
}
|
||||||
|
if (!baqCalculated)
|
||||||
|
continue;
|
||||||
|
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
||||||
|
|
||||||
// fprintf(gf[3], "%ld %ld %d %ld\n", sd.rid, readCovariates.size(), sd.read_len, readCovariates[0][0].size());
|
// 8. 计算这条read对应的协变量
|
||||||
// for (auto &arr1 : readCovariates) {
|
PROF_START(covariate);
|
||||||
// for (size_t si = 0; si < sd.read_len; ++si) {
|
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true);
|
||||||
// for (auto &val : arr1[si]) {
|
PROF_END(gprof[GP_covariate], covariate);
|
||||||
// fprintf(gf[3], "%d ", val);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// fprintf(gf[3], "\n");
|
|
||||||
|
|
||||||
// fprintf(gf[3], "%ld %d\n", sd.rid, sd.read_len);
|
// fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len);
|
||||||
// {
|
// for (auto &arr1 : readCovariates) {
|
||||||
// auto& arr1 = readCovariates[0];
|
// for (size_t si = 0; si < sd.read_len; ++si) {
|
||||||
// {
|
// fprintf(gf[4], "%d %d %d %d ", arr1[si].readGroup, arr1[si].baseQuality, arr1[si].context, arr1[si].cycle);
|
||||||
// for (int pos = 0; pos < sd.read_len; ++pos) {
|
// }
|
||||||
// fprintf(gf[3], "%d %d\n", pos, arr1[pos][2]);
|
// }
|
||||||
// }
|
// fprintf(gf[4], "\n");
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// fprintf(gf[3], "\n");
|
|
||||||
|
|
||||||
// 9. 计算这条read需要跳过的位置
|
// fprintf(gf[3], "%ld %d\n", sd.rid, sd.read_len);
|
||||||
PROF_START(read_vcf);
|
// {
|
||||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips);
|
// auto& arr1 = readCovariates[0];
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) {
|
// {
|
||||||
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
|
// for (int pos = 0; pos < sd.read_len; ++pos) {
|
||||||
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
// fprintf(gf[3], "%d %d\n", pos, arr1[pos][2]);
|
||||||
}
|
// }
|
||||||
PROF_GP_END(read_vcf);
|
// }
|
||||||
#if 1
|
// }
|
||||||
int fidx = 0;
|
// fprintf(gf[3], "\n");
|
||||||
if (sd.rid % 2 == 0) fidx = 0;
|
|
||||||
else fidx = 1;
|
// 9. 计算这条read需要跳过的位置
|
||||||
fprintf(gf[fidx], "%ld %d\t", sd.rid, sd.read_len);
|
PROF_START(read_vcf);
|
||||||
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[fidx], "%d ", skips[ii] ? 1 : 0);
|
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips);
|
||||||
fprintf(gf[fidx], "\n");
|
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||||
|
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
|
||||||
|
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
|
}
|
||||||
|
PROF_GP_END(read_vcf);
|
||||||
|
#if 0
|
||||||
|
int fidx = 0;
|
||||||
|
if (sd.rid % 2 == 0)
|
||||||
|
fidx = 0;
|
||||||
|
else
|
||||||
|
fidx = 1;
|
||||||
|
fprintf(gf[fidx], "%ld %d\t", sd.rid, sd.read_len);
|
||||||
|
for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[fidx], "%d ", skips[ii] ? 1 : 0);
|
||||||
|
fprintf(gf[fidx], "\n");
|
||||||
#endif
|
#endif
|
||||||
// fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len);
|
// fprintf(gf[0], "%ld %d\t", sd.rid, sd.read_len);
|
||||||
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
// for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
|
||||||
// fprintf(gf[0], "\n");
|
// fprintf(gf[0], "\n");
|
||||||
|
|
||||||
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
// 10. 根据BAQ进一步处理snp,indel,得到处理后的数据
|
||||||
PROF_START(frac_err);
|
PROF_START(frac_err);
|
||||||
RecalFuncs::calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
|
RecalFuncs::calculateFractionalErrorArray(isSNP, baqArray, snpErrors);
|
||||||
RecalFuncs::calculateFractionalErrorArray(isIns, baqArray, insErrors);
|
RecalFuncs::calculateFractionalErrorArray(isIns, baqArray, insErrors);
|
||||||
RecalFuncs::calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
RecalFuncs::calculateFractionalErrorArray(isDel, baqArray, delErrors);
|
||||||
PROF_GP_END(frac_err);
|
PROF_GP_END(frac_err);
|
||||||
|
|
||||||
// aggregate all of the info into our info object, and update the data
|
// aggregate all of the info into our info object, and update the data
|
||||||
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
// 11. 合并之前计算的数据,得到info,并更新bqsr table数据
|
||||||
ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
|
ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors);
|
||||||
|
|
||||||
PROF_START(update_info);
|
PROF_START(update_info);
|
||||||
RecalUtils::updateRecalTablesForRead(info, recalTables);
|
RecalUtils::updateRecalTablesForRead(info, recalTables);
|
||||||
PROF_END(gprof[GP_update_info], update_info);
|
PROF_END(gprof[GP_update_info], update_info);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
readNumSum += readNum;
|
readNumSum += readNum;
|
||||||
inBamBuf.ClearAll(); //
|
inBamBuf.ClearAll(); //
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
printRecalTables(recalTables);
|
|
||||||
#endif
|
|
||||||
spdlog::info("read count: {}", readNumSum);
|
|
||||||
spdlog::info("processed count: {}", numProcessed);
|
|
||||||
|
|
||||||
auto& auxArr = nsgv::gAuxVars;
|
spdlog::info("read count: {}", readNumSum);
|
||||||
RecalTables& recalTables = auxArr[0].recalTables;
|
|
||||||
for (int i = 0; i < numthreads; ++i) spdlog::info("thread {} processed reads {}.", i, auxArr[i].threadProcessedReads);
|
|
||||||
for (int i = 1; i < numthreads; ++i) {
|
|
||||||
auxArr[0].threadProcessedReads += auxArr[i].threadProcessedReads;
|
|
||||||
_Foreach3DK(auxArr[i].recalTables.qualityScoreTable, qualDatum, {
|
|
||||||
if (qualDatum.numObservations > 0) {
|
|
||||||
recalTables.qualityScoreTable(k1, k2, k3).increment(qualDatum);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
_Foreach4DK(auxArr[i].recalTables.contextTable, contextDatum, {
|
|
||||||
if (contextDatum.numObservations > 0) {
|
|
||||||
recalTables.contextTable(k1, k2, k3, k4).increment(contextDatum);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
_Foreach4DK(auxArr[i].recalTables.cycleTable, cycleDatum, {
|
|
||||||
if (cycleDatum.numObservations > 0) {
|
|
||||||
recalTables.cycleTable(k1, k2, k3, k4).increment(cycleDatum);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
// 12. 创建总结数据
|
// 12. 创建总结数据
|
||||||
collapseQualityScoreTableToReadGroupTable(recalTables.readGroupTable, recalTables.qualityScoreTable);
|
collapseQualityScoreTableToReadGroupTable(recalTables.readGroupTable, recalTables.qualityScoreTable);
|
||||||
roundTableValues(recalTables);
|
roundTableValues(recalTables);
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
printRecalTables(recalTables);
|
||||||
|
#endif
|
||||||
|
|
||||||
// 13. 量化质量分数
|
// 13. 量化质量分数
|
||||||
QuantizationInfo quantInfo(recalTables, nsgv::gBqsrArg.QUANTIZING_LEVELS);
|
QuantizationInfo quantInfo(recalTables, nsgv::gBqsrArg.QUANTIZING_LEVELS);
|
||||||
|
|
||||||
|
|
@ -347,7 +306,7 @@ int SerialBQSR(AuxVar &aux1) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 多线程处理bam数据, tmd是乱序的?
|
// 多线程处理bam数据, tmd是乱序的?
|
||||||
static void thread_worker(void* data, long idx, int tid) {
|
static void thread_worker(void* data, long idx, int tid, int steal) {
|
||||||
AuxVar& aux = (*(vector<AuxVar>*)data)[tid];
|
AuxVar& aux = (*(vector<AuxVar>*)data)[tid];
|
||||||
auto& readCovariates = aux.readCovariates;
|
auto& readCovariates = aux.readCovariates;
|
||||||
RecalTables& recalTables = aux.recalTables;
|
RecalTables& recalTables = aux.recalTables;
|
||||||
|
|
@ -357,7 +316,8 @@ static void thread_worker(void* data, long idx, int tid) {
|
||||||
StableArray<double>&snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors;
|
StableArray<double>&snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors;
|
||||||
StableArray<uint8_t>& skips = aux.skips; // 该位置是否是已知位点
|
StableArray<uint8_t>& skips = aux.skips; // 该位置是否是已知位点
|
||||||
auto &bams = *aux.bamArr;
|
auto &bams = *aux.bamArr;
|
||||||
// for (auto& vcf : aux.vcfArr) vcf.knownSites.clear();
|
if (steal)
|
||||||
|
for (auto& vcf : aux.vcfArr) vcf.knownSites.clear();
|
||||||
#if 1
|
#if 1
|
||||||
int startIdx = idx * aux.BAM_BLOCK_NUM;
|
int startIdx = idx * aux.BAM_BLOCK_NUM;
|
||||||
int stopIdx = std::min((size_t)(idx + 1) * aux.BAM_BLOCK_NUM, bams.size());
|
int stopIdx = std::min((size_t)(idx + 1) * aux.BAM_BLOCK_NUM, bams.size());
|
||||||
|
|
@ -404,7 +364,7 @@ static void thread_worker(void* data, long idx, int tid) {
|
||||||
skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||||
}
|
}
|
||||||
// PROF_GP_END(read_vcf);
|
// PROF_GP_END(read_vcf);
|
||||||
#if 1
|
#if 0
|
||||||
int fidx = 0 + 2 * tid;
|
int fidx = 0 + 2 * tid;
|
||||||
//if (sd.rid % 2 == 0) fidx = 0 + 2 * tid;
|
//if (sd.rid % 2 == 0) fidx = 0 + 2 * tid;
|
||||||
//else fidx = 1 + 2 * tid;
|
//else fidx = 1 + 2 * tid;
|
||||||
|
|
@ -447,9 +407,9 @@ int ParallelBQSR(vector<AuxVar>& auxArr) {
|
||||||
spdlog::info("{} reads processed in {} round", readNum, round);
|
spdlog::info("{} reads processed in {} round", readNum, round);
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
kt_for_no_steal(auxArr.size(), thread_worker, &auxArr, (readNum + AuxVar::BAM_BLOCK_NUM - 1) / AuxVar::BAM_BLOCK_NUM);
|
kt_for_steal(auxArr.size(), thread_worker, &auxArr, (readNum + AuxVar::BAM_BLOCK_NUM - 1) / AuxVar::BAM_BLOCK_NUM);
|
||||||
#else
|
#else
|
||||||
kt_for(auxArr.size(), thread_worker, &auxArr, auxArr.size());
|
kt_for_steal(auxArr.size(), thread_worker, &auxArr, auxArr.size());
|
||||||
#endif
|
#endif
|
||||||
readNumSum += readNum;
|
readNumSum += readNum;
|
||||||
AuxVar::processedReads += readNum;
|
AuxVar::processedReads += readNum;
|
||||||
|
|
@ -457,9 +417,9 @@ int ParallelBQSR(vector<AuxVar>& auxArr) {
|
||||||
}
|
}
|
||||||
spdlog::info("read count: {}", readNumSum);
|
spdlog::info("read count: {}", readNumSum);
|
||||||
|
|
||||||
|
|
||||||
// 合并各个线程的结果
|
// 合并各个线程的结果
|
||||||
RecalTables& recalTables = auxArr[0].recalTables;
|
RecalTables& recalTables = auxArr[0].recalTables;
|
||||||
|
// printRecalTables(recalTables);
|
||||||
for (int i = 0; i < auxArr.size(); ++i)
|
for (int i = 0; i < auxArr.size(); ++i)
|
||||||
spdlog::info("thread {} processed reads {}.", i, auxArr[i].threadProcessedReads);
|
spdlog::info("thread {} processed reads {}.", i, auxArr[i].threadProcessedReads);
|
||||||
for (int i = 1; i < auxArr.size(); ++i) {
|
for (int i = 1; i < auxArr.size(); ++i) {
|
||||||
|
|
@ -482,12 +442,12 @@ int ParallelBQSR(vector<AuxVar>& auxArr) {
|
||||||
}
|
}
|
||||||
spdlog::info("All processed reads {}.", auxArr[0].threadProcessedReads);
|
spdlog::info("All processed reads {}.", auxArr[0].threadProcessedReads);
|
||||||
|
|
||||||
// printRecalTables(recalTables);
|
|
||||||
|
|
||||||
// 创建总结数据
|
// 创建总结数据
|
||||||
collapseQualityScoreTableToReadGroupTable(recalTables.readGroupTable, recalTables.qualityScoreTable);
|
collapseQualityScoreTableToReadGroupTable(recalTables.readGroupTable, recalTables.qualityScoreTable);
|
||||||
roundTableValues(recalTables);
|
roundTableValues(recalTables);
|
||||||
|
|
||||||
|
printRecalTables(recalTables);
|
||||||
|
|
||||||
// 量化质量分数
|
// 量化质量分数
|
||||||
QuantizationInfo quantInfo(recalTables, nsgv::gBqsrArg.QUANTIZING_LEVELS);
|
QuantizationInfo quantInfo(recalTables, nsgv::gBqsrArg.QUANTIZING_LEVELS);
|
||||||
|
|
||||||
|
|
@ -583,9 +543,9 @@ int BaseRecalibrator() {
|
||||||
|
|
||||||
PROF_START(whole_process);
|
PROF_START(whole_process);
|
||||||
globalInit();
|
globalInit();
|
||||||
//if (nsgv::gBqsrArg.NUM_THREADS == 1)
|
if (nsgv::gBqsrArg.NUM_THREADS == 1)
|
||||||
// ret = SerialBQSR(nsgv::gAuxVars[0]); // 串行处理数据,生成recal table
|
ret = SerialBQSR(nsgv::gAuxVars[0]); // 串行处理数据,生成recal table
|
||||||
//else
|
else
|
||||||
ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal table
|
ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal table
|
||||||
globalDestroy();
|
globalDestroy();
|
||||||
sam_close(nsgv::gInBamFp);
|
sam_close(nsgv::gInBamFp);
|
||||||
|
|
|
||||||
|
|
@ -110,25 +110,21 @@ struct RecalFuncs {
|
||||||
int64_t startPos = bw->start_pos(); // 闭区间
|
int64_t startPos = bw->start_pos(); // 闭区间
|
||||||
int64_t endPos = bw->end_pos(); // 闭区间
|
int64_t endPos = bw->end_pos(); // 闭区间
|
||||||
knownSites.resize_fill(sd.read_len, 0);
|
knownSites.resize_fill(sd.read_len, 0);
|
||||||
// return;
|
|
||||||
|
|
||||||
// update vcfs
|
// update vcfs
|
||||||
for (auto& vcf : vcfs) {
|
for (auto& vcf : vcfs) {
|
||||||
#if 1
|
if (!vcf.knownSites.empty() && vcf.knownSites.back().right < startPos)
|
||||||
|
vcf.knownSites.clear();
|
||||||
// 清理旧的interval
|
// 清理旧的interval
|
||||||
while (!vcf.knownSites.empty()) {
|
while (!vcf.knownSites.empty()) {
|
||||||
auto& intv = vcf.knownSites.front();
|
auto& intv = vcf.knownSites.front();
|
||||||
// spdlog::info("intv bam {}, {}", intv.right, startPos);
|
|
||||||
if (intv.right < startPos)
|
if (intv.right < startPos)
|
||||||
vcf.knownSites.pop_front();
|
vcf.knownSites.pop_front();
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// #endif
|
|
||||||
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) // 此时vcf的区域包含bam,不需要读取
|
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) // 此时vcf的区域包含bam,不需要读取
|
||||||
continue;
|
continue;
|
||||||
#endif
|
|
||||||
vcf.knownSites.clear();
|
|
||||||
// 读取新的interval
|
// 读取新的interval
|
||||||
int64_t fpos, flen;
|
int64_t fpos, flen;
|
||||||
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
||||||
|
|
@ -155,7 +151,7 @@ struct RecalFuncs {
|
||||||
tid = sam_hdr_name2tid(samHdr, stid.c_str());
|
tid = sam_hdr_name2tid(samHdr, stid.c_str());
|
||||||
int64_t varStart = BamWrap::bam_global_pos(tid, pos);
|
int64_t varStart = BamWrap::bam_global_pos(tid, pos);
|
||||||
Interval varIntv(varStart, varStart + ref.size() - 1);
|
Interval varIntv(varStart, varStart + ref.size() - 1);
|
||||||
if (varIntv.right >= readIntv.left) {
|
if (varIntv.overlaps(readIntv)) {
|
||||||
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
|
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
|
||||||
}
|
}
|
||||||
get_line_from_buf(buf, flen, &cur, &line);
|
get_line_from_buf(buf, flen, &cur, &line);
|
||||||
|
|
@ -170,12 +166,12 @@ struct RecalFuncs {
|
||||||
if (intv.left > sd.softEnd())
|
if (intv.left > sd.softEnd())
|
||||||
break;
|
break;
|
||||||
// 计算对应的位点
|
// 计算对应的位点
|
||||||
ReadIdxCigar rcStart = sd.getReadIndexForReferenceCoordinate(intv.left);
|
ReadIdxCigar rcStart = sd.getReadIndexForReferenceCoordinate(intv.contigLeft());
|
||||||
int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
|
int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx;
|
||||||
if (rcStart.cigarOp == 'D') {
|
if (rcStart.cigarOp == 'D') {
|
||||||
--featureStartOnRead;
|
--featureStartOnRead;
|
||||||
}
|
}
|
||||||
ReadIdxCigar rcEnd = sd.getReadIndexForReferenceCoordinate(intv.right);
|
ReadIdxCigar rcEnd = sd.getReadIndexForReferenceCoordinate(intv.contigRight());
|
||||||
int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? sd.read_len : rcEnd.readIdx;
|
int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? sd.read_len : rcEnd.readIdx;
|
||||||
if (featureStartOnRead > sd.read_len) {
|
if (featureStartOnRead > sd.read_len) {
|
||||||
featureStartOnRead = featureEndOnRead = sd.read_len;
|
featureStartOnRead = featureEndOnRead = sd.read_len;
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@
|
||||||
#include "util/utils.h"
|
#include "util/utils.h"
|
||||||
#include "covariate.h"
|
#include "covariate.h"
|
||||||
#include "read_recal_info.h"
|
#include "read_recal_info.h"
|
||||||
|
#include "util/debug.h"
|
||||||
|
|
||||||
struct RecalUtils {
|
struct RecalUtils {
|
||||||
|
|
||||||
|
|
@ -63,6 +64,7 @@ struct RecalUtils {
|
||||||
CovariateValues& cv = readCovars[event.index][offset];
|
CovariateValues& cv = readCovars[event.index][offset];
|
||||||
uint8_t qual = info.getQual(event, offset);
|
uint8_t qual = info.getQual(event, offset);
|
||||||
double isError = info.getErrorFraction(event, offset);
|
double isError = info.getErrorFraction(event, offset);
|
||||||
|
// fprintf(gf[4], "%d %d %f\n", offset, qual, isError);
|
||||||
|
|
||||||
// 处理quality score covariate
|
// 处理quality score covariate
|
||||||
qualityScoreTable(event.index, cv.readGroup, cv.baseQuality).increment(1, isError, cv.baseQuality);
|
qualityScoreTable(event.index, cv.readGroup, cv.baseQuality).increment(1, isError, cv.baseQuality);
|
||||||
|
|
@ -77,16 +79,12 @@ struct RecalUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
// fprintf(gf[4], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
||||||
// for (auto& arr1 : qualityScoreTable.data) {
|
// _Foreach3D(qualityScoreTable, val, {
|
||||||
// for (size_t si = 0; si < arr1.size(); ++si) {
|
// if (val.numObservations > 0)
|
||||||
// for (auto &val : arr1[si]) {
|
// fprintf(gf[4], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||||
// if (val.numObservations > 0)
|
// });
|
||||||
// fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
// fprintf(gf[4], "\n");
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// fprintf(gf[3], "\n");
|
|
||||||
|
|
||||||
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
||||||
// for (auto& arr1 : contextTable.data) {
|
// for (auto& arr1 : contextTable.data) {
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,10 @@ int BamBuf::ReadBam() {
|
||||||
int read_num = 0;
|
int read_num = 0;
|
||||||
if (handle_last) { // 处理上次读入的最后一个bam
|
if (handle_last) { // 处理上次读入的最后一个bam
|
||||||
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||||
++read_num;
|
if (filter_out == nullptr || !filter_out(bw->b)) { // 这里也要加过滤器
|
||||||
append_one_bam();
|
++read_num;
|
||||||
|
append_one_bam();
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
return read_num; // 还是没空间
|
return read_num; // 还是没空间
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,12 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
FILE* gf[4] = {0};
|
FILE* gf[DEBUG_FILE_NUM] = {0};
|
||||||
|
|
||||||
int open_debug_files() {
|
int open_debug_files() {
|
||||||
char fn[1024] = {0};
|
char fn[1024] = {0};
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < 4; ++i) {
|
for (; i < DEBUG_FILE_NUM; ++i) {
|
||||||
sprintf(fn, "./test/fp%d.txt", i);
|
sprintf(fn, "./test/fp%d.txt", i);
|
||||||
gf[i] = fopen(fn, "w");
|
gf[i] = fopen(fn, "w");
|
||||||
}
|
}
|
||||||
|
|
@ -23,7 +23,7 @@ int open_debug_files() {
|
||||||
|
|
||||||
int close_debug_files() {
|
int close_debug_files() {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < 4; ++i) {
|
for (; i < DEBUG_FILE_NUM; ++i) {
|
||||||
if (gf[i] != 0)
|
if (gf[i] != 0)
|
||||||
fclose(gf[i]);
|
fclose(gf[i]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,9 @@
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
extern FILE *gf[4];
|
#define DEBUG_FILE_NUM 5
|
||||||
|
|
||||||
|
extern FILE* gf[DEBUG_FILE_NUM];
|
||||||
|
|
||||||
int open_debug_files();
|
int open_debug_files();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -100,6 +100,16 @@ struct SamData {
|
||||||
const bam1_core_t& bc = bw->b->core;
|
const bam1_core_t& bc = bw->b->core;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
||||||
|
// 先更新一下cigar信息,去除两端无效(长度为0)的cigar
|
||||||
|
if (bam_cigar_oplen(cigar[cigar_start]) == first_cigar_clip) {
|
||||||
|
first_cigar_clip = 0;
|
||||||
|
++cigar_start;
|
||||||
|
}
|
||||||
|
if (bam_cigar_oplen(cigar[cigar_end - 1]) == last_cigar_clip) {
|
||||||
|
last_cigar_clip = 0;
|
||||||
|
--cigar_end;
|
||||||
|
}
|
||||||
|
|
||||||
// 计算ref_offset,就是相对比对的position,要将ref右移多少
|
// 计算ref_offset,就是相对比对的position,要将ref右移多少
|
||||||
for (i = 0; i < cigar_start; ++i) {
|
for (i = 0; i < cigar_start; ++i) {
|
||||||
if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
|
if (BaseUtils::consumeRefBases(bam_cigar_opchr(cigar[i]))) {
|
||||||
|
|
@ -121,7 +131,7 @@ struct SamData {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算read两端clip之后的softstart和softend,其实S之前都被切掉了
|
// 计算read两端clip之后的softstart和softend,其实S之前都被切掉了
|
||||||
int64_t softStart = bw->b->core.pos + ref_offset;
|
int64_t softStart = BamWrap::bam_global_pos(bw->b) + ref_offset; // 注意这里,要加上contig
|
||||||
int64_t softEnd = softStart - 1; // 闭区间
|
int64_t softEnd = softStart - 1; // 闭区间
|
||||||
bool rightTail = false;
|
bool rightTail = false;
|
||||||
for (i = cigar_start; i < cigar_end; ++i) {
|
for (i = cigar_start; i < cigar_end; ++i) {
|
||||||
|
|
@ -150,7 +160,7 @@ struct SamData {
|
||||||
start_pos += len; // 更新起始位置
|
start_pos += len; // 更新起始位置
|
||||||
} else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion
|
} else if (i == cigar_end - 1 && c == 'D') { // 跳过结尾的deletion
|
||||||
c = 'H';
|
c = 'H';
|
||||||
softEnd -= len; // 更新结束位置
|
end_pos -= len; // 更新结束位置
|
||||||
}
|
}
|
||||||
cigars.push_back({c, len});
|
cigars.push_back({c, len});
|
||||||
}
|
}
|
||||||
|
|
@ -159,12 +169,13 @@ struct SamData {
|
||||||
// 给定一个ref pos,返回对应的read index和cigar操作
|
// 给定一个ref pos,返回对应的read index和cigar操作
|
||||||
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
|
ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
|
||||||
ReadIdxCigar rc;
|
ReadIdxCigar rc;
|
||||||
if (refPos < start_pos)
|
int64_t contig_start_pos = BamWrap::bam_pos(start_pos);
|
||||||
|
if (refPos < contig_start_pos)
|
||||||
return rc;
|
return rc;
|
||||||
int firstReadPosOfElement = 0; // inclusive
|
int firstReadPosOfElement = 0; // inclusive
|
||||||
int firstRefPosOfElement = start_pos; // inclusive
|
int firstRefPosOfElement = contig_start_pos; // inclusive
|
||||||
int lastReadPosOfElement = 0; // exclusive
|
int lastReadPosOfElement = 0; // exclusive
|
||||||
int lastRefPosOfElement = start_pos; // exclusive
|
int lastRefPosOfElement = contig_start_pos; // exclusive
|
||||||
// advance forward through all the cigar elements until we bracket the reference coordinate
|
// advance forward through all the cigar elements until we bracket the reference coordinate
|
||||||
for (auto& cigar : cigars) {
|
for (auto& cigar : cigars) {
|
||||||
firstReadPosOfElement = lastReadPosOfElement;
|
firstReadPosOfElement = lastReadPosOfElement;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue