到了读取和解析known vcf部分了,性能还需优化

This commit is contained in:
zzh 2025-12-20 16:35:45 +08:00
parent 0fca937fab
commit 1e9b58fac1
17 changed files with 2397 additions and 161 deletions

17
src/bqsr/baq.cpp 100644
View File

@ -0,0 +1,17 @@
#include "baq.h"
#include <spdlog/spdlog.h>
vector<double> BAQ::qual2prob(256); // 质量分数转化概率
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
// 计算baq数组返回成功与否
bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) {
// 检测ref是否覆盖了read
if (ref.size() < refOffset + ad.read_len) {
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
ref.size(), refOffset + ad.read_len, refOffset, ad.read_len);
return false;
}
return true;
}

88
src/bqsr/baq.h 100644
View File

@ -0,0 +1,88 @@
// ####################################################################################################
//
// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE
// NOTE -- PUSHED BACK TO HENG LI
//
// ####################################################################################################
// NOTE -- this code is a cpp version of code originally written by Heng Li in JAVA for the samtools project
// ####################################################################################################
#pragma once
#include <cctype>
#include <cmath>
#include <cstdint>
#include <string>
#include <vector>
#include "util/bam_wrap.h"
using std::vector;
using std::string;
// base alignment quality (BAQ)
struct BAQ {
static vector<double> qual2prob; // 质量分数转化概率
static constexpr double EM = 0.33333333333;
static constexpr double EI = 0.25;
// Phred scaled now (changed 1/10/2011)
static constexpr double DEFAULT_GOP = 40;
static constexpr int DEFAULT_BANDWIDTH = 7;
static constexpr int SAM_MAX_PHRED_SCORE = 93;
// 94 = max qual score + 1
static vector<vector<vector<double>>> EPSILONS; // [ref][read][qual]
double cd = -1; // gap open probability [1e-3]
double ce = 0.1; // gap extension probability [0.1]
int cb = DEFAULT_BANDWIDTH; // band width [7]
/**
* Any bases with Q < MIN_BASE_QUAL are raised up to this base quality
*/
uint8_t minBaseQual = 4;
/**
* Use defaults for everything
*/
BAQ() : BAQ(DEFAULT_GOP) {}
/**
* Use defaults for everything
*/
BAQ(const double gapOpenPenalty) {
cd = convertFromPhredScale(gapOpenPenalty);
initializeCachedData();
}
/* Takes a Phred Scale quality score and returns the error probability.
*
* Quick conversion function to maintain internal structure of BAQ calculation on
* probability scale, but take the user entered parameter in phred-scale.
*
* @param x phred scaled score
* @return probability of incorrect base call
*/
static double convertFromPhredScale(double x) { return (std::pow(10, (-x) / 10.)); }
// 初始化一些静态全局数据
void initializeCachedData() {
for (int i = 0; i < 256; i++)
for (int j = 0; j < 256; j++)
for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) EPSILONS[i][j][q] = 1.0;
for (char b1 : "ACGTacgt") {
for (char b2 : "ACGTacgt") {
for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) {
double qual = qual2prob[q < minBaseQual ? minBaseQual : q];
double e = std::tolower(b1) == std::tolower(b2) ? 1 - qual : qual * EM;
EPSILONS[(uint8_t)b1][(uint8_t)b2][q] = e;
}
}
}
}
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
// 计算baq数组返回成功与否
bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray);
};

View File

@ -48,6 +48,9 @@ struct BQSRArg {
string PROGRAM_RECORD_ID = "FastBQSR";
// reference file
string REFERENCE_FILE;
// known sites vcf files
vector<string> KNOWN_SITES_VCFS;
@ -123,7 +126,7 @@ struct BQSRArg {
int PRESERVE_QSCORES_LESS_THAN = 6;
/**
* enable-baq, do BAQ correction"
* enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了用不到了
*/
bool enableBAQ = false;

View File

@ -7,36 +7,93 @@ Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/10/23
*/
#include <htslib/faidx.h>
#include <htslib/kstring.h>
#include <htslib/sam.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/thread_pool.h>
#include <header.h>
#include <spdlog/spdlog.h>
#include <iomanip>
#include <numeric>
#include <vector>
#include <queue>
#include "baq.h"
#include "bqsr_args.h"
#include "bqsr_funcs.h"
#include "bqsr_pipeline.h"
#include "covariate.h"
#include "dup_metrics.h"
#include "fastbqsr_version.h"
#include "read_name_parser.h"
#include "util/interval.h"
#include "util/profiling.h"
#include "util/utils.h"
#include "util/linear_index.h"
using std::deque;
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
// 解析knownSites
struct VCFParser {
deque<Interval> knownSites; // 已知的变异位点
char* buf = nullptr; // // 数据buffer
uint32_t bufLen = 4 * 1024; // 数据buffer长度
LinearIndex index; // vcf文件索引
ifstream inStm; // vcf文件流
VCFParser() { Init(); }
VCFParser(const string& vcfFileName) { Init(vcfFileName); }
VCFParser(const string& vcfFileName, sam_hdr_t* samHeader) { Init(vcfFileName, samHeader); }
void Init() { buf = (char*)malloc(bufLen); }
void Init(const string& vcfFileName) {
Init();
inStm.open(vcfFileName, ifstream::in);
string idxFileName = vcfFileName + ".idx";
if (!index.ReadIndex(idxFileName))
error("[%s] fail to load the %s index file\n", __func__, idxFileName.c_str());
}
void Init(const string& vcfFileName, sam_hdr_t *samHeader) {
index.SetHeader(samHeader);
Init(vcfFileName);
}
};
// 解析后的一些参数,文件,数据等
struct AuxVar {
const static int REF_CONTEXT_PAD = 3; // 需要做一些填充
const static int REFERENCE_HALF_WINDOW_LENGTH = 150; // 需要额外多取出一些ref序列防止边界效应
sam_hdr_t* header = nullptr; // bam header
faidx_t* faidx = nullptr; // reference index
char* ref_seq = nullptr; // reference sequence
int ref_len = 0; // reference sequence length
int offset = 0; // 在要求的ref序列两边多余取出的碱基数量
vector<VCFParser> vcfArr; // 从vcf中获取已知位点
};
namespace nsgv {
// 全局变量 for bqsr
BQSRArg gBqsrArg; // bqsr arguments
samFile* gInBamFp; // input BAM file pointer
sam_hdr_t* gInBamHeader; // input BAM header
vector<AuxVar> gAuxVars; // auxiliary variables保存一些文件数据等每个线程对应一个
// 下面是需要删除或修改的变量
std::vector<ReadNameParser> gNameParsers; // read name parser
DuplicationMetrics gMetrics; //
DupResult gDupRes;
PipelineArg gPipe(&gDupRes);
BQSRArg gBqsrArg; //
samFile *gInBamFp; // bam
sam_hdr_t *gInBamHeader; // bam
samFile *gOutBamFp; // , sambam
sam_hdr_t *gOutBamHeader; // header
vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
@ -49,6 +106,13 @@ struct ByteBuf {
int capacity = 0; //
};
// 读进来的这一批bam总共占了几个染色体这个方案不行读取太多没必要
// 开区间
struct Region {
int64_t start;
int64_t end;
};
/*
*
*/
@ -60,26 +124,495 @@ static string getFileExtension(const string &filename) {
return filename.substr(last_dot + 1);
}
// 过滤掉bqsr过程不符合要求的bam数据
bool bqsrReadFilterOut(const bam1_t *b) {
// 过滤掉unmapped的read
if (b->core.qual == 0) // mapping quality 0
return true;
if (b->core.qual == 255) // mapping quality not available
return true;
if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
return true;
}
if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
return true;
}
if (b->core.flag & BAM_FDUP) { // secondary alignment
return true;
}
if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls
return true;
}
return false;
}
// 该操作符是否消耗read的碱基
bool consumeReadBases(char cigar) {
return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S';
}
// 该操作符是否消耗参考基因组的碱基
bool consumeRefBases(char cigar) {
return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N';
}
// 给定一个ref位置在read内部找到对应的位置和操作符
struct PosAndOperator {
int readPosAtRefCoord = -1; // read中的位置
char cigarOperator = '0'; // cigar操作符
int cigarIndex = -1; // cigar索引
int cigarLen = 0;
int preCigarLen = 0; // 截止cigar之前的消耗read base的长度
};
/**
* Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
* the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
* Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
*
* @param alignmentStart The soft start of the read on the reference
* @param cigar The read's cigar
* @param refCoord The target reference coordinate
* @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
* if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
* if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last
* cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
*/
PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
PosAndOperator po;
if (refCoord < alignmentStart) {
return po;
}
int firstReadPosOfElement = 0; // inclusive
int firstRefPosOfElement = alignmentStart; // inclusive
int lastReadPosOfElement = 0; // exclusive
int lastRefPosOfElement = alignmentStart; // exclusive
// advance forward through all the cigar elements until we bracket the reference coordinate
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
const int idx = bc.n_cigar - 1;
if (idx < 0)
return po;
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
firstReadPosOfElement = lastReadPosOfElement;
firstRefPosOfElement = lastRefPosOfElement;
lastReadPosOfElement += consumeReadBases(c) ? len : 0;
lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0;
if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element
int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
}
}
return po;
}
// 根据adapter位置对read进行hardclip返回左侧或右侧减掉的base数量
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) {
int start, stop;
// Determine the read coordinate to start and stop hard clipping
if (refStart < 0) {
if (refStop < 0) return;
PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
ad.left_clip = stop + 1;
ad.cigar_start = stopPosAndOperator.cigarIndex;
ad.first_cigar_clip = ad.left_clip - stopPosAndOperator.preCigarLen;
} else {
if (refStop >= 0) return;
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
// overclipping
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
start = startPosAndOperator.readPosAtRefCoord;
ad.right_clip = bw->b->core.l_qseq - start;
ad.cigar_end = startPosAndOperator.cigarIndex + 1;
ad.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
}
}
// 计算切掉adapter之后ref相对原始ref的偏移量
void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
int i = 0;
for (i = 0; i < ad.cigar_start; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (consumeRefBases(c)) {
ad.ref_offset += len;
}
}
const char c = bam_cigar_opchr(cigar[i]);
if (consumeRefBases(c)) {
ad.ref_offset += ad.first_cigar_clip;
}
}
// 计算clip处理之后剩余的碱基
void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) {
ad.bases.resize(ad.read_len);
uint8_t* seq = bam_get_seq(bw->b);
for (int i = 0; i < ad.read_len; ++i) {
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
}
}
// 计算read两端softclip的碱基数量可能会修改ad里的clip值
void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) {
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
int readIndex = ad.left_clip;
int cutLeft = -1; // first position to hard clip (inclusive)
int cutRight = -1; // first position to hard clip (inclusive)
int cigar_start = ad.cigar_start;
int cigar_end = ad.cigar_end;
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
if (c == 'S') {
if (rightTail) {
cutRight = readIndex;
cigar_end = i;
} else {
cutLeft = readIndex + len - 1;
cigar_start = i + 1;
}
} else if (c != 'H') {
rightTail = true;
}
if (consumeReadBases(c)) {
readIndex += len;
}
}
if (cutRight >= 0) {
ad.right_clip = bw->b->core.l_qseq - cutRight;
ad.cigar_end = cigar_end;
ad.last_cigar_clip = 0;
}
if (cutLeft >= 0) {
ad.left_clip = cutLeft + 1;
ad.cigar_start = cigar_start;
ad.first_cigar_clip = 0;
}
}
// 读取给定区间的reference
static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
if (aux.ref_seq != NULL)
free(aux.ref_seq);
int tid = BamWrap::bam_tid(cur_pos);
const char* chr = sam_hdr_tid2name(aux.header, tid);
int seq_begin = BamWrap::bam_pos(interval.left); //- aux.REFERENCE_HALF_WINDOW_LENGTH;
int seq_end = BamWrap::bam_pos(interval.right); //+ aux.REFERENCE_HALF_WINDOW_LENGTH;
aux.ref_seq = faidx_fetch_seq(aux.faidx, chr, seq_begin, seq_end, &aux.ref_len);
// aux.offset = aux.REFERENCE_HALF_WINDOW_LENGTH;
}
// 设置某个位置是indel
inline void updateIndel(vector<int> &isIndel, int index) {
if (index >=0 && index < isIndel.size()) {
isIndel[index] = 1;
}
}
// 计算该read的每个碱基位置是否是SNP或Indel
int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
// 1. 读取参考基因组先看看串行运行性能稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
read_ref_base(aux, interval.left, interval);
string refBases(aux.ref_seq);
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
// 2. 遍历cigar计算每个碱基是否是SNP或Indel
int readPos = 0, refPos = 0, nEvents = 0;
const uint32_t* cigar = bam_get_cigar(bw->b);
const bam1_core_t& bc = bw->b->core;
uint8_t* seq = bam_get_seq(bw->b);
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
int len = bam_cigar_oplen(cigar[i]);
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
if (c == 'M' || c == '=' || c == 'X') {
for (int j = 0; j < len; ++j) {
// 按位置将read和ref碱基进行比较不同则是snp注意read起始位置要加上left_clip
int snpInt = cBaseToChar[bam_seqi(seq, readPos + ad.left_clip)] == refBases[refPos] ? 0 : 1;
isSNP[readPos] = snpInt;
nEvents += snpInt;
readPos++;
refPos++;
}
} else if (c == 'D') {
// 应该是在上一个消耗碱基的cigar的最后一个位置标记Del
int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
updateIndel(isDel, index);
refPos += len;
} else if (c == 'N') {
refPos += len;
} else if (c == 'I') {
// 与Del不同Ins应该是在下一个cigar开始的位置标记Ins
bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
if (forwardStrandRead) {
updateIndel(isIns, readPos - 1);
}
readPos += len;
if (!forwardStrandRead) {
updateIndel(isIns, readPos);
}
} else if (c == 'S') {
readPos += len;
}
}
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
//spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
// std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
// exit(0);
return nEvents;
}
// 简单计算baq数组就是全部赋值为'@' (64)
bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
baqArray.resize(ad.read_len, (int)'@');
return true;
}
// 计算真实的baq数组耗时更多好像enable-baq参数默认是关闭的那就先不实现这个了
bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
baqArray.resize(ad.read_len, 0);
return true;
}
// 获取一行字符串
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) {
line->clear();
if (*cur >= total)
return;
char b;
while (*cur < total && (b = buf[(*cur)++]) != '\n') {
line->push_back(b);
}
}
// 计算与read有交叉的已知位点信息 应该要判断一下是按照read的范围去读取vcf还是按照一个batch read的范围去读取
void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) {
int tid = bw->contig_id();
uint64_t startPos = bw->start_pos(); // 闭区间
uint64_t endPos = bw->end_pos(); // 闭区间
// spdlog::info("bam {}, {}", startPos, endPos);
// update vcfs
for(auto &vcf : vcfs) {
// 清理旧的interval
while(!vcf.knownSites.empty()) {
auto& intv = vcf.knownSites.front();
// spdlog::info("intv bam {}, {}", intv.right, startPos);
if (intv.right < startPos)
vcf.knownSites.pop_front();
else
break;
}
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
// spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
// exit(0);
//spdlog::info("before intervals : {}", vcf.knownSites.size());
// 读取新的interval
int64_t fpos, flen;
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
//spdlog::info("file index: {}, {}", fpos, flen);
if (flen > 0) {
vcf.inStm.seekg(fpos, ios::beg);
if (flen > vcf.bufLen) {
vcf.bufLen = flen;
vcf.buf = (char*)realloc(vcf.buf, flen);
}
char* buf = vcf.buf;
vcf.inStm.read(buf, flen);
string line;
int64_t cur = 0;
get_line_from_buf(buf, flen, &cur, &line);
while (line.size() > 0) {
stringstream ss_line(line);
string stid;
int tid, pos;
int64_t locus;
string id, ref;
ss_line >> stid >> pos >> id >> ref;
tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
if (tid >= 0 && pos > 0) {
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
}
get_line_from_buf(buf, flen, &cur, &line);
}
}
//spdlog::info("after intervals : {}", vcf.knownSites.size());
//for(auto &val : vcf.knownSites) {
// spdlog::info("intv {}, {}", val.left, val.right);
//}
}
//exit(0);
}
// 串行bqsr
int SerialBQSR() {
int round = 0;
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
// inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
int64_t readNumSum = 0;
// 0. 初始化一些全局数据
// BAQ baq{BAQ::DEFAULT_GOP};
// 1. 协变量数据相关初始化
PerReadCovariateMatrix readCovariates;
CovariateUtils::InitPerReadCovMat(readCovariates);
ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
// 2. 读取bam的read group
if (nsgv::gInBamHeader->hrecs->nrg == 0) {
spdlog::error("No RG tag found in the header!");
return 1;
}
for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
}
int test = 0;
while (1) {
++ round;
// 一. 读取bam数据
size_t readNum = 0;
if (inBamBuf.ReadStat() >= 0)
readNum = inBamBuf.ReadBam();
if (readNum < 1) {
break;
}
spdlog::info("{} reads processed in {} round", readNum, round);
auto bams = inBamBuf.GetBamArr();
spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end());
// 1. 获取bams数组覆盖的region范围
spdlog::info("{} reads processed in {} round, {}", readNum, round, test);
// 二. 遍历每个bamread记录进行处理
for (int i = 0; i < bams.size(); ++i) {
// 1. 对每个read需要检查cigar是否合法即没有两个连续的相同的cigar而且需要将首尾的deletion处理掉目前看好像没啥影响我们忽略这一步
// 2. 对质量分数长度跟碱基长度不匹配的read缺少的质量分数用默认值补齐先忽略后边有需要再处理
// 3. 如果bam文件之前做过bqsrtag中包含OQoriginnal quality原始质量分数检查用户参数里是否指定用原始质量分数进行bqsr如果是则将质量分数替换为OQ否则忽略OQ先忽略
// 4. 对read的两端进行检测去除hardclipadapter
BamWrap *bw = bams[i];
ReadAdditionData ad;
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
ad.cigar_end = bw->b->core.n_cigar;
if (ad.read_len <= 0) continue;
int adapter_boundary = bw->GetAdapterBoundary();
if (bw->IsAdapterInRead(adapter_boundary)) {
// adapter在read范围内
if (bw->GetReadNegativeStrandFlag()) { // 反链
clipByReferenceCoordinates(bw, -1, adapter_boundary, ad);
} else { // 正链
clipByReferenceCoordinates(bw, adapter_boundary, -1, ad);
}
}
ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
// 5. 然后再去除softclip部分
calculateSoftClip(bw, ad);
ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
if (ad.read_len <= 0) continue;
calculateRefOffset(bw, ad); // 计算ref_offset就是相对比对的position要将ref右移多少
calculateReadBases(bw, ad); // 计算clip处理之后剩余的碱基
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
// ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
// 6. 更新每个read的platform信息好像没啥用暂时忽略
vector<int> isSNP(ad.read_len, 0); // 该位置是否是SNP位置0不是1是
vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置0不是1是
vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置0不是1是
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
// 7. 计算baqArray
// BAQ = base alignment quality
// note for efficiency reasons we don't compute the BAQ array unless we actually have
// some error to marginalize over. For ILMN data ~85% of reads have no error
vector<int> baqArray;
bool baqCalculated = false;
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
baqCalculated = flatBAQArray(bw, ad, baqArray);
} else {
// baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, bw, ad, baqArray);
}
if (!baqCalculated) continue;
// 到这里基本的数据都准备好了后续就是进行bqsr的统计了
// 8. 计算这条read对应的协变量
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
test = readCovariates[1][0][0] + readCovariates[2][1][3];
int end_pos = bw->contig_end_pos();
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
// 9. 计算这条read需要跳过的位置
vector<bool> skip(ad.read_len, 0);
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr);
}
#if 0
// spdlog::info("region: {} - {}", bams[0]->global_softclip_start(), bams.back()->global_softclip_end());
// 1. 获取bams数组覆盖的region范围
// 如果读取的bam数组跨越了不同的染色体咋搞还是按照每个线程都有独立的vcf文件来做吧
int64_t region_start = bams[0]->global_softclip_start();
vector<Region> contig_bams;
int contig_id = bams[0]->contig_id();
int64_t start = 0, stop = 0;
while (true) {
stop = start;
while (stop < bams.size() && bams[stop]->contig_id() == contig_id) ++stop;
if (stop > start) contig_bams.push_back(Region{start, stop});
if (stop >= bams.size()) break;
contig_id = bams[stop]->contig_id();
start = stop;
}
spdlog::info("{}, {} contig regions", contig_id, contig_bams.size());
for (int i = 0; i < bams.size();) {
int64_t a1 = bams[i]->contig_pos();
int64_t b1 = bams[i]->contig_end_pos();
int64_t a = bams[i]->softclip_start();
int64_t b = bams[i]->softclip_end();
spdlog::info("{}: ({}, {}), ({}, {})", bams[i]->query_name(), a1, b1, a, b);
++i;
}
// 依次处理每个contig的bams
vector<uint32_t> bitmap(100, 0); // 用来表示known sites覆盖情况的bitmap
for (const auto& cr : contig_bams) {
spdlog::info(" contig id: {}, bam count: {}, bitmap size: {}", contig_id, cr.end - cr.start, bitmap.size());
// 当前处理的contig
int contig_id = bams[cr.start]->contig_id();
int64_t region_start = bams[cr.start]->softclip_start();
int64_t region_end = bams[cr.end - 1]->softclip_end();
if ((bitmap.size() << 5)) {
}
}
#endif
// 2. 开辟一个uint32_t的数组作为bitmap如果上一轮的不够就重开用来表示region的每个位点是否有known sites覆盖每轮使用前需清零
// 3. 读取在region范围内的所有known sites并为对应的bitmap设定0 or 1 (作为skip标识)
@ -87,14 +620,18 @@ int SerialBQSR() {
// 4. 遍历bams数组中的每一条记录并进行处理
readNumSum += readNum;
inBamBuf.ClearAll(); //
inBamBuf.ClearAll(); //
}
spdlog::info("read count: {}", readNumSum);
return 0;
}
// entrance of mark duplicates
// 需要支持vcf idxtbicsi三种索引方式
// vcf和idx是一对
// vcf.gz和tbi或csi是一对
// entrance of mark BQSR
int BaseRecalibrator() {
PROF_START(whole_process);
@ -107,6 +644,18 @@ int BaseRecalibrator() {
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
// 初始化AuxVar
nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
if (nsgv::gAuxVars[i].faidx == 0)
error("[%s] fail to load the fasta index.\n", __func__);
for (auto &vcfFileName : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
nsgv::gAuxVars[i].vcfArr.push_back(VCFParser(vcfFileName, nsgv::gInBamHeader));
}
}
// (libraryId)
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
@ -124,21 +673,21 @@ int BaseRecalibrator() {
return SerialBQSR();
// // 读取known sites vcfs
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
// spdlog::info(" {}", ks);
// bcf_srs_t* srs = bcf_sr_init();
// if (!bcf_sr_add_reader(srs, ks.c_str()))
// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
// nsgv::gKnownSitesVcfSrs.push_back(srs);
//
// while (bcf_sr_next_line(srs)) {
// bcf1_t* line = srs->readers[0].buffer[0];
// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
// }
// }
//
// /* 先实现串行的bqsr-phase-1 */
// 读取known sites vcfs
for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
spdlog::info(" {}", ks);
bcf_srs_t* srs = bcf_sr_init();
if (!bcf_sr_add_reader(srs, ks.c_str()))
error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
nsgv::gKnownSitesVcfSrs.push_back(srs);
while (bcf_sr_next_line(srs)) {
bcf1_t* line = srs->readers[0].buffer[0];
cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
}
}
/* 先实现串行的bqsr-phase-1 */

View File

@ -0,0 +1,317 @@
#include "covariate.h"
// for EventType
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
// static变量 for ContextCovariate
int ContextCovariate::mismatchesContextSize;
int ContextCovariate::indelsContextSize;
int ContextCovariate::mismatchesKeyMask;
int ContextCovariate::indelsKeyMask;
uint8_t ContextCovariate::lowQualTail;
int ContextCovariate::baseIndexMap[256];
// for ReadGroupCovariate
map<string, int> ReadGroupCovariate::RgToId; // read group name到id的映射
map<int, string> ReadGroupCovariate::IdToRg; // id到read group name的映射
// for cycleCovariate
int CycleCovariate::MAXIMUM_CYCLE_VALUE;
// for CovariateUtils
// 对一条read计算协变量该协变量被上一个read用过
void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
bool recordIndelValues) {
//ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
//BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
//ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
//CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
}
// ReadGroupCovariate 协变量的方法
void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
uint8_t *rgStr = bam_aux_get(bw->b, "RG");
char* rgVal = nullptr;
if (rgStr) rgVal = bam_aux2Z(rgStr);
int key = 0;
if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
spdlog::error("The RG tag value for read can not be found in header!");
} else {
key = RgToId[rgVal];
}
for (int i = 0; i < ad.read_len; ++i) {
CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
}
}
// BaseQualityCovariate 协变量的方法
void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
bool recordIndelValues) {
// 在前面的处理过后quals应该和base长度一致了
#define __bq_set_cov(ins, del) \
do { \
for (int i = 0; i < ad.read_len; ++i) { \
CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \
} \
} while (0)
const int INDEL_QUAL = 45;
uint8_t* quals = bam_get_qual(bw->b);
if (recordIndelValues) {
uint8_t* insQualPtr = bam_aux_get(bw->b, "BI"); // base qualities for insertions
uint8_t* delQualPtr = bam_aux_get(bw->b, "BD"); // base qualities for deletions
if (insQualPtr == nullptr && delQualPtr == nullptr) {
__bq_set_cov(INDEL_QUAL, INDEL_QUAL);
} else if (insQualPtr == nullptr) {
uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr);
__bq_set_cov(INDEL_QUAL, delQuals[i]);
} else {
uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr);
__bq_set_cov(insQuals[i], INDEL_QUAL);
}
} else {
__bq_set_cov(0, 0);
}
}
// ContextCovariate 协变量的方法
static char SimpleComplement(const char base) {
switch (base) {
case 'A':
case 'a':
return 'T';
case 'C':
case 'c':
return 'G';
case 'G':
case 'g':
return 'C';
case 'T':
case 't':
return 'A';
default:
return base;
}
}
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) {
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
if (bw->GetReadNegativeStrandFlag()) { // 反向互补
for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
}
// 处理左边
int left = 0;
for (; left < ad.read_len; ++left) {
if (quals[left] <= lowQTail)
clippedBases[left] = 'N';
else
break;
}
if (left == ad.read_len) {
clippedBases.clear();
return;
}
// 处理右边
int right = ad.read_len - 1;
for (; right >= 0; --right) {
if (quals[right] <= lowQTail)
clippedBases[right] = 'N';
else
break;
}
if (right < left)
clippedBases.clear();
}
/**
* Creates a int representation of a given dna string.
*
* @param dna the dna sequence
* @param start the start position in the byte array (inclusive)
* @param end the end position in the array (exclusive)
* @return the key representing the dna sequence
*/
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
int key = end - start;
int bitOffset = LENGTH_BITS;
for (int i = start; i < end; i++) {
const int baseIndex = baseIndexMap[dna[i] & 0xff];
if (baseIndex == -1) { // ignore non-ACGT bases
return -1;
}
key |= (baseIndex << bitOffset);
bitOffset += 2;
}
return key;
}
/**
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
*
* For example, for the read [AGCTG], return the list
* [-1, "AG", "GC", "CT", "TG" ]
* with each string context encoded as an integer.
*
* @param bases the bases in the read to build the context from
* @param contextSize context size to use building the context
* @param mask mask for pulling out just the context bits
*
* @return a list that has the same length as the read and contains the (preceding) n-base context at each position.
*
*/
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
int readLength = bases.size();
keys.resize(readLength);
int keyIdx = 0;
// the first contextSize-1 bases will not have enough previous context
for (int i = 1; i < contextSize && i <= readLength; i++) {
keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
}
if (readLength < contextSize)
return;
int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;
// get (and add) the key for the context starting at the first base
int currentKey = KeyFromContext(bases, 0, contextSize);
keys[keyIdx++] = currentKey;
// if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts it affects
int currentNPenalty = 0;
if (currentKey == -1) {
currentKey = 0;
currentNPenalty = contextSize - 1;
int offset = newBaseOffset;
int baseIndex;
while ((baseIndex = baseIndexMap[bases[currentNPenalty]]) != -1) {
currentKey |= (baseIndex << offset);
offset -= 2;
currentNPenalty--;
}
}
for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
const int baseIndex = baseIndexMap[bases[currentIndex]];
if (baseIndex == -1) { // ignore non-ACGT bases
currentNPenalty = contextSize;
currentKey = 0; // reset the key
} else {
// push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length
// in
currentKey = (currentKey >> 2) & mask;
currentKey |= (baseIndex << newBaseOffset);
currentKey |= contextSize;
}
if (currentNPenalty == 0) {
keys[keyIdx++] = currentKey;
} else {
currentNPenalty--;
keys[keyIdx++] = -1;
}
}
}
void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
const int originalReadLength = ad.read_len;
// store the original bases and then write Ns over low quality ones
string strandedClippedBases(ad.bases);
GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail);
// spdlog::info("bases: {}", strandedClippedBases);
vector<int> nBasePairContextAtEachCycle;
GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
const int readLengthAfterClipping = strandedClippedBases.size();
// this is necessary to ensure that we don't keep historical data in the ReadCovariates values
// since the context covariate may not span the entire set of values in read covariates
// due to the clipping of the low quality bases
if (readLengthAfterClipping != originalReadLength) {
// don't bother zeroing out if we are going to overwrite the whole array
for (int i = 0; i < originalReadLength; i++) {
// this base has been clipped off, so zero out the covariate values here
CovariateUtils::SetCovariate(0, 0, 0, i, ContextCovariate::index, values);
}
}
const bool negativeStrand = bw->GetReadNegativeStrandFlag();
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration
if (recordIndelValues) {
vector<int> indelKeys;
GetReadContextAtEachPosition(strandedClippedBases, indelsContextSize, indelsKeyMask, indelKeys);
for (int i = 0; i < readLengthAfterClipping; i++) {
const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping);
const int indelKey = indelKeys[i];
CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], indelKey, indelKey, readOffset, ContextCovariate::index, values);
}
} else {
for (int i = 0; i < readLengthAfterClipping; i++) {
const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping);
CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], 0, 0, readOffset, ContextCovariate::index, values);
}
}
}
// CycleCovariate 协变量的方法
/**
* Computes the encoded value of CycleCovariate's key for the given position at the read.
* Uses keyFromCycle to do the encoding.
* @param baseNumber index of the base to compute the key for
* @param read the read
* @param indel is this an indel key or a substitution key?
* @param maxCycle max value of the base to compute the key for
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
*/
int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) {
const bool isNegStrand = bw->GetReadNegativeStrandFlag();
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
const int readLength = ad.read_len;
const int readOrderFactor = isSecondInPair ? -1 : 1;
int increment;
int cycle;
if (isNegStrand) {
cycle = readLength * readOrderFactor;
increment = -1 * readOrderFactor;
} else {
cycle = readOrderFactor;
increment = readOrderFactor;
}
cycle += baseNumber * increment;
if (!indel) {
return CycleCovariate::KeyFromCycle(cycle, maxCycle);
}
const int maxCycleForIndels = readLength - CUSHION_FOR_INDELS - 1;
if (baseNumber < CUSHION_FOR_INDELS || baseNumber > maxCycleForIndels) {
return -1;
} else {
return CycleCovariate::KeyFromCycle(cycle, maxCycle);
}
}
// Used to pick out the covariate's value from attributes of the read
void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
const int readLength = ad.read_len;
// Note: duplicate the loop to void checking recordIndelValues on every iteration
if (recordIndelValues) {
for (int i = 0; i < readLength; i++) {
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE);
CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
}
} else {
for (int i = 0; i < readLength; i++) {
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
}
}
}

View File

@ -0,0 +1,209 @@
/*
Description: bqsr
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/08
*/
#pragma once
#include <spdlog/spdlog.h>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>
#include "bqsr_args.h"
#include "util/bam_wrap.h"
using std::map;
using std::string;
using std::vector;
/**
* This is where we store the pre-read covariates, also indexed by (event type) and (read position).
* Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
* For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
*/
typedef vector<vector<vector<int>>> PerReadCovariateMatrix;
// 变异类型snp, insert, deletion
struct EventTypeValue {
int index; // 在协变量数组中对应的索引
char representation;
string longRepresentation;
};
struct EventType {
static constexpr int EVENT_SIZE = 3;
static EventTypeValue BASE_SUBSTITUTION;
static EventTypeValue BASE_INSERTION;
static EventTypeValue BASE_DELETION;
};
// 协变量相关的工具类
struct CovariateUtils {
static constexpr int MAX_READ_LENGTH = 300; // 最大read长度
static constexpr int NUM_COVARIATES = 4;
// 初始化PerReadCovariateMatrix
static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
matrix.resize(EventType::EVENT_SIZE);
for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
matrix[event_type].resize(MAX_READ_LENGTH);
for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) {
matrix[event_type][pos].resize(NUM_COVARIATES, 0);
}
}
}
// 设置协变量
static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) {
matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch;
matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion;
matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion;
}
// 对一条read计算协变量该协变量被上一个read用过
static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Read group协变量
struct ReadGroupCovariate {
static constexpr int index = 0; // 在协变量数组中的索引位置
static map<string, int> RgToId; // read group name到id的映射
static map<int, string> IdToRg; // id到read group name的映射
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Base quality协变量
struct BaseQualityCovariate {
static constexpr int index = 1; // 在协变量数组中的索引位置
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Context协变量
struct ContextCovariate {
static constexpr int index = 2; // 在协变量数组中的索引位置
static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
static constexpr int LENGTH_BITS = 4;
static constexpr int LENGTH_MASK = 15;
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
static constexpr int MAX_DNA_CONTEXT = 13;
static int mismatchesContextSize;
static int indelsContextSize;
static int mismatchesKeyMask;
static int indelsKeyMask;
static uint8_t lowQualTail;
static int baseIndexMap[256];
static void InitContextCovariate(BQSRArg& p) {
mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
indelsContextSize = p.INDELS_CONTEXT_SIZE;
if (mismatchesContextSize > MAX_DNA_CONTEXT) {
spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
exit(1);
}
if (indelsContextSize > MAX_DNA_CONTEXT) {
spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
exit(1);
}
lowQualTail = p.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
exit(1);
}
mismatchesKeyMask = CreateMask(mismatchesContextSize);
indelsKeyMask = CreateMask(indelsContextSize);
// init baseIndexMap
for (int i = 0; i < 256; ++i) {
baseIndexMap[i] = -1;
}
baseIndexMap['A'] = 0;
baseIndexMap['a'] = 0;
baseIndexMap['*'] = 0;
baseIndexMap['C'] = 1;
baseIndexMap['c'] = 1;
baseIndexMap['G'] = 2;
baseIndexMap['g'] = 2;
baseIndexMap['T'] = 3;
baseIndexMap['t'] = 3;
}
static int CreateMask(int contextSize) {
int mask = 0;
// create 2*contextSize worth of bits
for (int i = 0; i < contextSize; i++) {
mask = (mask << 2) | 3;
}
// shift 4 bits to mask out the bits used to encode the length
return mask << LENGTH_BITS;
}
/**
* Helper method: computes the correct offset to use in computations of covariate values.
* @param isNegativeStrand is the read on the negative strand
* @param offset 0-based index of the base in the read
* @param readLength length of the read
* @return
*/
static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
return isNegativeStrand ? (readLength - offset - 1) : offset;
}
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail);
// Creates a int representation of a given dna string.
static int KeyFromContext(const string& dna, const int start, const int end);
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
// 设置协变量的值
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};
// Cycle协变量
struct CycleCovariate {
static constexpr int index = 3; // 在协变量数组中的索引位置
static int MAXIMUM_CYCLE_VALUE;
static constexpr int CUSHION_FOR_INDELS = 4;
static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
/**
* Encodes the cycle number as a key.
*/
static int KeyFromCycle(const int cycle, const int maxCycle) {
// no negative values because values must fit into the first few bits of the long
int result = std::abs(cycle);
if (result > maxCycle) {
spdlog::error(
"The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument "
"(when creating the recalibration table in "
"BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
maxCycle, result);
exit(1);
}
result <<= 1; // shift so we can add the "sign" bit
if (cycle < 0) {
result++; // negative cycles get the lower-most bit set
}
return result;
}
// Computes the encoded value of CycleCovariate's key for the given position at the read.
static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle);
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
};

View File

@ -78,6 +78,12 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
.nargs(1)
.metavar("<IndexFormat>");
program.add_argument("--enable-baq")
.help("Whether to do BAQ correction.")
.default_value(false)
.implicit_value(true)
.hidden();
// add help and version args
program.add_argument("-h", "--help")
.action([&](const auto & /*unused*/) {
@ -113,8 +119,9 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
// spdlog::info("known sites vcf files:");
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
// spdlog::info(" {}", ks);

View File

@ -1,5 +1,5 @@
/*
Description: sam/bambuf
Description: sam/bambuf
Copyright : All right reserved by ICT
@ -10,25 +10,28 @@
#include "bam_buf.h"
/*
* BamBuf
* BamBuf
*/
//
// 读取数据直到读完或者缓冲区满
int BamBuf::ReadBam() {
int read_num = 0;
if (handle_last) { // bam
if (has_enough_space()) { // memffset
if (handle_last) { // 处理上次读入的最后一个bam
if (has_enough_space()) { // 必须调用在边界处调整memffset
++read_num;
append_one_bam();
} else {
return read_num; //
return read_num; // 还是没空间
}
}
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
if (has_enough_space()) { //
// if (true) { //
append_one_bam();
++read_num; //
if (has_enough_space()) { // 还有空间
// if (true) { // 还有空间
// 加过滤器
if (filter_out == nullptr || !filter_out(bw->b)) {
append_one_bam();
++read_num; // 放进缓存才算读取到
}
} else {
break;
}
@ -41,7 +44,7 @@ int BamBuf::ReadBam() {
return read_num;
}
//
// 初始化缓存
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
this->fp = fp;
this->hdr = hdr;
@ -71,9 +74,9 @@ void BamBuf::ClearAll() {
prepare_read();
}
// ,
// 为下一次读取做准备, 计算一些边界条件
inline void BamBuf::prepare_read() {
// bam
// 计算余留的下次计算可能用到的bam所占的位置
if (bv.size() > 0) {
BamWrap *bw = bv[0];
legacy_start = (int64_t)bw - (int64_t)mem;
@ -81,11 +84,11 @@ inline void BamBuf::prepare_read() {
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
} else {
legacy_start = legacy_end = 0;
mem_offset = 0; //
mem_offset = 0; // 上次没剩下那就从头存储
}
}
//
// 检查缓存是否还有空间
inline bool BamBuf::has_enough_space() {
const uint32_t bam_len = bw->length();
int64_t potential_end = mem_offset + bam_len;
@ -104,7 +107,7 @@ inline bool BamBuf::has_enough_space() {
return potential_end < legacy_start;
}
// bam
// 处理一个读取后的bam
inline void BamBuf::append_one_bam() {
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
*bwp = *bw;
@ -113,15 +116,15 @@ inline void BamBuf::append_one_bam() {
*bp = *bw->b;
bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t));
memcpy(bp->data, bw->b->data, bw->b->l_data);
//
// 更新下次存储的位置
mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1));
bv.push_back(bwp);
}
// read
// 处理上次读入的最后一个read
inline bool BamBuf::handle_last_read() {
if (handle_last) { // bam
if (has_enough_space()) { // memffset
if (handle_last) { // 处理上次读入的最后一个bam
if (has_enough_space()) { // 必须调用在边界处调整memffset
append_one_bam();
handle_last = false;
return true;
@ -131,9 +134,9 @@ inline bool BamBuf::handle_last_read() {
}
/*
* AsyncIoBamBuf
* AsyncIoBamBuf
*/
//
// 初始化缓存
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
if (use_async_io_) {
buf1_.Init(fp, hdr, mem_size >> 1);
@ -147,7 +150,7 @@ void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
}
}
//
// 读取数据
int AsyncIoBamBuf::ReadBam() {
if (use_async_io_) {
hasThread = true;
@ -178,11 +181,11 @@ int AsyncIoBamBuf::async_read_bam() {
first_read_ = false;
refresh_bam_arr();
} else {
// join,
// join, 交换缓冲区指针
pthread_join(*tid_, 0);
resize_buf();
if (need_read_) { //
if (need_read_) { // 需要交换指针
BamBuf *tmp = pi_;
pi_ = po_;
po_ = tmp;
@ -190,14 +193,14 @@ int AsyncIoBamBuf::async_read_bam() {
read_num = last_read_num_;
refresh_bam_arr();
}
//
// 异步读
pthread_create(tid_, 0, async_read, this);
return read_num;
}
void *AsyncIoBamBuf::async_read(void *data) {
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
if (ab->need_read_ && ab->ReadStat() >= 0) { //
if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取
ab->last_read_num_ = ab->po_->ReadBam();
} else {
ab->last_read_num_ = 0;
@ -205,23 +208,23 @@ void *AsyncIoBamBuf::async_read(void *data) {
pthread_exit(0);
}
// ,
// po_buf
// 为下一次读取做准备,
// 计算一些边界条件延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; }
// po_buf
// 清空上一次所有读入的数据延迟操作因为此时可能po_对应的buf正在读取
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
inline void AsyncIoBamBuf::resize_buf() {
if (clear_all_) { //
if (clear_all_) { // 清理上一轮的数据
clear_all_ = false;
po_->ClearBeforeIdx(legacy_size_);
pi_->ClearAll();
if (pi_->handle_last_read()) { // read
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
last_read_num_ += 1;
legacy_size_ = pi_->Size(); // read
legacy_size_ = pi_->Size(); // 应该只有一个read
need_read_ = true;
} else { //
} else { // 没空间存放则不交换指针或者文件已经读取完毕
legacy_size_ = 0;
need_read_ = false;
}
@ -229,16 +232,16 @@ inline void AsyncIoBamBuf::resize_buf() {
if (clear_before_idx_ < legacy_size_) {
po_->ClearBeforeIdx(clear_before_idx_);
legacy_size_ -= clear_before_idx_;
//
// 不需要交换指针不需要读取
need_read_ = false;
} else {
po_->ClearBeforeIdx(legacy_size_);
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
if (pi_->handle_last_read()) { // read
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
last_read_num_ += 1;
legacy_size_ = pi_->Size(); // read
legacy_size_ = pi_->Size(); // 应该只有一个read
need_read_ = true;
} else { //
} else { // 没空间存放则不交换指针或者文件已经读取完毕
legacy_size_ = 0;
need_read_ = false;
}

View File

@ -1,5 +1,5 @@
/*
Description: sam/bambuf
Description: sam/bambuf
Copyright : All right reserved by ICT
@ -26,33 +26,40 @@
using std::vector;
using namespace std;
using ReadFilterOut = bool (*)(const bam1_t *b);
/*
* bam
* bam
*/
struct BamBuf {
sam_hdr_t *hdr; // samheader
samFile *fp; // sam
BamWrap *bw = nullptr; // bam
uint8_t *mem = nullptr; // bam,
//
int64_t mem_offset = 0; //
int64_t mem_size; //
int read_stat_ = 0; //
vector<BamWrap *> bv; // bam
int64_t legacy_start = 0; // bammem,
int64_t legacy_end = 0; // bammem,
bool handle_last = false; // bam
sam_hdr_t *hdr; // sam文件的header信息
samFile *fp; // sam文件指针
BamWrap *bw = nullptr; // 用来循环读入bam
uint8_t *mem = nullptr; // 用来存放bam的数据,
// 程序结束后自动释放,所以没在析构函数里释放
int64_t mem_offset = 0; // 下一次要存放的位置
int64_t mem_size; // 缓存大小
int read_stat_ = 0; // 读取状态,是否读完
vector<BamWrap *> bv; // 方便对bam数据的访问
int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间
int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间
bool handle_last = false; // 上次最后读入的bam是否需要处理
ReadFilterOut filter_out = nullptr; // 读入过滤函数指针
//
// 初始化缓存
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
//
void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) {
this->filter_out = filter;
Init(fp, hdr, mem_size);
}
// 读取数据直到读完,或者缓冲区满
int ReadBam();
// ,
// 为下一次读取做准备, 计算一些边界条件
void ClearBeforeIdx(size_t idxInBv);
//
// 清空上一次所有读入的数据
void ClearAll();
inline int64_t Size() { return bv.size(); } // read
inline int ReadStat() { return read_stat_; } //
inline int64_t Size() { return bv.size(); } // 包含多少个read
inline int ReadStat() { return read_stat_; } // 文件的读取状态是否可读读取完全
~BamBuf() {
if (this->mem != nullptr) {
free(this->mem);
@ -62,7 +69,7 @@ struct BamBuf {
free(bw);
}
}
void FreeMemory() //
void FreeMemory() // 释放开辟的内存
{
if (this->mem != nullptr) {
free(this->mem);
@ -75,14 +82,14 @@ struct BamBuf {
this->bw = nullptr;
}
void prepare_read();
//
// 检查缓存是否还有空间
bool has_enough_space();
// bam
// 处理一个读取后的bam
void append_one_bam();
// read
// 处理上次读入的最后一个read
bool handle_last_read();
// bv
// 针对bv的操作
inline BamWrap *operator[](int64_t pos) { return bv[pos]; }
inline void push_back(BamWrap *val) { bv.push_back(val); }
inline void clear() { bv.clear(); }
@ -90,53 +97,57 @@ struct BamBuf {
};
/*
* io
* io
*/
struct AsyncIoBamBuf {
BamBuf buf1_;
BamBuf buf2_;
BamBuf *pi_; // buf
BamBuf *po_; // buf
BamBuf *pi_; // 当前用的buf
BamBuf *po_; // 后台在读取的buf
pthread_t *tid_ = NULL;
bool hasThread = false;
int64_t legacy_size_ = 0; // read
int64_t legacy_size_ = 0; // 上一轮运算之后缓存中还剩余的上次读取的read数量
bool first_read_ = true;
int last_read_num_ = 0; // reads
int last_read_num_ = 0; // 上一次读取了多少reads
bool need_read_ = true;
bool use_async_io_ = true;
int64_t clear_before_idx_ = 0; // clear_before_idx_reads
bool clear_all_ = false; // reads
int64_t clear_before_idx_ = 0; // 用户异步读取下一轮读取之前清理掉clear_before_idx_之前的所有reads
bool clear_all_ = false; // 用于异步读取下一轮读取之前清理掉之前的所有reads
vector<BamWrap *> bam_arr_; // bufbam
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
AsyncIoBamBuf() {}
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
//
// 析构
~AsyncIoBamBuf() {
if (tid_ != NULL) {
if (hasThread)
pthread_join(*tid_, 0);
free(tid_);
}
//
// buf
// 其他的内存就等程序结束自动释放
// buf的析构函数会自动调用
}
//
// 初始化缓存
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
//
void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) {
Init(fp, hdr, mem_size);
buf1_.filter_out = filter;
buf2_.filter_out = filter;
}
// 读取数据
int ReadBam();
// ,
// 为下一次读取做准备, 计算一些边界条件
void ClearBeforeIdx(size_t idxInBv);
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // bam array
//
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // 获取bam array
// 清空上一次所有读入的数据
void ClearAll();
// read
// 包含的read数量
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
inline int ReadStat() { return pi_->read_stat_; }
inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
// reads
// 获取某一段reads
inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
if (endIdx > startIdx) {
auto begItr = bam_arr_.begin();
@ -149,11 +160,11 @@ struct AsyncIoBamBuf {
buf2_.FreeMemory();
}
//
// 同步读取
int sync_read_bam();
//
// 异步读取
int async_read_bam();
//
// 异步读取线程函数
static void *async_read(void *data);
void resize_buf();
inline void refresh_bam_arr() {

View File

@ -1,5 +1,5 @@
/*
Description: sam/bambuf
Description: sam/bambuf
Copyright : All right reserved by ICT
@ -19,38 +19,55 @@
using namespace std;
// 对原始bam数据的补充比如对两端进行hardclip等
struct ReadAdditionData {
int read_len = 0; // read长度各种clip之后的长度
int cigar_start = 0; // cigar起始位置闭区间
int cigar_end = 0; // cigar结束位置开区间
int first_cigar_clip = 0; // 第一个cigar, clip的数量切左侧
int last_cigar_clip = 0; // 最后一个cigar, clip的数量切右侧
int left_clip = 0; // 左侧被切掉的碱基长度
int right_clip = 0; // 右侧被切掉的碱基长度
int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响)相对原始ref比对位置contig_pos的偏移量
string bases; // 处理之后的read的碱基
};
/*
bam
线
bam
*/
/*
* sam read
* sam read
*/
struct BamWrap {
// contigpos
const static int MAX_CONTIG_LEN_SHIFT = 40; // id
// contig左移后加上pos作为全局位置
const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位和位点拼合在一起
const static int READ_MAX_LENGTH = 200;
const static int READ_MAX_DEPTH = 1000; //
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的深度大于这个值也没关系
//
// 成员变量尽量少减少占用内存空间
bam1_t *b;
int64_t end_pos_; // bam,
int64_t end_pos_; // bam的全局结束位置, 相对ref, 闭区间
//
// 全局开始位置
inline int64_t start_pos() { return bam_global_pos(b); }
//
// 全局结束位置
inline int64_t end_pos() { return end_pos_; }
// reference
// reference对应的序列长度不是read包含碱基的个数
inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
// contig
// contig id
inline int32_t contig_id() { return b->core.tid; }
// 在contig内的开始位置
inline int32_t contig_pos() { return b->core.pos; }
// contig
// contig内部的结束位置
inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
// AGTC
// 序列的长度AGTC字母个数
inline int16_t seq_len() { return b->core.l_qseq; }
// softclip
/*
// 算上开头的softclip
inline int32_t softclip_start() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
@ -61,18 +78,28 @@ struct BamWrap {
return bc.pos;
}
// softclip
inline int64_t global_softclip_start() {
return softclip_start() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
}
// 算上结尾的softclip闭区间
inline int32_t softclip_end() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]);
const int idx = bc.n_cigar - 1;
if (idx < 0) return bam_pos(end_pos_);
const char c = bam_cigar_opchr(cigar[idx]);
const int len = bam_cigar_oplen(cigar[idx]);
if (c == 'S')
return bam_pos(end_pos_) + len;
return bam_pos(end_pos_);
}
// softclip
inline int64_t global_softclip_end() {
return softclip_end() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
}
// 右边softclip的长度
inline int32_t right_softclip_len() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
@ -82,8 +109,9 @@ struct BamWrap {
return len;
return 0;
}
*/
//
// 获取序列
inline std::string sequence() {
ostringstream oss;
char *seq = (char *)bam_get_seq(b);
@ -96,9 +124,9 @@ struct BamWrap {
return std::move(oss.str());
}
//
// 获取名字
inline const char *query_name() { return bam_get_qname(b); }
// cigar
// 获取cigar 字符串
inline string cigar_str() {
ostringstream oss;
const uint32_t *cigar = bam_get_cigar(b);
@ -111,10 +139,10 @@ struct BamWrap {
return std::move(oss.str());
}
//
// 占用的内存大小
inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; }
// cigarinsert
// 获取cigarinsert的总长度
inline int32_t insert_cigar_len() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
@ -128,7 +156,7 @@ struct BamWrap {
return ret;
}
// cigardelete
// 获取cigardelete的总长度
inline int32_t del_cigar_len() {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
@ -142,7 +170,7 @@ struct BamWrap {
return ret;
}
// sam read
// 计算sam read的终点位置,相对参考基因组
static inline int64_t BamEndPos(const bam1_t *b) {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
@ -156,6 +184,20 @@ struct BamWrap {
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
};
// 计算read的有效长度即除了softclip和hardclip之外的长度
static inline int BamEffectiveLength(const bam1_t *b) {
const uint32_t *cigar = bam_get_cigar(b);
const bam1_core_t &bc = b->core;
int effective_len = 0;
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'I' || c == 'N' || c == 'M' || c == '=' || c == 'X')
effective_len += len;
}
return effective_len;
};
bool HasWellDefinedFragmentSize() {
const bam1_core_t &bc = b->core;
bool hasWellDefinedFragmentSize = true;
@ -170,20 +212,25 @@ struct BamWrap {
return hasWellDefinedFragmentSize;
}
// bamadapterBoundary
// 计算bamadapterBoundary
int GetAdapterBoundary() {
const bam1_core_t &bc = b->core;
int adapterBoundary;
int adapterBoundary = INT_MIN;
if (!HasWellDefinedFragmentSize())
adapterBoundary = INT_MIN;
else if (bc.flag & BAM_FREVERSE)
adapterBoundary = bc.mpos - 1;
else
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 GATK3.53.5+1
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 GATK3.5不一样3.5的这里+1
return adapterBoundary;
}
// I
// 检测adapter boundary是否在read范围内
bool IsAdapterInRead(int adapterBoundary) {
return (adapterBoundary != INT_MIN && (adapterBoundary >= contig_pos() && adapterBoundary <= contig_end_pos()));
}
// 获取开头的I的长度
inline int GetHeadInsertLen() {
int insLen = 0;
const uint32_t *cigar = bam_get_cigar(b);
@ -200,8 +247,8 @@ struct BamWrap {
return insLen;
}
// soft clip(HS,
// IS)
// 获取soft clip开始位置(能处理HS相连的情况有这种情况么,
// 注意开头的I要当做S)
inline int64_t GetSoftStart() {
int64_t softStart = b->core.pos;
const uint32_t *cigar = bam_get_cigar(b);
@ -209,7 +256,8 @@ struct BamWrap {
for (int i = 0; i < bc.n_cigar; ++i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
if (c == 'S' || c == 'I')
// if (c == 'S' || c == 'I')
if (c == 'S')
softStart -= len;
else if (c != 'H')
break;
@ -217,7 +265,36 @@ struct BamWrap {
return softStart;
}
// unclipped(hardclip)
/**
* Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
*
* Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
*
* @return the unclipped end of the read taking soft clips (but not hard clips) into account
*/
inline int64_t GetSoftEnd() {
int64_t softEnd = contig_end_pos();
const uint32_t* cigar = bam_get_cigar(b);
const bam1_core_t& bc = b->core;
bool foundAlignedBase = false;
for (int i = bc.n_cigar - 1; i >= 0; --i) {
const char c = bam_cigar_opchr(cigar[i]);
const int len = bam_cigar_oplen(cigar[i]);
// if (c == 'S' || c == 'I')
if (c == 'S')
softEnd += len;
else if (c != 'H') {
foundAlignedBase = true;
break;
}
}
if (!foundAlignedBase) { // for example 64H14S, the soft end is actually the same as the alignment end
softEnd = contig_end_pos();
}
return softEnd;
}
// 获取unclipped开始位置(包括hardclip)
inline int64_t GetUnclippedStart() {
int64_t start = b->core.pos;
const uint32_t *cigar = bam_get_cigar(b);
@ -233,7 +310,7 @@ struct BamWrap {
return start;
}
// unclipped(hardclip)
// 获取unclipped结束位置(包括hardclip)
inline int64_t GetUnclippedEnd() {
int64_t end_pos = bam_endpos(b);
const uint32_t *cigar = bam_get_cigar(b);
@ -249,7 +326,7 @@ struct BamWrap {
return end_pos - 1;
}
/* */
/* 获取碱基质量分数的加和 */
/** Calculates a score for the read which is the sum of scores over Q15. */
inline int GetSumOfBaseQualities() {
int score = 0;
@ -262,9 +339,9 @@ struct BamWrap {
return score;
}
/* flag */
/* flag相关的检测 */
/* unmapped */
/* 没有比对上 unmapped */
inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
/* Template having multiple segments in sequencing */
@ -313,7 +390,7 @@ struct BamWrap {
*/
bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
/* */
/* 其他的一些信息 */
inline int GetReferenceLength() {
int length = 0;
const uint32_t *cigar = bam_get_cigar(b);
@ -336,26 +413,26 @@ struct BamWrap {
return length;
}
// bam
// 计算bam的全局位置算上染色体序号和比对位置
static inline int64_t bam_global_pos(bam1_t *b) {
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
}
static inline int64_t bam_global_pos(int tid, int pos) {
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
}
// bam
// 根据全局位置获取bam的染色体序号
static inline int32_t bam_tid(int64_t global_pos) {
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
const int64_t high_tid = global_pos & mask;
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
}
// bam()
// 根据全局位置获取bam的比对位置(染色体内)
static inline int32_t bam_pos(int64_t global_pos) {
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
return (int32_t)(global_pos & mask);
}
//
// 设置是否冗余的标记
void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }
void setFlag(bool flag, int bit) {

View File

@ -0,0 +1,131 @@
#pragma once
#include <stdint.h>
#include <stdlib.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using std::ifstream;
using std::ofstream;
using std::ostringstream;
using std::string;
using std::vector;
using namespace std;
// 二进制读写相关
struct BinaryUtil {
static void WriteInt(ofstream& out, int val) {
uint32_t i = (uint32_t)val;
out << (char)(i & 0xFF) << (char)((i >> 8) & 0xFF) << (char)((i >> 16) & 0xFF) << (char)((i >> 24) & 0xFF);
};
static void WriteLong(ofstream& out, uint64_t val) {
uint64_t i = val;
out << (char)(i & 0xFF) << (char)((i >> 8) & 0xFF) << (char)((i >> 16) & 0xFF) << (char)((i >> 24) & 0xFF) << (char)((i >> 32) & 0xFF)
<< (char)((i >> 40) & 0xFF) << (char)((i >> 48) & 0xFF) << (char)((i >> 56) & 0xFF);
};
static void WriteStr(ofstream& out, const string& s) {
for (int i = 0; i < s.size(); ++i) out << s[i];
out << (char)0;
}
static bool ReadInt(char* buf, uint64_t total, uint64_t* cur, int* res) {
char b1, b2, b3, b4;
if (*cur + 4 > total)
return false;
b1 = buf[(*cur)++];
b2 = buf[(*cur)++];
b3 = buf[(*cur)++];
b4 = buf[(*cur)++];
*res = (((uint32_t)(uint8_t)b4) << 24) + (((uint32_t)(uint8_t)b3) << 16) + (((uint32_t)(uint8_t)b2) << 8) + (((uint32_t)(uint8_t)b1));
return true;
};
static bool ReadInt(ifstream& ifs, int* res) {
// if(ifs.read((char*)res, sizeof(*res))) return true;
char b[4];
if (!ifs.read(&b[0], 1))
return false;
if (!ifs.read(&b[1], 1))
return false;
if (!ifs.read(&b[2], 1))
return false;
if (!ifs.read(&b[3], 1))
return false;
uint64_t cur = 0;
return ReadInt((char*)b, 4, &cur, res);
}
static bool ReadLong(char* buf, uint64_t total, uint64_t* cur, uint64_t* res) {
char b1, b2, b3, b4, b5, b6, b7, b8;
if (*cur + 8 > total)
return false;
b1 = buf[(*cur)++];
b2 = buf[(*cur)++];
b3 = buf[(*cur)++];
b4 = buf[(*cur)++];
b5 = buf[(*cur)++];
b6 = buf[(*cur)++];
b7 = buf[(*cur)++];
b8 = buf[(*cur)++];
*res = (((uint64_t)(uint8_t)b8) << 56) + (((uint64_t)(uint8_t)b7) << 48) + (((uint64_t)(uint8_t)b6) << 40) + (((uint64_t)(uint8_t)b5) << 32) +
(((uint64_t)(uint8_t)b4) << 24) + (((uint64_t)(uint8_t)b3) << 16) + (((uint64_t)(uint8_t)b2) << 8) + (((uint64_t)(uint8_t)b1));
return true;
};
static bool ReadLong(ifstream& ifs, uint64_t* res) {
// if(ifs.read((char*)res, sizeof(*res))) return true;
char b[8];
if (!ifs.read(&b[0], 1))
return false;
if (!ifs.read(&b[1], 1))
return false;
if (!ifs.read(&b[2], 1))
return false;
if (!ifs.read(&b[3], 1))
return false;
if (!ifs.read(&b[4], 1))
return false;
if (!ifs.read(&b[5], 1))
return false;
if (!ifs.read(&b[6], 1))
return false;
if (!ifs.read(&b[7], 1))
return false;
uint64_t cur = 0;
return ReadLong((char*)b, 8, &cur, res);
}
static bool ReadStr(ifstream& ifs, string* res) {
char b;
res->clear();
if (!ifs.read(&b, 1))
return false;
while ((int)b != 0) {
res->push_back(b);
if (!ifs.read(&b, 1))
return false;
}
return true;
}
static bool ReadStr(char* buf, uint64_t total, uint64_t* cur, string* res) {
char b;
res->clear();
if (*cur >= total)
return false;
b = buf[(*cur)++];
while ((int)b != 0) {
res->push_back(b);
if (*cur >= total)
return false;
b = buf[(*cur)++];
}
return true;
}
};

View File

@ -0,0 +1,295 @@
/*
Description: intervals
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/24
*/
#include "interval.h"
#include <algorithm>
#include <sstream>
#include <fstream>
#include <string>
#include <iostream>
#include <htslib/sam.h>
#include "utils.h"
#include "bam_wrap.h"
using std::min;
using std::max;
using std::string;
using std::ifstream;
using std::stringstream;
using namespace std;
// 构造函数
Interval::Interval() : Interval(0, 0) {}
Interval::Interval(int64_t l, int64_t r) : left(l), right(r) {}
// 比较函数
bool Interval::operator<(const Interval& other) {
if (left == other.left) {
return right < other.right;
}
return left < other.left;
}
// 是否有重叠
bool Interval::overlaps(const Interval &other) {
return left <= other.right && right >= other.left;
}
// 两个interval的合并
Interval& Interval::spanWith(const Interval &other) {
left = min(left, other.left);
right = max(right, other.right);
return *this;
}
// 返回两个interval的交集不改变当前interval
Interval Interval::intersect(const Interval &that) const {
Interval val;
val.left = max(left, that.left);
val.right = min(right, that.right);
return val;
}
/*
* interval arr, interval arr
*/
void Interval::IntersectIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr) {
if (a_arr.size() < 1 || b_arr.size() < 1) return;
int ai=0, bi=0;
const Interval *last, *cur;
if (a_arr[ai].left < b_arr[bi].left) last = &a_arr[ai++];
else last = &b_arr[bi++];
while (ai < a_arr.size() && bi < b_arr.size()) {
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
else cur = &b_arr[bi++];
if (last->right < cur->left) {
last = cur; continue;
} else if (last->right > cur->right) {
r_arr->push_back(*cur);
} else {
r_arr->push_back(Interval(cur->left, last->right));
last = cur;
}
}
const IntervalArr *arrp;
int ii;
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai;}
else { arrp = &b_arr; ii = bi; }
const IntervalArr &arr = *arrp;
while(ii < arr.size()) {
cur = &arr[ii++];
if (last->right < cur->left) {
break;
} else if (last->right > cur->right) {
r_arr->push_back(*cur);
} else {
r_arr->push_back(Interval(cur->left, last->right));
break;
}
}
}
/*
* interval arr
*/
void Interval::UnionIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr) {
Interval tmp;
const Interval *cur;
Interval *last;
int ai=0, bi=0;
if (a_arr.size() < 1) { *r_arr = b_arr; return; }
if (b_arr.size() < 1) { *r_arr = a_arr; return; }
r_arr->clear();
if (a_arr[ai].left < b_arr[bi].left) tmp = a_arr[ai++];
else tmp = b_arr[bi++];
last = &tmp;
while(ai < a_arr.size() && bi < b_arr.size()) {
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
else cur = &b_arr[bi++];
if (last->right < cur->left) {
r_arr->push_back(*last);
*last = *cur;
} else {
last->right = max(last->right, cur->right);
}
}
const IntervalArr *arrp;
int ii;
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai; }
else { arrp = &b_arr; ii = bi; }
const IntervalArr &arr = *arrp;
while(ii < arr.size()) {
cur = &arr[ii++];
if (last->right < cur->left) {
r_arr->push_back(*last);
*last = *cur;
} else {
last->right = max(last->right, cur->right);
}
}
r_arr->push_back(*last);
}
/*
* readinterval
*/
int64_t Interval::MergeIntervals(const IntervalArr &n_arr,
const IntervalArr &t_arr,
IntervalArr &in_arr,
int64_t start_loc, // 闭区间
int64_t *end_loc, // 开区间
IntervalArr *r_arr) {
IntervalArr tmp_arr;
const int64_t end_loc_val = *end_loc;
if (in_arr.size() < 1) { // 如果输入的interval为空则使用tumor normal覆盖的interval
UnionIntervals(n_arr, t_arr, &tmp_arr);
} else {
IntervalArr mid_arr;
UnionIntervals(n_arr, t_arr, &mid_arr);
IntersectIntervals(mid_arr, in_arr, &tmp_arr);
}
for(int i=tmp_arr.size()-1; i>=0; --i) {
if (tmp_arr[i].left >= end_loc_val) {
tmp_arr.pop_back(); // 删除该元素
continue;
}
tmp_arr[i].right = min(tmp_arr[i].right, end_loc_val - 1); // end_loc是开区间
break;
}
for (int i=0; i<tmp_arr.size(); ++i) {
if (tmp_arr[i].right < start_loc) {
continue;
}
if (tmp_arr[i].left < start_loc) {
r_arr->push_back(Interval(start_loc, tmp_arr[i].right));
} else {
r_arr->push_back(tmp_arr[i]);
}
}
int next_i = 0;
while(next_i < in_arr.size() && in_arr[next_i].right < end_loc_val) ++next_i;
if (next_i < in_arr.size()) {
if (end_loc_val < in_arr[next_i].left) {
*end_loc = in_arr[next_i].left; // 更新本次处理的终点
} else {
in_arr[next_i].left = end_loc_val; // 更新panel
}
int i=0, j=next_i;
for (; j<in_arr.size(); ++i, ++j) {
in_arr[i] = in_arr[j];
}
in_arr.resize(i);
} else {
in_arr.clear();
}
int64_t locus_num = 0;
for (int i=0; i<r_arr->size(); ++i) {
locus_num += (*r_arr)[i].right - (*r_arr)[i].left + 1;
}
return locus_num;
}
/*
* interval
*/
void Interval::ReadInterval(const string &interval_fn,
bam_hdr_t* header,
int interval_padding,
IntervalArr *r_arr) {
ifstream interval_fs(interval_fn);
string one_line;
IntervalArr intervals;
getline(interval_fs, one_line);
while (!interval_fs.eof()) {
if (one_line[0] == '@') {
getline(interval_fs, one_line);
continue;
}
stringstream ss_line(one_line);
string contig_name;
ss_line >> contig_name;
int itid = sam_hdr_name2tid(header, contig_name.c_str());
if (itid < 0) error("[%s] interval file has unknown contig name [%s]\n", __func__, contig_name.c_str());
int64_t tid = (int64_t)itid;
tid <<= CONTIG_SHIFT;
int64_t start, stop;
ss_line >> start >> stop;
// interval文件是1-based所以这里要减去1
intervals.push_back(Interval(tid + start - 1, tid + stop -1));
getline(interval_fs, one_line);
}
sort(intervals.begin(), intervals.end());
if (intervals.size() > 0) {
Interval new_span(intervals[0].left-interval_padding, intervals[0].right+interval_padding);
for (int i=1; i<intervals.size(); ++i) {
if (intervals[i].left - interval_padding > new_span.right) {
r_arr->push_back(new_span);
new_span.left = intervals[i].left - interval_padding;
new_span.right = intervals[i].right + interval_padding;
} else {
new_span.right = max(new_span.right, intervals[i].right + interval_padding);
}
}
r_arr->push_back(new_span);
}
interval_fs.close();
}
/*
* interval
*/
void Interval::ShrinkInterval(IntervalArr *ivap) {
if (ivap->size() < 1) return;
IntervalArr &iva = *ivap;
IntervalArr tiva = iva;
iva.clear();
Interval iv;
iv.left = tiva[0].left;
iv.right = tiva[0].right;
for (int i=1; i<tiva.size(); ++i) {
if (iv.right+1 < tiva[i].left) {
iva.push_back(iv);
iv.left = tiva[i].left;
}
iv.right = tiva[i].right;
}
iva.push_back(iv);
}
/*
* headerinterval
*/
Interval Interval::ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header) {
Interval result;
result.left = start;
result.right = end;
int64_t ext_left = start - expandVal;
int64_t ext_right = end + expandVal;
int tid = BamWrap::bam_tid(start);
uint32_t contig_len = header->target_len[tid];
result.left = max(BamWrap::bam_global_pos(tid, 0), ext_left);
result.right = min(ext_right, contig_len - 1 + BamWrap::bam_global_pos(tid, 0));
return result;
}

108
src/util/interval.h 100755
View File

@ -0,0 +1,108 @@
/*
Description: intervals
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/24
*/
#ifndef INTERVAL_H_
#define INTERVAL_H_
#include <stdint.h>
#include <vector>
#include <string>
#include <sstream>
#include <htslib/sam.h>
#include "bam_wrap.h"
using namespace std;
// 前向声明
class Interval;
typedef std::vector<Interval> IntervalArr;
/*
*
*/
struct Interval {
// const常量
const static int CONTIG_SHIFT = 40;
const static uint64_t POS_MASK = (1LL << CONTIG_SHIFT) - 1;
// 类变量
int64_t left;
int64_t right;
// 构造函数
Interval();
explicit Interval(int64_t l, int64_t r);
explicit Interval(int tid, int l, int r) : Interval(BamWrap::bam_global_pos(tid, l), BamWrap::bam_global_pos(tid, r)) {}
// 获取tid
int contig() { return left >> CONTIG_SHIFT; }
int contigLeft() { return left & POS_MASK; }
int contigRight() { return right & POS_MASK; }
// 比较函数
bool operator<(const Interval &other);
// 是否有重叠
bool overlaps(const Interval &other);
// 两个interval的合并, 会改变当前interval
Interval& spanWith(const Interval &other);
// 返回两个interval的交集不改变当前interval
Interval intersect(const Interval &that) const;
// for debug
string toString() const {
ostringstream oss;
oss << BamWrap::bam_tid(left) + 1 << ":"
<< BamWrap::bam_pos(left) + 1 << "-"
<< BamWrap::bam_pos(right) + 1;
return oss.str();
}
/*
* interval arr, interval arr
*/
static void IntersectIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr);
/*
* interval arr
*/
static void UnionIntervals(const IntervalArr &a_arr,
const IntervalArr &b_arr,
IntervalArr *r_arr);
/*
* readinterval
*/
static int64_t MergeIntervals(const IntervalArr &n_arr,
const IntervalArr &t_arr,
IntervalArr &in_arr, // 会更改
int64_t start_loc, // 闭区间
int64_t *end_loc, // 开区间, 会更改
IntervalArr *r_arr);
/*
* interval
*/
static void ReadInterval(const std::string &interval_fn,
bam_hdr_t* header,
int interval_padding,
IntervalArr *r_arr);
/*
* interval
*/
static void ShrinkInterval(IntervalArr *iva);
/*
* headerinterval
*/
static Interval ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header);
};
#endif

View File

@ -0,0 +1,292 @@
/*
Description: vcf index utils
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/23
*/
#include "linear_index.h"
#include "bam_wrap.h"
#include "binary_util.h"
/*
* LinearIndex
*/
// 搜索区间参数对应的数据区域,闭区间
void LinearIndex::SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len) {
int stid, spos, etid, epos, stid_origin, spos_origin, etid_origin, epos_origin;
stid = stid_origin = BamWrap::bam_tid(start);
spos = spos_origin = BamWrap::bam_pos(start);
etid = etid_origin = BamWrap::bam_tid(end);
epos = epos_origin = BamWrap::bam_pos(end);
while (stid < idx_.size() && idx_[stid].size() == 0) ++stid;
while (etid >= 0 && idx_[etid].size() == 0) --etid;
if (stid > etid || stid >= idx_.size()) {
*content_len = 0;
return;
}
int sb_idx, eb_idx;
if (stid == stid_origin) {
sb_idx = spos / idx_[stid].binWidth;
if (sb_idx >= idx_[stid].size()) { // 开始区域没有对应的block
sb_idx = idx_[stid].size() - 1;
Block& sb = idx_[stid][sb_idx];
*file_pos = sb.startPosition + sb.size;
} else {
Block& sb = idx_[stid][sb_idx];
*file_pos = sb.startPosition;
}
} else {
sb_idx = 0;
Block& sb = idx_[stid][sb_idx];
*file_pos = sb.startPosition;
}
if (etid == etid_origin) {
eb_idx = epos / idx_[etid].binWidth;
if (eb_idx >= idx_[etid].size()) {
eb_idx = idx_[etid].size() - 1;
}
} else {
eb_idx = idx_[etid].size() - 1;
}
Block& eb = idx_[etid][eb_idx];
if (eb.startPosition + eb.size > *file_pos)
*content_len = eb.startPosition + eb.size - *file_pos;
else
*content_len = 0;
}
// 读入index文件信息
bool LinearIndex::ReadIndex(const string& idx_fn) {
if (bam_hdr_ == NULL)
return false;
ifstream ifs(idx_fn, ios::in | ios::binary);
// 如果idx文件不存在则创建 todo
if (!ifs.good())
return false;
// 全部读进来再处理
uint64_t fsize = 0;
uint64_t cur = 0;
ifs.seekg(0, ios::end);
fsize = (uint64_t)ifs.tellg();
ifs.seekg(0, ios::beg);
char* buf = (char*)malloc(fsize + 10);
if (!ifs.read(buf, fsize)) {
free(buf);
ifs.close();
return false;
}
ifs.close();
int int_val, version, flags;
uint64_t long_val;
string str_val;
// read index header
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
// cout << "magic: " << int_val << endl;
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
if (int_val != INDEX_TYPE) {
free(buf);
ifs.close();
return false;
}
// cout << "type: " << int_val << endl;
BinaryUtil::ReadInt(buf, fsize, &cur, &version);
// cout << "version: " << version << endl;
BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);
// cout << "path: " << str_val << endl;
BinaryUtil::ReadLong(buf, fsize, &cur, &vcf_fsize); // vcf file size
// cout << "file size: " << long_val << endl;
BinaryUtil::ReadLong(buf, fsize, &cur, &long_val); // test number
BinaryUtil::ReadStr(buf, fsize, &cur, &str_val); // md5
BinaryUtil::ReadInt(buf, fsize, &cur, &flags);
if (version < 3 && (flags & 0x8000) == 0x8000) {
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
if (int_val < 0)
return false;
for (int i = 0; i < int_val; ++i) {
BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
}
}
if (version >= 3) {
int nProperties;
BinaryUtil::ReadInt(buf, fsize, &cur, &nProperties);
string key, val;
while (nProperties-- > 0) {
BinaryUtil::ReadStr(buf, fsize, &cur, &key);
BinaryUtil::ReadStr(buf, fsize, &cur, &val);
properties_[key] = val;
}
}
// read index content
int n_chrom;
BinaryUtil::ReadInt(buf, fsize, &cur, &n_chrom);
int last_tid = 0;
string last_name;
int bam_tid = 0; // tid从0开始索引的染色体也要补充一个数据, 使得idx_ vector与tid一一对应
while (n_chrom-- > 0) {
ChrIndex ci;
BinaryUtil::ReadStr(buf, fsize, &cur, &ci.name);
BinaryUtil::ReadInt(buf, fsize, &cur, &ci.binWidth);
int nBins;
BinaryUtil::ReadInt(buf, fsize, &cur, &nBins);
BinaryUtil::ReadInt(buf, fsize, &cur, &ci.longestFeature);
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
BinaryUtil::ReadInt(buf, fsize, &cur, &ci.nFeatures);
ci.blocks.reserve(nBins);
int tid = sam_hdr_name2tid(bam_hdr_, ci.name.c_str());
if (tid < 0) {
if (ci.name == last_name) {
tid = last_tid;
} else {
tid = ++last_tid;
last_name = ci.name;
}
}
while (bam_tid + 1 < tid) {
idx_.push_back(ChrIndex()); // 补充空染色体
++bam_tid;
}
bam_tid = tid;
ci.tid = tid;
uint64_t pos;
uint64_t nextPos;
BinaryUtil::ReadLong(buf, fsize, &cur, &pos);
for (int i = 0; i < nBins; ++i) {
BinaryUtil::ReadLong(buf, fsize, &cur, &nextPos);
ci.blocks.push_back(Block(pos, nextPos - pos));
pos = nextPos;
}
idx_.push_back(ci);
}
sort(idx_.begin(), idx_.end());
ifs.close();
free(buf);
return true;
}
/*
* LinearIndex::ChrIndex
*/
void LinearIndex::ChrIndex::write(ofstream& out) const {
BinaryUtil::WriteStr(out, name);
BinaryUtil::WriteInt(out, binWidth);
BinaryUtil::WriteInt(out, blocks.size());
BinaryUtil::WriteInt(out, longestFeature);
BinaryUtil::WriteInt(out, 0);
BinaryUtil::WriteInt(out, nFeatures);
uint64_t pos = 0, si = 0;
for (int i = 0; i < blocks.size(); ++i) {
pos = blocks[i].startPosition;
si = blocks[i].size;
BinaryUtil::WriteLong(out, pos);
}
BinaryUtil::WriteLong(out, pos + si);
}
/*
* LinearIndexCreator
*/
// 初始化索引文件头部信息
void LinearIndexCreator::InitHeaderDict(bam_hdr_t* hdr) {
for (int i = 0; i < hdr->n_targets; ++i) {
v_contig_name_.push_back(hdr->target_name[i]);
contig_name_to_id_[hdr->target_name[i]] = i;
contig_len_[hdr->target_name[i]] = hdr->target_len[i];
++n_properties_;
}
}
// 添加一条vcf记录
void LinearIndexCreator::AddFeature(const Feature& ft, uint64_t f_pos) { // f_pos是vcf文件当前正要写入的位置
if (idx_.size() == 0 || idx_.back().tid != ft.tid) {
if (idx_.size() > 0) {
for (int i = 0; i < blocks_.size() - 1; ++i) {
blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition;
idx_.back().blocks.push_back(blocks_[i]);
}
blocks_.back().size = f_pos - blocks_.back().startPosition;
idx_.back().blocks.push_back(blocks_.back());
}
idx_.push_back(LinearIndex::ChrIndex(v_contig_name_[ft.tid], ft.tid, bin_width_));
blocks_.clear();
blocks_.push_back(Block(f_pos, 0));
longest_feature_ = 0;
}
while (ft.start > blocks_.size() * bin_width_) blocks_.push_back(Block(f_pos, 0));
if (ft.FeatureLen() > longest_feature_) {
longest_feature_ = ft.FeatureLen();
idx_.back().longestFeature = longest_feature_;
}
++idx_.back().nFeatures;
++FEATURE_COUNT_;
all_feature_len += ft.FeatureLen();
}
// 添加所有记录完毕
void LinearIndexCreator::FinalizeIndex(uint64_t f_pos) {
if (f_pos == 0)
error("[%s] Error: finalize index failed\n", __func__);
if (blocks_.size() > 0) {
for (int i = 0; i < blocks_.size() - 1; ++i) {
blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition;
idx_.back().blocks.push_back(blocks_[i]);
}
blocks_.back().size = f_pos - blocks_.back().startPosition;
idx_.back().blocks.push_back(blocks_.back());
blocks_.clear();
}
FEATURE_LENGTH_MEAN_ = (double)all_feature_len / (double)FEATURE_COUNT_;
index_file_size_ = f_pos;
}
// 写入index文件
void LinearIndexCreator::WriteIndex(const string& out_fn) {
ofstream out;
out.open(out_fn, ios::out | ios::binary);
BinaryUtil::WriteInt(out, MAGIC_NUMBER);
BinaryUtil::WriteInt(out, LinearIndex::INDEX_TYPE);
BinaryUtil::WriteInt(out, LinearIndex::INDEX_VERSION);
BinaryUtil::WriteStr(out, out_fn);
BinaryUtil::WriteLong(out, index_file_size_);
BinaryUtil::WriteLong(out, -1);
BinaryUtil::WriteStr(out, "");
BinaryUtil::WriteInt(out, flags_);
n_properties_ += 4;
BinaryUtil::WriteInt(out, n_properties_);
ostringstream oss;
oss << FEATURE_LENGTH_MEAN_;
BinaryUtil::WriteStr(out, "FEATURE_LENGTH_MEAN");
BinaryUtil::WriteStr(out, oss.str());
oss.str("");
oss << FEATURE_LENGTH_STD_DEV_;
BinaryUtil::WriteStr(out, "FEATURE_LENGTH_STD_DEV");
BinaryUtil::WriteStr(out, oss.str());
oss.str("");
oss << MEAN_FEATURE_VARIANCE_;
BinaryUtil::WriteStr(out, "MEAN_FEATURE_VARIANCE");
BinaryUtil::WriteStr(out, oss.str());
oss.str("");
oss << FEATURE_COUNT_;
BinaryUtil::WriteStr(out, "FEATURE_COUNT");
BinaryUtil::WriteStr(out, oss.str());
for (int i = 0; i < v_contig_name_.size(); ++i) {
const string& cn = v_contig_name_[i];
oss.str("");
oss << contig_len_[cn];
BinaryUtil::WriteStr(out, "DICT:" + cn);
BinaryUtil::WriteStr(out, oss.str());
}
BinaryUtil::WriteInt(out, idx_.size());
for (int i = 0; i < idx_.size(); ++i) {
const LinearIndex::ChrIndex& ci = idx_[i];
ci.write(out);
}
out.close();
}

View File

@ -0,0 +1,129 @@
/*
Description: vcf index utils
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/23
*/
#ifndef INDEX_H_
#define INDEX_H_
#include <htslib/sam.h>
#include <stddef.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "utils.h"
using std::ifstream;
using std::ios;
using std::ofstream;
using std::ostringstream;
using std::sort;
using std::string;
using std::unordered_map;
using std::vector;
using std::cout;
using std::endl;
struct Block {
Block(uint64_t start, uint64_t si) : startPosition(start), size(si) {}
uint64_t startPosition;
uint64_t size;
};
struct Feature {
int tid;
int start; // 闭区间
int end; // 闭区间
Feature(int ti, int b, int e) : tid(ti), start(b), end(e) {}
inline int FeatureLen() const { return end - start + 1; }
};
struct LinearIndex {
const static int MAX_FEATURES_PER_BIN = 100;
const static int INDEX_TYPE = 1;
const static int INDEX_VERSION = 3;
const static int MAX_BIN_WIDTH = 1024000;
LinearIndex() {}
LinearIndex(bam_hdr_t* hdr) : bam_hdr_(hdr) {}
void SetHeader(bam_hdr_t* hdr) { bam_hdr_ = hdr; }
class ChrIndex;
vector<ChrIndex> idx_;
vector<string> vkey_;
vector<string> vval_;
unordered_map<string, string> properties_;
bam_hdr_t* bam_hdr_ = NULL; // 这个应该换成bcf_hdr_t
uint64_t vcf_fsize = 0;
// 染色体索引信息
struct ChrIndex {
string name;
int tid;
int binWidth;
int longestFeature = 0;
int nFeatures = 0;
vector<Block> blocks;
ChrIndex() {};
ChrIndex(string& n, int ti, int bw) : name(n), tid(ti), binWidth(bw) {}
inline bool operator<(const ChrIndex& ci) const { return tid < ci.tid; };
inline Block& operator[](int pos) { return blocks[pos]; }
inline int size() { return blocks.size(); }
void write(ofstream& out) const;
};
inline ChrIndex& operator[](int tid) { return idx_[tid]; }
// 闭区间
void SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len);
// 读入index文件信息
bool ReadIndex(const string& idx_fn);
};
// 根据vcf数据创建index文件
struct LinearIndexCreator {
const static int INDEX_VERSION = LinearIndex::INDEX_VERSION;
const static int MAGIC_NUMBER = 1480870228;
const static int DEFAULT_BIN_WIDTH = 8000;
int bin_width_ = DEFAULT_BIN_WIDTH;
string input_vcf_fn_;
string output_index_fn_;
int longest_feature_ = 0;
uint64_t index_file_size_ = 0;
int flags_ = 0;
int n_properties_ = 0;
float FEATURE_LENGTH_MEAN_ = 0.0;
float FEATURE_LENGTH_STD_DEV_ = 0.0;
float MEAN_FEATURE_VARIANCE_ = 0.0;
uint64_t FEATURE_COUNT_ = 0;
uint64_t all_feature_len = 0;
unordered_map<string, int> contig_name_to_id_;
unordered_map<string, int> contig_len_;
vector<string> v_contig_name_;
vector<LinearIndex::ChrIndex> idx_;
vector<Block> blocks_;
// 根据sam header初始化索引文件头部信息
void InitHeaderDict(bam_hdr_t* hdr);
// 添加一条记录
void AddFeature(const Feature& ft, uint64_t f_pos); // f_pos是vcf文件当前正要写入的位置
// 添加记录完毕
void FinalizeIndex(uint64_t f_pos);
// 写入index文件
void WriteIndex(const string& out_fn);
};
#endif

View File

@ -1,5 +1,5 @@
/*
Description: Murmur
Description: Murmur
Copyright : All right reserved by ICT

View File

@ -73,12 +73,12 @@ int DisplayProfiling(int nthread) {
PRINT_GP(write);
PRINT_GP(whole_process);
PRINT_TP(gen, nthread);
PRINT_TP(sort_frag, nthread);
PRINT_TP(sort_pair, nthread);
// PRINT_TP(gen, nthread);
// PRINT_TP(sort_frag, nthread);
// PRINT_TP(sort_pair, nthread);
fprintf(stderr, "\n");
#endif
return 0;
}
}