From 1e9b58fac171b36ffe6e550abe130639c013bd11 Mon Sep 17 00:00:00 2001 From: zzh Date: Sat, 20 Dec 2025 16:35:45 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=B0=E4=BA=86=E8=AF=BB=E5=8F=96=E5=92=8C?= =?UTF-8?q?=E8=A7=A3=E6=9E=90known=20vcf=E9=83=A8=E5=88=86=E4=BA=86?= =?UTF-8?q?=EF=BC=8C=E6=80=A7=E8=83=BD=E8=BF=98=E9=9C=80=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bqsr/baq.cpp | 17 ++ src/bqsr/baq.h | 88 ++++++ src/bqsr/bqsr_args.h | 5 +- src/bqsr/bqsr_entry.cpp | 599 ++++++++++++++++++++++++++++++++++++-- src/bqsr/covariate.cpp | 317 ++++++++++++++++++++ src/bqsr/covariate.h | 209 +++++++++++++ src/main.cpp | 9 +- src/util/bam_buf.cpp | 79 ++--- src/util/bam_buf.h | 105 ++++--- src/util/bam_wrap.h | 165 ++++++++--- src/util/binary_util.h | 131 +++++++++ src/util/interval.cpp | 295 +++++++++++++++++++ src/util/interval.h | 108 +++++++ src/util/linear_index.cpp | 292 +++++++++++++++++++ src/util/linear_index.h | 129 ++++++++ src/util/murmur3.h | 2 +- src/util/profiling.cpp | 8 +- 17 files changed, 2397 insertions(+), 161 deletions(-) create mode 100644 src/bqsr/baq.cpp create mode 100644 src/bqsr/baq.h create mode 100644 src/bqsr/covariate.cpp create mode 100644 src/bqsr/covariate.h create mode 100644 src/util/binary_util.h create mode 100755 src/util/interval.cpp create mode 100755 src/util/interval.h create mode 100644 src/util/linear_index.cpp create mode 100644 src/util/linear_index.h diff --git a/src/bqsr/baq.cpp b/src/bqsr/baq.cpp new file mode 100644 index 0000000..d2339a1 --- /dev/null +++ b/src/bqsr/baq.cpp @@ -0,0 +1,17 @@ +#include "baq.h" + +#include + +vector BAQ::qual2prob(256); // 质量分数转化概率 +vector>> BAQ::EPSILONS(256, vector>(256, vector(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual] + +// 计算baq数组,返回成功与否 +bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector& baqArray) { + // 检测ref是否覆盖了read + if (ref.size() < refOffset + ad.read_len) { + spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})", + ref.size(), refOffset + ad.read_len, refOffset, ad.read_len); + return false; + } + return true; +} \ No newline at end of file diff --git a/src/bqsr/baq.h b/src/bqsr/baq.h new file mode 100644 index 0000000..f7c4b2e --- /dev/null +++ b/src/bqsr/baq.h @@ -0,0 +1,88 @@ +// #################################################################################################### +// +// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE +// NOTE -- PUSHED BACK TO HENG LI +// +// #################################################################################################### +// NOTE -- this code is a cpp version of code originally written by Heng Li in JAVA for the samtools project +// #################################################################################################### +#pragma once + +#include +#include +#include +#include +#include + +#include "util/bam_wrap.h" + +using std::vector; +using std::string; + +// base alignment quality (BAQ) + +struct BAQ { + static vector qual2prob; // 质量分数转化概率 + static constexpr double EM = 0.33333333333; + static constexpr double EI = 0.25; + // Phred scaled now (changed 1/10/2011) + static constexpr double DEFAULT_GOP = 40; + static constexpr int DEFAULT_BANDWIDTH = 7; + static constexpr int SAM_MAX_PHRED_SCORE = 93; + // 94 = max qual score + 1 + static vector>> EPSILONS; // [ref][read][qual] + + double cd = -1; // gap open probability [1e-3] + double ce = 0.1; // gap extension probability [0.1] + int cb = DEFAULT_BANDWIDTH; // band width [7] + + /** + * Any bases with Q < MIN_BASE_QUAL are raised up to this base quality + */ + uint8_t minBaseQual = 4; + + /** + * Use defaults for everything + */ + BAQ() : BAQ(DEFAULT_GOP) {} + + /** + * Use defaults for everything + */ + BAQ(const double gapOpenPenalty) { + cd = convertFromPhredScale(gapOpenPenalty); + initializeCachedData(); + } + + /* Takes a Phred Scale quality score and returns the error probability. + * + * Quick conversion function to maintain internal structure of BAQ calculation on + * probability scale, but take the user entered parameter in phred-scale. + * + * @param x phred scaled score + * @return probability of incorrect base call + */ + static double convertFromPhredScale(double x) { return (std::pow(10, (-x) / 10.)); } + + // 初始化一些静态全局数据 + void initializeCachedData() { + for (int i = 0; i < 256; i++) + for (int j = 0; j < 256; j++) + for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) EPSILONS[i][j][q] = 1.0; + + for (char b1 : "ACGTacgt") { + for (char b2 : "ACGTacgt") { + for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) { + double qual = qual2prob[q < minBaseQual ? minBaseQual : q]; + double e = std::tolower(b1) == std::tolower(b2) ? 1 - qual : qual * EM; + EPSILONS[(uint8_t)b1][(uint8_t)b2][q] = e; + } + } + } + } + + double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; } + + // 计算baq数组,返回成功与否 + bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector& baqArray); +}; \ No newline at end of file diff --git a/src/bqsr/bqsr_args.h b/src/bqsr/bqsr_args.h index 0265beb..dd3bd08 100644 --- a/src/bqsr/bqsr_args.h +++ b/src/bqsr/bqsr_args.h @@ -48,6 +48,9 @@ struct BQSRArg { string PROGRAM_RECORD_ID = "FastBQSR"; + // reference file + string REFERENCE_FILE; + // known sites vcf files vector KNOWN_SITES_VCFS; @@ -123,7 +126,7 @@ struct BQSRArg { int PRESERVE_QSCORES_LESS_THAN = 6; /** - * enable-baq, do BAQ correction" + * enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了,用不到了? */ bool enableBAQ = false; diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index ab3e98e..1f9e4d5 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -7,36 +7,93 @@ Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2023/10/23 */ +#include +#include #include #include #include +#include #include #include +#include #include +#include +#include "baq.h" #include "bqsr_args.h" #include "bqsr_funcs.h" #include "bqsr_pipeline.h" +#include "covariate.h" #include "dup_metrics.h" #include "fastbqsr_version.h" #include "read_name_parser.h" +#include "util/interval.h" #include "util/profiling.h" #include "util/utils.h" +#include "util/linear_index.h" + +using std::deque; #define BAM_BLOCK_SIZE 16L * 1024 * 1024 +const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; + +// 解析knownSites +struct VCFParser { + deque knownSites; // 已知的变异位点 + char* buf = nullptr; // // 数据buffer + uint32_t bufLen = 4 * 1024; // 数据buffer长度 + LinearIndex index; // vcf文件索引 + ifstream inStm; // vcf文件流 + + VCFParser() { Init(); } + VCFParser(const string& vcfFileName) { Init(vcfFileName); } + VCFParser(const string& vcfFileName, sam_hdr_t* samHeader) { Init(vcfFileName, samHeader); } + + void Init() { buf = (char*)malloc(bufLen); } + void Init(const string& vcfFileName) { + Init(); + inStm.open(vcfFileName, ifstream::in); + string idxFileName = vcfFileName + ".idx"; + if (!index.ReadIndex(idxFileName)) + error("[%s] fail to load the %s index file\n", __func__, idxFileName.c_str()); + } + void Init(const string& vcfFileName, sam_hdr_t *samHeader) { + index.SetHeader(samHeader); + Init(vcfFileName); + } +}; + +// 解析后的一些参数,文件,数据等 +struct AuxVar { + const static int REF_CONTEXT_PAD = 3; // 需要做一些填充 + const static int REFERENCE_HALF_WINDOW_LENGTH = 150; // 需要额外多取出一些ref序列,防止边界效应 + + sam_hdr_t* header = nullptr; // bam header + faidx_t* faidx = nullptr; // reference index + char* ref_seq = nullptr; // reference sequence + int ref_len = 0; // reference sequence length + int offset = 0; // 在要求的ref序列两边,多余取出的碱基数量 + + vector vcfArr; // 从vcf中获取已知位点 +}; + namespace nsgv { +// 全局变量 for bqsr +BQSRArg gBqsrArg; // bqsr arguments +samFile* gInBamFp; // input BAM file pointer +sam_hdr_t* gInBamHeader; // input BAM header +vector gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个 + +// 下面是需要删除或修改的变量 std::vector gNameParsers; // read name parser DuplicationMetrics gMetrics; // DupResult gDupRes; PipelineArg gPipe(&gDupRes); -BQSRArg gBqsrArg; // -samFile *gInBamFp; // bam -sam_hdr_t *gInBamHeader; // bam samFile *gOutBamFp; // , sambam sam_hdr_t *gOutBamHeader; // header vector gKnownSitesVcfSrs; // known sites vcf srs @@ -49,6 +106,13 @@ struct ByteBuf { int capacity = 0; // }; +// 读进来的这一批bam总共占了几个染色体,这个方案不行,读取太多,没必要 +// 开区间 +struct Region { + int64_t start; + int64_t end; +}; + /* * */ @@ -60,26 +124,495 @@ static string getFileExtension(const string &filename) { return filename.substr(last_dot + 1); } +// 过滤掉bqsr过程不符合要求的bam数据 +bool bqsrReadFilterOut(const bam1_t *b) { + // 过滤掉unmapped的read + if (b->core.qual == 0) // mapping quality 0 + return true; + if (b->core.qual == 255) // mapping quality not available + return true; + if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped + return true; + } + if (b->core.flag & BAM_FSECONDARY) { // secondary alignment + return true; + } + if (b->core.flag & BAM_FDUP) { // secondary alignment + return true; + } + if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls + return true; + } + return false; +} + +// 该操作符是否消耗read的碱基 +bool consumeReadBases(char cigar) { + return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; +} + +// 该操作符是否消耗参考基因组的碱基 +bool consumeRefBases(char cigar) { + return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; +} + +// 给定一个ref位置,在read内部找到对应的位置和操作符 +struct PosAndOperator { + int readPosAtRefCoord = -1; // read中的位置 + char cigarOperator = '0'; // cigar操作符 + int cigarIndex = -1; // cigar索引 + int cigarLen = 0; + int preCigarLen = 0; // 截止cigar之前的,消耗read base的长度 +}; + +/** + * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of + * the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned. + * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips. + * + * @param alignmentStart The soft start of the read on the reference + * @param cigar The read's cigar + * @param refCoord The target reference coordinate + * @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED}; + * if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate; + * if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last + * cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}. + */ +PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) { + PosAndOperator po; + if (refCoord < alignmentStart) { + return po; + } + int firstReadPosOfElement = 0; // inclusive + int firstRefPosOfElement = alignmentStart; // inclusive + int lastReadPosOfElement = 0; // exclusive + int lastRefPosOfElement = alignmentStart; // exclusive + + // advance forward through all the cigar elements until we bracket the reference coordinate + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + const int idx = bc.n_cigar - 1; + if (idx < 0) + return po; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + firstReadPosOfElement = lastReadPosOfElement; + firstRefPosOfElement = lastRefPosOfElement; + lastReadPosOfElement += consumeReadBases(c) ? len : 0; + lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0; + if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element + int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0); + return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement}; + } + } + return po; +} + +// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量 +void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) { + int start, stop; + // Determine the read coordinate to start and stop hard clipping + if (refStart < 0) { + if (refStop < 0) return; + PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop); + // if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here + // is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine. + stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1); + ad.left_clip = stop + 1; + ad.cigar_start = stopPosAndOperator.cigarIndex; + ad.first_cigar_clip = ad.left_clip - stopPosAndOperator.preCigarLen; + } else { + if (refStop >= 0) return; + // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids + // overclipping + PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart); + start = startPosAndOperator.readPosAtRefCoord; + ad.right_clip = bw->b->core.l_qseq - start; + ad.cigar_end = startPosAndOperator.cigarIndex + 1; + ad.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start; + } +} + +// 计算切掉adapter之后,ref相对原始ref的偏移量 +void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) { + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + int i = 0; + for (i = 0; i < ad.cigar_start; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (consumeRefBases(c)) { + ad.ref_offset += len; + } + } + const char c = bam_cigar_opchr(cigar[i]); + if (consumeRefBases(c)) { + ad.ref_offset += ad.first_cigar_clip; + } +} + +// 计算clip处理之后,剩余的碱基 +void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) { + ad.bases.resize(ad.read_len); + uint8_t* seq = bam_get_seq(bw->b); + for (int i = 0; i < ad.read_len; ++i) { + ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)]; + } +} + +// 计算read两端softclip的碱基数量,可能会修改ad里的clip值 +void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) { + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + int readIndex = ad.left_clip; + int cutLeft = -1; // first position to hard clip (inclusive) + int cutRight = -1; // first position to hard clip (inclusive) + int cigar_start = ad.cigar_start; + int cigar_end = ad.cigar_end; + bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail + + for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == ad.cigar_start) len -= ad.first_cigar_clip; + if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip; + if (c == 'S') { + if (rightTail) { + cutRight = readIndex; + cigar_end = i; + } else { + cutLeft = readIndex + len - 1; + cigar_start = i + 1; + } + } else if (c != 'H') { + rightTail = true; + } + if (consumeReadBases(c)) { + readIndex += len; + } + } + if (cutRight >= 0) { + ad.right_clip = bw->b->core.l_qseq - cutRight; + ad.cigar_end = cigar_end; + ad.last_cigar_clip = 0; + } + if (cutLeft >= 0) { + ad.left_clip = cutLeft + 1; + ad.cigar_start = cigar_start; + ad.first_cigar_clip = 0; + } +} + +// 读取给定区间的reference +static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) { + if (aux.ref_seq != NULL) + free(aux.ref_seq); + int tid = BamWrap::bam_tid(cur_pos); + const char* chr = sam_hdr_tid2name(aux.header, tid); + int seq_begin = BamWrap::bam_pos(interval.left); //- aux.REFERENCE_HALF_WINDOW_LENGTH; + int seq_end = BamWrap::bam_pos(interval.right); //+ aux.REFERENCE_HALF_WINDOW_LENGTH; + aux.ref_seq = faidx_fetch_seq(aux.faidx, chr, seq_begin, seq_end, &aux.ref_len); + // aux.offset = aux.REFERENCE_HALF_WINDOW_LENGTH; +} + +// 设置某个位置是indel +inline void updateIndel(vector &isIndel, int index) { + if (index >=0 && index < isIndel.size()) { + isIndel[index] = 1; + } +} + +// 计算该read的每个碱基位置是否是SNP或Indel +int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector &isSNP, vector &isIns, vector &isDel) { + // 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤 + Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间 + read_ref_base(aux, interval.left, interval); + string refBases(aux.ref_seq); + // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos()); + + // 2. 遍历cigar,计算每个碱基是否是SNP或Indel + int readPos = 0, refPos = 0, nEvents = 0; + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + uint8_t* seq = bam_get_seq(bw->b); + for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == ad.cigar_start) len -= ad.first_cigar_clip; + if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip; + if (c == 'M' || c == '=' || c == 'X') { + for (int j = 0; j < len; ++j) { + // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip + int snpInt = cBaseToChar[bam_seqi(seq, readPos + ad.left_clip)] == refBases[refPos] ? 0 : 1; + isSNP[readPos] = snpInt; + nEvents += snpInt; + readPos++; + refPos++; + } + } else if (c == 'D') { + // 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del + int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1; + updateIndel(isDel, index); + refPos += len; + } else if (c == 'N') { + refPos += len; + } else if (c == 'I') { + // 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins + bool forwardStrandRead = !bw->GetReadNegativeStrandFlag(); + if (forwardStrandRead) { + updateIndel(isIns, readPos - 1); + } + readPos += len; + if (!forwardStrandRead) { + updateIndel(isIns, readPos); + } + } else if (c == 'S') { + readPos += len; + } + } + nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0); + + //spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0), + // std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents); + // exit(0); + + return nEvents; +} + +// 简单计算baq数组,就是全部赋值为'@' (64) +bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector& baqArray) { + baqArray.resize(ad.read_len, (int)'@'); + return true; +} + +// 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了 +bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector& baqArray) { + baqArray.resize(ad.read_len, 0); + return true; +} + +// 获取一行字符串 +static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) { + line->clear(); + if (*cur >= total) + return; + char b; + while (*cur < total && (b = buf[(*cur)++]) != '\n') { + line->push_back(b); + } +} + +// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 +void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector &vcfs) { + int tid = bw->contig_id(); + uint64_t startPos = bw->start_pos(); // 闭区间 + uint64_t endPos = bw->end_pos(); // 闭区间 + // spdlog::info("bam {}, {}", startPos, endPos); + // update vcfs + for(auto &vcf : vcfs) { + // 清理旧的interval + while(!vcf.knownSites.empty()) { + auto& intv = vcf.knownSites.front(); + // spdlog::info("intv bam {}, {}", intv.right, startPos); + if (intv.right < startPos) + vcf.knownSites.pop_front(); + else + break; + } + if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue; + + // spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right); + // exit(0); + //spdlog::info("before intervals : {}", vcf.knownSites.size()); + // 读取新的interval + int64_t fpos, flen; + vcf.index.SearchInterval(startPos, endPos, &fpos, &flen); + //spdlog::info("file index: {}, {}", fpos, flen); + if (flen > 0) { + vcf.inStm.seekg(fpos, ios::beg); + if (flen > vcf.bufLen) { + vcf.bufLen = flen; + vcf.buf = (char*)realloc(vcf.buf, flen); + } + char* buf = vcf.buf; + vcf.inStm.read(buf, flen); + string line; + int64_t cur = 0; + get_line_from_buf(buf, flen, &cur, &line); + while (line.size() > 0) { + stringstream ss_line(line); + string stid; + int tid, pos; + int64_t locus; + string id, ref; + ss_line >> stid >> pos >> id >> ref; + tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str()); + if (tid >= 0 && pos > 0) { + vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size())); + //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size()); + } + get_line_from_buf(buf, flen, &cur, &line); + } + } + //spdlog::info("after intervals : {}", vcf.knownSites.size()); + //for(auto &val : vcf.knownSites) { + // spdlog::info("intv {}, {}", val.left, val.right); + //} + } + //exit(0); +} + // 串行bqsr int SerialBQSR() { int round = 0; BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); - inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM); + // inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM); + inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut); int64_t readNumSum = 0; + // 0. 初始化一些全局数据 + // BAQ baq{BAQ::DEFAULT_GOP}; + + // 1. 协变量数据相关初始化 + PerReadCovariateMatrix readCovariates; + CovariateUtils::InitPerReadCovMat(readCovariates); + ContextCovariate::InitContextCovariate(nsgv::gBqsrArg); + CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg); + + // 2. 读取bam的read group + if (nsgv::gInBamHeader->hrecs->nrg == 0) { + spdlog::error("No RG tag found in the header!"); + return 1; + } + for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) { + spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name); + ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i; + ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name; + } + + int test = 0; + while (1) { ++ round; + + // 一. 读取bam数据 size_t readNum = 0; if (inBamBuf.ReadStat() >= 0) readNum = inBamBuf.ReadBam(); if (readNum < 1) { break; } - spdlog::info("{} reads processed in {} round", readNum, round); - auto bams = inBamBuf.GetBamArr(); - spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end()); - // 1. 获取bams数组覆盖的region范围 + spdlog::info("{} reads processed in {} round, {}", readNum, round, test); + // 二. 遍历每个bam(read)记录,进行处理 + + for (int i = 0; i < bams.size(); ++i) { + // 1. 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步 + // 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理 + // 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略 + // 4. 对read的两端进行检测,去除(hardclip)adapter + BamWrap *bw = bams[i]; + ReadAdditionData ad; + ad.read_len = BamWrap::BamEffectiveLength(bw->b); + ad.cigar_end = bw->b->core.n_cigar; + if (ad.read_len <= 0) continue; + + int adapter_boundary = bw->GetAdapterBoundary(); + if (bw->IsAdapterInRead(adapter_boundary)) { + // adapter在read范围内 + if (bw->GetReadNegativeStrandFlag()) { // 反链 + clipByReferenceCoordinates(bw, -1, adapter_boundary, ad); + } else { // 正链 + clipByReferenceCoordinates(bw, adapter_boundary, -1, ad); + } + } + ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度 + // 5. 然后再去除softclip部分 + calculateSoftClip(bw, ad); + ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度 + if (ad.read_len <= 0) continue; + + calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少 + calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基 + + //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq, + // ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str()); + + // 6. 更新每个read的platform信息,好像没啥用,暂时忽略 + vector isSNP(ad.read_len, 0); // 该位置是否是SNP位置,0不是,1是 + vector isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是 + vector isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是 + const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel); + + // 7. 计算baqArray + // BAQ = base alignment quality + // note for efficiency reasons we don't compute the BAQ array unless we actually have + // some error to marginalize over. For ILMN data ~85% of reads have no error + vector baqArray; + bool baqCalculated = false; + if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) { + baqCalculated = flatBAQArray(bw, ad, baqArray); + } else { + // baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, bw, ad, baqArray); + } + if (!baqCalculated) continue; + // 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了 + + // 8. 计算这条read对应的协变量 + CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true); + test = readCovariates[1][0][0] + readCovariates[2][1][3]; + int end_pos = bw->contig_end_pos(); + //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, + // bw->GetReadNegativeStrandFlag() ? "reverse" : "forward"); + + // 9. 计算这条read需要跳过的位置 + vector skip(ad.read_len, 0); + calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr); + } + +#if 0 + // spdlog::info("region: {} - {}", bams[0]->global_softclip_start(), bams.back()->global_softclip_end()); + // 1. 获取bams数组覆盖的region范围 + // 如果读取的bam数组跨越了不同的染色体,咋搞?还是按照每个线程都有独立的vcf文件来做吧 + int64_t region_start = bams[0]->global_softclip_start(); + vector contig_bams; + int contig_id = bams[0]->contig_id(); + int64_t start = 0, stop = 0; + while (true) { + stop = start; + while (stop < bams.size() && bams[stop]->contig_id() == contig_id) ++stop; + if (stop > start) contig_bams.push_back(Region{start, stop}); + if (stop >= bams.size()) break; + contig_id = bams[stop]->contig_id(); + start = stop; + } + + spdlog::info("{}, {} contig regions", contig_id, contig_bams.size()); + + for (int i = 0; i < bams.size();) { + int64_t a1 = bams[i]->contig_pos(); + int64_t b1 = bams[i]->contig_end_pos(); + int64_t a = bams[i]->softclip_start(); + int64_t b = bams[i]->softclip_end(); + spdlog::info("{}: ({}, {}), ({}, {})", bams[i]->query_name(), a1, b1, a, b); + ++i; + } + // 依次处理每个contig的bams + vector bitmap(100, 0); // 用来表示known sites覆盖情况的bitmap + for (const auto& cr : contig_bams) { + spdlog::info(" contig id: {}, bam count: {}, bitmap size: {}", contig_id, cr.end - cr.start, bitmap.size()); + // 当前处理的contig + int contig_id = bams[cr.start]->contig_id(); + int64_t region_start = bams[cr.start]->softclip_start(); + int64_t region_end = bams[cr.end - 1]->softclip_end(); + if ((bitmap.size() << 5)) { + + } + + } + +#endif // 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零) // 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识) @@ -87,14 +620,18 @@ int SerialBQSR() { // 4. 遍历bams数组中的每一条记录并进行处理 readNumSum += readNum; - inBamBuf.ClearAll(); // + inBamBuf.ClearAll(); // } spdlog::info("read count: {}", readNumSum); return 0; } -// entrance of mark duplicates +// 需要支持vcf idx,tbi,csi三种索引方式 +// vcf和idx是一对 +// vcf.gz和tbi或csi是一对 + +// entrance of mark BQSR int BaseRecalibrator() { PROF_START(whole_process); @@ -107,6 +644,18 @@ int BaseRecalibrator() { hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header + // 初始化AuxVar + nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS); + for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) { + nsgv::gAuxVars[i].header = nsgv::gInBamHeader; + nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str()); + if (nsgv::gAuxVars[i].faidx == 0) + error("[%s] fail to load the fasta index.\n", __func__); + for (auto &vcfFileName : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { + nsgv::gAuxVars[i].vcfArr.push_back(VCFParser(vcfFileName, nsgv::gInBamHeader)); + } + } + // (libraryId) nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0); @@ -124,21 +673,21 @@ int BaseRecalibrator() { return SerialBQSR(); -// // 读取known sites vcfs -// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { -// spdlog::info(" {}", ks); -// bcf_srs_t* srs = bcf_sr_init(); -// if (!bcf_sr_add_reader(srs, ks.c_str())) -// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum)); -// nsgv::gKnownSitesVcfSrs.push_back(srs); -// -// while (bcf_sr_next_line(srs)) { -// bcf1_t* line = srs->readers[0].buffer[0]; -// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl; -// } -// } -// -// /* 先实现串行的bqsr-phase-1 */ + // 读取known sites vcfs + for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { + spdlog::info(" {}", ks); + bcf_srs_t* srs = bcf_sr_init(); + if (!bcf_sr_add_reader(srs, ks.c_str())) + error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum)); + nsgv::gKnownSitesVcfSrs.push_back(srs); + + while (bcf_sr_next_line(srs)) { + bcf1_t* line = srs->readers[0].buffer[0]; + cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl; + } + } + + /* 先实现串行的bqsr-phase-1 */ diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp new file mode 100644 index 0000000..4c15958 --- /dev/null +++ b/src/bqsr/covariate.cpp @@ -0,0 +1,317 @@ +#include "covariate.h" + +// for EventType +EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"}; +EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"}; +EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"}; + +// static变量 for ContextCovariate +int ContextCovariate::mismatchesContextSize; +int ContextCovariate::indelsContextSize; +int ContextCovariate::mismatchesKeyMask; +int ContextCovariate::indelsKeyMask; +uint8_t ContextCovariate::lowQualTail; +int ContextCovariate::baseIndexMap[256]; + +// for ReadGroupCovariate +map ReadGroupCovariate::RgToId; // read group name到id的映射 +map ReadGroupCovariate::IdToRg; // id到read group name的映射 + +// for cycleCovariate +int CycleCovariate::MAXIMUM_CYCLE_VALUE; + +// for CovariateUtils +// 对一条read计算协变量(该协变量被上一个read用过) +void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, + bool recordIndelValues) { + //ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + //BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + //ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + //CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues); +} + +// ReadGroupCovariate 协变量的方法 +void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { + uint8_t *rgStr = bam_aux_get(bw->b, "RG"); + char* rgVal = nullptr; + if (rgStr) rgVal = bam_aux2Z(rgStr); + int key = 0; + if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) { + spdlog::error("The RG tag value for read can not be found in header!"); + } else { + key = RgToId[rgVal]; + } + for (int i = 0; i < ad.read_len; ++i) { + CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values); + } +} + +// BaseQualityCovariate 协变量的方法 +void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, + bool recordIndelValues) { + // 在前面的处理过后,quals应该和base长度一致了 +#define __bq_set_cov(ins, del) \ + do { \ + for (int i = 0; i < ad.read_len; ++i) { \ + CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \ + } \ + } while (0) + + const int INDEL_QUAL = 45; + uint8_t* quals = bam_get_qual(bw->b); + if (recordIndelValues) { + uint8_t* insQualPtr = bam_aux_get(bw->b, "BI"); // base qualities for insertions + uint8_t* delQualPtr = bam_aux_get(bw->b, "BD"); // base qualities for deletions + if (insQualPtr == nullptr && delQualPtr == nullptr) { + __bq_set_cov(INDEL_QUAL, INDEL_QUAL); + } else if (insQualPtr == nullptr) { + uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr); + __bq_set_cov(INDEL_QUAL, delQuals[i]); + } else { + uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr); + __bq_set_cov(insQuals[i], INDEL_QUAL); + } + } else { + __bq_set_cov(0, 0); + } +} + +// ContextCovariate 协变量的方法 + +static char SimpleComplement(const char base) { + switch (base) { + case 'A': + case 'a': + return 'T'; + case 'C': + case 'c': + return 'G'; + case 'G': + case 'g': + return 'C'; + case 'T': + case 't': + return 'A'; + default: + return base; + } +} + +// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) +void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) { + uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip; + + if (bw->GetReadNegativeStrandFlag()) { // 反向互补 + for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]); + } + + // 处理左边 + int left = 0; + for (; left < ad.read_len; ++left) { + if (quals[left] <= lowQTail) + clippedBases[left] = 'N'; + else + break; + } + if (left == ad.read_len) { + clippedBases.clear(); + return; + } + // 处理右边 + int right = ad.read_len - 1; + for (; right >= 0; --right) { + if (quals[right] <= lowQTail) + clippedBases[right] = 'N'; + else + break; + } + if (right < left) + clippedBases.clear(); +} + +/** + * Creates a int representation of a given dna string. + * + * @param dna the dna sequence + * @param start the start position in the byte array (inclusive) + * @param end the end position in the array (exclusive) + * @return the key representing the dna sequence + */ +int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) { + int key = end - start; + int bitOffset = LENGTH_BITS; + for (int i = start; i < end; i++) { + const int baseIndex = baseIndexMap[dna[i] & 0xff]; + if (baseIndex == -1) { // ignore non-ACGT bases + return -1; + } + key |= (baseIndex << bitOffset); + bitOffset += 2; + } + return key; +} + +/** + * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). + * + * For example, for the read [AGCTG], return the list + * [-1, "AG", "GC", "CT", "TG" ] + * with each string context encoded as an integer. + * + * @param bases the bases in the read to build the context from + * @param contextSize context size to use building the context + * @param mask mask for pulling out just the context bits + * + * @return a list that has the same length as the read and contains the (preceding) n-base context at each position. + * + */ +void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector& keys) { + int readLength = bases.size(); + keys.resize(readLength); + int keyIdx = 0; + // the first contextSize-1 bases will not have enough previous context + for (int i = 1; i < contextSize && i <= readLength; i++) { + keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE; + } + if (readLength < contextSize) + return; + + int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; + + // get (and add) the key for the context starting at the first base + int currentKey = KeyFromContext(bases, 0, contextSize); + keys[keyIdx++] = currentKey; + + // if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts it affects + int currentNPenalty = 0; + if (currentKey == -1) { + currentKey = 0; + currentNPenalty = contextSize - 1; + int offset = newBaseOffset; + int baseIndex; + while ((baseIndex = baseIndexMap[bases[currentNPenalty]]) != -1) { + currentKey |= (baseIndex << offset); + offset -= 2; + currentNPenalty--; + } + } + + for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { + const int baseIndex = baseIndexMap[bases[currentIndex]]; + if (baseIndex == -1) { // ignore non-ACGT bases + currentNPenalty = contextSize; + currentKey = 0; // reset the key + } else { + // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length + // in + currentKey = (currentKey >> 2) & mask; + currentKey |= (baseIndex << newBaseOffset); + currentKey |= contextSize; + } + + if (currentNPenalty == 0) { + keys[keyIdx++] = currentKey; + } else { + currentNPenalty--; + keys[keyIdx++] = -1; + } + } +} + +void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { + const int originalReadLength = ad.read_len; + + // store the original bases and then write Ns over low quality ones + string strandedClippedBases(ad.bases); + GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail); + // spdlog::info("bases: {}", strandedClippedBases); + vector nBasePairContextAtEachCycle; + GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle); + + const int readLengthAfterClipping = strandedClippedBases.size(); + + // this is necessary to ensure that we don't keep historical data in the ReadCovariates values + // since the context covariate may not span the entire set of values in read covariates + // due to the clipping of the low quality bases + if (readLengthAfterClipping != originalReadLength) { + // don't bother zeroing out if we are going to overwrite the whole array + for (int i = 0; i < originalReadLength; i++) { + // this base has been clipped off, so zero out the covariate values here + CovariateUtils::SetCovariate(0, 0, 0, i, ContextCovariate::index, values); + } + } + + const bool negativeStrand = bw->GetReadNegativeStrandFlag(); + // Note: duplicated the loop to avoid checking recordIndelValues on each iteration + if (recordIndelValues) { + vector indelKeys; + GetReadContextAtEachPosition(strandedClippedBases, indelsContextSize, indelsKeyMask, indelKeys); + for (int i = 0; i < readLengthAfterClipping; i++) { + const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping); + const int indelKey = indelKeys[i]; + CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], indelKey, indelKey, readOffset, ContextCovariate::index, values); + } + } else { + for (int i = 0; i < readLengthAfterClipping; i++) { + const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping); + CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], 0, 0, readOffset, ContextCovariate::index, values); + } + } +} + +// CycleCovariate 协变量的方法 + +/** + * Computes the encoded value of CycleCovariate's key for the given position at the read. + * Uses keyFromCycle to do the encoding. + * @param baseNumber index of the base to compute the key for + * @param read the read + * @param indel is this an indel key or a substitution key? + * @param maxCycle max value of the base to compute the key for + * (this method throws UserException if the computed absolute value of the cycle number is higher than this value). + */ +int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) { + const bool isNegStrand = bw->GetReadNegativeStrandFlag(); + const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2); + const int readLength = ad.read_len; + + const int readOrderFactor = isSecondInPair ? -1 : 1; + int increment; + int cycle; + if (isNegStrand) { + cycle = readLength * readOrderFactor; + increment = -1 * readOrderFactor; + } else { + cycle = readOrderFactor; + increment = readOrderFactor; + } + cycle += baseNumber * increment; + + if (!indel) { + return CycleCovariate::KeyFromCycle(cycle, maxCycle); + } + const int maxCycleForIndels = readLength - CUSHION_FOR_INDELS - 1; + if (baseNumber < CUSHION_FOR_INDELS || baseNumber > maxCycleForIndels) { + return -1; + } else { + return CycleCovariate::KeyFromCycle(cycle, maxCycle); + } +} + +// Used to pick out the covariate's value from attributes of the read +void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { + const int readLength = ad.read_len; + // Note: duplicate the loop to void checking recordIndelValues on every iteration + if (recordIndelValues) { + for (int i = 0; i < readLength; i++) { + const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE); + const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE); + CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values); + } + } else { + for (int i = 0; i < readLength; i++) { + const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE); + CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values); + } + } +} \ No newline at end of file diff --git a/src/bqsr/covariate.h b/src/bqsr/covariate.h new file mode 100644 index 0000000..0c9edc0 --- /dev/null +++ b/src/bqsr/covariate.h @@ -0,0 +1,209 @@ +/* + Description: 在bqsr过程中,计算协变量相关的类和方法 + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2025/12/08 +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "bqsr_args.h" +#include "util/bam_wrap.h" + +using std::map; +using std::string; +using std::vector; + +/** + * This is where we store the pre-read covariates, also indexed by (event type) and (read position). + * Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }. + * For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle). + */ +typedef vector>> PerReadCovariateMatrix; + +// 变异类型(snp, insert, deletion) +struct EventTypeValue { + int index; // 在协变量数组中对应的索引 + char representation; + string longRepresentation; +}; + +struct EventType { + static constexpr int EVENT_SIZE = 3; + static EventTypeValue BASE_SUBSTITUTION; + static EventTypeValue BASE_INSERTION; + static EventTypeValue BASE_DELETION; +}; + +// 协变量相关的工具类 +struct CovariateUtils { + static constexpr int MAX_READ_LENGTH = 300; // 最大read长度 + static constexpr int NUM_COVARIATES = 4; + + // 初始化PerReadCovariateMatrix + static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) { + matrix.resize(EventType::EVENT_SIZE); + for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) { + matrix[event_type].resize(MAX_READ_LENGTH); + for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) { + matrix[event_type][pos].resize(NUM_COVARIATES, 0); + } + } + } + + // 设置协变量 + static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) { + matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch; + matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion; + matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion; + } + + // 对一条read计算协变量(该协变量被上一个read用过) + static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); +}; + +// Read group协变量 +struct ReadGroupCovariate { + static constexpr int index = 0; // 在协变量数组中的索引位置 + static map RgToId; // read group name到id的映射 + static map IdToRg; // id到read group name的映射 + + static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); +}; + +// Base quality协变量 +struct BaseQualityCovariate { + static constexpr int index = 1; // 在协变量数组中的索引位置 + static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); +}; + +// Context协变量 +struct ContextCovariate { + static constexpr int index = 2; // 在协变量数组中的索引位置 + static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1; + static constexpr int LENGTH_BITS = 4; + static constexpr int LENGTH_MASK = 15; + + // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are + // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. + static constexpr int MAX_DNA_CONTEXT = 13; + + static int mismatchesContextSize; + static int indelsContextSize; + static int mismatchesKeyMask; + static int indelsKeyMask; + static uint8_t lowQualTail; + + static int baseIndexMap[256]; + + static void InitContextCovariate(BQSRArg& p) { + mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE; + indelsContextSize = p.INDELS_CONTEXT_SIZE; + if (mismatchesContextSize > MAX_DNA_CONTEXT) { + spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize); + exit(1); + } + if (indelsContextSize > MAX_DNA_CONTEXT) { + spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize); + exit(1); + } + lowQualTail = p.LOW_QUAL_TAIL; + if (mismatchesContextSize <= 0 || indelsContextSize <= 0) { + spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize); + exit(1); + } + mismatchesKeyMask = CreateMask(mismatchesContextSize); + indelsKeyMask = CreateMask(indelsContextSize); + + // init baseIndexMap + for (int i = 0; i < 256; ++i) { + baseIndexMap[i] = -1; + } + baseIndexMap['A'] = 0; + baseIndexMap['a'] = 0; + baseIndexMap['*'] = 0; + baseIndexMap['C'] = 1; + baseIndexMap['c'] = 1; + baseIndexMap['G'] = 2; + baseIndexMap['g'] = 2; + baseIndexMap['T'] = 3; + baseIndexMap['t'] = 3; + } + + static int CreateMask(int contextSize) { + int mask = 0; + // create 2*contextSize worth of bits + for (int i = 0; i < contextSize; i++) { + mask = (mask << 2) | 3; + } + // shift 4 bits to mask out the bits used to encode the length + return mask << LENGTH_BITS; + } + + /** + * Helper method: computes the correct offset to use in computations of covariate values. + * @param isNegativeStrand is the read on the negative strand + * @param offset 0-based index of the base in the read + * @param readLength length of the read + * @return + */ + static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) { + return isNegativeStrand ? (readLength - offset - 1) : offset; + } + + // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) + static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail); + // Creates a int representation of a given dna string. + static int KeyFromContext(const string& dna, const int start, const int end); + // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). + static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector& keys); + + // 设置协变量的值 + static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); +}; + +// Cycle协变量 +struct CycleCovariate { + static constexpr int index = 3; // 在协变量数组中的索引位置 + static int MAXIMUM_CYCLE_VALUE; + static constexpr int CUSHION_FOR_INDELS = 4; + + static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; } + + /** + * Encodes the cycle number as a key. + */ + static int KeyFromCycle(const int cycle, const int maxCycle) { + // no negative values because values must fit into the first few bits of the long + int result = std::abs(cycle); + if (result > maxCycle) { + spdlog::error( + "The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument " + "(when creating the recalibration table in " + "BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)", + maxCycle, result); + exit(1); + } + + result <<= 1; // shift so we can add the "sign" bit + if (cycle < 0) { + result++; // negative cycles get the lower-most bit set + } + return result; +} + + // Computes the encoded value of CycleCovariate's key for the given position at the read. + static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle); + + static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); +}; \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index eada665..0527061 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -78,6 +78,12 @@ int main_BaseRecalibrator(int argc, char *argv[]) { .nargs(1) .metavar(""); + program.add_argument("--enable-baq") + .help("Whether to do BAQ correction.") + .default_value(false) + .implicit_value(true) + .hidden(); + // add help and version args program.add_argument("-h", "--help") .action([&](const auto & /*unused*/) { @@ -113,8 +119,9 @@ int main_BaseRecalibrator(int argc, char *argv[]) { nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output"); nsgv::gBqsrArg.NUM_THREADS = program.get("--num-threads"); nsgv::gBqsrArg.CREATE_INDEX = program.get("--create-index"); - + nsgv::gBqsrArg.REFERENCE_FILE = program.get("--reference"); nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get>("--known-sites"); + nsgv::gBqsrArg.enableBAQ = program.get("--enable-baq"); // spdlog::info("known sites vcf files:"); // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) { // spdlog::info(" {}", ks); diff --git a/src/util/bam_buf.cpp b/src/util/bam_buf.cpp index 8f88976..c43cc59 100644 --- a/src/util/bam_buf.cpp +++ b/src/util/bam_buf.cpp @@ -1,5 +1,5 @@ /* - Description: sam/bam,buf, + Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 Copyright : All right reserved by ICT @@ -10,25 +10,28 @@ #include "bam_buf.h" /* - * BamBuf + * BamBuf类 */ -// , +// 读取数据直到读完,或者缓冲区满 int BamBuf::ReadBam() { int read_num = 0; - if (handle_last) { // bam - if (has_enough_space()) { // ,memffset + if (handle_last) { // 处理上次读入的最后一个bam + if (has_enough_space()) { // 必须调用,在边界处调整memffset ++read_num; append_one_bam(); } else { - return read_num; // + return read_num; // 还是没空间 } } while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) { bw->end_pos_ = BamWrap::BamEndPos(bw->b); - if (has_enough_space()) { // - // if (true) { // - append_one_bam(); - ++read_num; // + if (has_enough_space()) { // 还有空间 + // if (true) { // 还有空间 + // 加过滤器 + if (filter_out == nullptr || !filter_out(bw->b)) { + append_one_bam(); + ++read_num; // 放进缓存才算读取到 + } } else { break; } @@ -41,7 +44,7 @@ int BamBuf::ReadBam() { return read_num; } -// +// 初始化缓存 void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { this->fp = fp; this->hdr = hdr; @@ -71,9 +74,9 @@ void BamBuf::ClearAll() { prepare_read(); } -// , +// 为下一次读取做准备, 计算一些边界条件 inline void BamBuf::prepare_read() { - // bam + // 计算余留的下次计算可能用到的bam所占的位置 if (bv.size() > 0) { BamWrap *bw = bv[0]; legacy_start = (int64_t)bw - (int64_t)mem; @@ -81,11 +84,11 @@ inline void BamBuf::prepare_read() { legacy_end = (int64_t)bw + bw->length() - (int64_t)mem; } else { legacy_start = legacy_end = 0; - mem_offset = 0; // , + mem_offset = 0; // 上次没剩下,那就从头存储 } } -// +// 检查缓存是否还有空间 inline bool BamBuf::has_enough_space() { const uint32_t bam_len = bw->length(); int64_t potential_end = mem_offset + bam_len; @@ -104,7 +107,7 @@ inline bool BamBuf::has_enough_space() { return potential_end < legacy_start; } -// bam +// 处理一个读取后的bam inline void BamBuf::append_one_bam() { BamWrap *bwp = (BamWrap *)(mem + mem_offset); *bwp = *bw; @@ -113,15 +116,15 @@ inline void BamBuf::append_one_bam() { *bp = *bw->b; bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t)); memcpy(bp->data, bw->b->data, bw->b->l_data); - // + // 更新下次存储的位置 mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1)); bv.push_back(bwp); } -// read +// 处理上次读入的最后一个read inline bool BamBuf::handle_last_read() { - if (handle_last) { // bam - if (has_enough_space()) { // ,memffset + if (handle_last) { // 处理上次读入的最后一个bam + if (has_enough_space()) { // 必须调用,在边界处调整memffset append_one_bam(); handle_last = false; return true; @@ -131,9 +134,9 @@ inline bool BamBuf::handle_last_read() { } /* - * AsyncIoBamBuf + * AsyncIoBamBuf 类 */ -// +// 初始化缓存 void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { if (use_async_io_) { buf1_.Init(fp, hdr, mem_size >> 1); @@ -147,7 +150,7 @@ void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { } } -// +// 读取数据 int AsyncIoBamBuf::ReadBam() { if (use_async_io_) { hasThread = true; @@ -178,11 +181,11 @@ int AsyncIoBamBuf::async_read_bam() { first_read_ = false; refresh_bam_arr(); } else { - // join, + // join, 交换缓冲区指针 pthread_join(*tid_, 0); resize_buf(); - if (need_read_) { // + if (need_read_) { // 需要交换指针 BamBuf *tmp = pi_; pi_ = po_; po_ = tmp; @@ -190,14 +193,14 @@ int AsyncIoBamBuf::async_read_bam() { read_num = last_read_num_; refresh_bam_arr(); } - // + // 异步读 pthread_create(tid_, 0, async_read, this); return read_num; } void *AsyncIoBamBuf::async_read(void *data) { AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data; - if (ab->need_read_ && ab->ReadStat() >= 0) { // + if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取 ab->last_read_num_ = ab->po_->ReadBam(); } else { ab->last_read_num_ = 0; @@ -205,23 +208,23 @@ void *AsyncIoBamBuf::async_read(void *data) { pthread_exit(0); } -// , -// ,,po_buf +// 为下一次读取做准备, +// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取 void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; } -// ,,po_buf +// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取 void AsyncIoBamBuf::ClearAll() { clear_all_ = true; } inline void AsyncIoBamBuf::resize_buf() { - if (clear_all_) { // + if (clear_all_) { // 清理上一轮的数据 clear_all_ = false; po_->ClearBeforeIdx(legacy_size_); pi_->ClearAll(); - if (pi_->handle_last_read()) { // read + if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 last_read_num_ += 1; - legacy_size_ = pi_->Size(); // read + legacy_size_ = pi_->Size(); // 应该只有一个read need_read_ = true; - } else { // ,, + } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 legacy_size_ = 0; need_read_ = false; } @@ -229,16 +232,16 @@ inline void AsyncIoBamBuf::resize_buf() { if (clear_before_idx_ < legacy_size_) { po_->ClearBeforeIdx(clear_before_idx_); legacy_size_ -= clear_before_idx_; - // , + // 不需要交换指针,不需要读取 need_read_ = false; } else { po_->ClearBeforeIdx(legacy_size_); pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_); - if (pi_->handle_last_read()) { // read + if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 last_read_num_ += 1; - legacy_size_ = pi_->Size(); // read + legacy_size_ = pi_->Size(); // 应该只有一个read need_read_ = true; - } else { // ,, + } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 legacy_size_ = 0; need_read_ = false; } diff --git a/src/util/bam_buf.h b/src/util/bam_buf.h index d0bb7e7..dfb3405 100644 --- a/src/util/bam_buf.h +++ b/src/util/bam_buf.h @@ -1,5 +1,5 @@ /* - Description: sam/bam,buf, + Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 Copyright : All right reserved by ICT @@ -26,33 +26,40 @@ using std::vector; using namespace std; +using ReadFilterOut = bool (*)(const bam1_t *b); + /* - * bam + * 存放读入的bam数据 */ struct BamBuf { - sam_hdr_t *hdr; // samheader - samFile *fp; // sam - BamWrap *bw = nullptr; // bam - uint8_t *mem = nullptr; // bam, - // , - int64_t mem_offset = 0; // - int64_t mem_size; // - int read_stat_ = 0; // , - vector bv; // bam - int64_t legacy_start = 0; // bammem, - int64_t legacy_end = 0; // bammem, - bool handle_last = false; // bam + sam_hdr_t *hdr; // sam文件的header信息 + samFile *fp; // sam文件指针 + BamWrap *bw = nullptr; // 用来循环读入bam + uint8_t *mem = nullptr; // 用来存放bam的数据, + // 程序结束后自动释放,所以没在析构函数里释放 + int64_t mem_offset = 0; // 下一次要存放的位置 + int64_t mem_size; // 缓存大小 + int read_stat_ = 0; // 读取状态,是否读完 + vector bv; // 方便对bam数据的访问 + int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间 + int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间 + bool handle_last = false; // 上次最后读入的bam是否需要处理 + ReadFilterOut filter_out = nullptr; // 读入过滤函数指针 - // + // 初始化缓存 void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); - // , + void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) { + this->filter_out = filter; + Init(fp, hdr, mem_size); + } + // 读取数据直到读完,或者缓冲区满 int ReadBam(); - // , + // 为下一次读取做准备, 计算一些边界条件 void ClearBeforeIdx(size_t idxInBv); - // + // 清空上一次所有读入的数据 void ClearAll(); - inline int64_t Size() { return bv.size(); } // read - inline int ReadStat() { return read_stat_; } // ,() + inline int64_t Size() { return bv.size(); } // 包含多少个read + inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全) ~BamBuf() { if (this->mem != nullptr) { free(this->mem); @@ -62,7 +69,7 @@ struct BamBuf { free(bw); } } - void FreeMemory() // + void FreeMemory() // 释放开辟的内存 { if (this->mem != nullptr) { free(this->mem); @@ -75,14 +82,14 @@ struct BamBuf { this->bw = nullptr; } void prepare_read(); - // + // 检查缓存是否还有空间 bool has_enough_space(); - // bam + // 处理一个读取后的bam void append_one_bam(); - // read + // 处理上次读入的最后一个read bool handle_last_read(); - // bv + // 针对bv的操作 inline BamWrap *operator[](int64_t pos) { return bv[pos]; } inline void push_back(BamWrap *val) { bv.push_back(val); } inline void clear() { bv.clear(); } @@ -90,53 +97,57 @@ struct BamBuf { }; /* - * io + * io异步缓冲区 */ struct AsyncIoBamBuf { BamBuf buf1_; BamBuf buf2_; - BamBuf *pi_; // buf - BamBuf *po_; // buf + BamBuf *pi_; // 当前用的buf + BamBuf *po_; // 后台在读取的buf pthread_t *tid_ = NULL; bool hasThread = false; - int64_t legacy_size_ = 0; // ,read + int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量 bool first_read_ = true; - int last_read_num_ = 0; // reads + int last_read_num_ = 0; // 上一次读取了多少reads bool need_read_ = true; bool use_async_io_ = true; - int64_t clear_before_idx_ = 0; // ,clear_before_idx_reads - bool clear_all_ = false; // ,reads + int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads + bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads - vector bam_arr_; // bufbam + vector bam_arr_; // 用来访问buf中的bam AsyncIoBamBuf() {} AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {} - // + // 析构 ~AsyncIoBamBuf() { if (tid_ != NULL) { if (hasThread) pthread_join(*tid_, 0); free(tid_); } - // - // buf + // 其他的内存就等程序结束自动释放 + // buf的析构函数会自动调用 } - // + // 初始化缓存 void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); - - // + void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) { + Init(fp, hdr, mem_size); + buf1_.filter_out = filter; + buf2_.filter_out = filter; + } + // 读取数据 int ReadBam(); - // , + // 为下一次读取做准备, 计算一些边界条件 void ClearBeforeIdx(size_t idxInBv); - vector &GetBamArr() { return bam_arr_; } // bam array - // + vector &GetBamArr() { return bam_arr_; } // 获取bam array + // 清空上一次所有读入的数据 void ClearAll(); - // read + // 包含的read数量 inline int64_t Size() { return legacy_size_ + pi_->Size(); } inline int ReadStat() { return pi_->read_stat_; } inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; } - // reads + // 获取某一段reads inline vector Slice(size_t startIdx, size_t endIdx) { if (endIdx > startIdx) { auto begItr = bam_arr_.begin(); @@ -149,11 +160,11 @@ struct AsyncIoBamBuf { buf2_.FreeMemory(); } - // + // 同步读取 int sync_read_bam(); - // + // 异步读取 int async_read_bam(); - // + // 异步读取线程函数 static void *async_read(void *data); void resize_buf(); inline void refresh_bam_arr() { diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index adeed8d..387f19e 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -1,5 +1,5 @@ /* - Description: sam/bam,buf, + Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 Copyright : All right reserved by ICT @@ -19,38 +19,55 @@ using namespace std; +// 对原始bam数据的补充,比如对两端进行hardclip等 +struct ReadAdditionData { + int read_len = 0; // read长度,各种clip之后的长度 + int cigar_start = 0; // cigar起始位置,闭区间 + int cigar_end = 0; // cigar结束位置,开区间 + int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧 + int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧 + int left_clip = 0; // 左侧被切掉的碱基长度 + int right_clip = 0; // 右侧被切掉的碱基长度 + int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量 + string bases; // 处理之后的read的碱基 +}; + /* - ,,,,, - bam,,。 + 这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明 + 这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。 */ + /* - * sam read + * sam read的封装 */ struct BamWrap { - // contigpos - const static int MAX_CONTIG_LEN_SHIFT = 40; // id, + // 将contig左移后加上pos作为全局位置 + const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起 const static int READ_MAX_LENGTH = 200; - const static int READ_MAX_DEPTH = 1000; // , + const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系 - // , + // 成员变量尽量少,减少占用内存空间 bam1_t *b; - int64_t end_pos_; // bam, + int64_t end_pos_; // bam的全局结束位置, 相对ref, 闭区间 - // + // 全局开始位置 inline int64_t start_pos() { return bam_global_pos(b); } - // + // 全局结束位置 inline int64_t end_pos() { return end_pos_; } - // reference + // 和reference对应的序列长度,不是read包含碱基的个数 inline int16_t read_len() { return (end_pos_ - start_pos() + 1); } - // contig + // contig id + inline int32_t contig_id() { return b->core.tid; } + // 在contig内的开始位置 inline int32_t contig_pos() { return b->core.pos; } - // contig + // 在contig内部的结束位置 inline int32_t contig_end_pos() { return bam_pos(end_pos_); } - // (AGTC) + // 序列的长度(AGTC字母个数) inline int16_t seq_len() { return b->core.l_qseq; } - // softclip +/* + // 算上开头的softclip inline int32_t softclip_start() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -61,18 +78,28 @@ struct BamWrap { return bc.pos; } - // softclip + inline int64_t global_softclip_start() { + return softclip_start() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT); + } + + // 算上结尾的softclip,闭区间 inline int32_t softclip_end() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; - const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]); - const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]); + const int idx = bc.n_cigar - 1; + if (idx < 0) return bam_pos(end_pos_); + const char c = bam_cigar_opchr(cigar[idx]); + const int len = bam_cigar_oplen(cigar[idx]); if (c == 'S') return bam_pos(end_pos_) + len; return bam_pos(end_pos_); } - // softclip + inline int64_t global_softclip_end() { + return softclip_end() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT); + } + + // 右边softclip的长度 inline int32_t right_softclip_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -82,8 +109,9 @@ struct BamWrap { return len; return 0; } +*/ - // + // 获取序列 inline std::string sequence() { ostringstream oss; char *seq = (char *)bam_get_seq(b); @@ -96,9 +124,9 @@ struct BamWrap { return std::move(oss.str()); } - // + // 获取名字 inline const char *query_name() { return bam_get_qname(b); } - // cigar + // 获取cigar 字符串 inline string cigar_str() { ostringstream oss; const uint32_t *cigar = bam_get_cigar(b); @@ -111,10 +139,10 @@ struct BamWrap { return std::move(oss.str()); } - // + // 占用的内存大小 inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; } - // cigarinsert + // 获取cigar中insert的总长度 inline int32_t insert_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -128,7 +156,7 @@ struct BamWrap { return ret; } - // cigardelete + // 获取cigar中delete的总长度 inline int32_t del_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -142,7 +170,7 @@ struct BamWrap { return ret; } - // sam read + // 计算sam read的终点位置,相对参考基因组 static inline int64_t BamEndPos(const bam1_t *b) { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -156,6 +184,20 @@ struct BamWrap { return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset)); }; + // 计算read的有效长度,即除了softclip和hardclip之外的长度 + static inline int BamEffectiveLength(const bam1_t *b) { + const uint32_t *cigar = bam_get_cigar(b); + const bam1_core_t &bc = b->core; + int effective_len = 0; + for (int i = 0; i < bc.n_cigar; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + if (c == 'I' || c == 'N' || c == 'M' || c == '=' || c == 'X') + effective_len += len; + } + return effective_len; + }; + bool HasWellDefinedFragmentSize() { const bam1_core_t &bc = b->core; bool hasWellDefinedFragmentSize = true; @@ -170,20 +212,25 @@ struct BamWrap { return hasWellDefinedFragmentSize; } - // bamadapterBoundary + // 计算bam的adapterBoundary int GetAdapterBoundary() { const bam1_core_t &bc = b->core; - int adapterBoundary; + int adapterBoundary = INT_MIN; if (!HasWellDefinedFragmentSize()) adapterBoundary = INT_MIN; else if (bc.flag & BAM_FREVERSE) adapterBoundary = bc.mpos - 1; else - adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 GATK3.5,3.5+1 + adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1 return adapterBoundary; } - // I + // 检测adapter boundary是否在read范围内 + bool IsAdapterInRead(int adapterBoundary) { + return (adapterBoundary != INT_MIN && (adapterBoundary >= contig_pos() && adapterBoundary <= contig_end_pos())); + } + + // 获取开头的I的长度 inline int GetHeadInsertLen() { int insLen = 0; const uint32_t *cigar = bam_get_cigar(b); @@ -200,8 +247,8 @@ struct BamWrap { return insLen; } - // soft clip(HS,?, - // IS?) + // 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, + // 注意开头的I要当做S?) inline int64_t GetSoftStart() { int64_t softStart = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); @@ -209,7 +256,8 @@ struct BamWrap { for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); - if (c == 'S' || c == 'I') + // if (c == 'S' || c == 'I') + if (c == 'S') softStart -= len; else if (c != 'H') break; @@ -217,7 +265,36 @@ struct BamWrap { return softStart; } - // unclipped(hardclip) + /** + * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. + * + * @return the unclipped end of the read taking soft clips (but not hard clips) into account + */ + inline int64_t GetSoftEnd() { + int64_t softEnd = contig_end_pos(); + const uint32_t* cigar = bam_get_cigar(b); + const bam1_core_t& bc = b->core; + bool foundAlignedBase = false; + for (int i = bc.n_cigar - 1; i >= 0; --i) { + const char c = bam_cigar_opchr(cigar[i]); + const int len = bam_cigar_oplen(cigar[i]); + // if (c == 'S' || c == 'I') + if (c == 'S') + softEnd += len; + else if (c != 'H') { + foundAlignedBase = true; + break; + } + } + if (!foundAlignedBase) { // for example 64H14S, the soft end is actually the same as the alignment end + softEnd = contig_end_pos(); + } + return softEnd; + } + + // 获取unclipped开始位置(包括hardclip) inline int64_t GetUnclippedStart() { int64_t start = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); @@ -233,7 +310,7 @@ struct BamWrap { return start; } - // unclipped(hardclip) + // 获取unclipped结束位置(包括hardclip) inline int64_t GetUnclippedEnd() { int64_t end_pos = bam_endpos(b); const uint32_t *cigar = bam_get_cigar(b); @@ -249,7 +326,7 @@ struct BamWrap { return end_pos - 1; } - /* */ + /* 获取碱基质量分数的加和 */ /** Calculates a score for the read which is the sum of scores over Q15. */ inline int GetSumOfBaseQualities() { int score = 0; @@ -262,9 +339,9 @@ struct BamWrap { return score; } - /* flag */ + /* 与flag相关的检测 */ - /* unmapped */ + /* 没有比对上 unmapped */ inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; } /* Template having multiple segments in sequencing */ @@ -313,7 +390,7 @@ struct BamWrap { */ bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; } - /* */ + /* 其他的一些信息 */ inline int GetReferenceLength() { int length = 0; const uint32_t *cigar = bam_get_cigar(b); @@ -336,26 +413,26 @@ struct BamWrap { return length; } - // bam, + // 计算bam的全局位置,算上染色体序号和比对位置 static inline int64_t bam_global_pos(bam1_t *b) { return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos); } static inline int64_t bam_global_pos(int tid, int pos) { return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos); } - // bam + // 根据全局位置获取bam的染色体序号 static inline int32_t bam_tid(int64_t global_pos) { const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1); const int64_t high_tid = global_pos & mask; return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT); } - // bam() + // 根据全局位置获取bam的比对位置(染色体内) static inline int32_t bam_pos(int64_t global_pos) { const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; return (int32_t)(global_pos & mask); } - // + // 设置是否冗余的标记 void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); } void setFlag(bool flag, int bit) { diff --git a/src/util/binary_util.h b/src/util/binary_util.h new file mode 100644 index 0000000..aa1e547 --- /dev/null +++ b/src/util/binary_util.h @@ -0,0 +1,131 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +using std::ifstream; +using std::ofstream; +using std::ostringstream; +using std::string; +using std::vector; + +using namespace std; + +// 二进制读写相关 +struct BinaryUtil { + static void WriteInt(ofstream& out, int val) { + uint32_t i = (uint32_t)val; + out << (char)(i & 0xFF) << (char)((i >> 8) & 0xFF) << (char)((i >> 16) & 0xFF) << (char)((i >> 24) & 0xFF); + }; + + static void WriteLong(ofstream& out, uint64_t val) { + uint64_t i = val; + out << (char)(i & 0xFF) << (char)((i >> 8) & 0xFF) << (char)((i >> 16) & 0xFF) << (char)((i >> 24) & 0xFF) << (char)((i >> 32) & 0xFF) + << (char)((i >> 40) & 0xFF) << (char)((i >> 48) & 0xFF) << (char)((i >> 56) & 0xFF); + }; + + static void WriteStr(ofstream& out, const string& s) { + for (int i = 0; i < s.size(); ++i) out << s[i]; + out << (char)0; + } + + static bool ReadInt(char* buf, uint64_t total, uint64_t* cur, int* res) { + char b1, b2, b3, b4; + if (*cur + 4 > total) + return false; + b1 = buf[(*cur)++]; + b2 = buf[(*cur)++]; + b3 = buf[(*cur)++]; + b4 = buf[(*cur)++]; + *res = (((uint32_t)(uint8_t)b4) << 24) + (((uint32_t)(uint8_t)b3) << 16) + (((uint32_t)(uint8_t)b2) << 8) + (((uint32_t)(uint8_t)b1)); + return true; + }; + + static bool ReadInt(ifstream& ifs, int* res) { + // if(ifs.read((char*)res, sizeof(*res))) return true; + char b[4]; + if (!ifs.read(&b[0], 1)) + return false; + if (!ifs.read(&b[1], 1)) + return false; + if (!ifs.read(&b[2], 1)) + return false; + if (!ifs.read(&b[3], 1)) + return false; + uint64_t cur = 0; + return ReadInt((char*)b, 4, &cur, res); + } + + static bool ReadLong(char* buf, uint64_t total, uint64_t* cur, uint64_t* res) { + char b1, b2, b3, b4, b5, b6, b7, b8; + if (*cur + 8 > total) + return false; + b1 = buf[(*cur)++]; + b2 = buf[(*cur)++]; + b3 = buf[(*cur)++]; + b4 = buf[(*cur)++]; + b5 = buf[(*cur)++]; + b6 = buf[(*cur)++]; + b7 = buf[(*cur)++]; + b8 = buf[(*cur)++]; + *res = (((uint64_t)(uint8_t)b8) << 56) + (((uint64_t)(uint8_t)b7) << 48) + (((uint64_t)(uint8_t)b6) << 40) + (((uint64_t)(uint8_t)b5) << 32) + + (((uint64_t)(uint8_t)b4) << 24) + (((uint64_t)(uint8_t)b3) << 16) + (((uint64_t)(uint8_t)b2) << 8) + (((uint64_t)(uint8_t)b1)); + return true; + }; + + static bool ReadLong(ifstream& ifs, uint64_t* res) { + // if(ifs.read((char*)res, sizeof(*res))) return true; + char b[8]; + if (!ifs.read(&b[0], 1)) + return false; + if (!ifs.read(&b[1], 1)) + return false; + if (!ifs.read(&b[2], 1)) + return false; + if (!ifs.read(&b[3], 1)) + return false; + if (!ifs.read(&b[4], 1)) + return false; + if (!ifs.read(&b[5], 1)) + return false; + if (!ifs.read(&b[6], 1)) + return false; + if (!ifs.read(&b[7], 1)) + return false; + uint64_t cur = 0; + return ReadLong((char*)b, 8, &cur, res); + } + static bool ReadStr(ifstream& ifs, string* res) { + char b; + res->clear(); + if (!ifs.read(&b, 1)) + return false; + while ((int)b != 0) { + res->push_back(b); + if (!ifs.read(&b, 1)) + return false; + } + return true; + } + static bool ReadStr(char* buf, uint64_t total, uint64_t* cur, string* res) { + char b; + res->clear(); + if (*cur >= total) + return false; + b = buf[(*cur)++]; + while ((int)b != 0) { + res->push_back(b); + if (*cur >= total) + return false; + b = buf[(*cur)++]; + } + return true; + } +}; \ No newline at end of file diff --git a/src/util/interval.cpp b/src/util/interval.cpp new file mode 100755 index 0000000..94e5cb5 --- /dev/null +++ b/src/util/interval.cpp @@ -0,0 +1,295 @@ +/* + Description: 处理intervals + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/24 +*/ + +#include "interval.h" + +#include +#include +#include +#include +#include + +#include + +#include "utils.h" +#include "bam_wrap.h" + +using std::min; +using std::max; +using std::string; +using std::ifstream; +using std::stringstream; + +using namespace std; + +// 构造函数 +Interval::Interval() : Interval(0, 0) {} +Interval::Interval(int64_t l, int64_t r) : left(l), right(r) {} + +// 比较函数 +bool Interval::operator<(const Interval& other) { + if (left == other.left) { + return right < other.right; + } + return left < other.left; +} + +// 是否有重叠 +bool Interval::overlaps(const Interval &other) { + return left <= other.right && right >= other.left; +} + +// 两个interval的合并 +Interval& Interval::spanWith(const Interval &other) { + left = min(left, other.left); + right = max(right, other.right); + return *this; +} + +// 返回两个interval的交集,不改变当前interval +Interval Interval::intersect(const Interval &that) const { + Interval val; + val.left = max(left, that.left); + val.right = min(right, that.right); + return val; +} + + +/* + * 合并两个interval arr,取相交区域的交集, interval arr都是排序后的 + */ +void Interval::IntersectIntervals(const IntervalArr &a_arr, + const IntervalArr &b_arr, + IntervalArr *r_arr) { + if (a_arr.size() < 1 || b_arr.size() < 1) return; + int ai=0, bi=0; + const Interval *last, *cur; + if (a_arr[ai].left < b_arr[bi].left) last = &a_arr[ai++]; + else last = &b_arr[bi++]; + while (ai < a_arr.size() && bi < b_arr.size()) { + if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++]; + else cur = &b_arr[bi++]; + if (last->right < cur->left) { + last = cur; continue; + } else if (last->right > cur->right) { + r_arr->push_back(*cur); + } else { + r_arr->push_back(Interval(cur->left, last->right)); + last = cur; + } + } + const IntervalArr *arrp; + int ii; + if (ai < a_arr.size()) { arrp = &a_arr; ii = ai;} + else { arrp = &b_arr; ii = bi; } + const IntervalArr &arr = *arrp; + while(ii < arr.size()) { + cur = &arr[ii++]; + if (last->right < cur->left) { + break; + } else if (last->right > cur->right) { + r_arr->push_back(*cur); + } else { + r_arr->push_back(Interval(cur->left, last->right)); + break; + } + } +} + +/* + * 合并两个interval arr,取并集 + */ +void Interval::UnionIntervals(const IntervalArr &a_arr, + const IntervalArr &b_arr, + IntervalArr *r_arr) { + Interval tmp; + const Interval *cur; + Interval *last; + int ai=0, bi=0; + if (a_arr.size() < 1) { *r_arr = b_arr; return; } + if (b_arr.size() < 1) { *r_arr = a_arr; return; } + r_arr->clear(); + + if (a_arr[ai].left < b_arr[bi].left) tmp = a_arr[ai++]; + else tmp = b_arr[bi++]; + last = &tmp; + while(ai < a_arr.size() && bi < b_arr.size()) { + if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++]; + else cur = &b_arr[bi++]; + if (last->right < cur->left) { + r_arr->push_back(*last); + *last = *cur; + } else { + last->right = max(last->right, cur->right); + } + } + const IntervalArr *arrp; + int ii; + if (ai < a_arr.size()) { arrp = &a_arr; ii = ai; } + else { arrp = &b_arr; ii = bi; } + const IntervalArr &arr = *arrp; + + while(ii < arr.size()) { + cur = &arr[ii++]; + if (last->right < cur->left) { + r_arr->push_back(*last); + *last = *cur; + } else { + last->right = max(last->right, cur->right); + } + } + r_arr->push_back(*last); +} + +/* + * 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集 + */ +int64_t Interval::MergeIntervals(const IntervalArr &n_arr, + const IntervalArr &t_arr, + IntervalArr &in_arr, + int64_t start_loc, // 闭区间 + int64_t *end_loc, // 开区间 + IntervalArr *r_arr) { + IntervalArr tmp_arr; + const int64_t end_loc_val = *end_loc; + if (in_arr.size() < 1) { // 如果输入的interval为空,则使用tumor normal覆盖的interval + UnionIntervals(n_arr, t_arr, &tmp_arr); + } else { + IntervalArr mid_arr; + UnionIntervals(n_arr, t_arr, &mid_arr); + IntersectIntervals(mid_arr, in_arr, &tmp_arr); + } + for(int i=tmp_arr.size()-1; i>=0; --i) { + if (tmp_arr[i].left >= end_loc_val) { + tmp_arr.pop_back(); // 删除该元素 + continue; + } + tmp_arr[i].right = min(tmp_arr[i].right, end_loc_val - 1); // end_loc是开区间 + break; + } + for (int i=0; ipush_back(Interval(start_loc, tmp_arr[i].right)); + } else { + r_arr->push_back(tmp_arr[i]); + } + } + + int next_i = 0; + while(next_i < in_arr.size() && in_arr[next_i].right < end_loc_val) ++next_i; + if (next_i < in_arr.size()) { + if (end_loc_val < in_arr[next_i].left) { + *end_loc = in_arr[next_i].left; // 更新本次处理的终点 + } else { + in_arr[next_i].left = end_loc_val; // 更新panel + } + int i=0, j=next_i; + for (; jsize(); ++i) { + locus_num += (*r_arr)[i].right - (*r_arr)[i].left + 1; + } + return locus_num; +} + +/* + * 读取interval文件 + */ +void Interval::ReadInterval(const string &interval_fn, + bam_hdr_t* header, + int interval_padding, + IntervalArr *r_arr) { + ifstream interval_fs(interval_fn); + string one_line; + IntervalArr intervals; + getline(interval_fs, one_line); + while (!interval_fs.eof()) { + if (one_line[0] == '@') { + getline(interval_fs, one_line); + continue; + } + stringstream ss_line(one_line); + string contig_name; + ss_line >> contig_name; + int itid = sam_hdr_name2tid(header, contig_name.c_str()); + if (itid < 0) error("[%s] interval file has unknown contig name [%s]\n", __func__, contig_name.c_str()); + int64_t tid = (int64_t)itid; + tid <<= CONTIG_SHIFT; + int64_t start, stop; + ss_line >> start >> stop; + // interval文件是1-based,所以这里要减去1 + intervals.push_back(Interval(tid + start - 1, tid + stop -1)); + getline(interval_fs, one_line); + } + sort(intervals.begin(), intervals.end()); + if (intervals.size() > 0) { + Interval new_span(intervals[0].left-interval_padding, intervals[0].right+interval_padding); + for (int i=1; i new_span.right) { + r_arr->push_back(new_span); + new_span.left = intervals[i].left - interval_padding; + new_span.right = intervals[i].right + interval_padding; + } else { + new_span.right = max(new_span.right, intervals[i].right + interval_padding); + } + } + r_arr->push_back(new_span); + } + interval_fs.close(); +} + +/* + * 将interval相连的区域合并 + */ +void Interval::ShrinkInterval(IntervalArr *ivap) { + if (ivap->size() < 1) return; + IntervalArr &iva = *ivap; + IntervalArr tiva = iva; + iva.clear(); + Interval iv; + iv.left = tiva[0].left; + iv.right = tiva[0].right; + for (int i=1; itarget_len[tid]; + result.left = max(BamWrap::bam_global_pos(tid, 0), ext_left); + result.right = min(ext_right, contig_len - 1 + BamWrap::bam_global_pos(tid, 0)); + + return result; +} + diff --git a/src/util/interval.h b/src/util/interval.h new file mode 100755 index 0000000..65b8cad --- /dev/null +++ b/src/util/interval.h @@ -0,0 +1,108 @@ +/* + Description: 处理intervals + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/24 +*/ + +#ifndef INTERVAL_H_ +#define INTERVAL_H_ + +#include +#include +#include +#include + +#include + +#include "bam_wrap.h" + +using namespace std; + +// 前向声明 +class Interval; +typedef std::vector IntervalArr; +/* + * 闭区间 + */ +struct Interval { + // const常量 + const static int CONTIG_SHIFT = 40; + const static uint64_t POS_MASK = (1LL << CONTIG_SHIFT) - 1; + + // 类变量 + int64_t left; + int64_t right; + + // 构造函数 + Interval(); + explicit Interval(int64_t l, int64_t r); + explicit Interval(int tid, int l, int r) : Interval(BamWrap::bam_global_pos(tid, l), BamWrap::bam_global_pos(tid, r)) {} + // 获取tid + int contig() { return left >> CONTIG_SHIFT; } + int contigLeft() { return left & POS_MASK; } + int contigRight() { return right & POS_MASK; } + + // 比较函数 + bool operator<(const Interval &other); + // 是否有重叠 + bool overlaps(const Interval &other); + // 两个interval的合并, 会改变当前interval + Interval& spanWith(const Interval &other); + // 返回两个interval的交集,不改变当前interval + Interval intersect(const Interval &that) const; + + // for debug + string toString() const { + ostringstream oss; + oss << BamWrap::bam_tid(left) + 1 << ":" + << BamWrap::bam_pos(left) + 1 << "-" + << BamWrap::bam_pos(right) + 1; + + return oss.str(); + } + /* + * 合并两个interval arr,取相交区域的交集, interval arr都是排序后的 + */ + static void IntersectIntervals(const IntervalArr &a_arr, + const IntervalArr &b_arr, + IntervalArr *r_arr); + /* + * 合并两个interval arr,相交的区域取并集 + */ + static void UnionIntervals(const IntervalArr &a_arr, + const IntervalArr &b_arr, + IntervalArr *r_arr); + /* + * 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集 + */ + static int64_t MergeIntervals(const IntervalArr &n_arr, + const IntervalArr &t_arr, + IntervalArr &in_arr, // 会更改 + int64_t start_loc, // 闭区间 + int64_t *end_loc, // 开区间, 会更改 + IntervalArr *r_arr); + /* + * 读取interval文件 + */ + static void ReadInterval(const std::string &interval_fn, + bam_hdr_t* header, + int interval_padding, + IntervalArr *r_arr); + /* + * 将interval相连的区域合并 + */ + static void ShrinkInterval(IntervalArr *iva); + + /* + * 根据header信息,扩展interval + */ + static Interval ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header); +}; + + +#endif + + diff --git a/src/util/linear_index.cpp b/src/util/linear_index.cpp new file mode 100644 index 0000000..c2924c4 --- /dev/null +++ b/src/util/linear_index.cpp @@ -0,0 +1,292 @@ +/* + Description: vcf index utils + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/23 +*/ + +#include "linear_index.h" +#include "bam_wrap.h" +#include "binary_util.h" + +/* + * LinearIndex 类 + */ +// 搜索区间参数对应的数据区域,闭区间 +void LinearIndex::SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len) { + int stid, spos, etid, epos, stid_origin, spos_origin, etid_origin, epos_origin; + stid = stid_origin = BamWrap::bam_tid(start); + spos = spos_origin = BamWrap::bam_pos(start); + etid = etid_origin = BamWrap::bam_tid(end); + epos = epos_origin = BamWrap::bam_pos(end); + + while (stid < idx_.size() && idx_[stid].size() == 0) ++stid; + while (etid >= 0 && idx_[etid].size() == 0) --etid; + if (stid > etid || stid >= idx_.size()) { + *content_len = 0; + return; + } + int sb_idx, eb_idx; + if (stid == stid_origin) { + sb_idx = spos / idx_[stid].binWidth; + if (sb_idx >= idx_[stid].size()) { // 开始区域没有对应的block + sb_idx = idx_[stid].size() - 1; + Block& sb = idx_[stid][sb_idx]; + *file_pos = sb.startPosition + sb.size; + } else { + Block& sb = idx_[stid][sb_idx]; + *file_pos = sb.startPosition; + } + } else { + sb_idx = 0; + Block& sb = idx_[stid][sb_idx]; + *file_pos = sb.startPosition; + } + if (etid == etid_origin) { + eb_idx = epos / idx_[etid].binWidth; + if (eb_idx >= idx_[etid].size()) { + eb_idx = idx_[etid].size() - 1; + } + } else { + eb_idx = idx_[etid].size() - 1; + } + Block& eb = idx_[etid][eb_idx]; + if (eb.startPosition + eb.size > *file_pos) + *content_len = eb.startPosition + eb.size - *file_pos; + else + *content_len = 0; +} + +// 读入index文件信息 +bool LinearIndex::ReadIndex(const string& idx_fn) { + if (bam_hdr_ == NULL) + return false; + ifstream ifs(idx_fn, ios::in | ios::binary); + // 如果idx文件不存在,则创建 todo + if (!ifs.good()) + return false; + // 全部读进来再处理 + uint64_t fsize = 0; + uint64_t cur = 0; + ifs.seekg(0, ios::end); + fsize = (uint64_t)ifs.tellg(); + ifs.seekg(0, ios::beg); + char* buf = (char*)malloc(fsize + 10); + if (!ifs.read(buf, fsize)) { + free(buf); + ifs.close(); + return false; + } + ifs.close(); + int int_val, version, flags; + uint64_t long_val; + string str_val; + + // read index header + BinaryUtil::ReadInt(buf, fsize, &cur, &int_val); + // cout << "magic: " << int_val << endl; + BinaryUtil::ReadInt(buf, fsize, &cur, &int_val); + if (int_val != INDEX_TYPE) { + free(buf); + ifs.close(); + return false; + } + // cout << "type: " << int_val << endl; + BinaryUtil::ReadInt(buf, fsize, &cur, &version); + // cout << "version: " << version << endl; + BinaryUtil::ReadStr(buf, fsize, &cur, &str_val); + // cout << "path: " << str_val << endl; + BinaryUtil::ReadLong(buf, fsize, &cur, &vcf_fsize); // vcf file size + // cout << "file size: " << long_val << endl; + BinaryUtil::ReadLong(buf, fsize, &cur, &long_val); // test number + BinaryUtil::ReadStr(buf, fsize, &cur, &str_val); // md5 + BinaryUtil::ReadInt(buf, fsize, &cur, &flags); + + if (version < 3 && (flags & 0x8000) == 0x8000) { + BinaryUtil::ReadInt(buf, fsize, &cur, &int_val); + if (int_val < 0) + return false; + for (int i = 0; i < int_val; ++i) { + BinaryUtil::ReadStr(buf, fsize, &cur, &str_val); + BinaryUtil::ReadInt(buf, fsize, &cur, &int_val); + } + } + if (version >= 3) { + int nProperties; + BinaryUtil::ReadInt(buf, fsize, &cur, &nProperties); + string key, val; + while (nProperties-- > 0) { + BinaryUtil::ReadStr(buf, fsize, &cur, &key); + BinaryUtil::ReadStr(buf, fsize, &cur, &val); + properties_[key] = val; + } + } + // read index content + int n_chrom; + BinaryUtil::ReadInt(buf, fsize, &cur, &n_chrom); + int last_tid = 0; + string last_name; + int bam_tid = 0; // tid从0开始,索引的染色体也要补充一个数据, 使得idx_ vector与tid一一对应 + while (n_chrom-- > 0) { + ChrIndex ci; + BinaryUtil::ReadStr(buf, fsize, &cur, &ci.name); + BinaryUtil::ReadInt(buf, fsize, &cur, &ci.binWidth); + int nBins; + BinaryUtil::ReadInt(buf, fsize, &cur, &nBins); + BinaryUtil::ReadInt(buf, fsize, &cur, &ci.longestFeature); + BinaryUtil::ReadInt(buf, fsize, &cur, &int_val); + BinaryUtil::ReadInt(buf, fsize, &cur, &ci.nFeatures); + ci.blocks.reserve(nBins); + int tid = sam_hdr_name2tid(bam_hdr_, ci.name.c_str()); + if (tid < 0) { + if (ci.name == last_name) { + tid = last_tid; + } else { + tid = ++last_tid; + last_name = ci.name; + } + } + while (bam_tid + 1 < tid) { + idx_.push_back(ChrIndex()); // 补充空染色体 + ++bam_tid; + } + bam_tid = tid; + ci.tid = tid; + uint64_t pos; + uint64_t nextPos; + BinaryUtil::ReadLong(buf, fsize, &cur, &pos); + for (int i = 0; i < nBins; ++i) { + BinaryUtil::ReadLong(buf, fsize, &cur, &nextPos); + ci.blocks.push_back(Block(pos, nextPos - pos)); + pos = nextPos; + } + idx_.push_back(ci); + } + sort(idx_.begin(), idx_.end()); + ifs.close(); + free(buf); + return true; +} + +/* + * LinearIndex::ChrIndex 类 + */ +void LinearIndex::ChrIndex::write(ofstream& out) const { + BinaryUtil::WriteStr(out, name); + BinaryUtil::WriteInt(out, binWidth); + BinaryUtil::WriteInt(out, blocks.size()); + BinaryUtil::WriteInt(out, longestFeature); + BinaryUtil::WriteInt(out, 0); + BinaryUtil::WriteInt(out, nFeatures); + uint64_t pos = 0, si = 0; + for (int i = 0; i < blocks.size(); ++i) { + pos = blocks[i].startPosition; + si = blocks[i].size; + BinaryUtil::WriteLong(out, pos); + } + BinaryUtil::WriteLong(out, pos + si); +} + +/* + * LinearIndexCreator类 + */ +// 初始化索引文件头部信息 +void LinearIndexCreator::InitHeaderDict(bam_hdr_t* hdr) { + for (int i = 0; i < hdr->n_targets; ++i) { + v_contig_name_.push_back(hdr->target_name[i]); + contig_name_to_id_[hdr->target_name[i]] = i; + contig_len_[hdr->target_name[i]] = hdr->target_len[i]; + ++n_properties_; + } +} + +// 添加一条vcf记录 +void LinearIndexCreator::AddFeature(const Feature& ft, uint64_t f_pos) { // f_pos是vcf文件当前正要写入的位置 + if (idx_.size() == 0 || idx_.back().tid != ft.tid) { + if (idx_.size() > 0) { + for (int i = 0; i < blocks_.size() - 1; ++i) { + blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition; + idx_.back().blocks.push_back(blocks_[i]); + } + blocks_.back().size = f_pos - blocks_.back().startPosition; + idx_.back().blocks.push_back(blocks_.back()); + } + idx_.push_back(LinearIndex::ChrIndex(v_contig_name_[ft.tid], ft.tid, bin_width_)); + blocks_.clear(); + blocks_.push_back(Block(f_pos, 0)); + longest_feature_ = 0; + } + while (ft.start > blocks_.size() * bin_width_) blocks_.push_back(Block(f_pos, 0)); + if (ft.FeatureLen() > longest_feature_) { + longest_feature_ = ft.FeatureLen(); + idx_.back().longestFeature = longest_feature_; + } + ++idx_.back().nFeatures; + ++FEATURE_COUNT_; + all_feature_len += ft.FeatureLen(); +} + +// 添加所有记录完毕 +void LinearIndexCreator::FinalizeIndex(uint64_t f_pos) { + if (f_pos == 0) + error("[%s] Error: finalize index failed\n", __func__); + if (blocks_.size() > 0) { + for (int i = 0; i < blocks_.size() - 1; ++i) { + blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition; + idx_.back().blocks.push_back(blocks_[i]); + } + blocks_.back().size = f_pos - blocks_.back().startPosition; + idx_.back().blocks.push_back(blocks_.back()); + blocks_.clear(); + } + FEATURE_LENGTH_MEAN_ = (double)all_feature_len / (double)FEATURE_COUNT_; + index_file_size_ = f_pos; +} + +// 写入index文件 +void LinearIndexCreator::WriteIndex(const string& out_fn) { + ofstream out; + out.open(out_fn, ios::out | ios::binary); + + BinaryUtil::WriteInt(out, MAGIC_NUMBER); + BinaryUtil::WriteInt(out, LinearIndex::INDEX_TYPE); + BinaryUtil::WriteInt(out, LinearIndex::INDEX_VERSION); + BinaryUtil::WriteStr(out, out_fn); + BinaryUtil::WriteLong(out, index_file_size_); + BinaryUtil::WriteLong(out, -1); + BinaryUtil::WriteStr(out, ""); + BinaryUtil::WriteInt(out, flags_); + n_properties_ += 4; + BinaryUtil::WriteInt(out, n_properties_); + ostringstream oss; + oss << FEATURE_LENGTH_MEAN_; + BinaryUtil::WriteStr(out, "FEATURE_LENGTH_MEAN"); + BinaryUtil::WriteStr(out, oss.str()); + oss.str(""); + oss << FEATURE_LENGTH_STD_DEV_; + BinaryUtil::WriteStr(out, "FEATURE_LENGTH_STD_DEV"); + BinaryUtil::WriteStr(out, oss.str()); + oss.str(""); + oss << MEAN_FEATURE_VARIANCE_; + BinaryUtil::WriteStr(out, "MEAN_FEATURE_VARIANCE"); + BinaryUtil::WriteStr(out, oss.str()); + oss.str(""); + oss << FEATURE_COUNT_; + BinaryUtil::WriteStr(out, "FEATURE_COUNT"); + BinaryUtil::WriteStr(out, oss.str()); + for (int i = 0; i < v_contig_name_.size(); ++i) { + const string& cn = v_contig_name_[i]; + oss.str(""); + oss << contig_len_[cn]; + BinaryUtil::WriteStr(out, "DICT:" + cn); + BinaryUtil::WriteStr(out, oss.str()); + } + BinaryUtil::WriteInt(out, idx_.size()); + for (int i = 0; i < idx_.size(); ++i) { + const LinearIndex::ChrIndex& ci = idx_[i]; + ci.write(out); + } + out.close(); +} diff --git a/src/util/linear_index.h b/src/util/linear_index.h new file mode 100644 index 0000000..6d690e7 --- /dev/null +++ b/src/util/linear_index.h @@ -0,0 +1,129 @@ +/* + Description: vcf index utils + + Copyright : All right reserved by ICT + + Author : Zhang Zhonghai + Date : 2019/11/23 +*/ + +#ifndef INDEX_H_ +#define INDEX_H_ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +using std::ifstream; +using std::ios; +using std::ofstream; +using std::ostringstream; +using std::sort; +using std::string; +using std::unordered_map; +using std::vector; + +using std::cout; +using std::endl; + +struct Block { + Block(uint64_t start, uint64_t si) : startPosition(start), size(si) {} + uint64_t startPosition; + uint64_t size; +}; + +struct Feature { + int tid; + int start; // 闭区间 + int end; // 闭区间 + Feature(int ti, int b, int e) : tid(ti), start(b), end(e) {} + inline int FeatureLen() const { return end - start + 1; } +}; + +struct LinearIndex { + const static int MAX_FEATURES_PER_BIN = 100; + const static int INDEX_TYPE = 1; + const static int INDEX_VERSION = 3; + const static int MAX_BIN_WIDTH = 1024000; + + LinearIndex() {} + LinearIndex(bam_hdr_t* hdr) : bam_hdr_(hdr) {} + void SetHeader(bam_hdr_t* hdr) { bam_hdr_ = hdr; } + + class ChrIndex; + vector idx_; + vector vkey_; + vector vval_; + unordered_map properties_; + bam_hdr_t* bam_hdr_ = NULL; // 这个应该换成bcf_hdr_t + uint64_t vcf_fsize = 0; + + // 染色体索引信息 + struct ChrIndex { + string name; + int tid; + int binWidth; + int longestFeature = 0; + int nFeatures = 0; + vector blocks; + ChrIndex() {}; + ChrIndex(string& n, int ti, int bw) : name(n), tid(ti), binWidth(bw) {} + inline bool operator<(const ChrIndex& ci) const { return tid < ci.tid; }; + inline Block& operator[](int pos) { return blocks[pos]; } + inline int size() { return blocks.size(); } + void write(ofstream& out) const; + }; + + inline ChrIndex& operator[](int tid) { return idx_[tid]; } + + // 闭区间 + void SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len); + + // 读入index文件信息 + bool ReadIndex(const string& idx_fn); +}; + +// 根据vcf数据创建index文件 +struct LinearIndexCreator { + const static int INDEX_VERSION = LinearIndex::INDEX_VERSION; + const static int MAGIC_NUMBER = 1480870228; + const static int DEFAULT_BIN_WIDTH = 8000; + int bin_width_ = DEFAULT_BIN_WIDTH; + string input_vcf_fn_; + string output_index_fn_; + int longest_feature_ = 0; + uint64_t index_file_size_ = 0; + int flags_ = 0; + int n_properties_ = 0; + float FEATURE_LENGTH_MEAN_ = 0.0; + float FEATURE_LENGTH_STD_DEV_ = 0.0; + float MEAN_FEATURE_VARIANCE_ = 0.0; + uint64_t FEATURE_COUNT_ = 0; + uint64_t all_feature_len = 0; + + unordered_map contig_name_to_id_; + unordered_map contig_len_; + vector v_contig_name_; + vector idx_; + vector blocks_; + + // 根据sam header初始化索引文件头部信息 + void InitHeaderDict(bam_hdr_t* hdr); + // 添加一条记录 + void AddFeature(const Feature& ft, uint64_t f_pos); // f_pos是vcf文件当前正要写入的位置 + // 添加记录完毕 + void FinalizeIndex(uint64_t f_pos); + // 写入index文件 + void WriteIndex(const string& out_fn); +}; + +#endif diff --git a/src/util/murmur3.h b/src/util/murmur3.h index 48c304a..d404ecc 100644 --- a/src/util/murmur3.h +++ b/src/util/murmur3.h @@ -1,5 +1,5 @@ /* -Description: Murmur +Description: Murmur哈希 Copyright : All right reserved by ICT diff --git a/src/util/profiling.cpp b/src/util/profiling.cpp index edde38a..17833a2 100644 --- a/src/util/profiling.cpp +++ b/src/util/profiling.cpp @@ -73,12 +73,12 @@ int DisplayProfiling(int nthread) { PRINT_GP(write); PRINT_GP(whole_process); - PRINT_TP(gen, nthread); - PRINT_TP(sort_frag, nthread); - PRINT_TP(sort_pair, nthread); + // PRINT_TP(gen, nthread); + // PRINT_TP(sort_frag, nthread); + // PRINT_TP(sort_pair, nthread); fprintf(stderr, "\n"); #endif return 0; -} +} \ No newline at end of file