到了读取和解析known vcf部分了,性能还需优化
This commit is contained in:
parent
0fca937fab
commit
1e9b58fac1
|
|
@ -0,0 +1,17 @@
|
||||||
|
#include "baq.h"
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
vector<double> BAQ::qual2prob(256); // 质量分数转化概率
|
||||||
|
vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual]
|
||||||
|
|
||||||
|
// 计算baq数组,返回成功与否
|
||||||
|
bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) {
|
||||||
|
// 检测ref是否覆盖了read
|
||||||
|
if (ref.size() < refOffset + ad.read_len) {
|
||||||
|
spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
|
||||||
|
ref.size(), refOffset + ad.read_len, refOffset, ad.read_len);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
// ####################################################################################################
|
||||||
|
//
|
||||||
|
// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE
|
||||||
|
// NOTE -- PUSHED BACK TO HENG LI
|
||||||
|
//
|
||||||
|
// ####################################################################################################
|
||||||
|
// NOTE -- this code is a cpp version of code originally written by Heng Li in JAVA for the samtools project
|
||||||
|
// ####################################################################################################
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cctype>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "util/bam_wrap.h"
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
using std::string;
|
||||||
|
|
||||||
|
// base alignment quality (BAQ)
|
||||||
|
|
||||||
|
struct BAQ {
|
||||||
|
static vector<double> qual2prob; // 质量分数转化概率
|
||||||
|
static constexpr double EM = 0.33333333333;
|
||||||
|
static constexpr double EI = 0.25;
|
||||||
|
// Phred scaled now (changed 1/10/2011)
|
||||||
|
static constexpr double DEFAULT_GOP = 40;
|
||||||
|
static constexpr int DEFAULT_BANDWIDTH = 7;
|
||||||
|
static constexpr int SAM_MAX_PHRED_SCORE = 93;
|
||||||
|
// 94 = max qual score + 1
|
||||||
|
static vector<vector<vector<double>>> EPSILONS; // [ref][read][qual]
|
||||||
|
|
||||||
|
double cd = -1; // gap open probability [1e-3]
|
||||||
|
double ce = 0.1; // gap extension probability [0.1]
|
||||||
|
int cb = DEFAULT_BANDWIDTH; // band width [7]
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Any bases with Q < MIN_BASE_QUAL are raised up to this base quality
|
||||||
|
*/
|
||||||
|
uint8_t minBaseQual = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use defaults for everything
|
||||||
|
*/
|
||||||
|
BAQ() : BAQ(DEFAULT_GOP) {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use defaults for everything
|
||||||
|
*/
|
||||||
|
BAQ(const double gapOpenPenalty) {
|
||||||
|
cd = convertFromPhredScale(gapOpenPenalty);
|
||||||
|
initializeCachedData();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Takes a Phred Scale quality score and returns the error probability.
|
||||||
|
*
|
||||||
|
* Quick conversion function to maintain internal structure of BAQ calculation on
|
||||||
|
* probability scale, but take the user entered parameter in phred-scale.
|
||||||
|
*
|
||||||
|
* @param x phred scaled score
|
||||||
|
* @return probability of incorrect base call
|
||||||
|
*/
|
||||||
|
static double convertFromPhredScale(double x) { return (std::pow(10, (-x) / 10.)); }
|
||||||
|
|
||||||
|
// 初始化一些静态全局数据
|
||||||
|
void initializeCachedData() {
|
||||||
|
for (int i = 0; i < 256; i++)
|
||||||
|
for (int j = 0; j < 256; j++)
|
||||||
|
for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) EPSILONS[i][j][q] = 1.0;
|
||||||
|
|
||||||
|
for (char b1 : "ACGTacgt") {
|
||||||
|
for (char b2 : "ACGTacgt") {
|
||||||
|
for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) {
|
||||||
|
double qual = qual2prob[q < minBaseQual ? minBaseQual : q];
|
||||||
|
double e = std::tolower(b1) == std::tolower(b2) ? 1 - qual : qual * EM;
|
||||||
|
EPSILONS[(uint8_t)b1][(uint8_t)b2][q] = e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
|
||||||
|
|
||||||
|
// 计算baq数组,返回成功与否
|
||||||
|
bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray);
|
||||||
|
};
|
||||||
|
|
@ -48,6 +48,9 @@ struct BQSRArg {
|
||||||
|
|
||||||
string PROGRAM_RECORD_ID = "FastBQSR";
|
string PROGRAM_RECORD_ID = "FastBQSR";
|
||||||
|
|
||||||
|
// reference file
|
||||||
|
string REFERENCE_FILE;
|
||||||
|
|
||||||
// known sites vcf files
|
// known sites vcf files
|
||||||
vector<string> KNOWN_SITES_VCFS;
|
vector<string> KNOWN_SITES_VCFS;
|
||||||
|
|
||||||
|
|
@ -123,7 +126,7 @@ struct BQSRArg {
|
||||||
int PRESERVE_QSCORES_LESS_THAN = 6;
|
int PRESERVE_QSCORES_LESS_THAN = 6;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* enable-baq, do BAQ correction"
|
* enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了,用不到了?
|
||||||
*/
|
*/
|
||||||
bool enableBAQ = false;
|
bool enableBAQ = false;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,36 +7,93 @@ Copyright : All right reserved by ICT
|
||||||
Author : Zhang Zhonghai
|
Author : Zhang Zhonghai
|
||||||
Date : 2023/10/23
|
Date : 2023/10/23
|
||||||
*/
|
*/
|
||||||
|
#include <htslib/faidx.h>
|
||||||
|
#include <htslib/kstring.h>
|
||||||
#include <htslib/sam.h>
|
#include <htslib/sam.h>
|
||||||
#include <htslib/synced_bcf_reader.h>
|
#include <htslib/synced_bcf_reader.h>
|
||||||
#include <htslib/thread_pool.h>
|
#include <htslib/thread_pool.h>
|
||||||
|
#include <header.h>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
|
#include <numeric>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <queue>
|
||||||
|
|
||||||
|
#include "baq.h"
|
||||||
#include "bqsr_args.h"
|
#include "bqsr_args.h"
|
||||||
#include "bqsr_funcs.h"
|
#include "bqsr_funcs.h"
|
||||||
#include "bqsr_pipeline.h"
|
#include "bqsr_pipeline.h"
|
||||||
|
#include "covariate.h"
|
||||||
#include "dup_metrics.h"
|
#include "dup_metrics.h"
|
||||||
#include "fastbqsr_version.h"
|
#include "fastbqsr_version.h"
|
||||||
#include "read_name_parser.h"
|
#include "read_name_parser.h"
|
||||||
|
#include "util/interval.h"
|
||||||
#include "util/profiling.h"
|
#include "util/profiling.h"
|
||||||
#include "util/utils.h"
|
#include "util/utils.h"
|
||||||
|
#include "util/linear_index.h"
|
||||||
|
|
||||||
|
using std::deque;
|
||||||
|
|
||||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
||||||
|
|
||||||
|
// Lookup table from a 4-bit base code (the value bam_seqi yields) to a character:
// 1 -> 'A', 2 -> 'C', 4 -> 'G', 8 -> 'T'; every other code maps to 'N'.
const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
|
||||||
|
|
||||||
|
// 解析knownSites
|
||||||
|
struct VCFParser {
|
||||||
|
deque<Interval> knownSites; // 已知的变异位点
|
||||||
|
char* buf = nullptr; // // 数据buffer
|
||||||
|
uint32_t bufLen = 4 * 1024; // 数据buffer长度
|
||||||
|
LinearIndex index; // vcf文件索引
|
||||||
|
ifstream inStm; // vcf文件流
|
||||||
|
|
||||||
|
VCFParser() { Init(); }
|
||||||
|
VCFParser(const string& vcfFileName) { Init(vcfFileName); }
|
||||||
|
VCFParser(const string& vcfFileName, sam_hdr_t* samHeader) { Init(vcfFileName, samHeader); }
|
||||||
|
|
||||||
|
void Init() { buf = (char*)malloc(bufLen); }
|
||||||
|
void Init(const string& vcfFileName) {
|
||||||
|
Init();
|
||||||
|
inStm.open(vcfFileName, ifstream::in);
|
||||||
|
string idxFileName = vcfFileName + ".idx";
|
||||||
|
if (!index.ReadIndex(idxFileName))
|
||||||
|
error("[%s] fail to load the %s index file\n", __func__, idxFileName.c_str());
|
||||||
|
}
|
||||||
|
void Init(const string& vcfFileName, sam_hdr_t *samHeader) {
|
||||||
|
index.SetHeader(samHeader);
|
||||||
|
Init(vcfFileName);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// 解析后的一些参数,文件,数据等
|
||||||
|
struct AuxVar {
|
||||||
|
const static int REF_CONTEXT_PAD = 3; // 需要做一些填充
|
||||||
|
const static int REFERENCE_HALF_WINDOW_LENGTH = 150; // 需要额外多取出一些ref序列,防止边界效应
|
||||||
|
|
||||||
|
sam_hdr_t* header = nullptr; // bam header
|
||||||
|
faidx_t* faidx = nullptr; // reference index
|
||||||
|
char* ref_seq = nullptr; // reference sequence
|
||||||
|
int ref_len = 0; // reference sequence length
|
||||||
|
int offset = 0; // 在要求的ref序列两边,多余取出的碱基数量
|
||||||
|
|
||||||
|
vector<VCFParser> vcfArr; // 从vcf中获取已知位点
|
||||||
|
};
|
||||||
|
|
||||||
namespace nsgv {
|
namespace nsgv {
|
||||||
|
|
||||||
|
// 全局变量 for bqsr
|
||||||
|
BQSRArg gBqsrArg; // bqsr arguments
|
||||||
|
samFile* gInBamFp; // input BAM file pointer
|
||||||
|
sam_hdr_t* gInBamHeader; // input BAM header
|
||||||
|
vector<AuxVar> gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个
|
||||||
|
|
||||||
|
|
||||||
|
// 下面是需要删除或修改的变量
|
||||||
std::vector<ReadNameParser> gNameParsers; // read name parser
|
std::vector<ReadNameParser> gNameParsers; // read name parser
|
||||||
DuplicationMetrics gMetrics; //
|
DuplicationMetrics gMetrics; //
|
||||||
DupResult gDupRes;
|
DupResult gDupRes;
|
||||||
PipelineArg gPipe(&gDupRes);
|
PipelineArg gPipe(&gDupRes);
|
||||||
|
|
||||||
BQSRArg gBqsrArg; //
|
|
||||||
samFile *gInBamFp; // bam
|
|
||||||
sam_hdr_t *gInBamHeader; // bam
|
|
||||||
samFile *gOutBamFp; // , sambam
|
samFile *gOutBamFp; // , sambam
|
||||||
sam_hdr_t *gOutBamHeader; // header
|
sam_hdr_t *gOutBamHeader; // header
|
||||||
vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
|
vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
|
||||||
|
|
@ -49,6 +106,13 @@ struct ByteBuf {
|
||||||
int capacity = 0; //
|
int capacity = 0; //
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Index range over the BAM records of one batch that share a contig.
// (Author's note: grouping a whole batch by contig was judged not worthwhile —
// it reads too much.)
// The range is half-open, [start, end): the disabled grouping code builds it
// with an exclusive end and reads the last record at end - 1.
struct Region {
    int64_t start = 0;  // first index (inclusive)
    int64_t end = 0;    // one past the last index (exclusive)
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
@ -60,26 +124,495 @@ static string getFileExtension(const string &filename) {
|
||||||
return filename.substr(last_dot + 1);
|
return filename.substr(last_dot + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 过滤掉bqsr过程不符合要求的bam数据
|
||||||
|
bool bqsrReadFilterOut(const bam1_t *b) {
|
||||||
|
// 过滤掉unmapped的read
|
||||||
|
if (b->core.qual == 0) // mapping quality 0
|
||||||
|
return true;
|
||||||
|
if (b->core.qual == 255) // mapping quality not available
|
||||||
|
return true;
|
||||||
|
if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (b->core.flag & BAM_FDUP) { // secondary alignment
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Whether the given CIGAR operator character consumes read bases.
bool consumeReadBases(char cigar) {
    switch (cigar) {
        case 'M':
        case '=':
        case 'X':
        case 'I':
        case 'S':
            return true;
        default:
            return false;
    }
}
|
||||||
|
|
||||||
|
// Whether the given CIGAR operator character consumes reference bases.
bool consumeRefBases(char cigar) {
    switch (cigar) {
        case 'M':
        case '=':
        case 'X':
        case 'D':
        case 'N':
            return true;
        default:
            return false;
    }
}
|
||||||
|
|
||||||
|
// Result of locating a reference coordinate inside a read: the read position
// and the CIGAR element that contains it. The defaults mean "not found".
struct PosAndOperator {
    // Position within the read.
    int readPosAtRefCoord{-1};
    // CIGAR operator character of the containing element.
    char cigarOperator{'0'};
    // Index of the containing CIGAR element.
    int cigarIndex{-1};
    // Length of the containing CIGAR element.
    int cigarLen{0};
    // Read bases consumed by the elements before the containing one.
    int preCigarLen{0};
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
|
||||||
|
* the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
|
||||||
|
* Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
|
||||||
|
*
|
||||||
|
* @param alignmentStart The soft start of the read on the reference
|
||||||
|
* @param cigar The read's cigar
|
||||||
|
* @param refCoord The target reference coordinate
|
||||||
|
* @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
|
||||||
|
* if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
|
||||||
|
* if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last
|
||||||
|
* cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
|
||||||
|
*/
|
||||||
|
PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
|
||||||
|
PosAndOperator po;
|
||||||
|
if (refCoord < alignmentStart) {
|
||||||
|
return po;
|
||||||
|
}
|
||||||
|
int firstReadPosOfElement = 0; // inclusive
|
||||||
|
int firstRefPosOfElement = alignmentStart; // inclusive
|
||||||
|
int lastReadPosOfElement = 0; // exclusive
|
||||||
|
int lastRefPosOfElement = alignmentStart; // exclusive
|
||||||
|
|
||||||
|
// advance forward through all the cigar elements until we bracket the reference coordinate
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
const int idx = bc.n_cigar - 1;
|
||||||
|
if (idx < 0)
|
||||||
|
return po;
|
||||||
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
firstReadPosOfElement = lastReadPosOfElement;
|
||||||
|
firstRefPosOfElement = lastRefPosOfElement;
|
||||||
|
lastReadPosOfElement += consumeReadBases(c) ? len : 0;
|
||||||
|
lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0;
|
||||||
|
if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element
|
||||||
|
int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
|
||||||
|
return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return po;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
||||||
|
void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) {
|
||||||
|
int start, stop;
|
||||||
|
// Determine the read coordinate to start and stop hard clipping
|
||||||
|
if (refStart < 0) {
|
||||||
|
if (refStop < 0) return;
|
||||||
|
PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
|
||||||
|
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
|
||||||
|
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
|
||||||
|
stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
|
||||||
|
ad.left_clip = stop + 1;
|
||||||
|
ad.cigar_start = stopPosAndOperator.cigarIndex;
|
||||||
|
ad.first_cigar_clip = ad.left_clip - stopPosAndOperator.preCigarLen;
|
||||||
|
} else {
|
||||||
|
if (refStop >= 0) return;
|
||||||
|
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
|
||||||
|
// overclipping
|
||||||
|
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
|
||||||
|
start = startPosAndOperator.readPosAtRefCoord;
|
||||||
|
ad.right_clip = bw->b->core.l_qseq - start;
|
||||||
|
ad.cigar_end = startPosAndOperator.cigarIndex + 1;
|
||||||
|
ad.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算切掉adapter之后,ref相对原始ref的偏移量
|
||||||
|
void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
int i = 0;
|
||||||
|
for (i = 0; i < ad.cigar_start; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (consumeRefBases(c)) {
|
||||||
|
ad.ref_offset += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
if (consumeRefBases(c)) {
|
||||||
|
ad.ref_offset += ad.first_cigar_clip;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算clip处理之后,剩余的碱基
|
||||||
|
void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) {
|
||||||
|
ad.bases.resize(ad.read_len);
|
||||||
|
uint8_t* seq = bam_get_seq(bw->b);
|
||||||
|
for (int i = 0; i < ad.read_len; ++i) {
|
||||||
|
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算read两端softclip的碱基数量,可能会修改ad里的clip值
|
||||||
|
void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) {
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
int readIndex = ad.left_clip;
|
||||||
|
int cutLeft = -1; // first position to hard clip (inclusive)
|
||||||
|
int cutRight = -1; // first position to hard clip (inclusive)
|
||||||
|
int cigar_start = ad.cigar_start;
|
||||||
|
int cigar_end = ad.cigar_end;
|
||||||
|
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
|
||||||
|
|
||||||
|
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
|
||||||
|
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
|
||||||
|
if (c == 'S') {
|
||||||
|
if (rightTail) {
|
||||||
|
cutRight = readIndex;
|
||||||
|
cigar_end = i;
|
||||||
|
} else {
|
||||||
|
cutLeft = readIndex + len - 1;
|
||||||
|
cigar_start = i + 1;
|
||||||
|
}
|
||||||
|
} else if (c != 'H') {
|
||||||
|
rightTail = true;
|
||||||
|
}
|
||||||
|
if (consumeReadBases(c)) {
|
||||||
|
readIndex += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cutRight >= 0) {
|
||||||
|
ad.right_clip = bw->b->core.l_qseq - cutRight;
|
||||||
|
ad.cigar_end = cigar_end;
|
||||||
|
ad.last_cigar_clip = 0;
|
||||||
|
}
|
||||||
|
if (cutLeft >= 0) {
|
||||||
|
ad.left_clip = cutLeft + 1;
|
||||||
|
ad.cigar_start = cigar_start;
|
||||||
|
ad.first_cigar_clip = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读取给定区间的reference
|
||||||
|
static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
|
||||||
|
if (aux.ref_seq != NULL)
|
||||||
|
free(aux.ref_seq);
|
||||||
|
int tid = BamWrap::bam_tid(cur_pos);
|
||||||
|
const char* chr = sam_hdr_tid2name(aux.header, tid);
|
||||||
|
int seq_begin = BamWrap::bam_pos(interval.left); //- aux.REFERENCE_HALF_WINDOW_LENGTH;
|
||||||
|
int seq_end = BamWrap::bam_pos(interval.right); //+ aux.REFERENCE_HALF_WINDOW_LENGTH;
|
||||||
|
aux.ref_seq = faidx_fetch_seq(aux.faidx, chr, seq_begin, seq_end, &aux.ref_len);
|
||||||
|
// aux.offset = aux.REFERENCE_HALF_WINDOW_LENGTH;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mark read position `index` as an indel; indices outside the vector are ignored.
inline void updateIndel(std::vector<int> &isIndel, int index) {
    if (index < 0) {
        return;
    }
    if (static_cast<size_t>(index) >= isIndel.size()) {
        return;
    }
    isIndel[index] = 1;
}
|
||||||
|
|
||||||
|
// 计算该read的每个碱基位置是否是SNP或Indel
|
||||||
|
int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
|
||||||
|
// 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
|
||||||
|
Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
|
||||||
|
read_ref_base(aux, interval.left, interval);
|
||||||
|
string refBases(aux.ref_seq);
|
||||||
|
// spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
|
||||||
|
|
||||||
|
// 2. 遍历cigar,计算每个碱基是否是SNP或Indel
|
||||||
|
int readPos = 0, refPos = 0, nEvents = 0;
|
||||||
|
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||||||
|
const bam1_core_t& bc = bw->b->core;
|
||||||
|
uint8_t* seq = bam_get_seq(bw->b);
|
||||||
|
for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (i == ad.cigar_start) len -= ad.first_cigar_clip;
|
||||||
|
if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
|
||||||
|
if (c == 'M' || c == '=' || c == 'X') {
|
||||||
|
for (int j = 0; j < len; ++j) {
|
||||||
|
// 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip
|
||||||
|
int snpInt = cBaseToChar[bam_seqi(seq, readPos + ad.left_clip)] == refBases[refPos] ? 0 : 1;
|
||||||
|
isSNP[readPos] = snpInt;
|
||||||
|
nEvents += snpInt;
|
||||||
|
readPos++;
|
||||||
|
refPos++;
|
||||||
|
}
|
||||||
|
} else if (c == 'D') {
|
||||||
|
// 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del
|
||||||
|
int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
|
||||||
|
updateIndel(isDel, index);
|
||||||
|
refPos += len;
|
||||||
|
} else if (c == 'N') {
|
||||||
|
refPos += len;
|
||||||
|
} else if (c == 'I') {
|
||||||
|
// 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins
|
||||||
|
bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
|
||||||
|
if (forwardStrandRead) {
|
||||||
|
updateIndel(isIns, readPos - 1);
|
||||||
|
}
|
||||||
|
readPos += len;
|
||||||
|
if (!forwardStrandRead) {
|
||||||
|
updateIndel(isIns, readPos);
|
||||||
|
}
|
||||||
|
} else if (c == 'S') {
|
||||||
|
readPos += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
|
||||||
|
|
||||||
|
//spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
|
||||||
|
// std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
|
||||||
|
// exit(0);
|
||||||
|
|
||||||
|
return nEvents;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 简单计算baq数组,就是全部赋值为'@' (64)
|
||||||
|
bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
|
||||||
|
baqArray.resize(ad.read_len, (int)'@');
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了
|
||||||
|
bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
|
||||||
|
baqArray.resize(ad.read_len, 0);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract the next line from buf[0, total) starting at *cur.
// `line` receives the characters up to (not including) the next '\n', and
// *cur is advanced past that newline, or to `total` when the buffer ends
// first. `line` is left empty when *cur is already at or past `total`.
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, std::string* line) {
    line->clear();

    const int64_t lineBegin = *cur;
    int64_t lineEnd = lineBegin;
    while (lineEnd < total && buf[lineEnd] != '\n') {
        ++lineEnd;
    }

    if (lineEnd > lineBegin) {
        line->append(buf + lineBegin, static_cast<size_t>(lineEnd - lineBegin));
    }
    // Step over the newline only when one was actually found.
    *cur = (lineEnd < total) ? lineEnd + 1 : lineEnd;
}
|
||||||
|
|
||||||
|
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||||
|
void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) {
|
||||||
|
int tid = bw->contig_id();
|
||||||
|
uint64_t startPos = bw->start_pos(); // 闭区间
|
||||||
|
uint64_t endPos = bw->end_pos(); // 闭区间
|
||||||
|
// spdlog::info("bam {}, {}", startPos, endPos);
|
||||||
|
// update vcfs
|
||||||
|
for(auto &vcf : vcfs) {
|
||||||
|
// 清理旧的interval
|
||||||
|
while(!vcf.knownSites.empty()) {
|
||||||
|
auto& intv = vcf.knownSites.front();
|
||||||
|
// spdlog::info("intv bam {}, {}", intv.right, startPos);
|
||||||
|
if (intv.right < startPos)
|
||||||
|
vcf.knownSites.pop_front();
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
|
||||||
|
|
||||||
|
// spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
|
||||||
|
// exit(0);
|
||||||
|
//spdlog::info("before intervals : {}", vcf.knownSites.size());
|
||||||
|
// 读取新的interval
|
||||||
|
int64_t fpos, flen;
|
||||||
|
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
||||||
|
//spdlog::info("file index: {}, {}", fpos, flen);
|
||||||
|
if (flen > 0) {
|
||||||
|
vcf.inStm.seekg(fpos, ios::beg);
|
||||||
|
if (flen > vcf.bufLen) {
|
||||||
|
vcf.bufLen = flen;
|
||||||
|
vcf.buf = (char*)realloc(vcf.buf, flen);
|
||||||
|
}
|
||||||
|
char* buf = vcf.buf;
|
||||||
|
vcf.inStm.read(buf, flen);
|
||||||
|
string line;
|
||||||
|
int64_t cur = 0;
|
||||||
|
get_line_from_buf(buf, flen, &cur, &line);
|
||||||
|
while (line.size() > 0) {
|
||||||
|
stringstream ss_line(line);
|
||||||
|
string stid;
|
||||||
|
int tid, pos;
|
||||||
|
int64_t locus;
|
||||||
|
string id, ref;
|
||||||
|
ss_line >> stid >> pos >> id >> ref;
|
||||||
|
tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
|
||||||
|
if (tid >= 0 && pos > 0) {
|
||||||
|
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
|
||||||
|
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
|
||||||
|
}
|
||||||
|
get_line_from_buf(buf, flen, &cur, &line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//spdlog::info("after intervals : {}", vcf.knownSites.size());
|
||||||
|
//for(auto &val : vcf.knownSites) {
|
||||||
|
// spdlog::info("intv {}, {}", val.left, val.right);
|
||||||
|
//}
|
||||||
|
}
|
||||||
|
//exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
// 串行bqsr
|
// 串行bqsr
|
||||||
int SerialBQSR() {
|
int SerialBQSR() {
|
||||||
int round = 0;
|
int round = 0;
|
||||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
|
// inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
|
||||||
|
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
||||||
int64_t readNumSum = 0;
|
int64_t readNumSum = 0;
|
||||||
|
// 0. 初始化一些全局数据
|
||||||
|
// BAQ baq{BAQ::DEFAULT_GOP};
|
||||||
|
|
||||||
|
// 1. 协变量数据相关初始化
|
||||||
|
PerReadCovariateMatrix readCovariates;
|
||||||
|
CovariateUtils::InitPerReadCovMat(readCovariates);
|
||||||
|
ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
|
||||||
|
CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
|
||||||
|
|
||||||
|
// 2. 读取bam的read group
|
||||||
|
if (nsgv::gInBamHeader->hrecs->nrg == 0) {
|
||||||
|
spdlog::error("No RG tag found in the header!");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
|
||||||
|
spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
|
||||||
|
ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
|
||||||
|
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
|
||||||
|
}
|
||||||
|
|
||||||
|
int test = 0;
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
++ round;
|
++ round;
|
||||||
|
|
||||||
|
// 一. 读取bam数据
|
||||||
size_t readNum = 0;
|
size_t readNum = 0;
|
||||||
if (inBamBuf.ReadStat() >= 0)
|
if (inBamBuf.ReadStat() >= 0)
|
||||||
readNum = inBamBuf.ReadBam();
|
readNum = inBamBuf.ReadBam();
|
||||||
if (readNum < 1) {
|
if (readNum < 1) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
spdlog::info("{} reads processed in {} round", readNum, round);
|
|
||||||
|
|
||||||
auto bams = inBamBuf.GetBamArr();
|
auto bams = inBamBuf.GetBamArr();
|
||||||
spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end());
|
spdlog::info("{} reads processed in {} round, {}", readNum, round, test);
|
||||||
// 1. 获取bams数组覆盖的region范围
|
|
||||||
|
|
||||||
|
// 二. 遍历每个bam(read)记录,进行处理
|
||||||
|
|
||||||
|
for (int i = 0; i < bams.size(); ++i) {
|
||||||
|
// 1. 对每个read,需要检查cigar是否合法,即没有两个连续的相同的cigar,而且需要将首尾的deletion处理掉,目前看好像没啥影响,我们忽略这一步
|
||||||
|
// 2. 对质量分数长度跟碱基长度不匹配的read,缺少的质量分数用默认值补齐,先忽略,后边有需要再处理
|
||||||
|
// 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略
|
||||||
|
// 4. 对read的两端进行检测,去除(hardclip)adapter
|
||||||
|
BamWrap *bw = bams[i];
|
||||||
|
ReadAdditionData ad;
|
||||||
|
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
|
||||||
|
ad.cigar_end = bw->b->core.n_cigar;
|
||||||
|
if (ad.read_len <= 0) continue;
|
||||||
|
|
||||||
|
int adapter_boundary = bw->GetAdapterBoundary();
|
||||||
|
if (bw->IsAdapterInRead(adapter_boundary)) {
|
||||||
|
// adapter在read范围内
|
||||||
|
if (bw->GetReadNegativeStrandFlag()) { // 反链
|
||||||
|
clipByReferenceCoordinates(bw, -1, adapter_boundary, ad);
|
||||||
|
} else { // 正链
|
||||||
|
clipByReferenceCoordinates(bw, adapter_boundary, -1, ad);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
|
||||||
|
// 5. 然后再去除softclip部分
|
||||||
|
calculateSoftClip(bw, ad);
|
||||||
|
ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
|
||||||
|
if (ad.read_len <= 0) continue;
|
||||||
|
|
||||||
|
calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少
|
||||||
|
calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基
|
||||||
|
|
||||||
|
//spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
|
||||||
|
// ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
|
||||||
|
|
||||||
|
// 6. 更新每个read的platform信息,好像没啥用,暂时忽略
|
||||||
|
vector<int> isSNP(ad.read_len, 0); // 该位置是否是SNP位置,0不是,1是
|
||||||
|
vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是
|
||||||
|
vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是
|
||||||
|
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
|
||||||
|
|
||||||
|
// 7. 计算baqArray
|
||||||
|
// BAQ = base alignment quality
|
||||||
|
// note for efficiency reasons we don't compute the BAQ array unless we actually have
|
||||||
|
// some error to marginalize over. For ILMN data ~85% of reads have no error
|
||||||
|
vector<int> baqArray;
|
||||||
|
bool baqCalculated = false;
|
||||||
|
if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
|
||||||
|
baqCalculated = flatBAQArray(bw, ad, baqArray);
|
||||||
|
} else {
|
||||||
|
// baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, bw, ad, baqArray);
|
||||||
|
}
|
||||||
|
if (!baqCalculated) continue;
|
||||||
|
// 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了
|
||||||
|
|
||||||
|
// 8. 计算这条read对应的协变量
|
||||||
|
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
|
||||||
|
test = readCovariates[1][0][0] + readCovariates[2][1][3];
|
||||||
|
int end_pos = bw->contig_end_pos();
|
||||||
|
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
|
||||||
|
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
|
||||||
|
|
||||||
|
// 9. 计算这条read需要跳过的位置
|
||||||
|
vector<bool> skip(ad.read_len, 0);
|
||||||
|
calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// spdlog::info("region: {} - {}", bams[0]->global_softclip_start(), bams.back()->global_softclip_end());
|
||||||
|
// 1. 获取bams数组覆盖的region范围
|
||||||
|
// 如果读取的bam数组跨越了不同的染色体,咋搞?还是按照每个线程都有独立的vcf文件来做吧
|
||||||
|
int64_t region_start = bams[0]->global_softclip_start();
|
||||||
|
vector<Region> contig_bams;
|
||||||
|
int contig_id = bams[0]->contig_id();
|
||||||
|
int64_t start = 0, stop = 0;
|
||||||
|
while (true) {
|
||||||
|
stop = start;
|
||||||
|
while (stop < bams.size() && bams[stop]->contig_id() == contig_id) ++stop;
|
||||||
|
if (stop > start) contig_bams.push_back(Region{start, stop});
|
||||||
|
if (stop >= bams.size()) break;
|
||||||
|
contig_id = bams[stop]->contig_id();
|
||||||
|
start = stop;
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("{}, {} contig regions", contig_id, contig_bams.size());
|
||||||
|
|
||||||
|
for (int i = 0; i < bams.size();) {
|
||||||
|
int64_t a1 = bams[i]->contig_pos();
|
||||||
|
int64_t b1 = bams[i]->contig_end_pos();
|
||||||
|
int64_t a = bams[i]->softclip_start();
|
||||||
|
int64_t b = bams[i]->softclip_end();
|
||||||
|
spdlog::info("{}: ({}, {}), ({}, {})", bams[i]->query_name(), a1, b1, a, b);
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
// 依次处理每个contig的bams
|
||||||
|
vector<uint32_t> bitmap(100, 0); // 用来表示known sites覆盖情况的bitmap
|
||||||
|
for (const auto& cr : contig_bams) {
|
||||||
|
spdlog::info(" contig id: {}, bam count: {}, bitmap size: {}", contig_id, cr.end - cr.start, bitmap.size());
|
||||||
|
// 当前处理的contig
|
||||||
|
int contig_id = bams[cr.start]->contig_id();
|
||||||
|
int64_t region_start = bams[cr.start]->softclip_start();
|
||||||
|
int64_t region_end = bams[cr.end - 1]->softclip_end();
|
||||||
|
if ((bitmap.size() << 5)) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
// 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零)
|
// 2. 开辟一个uint32_t的数组作为bitmap(如果上一轮的不够就重开),用来表示region的每个位点是否有known sites覆盖(每轮使用前需清零)
|
||||||
|
|
||||||
// 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识)
|
// 3. 读取在region范围内的所有known sites,并为对应的bitmap设定0 or 1 (作为skip标识)
|
||||||
|
|
@ -87,14 +620,18 @@ int SerialBQSR() {
|
||||||
// 4. 遍历bams数组中的每一条记录并进行处理
|
// 4. 遍历bams数组中的每一条记录并进行处理
|
||||||
|
|
||||||
readNumSum += readNum;
|
readNumSum += readNum;
|
||||||
inBamBuf.ClearAll(); //
|
inBamBuf.ClearAll(); //
|
||||||
}
|
}
|
||||||
spdlog::info("read count: {}", readNumSum);
|
spdlog::info("read count: {}", readNumSum);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// entrance of mark duplicates
|
// 需要支持vcf idx,tbi,csi三种索引方式
|
||||||
|
// vcf和idx是一对
|
||||||
|
// vcf.gz和tbi或csi是一对
|
||||||
|
|
||||||
|
// entrance of mark BQSR
|
||||||
int BaseRecalibrator() {
|
int BaseRecalibrator() {
|
||||||
|
|
||||||
PROF_START(whole_process);
|
PROF_START(whole_process);
|
||||||
|
|
@ -107,6 +644,18 @@ int BaseRecalibrator() {
|
||||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||||
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
||||||
|
|
||||||
|
// 初始化AuxVar
|
||||||
|
nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
|
||||||
|
for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
|
||||||
|
nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
|
||||||
|
nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
|
||||||
|
if (nsgv::gAuxVars[i].faidx == 0)
|
||||||
|
error("[%s] fail to load the fasta index.\n", __func__);
|
||||||
|
for (auto &vcfFileName : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||||
|
nsgv::gAuxVars[i].vcfArr.push_back(VCFParser(vcfFileName, nsgv::gInBamHeader));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// (libraryId)
|
// (libraryId)
|
||||||
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
|
nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);
|
||||||
|
|
||||||
|
|
@ -124,21 +673,21 @@ int BaseRecalibrator() {
|
||||||
|
|
||||||
return SerialBQSR();
|
return SerialBQSR();
|
||||||
|
|
||||||
// // 读取known sites vcfs
|
// 读取known sites vcfs
|
||||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||||
// spdlog::info(" {}", ks);
|
spdlog::info(" {}", ks);
|
||||||
// bcf_srs_t* srs = bcf_sr_init();
|
bcf_srs_t* srs = bcf_sr_init();
|
||||||
// if (!bcf_sr_add_reader(srs, ks.c_str()))
|
if (!bcf_sr_add_reader(srs, ks.c_str()))
|
||||||
// error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
|
error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
|
||||||
// nsgv::gKnownSitesVcfSrs.push_back(srs);
|
nsgv::gKnownSitesVcfSrs.push_back(srs);
|
||||||
//
|
|
||||||
// while (bcf_sr_next_line(srs)) {
|
while (bcf_sr_next_line(srs)) {
|
||||||
// bcf1_t* line = srs->readers[0].buffer[0];
|
bcf1_t* line = srs->readers[0].buffer[0];
|
||||||
// cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
|
cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
|
||||||
// }
|
}
|
||||||
// }
|
}
|
||||||
//
|
|
||||||
// /* 先实现串行的bqsr-phase-1 */
|
/* 先实现串行的bqsr-phase-1 */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,317 @@
|
||||||
|
#include "covariate.h"
|
||||||
|
|
||||||
|
// for EventType
|
||||||
|
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
|
||||||
|
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
|
||||||
|
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
|
||||||
|
|
||||||
|
// static变量 for ContextCovariate
|
||||||
|
int ContextCovariate::mismatchesContextSize;
|
||||||
|
int ContextCovariate::indelsContextSize;
|
||||||
|
int ContextCovariate::mismatchesKeyMask;
|
||||||
|
int ContextCovariate::indelsKeyMask;
|
||||||
|
uint8_t ContextCovariate::lowQualTail;
|
||||||
|
int ContextCovariate::baseIndexMap[256];
|
||||||
|
|
||||||
|
// for ReadGroupCovariate
|
||||||
|
map<string, int> ReadGroupCovariate::RgToId; // read group name到id的映射
|
||||||
|
map<int, string> ReadGroupCovariate::IdToRg; // id到read group name的映射
|
||||||
|
|
||||||
|
// for cycleCovariate
|
||||||
|
int CycleCovariate::MAXIMUM_CYCLE_VALUE;
|
||||||
|
|
||||||
|
// for CovariateUtils
|
||||||
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
||||||
|
void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||||||
|
bool recordIndelValues) {
|
||||||
|
//ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
|
//BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
|
//ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
|
//CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadGroupCovariate 协变量的方法
|
||||||
|
void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
|
uint8_t *rgStr = bam_aux_get(bw->b, "RG");
|
||||||
|
char* rgVal = nullptr;
|
||||||
|
if (rgStr) rgVal = bam_aux2Z(rgStr);
|
||||||
|
int key = 0;
|
||||||
|
if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
|
||||||
|
spdlog::error("The RG tag value for read can not be found in header!");
|
||||||
|
} else {
|
||||||
|
key = RgToId[rgVal];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < ad.read_len; ++i) {
|
||||||
|
CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// BaseQualityCovariate 协变量的方法
|
||||||
|
void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||||||
|
bool recordIndelValues) {
|
||||||
|
// 在前面的处理过后,quals应该和base长度一致了
|
||||||
|
#define __bq_set_cov(ins, del) \
|
||||||
|
do { \
|
||||||
|
for (int i = 0; i < ad.read_len; ++i) { \
|
||||||
|
CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
const int INDEL_QUAL = 45;
|
||||||
|
uint8_t* quals = bam_get_qual(bw->b);
|
||||||
|
if (recordIndelValues) {
|
||||||
|
uint8_t* insQualPtr = bam_aux_get(bw->b, "BI"); // base qualities for insertions
|
||||||
|
uint8_t* delQualPtr = bam_aux_get(bw->b, "BD"); // base qualities for deletions
|
||||||
|
if (insQualPtr == nullptr && delQualPtr == nullptr) {
|
||||||
|
__bq_set_cov(INDEL_QUAL, INDEL_QUAL);
|
||||||
|
} else if (insQualPtr == nullptr) {
|
||||||
|
uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr);
|
||||||
|
__bq_set_cov(INDEL_QUAL, delQuals[i]);
|
||||||
|
} else {
|
||||||
|
uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr);
|
||||||
|
__bq_set_cov(insQuals[i], INDEL_QUAL);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
__bq_set_cov(0, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContextCovariate 协变量的方法
|
||||||
|
|
||||||
|
// Return the complement of a DNA base. Upper- and lower-case A/C/G/T all map to the upper-case
// complement; any other character (e.g. 'N') is returned unchanged.
static char SimpleComplement(const char base) {
    if (base == 'A' || base == 'a') return 'T';
    if (base == 'C' || base == 'c') return 'G';
    if (base == 'G' || base == 'g') return 'C';
    if (base == 'T' || base == 't') return 'A';
    return base;
}
|
||||||
|
|
||||||
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
|
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) {
|
||||||
|
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
|
||||||
|
|
||||||
|
if (bw->GetReadNegativeStrandFlag()) { // 反向互补
|
||||||
|
for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 处理左边
|
||||||
|
int left = 0;
|
||||||
|
for (; left < ad.read_len; ++left) {
|
||||||
|
if (quals[left] <= lowQTail)
|
||||||
|
clippedBases[left] = 'N';
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (left == ad.read_len) {
|
||||||
|
clippedBases.clear();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// 处理右边
|
||||||
|
int right = ad.read_len - 1;
|
||||||
|
for (; right >= 0; --right) {
|
||||||
|
if (quals[right] <= lowQTail)
|
||||||
|
clippedBases[right] = 'N';
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (right < left)
|
||||||
|
clippedBases.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a int representation of a given dna string.
|
||||||
|
*
|
||||||
|
* @param dna the dna sequence
|
||||||
|
* @param start the start position in the byte array (inclusive)
|
||||||
|
* @param end the end position in the array (exclusive)
|
||||||
|
* @return the key representing the dna sequence
|
||||||
|
*/
|
||||||
|
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
|
||||||
|
int key = end - start;
|
||||||
|
int bitOffset = LENGTH_BITS;
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||||||
|
if (baseIndex == -1) { // ignore non-ACGT bases
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
key |= (baseIndex << bitOffset);
|
||||||
|
bitOffset += 2;
|
||||||
|
}
|
||||||
|
return key;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||||
|
*
|
||||||
|
* For example, for the read [AGCTG], return the list
|
||||||
|
* [-1, "AG", "GC", "CT", "TG" ]
|
||||||
|
* with each string context encoded as an integer.
|
||||||
|
*
|
||||||
|
* @param bases the bases in the read to build the context from
|
||||||
|
* @param contextSize context size to use building the context
|
||||||
|
* @param mask mask for pulling out just the context bits
|
||||||
|
*
|
||||||
|
* @return a list that has the same length as the read and contains the (preceding) n-base context at each position.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
|
||||||
|
int readLength = bases.size();
|
||||||
|
keys.resize(readLength);
|
||||||
|
int keyIdx = 0;
|
||||||
|
// the first contextSize-1 bases will not have enough previous context
|
||||||
|
for (int i = 1; i < contextSize && i <= readLength; i++) {
|
||||||
|
keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
|
||||||
|
}
|
||||||
|
if (readLength < contextSize)
|
||||||
|
return;
|
||||||
|
|
||||||
|
int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;
|
||||||
|
|
||||||
|
// get (and add) the key for the context starting at the first base
|
||||||
|
int currentKey = KeyFromContext(bases, 0, contextSize);
|
||||||
|
keys[keyIdx++] = currentKey;
|
||||||
|
|
||||||
|
// if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts it affects
|
||||||
|
int currentNPenalty = 0;
|
||||||
|
if (currentKey == -1) {
|
||||||
|
currentKey = 0;
|
||||||
|
currentNPenalty = contextSize - 1;
|
||||||
|
int offset = newBaseOffset;
|
||||||
|
int baseIndex;
|
||||||
|
while ((baseIndex = baseIndexMap[bases[currentNPenalty]]) != -1) {
|
||||||
|
currentKey |= (baseIndex << offset);
|
||||||
|
offset -= 2;
|
||||||
|
currentNPenalty--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
|
||||||
|
const int baseIndex = baseIndexMap[bases[currentIndex]];
|
||||||
|
if (baseIndex == -1) { // ignore non-ACGT bases
|
||||||
|
currentNPenalty = contextSize;
|
||||||
|
currentKey = 0; // reset the key
|
||||||
|
} else {
|
||||||
|
// push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length
|
||||||
|
// in
|
||||||
|
currentKey = (currentKey >> 2) & mask;
|
||||||
|
currentKey |= (baseIndex << newBaseOffset);
|
||||||
|
currentKey |= contextSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentNPenalty == 0) {
|
||||||
|
keys[keyIdx++] = currentKey;
|
||||||
|
} else {
|
||||||
|
currentNPenalty--;
|
||||||
|
keys[keyIdx++] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContextCovariate: record the context key at every position of the read.
//
// The read bases are strand-oriented and low-quality tails are overwritten with N
// (GetStrandedClippedBytes), context keys are computed on that sequence, and each key is stored
// at the read offset obtained through GetStrandedOffset. With recordIndelValues the insertion
// and deletion events both get the indel-context key; otherwise they get 0.
// `header` is not used by this covariate.
void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    // start from the original bases, then write Ns over the low quality ones
    string contextBases(ad.bases);
    GetStrandedClippedBytes(bw, ad, contextBases, lowQualTail);
    // spdlog::info("bases: {}", contextBases);

    vector<int> mismatchKeys;
    GetReadContextAtEachPosition(contextBases, mismatchesContextSize, mismatchesKeyMask, mismatchKeys);

    const int fullLength = ad.read_len;
    const int clippedLength = contextBases.size();

    // The matrix is reused between reads. If clipping changed the length, the loops below will
    // not cover every position, so wipe this covariate's slots first. When the lengths match
    // everything gets overwritten anyway and the wipe is skipped.
    if (clippedLength != fullLength) {
        for (int pos = 0; pos < fullLength; ++pos) {
            CovariateUtils::SetCovariate(0, 0, 0, pos, ContextCovariate::index, values);
        }
    }

    const bool onReverseStrand = bw->GetReadNegativeStrandFlag();

    // Two separate loops so that recordIndelValues is not tested on every iteration.
    if (!recordIndelValues) {
        for (int i = 0; i < clippedLength; ++i) {
            const int target = GetStrandedOffset(onReverseStrand, i, clippedLength);
            CovariateUtils::SetCovariate(mismatchKeys[i], 0, 0, target, ContextCovariate::index, values);
        }
        return;
    }

    vector<int> indelKeys;
    GetReadContextAtEachPosition(contextBases, indelsContextSize, indelsKeyMask, indelKeys);
    for (int i = 0; i < clippedLength; ++i) {
        const int target = GetStrandedOffset(onReverseStrand, i, clippedLength);
        const int indelKey = indelKeys[i];
        CovariateUtils::SetCovariate(mismatchKeys[i], indelKey, indelKey, target, ContextCovariate::index, values);
    }
}
|
||||||
|
|
||||||
|
// CycleCovariate 协变量的方法
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the encoded value of CycleCovariate's key for the given position at the read.
|
||||||
|
* Uses keyFromCycle to do the encoding.
|
||||||
|
* @param baseNumber index of the base to compute the key for
|
||||||
|
* @param read the read
|
||||||
|
* @param indel is this an indel key or a substitution key?
|
||||||
|
* @param maxCycle max value of the base to compute the key for
|
||||||
|
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
|
||||||
|
*/
|
||||||
|
int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) {
|
||||||
|
const bool isNegStrand = bw->GetReadNegativeStrandFlag();
|
||||||
|
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
|
||||||
|
const int readLength = ad.read_len;
|
||||||
|
|
||||||
|
const int readOrderFactor = isSecondInPair ? -1 : 1;
|
||||||
|
int increment;
|
||||||
|
int cycle;
|
||||||
|
if (isNegStrand) {
|
||||||
|
cycle = readLength * readOrderFactor;
|
||||||
|
increment = -1 * readOrderFactor;
|
||||||
|
} else {
|
||||||
|
cycle = readOrderFactor;
|
||||||
|
increment = readOrderFactor;
|
||||||
|
}
|
||||||
|
cycle += baseNumber * increment;
|
||||||
|
|
||||||
|
if (!indel) {
|
||||||
|
return CycleCovariate::KeyFromCycle(cycle, maxCycle);
|
||||||
|
}
|
||||||
|
const int maxCycleForIndels = readLength - CUSHION_FOR_INDELS - 1;
|
||||||
|
if (baseNumber < CUSHION_FOR_INDELS || baseNumber > maxCycleForIndels) {
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
return CycleCovariate::KeyFromCycle(cycle, maxCycle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Used to pick out the covariate's value from attributes of the read
|
||||||
|
void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||||
|
const int readLength = ad.read_len;
|
||||||
|
// Note: duplicate the loop to void checking recordIndelValues on every iteration
|
||||||
|
if (recordIndelValues) {
|
||||||
|
for (int i = 0; i < readLength; i++) {
|
||||||
|
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
|
||||||
|
const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE);
|
||||||
|
CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < readLength; i++) {
|
||||||
|
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
|
||||||
|
CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,209 @@
|
||||||
|
/*
|
||||||
|
Description: 在bqsr过程中,计算协变量相关的类和方法
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2025/12/08
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "bqsr_args.h"
|
||||||
|
#include "util/bam_wrap.h"
|
||||||
|
|
||||||
|
using std::map;
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
 * This is where we store the per-read covariates, also indexed by (event type) and (read position).
|
||||||
|
* Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
|
||||||
|
* For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
|
||||||
|
*/
|
||||||
|
typedef vector<vector<vector<int>>> PerReadCovariateMatrix;
|
||||||
|
|
||||||
|
// 变异类型(snp, insert, deletion)
|
||||||
|
struct EventTypeValue {
|
||||||
|
int index; // 在协变量数组中对应的索引
|
||||||
|
char representation;
|
||||||
|
string longRepresentation;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct EventType {
|
||||||
|
static constexpr int EVENT_SIZE = 3;
|
||||||
|
static EventTypeValue BASE_SUBSTITUTION;
|
||||||
|
static EventTypeValue BASE_INSERTION;
|
||||||
|
static EventTypeValue BASE_DELETION;
|
||||||
|
};
|
||||||
|
|
||||||
|
// 协变量相关的工具类
|
||||||
|
struct CovariateUtils {
|
||||||
|
static constexpr int MAX_READ_LENGTH = 300; // 最大read长度
|
||||||
|
static constexpr int NUM_COVARIATES = 4;
|
||||||
|
|
||||||
|
// 初始化PerReadCovariateMatrix
|
||||||
|
static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
|
||||||
|
matrix.resize(EventType::EVENT_SIZE);
|
||||||
|
for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
|
||||||
|
matrix[event_type].resize(MAX_READ_LENGTH);
|
||||||
|
for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) {
|
||||||
|
matrix[event_type][pos].resize(NUM_COVARIATES, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 设置协变量
|
||||||
|
static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) {
|
||||||
|
matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch;
|
||||||
|
matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion;
|
||||||
|
matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 对一条read计算协变量(该协变量被上一个read用过)
|
||||||
|
static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Read group协变量
|
||||||
|
struct ReadGroupCovariate {
|
||||||
|
static constexpr int index = 0; // 在协变量数组中的索引位置
|
||||||
|
static map<string, int> RgToId; // read group name到id的映射
|
||||||
|
static map<int, string> IdToRg; // id到read group name的映射
|
||||||
|
|
||||||
|
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Base quality协变量
|
||||||
|
struct BaseQualityCovariate {
|
||||||
|
static constexpr int index = 1; // 在协变量数组中的索引位置
|
||||||
|
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Context协变量
|
||||||
|
struct ContextCovariate {
|
||||||
|
static constexpr int index = 2; // 在协变量数组中的索引位置
|
||||||
|
static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
|
||||||
|
static constexpr int LENGTH_BITS = 4;
|
||||||
|
static constexpr int LENGTH_MASK = 15;
|
||||||
|
|
||||||
|
// the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
|
||||||
|
// not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
|
||||||
|
static constexpr int MAX_DNA_CONTEXT = 13;
|
||||||
|
|
||||||
|
static int mismatchesContextSize;
|
||||||
|
static int indelsContextSize;
|
||||||
|
static int mismatchesKeyMask;
|
||||||
|
static int indelsKeyMask;
|
||||||
|
static uint8_t lowQualTail;
|
||||||
|
|
||||||
|
static int baseIndexMap[256];
|
||||||
|
|
||||||
|
static void InitContextCovariate(BQSRArg& p) {
|
||||||
|
mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
|
||||||
|
indelsContextSize = p.INDELS_CONTEXT_SIZE;
|
||||||
|
if (mismatchesContextSize > MAX_DNA_CONTEXT) {
|
||||||
|
spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (indelsContextSize > MAX_DNA_CONTEXT) {
|
||||||
|
spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
lowQualTail = p.LOW_QUAL_TAIL;
|
||||||
|
if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
|
||||||
|
spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
mismatchesKeyMask = CreateMask(mismatchesContextSize);
|
||||||
|
indelsKeyMask = CreateMask(indelsContextSize);
|
||||||
|
|
||||||
|
// init baseIndexMap
|
||||||
|
for (int i = 0; i < 256; ++i) {
|
||||||
|
baseIndexMap[i] = -1;
|
||||||
|
}
|
||||||
|
baseIndexMap['A'] = 0;
|
||||||
|
baseIndexMap['a'] = 0;
|
||||||
|
baseIndexMap['*'] = 0;
|
||||||
|
baseIndexMap['C'] = 1;
|
||||||
|
baseIndexMap['c'] = 1;
|
||||||
|
baseIndexMap['G'] = 2;
|
||||||
|
baseIndexMap['g'] = 2;
|
||||||
|
baseIndexMap['T'] = 3;
|
||||||
|
baseIndexMap['t'] = 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int CreateMask(int contextSize) {
|
||||||
|
int mask = 0;
|
||||||
|
// create 2*contextSize worth of bits
|
||||||
|
for (int i = 0; i < contextSize; i++) {
|
||||||
|
mask = (mask << 2) | 3;
|
||||||
|
}
|
||||||
|
// shift 4 bits to mask out the bits used to encode the length
|
||||||
|
return mask << LENGTH_BITS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method: computes the correct offset to use in computations of covariate values.
|
||||||
|
* @param isNegativeStrand is the read on the negative strand
|
||||||
|
* @param offset 0-based index of the base in the read
|
||||||
|
* @param readLength length of the read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
|
||||||
|
return isNegativeStrand ? (readLength - offset - 1) : offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||||
|
static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail);
|
||||||
|
// Creates a int representation of a given dna string.
|
||||||
|
static int KeyFromContext(const string& dna, const int start, const int end);
|
||||||
|
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||||
|
static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
|
||||||
|
|
||||||
|
// 设置协变量的值
|
||||||
|
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Cycle协变量
|
||||||
|
struct CycleCovariate {
|
||||||
|
static constexpr int index = 3; // 在协变量数组中的索引位置
|
||||||
|
static int MAXIMUM_CYCLE_VALUE;
|
||||||
|
static constexpr int CUSHION_FOR_INDELS = 4;
|
||||||
|
|
||||||
|
static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes the cycle number as a key.
|
||||||
|
*/
|
||||||
|
static int KeyFromCycle(const int cycle, const int maxCycle) {
|
||||||
|
// no negative values because values must fit into the first few bits of the long
|
||||||
|
int result = std::abs(cycle);
|
||||||
|
if (result > maxCycle) {
|
||||||
|
spdlog::error(
|
||||||
|
"The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected. Please use the --maximum-cycle-value argument "
|
||||||
|
"(when creating the recalibration table in "
|
||||||
|
"BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
|
||||||
|
maxCycle, result);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
result <<= 1; // shift so we can add the "sign" bit
|
||||||
|
if (cycle < 0) {
|
||||||
|
result++; // negative cycles get the lower-most bit set
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Computes the encoded value of CycleCovariate's key for the given position at the read.
|
||||||
|
static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle);
|
||||||
|
|
||||||
|
static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
|
||||||
|
};
|
||||||
|
|
@ -78,6 +78,12 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||||
.nargs(1)
|
.nargs(1)
|
||||||
.metavar("<IndexFormat>");
|
.metavar("<IndexFormat>");
|
||||||
|
|
||||||
|
program.add_argument("--enable-baq")
|
||||||
|
.help("Whether to do BAQ correction.")
|
||||||
|
.default_value(false)
|
||||||
|
.implicit_value(true)
|
||||||
|
.hidden();
|
||||||
|
|
||||||
// add help and version args
|
// add help and version args
|
||||||
program.add_argument("-h", "--help")
|
program.add_argument("-h", "--help")
|
||||||
.action([&](const auto & /*unused*/) {
|
.action([&](const auto & /*unused*/) {
|
||||||
|
|
@ -113,8 +119,9 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||||
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
|
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
|
||||||
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
||||||
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
|
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
|
||||||
|
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
|
||||||
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
||||||
|
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
|
||||||
// spdlog::info("known sites vcf files:");
|
// spdlog::info("known sites vcf files:");
|
||||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||||
// spdlog::info(" {}", ks);
|
// spdlog::info(" {}", ks);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
Description: sam/bam,buf,
|
Description: 读入sam/bam时,开辟一个大的buf,存放这些数据
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
|
@ -10,25 +10,28 @@
|
||||||
#include "bam_buf.h"
|
#include "bam_buf.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* BamBuf
|
* BamBuf类
|
||||||
*/
|
*/
|
||||||
// ,
|
// 读取数据直到读完,或者缓冲区满
|
||||||
int BamBuf::ReadBam() {
|
int BamBuf::ReadBam() {
|
||||||
int read_num = 0;
|
int read_num = 0;
|
||||||
if (handle_last) { // bam
|
if (handle_last) { // 处理上次读入的最后一个bam
|
||||||
if (has_enough_space()) { // ,memffset
|
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||||
++read_num;
|
++read_num;
|
||||||
append_one_bam();
|
append_one_bam();
|
||||||
} else {
|
} else {
|
||||||
return read_num; //
|
return read_num; // 还是没空间
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
|
while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
|
||||||
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
|
bw->end_pos_ = BamWrap::BamEndPos(bw->b);
|
||||||
if (has_enough_space()) { //
|
if (has_enough_space()) { // 还有空间
|
||||||
// if (true) { //
|
// if (true) { // 还有空间
|
||||||
append_one_bam();
|
// 加过滤器
|
||||||
++read_num; //
|
if (filter_out == nullptr || !filter_out(bw->b)) {
|
||||||
|
append_one_bam();
|
||||||
|
++read_num; // 放进缓存才算读取到
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
@ -41,7 +44,7 @@ int BamBuf::ReadBam() {
|
||||||
return read_num;
|
return read_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 初始化缓存
|
||||||
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||||
this->fp = fp;
|
this->fp = fp;
|
||||||
this->hdr = hdr;
|
this->hdr = hdr;
|
||||||
|
|
@ -71,9 +74,9 @@ void BamBuf::ClearAll() {
|
||||||
prepare_read();
|
prepare_read();
|
||||||
}
|
}
|
||||||
|
|
||||||
// ,
|
// 为下一次读取做准备, 计算一些边界条件
|
||||||
inline void BamBuf::prepare_read() {
|
inline void BamBuf::prepare_read() {
|
||||||
// bam
|
// 计算余留的下次计算可能用到的bam所占的位置
|
||||||
if (bv.size() > 0) {
|
if (bv.size() > 0) {
|
||||||
BamWrap *bw = bv[0];
|
BamWrap *bw = bv[0];
|
||||||
legacy_start = (int64_t)bw - (int64_t)mem;
|
legacy_start = (int64_t)bw - (int64_t)mem;
|
||||||
|
|
@ -81,11 +84,11 @@ inline void BamBuf::prepare_read() {
|
||||||
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
|
legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
|
||||||
} else {
|
} else {
|
||||||
legacy_start = legacy_end = 0;
|
legacy_start = legacy_end = 0;
|
||||||
mem_offset = 0; // ,
|
mem_offset = 0; // 上次没剩下,那就从头存储
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 检查缓存是否还有空间
|
||||||
inline bool BamBuf::has_enough_space() {
|
inline bool BamBuf::has_enough_space() {
|
||||||
const uint32_t bam_len = bw->length();
|
const uint32_t bam_len = bw->length();
|
||||||
int64_t potential_end = mem_offset + bam_len;
|
int64_t potential_end = mem_offset + bam_len;
|
||||||
|
|
@ -104,7 +107,7 @@ inline bool BamBuf::has_enough_space() {
|
||||||
return potential_end < legacy_start;
|
return potential_end < legacy_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
// bam
|
// 处理一个读取后的bam
|
||||||
inline void BamBuf::append_one_bam() {
|
inline void BamBuf::append_one_bam() {
|
||||||
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
|
BamWrap *bwp = (BamWrap *)(mem + mem_offset);
|
||||||
*bwp = *bw;
|
*bwp = *bw;
|
||||||
|
|
@ -113,15 +116,15 @@ inline void BamBuf::append_one_bam() {
|
||||||
*bp = *bw->b;
|
*bp = *bw->b;
|
||||||
bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t));
|
bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t));
|
||||||
memcpy(bp->data, bw->b->data, bw->b->l_data);
|
memcpy(bp->data, bw->b->data, bw->b->l_data);
|
||||||
//
|
// 更新下次存储的位置
|
||||||
mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1));
|
mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1));
|
||||||
bv.push_back(bwp);
|
bv.push_back(bwp);
|
||||||
}
|
}
|
||||||
|
|
||||||
// read
|
// 处理上次读入的最后一个read
|
||||||
inline bool BamBuf::handle_last_read() {
|
inline bool BamBuf::handle_last_read() {
|
||||||
if (handle_last) { // bam
|
if (handle_last) { // 处理上次读入的最后一个bam
|
||||||
if (has_enough_space()) { // ,memffset
|
if (has_enough_space()) { // 必须调用,在边界处调整memffset
|
||||||
append_one_bam();
|
append_one_bam();
|
||||||
handle_last = false;
|
handle_last = false;
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -131,9 +134,9 @@ inline bool BamBuf::handle_last_read() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* AsyncIoBamBuf
|
* AsyncIoBamBuf 类
|
||||||
*/
|
*/
|
||||||
//
|
// 初始化缓存
|
||||||
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||||
if (use_async_io_) {
|
if (use_async_io_) {
|
||||||
buf1_.Init(fp, hdr, mem_size >> 1);
|
buf1_.Init(fp, hdr, mem_size >> 1);
|
||||||
|
|
@ -147,7 +150,7 @@ void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 读取数据
|
||||||
int AsyncIoBamBuf::ReadBam() {
|
int AsyncIoBamBuf::ReadBam() {
|
||||||
if (use_async_io_) {
|
if (use_async_io_) {
|
||||||
hasThread = true;
|
hasThread = true;
|
||||||
|
|
@ -178,11 +181,11 @@ int AsyncIoBamBuf::async_read_bam() {
|
||||||
first_read_ = false;
|
first_read_ = false;
|
||||||
refresh_bam_arr();
|
refresh_bam_arr();
|
||||||
} else {
|
} else {
|
||||||
// join,
|
// join, 交换缓冲区指针
|
||||||
pthread_join(*tid_, 0);
|
pthread_join(*tid_, 0);
|
||||||
resize_buf();
|
resize_buf();
|
||||||
|
|
||||||
if (need_read_) { //
|
if (need_read_) { // 需要交换指针
|
||||||
BamBuf *tmp = pi_;
|
BamBuf *tmp = pi_;
|
||||||
pi_ = po_;
|
pi_ = po_;
|
||||||
po_ = tmp;
|
po_ = tmp;
|
||||||
|
|
@ -190,14 +193,14 @@ int AsyncIoBamBuf::async_read_bam() {
|
||||||
read_num = last_read_num_;
|
read_num = last_read_num_;
|
||||||
refresh_bam_arr();
|
refresh_bam_arr();
|
||||||
}
|
}
|
||||||
//
|
// 异步读
|
||||||
pthread_create(tid_, 0, async_read, this);
|
pthread_create(tid_, 0, async_read, this);
|
||||||
return read_num;
|
return read_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *AsyncIoBamBuf::async_read(void *data) {
|
void *AsyncIoBamBuf::async_read(void *data) {
|
||||||
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
|
AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
|
||||||
if (ab->need_read_ && ab->ReadStat() >= 0) { //
|
if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取
|
||||||
ab->last_read_num_ = ab->po_->ReadBam();
|
ab->last_read_num_ = ab->po_->ReadBam();
|
||||||
} else {
|
} else {
|
||||||
ab->last_read_num_ = 0;
|
ab->last_read_num_ = 0;
|
||||||
|
|
@ -205,23 +208,23 @@ void *AsyncIoBamBuf::async_read(void *data) {
|
||||||
pthread_exit(0);
|
pthread_exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ,
|
// 为下一次读取做准备,
|
||||||
// ,,po_buf
|
// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取
|
||||||
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; }
|
void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; }
|
||||||
|
|
||||||
// ,,po_buf
|
// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取
|
||||||
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
|
void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
|
||||||
|
|
||||||
inline void AsyncIoBamBuf::resize_buf() {
|
inline void AsyncIoBamBuf::resize_buf() {
|
||||||
if (clear_all_) { //
|
if (clear_all_) { // 清理上一轮的数据
|
||||||
clear_all_ = false;
|
clear_all_ = false;
|
||||||
po_->ClearBeforeIdx(legacy_size_);
|
po_->ClearBeforeIdx(legacy_size_);
|
||||||
pi_->ClearAll();
|
pi_->ClearAll();
|
||||||
if (pi_->handle_last_read()) { // read
|
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
|
||||||
last_read_num_ += 1;
|
last_read_num_ += 1;
|
||||||
legacy_size_ = pi_->Size(); // read
|
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||||
need_read_ = true;
|
need_read_ = true;
|
||||||
} else { // ,,
|
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||||
legacy_size_ = 0;
|
legacy_size_ = 0;
|
||||||
need_read_ = false;
|
need_read_ = false;
|
||||||
}
|
}
|
||||||
|
|
@ -229,16 +232,16 @@ inline void AsyncIoBamBuf::resize_buf() {
|
||||||
if (clear_before_idx_ < legacy_size_) {
|
if (clear_before_idx_ < legacy_size_) {
|
||||||
po_->ClearBeforeIdx(clear_before_idx_);
|
po_->ClearBeforeIdx(clear_before_idx_);
|
||||||
legacy_size_ -= clear_before_idx_;
|
legacy_size_ -= clear_before_idx_;
|
||||||
// ,
|
// 不需要交换指针,不需要读取
|
||||||
need_read_ = false;
|
need_read_ = false;
|
||||||
} else {
|
} else {
|
||||||
po_->ClearBeforeIdx(legacy_size_);
|
po_->ClearBeforeIdx(legacy_size_);
|
||||||
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
|
pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
|
||||||
if (pi_->handle_last_read()) { // read
|
if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
|
||||||
last_read_num_ += 1;
|
last_read_num_ += 1;
|
||||||
legacy_size_ = pi_->Size(); // read
|
legacy_size_ = pi_->Size(); // 应该只有一个read
|
||||||
need_read_ = true;
|
need_read_ = true;
|
||||||
} else { // ,,
|
} else { // 没空间存放,则不交换指针,或者文件已经读取完毕
|
||||||
legacy_size_ = 0;
|
legacy_size_ = 0;
|
||||||
need_read_ = false;
|
need_read_ = false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
Description: sam/bam,buf,
|
Description: 读入sam/bam时,开辟一个大的buf,存放这些数据
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
|
@ -26,33 +26,40 @@
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
using ReadFilterOut = bool (*)(const bam1_t *b);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* bam
|
* 存放读入的bam数据
|
||||||
*/
|
*/
|
||||||
struct BamBuf {
|
struct BamBuf {
|
||||||
sam_hdr_t *hdr; // samheader
|
sam_hdr_t *hdr; // sam文件的header信息
|
||||||
samFile *fp; // sam
|
samFile *fp; // sam文件指针
|
||||||
BamWrap *bw = nullptr; // bam
|
BamWrap *bw = nullptr; // 用来循环读入bam
|
||||||
uint8_t *mem = nullptr; // bam,
|
uint8_t *mem = nullptr; // 用来存放bam的数据,
|
||||||
// ,
|
// 程序结束后自动释放,所以没在析构函数里释放
|
||||||
int64_t mem_offset = 0; //
|
int64_t mem_offset = 0; // 下一次要存放的位置
|
||||||
int64_t mem_size; //
|
int64_t mem_size; // 缓存大小
|
||||||
int read_stat_ = 0; // ,
|
int read_stat_ = 0; // 读取状态,是否读完
|
||||||
vector<BamWrap *> bv; // bam
|
vector<BamWrap *> bv; // 方便对bam数据的访问
|
||||||
int64_t legacy_start = 0; // bammem,
|
int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间
|
||||||
int64_t legacy_end = 0; // bammem,
|
int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间
|
||||||
bool handle_last = false; // bam
|
bool handle_last = false; // 上次最后读入的bam是否需要处理
|
||||||
|
ReadFilterOut filter_out = nullptr; // 读入过滤函数指针
|
||||||
|
|
||||||
//
|
// 初始化缓存
|
||||||
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
||||||
// ,
|
void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) {
|
||||||
|
this->filter_out = filter;
|
||||||
|
Init(fp, hdr, mem_size);
|
||||||
|
}
|
||||||
|
// 读取数据直到读完,或者缓冲区满
|
||||||
int ReadBam();
|
int ReadBam();
|
||||||
// ,
|
// 为下一次读取做准备, 计算一些边界条件
|
||||||
void ClearBeforeIdx(size_t idxInBv);
|
void ClearBeforeIdx(size_t idxInBv);
|
||||||
//
|
// 清空上一次所有读入的数据
|
||||||
void ClearAll();
|
void ClearAll();
|
||||||
inline int64_t Size() { return bv.size(); } // read
|
inline int64_t Size() { return bv.size(); } // 包含多少个read
|
||||||
inline int ReadStat() { return read_stat_; } // ,()
|
inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全)
|
||||||
~BamBuf() {
|
~BamBuf() {
|
||||||
if (this->mem != nullptr) {
|
if (this->mem != nullptr) {
|
||||||
free(this->mem);
|
free(this->mem);
|
||||||
|
|
@ -62,7 +69,7 @@ struct BamBuf {
|
||||||
free(bw);
|
free(bw);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void FreeMemory() //
|
void FreeMemory() // 释放开辟的内存
|
||||||
{
|
{
|
||||||
if (this->mem != nullptr) {
|
if (this->mem != nullptr) {
|
||||||
free(this->mem);
|
free(this->mem);
|
||||||
|
|
@ -75,14 +82,14 @@ struct BamBuf {
|
||||||
this->bw = nullptr;
|
this->bw = nullptr;
|
||||||
}
|
}
|
||||||
void prepare_read();
|
void prepare_read();
|
||||||
//
|
// 检查缓存是否还有空间
|
||||||
bool has_enough_space();
|
bool has_enough_space();
|
||||||
// bam
|
// 处理一个读取后的bam
|
||||||
void append_one_bam();
|
void append_one_bam();
|
||||||
// read
|
// 处理上次读入的最后一个read
|
||||||
bool handle_last_read();
|
bool handle_last_read();
|
||||||
|
|
||||||
// bv
|
// 针对bv的操作
|
||||||
inline BamWrap *operator[](int64_t pos) { return bv[pos]; }
|
inline BamWrap *operator[](int64_t pos) { return bv[pos]; }
|
||||||
inline void push_back(BamWrap *val) { bv.push_back(val); }
|
inline void push_back(BamWrap *val) { bv.push_back(val); }
|
||||||
inline void clear() { bv.clear(); }
|
inline void clear() { bv.clear(); }
|
||||||
|
|
@ -90,53 +97,57 @@ struct BamBuf {
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* io
|
* io异步缓冲区
|
||||||
*/
|
*/
|
||||||
struct AsyncIoBamBuf {
|
struct AsyncIoBamBuf {
|
||||||
BamBuf buf1_;
|
BamBuf buf1_;
|
||||||
BamBuf buf2_;
|
BamBuf buf2_;
|
||||||
BamBuf *pi_; // buf
|
BamBuf *pi_; // 当前用的buf
|
||||||
BamBuf *po_; // buf
|
BamBuf *po_; // 后台在读取的buf
|
||||||
pthread_t *tid_ = NULL;
|
pthread_t *tid_ = NULL;
|
||||||
bool hasThread = false;
|
bool hasThread = false;
|
||||||
int64_t legacy_size_ = 0; // ,read
|
int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量
|
||||||
bool first_read_ = true;
|
bool first_read_ = true;
|
||||||
int last_read_num_ = 0; // reads
|
int last_read_num_ = 0; // 上一次读取了多少reads
|
||||||
bool need_read_ = true;
|
bool need_read_ = true;
|
||||||
bool use_async_io_ = true;
|
bool use_async_io_ = true;
|
||||||
int64_t clear_before_idx_ = 0; // ,clear_before_idx_reads
|
int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads
|
||||||
bool clear_all_ = false; // ,reads
|
bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads
|
||||||
|
|
||||||
vector<BamWrap *> bam_arr_; // bufbam
|
vector<BamWrap *> bam_arr_; // 用来访问buf中的bam
|
||||||
|
|
||||||
AsyncIoBamBuf() {}
|
AsyncIoBamBuf() {}
|
||||||
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
|
AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
|
||||||
//
|
// 析构
|
||||||
~AsyncIoBamBuf() {
|
~AsyncIoBamBuf() {
|
||||||
if (tid_ != NULL) {
|
if (tid_ != NULL) {
|
||||||
if (hasThread)
|
if (hasThread)
|
||||||
pthread_join(*tid_, 0);
|
pthread_join(*tid_, 0);
|
||||||
free(tid_);
|
free(tid_);
|
||||||
}
|
}
|
||||||
//
|
// 其他的内存就等程序结束自动释放
|
||||||
// buf
|
// buf的析构函数会自动调用
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 初始化缓存
|
||||||
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
|
||||||
|
void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) {
|
||||||
//
|
Init(fp, hdr, mem_size);
|
||||||
|
buf1_.filter_out = filter;
|
||||||
|
buf2_.filter_out = filter;
|
||||||
|
}
|
||||||
|
// 读取数据
|
||||||
int ReadBam();
|
int ReadBam();
|
||||||
// ,
|
// 为下一次读取做准备, 计算一些边界条件
|
||||||
void ClearBeforeIdx(size_t idxInBv);
|
void ClearBeforeIdx(size_t idxInBv);
|
||||||
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // bam array
|
vector<BamWrap *> &GetBamArr() { return bam_arr_; } // 获取bam array
|
||||||
//
|
// 清空上一次所有读入的数据
|
||||||
void ClearAll();
|
void ClearAll();
|
||||||
// read
|
// 包含的read数量
|
||||||
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
|
inline int64_t Size() { return legacy_size_ + pi_->Size(); }
|
||||||
inline int ReadStat() { return pi_->read_stat_; }
|
inline int ReadStat() { return pi_->read_stat_; }
|
||||||
inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
|
inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
|
||||||
// reads
|
// 获取某一段reads
|
||||||
inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
|
inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
|
||||||
if (endIdx > startIdx) {
|
if (endIdx > startIdx) {
|
||||||
auto begItr = bam_arr_.begin();
|
auto begItr = bam_arr_.begin();
|
||||||
|
|
@ -149,11 +160,11 @@ struct AsyncIoBamBuf {
|
||||||
buf2_.FreeMemory();
|
buf2_.FreeMemory();
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 同步读取
|
||||||
int sync_read_bam();
|
int sync_read_bam();
|
||||||
//
|
// 异步读取
|
||||||
int async_read_bam();
|
int async_read_bam();
|
||||||
//
|
// 异步读取线程函数
|
||||||
static void *async_read(void *data);
|
static void *async_read(void *data);
|
||||||
void resize_buf();
|
void resize_buf();
|
||||||
inline void refresh_bam_arr() {
|
inline void refresh_bam_arr() {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
Description: sam/bam,buf,
|
Description: 读入sam/bam时,开辟一个大的buf,存放这些数据
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
|
@ -19,38 +19,55 @@
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
// 对原始bam数据的补充,比如对两端进行hardclip等
|
||||||
|
// Supplementary per-read data that complements the raw bam record,
// e.g. the outcome of hard-clipping both ends of the read.
struct ReadAdditionData {
    int read_len{0};          // read length after all clipping has been applied
    int cigar_start{0};       // first cigar index in use (inclusive)
    int cigar_end{0};         // end of the cigar range in use (exclusive)
    int first_cigar_clip{0};  // amount clipped from the first cigar element (left side)
    int last_cigar_clip{0};   // amount clipped from the last cigar element (right side)
    int left_clip{0};         // number of bases removed on the left
    int right_clip{0};        // number of bases removed on the right
    int ref_offset{0};        // offset from the original alignment position (contig_pos)
                              // after adapter / soft-clip removal
    string bases;             // bases of the processed read
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
,,,,,
|
这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明
|
||||||
bam,,。
|
这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* sam read
|
* sam read的封装
|
||||||
*/
|
*/
|
||||||
struct BamWrap {
|
struct BamWrap {
|
||||||
// contigpos
|
// 将contig左移后加上pos作为全局位置
|
||||||
const static int MAX_CONTIG_LEN_SHIFT = 40; // id,
|
const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起
|
||||||
const static int READ_MAX_LENGTH = 200;
|
const static int READ_MAX_LENGTH = 200;
|
||||||
const static int READ_MAX_DEPTH = 1000; // ,
|
const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系
|
||||||
|
|
||||||
// ,
|
// 成员变量尽量少,减少占用内存空间
|
||||||
bam1_t *b;
|
bam1_t *b;
|
||||||
int64_t end_pos_; // bam,
|
int64_t end_pos_; // bam的全局结束位置, 相对ref, 闭区间
|
||||||
|
|
||||||
//
|
// 全局开始位置
|
||||||
inline int64_t start_pos() { return bam_global_pos(b); }
|
inline int64_t start_pos() { return bam_global_pos(b); }
|
||||||
//
|
// 全局结束位置
|
||||||
inline int64_t end_pos() { return end_pos_; }
|
inline int64_t end_pos() { return end_pos_; }
|
||||||
// reference
|
// 和reference对应的序列长度,不是read包含碱基的个数
|
||||||
inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
|
inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
|
||||||
|
|
||||||
// contig
|
// contig id
|
||||||
|
inline int32_t contig_id() { return b->core.tid; }
|
||||||
|
// 在contig内的开始位置
|
||||||
inline int32_t contig_pos() { return b->core.pos; }
|
inline int32_t contig_pos() { return b->core.pos; }
|
||||||
// contig
|
// 在contig内部的结束位置
|
||||||
inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
|
inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
|
||||||
// (AGTC)
|
// 序列的长度(AGTC字母个数)
|
||||||
inline int16_t seq_len() { return b->core.l_qseq; }
|
inline int16_t seq_len() { return b->core.l_qseq; }
|
||||||
|
|
||||||
// softclip
|
/*
|
||||||
|
// 算上开头的softclip
|
||||||
inline int32_t softclip_start() {
|
inline int32_t softclip_start() {
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
|
|
@ -61,18 +78,28 @@ struct BamWrap {
|
||||||
return bc.pos;
|
return bc.pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
// softclip
|
inline int64_t global_softclip_start() {
|
||||||
|
return softclip_start() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 算上结尾的softclip,闭区间
|
||||||
inline int32_t softclip_end() {
|
inline int32_t softclip_end() {
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
|
const int idx = bc.n_cigar - 1;
|
||||||
const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]);
|
if (idx < 0) return bam_pos(end_pos_);
|
||||||
|
const char c = bam_cigar_opchr(cigar[idx]);
|
||||||
|
const int len = bam_cigar_oplen(cigar[idx]);
|
||||||
if (c == 'S')
|
if (c == 'S')
|
||||||
return bam_pos(end_pos_) + len;
|
return bam_pos(end_pos_) + len;
|
||||||
return bam_pos(end_pos_);
|
return bam_pos(end_pos_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// softclip
|
inline int64_t global_softclip_end() {
|
||||||
|
return softclip_end() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 右边softclip的长度
|
||||||
inline int32_t right_softclip_len() {
|
inline int32_t right_softclip_len() {
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
|
|
@ -82,8 +109,9 @@ struct BamWrap {
|
||||||
return len;
|
return len;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
//
|
// 获取序列
|
||||||
inline std::string sequence() {
|
inline std::string sequence() {
|
||||||
ostringstream oss;
|
ostringstream oss;
|
||||||
char *seq = (char *)bam_get_seq(b);
|
char *seq = (char *)bam_get_seq(b);
|
||||||
|
|
@ -96,9 +124,9 @@ struct BamWrap {
|
||||||
return std::move(oss.str());
|
return std::move(oss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 获取名字
|
||||||
inline const char *query_name() { return bam_get_qname(b); }
|
inline const char *query_name() { return bam_get_qname(b); }
|
||||||
// cigar
|
// 获取cigar 字符串
|
||||||
inline string cigar_str() {
|
inline string cigar_str() {
|
||||||
ostringstream oss;
|
ostringstream oss;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
|
@ -111,10 +139,10 @@ struct BamWrap {
|
||||||
return std::move(oss.str());
|
return std::move(oss.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 占用的内存大小
|
||||||
// Number of bytes this record occupies in the buffer: the wrapper itself,
// the bam1_t header, and the record's variable-length data (b->l_data).
// Returned as int32_t rather than int16_t: the sum exceeds the int16_t range
// as soon as l_data grows past roughly 32 KB, and a wrapped (possibly negative)
// length would corrupt the buffer-offset arithmetic that is built on it.
inline int32_t length() { return static_cast<int32_t>(sizeof(*this) + sizeof(bam1_t) + b->l_data); }
|
||||||
|
|
||||||
// cigarinsert
|
// 获取cigar中insert的总长度
|
||||||
inline int32_t insert_cigar_len() {
|
inline int32_t insert_cigar_len() {
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
|
|
@ -128,7 +156,7 @@ struct BamWrap {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
// cigardelete
|
// 获取cigar中delete的总长度
|
||||||
inline int32_t del_cigar_len() {
|
inline int32_t del_cigar_len() {
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
|
|
@ -142,7 +170,7 @@ struct BamWrap {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
// sam read
|
// 计算sam read的终点位置,相对参考基因组
|
||||||
static inline int64_t BamEndPos(const bam1_t *b) {
|
static inline int64_t BamEndPos(const bam1_t *b) {
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
|
|
@ -156,6 +184,20 @@ struct BamWrap {
|
||||||
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
|
return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// 计算read的有效长度,即除了softclip和hardclip之外的长度
|
||||||
|
static inline int BamEffectiveLength(const bam1_t *b) {
|
||||||
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
const bam1_core_t &bc = b->core;
|
||||||
|
int effective_len = 0;
|
||||||
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
if (c == 'I' || c == 'N' || c == 'M' || c == '=' || c == 'X')
|
||||||
|
effective_len += len;
|
||||||
|
}
|
||||||
|
return effective_len;
|
||||||
|
};
|
||||||
|
|
||||||
bool HasWellDefinedFragmentSize() {
|
bool HasWellDefinedFragmentSize() {
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
bool hasWellDefinedFragmentSize = true;
|
bool hasWellDefinedFragmentSize = true;
|
||||||
|
|
@ -170,20 +212,25 @@ struct BamWrap {
|
||||||
return hasWellDefinedFragmentSize;
|
return hasWellDefinedFragmentSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
// bamadapterBoundary
|
// 计算bam的adapterBoundary
|
||||||
int GetAdapterBoundary() {
|
int GetAdapterBoundary() {
|
||||||
const bam1_core_t &bc = b->core;
|
const bam1_core_t &bc = b->core;
|
||||||
int adapterBoundary;
|
int adapterBoundary = INT_MIN;
|
||||||
if (!HasWellDefinedFragmentSize())
|
if (!HasWellDefinedFragmentSize())
|
||||||
adapterBoundary = INT_MIN;
|
adapterBoundary = INT_MIN;
|
||||||
else if (bc.flag & BAM_FREVERSE)
|
else if (bc.flag & BAM_FREVERSE)
|
||||||
adapterBoundary = bc.mpos - 1;
|
adapterBoundary = bc.mpos - 1;
|
||||||
else
|
else
|
||||||
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 GATK3.5,3.5+1
|
adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1
|
||||||
return adapterBoundary;
|
return adapterBoundary;
|
||||||
}
|
}
|
||||||
|
|
||||||
// I
|
// 检测adapter boundary是否在read范围内
|
||||||
|
bool IsAdapterInRead(int adapterBoundary) {
|
||||||
|
return (adapterBoundary != INT_MIN && (adapterBoundary >= contig_pos() && adapterBoundary <= contig_end_pos()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// 获取开头的I的长度
|
||||||
inline int GetHeadInsertLen() {
|
inline int GetHeadInsertLen() {
|
||||||
int insLen = 0;
|
int insLen = 0;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
|
@ -200,8 +247,8 @@ struct BamWrap {
|
||||||
return insLen;
|
return insLen;
|
||||||
}
|
}
|
||||||
|
|
||||||
// soft clip(HS,?,
|
// 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?,
|
||||||
// IS?)
|
// 注意开头的I要当做S?)
|
||||||
inline int64_t GetSoftStart() {
|
inline int64_t GetSoftStart() {
|
||||||
int64_t softStart = b->core.pos;
|
int64_t softStart = b->core.pos;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
|
@ -209,7 +256,8 @@ struct BamWrap {
|
||||||
for (int i = 0; i < bc.n_cigar; ++i) {
|
for (int i = 0; i < bc.n_cigar; ++i) {
|
||||||
const char c = bam_cigar_opchr(cigar[i]);
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
const int len = bam_cigar_oplen(cigar[i]);
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
if (c == 'S' || c == 'I')
|
// if (c == 'S' || c == 'I')
|
||||||
|
if (c == 'S')
|
||||||
softStart -= len;
|
softStart -= len;
|
||||||
else if (c != 'H')
|
else if (c != 'H')
|
||||||
break;
|
break;
|
||||||
|
|
@ -217,7 +265,36 @@ struct BamWrap {
|
||||||
return softStart;
|
return softStart;
|
||||||
}
|
}
|
||||||
|
|
||||||
// unclipped(hardclip)
|
/**
|
||||||
|
* Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
|
||||||
|
*
|
||||||
|
* Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
|
||||||
|
*
|
||||||
|
* @return the unclipped end of the read taking soft clips (but not hard clips) into account
|
||||||
|
*/
|
||||||
|
inline int64_t GetSoftEnd() {
|
||||||
|
int64_t softEnd = contig_end_pos();
|
||||||
|
const uint32_t* cigar = bam_get_cigar(b);
|
||||||
|
const bam1_core_t& bc = b->core;
|
||||||
|
bool foundAlignedBase = false;
|
||||||
|
for (int i = bc.n_cigar - 1; i >= 0; --i) {
|
||||||
|
const char c = bam_cigar_opchr(cigar[i]);
|
||||||
|
const int len = bam_cigar_oplen(cigar[i]);
|
||||||
|
// if (c == 'S' || c == 'I')
|
||||||
|
if (c == 'S')
|
||||||
|
softEnd += len;
|
||||||
|
else if (c != 'H') {
|
||||||
|
foundAlignedBase = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!foundAlignedBase) { // for example 64H14S, the soft end is actually the same as the alignment end
|
||||||
|
softEnd = contig_end_pos();
|
||||||
|
}
|
||||||
|
return softEnd;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 获取unclipped开始位置(包括hardclip)
|
||||||
inline int64_t GetUnclippedStart() {
|
inline int64_t GetUnclippedStart() {
|
||||||
int64_t start = b->core.pos;
|
int64_t start = b->core.pos;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
|
@ -233,7 +310,7 @@ struct BamWrap {
|
||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
// unclipped(hardclip)
|
// 获取unclipped结束位置(包括hardclip)
|
||||||
inline int64_t GetUnclippedEnd() {
|
inline int64_t GetUnclippedEnd() {
|
||||||
int64_t end_pos = bam_endpos(b);
|
int64_t end_pos = bam_endpos(b);
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
|
@ -249,7 +326,7 @@ struct BamWrap {
|
||||||
return end_pos - 1;
|
return end_pos - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* */
|
/* 获取碱基质量分数的加和 */
|
||||||
/** Calculates a score for the read which is the sum of scores over Q15. */
|
/** Calculates a score for the read which is the sum of scores over Q15. */
|
||||||
inline int GetSumOfBaseQualities() {
|
inline int GetSumOfBaseQualities() {
|
||||||
int score = 0;
|
int score = 0;
|
||||||
|
|
@ -262,9 +339,9 @@ struct BamWrap {
|
||||||
return score;
|
return score;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* flag */
|
/* 与flag相关的检测 */
|
||||||
|
|
||||||
/* unmapped */
|
/* 没有比对上 unmapped */
|
||||||
inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
|
inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
|
||||||
|
|
||||||
/* Template having multiple segments in sequencing */
|
/* Template having multiple segments in sequencing */
|
||||||
|
|
@ -313,7 +390,7 @@ struct BamWrap {
|
||||||
*/
|
*/
|
||||||
bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
|
bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
|
||||||
|
|
||||||
/* */
|
/* 其他的一些信息 */
|
||||||
inline int GetReferenceLength() {
|
inline int GetReferenceLength() {
|
||||||
int length = 0;
|
int length = 0;
|
||||||
const uint32_t *cigar = bam_get_cigar(b);
|
const uint32_t *cigar = bam_get_cigar(b);
|
||||||
|
|
@ -336,26 +413,26 @@ struct BamWrap {
|
||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
// bam,
|
// 计算bam的全局位置,算上染色体序号和比对位置
|
||||||
static inline int64_t bam_global_pos(bam1_t *b) {
    // Contig id goes into the high bits, the in-contig position into the low bits.
    const int64_t contigBits = (int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT;
    const int64_t posBits = (int64_t)b->core.pos;
    return contigBits | posBits;
}
|
||||||
static inline int64_t bam_global_pos(int tid, int pos) {
|
static inline int64_t bam_global_pos(int tid, int pos) {
|
||||||
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
|
return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
|
||||||
}
|
}
|
||||||
// bam
|
// 根据全局位置获取bam的染色体序号
|
||||||
static inline int32_t bam_tid(int64_t global_pos) {
|
static inline int32_t bam_tid(int64_t global_pos) {
|
||||||
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
|
const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
|
||||||
const int64_t high_tid = global_pos & mask;
|
const int64_t high_tid = global_pos & mask;
|
||||||
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
|
return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
|
||||||
}
|
}
|
||||||
// bam()
|
// 根据全局位置获取bam的比对位置(染色体内)
|
||||||
static inline int32_t bam_pos(int64_t global_pos) {
|
static inline int32_t bam_pos(int64_t global_pos) {
|
||||||
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
|
const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
|
||||||
return (int32_t)(global_pos & mask);
|
return (int32_t)(global_pos & mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
// 设置是否冗余的标记
|
||||||
void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }
|
void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }
|
||||||
|
|
||||||
void setFlag(bool flag, int bit) {
|
void setFlag(bool flag, int bit) {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,131 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using std::ifstream;
|
||||||
|
using std::ofstream;
|
||||||
|
using std::ostringstream;
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Binary read/write helpers. Multi-byte integers are stored least-significant
// byte first; strings are stored as their bytes followed by a single zero byte.
struct BinaryUtil {
    // Writes val to out as 4 bytes, least-significant byte first.
    static void WriteInt(std::ofstream& out, int val) {
        const uint32_t v = (uint32_t)val;
        char bytes[4];
        for (int i = 0; i < 4; ++i) bytes[i] = (char)((v >> (8 * i)) & 0xFF);
        // One unformatted write instead of four formatted single-char inserts.
        out.write(bytes, sizeof(bytes));
    }

    // Writes val to out as 8 bytes, least-significant byte first.
    static void WriteLong(std::ofstream& out, uint64_t val) {
        char bytes[8];
        for (int i = 0; i < 8; ++i) bytes[i] = (char)((val >> (8 * i)) & 0xFF);
        out.write(bytes, sizeof(bytes));
    }

    // Writes the bytes of s followed by a terminating zero byte.
    static void WriteStr(std::ofstream& out, const std::string& s) {
        out.write(s.data(), (std::streamsize)s.size());
        out.put((char)0);
    }

    // Decodes a 4-byte integer from buf at offset *cur and advances *cur by 4.
    // Returns false (leaving *cur and *res untouched) when fewer than 4 bytes
    // remain before `total`.
    static bool ReadInt(char* buf, uint64_t total, uint64_t* cur, int* res) {
        // Written as a subtraction so a huge *cur cannot wrap the bound check.
        if (*cur > total || total - *cur < 4)
            return false;
        uint32_t v = 0;
        for (int i = 0; i < 4; ++i) v |= ((uint32_t)(uint8_t)buf[(*cur)++]) << (8 * i);
        *res = (int)v;
        return true;
    }

    // Reads a 4-byte integer from ifs. Returns false when the stream cannot
    // supply 4 bytes.
    static bool ReadInt(std::ifstream& ifs, int* res) {
        char b[4];
        if (!ifs.read(b, sizeof(b)))
            return false;
        uint64_t cur = 0;
        return ReadInt(b, sizeof(b), &cur, res);
    }

    // Decodes an 8-byte integer from buf at offset *cur and advances *cur by 8.
    // Returns false (leaving *cur and *res untouched) when fewer than 8 bytes
    // remain before `total`.
    static bool ReadLong(char* buf, uint64_t total, uint64_t* cur, uint64_t* res) {
        if (*cur > total || total - *cur < 8)
            return false;
        uint64_t v = 0;
        for (int i = 0; i < 8; ++i) v |= ((uint64_t)(uint8_t)buf[(*cur)++]) << (8 * i);
        *res = v;
        return true;
    }

    // Reads an 8-byte integer from ifs. Returns false when the stream cannot
    // supply 8 bytes.
    static bool ReadLong(std::ifstream& ifs, uint64_t* res) {
        char b[8];
        if (!ifs.read(b, sizeof(b)))
            return false;
        uint64_t cur = 0;
        return ReadLong(b, sizeof(b), &cur, res);
    }

    // Reads a zero-terminated string from ifs into *res (terminator not stored).
    // Returns false if the stream ends before a terminator is seen; *res then
    // holds the bytes read so far.
    static bool ReadStr(std::ifstream& ifs, std::string* res) {
        res->clear();
        char c;
        while (ifs.get(c)) {
            if (c == 0)
                return true;
            res->push_back(c);
        }
        return false;
    }

    // Reads a zero-terminated string from buf at offset *cur into *res and
    // advances *cur past the terminator. Returns false if no terminator occurs
    // before `total`; in that case *res holds the remaining bytes and *cur == total
    // (or *res is empty and *cur is unchanged when *cur was already >= total).
    static bool ReadStr(char* buf, uint64_t total, uint64_t* cur, std::string* res) {
        res->clear();
        if (*cur >= total)
            return false;
        const char* first = buf + *cur;
        const uint64_t avail = total - *cur;
        // Locate the terminator in one scan and copy the bytes in one assign.
        const char* nul = std::char_traits<char>::find(first, avail, '\0');
        if (nul == nullptr) {
            res->assign(first, avail);
            *cur = total;
            return false;
        }
        res->assign(first, nul);
        *cur += (uint64_t)(nul - first) + 1;
        return true;
    }
};
|
||||||
|
|
@ -0,0 +1,295 @@
|
||||||
|
/*
|
||||||
|
Description: 处理intervals
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2019/11/24
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "interval.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
#include "bam_wrap.h"
|
||||||
|
|
||||||
|
using std::min;
|
||||||
|
using std::max;
|
||||||
|
using std::string;
|
||||||
|
using std::ifstream;
|
||||||
|
using std::stringstream;
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// 构造函数
|
||||||
|
Interval::Interval() : Interval(0, 0) {}
|
||||||
|
Interval::Interval(int64_t l, int64_t r) : left(l), right(r) {}
|
||||||
|
|
||||||
|
// 比较函数
|
||||||
|
bool Interval::operator<(const Interval& other) {
|
||||||
|
if (left == other.left) {
|
||||||
|
return right < other.right;
|
||||||
|
}
|
||||||
|
return left < other.left;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 是否有重叠
|
||||||
|
bool Interval::overlaps(const Interval &other) {
|
||||||
|
return left <= other.right && right >= other.left;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 两个interval的合并
|
||||||
|
Interval& Interval::spanWith(const Interval &other) {
|
||||||
|
left = min(left, other.left);
|
||||||
|
right = max(right, other.right);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 返回两个interval的交集,不改变当前interval
|
||||||
|
Interval Interval::intersect(const Interval &that) const {
|
||||||
|
Interval val;
|
||||||
|
val.left = max(left, that.left);
|
||||||
|
val.right = min(right, that.right);
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 合并两个interval arr,取相交区域的交集, interval arr都是排序后的
|
||||||
|
*/
|
||||||
|
void Interval::IntersectIntervals(const IntervalArr &a_arr,
|
||||||
|
const IntervalArr &b_arr,
|
||||||
|
IntervalArr *r_arr) {
|
||||||
|
if (a_arr.size() < 1 || b_arr.size() < 1) return;
|
||||||
|
int ai=0, bi=0;
|
||||||
|
const Interval *last, *cur;
|
||||||
|
if (a_arr[ai].left < b_arr[bi].left) last = &a_arr[ai++];
|
||||||
|
else last = &b_arr[bi++];
|
||||||
|
while (ai < a_arr.size() && bi < b_arr.size()) {
|
||||||
|
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
|
||||||
|
else cur = &b_arr[bi++];
|
||||||
|
if (last->right < cur->left) {
|
||||||
|
last = cur; continue;
|
||||||
|
} else if (last->right > cur->right) {
|
||||||
|
r_arr->push_back(*cur);
|
||||||
|
} else {
|
||||||
|
r_arr->push_back(Interval(cur->left, last->right));
|
||||||
|
last = cur;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const IntervalArr *arrp;
|
||||||
|
int ii;
|
||||||
|
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai;}
|
||||||
|
else { arrp = &b_arr; ii = bi; }
|
||||||
|
const IntervalArr &arr = *arrp;
|
||||||
|
while(ii < arr.size()) {
|
||||||
|
cur = &arr[ii++];
|
||||||
|
if (last->right < cur->left) {
|
||||||
|
break;
|
||||||
|
} else if (last->right > cur->right) {
|
||||||
|
r_arr->push_back(*cur);
|
||||||
|
} else {
|
||||||
|
r_arr->push_back(Interval(cur->left, last->right));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 合并两个interval arr,取并集
|
||||||
|
*/
|
||||||
|
void Interval::UnionIntervals(const IntervalArr &a_arr,
|
||||||
|
const IntervalArr &b_arr,
|
||||||
|
IntervalArr *r_arr) {
|
||||||
|
Interval tmp;
|
||||||
|
const Interval *cur;
|
||||||
|
Interval *last;
|
||||||
|
int ai=0, bi=0;
|
||||||
|
if (a_arr.size() < 1) { *r_arr = b_arr; return; }
|
||||||
|
if (b_arr.size() < 1) { *r_arr = a_arr; return; }
|
||||||
|
r_arr->clear();
|
||||||
|
|
||||||
|
if (a_arr[ai].left < b_arr[bi].left) tmp = a_arr[ai++];
|
||||||
|
else tmp = b_arr[bi++];
|
||||||
|
last = &tmp;
|
||||||
|
while(ai < a_arr.size() && bi < b_arr.size()) {
|
||||||
|
if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
|
||||||
|
else cur = &b_arr[bi++];
|
||||||
|
if (last->right < cur->left) {
|
||||||
|
r_arr->push_back(*last);
|
||||||
|
*last = *cur;
|
||||||
|
} else {
|
||||||
|
last->right = max(last->right, cur->right);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const IntervalArr *arrp;
|
||||||
|
int ii;
|
||||||
|
if (ai < a_arr.size()) { arrp = &a_arr; ii = ai; }
|
||||||
|
else { arrp = &b_arr; ii = bi; }
|
||||||
|
const IntervalArr &arr = *arrp;
|
||||||
|
|
||||||
|
while(ii < arr.size()) {
|
||||||
|
cur = &arr[ii++];
|
||||||
|
if (last->right < cur->left) {
|
||||||
|
r_arr->push_back(*last);
|
||||||
|
*last = *cur;
|
||||||
|
} else {
|
||||||
|
last->right = max(last->right, cur->right);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
r_arr->push_back(*last);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集
|
||||||
|
*/
|
||||||
|
int64_t Interval::MergeIntervals(const IntervalArr &n_arr,
|
||||||
|
const IntervalArr &t_arr,
|
||||||
|
IntervalArr &in_arr,
|
||||||
|
int64_t start_loc, // 闭区间
|
||||||
|
int64_t *end_loc, // 开区间
|
||||||
|
IntervalArr *r_arr) {
|
||||||
|
IntervalArr tmp_arr;
|
||||||
|
const int64_t end_loc_val = *end_loc;
|
||||||
|
if (in_arr.size() < 1) { // 如果输入的interval为空,则使用tumor normal覆盖的interval
|
||||||
|
UnionIntervals(n_arr, t_arr, &tmp_arr);
|
||||||
|
} else {
|
||||||
|
IntervalArr mid_arr;
|
||||||
|
UnionIntervals(n_arr, t_arr, &mid_arr);
|
||||||
|
IntersectIntervals(mid_arr, in_arr, &tmp_arr);
|
||||||
|
}
|
||||||
|
for(int i=tmp_arr.size()-1; i>=0; --i) {
|
||||||
|
if (tmp_arr[i].left >= end_loc_val) {
|
||||||
|
tmp_arr.pop_back(); // 删除该元素
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
tmp_arr[i].right = min(tmp_arr[i].right, end_loc_val - 1); // end_loc是开区间
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
for (int i=0; i<tmp_arr.size(); ++i) {
|
||||||
|
if (tmp_arr[i].right < start_loc) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (tmp_arr[i].left < start_loc) {
|
||||||
|
r_arr->push_back(Interval(start_loc, tmp_arr[i].right));
|
||||||
|
} else {
|
||||||
|
r_arr->push_back(tmp_arr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int next_i = 0;
|
||||||
|
while(next_i < in_arr.size() && in_arr[next_i].right < end_loc_val) ++next_i;
|
||||||
|
if (next_i < in_arr.size()) {
|
||||||
|
if (end_loc_val < in_arr[next_i].left) {
|
||||||
|
*end_loc = in_arr[next_i].left; // 更新本次处理的终点
|
||||||
|
} else {
|
||||||
|
in_arr[next_i].left = end_loc_val; // 更新panel
|
||||||
|
}
|
||||||
|
int i=0, j=next_i;
|
||||||
|
for (; j<in_arr.size(); ++i, ++j) {
|
||||||
|
in_arr[i] = in_arr[j];
|
||||||
|
}
|
||||||
|
in_arr.resize(i);
|
||||||
|
} else {
|
||||||
|
in_arr.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t locus_num = 0;
|
||||||
|
for (int i=0; i<r_arr->size(); ++i) {
|
||||||
|
locus_num += (*r_arr)[i].right - (*r_arr)[i].left + 1;
|
||||||
|
}
|
||||||
|
return locus_num;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 读取interval文件
|
||||||
|
*/
|
||||||
|
void Interval::ReadInterval(const string &interval_fn,
|
||||||
|
bam_hdr_t* header,
|
||||||
|
int interval_padding,
|
||||||
|
IntervalArr *r_arr) {
|
||||||
|
ifstream interval_fs(interval_fn);
|
||||||
|
string one_line;
|
||||||
|
IntervalArr intervals;
|
||||||
|
getline(interval_fs, one_line);
|
||||||
|
while (!interval_fs.eof()) {
|
||||||
|
if (one_line[0] == '@') {
|
||||||
|
getline(interval_fs, one_line);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
stringstream ss_line(one_line);
|
||||||
|
string contig_name;
|
||||||
|
ss_line >> contig_name;
|
||||||
|
int itid = sam_hdr_name2tid(header, contig_name.c_str());
|
||||||
|
if (itid < 0) error("[%s] interval file has unknown contig name [%s]\n", __func__, contig_name.c_str());
|
||||||
|
int64_t tid = (int64_t)itid;
|
||||||
|
tid <<= CONTIG_SHIFT;
|
||||||
|
int64_t start, stop;
|
||||||
|
ss_line >> start >> stop;
|
||||||
|
// interval文件是1-based,所以这里要减去1
|
||||||
|
intervals.push_back(Interval(tid + start - 1, tid + stop -1));
|
||||||
|
getline(interval_fs, one_line);
|
||||||
|
}
|
||||||
|
sort(intervals.begin(), intervals.end());
|
||||||
|
if (intervals.size() > 0) {
|
||||||
|
Interval new_span(intervals[0].left-interval_padding, intervals[0].right+interval_padding);
|
||||||
|
for (int i=1; i<intervals.size(); ++i) {
|
||||||
|
if (intervals[i].left - interval_padding > new_span.right) {
|
||||||
|
r_arr->push_back(new_span);
|
||||||
|
new_span.left = intervals[i].left - interval_padding;
|
||||||
|
new_span.right = intervals[i].right + interval_padding;
|
||||||
|
} else {
|
||||||
|
new_span.right = max(new_span.right, intervals[i].right + interval_padding);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
r_arr->push_back(new_span);
|
||||||
|
}
|
||||||
|
interval_fs.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 将interval相连的区域合并
|
||||||
|
*/
|
||||||
|
void Interval::ShrinkInterval(IntervalArr *ivap) {
|
||||||
|
if (ivap->size() < 1) return;
|
||||||
|
IntervalArr &iva = *ivap;
|
||||||
|
IntervalArr tiva = iva;
|
||||||
|
iva.clear();
|
||||||
|
Interval iv;
|
||||||
|
iv.left = tiva[0].left;
|
||||||
|
iv.right = tiva[0].right;
|
||||||
|
for (int i=1; i<tiva.size(); ++i) {
|
||||||
|
if (iv.right+1 < tiva[i].left) {
|
||||||
|
iva.push_back(iv);
|
||||||
|
iv.left = tiva[i].left;
|
||||||
|
}
|
||||||
|
iv.right = tiva[i].right;
|
||||||
|
}
|
||||||
|
iva.push_back(iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 根据header信息,扩展interval
|
||||||
|
*/
|
||||||
|
Interval Interval::ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header) {
|
||||||
|
Interval result;
|
||||||
|
result.left = start;
|
||||||
|
result.right = end;
|
||||||
|
|
||||||
|
int64_t ext_left = start - expandVal;
|
||||||
|
int64_t ext_right = end + expandVal;
|
||||||
|
int tid = BamWrap::bam_tid(start);
|
||||||
|
uint32_t contig_len = header->target_len[tid];
|
||||||
|
result.left = max(BamWrap::bam_global_pos(tid, 0), ext_left);
|
||||||
|
result.right = min(ext_right, contig_len - 1 + BamWrap::bam_global_pos(tid, 0));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,108 @@
|
||||||
|
/*
|
||||||
|
Description: 处理intervals
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2019/11/24
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef INTERVAL_H_
|
||||||
|
#define INTERVAL_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
|
||||||
|
#include "bam_wrap.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Forward declaration. Uses the `struct` key to match the definition below and
// so avoid mismatched-tag warnings.
struct Interval;
typedef std::vector<Interval> IntervalArr;
|
||||||
|
/*
|
||||||
|
* 闭区间
|
||||||
|
*/
|
||||||
|
struct Interval {
|
||||||
|
// const常量
|
||||||
|
const static int CONTIG_SHIFT = 40;
|
||||||
|
const static uint64_t POS_MASK = (1LL << CONTIG_SHIFT) - 1;
|
||||||
|
|
||||||
|
// 类变量
|
||||||
|
int64_t left;
|
||||||
|
int64_t right;
|
||||||
|
|
||||||
|
// 构造函数
|
||||||
|
Interval();
|
||||||
|
explicit Interval(int64_t l, int64_t r);
|
||||||
|
explicit Interval(int tid, int l, int r) : Interval(BamWrap::bam_global_pos(tid, l), BamWrap::bam_global_pos(tid, r)) {}
|
||||||
|
// 获取tid
|
||||||
|
int contig() { return left >> CONTIG_SHIFT; }
|
||||||
|
int contigLeft() { return left & POS_MASK; }
|
||||||
|
int contigRight() { return right & POS_MASK; }
|
||||||
|
|
||||||
|
// 比较函数
|
||||||
|
bool operator<(const Interval &other);
|
||||||
|
// 是否有重叠
|
||||||
|
bool overlaps(const Interval &other);
|
||||||
|
// 两个interval的合并, 会改变当前interval
|
||||||
|
Interval& spanWith(const Interval &other);
|
||||||
|
// 返回两个interval的交集,不改变当前interval
|
||||||
|
Interval intersect(const Interval &that) const;
|
||||||
|
|
||||||
|
// for debug
|
||||||
|
string toString() const {
|
||||||
|
ostringstream oss;
|
||||||
|
oss << BamWrap::bam_tid(left) + 1 << ":"
|
||||||
|
<< BamWrap::bam_pos(left) + 1 << "-"
|
||||||
|
<< BamWrap::bam_pos(right) + 1;
|
||||||
|
|
||||||
|
return oss.str();
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* 合并两个interval arr,取相交区域的交集, interval arr都是排序后的
|
||||||
|
*/
|
||||||
|
static void IntersectIntervals(const IntervalArr &a_arr,
|
||||||
|
const IntervalArr &b_arr,
|
||||||
|
IntervalArr *r_arr);
|
||||||
|
/*
|
||||||
|
* 合并两个interval arr,相交的区域取并集
|
||||||
|
*/
|
||||||
|
static void UnionIntervals(const IntervalArr &a_arr,
|
||||||
|
const IntervalArr &b_arr,
|
||||||
|
IntervalArr *r_arr);
|
||||||
|
/*
|
||||||
|
* 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集
|
||||||
|
*/
|
||||||
|
static int64_t MergeIntervals(const IntervalArr &n_arr,
|
||||||
|
const IntervalArr &t_arr,
|
||||||
|
IntervalArr &in_arr, // 会更改
|
||||||
|
int64_t start_loc, // 闭区间
|
||||||
|
int64_t *end_loc, // 开区间, 会更改
|
||||||
|
IntervalArr *r_arr);
|
||||||
|
/*
|
||||||
|
* 读取interval文件
|
||||||
|
*/
|
||||||
|
static void ReadInterval(const std::string &interval_fn,
|
||||||
|
bam_hdr_t* header,
|
||||||
|
int interval_padding,
|
||||||
|
IntervalArr *r_arr);
|
||||||
|
/*
|
||||||
|
* 将interval相连的区域合并
|
||||||
|
*/
|
||||||
|
static void ShrinkInterval(IntervalArr *iva);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 根据header信息,扩展interval
|
||||||
|
*/
|
||||||
|
static Interval ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,292 @@
|
||||||
|
/*
|
||||||
|
Description: vcf index utils
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2019/11/23
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "linear_index.h"
|
||||||
|
#include "bam_wrap.h"
|
||||||
|
#include "binary_util.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LinearIndex 类
|
||||||
|
*/
|
||||||
|
// 搜索区间参数对应的数据区域,闭区间
|
||||||
|
void LinearIndex::SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len) {
|
||||||
|
int stid, spos, etid, epos, stid_origin, spos_origin, etid_origin, epos_origin;
|
||||||
|
stid = stid_origin = BamWrap::bam_tid(start);
|
||||||
|
spos = spos_origin = BamWrap::bam_pos(start);
|
||||||
|
etid = etid_origin = BamWrap::bam_tid(end);
|
||||||
|
epos = epos_origin = BamWrap::bam_pos(end);
|
||||||
|
|
||||||
|
while (stid < idx_.size() && idx_[stid].size() == 0) ++stid;
|
||||||
|
while (etid >= 0 && idx_[etid].size() == 0) --etid;
|
||||||
|
if (stid > etid || stid >= idx_.size()) {
|
||||||
|
*content_len = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int sb_idx, eb_idx;
|
||||||
|
if (stid == stid_origin) {
|
||||||
|
sb_idx = spos / idx_[stid].binWidth;
|
||||||
|
if (sb_idx >= idx_[stid].size()) { // 开始区域没有对应的block
|
||||||
|
sb_idx = idx_[stid].size() - 1;
|
||||||
|
Block& sb = idx_[stid][sb_idx];
|
||||||
|
*file_pos = sb.startPosition + sb.size;
|
||||||
|
} else {
|
||||||
|
Block& sb = idx_[stid][sb_idx];
|
||||||
|
*file_pos = sb.startPosition;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
sb_idx = 0;
|
||||||
|
Block& sb = idx_[stid][sb_idx];
|
||||||
|
*file_pos = sb.startPosition;
|
||||||
|
}
|
||||||
|
if (etid == etid_origin) {
|
||||||
|
eb_idx = epos / idx_[etid].binWidth;
|
||||||
|
if (eb_idx >= idx_[etid].size()) {
|
||||||
|
eb_idx = idx_[etid].size() - 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
eb_idx = idx_[etid].size() - 1;
|
||||||
|
}
|
||||||
|
Block& eb = idx_[etid][eb_idx];
|
||||||
|
if (eb.startPosition + eb.size > *file_pos)
|
||||||
|
*content_len = eb.startPosition + eb.size - *file_pos;
|
||||||
|
else
|
||||||
|
*content_len = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 读入index文件信息
|
||||||
|
bool LinearIndex::ReadIndex(const string& idx_fn) {
|
||||||
|
if (bam_hdr_ == NULL)
|
||||||
|
return false;
|
||||||
|
ifstream ifs(idx_fn, ios::in | ios::binary);
|
||||||
|
// 如果idx文件不存在,则创建 todo
|
||||||
|
if (!ifs.good())
|
||||||
|
return false;
|
||||||
|
// 全部读进来再处理
|
||||||
|
uint64_t fsize = 0;
|
||||||
|
uint64_t cur = 0;
|
||||||
|
ifs.seekg(0, ios::end);
|
||||||
|
fsize = (uint64_t)ifs.tellg();
|
||||||
|
ifs.seekg(0, ios::beg);
|
||||||
|
char* buf = (char*)malloc(fsize + 10);
|
||||||
|
if (!ifs.read(buf, fsize)) {
|
||||||
|
free(buf);
|
||||||
|
ifs.close();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
ifs.close();
|
||||||
|
int int_val, version, flags;
|
||||||
|
uint64_t long_val;
|
||||||
|
string str_val;
|
||||||
|
|
||||||
|
// read index header
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
|
||||||
|
// cout << "magic: " << int_val << endl;
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
|
||||||
|
if (int_val != INDEX_TYPE) {
|
||||||
|
free(buf);
|
||||||
|
ifs.close();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// cout << "type: " << int_val << endl;
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &version);
|
||||||
|
// cout << "version: " << version << endl;
|
||||||
|
BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);
|
||||||
|
// cout << "path: " << str_val << endl;
|
||||||
|
BinaryUtil::ReadLong(buf, fsize, &cur, &vcf_fsize); // vcf file size
|
||||||
|
// cout << "file size: " << long_val << endl;
|
||||||
|
BinaryUtil::ReadLong(buf, fsize, &cur, &long_val); // test number
|
||||||
|
BinaryUtil::ReadStr(buf, fsize, &cur, &str_val); // md5
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &flags);
|
||||||
|
|
||||||
|
if (version < 3 && (flags & 0x8000) == 0x8000) {
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
|
||||||
|
if (int_val < 0)
|
||||||
|
return false;
|
||||||
|
for (int i = 0; i < int_val; ++i) {
|
||||||
|
BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (version >= 3) {
|
||||||
|
int nProperties;
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &nProperties);
|
||||||
|
string key, val;
|
||||||
|
while (nProperties-- > 0) {
|
||||||
|
BinaryUtil::ReadStr(buf, fsize, &cur, &key);
|
||||||
|
BinaryUtil::ReadStr(buf, fsize, &cur, &val);
|
||||||
|
properties_[key] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// read index content
|
||||||
|
int n_chrom;
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &n_chrom);
|
||||||
|
int last_tid = 0;
|
||||||
|
string last_name;
|
||||||
|
int bam_tid = 0; // tid从0开始,索引的染色体也要补充一个数据, 使得idx_ vector与tid一一对应
|
||||||
|
while (n_chrom-- > 0) {
|
||||||
|
ChrIndex ci;
|
||||||
|
BinaryUtil::ReadStr(buf, fsize, &cur, &ci.name);
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &ci.binWidth);
|
||||||
|
int nBins;
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &nBins);
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &ci.longestFeature);
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
|
||||||
|
BinaryUtil::ReadInt(buf, fsize, &cur, &ci.nFeatures);
|
||||||
|
ci.blocks.reserve(nBins);
|
||||||
|
int tid = sam_hdr_name2tid(bam_hdr_, ci.name.c_str());
|
||||||
|
if (tid < 0) {
|
||||||
|
if (ci.name == last_name) {
|
||||||
|
tid = last_tid;
|
||||||
|
} else {
|
||||||
|
tid = ++last_tid;
|
||||||
|
last_name = ci.name;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (bam_tid + 1 < tid) {
|
||||||
|
idx_.push_back(ChrIndex()); // 补充空染色体
|
||||||
|
++bam_tid;
|
||||||
|
}
|
||||||
|
bam_tid = tid;
|
||||||
|
ci.tid = tid;
|
||||||
|
uint64_t pos;
|
||||||
|
uint64_t nextPos;
|
||||||
|
BinaryUtil::ReadLong(buf, fsize, &cur, &pos);
|
||||||
|
for (int i = 0; i < nBins; ++i) {
|
||||||
|
BinaryUtil::ReadLong(buf, fsize, &cur, &nextPos);
|
||||||
|
ci.blocks.push_back(Block(pos, nextPos - pos));
|
||||||
|
pos = nextPos;
|
||||||
|
}
|
||||||
|
idx_.push_back(ci);
|
||||||
|
}
|
||||||
|
sort(idx_.begin(), idx_.end());
|
||||||
|
ifs.close();
|
||||||
|
free(buf);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LinearIndex::ChrIndex 类
|
||||||
|
*/
|
||||||
|
// Serialises this contig's index: name, bin width, block count, longest
// feature, a zero int, feature count, then the start offset of every block
// followed by the end offset of the last block (0 when there are no blocks).
void LinearIndex::ChrIndex::write(ofstream& out) const {
    BinaryUtil::WriteStr(out, name);
    BinaryUtil::WriteInt(out, binWidth);
    BinaryUtil::WriteInt(out, blocks.size());
    BinaryUtil::WriteInt(out, longestFeature);
    BinaryUtil::WriteInt(out, 0);
    BinaryUtil::WriteInt(out, nFeatures);

    for (const Block& blk : blocks) BinaryUtil::WriteLong(out, blk.startPosition);

    uint64_t end_offset = 0;
    if (!blocks.empty()) end_offset = blocks.back().startPosition + blocks.back().size;
    BinaryUtil::WriteLong(out, end_offset);
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LinearIndexCreator类
|
||||||
|
*/
|
||||||
|
// 初始化索引文件头部信息
|
||||||
|
void LinearIndexCreator::InitHeaderDict(bam_hdr_t* hdr) {
|
||||||
|
for (int i = 0; i < hdr->n_targets; ++i) {
|
||||||
|
v_contig_name_.push_back(hdr->target_name[i]);
|
||||||
|
contig_name_to_id_[hdr->target_name[i]] = i;
|
||||||
|
contig_len_[hdr->target_name[i]] = hdr->target_len[i];
|
||||||
|
++n_properties_;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 添加一条vcf记录
|
||||||
|
void LinearIndexCreator::AddFeature(const Feature& ft, uint64_t f_pos) { // f_pos是vcf文件当前正要写入的位置
|
||||||
|
if (idx_.size() == 0 || idx_.back().tid != ft.tid) {
|
||||||
|
if (idx_.size() > 0) {
|
||||||
|
for (int i = 0; i < blocks_.size() - 1; ++i) {
|
||||||
|
blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition;
|
||||||
|
idx_.back().blocks.push_back(blocks_[i]);
|
||||||
|
}
|
||||||
|
blocks_.back().size = f_pos - blocks_.back().startPosition;
|
||||||
|
idx_.back().blocks.push_back(blocks_.back());
|
||||||
|
}
|
||||||
|
idx_.push_back(LinearIndex::ChrIndex(v_contig_name_[ft.tid], ft.tid, bin_width_));
|
||||||
|
blocks_.clear();
|
||||||
|
blocks_.push_back(Block(f_pos, 0));
|
||||||
|
longest_feature_ = 0;
|
||||||
|
}
|
||||||
|
while (ft.start > blocks_.size() * bin_width_) blocks_.push_back(Block(f_pos, 0));
|
||||||
|
if (ft.FeatureLen() > longest_feature_) {
|
||||||
|
longest_feature_ = ft.FeatureLen();
|
||||||
|
idx_.back().longestFeature = longest_feature_;
|
||||||
|
}
|
||||||
|
++idx_.back().nFeatures;
|
||||||
|
++FEATURE_COUNT_;
|
||||||
|
all_feature_len += ft.FeatureLen();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 添加所有记录完毕
|
||||||
|
void LinearIndexCreator::FinalizeIndex(uint64_t f_pos) {
|
||||||
|
if (f_pos == 0)
|
||||||
|
error("[%s] Error: finalize index failed\n", __func__);
|
||||||
|
if (blocks_.size() > 0) {
|
||||||
|
for (int i = 0; i < blocks_.size() - 1; ++i) {
|
||||||
|
blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition;
|
||||||
|
idx_.back().blocks.push_back(blocks_[i]);
|
||||||
|
}
|
||||||
|
blocks_.back().size = f_pos - blocks_.back().startPosition;
|
||||||
|
idx_.back().blocks.push_back(blocks_.back());
|
||||||
|
blocks_.clear();
|
||||||
|
}
|
||||||
|
FEATURE_LENGTH_MEAN_ = (double)all_feature_len / (double)FEATURE_COUNT_;
|
||||||
|
index_file_size_ = f_pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 写入index文件
|
||||||
|
void LinearIndexCreator::WriteIndex(const string& out_fn) {
|
||||||
|
ofstream out;
|
||||||
|
out.open(out_fn, ios::out | ios::binary);
|
||||||
|
|
||||||
|
BinaryUtil::WriteInt(out, MAGIC_NUMBER);
|
||||||
|
BinaryUtil::WriteInt(out, LinearIndex::INDEX_TYPE);
|
||||||
|
BinaryUtil::WriteInt(out, LinearIndex::INDEX_VERSION);
|
||||||
|
BinaryUtil::WriteStr(out, out_fn);
|
||||||
|
BinaryUtil::WriteLong(out, index_file_size_);
|
||||||
|
BinaryUtil::WriteLong(out, -1);
|
||||||
|
BinaryUtil::WriteStr(out, "");
|
||||||
|
BinaryUtil::WriteInt(out, flags_);
|
||||||
|
n_properties_ += 4;
|
||||||
|
BinaryUtil::WriteInt(out, n_properties_);
|
||||||
|
ostringstream oss;
|
||||||
|
oss << FEATURE_LENGTH_MEAN_;
|
||||||
|
BinaryUtil::WriteStr(out, "FEATURE_LENGTH_MEAN");
|
||||||
|
BinaryUtil::WriteStr(out, oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss << FEATURE_LENGTH_STD_DEV_;
|
||||||
|
BinaryUtil::WriteStr(out, "FEATURE_LENGTH_STD_DEV");
|
||||||
|
BinaryUtil::WriteStr(out, oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss << MEAN_FEATURE_VARIANCE_;
|
||||||
|
BinaryUtil::WriteStr(out, "MEAN_FEATURE_VARIANCE");
|
||||||
|
BinaryUtil::WriteStr(out, oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss << FEATURE_COUNT_;
|
||||||
|
BinaryUtil::WriteStr(out, "FEATURE_COUNT");
|
||||||
|
BinaryUtil::WriteStr(out, oss.str());
|
||||||
|
for (int i = 0; i < v_contig_name_.size(); ++i) {
|
||||||
|
const string& cn = v_contig_name_[i];
|
||||||
|
oss.str("");
|
||||||
|
oss << contig_len_[cn];
|
||||||
|
BinaryUtil::WriteStr(out, "DICT:" + cn);
|
||||||
|
BinaryUtil::WriteStr(out, oss.str());
|
||||||
|
}
|
||||||
|
BinaryUtil::WriteInt(out, idx_.size());
|
||||||
|
for (int i = 0; i < idx_.size(); ++i) {
|
||||||
|
const LinearIndex::ChrIndex& ci = idx_[i];
|
||||||
|
ci.write(out);
|
||||||
|
}
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,129 @@
|
||||||
|
/*
|
||||||
|
Description: vcf index utils
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2019/11/23
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef INDEX_H_
|
||||||
|
#define INDEX_H_
|
||||||
|
|
||||||
|
#include <htslib/sam.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
|
using std::ifstream;
|
||||||
|
using std::ios;
|
||||||
|
using std::ofstream;
|
||||||
|
using std::ostringstream;
|
||||||
|
using std::sort;
|
||||||
|
using std::string;
|
||||||
|
using std::unordered_map;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
using std::cout;
|
||||||
|
using std::endl;
|
||||||
|
|
||||||
|
// One index block: `size` bytes of the indexed file starting at offset
// `startPosition`.
struct Block {
    Block(uint64_t start, uint64_t si) : startPosition{start}, size{si} {}

    uint64_t startPosition;
    uint64_t size;
};
|
||||||
|
|
||||||
|
// One indexed record: contig id plus a closed [start, end] range.
struct Feature {
    int tid;
    int start;  // inclusive
    int end;    // inclusive

    Feature(int ti, int b, int e) : tid{ti}, start{b}, end{e} {}

    // Number of positions covered; both endpoints count.
    inline int FeatureLen() const { return 1 + (end - start); }
};
|
||||||
|
|
||||||
|
struct LinearIndex {
|
||||||
|
const static int MAX_FEATURES_PER_BIN = 100;
|
||||||
|
const static int INDEX_TYPE = 1;
|
||||||
|
const static int INDEX_VERSION = 3;
|
||||||
|
const static int MAX_BIN_WIDTH = 1024000;
|
||||||
|
|
||||||
|
LinearIndex() {}
|
||||||
|
LinearIndex(bam_hdr_t* hdr) : bam_hdr_(hdr) {}
|
||||||
|
void SetHeader(bam_hdr_t* hdr) { bam_hdr_ = hdr; }
|
||||||
|
|
||||||
|
class ChrIndex;
|
||||||
|
vector<ChrIndex> idx_;
|
||||||
|
vector<string> vkey_;
|
||||||
|
vector<string> vval_;
|
||||||
|
unordered_map<string, string> properties_;
|
||||||
|
bam_hdr_t* bam_hdr_ = NULL; // 这个应该换成bcf_hdr_t
|
||||||
|
uint64_t vcf_fsize = 0;
|
||||||
|
|
||||||
|
// 染色体索引信息
|
||||||
|
struct ChrIndex {
|
||||||
|
string name;
|
||||||
|
int tid;
|
||||||
|
int binWidth;
|
||||||
|
int longestFeature = 0;
|
||||||
|
int nFeatures = 0;
|
||||||
|
vector<Block> blocks;
|
||||||
|
ChrIndex() {};
|
||||||
|
ChrIndex(string& n, int ti, int bw) : name(n), tid(ti), binWidth(bw) {}
|
||||||
|
inline bool operator<(const ChrIndex& ci) const { return tid < ci.tid; };
|
||||||
|
inline Block& operator[](int pos) { return blocks[pos]; }
|
||||||
|
inline int size() { return blocks.size(); }
|
||||||
|
void write(ofstream& out) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
inline ChrIndex& operator[](int tid) { return idx_[tid]; }
|
||||||
|
|
||||||
|
// 闭区间
|
||||||
|
void SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len);
|
||||||
|
|
||||||
|
// 读入index文件信息
|
||||||
|
bool ReadIndex(const string& idx_fn);
|
||||||
|
};
|
||||||
|
|
||||||
|
// 根据vcf数据创建index文件
|
||||||
|
struct LinearIndexCreator {
|
||||||
|
const static int INDEX_VERSION = LinearIndex::INDEX_VERSION;
|
||||||
|
const static int MAGIC_NUMBER = 1480870228;
|
||||||
|
const static int DEFAULT_BIN_WIDTH = 8000;
|
||||||
|
int bin_width_ = DEFAULT_BIN_WIDTH;
|
||||||
|
string input_vcf_fn_;
|
||||||
|
string output_index_fn_;
|
||||||
|
int longest_feature_ = 0;
|
||||||
|
uint64_t index_file_size_ = 0;
|
||||||
|
int flags_ = 0;
|
||||||
|
int n_properties_ = 0;
|
||||||
|
float FEATURE_LENGTH_MEAN_ = 0.0;
|
||||||
|
float FEATURE_LENGTH_STD_DEV_ = 0.0;
|
||||||
|
float MEAN_FEATURE_VARIANCE_ = 0.0;
|
||||||
|
uint64_t FEATURE_COUNT_ = 0;
|
||||||
|
uint64_t all_feature_len = 0;
|
||||||
|
|
||||||
|
unordered_map<string, int> contig_name_to_id_;
|
||||||
|
unordered_map<string, int> contig_len_;
|
||||||
|
vector<string> v_contig_name_;
|
||||||
|
vector<LinearIndex::ChrIndex> idx_;
|
||||||
|
vector<Block> blocks_;
|
||||||
|
|
||||||
|
// 根据sam header初始化索引文件头部信息
|
||||||
|
void InitHeaderDict(bam_hdr_t* hdr);
|
||||||
|
// 添加一条记录
|
||||||
|
void AddFeature(const Feature& ft, uint64_t f_pos); // f_pos是vcf文件当前正要写入的位置
|
||||||
|
// 添加记录完毕
|
||||||
|
void FinalizeIndex(uint64_t f_pos);
|
||||||
|
// 写入index文件
|
||||||
|
void WriteIndex(const string& out_fn);
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
Description: Murmur
|
Description: Murmur哈希
|
||||||
|
|
||||||
Copyright : All right reserved by ICT
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -73,12 +73,12 @@ int DisplayProfiling(int nthread) {
|
||||||
PRINT_GP(write);
|
PRINT_GP(write);
|
||||||
PRINT_GP(whole_process);
|
PRINT_GP(whole_process);
|
||||||
|
|
||||||
PRINT_TP(gen, nthread);
|
// PRINT_TP(gen, nthread);
|
||||||
PRINT_TP(sort_frag, nthread);
|
// PRINT_TP(sort_frag, nthread);
|
||||||
PRINT_TP(sort_pair, nthread);
|
// PRINT_TP(sort_pair, nthread);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue