/*
Description: vcf index utils

Copyright : All rights reserved by ICT

Author : Zhang Zhonghai
Date : 2019/11/23
*/
|
|
|
|
#ifndef INDEX_H_
|
|
#define INDEX_H_
|
|
|
|
#include <htslib/sam.h>
|
|
#include <stddef.h>
|
|
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
#include "utils.h"
|
|
|
|
using std::ifstream;
|
|
using std::ios;
|
|
using std::ofstream;
|
|
using std::ostringstream;
|
|
using std::sort;
|
|
using std::string;
|
|
using std::unordered_map;
|
|
using std::vector;
|
|
|
|
using std::cout;
|
|
using std::endl;
|
|
|
|
/// One entry of a chromosome's block list: a (start position, size) pair.
///
/// NOTE(review): presumably `startPosition` is a byte offset into the VCF
/// file and `size` is the byte length of the range starting there -- confirm
/// against the implementation file, which is not visible from this header.
struct Block {
  /// @param start value stored in `startPosition`
  /// @param si    value stored in `size`
  Block(uint64_t start, uint64_t si) : startPosition(start), size(si) {}

  uint64_t startPosition;
  uint64_t size;
};
|
|
|
|
/// A feature located on one contig, described by a closed interval
/// [start, end].
struct Feature {
  int tid;    // contig id
  int start;  // first position of the feature (inclusive)
  int end;    // last position of the feature (inclusive)

  /// @param ti value stored in `tid`
  /// @param b  value stored in `start`
  /// @param e  value stored in `end`
  Feature(int ti, int b, int e) : tid(ti), start(b), end(e) {}

  /// Number of positions covered by the feature. Both ends are inclusive,
  /// hence the +1.
  int FeatureLen() const { return end - start + 1; }
};
|
|
|
|
struct LinearIndex {
|
|
const static int MAX_FEATURES_PER_BIN = 100;
|
|
const static int INDEX_TYPE = 1;
|
|
const static int INDEX_VERSION = 3;
|
|
const static int MAX_BIN_WIDTH = 1024000;
|
|
|
|
LinearIndex() {}
|
|
LinearIndex(bam_hdr_t* hdr) : bam_hdr_(hdr) {}
|
|
void SetHeader(bam_hdr_t* hdr) { bam_hdr_ = hdr; }
|
|
|
|
class ChrIndex;
|
|
vector<ChrIndex> idx_;
|
|
vector<string> vkey_;
|
|
vector<string> vval_;
|
|
unordered_map<string, string> properties_;
|
|
bam_hdr_t* bam_hdr_ = NULL; // 这个应该换成bcf_hdr_t
|
|
uint64_t vcf_fsize = 0;
|
|
|
|
// 染色体索引信息
|
|
struct ChrIndex {
|
|
string name;
|
|
int tid;
|
|
int binWidth;
|
|
int longestFeature = 0;
|
|
int nFeatures = 0;
|
|
vector<Block> blocks;
|
|
ChrIndex() {};
|
|
ChrIndex(string& n, int ti, int bw) : name(n), tid(ti), binWidth(bw) {}
|
|
inline bool operator<(const ChrIndex& ci) const { return tid < ci.tid; };
|
|
inline Block& operator[](int pos) { return blocks[pos]; }
|
|
inline int size() { return blocks.size(); }
|
|
void write(ofstream& out) const;
|
|
};
|
|
|
|
inline ChrIndex& operator[](int tid) { return idx_[tid]; }
|
|
|
|
// 闭区间
|
|
void SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len);
|
|
|
|
// 读入index文件信息
|
|
bool ReadIndex(const string& idx_fn);
|
|
};
|
|
|
|
// 根据vcf数据创建index文件
|
|
struct LinearIndexCreator {
|
|
const static int INDEX_VERSION = LinearIndex::INDEX_VERSION;
|
|
const static int MAGIC_NUMBER = 1480870228;
|
|
const static int DEFAULT_BIN_WIDTH = 8000;
|
|
int bin_width_ = DEFAULT_BIN_WIDTH;
|
|
string input_vcf_fn_;
|
|
string output_index_fn_;
|
|
int longest_feature_ = 0;
|
|
uint64_t index_file_size_ = 0;
|
|
int flags_ = 0;
|
|
int n_properties_ = 0;
|
|
float FEATURE_LENGTH_MEAN_ = 0.0;
|
|
float FEATURE_LENGTH_STD_DEV_ = 0.0;
|
|
float MEAN_FEATURE_VARIANCE_ = 0.0;
|
|
uint64_t FEATURE_COUNT_ = 0;
|
|
uint64_t all_feature_len = 0;
|
|
|
|
unordered_map<string, int> contig_name_to_id_;
|
|
unordered_map<string, int> contig_len_;
|
|
vector<string> v_contig_name_;
|
|
vector<LinearIndex::ChrIndex> idx_;
|
|
vector<Block> blocks_;
|
|
|
|
// 根据sam header初始化索引文件头部信息
|
|
void InitHeaderDict(bam_hdr_t* hdr);
|
|
// 添加一条记录
|
|
void AddFeature(const Feature& ft, uint64_t f_pos); // f_pos是vcf文件当前正要写入的位置
|
|
// 添加记录完毕
|
|
void FinalizeIndex(uint64_t f_pos);
|
|
// 写入index文件
|
|
void WriteIndex(const string& out_fn);
|
|
};
|
|
|
|
#endif
|