/* Description: vcf index utils Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2019/11/23 */
#ifndef INDEX_H_
#define INDEX_H_

// NOTE(review): the header names of the following nine includes are not
// visible in this view of the file; restore them before compiling.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "utils.h"

// NOTE(review): these using-declarations sit at global scope in a header, so
// every translation unit that includes this file gets the names injected.
using std::ifstream;
using std::ios;
using std::ofstream;
using std::ostringstream;
using std::sort;
using std::string;
using std::unordered_map;
using std::vector;
using std::cout;
using std::endl;

// A (start position, size) pair.
// Presumably describes a byte range of the indexed VCF file -- TODO confirm
// against the code that fills ChrIndex::blocks.
struct Block {
    // Stores `start` as startPosition and `si` as size.
    Block(uint64_t start, uint64_t si) : startPosition(start), size(si) {}
    uint64_t startPosition;
    uint64_t size;
};

// One record's location: a target id plus a closed [start, end] interval.
struct Feature {
    int tid;
    int start;  // closed interval (inclusive)
    int end;    // closed interval (inclusive)
    Feature(int ti, int b, int e) : tid(ti), start(b), end(e) {}
    // Length of the closed interval; the +1 accounts for `end` being inclusive.
    inline int FeatureLen() const { return end - start + 1; }
};

// Linear index data: per-chromosome entries (idx_), property storage
// (properties_) and a pointer to the header. SearchInterval, ReadIndex and
// ChrIndex::write are only declared here; their definitions are not visible
// in this header.
struct LinearIndex {
    const static int MAX_FEATURES_PER_BIN = 100;
    const static int INDEX_TYPE = 1;
    const static int INDEX_VERSION = 3;  // also reused by LinearIndexCreator::INDEX_VERSION
    const static int MAX_BIN_WIDTH = 1024000;

    // Leaves bam_hdr_ at its in-class default (NULL).
    LinearIndex() {}
    // Stores `hdr` in bam_hdr_. NOTE(review): single-argument and not
    // `explicit`, so bam_hdr_t* converts implicitly to LinearIndex.
    LinearIndex(bam_hdr_t* hdr) : bam_hdr_(hdr) {}
    // Replaces the stored header pointer with `hdr`.
    void SetHeader(bam_hdr_t* hdr) { bam_hdr_ = hdr; }

    // NOTE(review): forward-declared with `class` but defined with `struct`
    // below; legal, but some compilers warn about the mismatched tag.
    class ChrIndex;

    // NOTE(review): the template arguments of the containers below are not
    // visible in this view of the file. operator[] further down returns
    // idx_[tid] as ChrIndex&, so idx_ evidently holds ChrIndex elements.
    vector idx_;
    vector vkey_;
    vector vval_;
    unordered_map properties_;
    // This should be replaced by bcf_hdr_t.
    // No destructor is declared in this struct, so the pointer is presumably
    // not owned here -- TODO confirm.
    bam_hdr_t* bam_hdr_ = NULL;
    uint64_t vcf_fsize = 0;

    // Chromosome index information.
    struct ChrIndex {
        string name;
        int tid;
        int binWidth;
        int longestFeature = 0;
        int nFeatures = 0;
        vector blocks;  // NOTE(review): element type not visible here; operator[] below returns Block&

        // NOTE(review): tid and binWidth have no default member initializer,
        // so this constructor leaves them uninitialized.
        ChrIndex() {};
        // Copies `n` into name and stores `ti` / `bw` as tid / binWidth.
        // NOTE(review): `n` is a non-const reference, so a temporary string
        // cannot be passed.
        ChrIndex(string& n, int ti, int bw) : name(n), tid(ti), binWidth(bw) {}

        // Orders ChrIndex objects by tid only.
        inline bool operator<(const ChrIndex& ci) const { return tid < ci.tid; };
        // Returns blocks[pos]; no range check is done here.
        inline Block& operator[](int pos) { return blocks[pos]; }
        // Returns blocks.size() converted to int.
        // NOTE(review): not const-qualified, so it cannot be called on a
        // const ChrIndex.
        inline int size() { return blocks.size(); }
        // Declared only; presumably serializes this entry to `out` -- TODO
        // confirm against the definition.
        void write(ofstream& out) const;
    };

    // Returns idx_[tid]; no range check is done here.
    inline ChrIndex& operator[](int tid) { return idx_[tid]; }

    // Closed interval.
    // Presumably reports its result through file_pos and content_len --
    // the definition is not visible here, TODO confirm.
    void SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len);

    // Reads the index file information from idx_fn.
    bool ReadIndex(const string& idx_fn);
};

// Creates an index file from VCF data.
struct LinearIndexCreator {
    const static int INDEX_VERSION = LinearIndex::INDEX_VERSION;
    // 1480870228 == 0x58444954, i.e. the bytes "TIDX" when stored
    // little-endian. Presumably the Tribble index magic -- TODO confirm.
    const static int MAGIC_NUMBER = 1480870228;
    const static int DEFAULT_BIN_WIDTH = 8000;

    int bin_width_ = DEFAULT_BIN_WIDTH;
    string input_vcf_fn_;
    string output_index_fn_;
    int longest_feature_ = 0;
    uint64_t index_file_size_ = 0;
    int flags_ = 0;
    int n_properties_ = 0;
    // Despite the constant-style names, the next four are ordinary
    // (non-static, mutable) data members.
    float FEATURE_LENGTH_MEAN_ = 0.0;
    float FEATURE_LENGTH_STD_DEV_ = 0.0;
    float MEAN_FEATURE_VARIANCE_ = 0.0;
    uint64_t FEATURE_COUNT_ = 0;
    uint64_t all_feature_len = 0;

    // NOTE(review): the template arguments of the containers below are not
    // visible in this view of the file.
    unordered_map contig_name_to_id_;
    unordered_map contig_len_;
    vector v_contig_name_;
    vector idx_;
    vector blocks_;

    // Initializes the index file's header information from the SAM header.
    void InitHeaderDict(bam_hdr_t* hdr);

    // Adds one record.
    void AddFeature(const Feature& ft, uint64_t f_pos);  // f_pos is the position in the vcf file that is about to be written

    // Called once all records have been added.
    void FinalizeIndex(uint64_t f_pos);

    // Writes the index file.
    void WriteIndex(const string& out_fn);
};

#endif