BWA-FastAlign/fmt_idx.h

121 lines
4.9 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: fmt-idxseed (fm-index twice extend in one search step)
Copyright : All right reserved by
Author :
Date : 2023/12/24
*/
#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_
#include <stdint.h>
#include <stddef.h>
#include "bwt.h"
#include "utils.h"
#define FMT_OCC_INTV_SHIFT 8
#define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 6
#define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
// c
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// kbwt str$check point occkOCC_INTERVAL
#if FMT_MID_INTERVAL == 8
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 144))
#elif FMT_MID_INTERVAL == 16
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80))
#elif FMT_MID_INTERVAL == 32
//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8) * 80)
#elif FMT_MID_INTERVAL == 64
//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6))
#elif FMT_MID_INTERVAL == 128
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
#else
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 20))
#endif
// valbwt basebpre-bwtT G C A328bitoccocc
#define __fmt_occ_e2_aux2(fmt, b, val) \
((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[b][(val) >> 8 & 0xff] + (fmt)->cnt_occ[b][(val) >> 16 & 0xff] + (fmt)->cnt_occ[b][(val) >> 24])
#define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
// sa
#define SA_INTV 4
// fm-index, extend twice in one search step (one memory access)
typedef struct
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t sec_primary; // second primary line
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_occ[16][256]; // 16-24bocc8-16bocc0-8aoccba
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 2bitint0,1,2,3
uint8_t last_base; // dollarbase
// ref pac
bwtint_t l_pac; //
uint8_t *pac; // 2bit
// kmerfmt
KmerHash kmer_hash;
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
} FMTIndex;
// fmt
void dump_fmt(const char *fn, const FMTIndex *fmt);
// fmt
FMTIndex *fmt_restore_fmt(const char *fn);
void fmt_create_kmer_index(FMTIndex *fmt);
// kmer hash
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
// kmer hash
KmerHash fmt_restore_kmer_idx(const char *fn);
// sa
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
// interval-bwtfmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt);
// bwt basebpre-bwt strocc
void fmt_direct_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]);
void fmt_direct2_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]);
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]);
//
void fmt_direct2_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2);
void fmt_direct_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2);
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2);
//
void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
// KMER_LEN
// void gen_all_seq(char **seq_arr, int kmer_len);
// kmerposfmt
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
// kmerfmt
void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
// smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec);
int fmt_smem_2(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_t *lm, bwtintv_v *mem, bwtintv_v *tmpvec, uint32_v qev);
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k);
bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k);
void fmt_gen_cnt_occ(FMTIndex *fmt);
#endif