3512 lines
116 KiB
C
3512 lines
116 KiB
C
|
|
#include "ertseeding.h"
|
||
|
|
|
||
|
|
|
||
|
|
#define smem_lt(a, b) ((a).start == (b).start ? (a).end > (b).end : (a).start < (b).start)
|
||
|
|
KSORT_INIT(mem_smem_ert, mem_t, smem_lt)
|
||
|
|
|
||
|
|
#define smem_lt_2(a, b) ((a).start == (b).start ? (a).end < (b).end : (a).start < (b).start)
|
||
|
|
KSORT_INIT(mem_smem_sort_lt_ert, mem_t, smem_lt_2)
|
||
|
|
|
||
|
|
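/**
 * Leaf-node offset table: three banks of 256 rows (one bank per pointer width: 2B, 3B, 4B), four columns (ACGT).
 * Row ((ptr_width - 2) << 8) + code, column c holds the byte offset, measured from the byte that follows
 * the node's code byte, to the leaf data for character c.
 */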
|
||
|
|
unsigned char leaf_table[256 * 3][4] = {
|
||
|
|
{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,2,0},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,7,2,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,2,0},{0,0,2,0},
|
||
|
|
{7,0,2,0},{0,0,2,0},{0,0,4,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0},
|
||
|
|
{7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,2},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,7,0,2},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,2},{0,0,0,2},
|
||
|
|
{7,0,0,2},{0,0,0,2},{0,0,0,4},{0,0,5,0},
|
||
|
|
{10,0,5,0},{0,0,5,0},{0,0,7,2},{0,10,5,0},
|
||
|
|
{15,10,5,0},{0,10,5,0},{0,12,7,2},{0,0,5,0},
|
||
|
|
{10,0,5,0},{0,0,5,0},{0,0,7,2},{0,0,7,2},
|
||
|
|
{12,0,7,2},{0,0,7,2},{0,0,9,4},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,2},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,7,0,2},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,2},{0,0,0,2},
|
||
|
|
{7,0,0,2},{0,0,0,2},{0,0,0,4},{0,0,0,2},
|
||
|
|
{7,0,0,2},{0,0,0,2},{0,0,0,4},{0,7,0,2},
|
||
|
|
{12,7,0,2},{0,7,0,2},{0,9,0,4},{0,0,0,2},
|
||
|
|
{7,0,0,2},{0,0,0,2},{0,0,0,4},{0,0,0,4},
|
||
|
|
{9,0,0,4},{0,0,0,4},{0,0,0,6},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,2,0},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,7,2,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,2,0},{0,0,2,0},
|
||
|
|
{7,0,2,0},{0,0,2,0},{0,0,4,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0},
|
||
|
|
{7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0},
|
||
|
|
{7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,2,0},
|
||
|
|
{7,0,2,0},{0,0,2,0},{0,0,4,0},{0,7,2,0},
|
||
|
|
{12,7,2,0},{0,7,2,0},{0,9,4,0},{0,0,2,0},
|
||
|
|
{7,0,2,0},{0,0,2,0},{0,0,4,0},{0,0,4,0},
|
||
|
|
{9,0,4,0},{0,0,4,0},{0,0,6,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0},
|
||
|
|
{7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0},
|
||
|
|
{9,4,0,0},{0,4,0,0},{0,6,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,3,0},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,8,3,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,3,0},{0,0,3,0},
|
||
|
|
{8,0,3,0},{0,0,3,0},{0,0,6,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0},
|
||
|
|
{8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,3},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,8,0,3},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,3},{0,0,0,3},
|
||
|
|
{8,0,0,3},{0,0,0,3},{0,0,0,6},{0,0,5,0},
|
||
|
|
{10,0,5,0},{0,0,5,0},{0,0,8,3},{0,10,5,0},
|
||
|
|
{15,10,5,0},{0,10,5,0},{0,13,8,3},{0,0,5,0},
|
||
|
|
{10,0,5,0},{0,0,5,0},{0,0,8,3},{0,0,8,3},
|
||
|
|
{13,0,8,3},{0,0,8,3},{0,0,11,6},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,3},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,8,0,3},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,3},{0,0,0,3},
|
||
|
|
{8,0,0,3},{0,0,0,3},{0,0,0,6},{0,0,0,3},
|
||
|
|
{8,0,0,3},{0,0,0,3},{0,0,0,6},{0,8,0,3},
|
||
|
|
{13,8,0,3},{0,8,0,3},{0,11,0,6},{0,0,0,3},
|
||
|
|
{8,0,0,3},{0,0,0,3},{0,0,0,6},{0,0,0,6},
|
||
|
|
{11,0,0,6},{0,0,0,6},{0,0,0,9},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,3,0},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,8,3,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,3,0},{0,0,3,0},
|
||
|
|
{8,0,3,0},{0,0,3,0},{0,0,6,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0},
|
||
|
|
{8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0},
|
||
|
|
{8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,3,0},
|
||
|
|
{8,0,3,0},{0,0,3,0},{0,0,6,0},{0,8,3,0},
|
||
|
|
{13,8,3,0},{0,8,3,0},{0,11,6,0},{0,0,3,0},
|
||
|
|
{8,0,3,0},{0,0,3,0},{0,0,6,0},{0,0,6,0},
|
||
|
|
{11,0,6,0},{0,0,6,0},{0,0,9,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0},
|
||
|
|
{8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0},
|
||
|
|
{3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},{0,6,0,0},
|
||
|
|
{11,6,0,0},{0,6,0,0},{0,9,0,0},{0,0,0,0},
|
||
|
|
{6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{9,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,4,0},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,9,4,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,4,0},{0,0,4,0},
|
||
|
|
{9,0,4,0},{0,0,4,0},{0,0,8,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0},
|
||
|
|
{9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,4},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,9,0,4},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,4},{0,0,0,4},
|
||
|
|
{9,0,0,4},{0,0,0,4},{0,0,0,8},{0,0,5,0},
|
||
|
|
{10,0,5,0},{0,0,5,0},{0,0,9,4},{0,10,5,0},
|
||
|
|
{15,10,5,0},{0,10,5,0},{0,14,9,4},{0,0,5,0},
|
||
|
|
{10,0,5,0},{0,0,5,0},{0,0,9,4},{0,0,9,4},
|
||
|
|
{14,0,9,4},{0,0,9,4},{0,0,13,8},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,4},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,9,0,4},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,0,4},{0,0,0,4},
|
||
|
|
{9,0,0,4},{0,0,0,4},{0,0,0,8},{0,0,0,4},
|
||
|
|
{9,0,0,4},{0,0,0,4},{0,0,0,8},{0,9,0,4},
|
||
|
|
{14,9,0,4},{0,9,0,4},{0,13,0,8},{0,0,0,4},
|
||
|
|
{9,0,0,4},{0,0,0,4},{0,0,0,8},{0,0,0,8},
|
||
|
|
{13,0,0,8},{0,0,0,8},{0,0,0,12},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,4,0},{0,5,0,0},
|
||
|
|
{10,5,0,0},{0,5,0,0},{0,9,4,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,0,4,0},{0,0,4,0},
|
||
|
|
{9,0,4,0},{0,0,4,0},{0,0,8,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0},
|
||
|
|
{9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0},
|
||
|
|
{9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,4,0},
|
||
|
|
{9,0,4,0},{0,0,4,0},{0,0,8,0},{0,9,4,0},
|
||
|
|
{14,9,4,0},{0,9,4,0},{0,13,8,0},{0,0,4,0},
|
||
|
|
{9,0,4,0},{0,0,4,0},{0,0,8,0},{0,0,8,0},
|
||
|
|
{13,0,8,0},{0,0,8,0},{0,0,12,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0},
|
||
|
|
{9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0},
|
||
|
|
{4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{8,0,0,0},{0,0,0,0},{0,0,0,0},{0,8,0,0},
|
||
|
|
{13,8,0,0},{0,8,0,0},{0,12,0,0},{0,0,0,0},
|
||
|
|
{8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{12,0,0,0},{0,0,0,0},{0,0,0,0}
|
||
|
|
};
|
||
|
|
|
||
|
|
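/**
 * Internal 'DIVERGE' node offset table: three banks of 256 rows (one bank per pointer width: 2B, 3B, 4B),
 * four columns (ACGT). Row ((ptr_width - 2) << 8) + code, column c holds the byte offset, measured from the
 * byte that follows the node's code byte, to the child pointer record for character c.
 */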
|
||
|
|
unsigned char code_table[256 * 3][4] = {
|
||
|
|
{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0},
|
||
|
|
{0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0},
|
||
|
|
{0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0},
|
||
|
|
{0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0},
|
||
|
|
{0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0},
|
||
|
|
{0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0},
|
||
|
|
{0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,2,0},
|
||
|
|
{0,0,2,0},{0,0,2,0},{4,0,2,0},{0,0,2,0},
|
||
|
|
{0,0,2,0},{0,0,2,0},{4,0,2,0},{0,0,2,0},
|
||
|
|
{0,0,2,0},{0,0,2,0},{4,0,2,0},{0,4,2,0},
|
||
|
|
{0,4,2,0},{0,4,2,0},{6,4,2,0},
|
||
|
|
{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0},
|
||
|
|
{0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0},
|
||
|
|
{0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0},
|
||
|
|
{0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0},
|
||
|
|
{0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0},
|
||
|
|
{0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0},
|
||
|
|
{0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,3,0},
|
||
|
|
{0,0,3,0},{0,0,3,0},{6,0,3,0},{0,0,3,0},
|
||
|
|
{0,0,3,0},{0,0,3,0},{6,0,3,0},{0,0,3,0},
|
||
|
|
{0,0,3,0},{0,0,3,0},{6,0,3,0},{0,6,3,0},
|
||
|
|
{0,6,3,0},{0,6,3,0},{9,6,3,0},
|
||
|
|
{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0},
|
||
|
|
{0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0},
|
||
|
|
{0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0},
|
||
|
|
{0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0},
|
||
|
|
{0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0},
|
||
|
|
{0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0},
|
||
|
|
{0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0},
|
||
|
|
{0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,4,0},
|
||
|
|
{0,0,4,0},{0,0,4,0},{8,0,4,0},{0,0,4,0},
|
||
|
|
{0,0,4,0},{0,0,4,0},{8,0,4,0},{0,0,4,0},
|
||
|
|
{0,0,4,0},{0,0,4,0},{8,0,4,0},{0,8,4,0},
|
||
|
|
{0,8,4,0},{0,8,4,0},{12,8,4,0}
|
||
|
|
};
|
||
|
|
|
||
|
|
/**
 * Pack the leading bases of a k-mer into an integer key, two bits per base.
 *
 * Bases are read from str[0] onwards (str is NOT offset by index inside this function);
 * base number p lands in bits 2p and 2p+1 of the key. Packing stops at the first
 * ambiguous base (value 4).
 *
 * @param str           Pointer to the first base of the k-mer
 * @param keysize       K-mer length
 * @param index         Index into query sequence; used only to detect a k-mer that runs past the end
 * @param seq_len       Length of query sequence
 * @param end_flag      Set to 1 when index + keysize exceeds seq_len; left untouched otherwise
 * @param idx_first_N   Set to the offset of the first ambiguous bp in the k-mer; left untouched if there is none
 *
 * @return              Integer key (hash) built from the bases that precede any ambiguous base
 */
static inline uint32_t getHashKey (const uint8_t *str, const int keysize, int index, int seq_len, int* end_flag, int* idx_first_N) {
    uint32_t hash = 0;
    int num_bases = keysize;
    int pos;
    if (index + keysize > seq_len) { // K-mer is cut short by the end of the query
        num_bases = seq_len - index;
        *end_flag = 1;
    }
    for (pos = 0; pos < num_bases; ++pos) {
        const uint8_t base = str[pos];
        if (base == 4) { // Ambiguous base: report where it is and stop packing
            *idx_first_N = pos;
            return hash;
        }
        hash |= (uint32_t) base << (2 * pos);
    }
    return hash;
}
|
||
|
|
|
||
|
|
/**
 * Return a pointer into ref_string for the interval [beg, end).
 *
 * Coordinates span [0, 2 * l_pac]: positions below l_pac are treated as the forward strand,
 * positions at or above l_pac as the reverse strand. Before use the interval is normalised:
 * beg and end are swapped if end < beg, end is clamped to 2 * l_pac and beg is clamped to 0.
 *
 * @param l_pac         Boundary between forward and reverse coordinates
 * @param pac           Unused; kept so existing callers keep compiling
 * @param beg           Interval start
 * @param end           Interval end
 * @param len           Receives end - beg (after normalisation), or 0 if the interval bridges the boundary
 * @param ref_string    Base address the returned pointer is computed from
 * @param seqb          Unused; kept so existing callers keep compiling
 *
 * @return              ref_string + beg, or a null pointer if the interval bridges the forward-reverse boundary
 */
uint8_t *get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end,
                 int64_t *len, uint8_t *ref_string, uint8_t *seqb)
{
    (void) pac;
    (void) seqb;
    if (end < beg) { // if end is smaller, swap
        const int64_t tmp = beg;
        beg = end;
        end = tmp;
    }
    if (end > l_pac<<1) end = l_pac<<1;
    if (beg < 0) beg = 0;
    if (beg < l_pac && end > l_pac) { // if bridging the forward-reverse boundary, return nothing
        *len = 0;
        return 0;
    }
    // Both strands are addressed the same way, so no strand-specific handling is needed here.
    *len = end - beg;
    return ref_string + beg;
}
|
||
|
|
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Compute offset to child and return address of child node
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param code type of child nodes (EMPTY:00, LEAF:01, UNIFORM:10, DIVERGE:11) - 2b encoded
|
||
|
|
* @param c next character in read (used to traverse tree)
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
*
|
||
|
|
*/
|
||
|
|
void getOffsetToChildNode(read_aux_t* raux, uint8_t* mlt_data, uint8_t code, uint8_t c, uint64_t* byteIdx) {
|
||
|
|
uint64_t nextByteIdx = *byteIdx;
|
||
|
|
uint64_t startByteIdx = nextByteIdx - 1;
|
||
|
|
int offset = code_table[((raux->ptr_width - 2) << 8) + code][c];
|
||
|
|
uint32_t reseed_data = 0;
|
||
|
|
nextByteIdx += offset;
|
||
|
|
memcpy_bwamem(&reseed_data, raux->ptr_width * sizeof(uint8_t), &mlt_data[nextByteIdx], raux->ptr_width * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
uint32_t jumpByteIdx = reseed_data >> 6;
|
||
|
|
raux->num_hits = (reseed_data & 0x3F);
|
||
|
|
nextByteIdx = startByteIdx + jumpByteIdx;
|
||
|
|
*byteIdx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Compute offset to start of leaf data
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param code type of child nodes (EMPTY:00, LEAF:01, UNIFORM:10, DIVERGE:11) - 2b encoded
|
||
|
|
* @param c next character in read (used to traverse tree)
|
||
|
|
*
|
||
|
|
*/
|
||
|
|
static inline int getOffsetToLeafData(read_aux_t* raux, uint8_t code, uint8_t c) {
|
||
|
|
return leaf_table[((raux->ptr_width - 2) << 8) + code][c];
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* This routine does depth-first tree traversal starting from an internal node to obtain all hits at leaf nodes
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param bc next character in read (used to traverse tree)
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_dfs(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, mem_t* mem, uint8_t bc, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c;
|
||
|
|
c = 3 - bc;
|
||
|
|
mem->skip_ref_fetch = 1; // MEM cannot be extended by leaf decompression
|
||
|
|
uint8_t code = mlt_data[nextByteIdx++];
|
||
|
|
uint8_t code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
if (code_c == LEAF) { // Hit a leaf node
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) { // Found a multi-hit leaf node
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Single-hit leaf node
|
||
|
|
raux->num_hits = 1;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
kv_push(uint64_t, *hits, leaf_data >> 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) { // Multi-character internal node
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
uint8_t k;
|
||
|
|
uint64_t startByteIdx = nextByteIdx;
|
||
|
|
for (k = 0; k < 4; ++k) {
|
||
|
|
getNextByteIdx_dfs(raux, mlt_data, &startByteIdx, mem, k, hits);
|
||
|
|
startByteIdx = nextByteIdx;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == DIVERGE) { // Single-character internal node
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
uint8_t k;
|
||
|
|
uint64_t startByteIdx = nextByteIdx;
|
||
|
|
for (k = 0; k < 4; ++k) {
|
||
|
|
getNextByteIdx_dfs(raux, mlt_data, &startByteIdx, mem, k, hits);
|
||
|
|
startByteIdx = nextByteIdx;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Gather all hits for the MEM
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param mem MEM including hits
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void leaf_gather(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, mem_t* mem, u64v* hits) {
|
||
|
|
uint8_t k;
|
||
|
|
uint64_t startByteIdx = *byte_idx;
|
||
|
|
uint64_t tmpByteIdx = startByteIdx;
|
||
|
|
for (k = 0; k < 4; ++k) {
|
||
|
|
getNextByteIdx_dfs(raux, mlt_data, &tmpByteIdx, mem, k, hits);
|
||
|
|
tmpByteIdx = startByteIdx;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Traverse tree during backward search(seeding). Similar to getNextByteIdx, except that we don't compute LEP
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_backward(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c, code, code_c;
|
||
|
|
if (raux->read_buf[*i] != 4) {
|
||
|
|
c = 3 - raux->read_buf[*i];
|
||
|
|
code = mlt_data[nextByteIdx++];
|
||
|
|
code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
}
|
||
|
|
else { // Terminate MEM search when we hit an 'N'
|
||
|
|
code_c = EMPTY;
|
||
|
|
}
|
||
|
|
if (code_c == EMPTY) { // Gather leaves later during forward traversal
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) { // Hit a leaf node
|
||
|
|
*i += 1;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) { // Found a multi-hit leaf node
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
//
|
||
|
|
// We found a multi-hit. But to report hits in the same order as BWA-MEM,
|
||
|
|
// we will fetch the hits again in a forward tree traversal
|
||
|
|
//
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
else { // Single-hit leaf node
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, leaf_data >> 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) { // Multi-character internal node
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((*i + j) >= raux->l_seq) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (raux->read_buf[*i+j] == 4) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (3 - raux->read_buf[*i+j] != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*i += j;
|
||
|
|
if (j == countBP) { // We match all bps
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_backward(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Did not match all bps. Gather leaves for MEM in a later forward traversal
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Single-character internal node
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
*i += 1;
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_backward(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Traverse tree during backward search (reseeding). Terminate when fewer than 'raux->limit' hits are found for
|
||
|
|
* an internal node
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for each read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_backward_wlimit(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c, code, code_c;
|
||
|
|
if (raux->read_buf[*i] != 4) {
|
||
|
|
c = 3 - raux->read_buf[*i];
|
||
|
|
code = mlt_data[nextByteIdx++];
|
||
|
|
code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
}
|
||
|
|
else { // Terminate MEM search when we hit an 'N'
|
||
|
|
code_c = EMPTY;
|
||
|
|
}
|
||
|
|
if (code_c == EMPTY) { // Gather leaves later during forward traversal
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) { // Hit a leaf node
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) { // Found a multi-hit leaf node
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
// Hits exceed reseeding threshold
|
||
|
|
if (raux->num_hits >= raux->limit) {
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
*i += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
//
|
||
|
|
// We found a multi-hit. But to report hits in the same order as BWA-MEM,
|
||
|
|
// we will fetch the hits again in a forward tree traversal
|
||
|
|
//
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) { // Multi-character internal node
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((*i + j) >= raux->l_seq) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (raux->read_buf[*i+j] == 4) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (3 - raux->read_buf[*i+j] != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*i += j;
|
||
|
|
if (j == countBP) { // We match all bps
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_backward_wlimit(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Did not match all bps. Gather leaves for MEM in a later forward traversal
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Single-character internal node
|
||
|
|
raux->num_hits = 0;
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
// In the internal nodes, raux->num_hits = 0 is used to reprsent # hits > 20
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
*i += 1;
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_backward_wlimit(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Traverse tree during forward search (seeding).
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t parent_byte_idx = nextByteIdx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c, code, code_c;
|
||
|
|
if (raux->read_buf[*i] != 4) {
|
||
|
|
c = 3 - raux->read_buf[*i];
|
||
|
|
code = mlt_data[nextByteIdx++];
|
||
|
|
code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
}
|
||
|
|
else { // Terminate MEM search when we hit an 'N'
|
||
|
|
code_c = EMPTY;
|
||
|
|
}
|
||
|
|
uint64_t lep_idx = 0;
|
||
|
|
uint64_t lep_bit_idx = 0;
|
||
|
|
if (code_c == EMPTY) {
|
||
|
|
if (mem->start == 0) { // FIXME: Gather leaves later during forward traversal even for MEMs starting at read_pos = 0
|
||
|
|
int mem_len = *i;
|
||
|
|
if (mem_len >= raux->min_seed_len) { // Only gather leaves when MEM length exceeds threshold
|
||
|
|
uint64_t startByteIdx = parent_byte_idx;
|
||
|
|
leaf_gather(raux, mlt_data, &startByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Update LEP for backward search
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) { // Hit a leaf node
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) { // Found a multi-hit leaf node
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Single-hit leaf node
|
||
|
|
raux->num_hits = 1;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
kv_push(uint64_t, *hits, leaf_data >> 1);
|
||
|
|
}
|
||
|
|
// Update LEP for backward search
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
*i += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) { // Multi-character internal node
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((*i + j) >= raux->l_seq) { // Don't run past the end of the read
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (raux->read_buf[*i+j] == 4) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if ((3 - raux->read_buf[*i+j]) != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
raux->nextLEPBit += j;
|
||
|
|
*i += j;
|
||
|
|
if (j == countBP) { // If we match all bases of uniform entry
|
||
|
|
if (*i == raux->l_seq) { // Check if we reached the end of the read
|
||
|
|
if (mem->start == 0) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
//
|
||
|
|
// We did not match all bases of uniform entry
|
||
|
|
// Fetch all hits from leaf nodes for backward extension (dfs :( )
|
||
|
|
//
|
||
|
|
assert(*i <= raux->l_seq);
|
||
|
|
if (mem->start == 0) {
|
||
|
|
int mem_len = *i;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Update LEP to start backward search from last matching bp
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // Single-character internal node
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
*i += 1;
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (mem->start == 0) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Traverse tree during forward search (reseeding). Terminate when fewer than 'raux->limit' hits are found for an
|
||
|
|
* internal node
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param visited stack to store list of visited nodes
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_wlimit(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, path_v* visited, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t parent_byte_idx = nextByteIdx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c, code, code_c;
|
||
|
|
if (raux->read_buf[*i] != 4) {
|
||
|
|
c = 3 - raux->read_buf[*i];
|
||
|
|
code = mlt_data[nextByteIdx++];
|
||
|
|
code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
}
|
||
|
|
else { // Terminate MEM search when we hit an 'N'
|
||
|
|
code_c = EMPTY;
|
||
|
|
}
|
||
|
|
uint64_t lep_idx = 0;
|
||
|
|
uint64_t lep_bit_idx = 0;
|
||
|
|
if (code_c == EMPTY) {
|
||
|
|
if (mem->start == 0) {
|
||
|
|
int mem_len = *i;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &parent_byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Update LEP for backward search
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) { // Hit a leaf node
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) { // Found a multi-hit leaf node
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
}
|
||
|
|
else { // Single-hit leaf node
|
||
|
|
raux->num_hits = 1;
|
||
|
|
}
|
||
|
|
// Hits exceed reseeding threshold
|
||
|
|
if (raux->num_hits >= raux->limit) {
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
*i += 1;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
// Do DFS traversal to gather leaves starting from parent node
|
||
|
|
if (mem->start == 0) {
|
||
|
|
int mem_len = *i;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
node_info_t* tmp_node_info = &kv_pop(*visited);
|
||
|
|
leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Update LEP for backward search
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) { // Multi-character internal node
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((*i + j) >= raux->l_seq) { // Don't run past the end of the read
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (raux->read_buf[*i+j] == 4) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if ((3 - raux->read_buf[*i+j]) != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
raux->nextLEPBit += j;
|
||
|
|
*i += j;
|
||
|
|
if (j == countBP) { // If we match all bases of uniform entry
|
||
|
|
if (*i == raux->l_seq) { // Check if we reached the end of the read
|
||
|
|
if (mem->start == 0) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_wlimit(raux, mlt_data, &nextByteIdx, i, mem, visited, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
//
|
||
|
|
// We did not match all bases of uniform entry
|
||
|
|
// Fetch all hits from leaf nodes for backward extension (dfs :( )
|
||
|
|
//
|
||
|
|
assert(*i <= raux->l_seq);
|
||
|
|
if (mem->start == 0) {
|
||
|
|
int mem_len = *i;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
lep_idx = raux->nextLEPBit >> 6;
|
||
|
|
lep_bit_idx = raux->nextLEPBit & (0x3FULL);
|
||
|
|
raux->lep[lep_idx] |= (1ULL << lep_bit_idx);
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
// In the internal nodes, raux->num_hits = 0 is used to reprsent # hits > 20
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
node_info_t nif;
|
||
|
|
nif.byte_idx = nextByteIdx;
|
||
|
|
nif.num_hits = raux->num_hits;
|
||
|
|
kv_push(node_info_t, *visited, nif);
|
||
|
|
*i += 1;
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_wlimit(raux, mlt_data, &nextByteIdx, i, mem, visited, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (mem->start == 0) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
// Do DFS traveral to gather leaves from parent node
|
||
|
|
if (mem->start == 0) {
|
||
|
|
int mem_len = *i;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
node_info_t* tmp_node_info = &kv_pop(*visited);
|
||
|
|
leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Traverse tree during forward search (LAST). Terminate when fewer than 'raux->limit' hits are found for an internal node
|
||
|
|
* Minimum MEM length for LAST is opt->min_seed_len + 1
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact match found
|
||
|
|
* @param visited stack to store list of visited nodes
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_last(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c, code, code_c;
|
||
|
|
if (raux->read_buf[*i] != 4) {
|
||
|
|
c = 3 - raux->read_buf[*i];
|
||
|
|
code = mlt_data[nextByteIdx++];
|
||
|
|
code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
}
|
||
|
|
else { // Terminate MEM search when we hit an 'N'
|
||
|
|
code_c = EMPTY;
|
||
|
|
}
|
||
|
|
if (code_c == EMPTY) {
|
||
|
|
*i += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) { // Hit a leaf node
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) { // multi-hit leaf
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // single-hit leaf
|
||
|
|
raux->num_hits = 1;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
kv_push(uint64_t, *hits, leaf_data >> 1);
|
||
|
|
}
|
||
|
|
*i += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) { // Multi-character internal node
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((*i + j) >= raux->l_seq) { //!< Don't run past the end of the read
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (raux->read_buf[*i+j] == 4) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if ((3 - raux->read_buf[*i+j]) != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*i += j;
|
||
|
|
int len = *i - mem->start;
|
||
|
|
//
|
||
|
|
// LAST stop criterion: MEM is sufficiently long and not too frequent
|
||
|
|
//
|
||
|
|
int stop_extension = (raux->num_hits > 0 &&
|
||
|
|
raux->num_hits < raux->limit &&
|
||
|
|
len >= (raux->min_seed_len + 1)) ? 1 : 0;
|
||
|
|
if (stop_extension) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
*i = mem->start + (raux->min_seed_len + 1);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (j == countBP) {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_last(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
*i += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == DIVERGE) {
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
*i += 1;
|
||
|
|
int len = *i - mem->start;
|
||
|
|
//
|
||
|
|
// LAST stop criterion: MEM is sufficiently long and not too frequent
|
||
|
|
//
|
||
|
|
int stop_extension = (raux->num_hits > 0 &&
|
||
|
|
raux->num_hits < raux->limit &&
|
||
|
|
len >= (raux->min_seed_len + 1)) ? 1 : 0;
|
||
|
|
if (stop_extension) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_last(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Main backward search function (seeding). Lookup up k-mer and/or x-mer table and identify root of ERT
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact match found
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void leftExtend(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, 0, &idx_first_N);
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
mem->rc_end = *i;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
// width used for internal pointers in tree
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
byte_idx = 0;
|
||
|
|
if (code == INVALID) { // k-mer absent
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) { // single-hit k-mer
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 5;
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
*i += kmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
getNextByteIdx_backward(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, 0, &idx_first_N);
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
*i += xmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (code == INVALID) {
|
||
|
|
*i += xmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
byte_idx = ptr;
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 5;
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
*i += xmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
byte_idx = ptr;
|
||
|
|
*i += xmerSize;
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_backward(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Main backward search function (reseeding). Lookup up k-mer and/or x-mer table and identify root of ERT.
|
||
|
|
* Return early if root node has fewer than 'raux->limit' hits
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param i index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void leftExtend_wlimit(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, 0, &idx_first_N);
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
mem->rc_end = *i;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
// width used for internal pointers in tree
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
// # hits for k-mer (0 if > 20 hits)
|
||
|
|
raux->num_hits = (kmer_entry >> 17) & 0x1F;
|
||
|
|
byte_idx = 0;
|
||
|
|
if (code == INVALID) { // k-mer absent
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) { // single-hit k-mer
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
getNextByteIdx_backward_wlimit(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
// Leaf gathering to be done later
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, 0, &idx_first_N);
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
// This helps in decoding nodes and creates compact trees
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
raux->num_hits = (xmer_entry >> 17) & 0x1F;
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
*i += xmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (code == INVALID) {
|
||
|
|
*i += xmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
*i += xmerSize;
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
byte_idx = ptr;
|
||
|
|
*i += xmerSize;
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_backward_wlimit(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
// Leaf gathering
|
||
|
|
mem->rc_end = *i;
|
||
|
|
mem->fetch_leaves = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->rc_end = *i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Fetch hits for all MEMs identified after backward search (termination conditions based on reseeding)
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param idx index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_fetch_leaves_prefix_reseed(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int idx, mem_t* mem, path_v* visited, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t parent_byte_idx = nextByteIdx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c;
|
||
|
|
int i = idx;
|
||
|
|
assert(raux->read_buf[i] != 4); // Should not see N in SMEMs
|
||
|
|
c = 3 - raux->read_buf[i];
|
||
|
|
uint8_t code = mlt_data[nextByteIdx++];
|
||
|
|
uint8_t code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
if (code_c == EMPTY) { // Do leaf gathering
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &parent_byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) {
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) {
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->num_hits = 1;
|
||
|
|
}
|
||
|
|
if (raux->num_hits >= raux->limit) {
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
i += 1;
|
||
|
|
mem->end = i;
|
||
|
|
mem->is_multi_hit = 1; // decompress leaf node for potentially longer match
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
node_info_t* tmp_node_info = &kv_pop(*visited);
|
||
|
|
leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) {
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((i + j) >= raux->l_seq) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (3 - raux->read_buf[i+j] != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
i += j;
|
||
|
|
if (j == countBP) {
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &nextByteIdx, i, mem, visited, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == DIVERGE) {
|
||
|
|
raux->num_hits = 0;
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
node_info_t nif;
|
||
|
|
nif.byte_idx = nextByteIdx;
|
||
|
|
nif.num_hits = raux->num_hits;
|
||
|
|
kv_push(node_info_t, *visited, nif);
|
||
|
|
i += 1;
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &nextByteIdx, i, mem, visited, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
node_info_t* tmp_node_info = &kv_pop(*visited);
|
||
|
|
leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
*byte_idx = nextByteIdx;
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Fetch hits for all MEMs identified after backward search (extend beyond mem->end)
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param idx index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_fetch_leaves_prefix(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int idx, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c;
|
||
|
|
int i = idx;
|
||
|
|
assert(raux->read_buf[i] != 4); // Should not see N in SMEMs
|
||
|
|
c = 3 - raux->read_buf[i];
|
||
|
|
uint8_t code = mlt_data[nextByteIdx++];
|
||
|
|
uint8_t code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
if (code_c == EMPTY) { // Do leaf gathering
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
nextByteIdx = *byte_idx;
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == LEAF) {
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) {
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->num_hits = 1;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
kv_push(uint64_t, *hits, leaf_data >> 1);
|
||
|
|
}
|
||
|
|
i += 1;
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) {
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((i + j) >= raux->l_seq) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (3 - raux->read_buf[i+j] != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
i += j;
|
||
|
|
if (j == countBP) {
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == DIVERGE) {
|
||
|
|
raux->num_hits = 0;
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
i += 1;
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Forward traversal to fetch hits for all MEMs identified after backward search.
|
||
|
|
*
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mlt_data radix tree of k-mer
|
||
|
|
* @param byteIdx byte index into the mlt_data radix tree
|
||
|
|
* @param idx index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void getNextByteIdx_fetch_leaves(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int idx, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint64_t nextByteIdx = *byte_idx;
|
||
|
|
uint64_t ref_pos = 0;
|
||
|
|
uint8_t c;
|
||
|
|
int i = idx;
|
||
|
|
assert(raux->read_buf[i] != 4); // Should not see N in SMEMs
|
||
|
|
c = 3 - raux->read_buf[i];
|
||
|
|
uint8_t code = mlt_data[nextByteIdx++];
|
||
|
|
uint8_t code_c = (code >> (c << 1)) & 3;
|
||
|
|
assert(code != 0);
|
||
|
|
assert(code_c != EMPTY);
|
||
|
|
if (code_c == LEAF) {
|
||
|
|
int k;
|
||
|
|
uint64_t leaf_data = 0;
|
||
|
|
nextByteIdx += getOffsetToLeafData(raux, code, c);
|
||
|
|
memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
if (leaf_data & 1) {
|
||
|
|
nextByteIdx = raux->mh_start_addr + (leaf_data >> 1);
|
||
|
|
memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
for (k = 0; k < raux->num_hits; ++k) {
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
nextByteIdx += 5;
|
||
|
|
ref_pos = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->num_hits = 1;
|
||
|
|
mem->hitcount += raux->num_hits;
|
||
|
|
kv_push(uint64_t, *hits, leaf_data >> 1);
|
||
|
|
}
|
||
|
|
i += 1;
|
||
|
|
}
|
||
|
|
else if (code_c == UNIFORM) {
|
||
|
|
uint32_t j;
|
||
|
|
int countBP = *((uint16_t*) &mlt_data[nextByteIdx]);
|
||
|
|
nextByteIdx += 2;
|
||
|
|
int numBitsForBP = countBP << 1;
|
||
|
|
int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8);
|
||
|
|
uint8_t packedBP[numBytesForBP];
|
||
|
|
memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
nextByteIdx += numBytesForBP;
|
||
|
|
// Unpack base pairs
|
||
|
|
uint8_t unpackedBP[countBP];
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3);
|
||
|
|
}
|
||
|
|
// Count number of matching base pairs with read
|
||
|
|
for (j = 0; j < countBP; ++j) {
|
||
|
|
if ((i + j) >= raux->l_seq) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
if (3 - raux->read_buf[i+j] != unpackedBP[j]) {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
i += j;
|
||
|
|
if (j == countBP) {
|
||
|
|
if (i < mem->end) { // only extend till end of MEM found during previous backward search
|
||
|
|
getNextByteIdx_fetch_leaves(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code_c == DIVERGE) {
|
||
|
|
raux->num_hits = 0;
|
||
|
|
getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx);
|
||
|
|
i += 1;
|
||
|
|
if (i < mem->end) { // only extend till end of MEM found during previous backward search
|
||
|
|
getNextByteIdx_fetch_leaves(raux, mlt_data, &nextByteIdx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Extend as much as possible to the right based on reseeding criteria.
|
||
|
|
*
|
||
|
|
* Return early if root node has fewer than 'raux->limit' hits
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void rightExtend_fetch_leaves_prefix_reseed(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
int flag = 0;
|
||
|
|
int i = mem->start;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[i], kmerSize, i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
raux->mh_start_addr = 0;
|
||
|
|
// width used for internal pointers in tree
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
// # hits for k-mer (0 if > 20 hits)
|
||
|
|
raux->num_hits = (kmer_entry >> 17) & 0x1F;
|
||
|
|
byte_idx = 0;
|
||
|
|
assert(code != INVALID);
|
||
|
|
if (code == SINGLE_HIT_LEAF) {
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
path_v visited;
|
||
|
|
kv_init(visited);
|
||
|
|
node_info_t nif;
|
||
|
|
nif.byte_idx = byte_idx;
|
||
|
|
nif.num_hits = raux->num_hits;
|
||
|
|
kv_push(node_info_t, visited, nif);
|
||
|
|
getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &byte_idx, i, mem, &visited, hits);
|
||
|
|
kv_destroy(visited);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[i + kmerSize], xmerSize, i + kmerSize, raux->l_seq, 0, &idx_first_N);
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
// 5 bits to encode number of hits at each node
|
||
|
|
raux->num_hits = (xmer_entry >> 17) & 0x1F;
|
||
|
|
if (code == INVALID) {
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
// When a node has greater than 20 (opt->max_mem->intv) hits, we store a 0 in the hits field
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
byte_idx = ptr;
|
||
|
|
i += (kmerSize + xmerSize);
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
path_v visited;
|
||
|
|
kv_init(visited);
|
||
|
|
node_info_t nif;
|
||
|
|
nif.byte_idx = byte_idx;
|
||
|
|
nif.num_hits = raux->num_hits;
|
||
|
|
kv_push(node_info_t, visited, nif);
|
||
|
|
getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &byte_idx, i, mem, &visited, hits);
|
||
|
|
kv_destroy(visited);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Extend as much as possible to the right and gather leaves if MEM length >= opt->min_seed_len.
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void rightExtend_fetch_leaves_prefix(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
int flag = 0;
|
||
|
|
int i = mem->start;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[i], kmerSize, i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
raux->mh_start_addr = 0;
|
||
|
|
// width used for internal pointers in tree, 2 bits, ptr_width = 4 is encoded as 0
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
byte_idx = 0;
|
||
|
|
assert(code != INVALID);
|
||
|
|
if (code == SINGLE_HIT_LEAF) {
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
byte_idx += 5;
|
||
|
|
i += kmerSize;
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[i + kmerSize], xmerSize, i + kmerSize, raux->l_seq, 0, &idx_first_N);
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
if (code == INVALID) {
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
byte_idx = ptr;
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
byte_idx += 5;
|
||
|
|
i += (kmerSize + xmerSize);
|
||
|
|
mem->end = i;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
byte_idx = ptr;
|
||
|
|
i += (kmerSize + xmerSize);
|
||
|
|
if (i < raux->l_seq) {
|
||
|
|
getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
mem->end = i;
|
||
|
|
int mem_len = mem->end - mem->start;
|
||
|
|
if (mem_len >= raux->min_seed_len) {
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Fetch hits for all MEMs identified after backward search.
|
||
|
|
*
|
||
|
|
* Note that backward search functions above only perform tree traversal without gathering hits as they will be fetched in
|
||
|
|
* a different order than required by BWA-MEM (i.e., all hits for MEM need to be sorted by right-context). We re-traverse and
|
||
|
|
* the tree and fetch the hits in the correct order below (this has a minor performance penalty)
|
||
|
|
*
|
||
|
|
* Return early if root node has fewer than 'raux->limit' hits
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void rightExtend_fetch_leaves(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
int flag = 0;
|
||
|
|
int i = mem->start;
|
||
|
|
int end = mem->end;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[i], kmerSize, i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
raux->mh_start_addr = 0;
|
||
|
|
// width used for internal pointers in tree
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
byte_idx = 0;
|
||
|
|
assert(code != INVALID);
|
||
|
|
assert(code != SINGLE_HIT_LEAF);
|
||
|
|
if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
if (i < end) {
|
||
|
|
getNextByteIdx_fetch_leaves(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[i], xmerSize, i, raux->l_seq, 0, &idx_first_N);
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
assert(code != INVALID);
|
||
|
|
assert(code != SINGLE_HIT_LEAF);
|
||
|
|
byte_idx = ptr;
|
||
|
|
i += xmerSize;
|
||
|
|
if (i < end) {
|
||
|
|
getNextByteIdx_fetch_leaves(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Main forward search function (seeding). Lookup up k-mer and/or x-mer table and identify root of ERT
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param i Index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void rightExtend(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
uint64_t lep_data = 0;
|
||
|
|
int flag = 0;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
lep_data = (kmer_entry >> METADATA_BITWIDTH) & LEP_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
uint64_t mlt_start_addr = raux->mlt_start_addr = start_addr;
|
||
|
|
raux->mh_start_addr = 0;
|
||
|
|
// width used for internal pointers in tree
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
// LEP takes up kmerSize-1 bits. Last LEP bit is at position = kmerSize-2.
|
||
|
|
if (*i <= 64-kmerSize) {
|
||
|
|
raux->lep[0] |= (lep_data << *i);
|
||
|
|
}
|
||
|
|
else if (*i < 64) {
|
||
|
|
raux->lep[0] |= (lep_data << *i);
|
||
|
|
raux->lep[1] |= (lep_data >> (64-*i));
|
||
|
|
}
|
||
|
|
else if (*i <= 128-kmerSize) {
|
||
|
|
raux->lep[1] |= (lep_data << (*i-64));
|
||
|
|
}
|
||
|
|
else if (*i < 128) {
|
||
|
|
raux->lep[1] |= (lep_data << (*i-64));
|
||
|
|
raux->lep[2] |= (lep_data >> (128-*i));
|
||
|
|
}
|
||
|
|
else if (*i <= 192-kmerSize) {
|
||
|
|
raux->lep[2] |= (lep_data << (*i-128));
|
||
|
|
}
|
||
|
|
else if (*i < 192) {
|
||
|
|
raux->lep[2] |= (lep_data << (*i-128));
|
||
|
|
raux->lep[3] |= (lep_data >> (192-*i));
|
||
|
|
}
|
||
|
|
else if (*i <= 256-kmerSize) {
|
||
|
|
raux->lep[3] |= (lep_data << (*i-192));
|
||
|
|
}
|
||
|
|
else if (*i < 256) {
|
||
|
|
raux->lep[3] |= (lep_data << (*i-192));
|
||
|
|
raux->lep[4] |= (lep_data >> (256-*i));
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[4] |= (lep_data << (*i-256));
|
||
|
|
}
|
||
|
|
raux->nextLEPBit = *i + kmerSize - 1;
|
||
|
|
byte_idx = 0;
|
||
|
|
// We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
if (*i != 0) {
|
||
|
|
raux->nextLEPBit = *i + idx_first_N - 1;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
}
|
||
|
|
*i += idx_first_N;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (flag) {
|
||
|
|
raux->nextLEPBit = (raux->l_seq - 1);
|
||
|
|
*i = raux->l_seq;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (code == INVALID) {
|
||
|
|
// Do backward extension using LEP
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
byte_idx += 5;
|
||
|
|
*i += kmerSize;
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[start_addr];
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
getNextByteIdx(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
*i += kmerSize;
|
||
|
|
int k;
|
||
|
|
uint64_t lepBit = 0;
|
||
|
|
flag = 0;
|
||
|
|
mlt_data = &iaux->mlt_table[mlt_start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
lep_data = (xmer_entry >> METADATA_BITWIDTH) & 0xF;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
int xmerLen = 0;
|
||
|
|
if (raux->l_seq - *i > xmerSize) {
|
||
|
|
xmerLen = xmerSize;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
xmerLen = raux->l_seq - *i;
|
||
|
|
}
|
||
|
|
for (k = 0; k < xmerLen; ++k) {
|
||
|
|
lepBit = (lep_data >> k) & 1;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (lepBit << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit++;
|
||
|
|
}
|
||
|
|
// We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
raux->nextLEPBit = *i + idx_first_N - 1;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
*i += idx_first_N;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (flag) {
|
||
|
|
raux->nextLEPBit = (raux->l_seq - 1);
|
||
|
|
*i = raux->l_seq;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (code == INVALID) {
|
||
|
|
*i += xmerSize;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
byte_idx = ptr;
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
byte_idx += 5;
|
||
|
|
*i += xmerSize;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
byte_idx = ptr;
|
||
|
|
*i += xmerSize;
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Main forward search function (reseeding). Look up the k-mer and/or x-mer table and identify the root of the ERT
|
||
|
|
* Return early if root node has fewer than 'raux->limit' hits
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param i Index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void rightExtend_wlimit(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
uint64_t lep_data = 0;
|
||
|
|
int flag = 0;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
// index-table entry type
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
lep_data = (kmer_entry >> METADATA_BITWIDTH) & LEP_MASK;
|
||
|
|
// pointer to root of tree
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
uint64_t mlt_start_addr = raux->mlt_start_addr = start_addr;
|
||
|
|
raux->mh_start_addr = 0;
|
||
|
|
// width used for internal pointers in tree, 2 bits, ptr_width = 4 is encoded as 0
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
raux->num_hits = (kmer_entry >> 17) & 0x1F;
|
||
|
|
// LEP takes up kmerSize-1 bits. Last LEP bit is at position = kmerSize-2.
|
||
|
|
if (*i <= 64-kmerSize) {
|
||
|
|
raux->lep[0] |= (lep_data << *i);
|
||
|
|
}
|
||
|
|
else if (*i < 64) {
|
||
|
|
raux->lep[0] |= (lep_data << *i);
|
||
|
|
raux->lep[1] |= (lep_data >> (64-*i));
|
||
|
|
}
|
||
|
|
else if (*i <= 128-kmerSize) {
|
||
|
|
raux->lep[1] |= (lep_data << (*i-64));
|
||
|
|
}
|
||
|
|
else if (*i < 128) {
|
||
|
|
raux->lep[1] |= (lep_data << (*i-64));
|
||
|
|
raux->lep[2] |= (lep_data >> (128-*i));
|
||
|
|
}
|
||
|
|
else if (*i <= 192-kmerSize) {
|
||
|
|
raux->lep[2] |= (lep_data << (*i-128));
|
||
|
|
}
|
||
|
|
else if (*i < 192) {
|
||
|
|
raux->lep[2] |= (lep_data << (*i-128));
|
||
|
|
raux->lep[3] |= (lep_data >> (192-*i));
|
||
|
|
}
|
||
|
|
else if (*i <= 256-kmerSize) {
|
||
|
|
raux->lep[3] |= (lep_data << (*i-192));
|
||
|
|
}
|
||
|
|
else if (*i < 256) {
|
||
|
|
raux->lep[3] |= (lep_data << (*i-192));
|
||
|
|
raux->lep[4] |= (lep_data >> (256-*i));
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[4] |= (lep_data << (*i-256));
|
||
|
|
}
|
||
|
|
raux->nextLEPBit = *i + kmerSize - 1;
|
||
|
|
byte_idx = 0;
|
||
|
|
// We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
if (*i != 0) {
|
||
|
|
raux->nextLEPBit = *i + idx_first_N - 1;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
}
|
||
|
|
*i += idx_first_N;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (flag) {
|
||
|
|
raux->nextLEPBit = (raux->l_seq - 1);
|
||
|
|
*i = raux->l_seq;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (code == INVALID) {
|
||
|
|
// Do backward extension using LEP
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
*i += (kmerSize + xmerSize);
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) { // k-mer has fewer than 256 hits
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[mlt_start_addr];
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
path_v visited;
|
||
|
|
kv_init(visited);
|
||
|
|
node_info_t nif;
|
||
|
|
nif.byte_idx = byte_idx;
|
||
|
|
nif.num_hits = raux->num_hits;
|
||
|
|
kv_push(node_info_t, visited, nif);
|
||
|
|
//
|
||
|
|
// First 4 bytes of tree store the start of all multi-hit leaves for the k-mer
|
||
|
|
//
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
getNextByteIdx_wlimit(raux, mlt_data, &byte_idx, i, mem, &visited, hits);
|
||
|
|
kv_destroy(visited);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
*i += kmerSize;
|
||
|
|
int k;
|
||
|
|
uint64_t lepBit = 0;
|
||
|
|
flag = 0;
|
||
|
|
mlt_data = &iaux->mlt_table[mlt_start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
lep_data = (xmer_entry >> METADATA_BITWIDTH) & 0xF;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
// 5 bits to encode hits for each node
|
||
|
|
raux->num_hits = (xmer_entry >> 17) & 0x1F;
|
||
|
|
int xmerLen = 0;
|
||
|
|
if (raux->l_seq - *i > xmerSize) {
|
||
|
|
xmerLen = xmerSize;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
xmerLen = raux->l_seq - *i;
|
||
|
|
}
|
||
|
|
for (k = 0; k < xmerLen; ++k) {
|
||
|
|
lepBit = (lep_data >> k) & 1;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (lepBit << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit++;
|
||
|
|
}
|
||
|
|
// We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
raux->nextLEPBit = *i + idx_first_N - 1;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
*i += idx_first_N;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (flag) {
|
||
|
|
raux->nextLEPBit = (raux->l_seq - 1);
|
||
|
|
*i = raux->l_seq;
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (code == INVALID) {
|
||
|
|
*i += xmerSize;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
*i += xmerSize;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
byte_idx = ptr;
|
||
|
|
*i += xmerSize;
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
path_v visited;
|
||
|
|
kv_init(visited);
|
||
|
|
node_info_t nif;
|
||
|
|
nif.byte_idx = byte_idx;
|
||
|
|
nif.num_hits = raux->num_hits;
|
||
|
|
kv_push(node_info_t, visited, nif);
|
||
|
|
getNextByteIdx_wlimit(raux, mlt_data, &byte_idx, i, mem, &visited, hits);
|
||
|
|
kv_destroy(visited);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL)));
|
||
|
|
raux->nextLEPBit += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Main forward search function for LAST heuristic
|
||
|
|
*
|
||
|
|
* @param iaux index parameters
|
||
|
|
* @param raux read parameters
|
||
|
|
* @param i Index into read buffer
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void rightExtend_last(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) {
|
||
|
|
|
||
|
|
uint8_t code;
|
||
|
|
uint8_t* mlt_data;
|
||
|
|
uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0;
|
||
|
|
int flag = 0;
|
||
|
|
uint32_t hashval = 0;
|
||
|
|
int idx_first_N = -1;
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// We found an ambiguous base in the kmer. Stop extension
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
*i += (idx_first_N + 1);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (flag) {
|
||
|
|
*i = raux->l_seq;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
// index-table lookup
|
||
|
|
kmer_entry = iaux->kmer_offsets[hashval];
|
||
|
|
code = kmer_entry & METADATA_MASK;
|
||
|
|
start_addr = kmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
uint64_t mlt_start_addr = raux->mlt_start_addr = start_addr;
|
||
|
|
raux->mh_start_addr = 0;
|
||
|
|
raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3);
|
||
|
|
raux->num_hits = (kmer_entry >> 17) & 0x1F;
|
||
|
|
byte_idx = 0;
|
||
|
|
if (code == INVALID) {
|
||
|
|
// Do backward extension using LEP
|
||
|
|
*i += kmerSize;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
mlt_data = &iaux->mlt_table[mlt_start_addr];
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
byte_idx += 5;
|
||
|
|
*i += kmerSize;
|
||
|
|
}
|
||
|
|
else if (code == INFREQUENT) {
|
||
|
|
*i += kmerSize;
|
||
|
|
mlt_data = &iaux->mlt_table[mlt_start_addr];
|
||
|
|
if (*i < raux->l_seq) { // Length <= (kmer + xmer). Need not check num_hits here.
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
getNextByteIdx_last(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (code == FREQUENT) {
|
||
|
|
uint64_t xmer_entry;
|
||
|
|
uint64_t ptr = 0;
|
||
|
|
*i += kmerSize;
|
||
|
|
flag = 0;
|
||
|
|
mlt_data = &iaux->mlt_table[mlt_start_addr];
|
||
|
|
hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, &flag, &idx_first_N);
|
||
|
|
// We found an ambiguous base in the kmer. Stop extension
|
||
|
|
if (idx_first_N != -1) {
|
||
|
|
*i += (idx_first_N + 1);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
if (flag) {
|
||
|
|
*i = raux->l_seq;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
byte_idx += 4;
|
||
|
|
memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
code = xmer_entry & METADATA_MASK;
|
||
|
|
ptr = xmer_entry >> KMER_DATA_BITWIDTH;
|
||
|
|
raux->num_hits = (xmer_entry >> 17) & 0x1F;
|
||
|
|
if (code == INVALID) {
|
||
|
|
*i += xmerSize;
|
||
|
|
}
|
||
|
|
else if (code == SINGLE_HIT_LEAF) {
|
||
|
|
byte_idx = ptr;
|
||
|
|
byte_idx++;
|
||
|
|
memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__);
|
||
|
|
mem->hitcount += 1;
|
||
|
|
kv_push(uint64_t, *hits, ref_pos >> 1);
|
||
|
|
byte_idx += 5;
|
||
|
|
*i += xmerSize;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
byte_idx = ptr;
|
||
|
|
*i += xmerSize;
|
||
|
|
if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit) || ((*i - mem->start) < (raux->min_seed_len + 1))) {
|
||
|
|
if (*i < raux->l_seq) {
|
||
|
|
getNextByteIdx_last(raux, mlt_data, &byte_idx, i, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // number of hits is less than the limit and seed length >= opt->min_seed_len
|
||
|
|
leaf_gather(raux, mlt_data, &byte_idx, mem, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Initialize MEM parameters for backward extension
|
||
|
|
*
|
||
|
|
* @param lep LEP bit vector
|
||
|
|
* @param mem maximal-exact-match
|
||
|
|
* @param j Index into LEP
|
||
|
|
* @param seq_len Read length
|
||
|
|
* @param min_seed_len Minimum seed length
|
||
|
|
*
|
||
|
|
* @return mem_valid valid flag to indicate if backward search needs to be performed from given position j
|
||
|
|
*/
|
||
|
|
static inline int init_mem(uint64_t* lep, mem_t* mem, int j, int seq_len, int min_seed_len) {
|
||
|
|
int lep_bit_set = (lep[j >> 6] >> (j & 63)) & 1;
|
||
|
|
int in_valid_range = (j >= (min_seed_len-1)) ? 1 : 0;
|
||
|
|
int mem_valid = (lep_bit_set && in_valid_range);
|
||
|
|
mem->end = j + 1; // [start, end) indexing
|
||
|
|
mem->rc_start = seq_len - j - 1;
|
||
|
|
mem->rc_end = mem->rc_start;
|
||
|
|
mem->skip_ref_fetch = 0;
|
||
|
|
mem->forward = 0;
|
||
|
|
mem->fetch_leaves = 0;
|
||
|
|
mem->hitbeg = mem->hitcount = 0;
|
||
|
|
mem->end_correction = 0;
|
||
|
|
mem->is_multi_hit = 0;
|
||
|
|
return mem_valid;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Compute final SMEMs and their hits after considering their overlaps
|
||
|
|
*
|
||
|
|
* Leaf expansion is also performed to get the actual length of the MEM.
|
||
|
|
* Note that leaf nodes store compressed suffixes by including a pointer to the reference genome
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param sh helper data structure to keep track of start and end positions of previously identified MEMs
|
||
|
|
* @param smems List of SMEMs
|
||
|
|
*
|
||
|
|
* @return n number of backward extensions to skip
|
||
|
|
*/
|
||
|
|
int check_and_add_smem_prefix_reseed(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, smem_helper_t* sh, mem_v* smems, u64v* hits) {
|
||
|
|
|
||
|
|
mem->start = raux->l_seq - mem->rc_end; // Adjust start position of LMEM
|
||
|
|
int lmemLen = mem->end - mem->start, rmemLen = -1, next_be_point;
|
||
|
|
if (mem->hitcount > 0 && !mem->skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[mem->hitbeg] - mem->rc_start;
|
||
|
|
int64_t end_ref_pos = hits->a[mem->hitbeg];
|
||
|
|
// Fetch reference to check for extra matching bps on the right
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m, numMatchingBP = 0;
|
||
|
|
for (m = 1; m <= len; ++m) {
|
||
|
|
if (rseq[mem->rc_start - m] == raux->read_buf[mem->rc_start - m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
mem->end += numMatchingBP;
|
||
|
|
mem->end_correction += numMatchingBP;
|
||
|
|
|
||
|
|
start_ref_pos = hits->a[mem->hitbeg] + lmemLen;
|
||
|
|
end_ref_pos = start_ref_pos + mem->start;
|
||
|
|
|
||
|
|
// Fetch reference to check for extra matching bps on the left
|
||
|
|
rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
numMatchingBP = 0;
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->read_buf[mem->rc_end + m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
mem->start -= numMatchingBP;
|
||
|
|
}
|
||
|
|
// Adjust start position of MEM by extra matching bps
|
||
|
|
lmemLen = mem->end - mem->start;
|
||
|
|
next_be_point = mem->end;
|
||
|
|
if (mem->hitcount == 1) {
|
||
|
|
if (lmemLen >= raux->min_seed_len) {
|
||
|
|
kv_push(mem_t, *smems, *mem);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
next_be_point += (raux->min_seed_len - lmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// perform forward extension only for non-empty MEMs from backward extension
|
||
|
|
else if (mem->fetch_leaves && (mem->start <= (raux->l_seq - raux->min_seed_len))) {
|
||
|
|
hits->n -= mem->hitcount;
|
||
|
|
mem->hitbeg = hits->n;
|
||
|
|
mem->hitcount = 0;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
rightExtend_fetch_leaves_prefix_reseed(iaux, raux, mem, hits);
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
rmemLen = mem->end - mem->start;
|
||
|
|
next_be_point = mem->end;
|
||
|
|
if (mem->hitcount > 0) {
|
||
|
|
if (mem->is_multi_hit) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[mem->hitbeg] + rmemLen;
|
||
|
|
int64_t end_ref_pos = hits->a[mem->hitbeg] + raux->l_seq - mem->start;
|
||
|
|
/// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m;
|
||
|
|
int numMatchingBP = 0;
|
||
|
|
/// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->unpacked_queue_buf[mem->end + m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
mem->end += numMatchingBP;
|
||
|
|
rmemLen = mem->end - mem->start;
|
||
|
|
next_be_point = mem->end;
|
||
|
|
}
|
||
|
|
if (rmemLen >= raux->min_seed_len && mem->end <= sh->mem_end_limit) {
|
||
|
|
kv_push(mem_t, *smems, *mem);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
next_be_point += (raux->min_seed_len - rmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // we don't have min_seed_len match for this start position
|
||
|
|
assert(rmemLen <= raux->min_seed_len);
|
||
|
|
next_be_point += (raux->min_seed_len - rmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if (lmemLen <= raux->min_seed_len) {
|
||
|
|
next_be_point += (raux->min_seed_len - lmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return next_be_point;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Compute final SMEMs and their hits after considering their overlaps
|
||
|
|
*
|
||
|
|
* Leaf expansion is also performed to get the actual length of the MEM.
|
||
|
|
* Note that leaf nodes store compressed suffixes by including a pointer to the reference genome
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param sh helper data structure to keep track of start and end positions of previously identified MEMs
|
||
|
|
* @param smems list of SMEMs
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*
|
||
|
|
* @return n number of backward extensions to skip
|
||
|
|
*/
|
||
|
|
int check_and_add_smem_prefix(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, smem_helper_t* sh, mem_v* smems, u64v* hits) {
|
||
|
|
|
||
|
|
mem->start = raux->l_seq - mem->rc_end; // Adjust start position of LMEM
|
||
|
|
int lmemLen = mem->end - mem->start, rmemLen = -1, next_be_point;
|
||
|
|
if (mem->hitcount > 0 && !mem->skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[mem->hitbeg] - mem->rc_start;
|
||
|
|
int64_t end_ref_pos = hits->a[mem->hitbeg];
|
||
|
|
// Fetch reference to check for extra matching bps
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m, numMatchingBP = 0;
|
||
|
|
for (m = 1; m <= len; ++m) {
|
||
|
|
if (rseq[mem->rc_start - m] == raux->read_buf[mem->rc_start - m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
mem->end += numMatchingBP;
|
||
|
|
mem->end_correction += numMatchingBP;
|
||
|
|
|
||
|
|
start_ref_pos = hits->a[mem->hitbeg] + lmemLen;
|
||
|
|
end_ref_pos = start_ref_pos + mem->start;
|
||
|
|
// Fetch reference to check for extra matching bps
|
||
|
|
rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
numMatchingBP = 0;
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->read_buf[mem->rc_end + m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// free(rseq);
|
||
|
|
mem->start -= numMatchingBP;
|
||
|
|
}
|
||
|
|
lmemLen = mem->end - mem->start;
|
||
|
|
next_be_point = mem->end;
|
||
|
|
// skip forward re-traversal for single-git MEMs
|
||
|
|
if (mem->hitcount == 1) {
|
||
|
|
if (lmemLen >= raux->min_seed_len) {
|
||
|
|
kv_push(mem_t, *smems, *mem);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
next_be_point += (raux->min_seed_len - lmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else if (mem->fetch_leaves && (mem->start <= (raux->l_seq - raux->min_seed_len))) {
|
||
|
|
hits->n -= mem->hitcount;
|
||
|
|
mem->hitbeg = hits->n;
|
||
|
|
mem->hitcount = 0;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
rightExtend_fetch_leaves_prefix(iaux, raux, mem, hits);
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
rmemLen = mem->end - mem->start;
|
||
|
|
next_be_point = mem->end;
|
||
|
|
if (mem->hitcount > 0) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[mem->hitbeg] + rmemLen;
|
||
|
|
int64_t end_ref_pos = hits->a[mem->hitbeg] + raux->l_seq - mem->start;
|
||
|
|
/// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m;
|
||
|
|
int numMatchingBP = 0;
|
||
|
|
/// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->unpacked_queue_buf[mem->end + m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
mem->end += numMatchingBP;
|
||
|
|
rmemLen = mem->end - mem->start;
|
||
|
|
next_be_point = mem->end;
|
||
|
|
if (rmemLen >= raux->min_seed_len) {
|
||
|
|
kv_push(mem_t, *smems, *mem);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
next_be_point += (raux->min_seed_len - rmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else { // we don't have min_seed_len match for this start position
|
||
|
|
assert(rmemLen <= raux->min_seed_len);
|
||
|
|
next_be_point += (raux->min_seed_len - rmemLen);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
assert(lmemLen <= raux->min_seed_len);
|
||
|
|
next_be_point += (raux->min_seed_len - lmemLen);
|
||
|
|
}
|
||
|
|
|
||
|
|
return next_be_point;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Compute final SMEMs and their hits after considering their overlaps
|
||
|
|
*
|
||
|
|
* Leaf expansion is also performed to get the actual length of the MEM.
|
||
|
|
* Note that leaf nodes store compressed suffixes by including a pointer to the reference genome
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param mem maximal-exact-match storage
|
||
|
|
* @param sh helper data structure to keep track of start and end positions of previously identified MEMs
|
||
|
|
* @param smems list of SMEMs
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void check_and_add_smem(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, smem_helper_t* sh, mem_v* smems, u64v* hits) {
|
||
|
|
|
||
|
|
mem->start = raux->l_seq - mem->rc_end; // Adjust start position of LMEM
|
||
|
|
int lmemLen = mem->end - mem->start;
|
||
|
|
if (mem->hitcount > 0 && !mem->skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[mem->hitbeg] + lmemLen;
|
||
|
|
int64_t end_ref_pos = start_ref_pos + mem->start;
|
||
|
|
// Fetch reference to check for extra matching bps
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m, numMatchingBP = 0;
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->read_buf[mem->rc_end + m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Adjust start position of MEM by extra matching bps
|
||
|
|
mem->start -= numMatchingBP;
|
||
|
|
}
|
||
|
|
lmemLen = mem->end - mem->start;
|
||
|
|
if (lmemLen >= raux->min_seed_len) {
|
||
|
|
// Check if MEM lies completely within previously discovered MEM
|
||
|
|
if (mem->start < sh->prevMemStart || mem->end > sh->prevMemEnd) {
|
||
|
|
// Extra work for equivalency with BWA-MEM.
|
||
|
|
if (mem->fetch_leaves) {
|
||
|
|
hits->n -= mem->hitcount;
|
||
|
|
mem->hitbeg = hits->n;
|
||
|
|
mem->hitcount = 0;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
rightExtend_fetch_leaves(iaux, raux, mem, hits);
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
}
|
||
|
|
if (mem->hitcount > 0) {
|
||
|
|
mem->pt.c_pivot = sh->curr_pivot;
|
||
|
|
mem->pt.p_pivot = sh->prev_pivot;
|
||
|
|
mem->pt.pp_pivot = sh->prev_prev_pivot;
|
||
|
|
kv_push(mem_t, *smems, *mem);
|
||
|
|
if (mem->start <= (sh->prev_pivot + 1)) {
|
||
|
|
sh->stop_be = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
sh->prevMemStart = mem->start;
|
||
|
|
sh->prevMemEnd = mem->end;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This function replaces bwt_smem1() and uses ERT to generate SMEMs
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param smems list of SMEMs
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void get_seeds_prefix(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, u64v* hits) {
|
||
|
|
|
||
|
|
smem_helper_t sh;
|
||
|
|
memset(&sh, 0, sizeof(smem_helper_t));
|
||
|
|
sh.prevMemStart = raux->l_seq;
|
||
|
|
sh.prevMemEnd = 0;
|
||
|
|
int i = 0, j = 0;
|
||
|
|
sh.prev_pivot = -1;
|
||
|
|
sh.prev_prev_pivot = -1;
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
while (i < raux->l_seq) { // Begin identifying RMEMs
|
||
|
|
mem_t rm;
|
||
|
|
memset(&rm, 0, sizeof(mem_t));
|
||
|
|
rm.start = i;
|
||
|
|
rm.forward = 1;
|
||
|
|
rm.hitbeg = hits->n;
|
||
|
|
sh.curr_pivot = rm.start;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
rightExtend(iaux, raux, &i, &rm, hits); //!< Compute LEP.
|
||
|
|
// Lazy expansion of leaf nodes.
|
||
|
|
if (rm.hitcount > 0 && !rm.skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start;
|
||
|
|
int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start;
|
||
|
|
// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m;
|
||
|
|
int numMatchingBP = 0;
|
||
|
|
// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->unpacked_queue_buf[i+m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Last base of RMEM must have LEP bit set
|
||
|
|
if (m == len) {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
}
|
||
|
|
i += numMatchingBP;
|
||
|
|
}
|
||
|
|
rm.end = i;
|
||
|
|
int rmemLen = rm.end - rm.start;
|
||
|
|
// No left-extension for position 0 in read
|
||
|
|
// rm.start is the current pivot
|
||
|
|
if (rm.start == 0) {
|
||
|
|
if (rmemLen >= raux->min_seed_len) {
|
||
|
|
if (rm.hitcount > 0) {
|
||
|
|
kv_push(mem_t, *smems, rm);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
}
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
}
|
||
|
|
else { // perform all backward extensions
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
uint64_t* lep = raux->lep;
|
||
|
|
int seq_len = raux->l_seq;
|
||
|
|
int min_seed_len = raux->min_seed_len;
|
||
|
|
sh.stop_be = 0;
|
||
|
|
int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1);
|
||
|
|
int max_j = rm.end - 1;
|
||
|
|
j = min_j;
|
||
|
|
sh.prev_pivot = rm.start;
|
||
|
|
while (j <= max_j) {
|
||
|
|
mem_t m;
|
||
|
|
int be_point;
|
||
|
|
int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len);
|
||
|
|
m.hitbeg = hits->n;
|
||
|
|
int next_j = j + 1;
|
||
|
|
if (mem_valid) {
|
||
|
|
be_point = j + 1;
|
||
|
|
if (be_point >= min_seed_len) {
|
||
|
|
int rc_i = seq_len - be_point;
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
leftExtend(iaux, raux, &rc_i, &m, hits);
|
||
|
|
next_j = check_and_add_smem_prefix(iaux, raux, &m, &sh, smems, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
j = next_j;
|
||
|
|
if (m.end > i) {
|
||
|
|
i = m.end;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
// Skip all ambiguous bases
|
||
|
|
while (i < raux->l_seq) {
|
||
|
|
if (raux->read_buf[i] == 4) {
|
||
|
|
++i;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Check if there other ambiguous bases within min_seed_len bases of the start of the MEM
|
||
|
|
while ((i < raux->l_seq) && (i - rm.start) < raux->min_seed_len) {
|
||
|
|
if (raux->read_buf[i] == 4) {
|
||
|
|
++i;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
++i;
|
||
|
|
}
|
||
|
|
sh.prev_prev_pivot = sh.prev_pivot;
|
||
|
|
sh.prev_pivot = rm.start;
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
break; // zzh
|
||
|
|
}
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
ks_introsort(mem_smem_sort_lt_ert, smems->n, smems->a); // Sort SMEMs based on start pos in read. For DEBUG.
|
||
|
|
for (i = 0; i < smems->n; ++i) {
|
||
|
|
// printf("[SMEM]:%d,%d\n", smems->a[i].start, smems->a[i].end);
|
||
|
|
int idx;
|
||
|
|
for (idx = 0; idx < smems->a[i].hitcount; ++idx) {
|
||
|
|
if (smems->a[i].forward || smems->a[i].fetch_leaves) {
|
||
|
|
printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start - smems->a[i].end_correction));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This function replaces bwt_smem1() and uses ERT to generate SMEMs
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param smems list of SMEMs
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void get_seeds(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, u64v* hits) {
|
||
|
|
|
||
|
|
smem_helper_t sh;
|
||
|
|
memset(&sh, 0, sizeof(smem_helper_t));
|
||
|
|
sh.prevMemStart = raux->l_seq;
|
||
|
|
sh.prevMemEnd = 0;
|
||
|
|
int i = 0, j = 0;
|
||
|
|
sh.prev_pivot = -1;
|
||
|
|
sh.prev_prev_pivot = -1;
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
while (i < raux->l_seq) { // Begin identifying RMEMs
|
||
|
|
mem_t rm;
|
||
|
|
memset(&rm, 0, sizeof(mem_t));
|
||
|
|
rm.start = i;
|
||
|
|
rm.forward = 1;
|
||
|
|
rm.hitbeg = hits->n;
|
||
|
|
sh.curr_pivot = rm.start;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
rightExtend(iaux, raux, &i, &rm, hits); // Compute LEP.
|
||
|
|
// Lazy expansion of leaf nodes.
|
||
|
|
if (rm.hitcount > 0 && !rm.skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start;
|
||
|
|
int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start;
|
||
|
|
// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m;
|
||
|
|
int numMatchingBP = 0;
|
||
|
|
// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->unpacked_queue_buf[i+m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Last base of RMEM must have LEP bit set
|
||
|
|
if (m == len) {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
}
|
||
|
|
i += numMatchingBP;
|
||
|
|
}
|
||
|
|
rm.end = i;
|
||
|
|
int rmemLen = rm.end - rm.start;
|
||
|
|
// No left-extension for position 0 in read
|
||
|
|
// rm.start is the current pivot
|
||
|
|
if (rm.start == 0) {
|
||
|
|
if (rmemLen >= raux->min_seed_len) {
|
||
|
|
if (rm.hitcount > 0) {
|
||
|
|
rm.pt.c_pivot = sh.curr_pivot;
|
||
|
|
rm.pt.p_pivot = sh.prev_pivot;
|
||
|
|
rm.pt.pp_pivot = sh.prev_prev_pivot;
|
||
|
|
kv_push(mem_t, *smems, rm);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
}
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
uint64_t* lep = raux->lep;
|
||
|
|
int seq_len = raux->l_seq;
|
||
|
|
int min_seed_len = raux->min_seed_len;
|
||
|
|
j = rm.end-1;
|
||
|
|
sh.stop_be = 0;
|
||
|
|
int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1);
|
||
|
|
while (j >= min_j) {
|
||
|
|
mem_t m;
|
||
|
|
int be_point;
|
||
|
|
int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len);
|
||
|
|
m.hitbeg = hits->n;
|
||
|
|
if (mem_valid) {
|
||
|
|
be_point = j + 1;
|
||
|
|
if (be_point >= min_seed_len) {
|
||
|
|
int rc_i = seq_len - be_point;
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
leftExtend(iaux, raux, &rc_i, &m, hits);
|
||
|
|
check_and_add_smem(iaux, raux, &m, &sh, smems, hits);
|
||
|
|
if (sh.stop_be) break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
j -= 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
// Skip all ambiguous bases
|
||
|
|
while (i < raux->l_seq) {
|
||
|
|
if (raux->read_buf[i] == 4) {
|
||
|
|
++i;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Check if there other ambiguous bases within min_seed_len bases of the start of the MEM
|
||
|
|
while ((i < raux->l_seq) && (i - rm.start) < raux->min_seed_len) {
|
||
|
|
if (raux->read_buf[i] == 4) {
|
||
|
|
++i;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
++i;
|
||
|
|
}
|
||
|
|
sh.prev_prev_pivot = sh.prev_pivot;
|
||
|
|
sh.prev_pivot = rm.start;
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
break; // zzh
|
||
|
|
}
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
ks_introsort(mem_smem_sort_lt_ert, smems->n, smems->a); // Sort SMEMs based on start pos in read. For DEBUG.
|
||
|
|
for (i = 0; i < smems->n; ++i) {
|
||
|
|
// printf("[SMEM]:%d,%d\n", smems->a[i].start, smems->a[i].end);
|
||
|
|
int idx;
|
||
|
|
for (idx = 0; idx < smems->a[i].hitcount; ++idx) {
|
||
|
|
if (smems->a[i].forward || smems->a[i].fetch_leaves) {
|
||
|
|
printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This function performs reseeding of SMEMs
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param smems list of SMEMs
|
||
|
|
* @param start pivot position in read
|
||
|
|
* @param limit hit threshold below which tree traversal must stop
|
||
|
|
* @param pt track pivot information to reduce work done during reseeding
|
||
|
|
* @param hits list of hits for every read
|
||
|
|
*/
|
||
|
|
void reseed_prefix(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, int start, int limit, pivot_t* pt, u64v* hits) {
|
||
|
|
|
||
|
|
smem_helper_t sh;
|
||
|
|
memset(&sh, 0, sizeof(smem_helper_t));
|
||
|
|
sh.prevMemStart = raux->l_seq;
|
||
|
|
sh.prevMemEnd = 0;
|
||
|
|
int i = start, j = 0;
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
int old_n = smems->n;
|
||
|
|
#endif
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
mem_t rm;
|
||
|
|
memset(&rm, 0, sizeof(mem_t));
|
||
|
|
rm.start = i;
|
||
|
|
rm.forward = 1;
|
||
|
|
rm.hitbeg = hits->n;
|
||
|
|
sh.prev_pivot = (rm.start >= pt->c_pivot) ? pt->p_pivot : pt->pp_pivot;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
raux->limit = limit;
|
||
|
|
rightExtend_wlimit(iaux, raux, &i, &rm, hits); // Compute LEP.
|
||
|
|
// Lazy expansion of leaf nodes.
|
||
|
|
if (rm.hitcount > 0 && !rm.skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start;
|
||
|
|
int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start;
|
||
|
|
// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m;
|
||
|
|
int numMatchingBP = 0;
|
||
|
|
// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->unpacked_queue_buf[i+m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Last base of RMEM must have LEP bit set
|
||
|
|
if (m == len) {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
}
|
||
|
|
i += numMatchingBP;
|
||
|
|
}
|
||
|
|
rm.end = i;
|
||
|
|
int rmemLen = rm.end - rm.start;
|
||
|
|
if (rm.start == 0) {
|
||
|
|
if (rmemLen >= raux->min_seed_len) {
|
||
|
|
if (rm.hitcount > 0) {
|
||
|
|
kv_push(mem_t, *smems, rm);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
}
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
}
|
||
|
|
// Begin left-extension, i.e., right extension on reverse complemented read
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
uint64_t* lep = raux->lep;
|
||
|
|
int seq_len = raux->l_seq;
|
||
|
|
int min_seed_len = raux->min_seed_len;
|
||
|
|
sh.stop_be = 0;
|
||
|
|
int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1);
|
||
|
|
int max_j = rm.end - 1;
|
||
|
|
j = min_j;
|
||
|
|
sh.prev_pivot = rm.start;
|
||
|
|
sh.mem_end_limit = rm.end;
|
||
|
|
while (j <= max_j) {
|
||
|
|
mem_t m;
|
||
|
|
int be_point;
|
||
|
|
int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len);
|
||
|
|
m.hitbeg = hits->n;
|
||
|
|
int next_j = j + 1;
|
||
|
|
if (mem_valid) {
|
||
|
|
be_point = j + 1;
|
||
|
|
if (be_point >= min_seed_len) {
|
||
|
|
int rc_i = seq_len - be_point;
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
leftExtend_wlimit(iaux, raux, &rc_i, &m, hits);
|
||
|
|
next_j = check_and_add_smem_prefix_reseed(iaux, raux, &m, &sh, smems, hits);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
j = next_j;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
ks_introsort(mem_smem_sort_lt_ert, smems->n - old_n, &smems->a[old_n]); // Debug: Sort SMEMs based on start pos in read.
|
||
|
|
for (i = old_n; i < smems->n; ++i) {
|
||
|
|
int idx;
|
||
|
|
for (idx = 0; idx < smems->a[i].hitcount; ++idx) {
|
||
|
|
if (smems->a[i].forward || smems->a[i].fetch_leaves) {
|
||
|
|
printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start - smems->a[i].end_correction));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This function performs reseeding of SMEMs
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param smems list of SMEMs and their hits
|
||
|
|
* @param start pivot position in read
|
||
|
|
* @param limit hit threshold below which tree traversal must stop
|
||
|
|
* @param pt track pivot information to reduce work done during reseeding
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void reseed(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, int start, int limit, pivot_t* pt, u64v* hits) {
|
||
|
|
|
||
|
|
smem_helper_t sh;
|
||
|
|
memset(&sh, 0, sizeof(smem_helper_t));
|
||
|
|
sh.prevMemStart = raux->l_seq;
|
||
|
|
sh.prevMemEnd = 0;
|
||
|
|
int i = start, j = 0;
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
int old_n = smems->n;
|
||
|
|
#endif
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
mem_t rm;
|
||
|
|
memset(&rm, 0, sizeof(mem_t));
|
||
|
|
rm.start = i;
|
||
|
|
rm.forward = 1;
|
||
|
|
rm.hitbeg = hits->n;
|
||
|
|
sh.prev_pivot = (rm.start >= pt->c_pivot) ? pt->p_pivot : pt->pp_pivot;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
raux->limit = limit;
|
||
|
|
rightExtend_wlimit(iaux, raux, &i, &rm, hits); //!< Compute LEP.
|
||
|
|
// Lazy expansion of leaf nodes.
|
||
|
|
if (rm.hitcount > 0 && !rm.skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start;
|
||
|
|
int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start;
|
||
|
|
// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m;
|
||
|
|
int numMatchingBP = 0;
|
||
|
|
// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
if (rseq[m] == raux->unpacked_queue_buf[i+m]) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Last base of RMEM must have LEP bit set
|
||
|
|
if (m == len) {
|
||
|
|
raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL)));
|
||
|
|
}
|
||
|
|
i += numMatchingBP;
|
||
|
|
}
|
||
|
|
rm.end = i;
|
||
|
|
int rmemLen = rm.end - rm.start;
|
||
|
|
if (rm.start == 0) {
|
||
|
|
if (rmemLen >= raux->min_seed_len) {
|
||
|
|
if (rm.hitcount > 0) {
|
||
|
|
kv_push(mem_t, *smems, rm);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
}
|
||
|
|
memset(raux->lep, 0, 5 * sizeof(uint64_t));
|
||
|
|
}
|
||
|
|
// Begin left-extension, i.e., right extension on reverse complemented read
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
uint64_t* lep = raux->lep;
|
||
|
|
int seq_len = raux->l_seq;
|
||
|
|
int min_seed_len = raux->min_seed_len;
|
||
|
|
j = rm.end-1;
|
||
|
|
sh.stop_be = 0;
|
||
|
|
int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1);
|
||
|
|
while (j >= min_j) {
|
||
|
|
mem_t m;
|
||
|
|
int be_point;
|
||
|
|
int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len);
|
||
|
|
m.hitbeg = hits->n;
|
||
|
|
if (mem_valid) {
|
||
|
|
be_point = j + 1;
|
||
|
|
if (be_point >= min_seed_len) {
|
||
|
|
int rc_i = seq_len - be_point;
|
||
|
|
raux->read_buf = raux->unpacked_rc_queue_buf;
|
||
|
|
leftExtend_wlimit(iaux, raux, &rc_i, &m, hits);
|
||
|
|
check_and_add_smem(iaux, raux, &m, &sh, smems, hits);
|
||
|
|
if (sh.stop_be) break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
j -= 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
ks_introsort(mem_smem_sort_lt_ert, smems->n - old_n, &smems->a[old_n]); // Debug: Sort SMEMs based on start pos in read.
|
||
|
|
for (i = old_n; i < smems->n; ++i) {
|
||
|
|
int idx;
|
||
|
|
for (idx = 0; idx < smems->a[i].hitcount; ++idx) {
|
||
|
|
if (smems->a[i].forward || smems->a[i].fetch_leaves) {
|
||
|
|
printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
|
||
|
|
* This function performs the LAST heuristic
|
||
|
|
*
|
||
|
|
* @param iaux index related parameters
|
||
|
|
* @param raux read related parameters
|
||
|
|
* @param smems list of SMEMs
|
||
|
|
* @param limit hit threshold above which tree traversal must stop
|
||
|
|
* @param hits list of hits for read
|
||
|
|
*/
|
||
|
|
void last(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, int limit, u64v* hits) {
|
||
|
|
|
||
|
|
int i = 0;
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
int old_n = smems->n;
|
||
|
|
#endif
|
||
|
|
const uint8_t minSeedLen = raux->min_seed_len + 1; // LAST exits seeding only when seed length >= 20
|
||
|
|
raux->limit = limit;
|
||
|
|
while (i < raux->l_seq) { // Begin identifying RMEMs
|
||
|
|
mem_t rm;
|
||
|
|
rm.start = i; rm.end = 0;
|
||
|
|
rm.forward = 1;
|
||
|
|
rm.skip_ref_fetch = 0;
|
||
|
|
rm.hitbeg = hits->n;
|
||
|
|
rm.hitcount = 0;
|
||
|
|
raux->read_buf = raux->unpacked_queue_buf;
|
||
|
|
rightExtend_last(iaux, raux, &i, &rm, hits);
|
||
|
|
// Lazy expansion of leaf nodes.
|
||
|
|
if (rm.hitcount > 0 && !rm.skip_ref_fetch) {
|
||
|
|
int64_t len;
|
||
|
|
int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start;
|
||
|
|
int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start;
|
||
|
|
// Fetch reference
|
||
|
|
uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0);
|
||
|
|
int m, numMatchingBP = 0;
|
||
|
|
// Check for matching bases
|
||
|
|
for (m = 0; m < len; ++m) {
|
||
|
|
int seedLen = (i + m) - rm.start;
|
||
|
|
int match_next_bp = ((seedLen < minSeedLen) || (rm.hitcount >= raux->limit)) ? 1 : 0;
|
||
|
|
if (match_next_bp) {
|
||
|
|
if ((rseq[m] == raux->unpacked_queue_buf[i+m])) {
|
||
|
|
numMatchingBP++;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
++i; // Increment i on every mismatch for LAST to match BWA-MEM
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
rm.hitcount = 0;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
i += numMatchingBP;
|
||
|
|
}
|
||
|
|
rm.end = i;
|
||
|
|
int rmemLen = rm.end - rm.start;
|
||
|
|
if (rmemLen >= minSeedLen) {
|
||
|
|
if (rm.hitcount > 0 && rm.hitcount < raux->limit) {
|
||
|
|
kv_push(mem_t, *smems, rm);
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
hits->n -= rm.hitcount;
|
||
|
|
}
|
||
|
|
int foundN = 0;
|
||
|
|
// Skip all ambiguous bases
|
||
|
|
assert(i > 0);
|
||
|
|
if (raux->read_buf[i-1] == 4) { // Found an 'N' during lookup
|
||
|
|
foundN = 1;
|
||
|
|
}
|
||
|
|
if (!foundN) {
|
||
|
|
// Check if there other ambiguous bases within min_seed_len bases of the start of the MEM
|
||
|
|
while ((i < raux->l_seq) && (i - rm.start) < minSeedLen) {
|
||
|
|
if (raux->read_buf[i] == 4) {
|
||
|
|
++i;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
++i;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#ifdef PRINT_SMEM
|
||
|
|
ks_introsort(mem_smem_sort_lt_ert, smems->n - old_n, &smems->a[old_n]); // Debug: Sort SMEMs based on start pos in read.
|
||
|
|
for (i = old_n; i < smems->n; ++i) {
|
||
|
|
int idx;
|
||
|
|
for (idx = 0; idx < smems->a[i].hitcount; ++idx) {
|
||
|
|
printf("[LAST]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|