#include "ertseeding.h" #define smem_lt(a, b) ((a).start == (b).start ? (a).end > (b).end : (a).start < (b).start) KSORT_INIT(mem_smem_ert, mem_t, smem_lt) #define smem_lt_2(a, b) ((a).start == (b).start ? (a).end < (b).end : (a).start < (b).start) KSORT_INIT(mem_smem_sort_lt_ert, mem_t, smem_lt_2) /** * 256 x 3 (2B,3B,4B ptrs) x 4 (ACGT) table encoding offset to next address given a code byte for a leaf node */ unsigned char leaf_table[256 * 3][4] = { {0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,2,0},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,7,2,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,2,0},{0,0,2,0}, {7,0,2,0},{0,0,2,0},{0,0,4,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0}, {7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,2},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,7,0,2},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,2},{0,0,0,2}, {7,0,0,2},{0,0,0,2},{0,0,0,4},{0,0,5,0}, {10,0,5,0},{0,0,5,0},{0,0,7,2},{0,10,5,0}, {15,10,5,0},{0,10,5,0},{0,12,7,2},{0,0,5,0}, {10,0,5,0},{0,0,5,0},{0,0,7,2},{0,0,7,2}, {12,0,7,2},{0,0,7,2},{0,0,9,4},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,2},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,7,0,2},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,2},{0,0,0,2}, {7,0,0,2},{0,0,0,2},{0,0,0,4},{0,0,0,2}, {7,0,0,2},{0,0,0,2},{0,0,0,4},{0,7,0,2}, {12,7,0,2},{0,7,0,2},{0,9,0,4},{0,0,0,2}, {7,0,0,2},{0,0,0,2},{0,0,0,4},{0,0,0,4}, {9,0,0,4},{0,0,0,4},{0,0,0,6},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,2,0},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,7,2,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,2,0},{0,0,2,0}, {7,0,2,0},{0,0,2,0},{0,0,4,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0}, {7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0}, {7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,2,0}, {7,0,2,0},{0,0,2,0},{0,0,4,0},{0,7,2,0}, {12,7,2,0},{0,7,2,0},{0,9,4,0},{0,0,2,0}, {7,0,2,0},{0,0,2,0},{0,0,4,0},{0,0,4,0}, {9,0,4,0},{0,0,4,0},{0,0,6,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,2,0,0}, {7,2,0,0},{0,2,0,0},{0,4,0,0},{0,0,0,0}, {2,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0}, {9,4,0,0},{0,4,0,0},{0,6,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,3,0},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,8,3,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,3,0},{0,0,3,0}, {8,0,3,0},{0,0,3,0},{0,0,6,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0}, {8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,3},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,8,0,3},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,3},{0,0,0,3}, {8,0,0,3},{0,0,0,3},{0,0,0,6},{0,0,5,0}, {10,0,5,0},{0,0,5,0},{0,0,8,3},{0,10,5,0}, {15,10,5,0},{0,10,5,0},{0,13,8,3},{0,0,5,0}, {10,0,5,0},{0,0,5,0},{0,0,8,3},{0,0,8,3}, {13,0,8,3},{0,0,8,3},{0,0,11,6},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,3},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,8,0,3},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,3},{0,0,0,3}, {8,0,0,3},{0,0,0,3},{0,0,0,6},{0,0,0,3}, {8,0,0,3},{0,0,0,3},{0,0,0,6},{0,8,0,3}, {13,8,0,3},{0,8,0,3},{0,11,0,6},{0,0,0,3}, {8,0,0,3},{0,0,0,3},{0,0,0,6},{0,0,0,6}, {11,0,0,6},{0,0,0,6},{0,0,0,9},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,3,0},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,8,3,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,3,0},{0,0,3,0}, {8,0,3,0},{0,0,3,0},{0,0,6,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0}, {8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0}, {8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,3,0}, {8,0,3,0},{0,0,3,0},{0,0,6,0},{0,8,3,0}, {13,8,3,0},{0,8,3,0},{0,11,6,0},{0,0,3,0}, {8,0,3,0},{0,0,3,0},{0,0,6,0},{0,0,6,0}, {11,0,6,0},{0,0,6,0},{0,0,9,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,3,0,0}, {8,3,0,0},{0,3,0,0},{0,6,0,0},{0,0,0,0}, {3,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0},{0,6,0,0}, {11,6,0,0},{0,6,0,0},{0,9,0,0},{0,0,0,0}, {6,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {9,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,4,0},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,9,4,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,4,0},{0,0,4,0}, {9,0,4,0},{0,0,4,0},{0,0,8,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0}, {9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,4},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,9,0,4},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,4},{0,0,0,4}, {9,0,0,4},{0,0,0,4},{0,0,0,8},{0,0,5,0}, {10,0,5,0},{0,0,5,0},{0,0,9,4},{0,10,5,0}, {15,10,5,0},{0,10,5,0},{0,14,9,4},{0,0,5,0}, {10,0,5,0},{0,0,5,0},{0,0,9,4},{0,0,9,4}, {14,0,9,4},{0,0,9,4},{0,0,13,8},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,4},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,9,0,4},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,0,4},{0,0,0,4}, {9,0,0,4},{0,0,0,4},{0,0,0,8},{0,0,0,4}, {9,0,0,4},{0,0,0,4},{0,0,0,8},{0,9,0,4}, {14,9,0,4},{0,9,0,4},{0,13,0,8},{0,0,0,4}, {9,0,0,4},{0,0,0,4},{0,0,0,8},{0,0,0,8}, {13,0,0,8},{0,0,0,8},{0,0,0,12},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,4,0},{0,5,0,0}, {10,5,0,0},{0,5,0,0},{0,9,4,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,0,4,0},{0,0,4,0}, {9,0,4,0},{0,0,4,0},{0,0,8,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {5,0,0,0},{0,0,0,0},{0,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0}, {9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0}, {9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,4,0}, {9,0,4,0},{0,0,4,0},{0,0,8,0},{0,9,4,0}, {14,9,4,0},{0,9,4,0},{0,13,8,0},{0,0,4,0}, {9,0,4,0},{0,0,4,0},{0,0,8,0},{0,0,8,0}, {13,0,8,0},{0,0,8,0},{0,0,12,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,4,0,0}, {9,4,0,0},{0,4,0,0},{0,8,0,0},{0,0,0,0}, {4,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {8,0,0,0},{0,0,0,0},{0,0,0,0},{0,8,0,0}, {13,8,0,0},{0,8,0,0},{0,12,0,0},{0,0,0,0}, {8,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {12,0,0,0},{0,0,0,0},{0,0,0,0} }; /** * 256 x 3 (2B,3B,4B ptrs) x 4 (ACGT) table encoding offset to next address given a code byte for an internal 'DIVERGE' node */ unsigned char code_table[256 * 3][4] = { {0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0}, {0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0}, {0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0}, {0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0}, {0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0}, {0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{2,0,0,0},{0,2,0,0}, {0,2,0,0},{0,2,0,0},{4,2,0,0},{0,0,2,0}, {0,0,2,0},{0,0,2,0},{4,0,2,0},{0,0,2,0}, {0,0,2,0},{0,0,2,0},{4,0,2,0},{0,0,2,0}, {0,0,2,0},{0,0,2,0},{4,0,2,0},{0,4,2,0}, {0,4,2,0},{0,4,2,0},{6,4,2,0}, {0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0}, {0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0}, {0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0}, {0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0}, {0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0}, {0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{3,0,0,0},{0,3,0,0}, {0,3,0,0},{0,3,0,0},{6,3,0,0},{0,0,3,0}, {0,0,3,0},{0,0,3,0},{6,0,3,0},{0,0,3,0}, {0,0,3,0},{0,0,3,0},{6,0,3,0},{0,0,3,0}, {0,0,3,0},{0,0,3,0},{6,0,3,0},{0,6,3,0}, {0,6,3,0},{0,6,3,0},{9,6,3,0}, {0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0}, {0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0}, {0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0}, {0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0}, {0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0}, {0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,0,0,0}, {0,0,0,0},{0,0,0,0},{4,0,0,0},{0,4,0,0}, {0,4,0,0},{0,4,0,0},{8,4,0,0},{0,0,4,0}, {0,0,4,0},{0,0,4,0},{8,0,4,0},{0,0,4,0}, {0,0,4,0},{0,0,4,0},{8,0,4,0},{0,0,4,0}, {0,0,4,0},{0,0,4,0},{8,0,4,0},{0,8,4,0}, {0,8,4,0},{0,8,4,0},{12,8,4,0} }; /** * Return integer key from k-mer string. * * @param str Query sequence * @param keysize K-mer length * @param index Index into query sequence * @param seq_len Length of query sequence * @param end_flag Flag to indicate end of query * @param idx_first_N Index of first ambiguous bp in the k-mer * * @return Integer key (hash) for k-mer */ static inline uint32_t getHashKey (const uint8_t *str, const int keysize, int index, int seq_len, int* end_flag, int* idx_first_N) { int i; uint32_t key = 0; int len = keysize; if (index + keysize > seq_len) { *end_flag = 1; len = seq_len - index; } for (i = 0; i < len; ++i) { if (str[i] != 4) { key |= ((uint32_t)(str[i]) << (i << 1)); } else { *idx_first_N = i; break; } } return key; } uint8_t *get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len, uint8_t *ref_string, uint8_t *seqb) { uint8_t *seq = 0; if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap if (end > l_pac<<1) end = l_pac<<1; if (beg < 0) beg = 0; if (beg >= l_pac || end <= l_pac) { *len = end - beg; if (beg >= l_pac) { // reverse strand seq = ref_string + beg; } else { // forward strand seq = ref_string + beg; } } else *len = 0; // if bridging the forward-reverse boundary, return nothing return seq; } /** * Compute offset to child and return address of child node * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param code type of child nodes (EMPTY:00, LEAF:01, UNIFORM:10, DIVERGE:11) - 2b encoded * @param c next character in read (used to traverse tree) * @param byteIdx byte index into the mlt_data radix tree * */ void getOffsetToChildNode(read_aux_t* raux, uint8_t* mlt_data, uint8_t code, uint8_t c, uint64_t* byteIdx) { uint64_t nextByteIdx = *byteIdx; uint64_t startByteIdx = nextByteIdx - 1; int offset = code_table[((raux->ptr_width - 2) << 8) + code][c]; uint32_t reseed_data = 0; nextByteIdx += offset; memcpy_bwamem(&reseed_data, raux->ptr_width * sizeof(uint8_t), &mlt_data[nextByteIdx], raux->ptr_width * sizeof(uint8_t), __FILE__, __LINE__); uint32_t jumpByteIdx = reseed_data >> 6; raux->num_hits = (reseed_data & 0x3F); nextByteIdx = startByteIdx + jumpByteIdx; *byteIdx = nextByteIdx; } /** * Compute offset to start of leaf data * * @param raux read parameters * @param code type of child nodes (EMPTY:00, LEAF:01, UNIFORM:10, DIVERGE:11) - 2b encoded * @param c next character in read (used to traverse tree) * */ static inline int getOffsetToLeafData(read_aux_t* raux, uint8_t code, uint8_t c) { return leaf_table[((raux->ptr_width - 2) << 8) + code][c]; } /** * This routine does depth-first tree traversal starting from an internal node to obtain all hits at leaf nodes * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param mem maximal-exact-match storage * @param bc next character in read (used to traverse tree) * @param hits list of hits for read */ void getNextByteIdx_dfs(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, mem_t* mem, uint8_t bc, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t ref_pos = 0; uint8_t c; c = 3 - bc; mem->skip_ref_fetch = 1; // MEM cannot be extended by leaf decompression uint8_t code = mlt_data[nextByteIdx++]; uint8_t code_c = (code >> (c << 1)) & 3; assert(code != 0); if (code_c == LEAF) { // Hit a leaf node int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { // Found a multi-hit leaf node nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } } else { // Single-hit leaf node raux->num_hits = 1; mem->hitcount += raux->num_hits; kv_push(uint64_t, *hits, leaf_data >> 1); } } else if (code_c == UNIFORM) { // Multi-character internal node int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; uint8_t k; uint64_t startByteIdx = nextByteIdx; for (k = 0; k < 4; ++k) { getNextByteIdx_dfs(raux, mlt_data, &startByteIdx, mem, k, hits); startByteIdx = nextByteIdx; } } else if (code_c == DIVERGE) { // Single-character internal node getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); uint8_t k; uint64_t startByteIdx = nextByteIdx; for (k = 0; k < 4; ++k) { getNextByteIdx_dfs(raux, mlt_data, &startByteIdx, mem, k, hits); startByteIdx = nextByteIdx; } } *byte_idx = nextByteIdx; } /** * Gather all hits for the MEM * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param mem MEM including hits * @param hits list of hits for read */ void leaf_gather(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, mem_t* mem, u64v* hits) { uint8_t k; uint64_t startByteIdx = *byte_idx; uint64_t tmpByteIdx = startByteIdx; for (k = 0; k < 4; ++k) { getNextByteIdx_dfs(raux, mlt_data, &tmpByteIdx, mem, k, hits); tmpByteIdx = startByteIdx; } } /** * Traverse tree during backward search(seeding). Similar to getNextByteIdx, except that we don't compute LEP * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param i index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void getNextByteIdx_backward(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t ref_pos = 0; uint8_t c, code, code_c; if (raux->read_buf[*i] != 4) { c = 3 - raux->read_buf[*i]; code = mlt_data[nextByteIdx++]; code_c = (code >> (c << 1)) & 3; assert(code != 0); } else { // Terminate MEM search when we hit an 'N' code_c = EMPTY; } if (code_c == EMPTY) { // Gather leaves later during forward traversal mem->rc_end = *i; mem->fetch_leaves = 1; } else if (code_c == LEAF) { // Hit a leaf node *i += 1; mem->rc_end = *i; int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { // Found a multi-hit leaf node nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 5; kv_push(uint64_t, *hits, ref_pos >> 1); ref_pos = 0; } // // We found a multi-hit. But to report hits in the same order as BWA-MEM, // we will fetch the hits again in a forward tree traversal // mem->fetch_leaves = 1; } else { // Single-hit leaf node mem->hitcount += 1; kv_push(uint64_t, *hits, leaf_data >> 1); } } else if (code_c == UNIFORM) { // Multi-character internal node uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((*i + j) >= raux->l_seq) { break; } if (raux->read_buf[*i+j] == 4) { break; } if (3 - raux->read_buf[*i+j] != unpackedBP[j]) { break; } } *i += j; if (j == countBP) { // We match all bps if (*i < raux->l_seq) { getNextByteIdx_backward(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { mem->rc_end = *i; } } else { // Did not match all bps. Gather leaves for MEM in a later forward traversal mem->rc_end = *i; mem->fetch_leaves = 1; } } else { // Single-character internal node getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); *i += 1; if (*i < raux->l_seq) { getNextByteIdx_backward(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { mem->rc_end = *i; } } *byte_idx = nextByteIdx; } /** * Traverse tree during backward search (reseeding). Terminate when fewer than 'raux->limit' hits are found for * an internal node * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param i index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for each read */ void getNextByteIdx_backward_wlimit(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t ref_pos = 0; uint8_t c, code, code_c; if (raux->read_buf[*i] != 4) { c = 3 - raux->read_buf[*i]; code = mlt_data[nextByteIdx++]; code_c = (code >> (c << 1)) & 3; assert(code != 0); } else { // Terminate MEM search when we hit an 'N' code_c = EMPTY; } if (code_c == EMPTY) { // Gather leaves later during forward traversal mem->rc_end = *i; mem->fetch_leaves = 1; } else if (code_c == LEAF) { // Hit a leaf node int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { // Found a multi-hit leaf node nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; // Hits exceed reseeding threshold if (raux->num_hits >= raux->limit) { mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 5; kv_push(uint64_t, *hits, ref_pos >> 1); ref_pos = 0; } *i += 1; } } // // We found a multi-hit. But to report hits in the same order as BWA-MEM, // we will fetch the hits again in a forward tree traversal // mem->fetch_leaves = 1; mem->rc_end = *i; } else if (code_c == UNIFORM) { // Multi-character internal node uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((*i + j) >= raux->l_seq) { break; } if (raux->read_buf[*i+j] == 4) { break; } if (3 - raux->read_buf[*i+j] != unpackedBP[j]) { break; } } *i += j; if (j == countBP) { // We match all bps if (*i < raux->l_seq) { getNextByteIdx_backward_wlimit(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { mem->rc_end = *i; mem->fetch_leaves = 1; } } else { // Did not match all bps. Gather leaves for MEM in a later forward traversal mem->rc_end = *i; mem->fetch_leaves = 1; } } else { // Single-character internal node raux->num_hits = 0; getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); // In the internal nodes, raux->num_hits = 0 is used to reprsent # hits > 20 if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { *i += 1; if (*i < raux->l_seq) { getNextByteIdx_backward_wlimit(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { mem->rc_end = *i; mem->fetch_leaves = 1; } } else { mem->rc_end = *i; mem->fetch_leaves = 1; } } *byte_idx = nextByteIdx; } /** * Traverse tree during forward search (seeding). * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param i index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void getNextByteIdx(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t parent_byte_idx = nextByteIdx; uint64_t ref_pos = 0; uint8_t c, code, code_c; if (raux->read_buf[*i] != 4) { c = 3 - raux->read_buf[*i]; code = mlt_data[nextByteIdx++]; code_c = (code >> (c << 1)) & 3; assert(code != 0); } else { // Terminate MEM search when we hit an 'N' code_c = EMPTY; } uint64_t lep_idx = 0; uint64_t lep_bit_idx = 0; if (code_c == EMPTY) { if (mem->start == 0) { // FIXME: Gather leaves later during forward traversal even for MEMs starting at read_pos = 0 int mem_len = *i; if (mem_len >= raux->min_seed_len) { // Only gather leaves when MEM length exceeds threshold uint64_t startByteIdx = parent_byte_idx; leaf_gather(raux, mlt_data, &startByteIdx, mem, hits); } } // Update LEP for backward search lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); raux->nextLEPBit += 1; } else if (code_c == LEAF) { // Hit a leaf node int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { // Found a multi-hit leaf node nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } } else { // Single-hit leaf node raux->num_hits = 1; mem->hitcount += raux->num_hits; kv_push(uint64_t, *hits, leaf_data >> 1); } // Update LEP for backward search lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); raux->nextLEPBit += 1; *i += 1; } else if (code_c == UNIFORM) { // Multi-character internal node uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((*i + j) >= raux->l_seq) { // Don't run past the end of the read break; } if (raux->read_buf[*i+j] == 4) { break; } if ((3 - raux->read_buf[*i+j]) != unpackedBP[j]) { break; } } raux->nextLEPBit += j; *i += j; if (j == countBP) { // If we match all bases of uniform entry if (*i == raux->l_seq) { // Check if we reached the end of the read if (mem->start == 0) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); } else { if (*i < raux->l_seq) { getNextByteIdx(raux, mlt_data, &nextByteIdx, i, mem, hits); } } } else { // // We did not match all bases of uniform entry // Fetch all hits from leaf nodes for backward extension (dfs :( ) // assert(*i <= raux->l_seq); if (mem->start == 0) { int mem_len = *i; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } // Update LEP to start backward search from last matching bp lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); } } else { // Single-character internal node getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); raux->nextLEPBit += 1; *i += 1; if (*i < raux->l_seq) { getNextByteIdx(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { if (mem->start == 0) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit += 1; } } *byte_idx = nextByteIdx; } /** * Traverse tree during forward search (reseeding). Terminate when fewer than 'raux->limit' hits are found for an * internal node * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param i index into read buffer * @param mem maximal-exact-match storage * @param visited stack to store list of visited nodes * @param hits list of hits for read */ void getNextByteIdx_wlimit(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, path_v* visited, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t parent_byte_idx = nextByteIdx; uint64_t ref_pos = 0; uint8_t c, code, code_c; if (raux->read_buf[*i] != 4) { c = 3 - raux->read_buf[*i]; code = mlt_data[nextByteIdx++]; code_c = (code >> (c << 1)) & 3; assert(code != 0); } else { // Terminate MEM search when we hit an 'N' code_c = EMPTY; } uint64_t lep_idx = 0; uint64_t lep_bit_idx = 0; if (code_c == EMPTY) { if (mem->start == 0) { int mem_len = *i; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &parent_byte_idx, mem, hits); } } // Update LEP for backward search lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); raux->nextLEPBit += 1; } else if (code_c == LEAF) { // Hit a leaf node int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { // Found a multi-hit leaf node nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; } else { // Single-hit leaf node raux->num_hits = 1; } // Hits exceed reseeding threshold if (raux->num_hits >= raux->limit) { mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } *i += 1; } else { // Do DFS traversal to gather leaves starting from parent node if (mem->start == 0) { int mem_len = *i; if (mem_len >= raux->min_seed_len) { node_info_t* tmp_node_info = &kv_pop(*visited); leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits); } } } // Update LEP for backward search lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); raux->nextLEPBit += 1; } else if (code_c == UNIFORM) { // Multi-character internal node uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((*i + j) >= raux->l_seq) { // Don't run past the end of the read break; } if (raux->read_buf[*i+j] == 4) { break; } if ((3 - raux->read_buf[*i+j]) != unpackedBP[j]) { break; } } raux->nextLEPBit += j; *i += j; if (j == countBP) { // If we match all bases of uniform entry if (*i == raux->l_seq) { // Check if we reached the end of the read if (mem->start == 0) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); } else { if (*i < raux->l_seq) { getNextByteIdx_wlimit(raux, mlt_data, &nextByteIdx, i, mem, visited, hits); } } } else { // // We did not match all bases of uniform entry // Fetch all hits from leaf nodes for backward extension (dfs :( ) // assert(*i <= raux->l_seq); if (mem->start == 0) { int mem_len = *i; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); } } else { getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); lep_idx = raux->nextLEPBit >> 6; lep_bit_idx = raux->nextLEPBit & (0x3FULL); raux->lep[lep_idx] |= (1ULL << lep_bit_idx); raux->nextLEPBit += 1; // In the internal nodes, raux->num_hits = 0 is used to reprsent # hits > 20 if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { node_info_t nif; nif.byte_idx = nextByteIdx; nif.num_hits = raux->num_hits; kv_push(node_info_t, *visited, nif); *i += 1; if (*i < raux->l_seq) { getNextByteIdx_wlimit(raux, mlt_data, &nextByteIdx, i, mem, visited, hits); } else { if (mem->start == 0) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit += 1; } } else { // Do DFS traveral to gather leaves from parent node if (mem->start == 0) { int mem_len = *i; if (mem_len >= raux->min_seed_len) { node_info_t* tmp_node_info = &kv_pop(*visited); leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits); } } } } *byte_idx = nextByteIdx; } /** * Traverse tree during forward search (LAST). Terminate when fewer than 'raux->limit' hits are found for an internal node * Minimum MEM length for LAST is opt->min_seed_len + 1 * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param i index into read buffer * @param mem maximal-exact match found * @param visited stack to store list of visited nodes * @param hits list of hits for read */ void getNextByteIdx_last(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int* i, mem_t* mem, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t ref_pos = 0; uint8_t c, code, code_c; if (raux->read_buf[*i] != 4) { c = 3 - raux->read_buf[*i]; code = mlt_data[nextByteIdx++]; code_c = (code >> (c << 1)) & 3; assert(code != 0); } else { // Terminate MEM search when we hit an 'N' code_c = EMPTY; } if (code_c == EMPTY) { *i += 1; } else if (code_c == LEAF) { // Hit a leaf node int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { // multi-hit leaf nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } } else { // single-hit leaf raux->num_hits = 1; mem->hitcount += raux->num_hits; kv_push(uint64_t, *hits, leaf_data >> 1); } *i += 1; } else if (code_c == UNIFORM) { // Multi-character internal node uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((*i + j) >= raux->l_seq) { //!< Don't run past the end of the read break; } if (raux->read_buf[*i+j] == 4) { break; } if ((3 - raux->read_buf[*i+j]) != unpackedBP[j]) { break; } } *i += j; int len = *i - mem->start; // // LAST stop criterion: MEM is sufficiently long and not too frequent // int stop_extension = (raux->num_hits > 0 && raux->num_hits < raux->limit && len >= (raux->min_seed_len + 1)) ? 1 : 0; if (stop_extension) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); *i = mem->start + (raux->min_seed_len + 1); } else { if (j == countBP) { if (*i < raux->l_seq) { getNextByteIdx_last(raux, mlt_data, &nextByteIdx, i, mem, hits); } } else { *i += 1; } } } else if (code_c == DIVERGE) { getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); *i += 1; int len = *i - mem->start; // // LAST stop criterion: MEM is sufficiently long and not too frequent // int stop_extension = (raux->num_hits > 0 && raux->num_hits < raux->limit && len >= (raux->min_seed_len + 1)) ? 1 : 0; if (stop_extension) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } else { if (*i < raux->l_seq) { getNextByteIdx_last(raux, mlt_data, &nextByteIdx, i, mem, hits); } } } *byte_idx = nextByteIdx; } /** * Main backward search function (seeding). Lookup up k-mer and/or x-mer table and identify root of ERT * * @param iaux index parameters * @param raux read parameters * @param i index into read buffer * @param mem maximal-exact match found * @param hits list of hits for read */ void leftExtend(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, 0, &idx_first_N); if (idx_first_N != -1) { *i += (kmerSize + xmerSize); mem->rc_end = *i; return; } // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; // width used for internal pointers in tree raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); byte_idx = 0; if (code == INVALID) { // k-mer absent *i += (kmerSize + xmerSize); mem->rc_end = *i; } else if (code == SINGLE_HIT_LEAF) { // single-hit k-mer mlt_data = &iaux->mlt_table[start_addr]; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 5; mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); *i += kmerSize; mem->rc_end = *i; } else if (code == INFREQUENT) { // k-mer has fewer than 256 hits *i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; if (*i < raux->l_seq) { // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; getNextByteIdx_backward(raux, mlt_data, &byte_idx, i, mem, hits); } else { mem->rc_end = *i; } } else { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; *i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, 0, &idx_first_N); memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; ptr = xmer_entry >> KMER_DATA_BITWIDTH; if (idx_first_N != -1) { *i += xmerSize; mem->rc_end = *i; return; } if (code == INVALID) { *i += xmerSize; mem->rc_end = *i; } else if (code == SINGLE_HIT_LEAF) { byte_idx = ptr; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 5; mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); *i += xmerSize; mem->rc_end = *i; } else { byte_idx = ptr; *i += xmerSize; if (*i < raux->l_seq) { getNextByteIdx_backward(raux, mlt_data, &byte_idx, i, mem, hits); } else { mem->rc_end = *i; } } } } /** * Main backward search function (reseeding). Lookup up k-mer and/or x-mer table and identify root of ERT. * Return early if root node has fewer than 'raux->limit' hits * * @param iaux index parameters * @param raux read parameters * @param i index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void leftExtend_wlimit(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, 0, &idx_first_N); if (idx_first_N != -1) { *i += (kmerSize + xmerSize); mem->rc_end = *i; return; } // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; // width used for internal pointers in tree raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); // # hits for k-mer (0 if > 20 hits) raux->num_hits = (kmer_entry >> 17) & 0x1F; byte_idx = 0; if (code == INVALID) { // k-mer absent *i += (kmerSize + xmerSize); mem->rc_end = *i; } else if (code == SINGLE_HIT_LEAF) { // single-hit k-mer *i += (kmerSize + xmerSize); mem->rc_end = *i; } else if (code == INFREQUENT) { // k-mer has fewer than 256 hits *i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { if (*i < raux->l_seq) { // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; getNextByteIdx_backward_wlimit(raux, mlt_data, &byte_idx, i, mem, hits); } else { // Leaf gathering to be done later mem->rc_end = *i; mem->fetch_leaves = 1; } } else { mem->rc_end = *i; } } else { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; *i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, 0, &idx_first_N); // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // This helps in decoding nodes and creates compact trees // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; ptr = xmer_entry >> KMER_DATA_BITWIDTH; raux->num_hits = (xmer_entry >> 17) & 0x1F; if (idx_first_N != -1) { *i += xmerSize; mem->rc_end = *i; return; } if (code == INVALID) { *i += xmerSize; mem->rc_end = *i; } else if (code == SINGLE_HIT_LEAF) { *i += xmerSize; mem->rc_end = *i; } else { byte_idx = ptr; *i += xmerSize; if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { if (*i < raux->l_seq) { getNextByteIdx_backward_wlimit(raux, mlt_data, &byte_idx, i, mem, hits); } else { // Leaf gathering mem->rc_end = *i; mem->fetch_leaves = 1; } } else { mem->rc_end = *i; } } } } /** * Fetch hits for all MEMs identified after backward search (termination conditions based on reseeding) * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param idx index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void getNextByteIdx_fetch_leaves_prefix_reseed(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int idx, mem_t* mem, path_v* visited, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t parent_byte_idx = nextByteIdx; uint64_t ref_pos = 0; uint8_t c; int i = idx; assert(raux->read_buf[i] != 4); // Should not see N in SMEMs c = 3 - raux->read_buf[i]; uint8_t code = mlt_data[nextByteIdx++]; uint8_t code_c = (code >> (c << 1)) & 3; assert(code != 0); if (code_c == EMPTY) { // Do leaf gathering mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &parent_byte_idx, mem, hits); } } else if (code_c == LEAF) { int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; } else { raux->num_hits = 1; } if (raux->num_hits >= raux->limit) { mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } i += 1; mem->end = i; mem->is_multi_hit = 1; // decompress leaf node for potentially longer match } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { node_info_t* tmp_node_info = &kv_pop(*visited); leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits); } } } else if (code_c == UNIFORM) { uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((i + j) >= raux->l_seq) { break; } if (3 - raux->read_buf[i+j] != unpackedBP[j]) { break; } } i += j; if (j == countBP) { if (i < raux->l_seq) { getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &nextByteIdx, i, mem, visited, hits); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } else if (code_c == DIVERGE) { raux->num_hits = 0; getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { node_info_t nif; nif.byte_idx = nextByteIdx; nif.num_hits = raux->num_hits; kv_push(node_info_t, *visited, nif); i += 1; if (i < raux->l_seq) { getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &nextByteIdx, i, mem, visited, hits); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { node_info_t* tmp_node_info = &kv_pop(*visited); leaf_gather(raux, mlt_data, &tmp_node_info->byte_idx, mem, hits); } } } *byte_idx = nextByteIdx; } /** * Fetch hits for all MEMs identified after backward search (extend beyond mem->end) * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param idx index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void getNextByteIdx_fetch_leaves_prefix(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int idx, mem_t* mem, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t ref_pos = 0; uint8_t c; int i = idx; assert(raux->read_buf[i] != 4); // Should not see N in SMEMs c = 3 - raux->read_buf[i]; uint8_t code = mlt_data[nextByteIdx++]; uint8_t code_c = (code >> (c << 1)) & 3; assert(code != 0); if (code_c == EMPTY) { // Do leaf gathering mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { nextByteIdx = *byte_idx; leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } else if (code_c == LEAF) { int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } } else { raux->num_hits = 1; mem->hitcount += raux->num_hits; kv_push(uint64_t, *hits, leaf_data >> 1); } i += 1; mem->end = i; } else if (code_c == UNIFORM) { uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((i + j) >= raux->l_seq) { break; } if (3 - raux->read_buf[i+j] != unpackedBP[j]) { break; } } i += j; if (j == countBP) { if (i < raux->l_seq) { getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } else if (code_c == DIVERGE) { raux->num_hits = 0; getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); i += 1; if (i < raux->l_seq) { getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } } /** * Forward traversal to fetch hits for all MEMs identified after backward search. * * @param raux read parameters * @param mlt_data radix tree of k-mer * @param byteIdx byte index into the mlt_data radix tree * @param idx index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void getNextByteIdx_fetch_leaves(read_aux_t* raux, uint8_t* mlt_data, uint64_t* byte_idx, int idx, mem_t* mem, u64v* hits) { uint64_t nextByteIdx = *byte_idx; uint64_t ref_pos = 0; uint8_t c; int i = idx; assert(raux->read_buf[i] != 4); // Should not see N in SMEMs c = 3 - raux->read_buf[i]; uint8_t code = mlt_data[nextByteIdx++]; uint8_t code_c = (code >> (c << 1)) & 3; assert(code != 0); assert(code_c != EMPTY); if (code_c == LEAF) { int k; uint64_t leaf_data = 0; nextByteIdx += getOffsetToLeafData(raux, code, c); memcpy_bwamem(&leaf_data, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); if (leaf_data & 1) { nextByteIdx = raux->mh_start_addr + (leaf_data >> 1); memcpy_bwamem(&raux->num_hits, 2 * sizeof(uint8_t), &mlt_data[nextByteIdx], 2 * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += 2; mem->hitcount += raux->num_hits; for (k = 0; k < raux->num_hits; ++k) { memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[nextByteIdx], 5 * sizeof(uint8_t), __FILE__, __LINE__); kv_push(uint64_t, *hits, ref_pos >> 1); nextByteIdx += 5; ref_pos = 0; } } else { raux->num_hits = 1; mem->hitcount += raux->num_hits; kv_push(uint64_t, *hits, leaf_data >> 1); } i += 1; } else if (code_c == UNIFORM) { uint32_t j; int countBP = *((uint16_t*) &mlt_data[nextByteIdx]); nextByteIdx += 2; int numBitsForBP = countBP << 1; int numBytesForBP = (numBitsForBP % 8) ? (numBitsForBP / 8 + 1) : (numBitsForBP / 8); uint8_t packedBP[numBytesForBP]; memcpy_bwamem(packedBP, numBytesForBP * sizeof(uint8_t), &mlt_data[nextByteIdx], numBytesForBP * sizeof(uint8_t), __FILE__, __LINE__); nextByteIdx += numBytesForBP; // Unpack base pairs uint8_t unpackedBP[countBP]; for (j = 0; j < countBP; ++j) { unpackedBP[j] = ((packedBP[j >> 2] >> ((~(j) & 3) << 1)) & 3); } // Count number of matching base pairs with read for (j = 0; j < countBP; ++j) { if ((i + j) >= raux->l_seq) { break; } if (3 - raux->read_buf[i+j] != unpackedBP[j]) { break; } } i += j; if (j == countBP) { if (i < mem->end) { // only extend till end of MEM found during previous backward search getNextByteIdx_fetch_leaves(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } else { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } else if (code_c == DIVERGE) { raux->num_hits = 0; getOffsetToChildNode(raux, mlt_data, code, c, &nextByteIdx); i += 1; if (i < mem->end) { // only extend till end of MEM found during previous backward search getNextByteIdx_fetch_leaves(raux, mlt_data, &nextByteIdx, i, mem, hits); } else { leaf_gather(raux, mlt_data, &nextByteIdx, mem, hits); } } } /** * Extend as much as possible to the right based on reseeding criteria. * * Return early if root node has fewer than 'raux->limit' hits * * @param iaux index parameters * @param raux read parameters * @param mem maximal-exact-match storage * @param hits list of hits for read */ void rightExtend_fetch_leaves_prefix_reseed(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; int flag = 0; int i = mem->start; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[i], kmerSize, i, raux->l_seq, &flag, &idx_first_N); // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; raux->mh_start_addr = 0; // width used for internal pointers in tree raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); // # hits for k-mer (0 if > 20 hits) raux->num_hits = (kmer_entry >> 17) & 0x1F; byte_idx = 0; assert(code != INVALID); if (code == SINGLE_HIT_LEAF) { mem->end = i; } else if (code == INFREQUENT) { // k-mer has fewer than 256 hits if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; if (i < raux->l_seq) { path_v visited; kv_init(visited); node_info_t nif; nif.byte_idx = byte_idx; nif.num_hits = raux->num_hits; kv_push(node_info_t, visited, nif); getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &byte_idx, i, mem, &visited, hits); kv_destroy(visited); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } } else { mem->end = i; } } else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; mlt_data = &iaux->mlt_table[start_addr]; hashval = getHashKey(&raux->read_buf[i + kmerSize], xmerSize, i + kmerSize, raux->l_seq, 0, &idx_first_N); memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; ptr = xmer_entry >> KMER_DATA_BITWIDTH; // 5 bits to encode number of hits at each node raux->num_hits = (xmer_entry >> 17) & 0x1F; if (code == INVALID) { mem->end = i; } else if (code == SINGLE_HIT_LEAF) { mem->end = i; } else { // When a node has greater than 20 (opt->max_mem->intv) hits, we store a 0 in the hits field if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { byte_idx = ptr; i += (kmerSize + xmerSize); if (i < raux->l_seq) { path_v visited; kv_init(visited); node_info_t nif; nif.byte_idx = byte_idx; nif.num_hits = raux->num_hits; kv_push(node_info_t, visited, nif); getNextByteIdx_fetch_leaves_prefix_reseed(raux, mlt_data, &byte_idx, i, mem, &visited, hits); kv_destroy(visited); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } } else { mem->end = i; } } } } /** * Extend as much as possible to the right and gather leaves if MEM length >= opt->min_seed_len. * * @param iaux index parameters * @param raux read parameters * @param mem maximal-exact-match storage * @param hits list of hits for read */ void rightExtend_fetch_leaves_prefix(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; int flag = 0; int i = mem->start; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[i], kmerSize, i, raux->l_seq, &flag, &idx_first_N); // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; raux->mh_start_addr = 0; // width used for internal pointers in tree, 2 bits, ptr_width = 4 is encoded as 0 raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); byte_idx = 0; assert(code != INVALID); if (code == SINGLE_HIT_LEAF) { mlt_data = &iaux->mlt_table[start_addr]; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); byte_idx += 5; i += kmerSize; mem->end = i; } else if (code == INFREQUENT) { // k-mer has fewer than 256 hits i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; if (i < raux->l_seq) { getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &byte_idx, i, mem, hits); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } } else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; mlt_data = &iaux->mlt_table[start_addr]; hashval = getHashKey(&raux->read_buf[i + kmerSize], xmerSize, i + kmerSize, raux->l_seq, 0, &idx_first_N); memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; ptr = xmer_entry >> KMER_DATA_BITWIDTH; if (code == INVALID) { mem->end = i; } else if (code == SINGLE_HIT_LEAF) { byte_idx = ptr; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); byte_idx += 5; i += (kmerSize + xmerSize); mem->end = i; } else { byte_idx = ptr; i += (kmerSize + xmerSize); if (i < raux->l_seq) { getNextByteIdx_fetch_leaves_prefix(raux, mlt_data, &byte_idx, i, mem, hits); } else { mem->end = i; int mem_len = mem->end - mem->start; if (mem_len >= raux->min_seed_len) { leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } } } } /** * Fetch hits for all MEMs identified after backward search. * * Note that backward search functions above only perform tree traversal without gathering hits as they will be fetched in * a different order than required by BWA-MEM (i.e., all hits for MEM need to be sorted by right-context). We re-traverse and * the tree and fetch the hits in the correct order below (this has a minor performance penalty) * * Return early if root node has fewer than 'raux->limit' hits * * @param iaux index parameters * @param raux read parameters * @param mem maximal-exact-match storage * @param hits list of hits for read */ void rightExtend_fetch_leaves(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; int flag = 0; int i = mem->start; int end = mem->end; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[i], kmerSize, i, raux->l_seq, &flag, &idx_first_N); // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; raux->mh_start_addr = 0; // width used for internal pointers in tree raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); byte_idx = 0; assert(code != INVALID); assert(code != SINGLE_HIT_LEAF); if (code == INFREQUENT) { // k-mer has fewer than 256 hits i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; if (i < end) { getNextByteIdx_fetch_leaves(raux, mlt_data, &byte_idx, i, mem, hits); } else { leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; hashval = getHashKey(&raux->read_buf[i], xmerSize, i, raux->l_seq, 0, &idx_first_N); memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; ptr = xmer_entry >> KMER_DATA_BITWIDTH; assert(code != INVALID); assert(code != SINGLE_HIT_LEAF); byte_idx = ptr; i += xmerSize; if (i < end) { getNextByteIdx_fetch_leaves(raux, mlt_data, &byte_idx, i, mem, hits); } else { leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } } /** * Main forward search function (seeding). Lookup up k-mer and/or x-mer table and identify root of ERT * * @param iaux index parameters * @param raux read parameters * @param i Index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void rightExtend(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; uint64_t lep_data = 0; int flag = 0; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, &flag, &idx_first_N); // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; lep_data = (kmer_entry >> METADATA_BITWIDTH) & LEP_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; uint64_t mlt_start_addr = raux->mlt_start_addr = start_addr; raux->mh_start_addr = 0; // width used for internal pointers in tree raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); // LEP takes up kmerSize-1 bits. Last LEP bit is at position = kmerSize-2. if (*i <= 64-kmerSize) { raux->lep[0] |= (lep_data << *i); } else if (*i < 64) { raux->lep[0] |= (lep_data << *i); raux->lep[1] |= (lep_data >> (64-*i)); } else if (*i <= 128-kmerSize) { raux->lep[1] |= (lep_data << (*i-64)); } else if (*i < 128) { raux->lep[1] |= (lep_data << (*i-64)); raux->lep[2] |= (lep_data >> (128-*i)); } else if (*i <= 192-kmerSize) { raux->lep[2] |= (lep_data << (*i-128)); } else if (*i < 192) { raux->lep[2] |= (lep_data << (*i-128)); raux->lep[3] |= (lep_data >> (192-*i)); } else if (*i <= 256-kmerSize) { raux->lep[3] |= (lep_data << (*i-192)); } else if (*i < 256) { raux->lep[3] |= (lep_data << (*i-192)); raux->lep[4] |= (lep_data >> (256-*i)); } else { raux->lep[4] |= (lep_data << (*i-256)); } raux->nextLEPBit = *i + kmerSize - 1; byte_idx = 0; // We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP if (idx_first_N != -1) { if (*i != 0) { raux->nextLEPBit = *i + idx_first_N - 1; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); } *i += idx_first_N; return; } if (flag) { raux->nextLEPBit = (raux->l_seq - 1); *i = raux->l_seq; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); return; } if (code == INVALID) { // Do backward extension using LEP *i += (kmerSize + xmerSize); } else if (code == SINGLE_HIT_LEAF) { mlt_data = &iaux->mlt_table[start_addr]; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); byte_idx += 5; *i += kmerSize; } else if (code == INFREQUENT) { // k-mer has fewer than 256 hits *i += kmerSize; mlt_data = &iaux->mlt_table[start_addr]; if (*i < raux->l_seq) { // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; getNextByteIdx(raux, mlt_data, &byte_idx, i, mem, hits); } else { raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit += 1; } } else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; *i += kmerSize; int k; uint64_t lepBit = 0; flag = 0; mlt_data = &iaux->mlt_table[mlt_start_addr]; hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, &flag, &idx_first_N); memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; lep_data = (xmer_entry >> METADATA_BITWIDTH) & 0xF; ptr = xmer_entry >> KMER_DATA_BITWIDTH; int xmerLen = 0; if (raux->l_seq - *i > xmerSize) { xmerLen = xmerSize; } else { xmerLen = raux->l_seq - *i; } for (k = 0; k < xmerLen; ++k) { lepBit = (lep_data >> k) & 1; raux->lep[raux->nextLEPBit >> 6] |= (lepBit << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit++; } // We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP if (idx_first_N != -1) { raux->nextLEPBit = *i + idx_first_N - 1; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); *i += idx_first_N; return; } if (flag) { raux->nextLEPBit = (raux->l_seq - 1); *i = raux->l_seq; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); return; } if (code == INVALID) { *i += xmerSize; } else if (code == SINGLE_HIT_LEAF) { byte_idx = ptr; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); byte_idx += 5; *i += xmerSize; } else { byte_idx = ptr; *i += xmerSize; if (*i < raux->l_seq) { getNextByteIdx(raux, mlt_data, &byte_idx, i, mem, hits); } else { raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit += 1; } } } } /** * Main forward search function (reseeding). Lookup up k-mer and/or x-mer table and identify root of ERT * Return early if root node has fewer than 'raux->limit' hits * * @param iaux index parameters * @param raux read parameters * @param i Index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void rightExtend_wlimit(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, kmer_entry = 0, start_addr = 0; uint32_t hashval = 0; uint64_t lep_data = 0; int flag = 0; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, &flag, &idx_first_N); // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; // index-table entry type code = kmer_entry & METADATA_MASK; lep_data = (kmer_entry >> METADATA_BITWIDTH) & LEP_MASK; // pointer to root of tree start_addr = kmer_entry >> KMER_DATA_BITWIDTH; uint64_t mlt_start_addr = raux->mlt_start_addr = start_addr; raux->mh_start_addr = 0; // width used for internal pointers in tree, 2 bits, ptr_width = 4 is encoded as 0 raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); raux->num_hits = (kmer_entry >> 17) & 0x1F; // LEP takes up kmerSize-1 bits. Last LEP bit is at position = kmerSize-2. if (*i <= 64-kmerSize) { raux->lep[0] |= (lep_data << *i); } else if (*i < 64) { raux->lep[0] |= (lep_data << *i); raux->lep[1] |= (lep_data >> (64-*i)); } else if (*i <= 128-kmerSize) { raux->lep[1] |= (lep_data << (*i-64)); } else if (*i < 128) { raux->lep[1] |= (lep_data << (*i-64)); raux->lep[2] |= (lep_data >> (128-*i)); } else if (*i <= 192-kmerSize) { raux->lep[2] |= (lep_data << (*i-128)); } else if (*i < 192) { raux->lep[2] |= (lep_data << (*i-128)); raux->lep[3] |= (lep_data >> (192-*i)); } else if (*i <= 256-kmerSize) { raux->lep[3] |= (lep_data << (*i-192)); } else if (*i < 256) { raux->lep[3] |= (lep_data << (*i-192)); raux->lep[4] |= (lep_data >> (256-*i)); } else { raux->lep[4] |= (lep_data << (*i-256)); } raux->nextLEPBit = *i + kmerSize - 1; byte_idx = 0; // We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP if (idx_first_N != -1) { if (*i != 0) { raux->nextLEPBit = *i + idx_first_N - 1; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); } *i += idx_first_N; return; } if (flag) { raux->nextLEPBit = (raux->l_seq - 1); *i = raux->l_seq; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); return; } if (code == INVALID) { // Do backward extension using LEP *i += (kmerSize + xmerSize); } else if (code == SINGLE_HIT_LEAF) { *i += (kmerSize + xmerSize); } else if (code == INFREQUENT) { // k-mer has fewer than 256 hits *i += kmerSize; mlt_data = &iaux->mlt_table[mlt_start_addr]; if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { if (*i < raux->l_seq) { path_v visited; kv_init(visited); node_info_t nif; nif.byte_idx = byte_idx; nif.num_hits = raux->num_hits; kv_push(node_info_t, visited, nif); // // First 4 bytes of tree store the start of all multi-hit leaves for the k-mer // memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; getNextByteIdx_wlimit(raux, mlt_data, &byte_idx, i, mem, &visited, hits); kv_destroy(visited); } else { raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit += 1; } } } else if (code == FREQUENT) { // k-mer has large, dense tree, do an additional x-mer lookup uint64_t xmer_entry; uint64_t ptr = 0; *i += kmerSize; int k; uint64_t lepBit = 0; flag = 0; mlt_data = &iaux->mlt_table[mlt_start_addr]; hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, &flag, &idx_first_N); memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; lep_data = (xmer_entry >> METADATA_BITWIDTH) & 0xF; ptr = xmer_entry >> KMER_DATA_BITWIDTH; // 5 bits to encode hits for each node raux->num_hits = (xmer_entry >> 17) & 0x1F; int xmerLen = 0; if (raux->l_seq - *i > xmerSize) { xmerLen = xmerSize; } else { xmerLen = raux->l_seq - *i; } for (k = 0; k < xmerLen; ++k) { lepBit = (lep_data >> k) & 1; raux->lep[raux->nextLEPBit >> 6] |= (lepBit << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit++; } // We found an ambiguous base in the kmer. Stop extension at ambiguous base and record LEP if (idx_first_N != -1) { raux->nextLEPBit = *i + idx_first_N - 1; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); *i += idx_first_N; return; } if (flag) { raux->nextLEPBit = (raux->l_seq - 1); *i = raux->l_seq; raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); return; } if (code == INVALID) { *i += xmerSize; } else if (code == SINGLE_HIT_LEAF) { *i += xmerSize; } else { byte_idx = ptr; *i += xmerSize; if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit)) { if (*i < raux->l_seq) { path_v visited; kv_init(visited); node_info_t nif; nif.byte_idx = byte_idx; nif.num_hits = raux->num_hits; kv_push(node_info_t, visited, nif); getNextByteIdx_wlimit(raux, mlt_data, &byte_idx, i, mem, &visited, hits); kv_destroy(visited); } else { raux->lep[raux->nextLEPBit >> 6] |= (1ULL << (raux->nextLEPBit & (0x3FULL))); raux->nextLEPBit += 1; } } } } } /* * Main forward search function for LAST heuristic * * @param iaux index parameters * @param raux read parameters * @param i Index into read buffer * @param mem maximal-exact-match storage * @param hits list of hits for read */ void rightExtend_last(index_aux_t* iaux, read_aux_t* raux, int* i, mem_t* mem, u64v* hits) { uint8_t code; uint8_t* mlt_data; uint64_t byte_idx = 0, ref_pos = 0, kmer_entry = 0, start_addr = 0; int flag = 0; uint32_t hashval = 0; int idx_first_N = -1; hashval = getHashKey(&raux->read_buf[*i], kmerSize, *i, raux->l_seq, &flag, &idx_first_N); // We found an ambiguous base in the kmer. Stop extension if (idx_first_N != -1) { *i += (idx_first_N + 1); return; } if (flag) { *i = raux->l_seq; return; } // index-table lookup kmer_entry = iaux->kmer_offsets[hashval]; code = kmer_entry & METADATA_MASK; start_addr = kmer_entry >> KMER_DATA_BITWIDTH; uint64_t mlt_start_addr = raux->mlt_start_addr = start_addr; raux->mh_start_addr = 0; raux->ptr_width = (((kmer_entry >> 22) & 3) == 0) ? 4 : ((kmer_entry >> 22) & 3); raux->num_hits = (kmer_entry >> 17) & 0x1F; byte_idx = 0; if (code == INVALID) { // Do backward extension using LEP *i += kmerSize; } else if (code == SINGLE_HIT_LEAF) { mlt_data = &iaux->mlt_table[mlt_start_addr]; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); byte_idx += 5; *i += kmerSize; } else if (code == INFREQUENT) { *i += kmerSize; mlt_data = &iaux->mlt_table[mlt_start_addr]; if (*i < raux->l_seq) { // Length <= (kmer + xmer). Need not check num_hits here. memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; getNextByteIdx_last(raux, mlt_data, &byte_idx, i, mem, hits); } } else if (code == FREQUENT) { uint64_t xmer_entry; uint64_t ptr = 0; *i += kmerSize; flag = 0; mlt_data = &iaux->mlt_table[mlt_start_addr]; hashval = getHashKey(&raux->read_buf[*i], xmerSize, *i, raux->l_seq, &flag, &idx_first_N); // We found an ambiguous base in the kmer. Stop extension if (idx_first_N != -1) { *i += (idx_first_N + 1); return; } if (flag) { *i = raux->l_seq; return; } memcpy_bwamem(&raux->mh_start_addr, 4 * sizeof(uint8_t), &mlt_data[byte_idx], 4 * sizeof(uint8_t), __FILE__, __LINE__); byte_idx += 4; memcpy_bwamem(&xmer_entry, 8 * sizeof(uint8_t), &mlt_data[byte_idx + (hashval << 3)], 8 * sizeof(uint8_t), __FILE__, __LINE__); code = xmer_entry & METADATA_MASK; ptr = xmer_entry >> KMER_DATA_BITWIDTH; raux->num_hits = (xmer_entry >> 17) & 0x1F; if (code == INVALID) { *i += xmerSize; } else if (code == SINGLE_HIT_LEAF) { byte_idx = ptr; byte_idx++; memcpy_bwamem(&ref_pos, 5 * sizeof(uint8_t), &mlt_data[byte_idx], 5 * sizeof(uint8_t), __FILE__, __LINE__); mem->hitcount += 1; kv_push(uint64_t, *hits, ref_pos >> 1); byte_idx += 5; *i += xmerSize; } else { byte_idx = ptr; *i += xmerSize; if ((raux->num_hits == 0) || (raux->num_hits >= raux->limit) || ((*i - mem->start) < (raux->min_seed_len + 1))) { if (*i < raux->l_seq) { getNextByteIdx_last(raux, mlt_data, &byte_idx, i, mem, hits); } } else { // number of hits is less than the limit and seed length >= opt->min_seed_len leaf_gather(raux, mlt_data, &byte_idx, mem, hits); } } } } /* * Initialize MEM parameters for backward extension * * @param lep LEP bit vector * @param mem maximal-exact-match * @param j Index into LEP * @param seq_len Read length * @param min_seed_len Minimum seed length * * @return mem_valid valid flag to indicate if backward search needs to be performed from given position j */ static inline int init_mem(uint64_t* lep, mem_t* mem, int j, int seq_len, int min_seed_len) { int lep_bit_set = (lep[j >> 6] >> (j & 63)) & 1; int in_valid_range = (j >= (min_seed_len-1)) ? 1 : 0; int mem_valid = (lep_bit_set && in_valid_range); mem->end = j + 1; // [start, end) indexing mem->rc_start = seq_len - j - 1; mem->rc_end = mem->rc_start; mem->skip_ref_fetch = 0; mem->forward = 0; mem->fetch_leaves = 0; mem->hitbeg = mem->hitcount = 0; mem->end_correction = 0; mem->is_multi_hit = 0; return mem_valid; } /* * Compute final SMEMs and their hits after considering their overlaps * * Leaf expansion is also performed to get the actual length of the MEM. * Note that leaf nodes store compressed suffixes by including a pointer to the reference genome * * @param iaux index related parameters * @param raux read related parameters * @param mem maximal-exact-match storage * @param sh helper data structure to keep track of start and end positions of previously identified MEMs * @param smems List of SMEMs * * @return n number of backward extensions to skip */ int check_and_add_smem_prefix_reseed(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, smem_helper_t* sh, mem_v* smems, u64v* hits) { mem->start = raux->l_seq - mem->rc_end; // Adjust start position of LMEM int lmemLen = mem->end - mem->start, rmemLen = -1, next_be_point; if (mem->hitcount > 0 && !mem->skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[mem->hitbeg] - mem->rc_start; int64_t end_ref_pos = hits->a[mem->hitbeg]; // Fetch reference to check for extra matching bps on the right uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m, numMatchingBP = 0; for (m = 1; m <= len; ++m) { if (rseq[mem->rc_start - m] == raux->read_buf[mem->rc_start - m]) { numMatchingBP++; } else { break; } } mem->end += numMatchingBP; mem->end_correction += numMatchingBP; start_ref_pos = hits->a[mem->hitbeg] + lmemLen; end_ref_pos = start_ref_pos + mem->start; // Fetch reference to check for extra matching bps on the left rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); numMatchingBP = 0; for (m = 0; m < len; ++m) { if (rseq[m] == raux->read_buf[mem->rc_end + m]) { numMatchingBP++; } else { break; } } mem->start -= numMatchingBP; } // Adjust start position of MEM by extra matching bps lmemLen = mem->end - mem->start; next_be_point = mem->end; if (mem->hitcount == 1) { if (lmemLen >= raux->min_seed_len) { kv_push(mem_t, *smems, *mem); } else { next_be_point += (raux->min_seed_len - lmemLen); } } // perform forward extension only for non-empty MEMs from backward extension else if (mem->fetch_leaves && (mem->start <= (raux->l_seq - raux->min_seed_len))) { hits->n -= mem->hitcount; mem->hitbeg = hits->n; mem->hitcount = 0; raux->read_buf = raux->unpacked_queue_buf; rightExtend_fetch_leaves_prefix_reseed(iaux, raux, mem, hits); raux->read_buf = raux->unpacked_rc_queue_buf; rmemLen = mem->end - mem->start; next_be_point = mem->end; if (mem->hitcount > 0) { if (mem->is_multi_hit) { int64_t len; int64_t start_ref_pos = hits->a[mem->hitbeg] + rmemLen; int64_t end_ref_pos = hits->a[mem->hitbeg] + raux->l_seq - mem->start; /// Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m; int numMatchingBP = 0; /// Check for matching bases for (m = 0; m < len; ++m) { if (rseq[m] == raux->unpacked_queue_buf[mem->end + m]) { numMatchingBP++; } else { break; } } mem->end += numMatchingBP; rmemLen = mem->end - mem->start; next_be_point = mem->end; } if (rmemLen >= raux->min_seed_len && mem->end <= sh->mem_end_limit) { kv_push(mem_t, *smems, *mem); } else { next_be_point += (raux->min_seed_len - rmemLen); } } else { // we don't have min_seed_len match for this start position assert(rmemLen <= raux->min_seed_len); next_be_point += (raux->min_seed_len - rmemLen); } } else { if (lmemLen <= raux->min_seed_len) { next_be_point += (raux->min_seed_len - lmemLen); } } return next_be_point; } /* * Compute final SMEMs and their hits after considering their overlaps * * Leaf expansion is also performed to get the actual length of the MEM. * Note that leaf nodes store compressed suffixes by including a pointer to the reference genome * * @param iaux index related parameters * @param raux read related parameters * @param mem maximal-exact-match storage * @param sh helper data structure to keep track of start and end positions of previously identified MEMs * @param smems list of SMEMs * @param hits list of hits for read * * @return n number of backward extensions to skip */ int check_and_add_smem_prefix(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, smem_helper_t* sh, mem_v* smems, u64v* hits) { mem->start = raux->l_seq - mem->rc_end; // Adjust start position of LMEM int lmemLen = mem->end - mem->start, rmemLen = -1, next_be_point; if (mem->hitcount > 0 && !mem->skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[mem->hitbeg] - mem->rc_start; int64_t end_ref_pos = hits->a[mem->hitbeg]; // Fetch reference to check for extra matching bps uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m, numMatchingBP = 0; for (m = 1; m <= len; ++m) { if (rseq[mem->rc_start - m] == raux->read_buf[mem->rc_start - m]) { numMatchingBP++; } else { break; } } mem->end += numMatchingBP; mem->end_correction += numMatchingBP; start_ref_pos = hits->a[mem->hitbeg] + lmemLen; end_ref_pos = start_ref_pos + mem->start; // Fetch reference to check for extra matching bps rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); numMatchingBP = 0; for (m = 0; m < len; ++m) { if (rseq[m] == raux->read_buf[mem->rc_end + m]) { numMatchingBP++; } else { break; } } // free(rseq); mem->start -= numMatchingBP; } lmemLen = mem->end - mem->start; next_be_point = mem->end; // skip forward re-traversal for single-git MEMs if (mem->hitcount == 1) { if (lmemLen >= raux->min_seed_len) { kv_push(mem_t, *smems, *mem); } else { next_be_point += (raux->min_seed_len - lmemLen); } } else if (mem->fetch_leaves && (mem->start <= (raux->l_seq - raux->min_seed_len))) { hits->n -= mem->hitcount; mem->hitbeg = hits->n; mem->hitcount = 0; raux->read_buf = raux->unpacked_queue_buf; rightExtend_fetch_leaves_prefix(iaux, raux, mem, hits); raux->read_buf = raux->unpacked_rc_queue_buf; rmemLen = mem->end - mem->start; next_be_point = mem->end; if (mem->hitcount > 0) { int64_t len; int64_t start_ref_pos = hits->a[mem->hitbeg] + rmemLen; int64_t end_ref_pos = hits->a[mem->hitbeg] + raux->l_seq - mem->start; /// Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m; int numMatchingBP = 0; /// Check for matching bases for (m = 0; m < len; ++m) { if (rseq[m] == raux->unpacked_queue_buf[mem->end + m]) { numMatchingBP++; } else { break; } } mem->end += numMatchingBP; rmemLen = mem->end - mem->start; next_be_point = mem->end; if (rmemLen >= raux->min_seed_len) { kv_push(mem_t, *smems, *mem); } else { next_be_point += (raux->min_seed_len - rmemLen); } } else { // we don't have min_seed_len match for this start position assert(rmemLen <= raux->min_seed_len); next_be_point += (raux->min_seed_len - rmemLen); } } else { assert(lmemLen <= raux->min_seed_len); next_be_point += (raux->min_seed_len - lmemLen); } return next_be_point; } /* * Compute final SMEMs and their hits after considering their overlaps * * Leaf expansion is also performed to get the actual length of the MEM. * Note that leaf nodes store compressed suffixes by including a pointer to the reference genome * * @param iaux index related parameters * @param raux read related parameters * @param mem maximal-exact-match storage * @param sh helper data structure to keep track of start and end positions of previously identified MEMs * @param smems list of SMEMs * @param hits list of hits for read */ void check_and_add_smem(index_aux_t* iaux, read_aux_t* raux, mem_t* mem, smem_helper_t* sh, mem_v* smems, u64v* hits) { mem->start = raux->l_seq - mem->rc_end; // Adjust start position of LMEM int lmemLen = mem->end - mem->start; if (mem->hitcount > 0 && !mem->skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[mem->hitbeg] + lmemLen; int64_t end_ref_pos = start_ref_pos + mem->start; // Fetch reference to check for extra matching bps uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m, numMatchingBP = 0; for (m = 0; m < len; ++m) { if (rseq[m] == raux->read_buf[mem->rc_end + m]) { numMatchingBP++; } else { break; } } // Adjust start position of MEM by extra matching bps mem->start -= numMatchingBP; } lmemLen = mem->end - mem->start; if (lmemLen >= raux->min_seed_len) { // Check if MEM lies completely within previously discovered MEM if (mem->start < sh->prevMemStart || mem->end > sh->prevMemEnd) { // Extra work for equivalency with BWA-MEM. if (mem->fetch_leaves) { hits->n -= mem->hitcount; mem->hitbeg = hits->n; mem->hitcount = 0; raux->read_buf = raux->unpacked_queue_buf; rightExtend_fetch_leaves(iaux, raux, mem, hits); raux->read_buf = raux->unpacked_rc_queue_buf; } if (mem->hitcount > 0) { mem->pt.c_pivot = sh->curr_pivot; mem->pt.p_pivot = sh->prev_pivot; mem->pt.pp_pivot = sh->prev_prev_pivot; kv_push(mem_t, *smems, *mem); if (mem->start <= (sh->prev_pivot + 1)) { sh->stop_be = 1; } } sh->prevMemStart = mem->start; sh->prevMemEnd = mem->end; } } } /* * This function replaces bwt_smem1() and uses ERT to generate SMEMs * * @param iaux index related parameters * @param raux read related parameters * @param smems list of SMEMs * @param hits list of hits for read */ void get_seeds_prefix(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, u64v* hits) { smem_helper_t sh; memset(&sh, 0, sizeof(smem_helper_t)); sh.prevMemStart = raux->l_seq; sh.prevMemEnd = 0; int i = 0, j = 0; sh.prev_pivot = -1; sh.prev_prev_pivot = -1; memset(raux->lep, 0, 5 * sizeof(uint64_t)); while (i < raux->l_seq) { // Begin identifying RMEMs mem_t rm; memset(&rm, 0, sizeof(mem_t)); rm.start = i; rm.forward = 1; rm.hitbeg = hits->n; sh.curr_pivot = rm.start; raux->read_buf = raux->unpacked_queue_buf; rightExtend(iaux, raux, &i, &rm, hits); //!< Compute LEP. // Lazy expansion of leaf nodes. if (rm.hitcount > 0 && !rm.skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start; int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start; // Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m; int numMatchingBP = 0; // Check for matching bases for (m = 0; m < len; ++m) { if (rseq[m] == raux->unpacked_queue_buf[i+m]) { numMatchingBP++; } else { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); break; } } // Last base of RMEM must have LEP bit set if (m == len) { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); } i += numMatchingBP; } rm.end = i; int rmemLen = rm.end - rm.start; // No left-extension for position 0 in read // rm.start is the current pivot if (rm.start == 0) { if (rmemLen >= raux->min_seed_len) { if (rm.hitcount > 0) { kv_push(mem_t, *smems, rm); } } else { hits->n -= rm.hitcount; } memset(raux->lep, 0, 5 * sizeof(uint64_t)); } else { // perform all backward extensions hits->n -= rm.hitcount; uint64_t* lep = raux->lep; int seq_len = raux->l_seq; int min_seed_len = raux->min_seed_len; sh.stop_be = 0; int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1); int max_j = rm.end - 1; j = min_j; sh.prev_pivot = rm.start; while (j <= max_j) { mem_t m; int be_point; int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len); m.hitbeg = hits->n; int next_j = j + 1; if (mem_valid) { be_point = j + 1; if (be_point >= min_seed_len) { int rc_i = seq_len - be_point; raux->read_buf = raux->unpacked_rc_queue_buf; leftExtend(iaux, raux, &rc_i, &m, hits); next_j = check_and_add_smem_prefix(iaux, raux, &m, &sh, smems, hits); } } j = next_j; if (m.end > i) { i = m.end; } } } raux->read_buf = raux->unpacked_queue_buf; // Skip all ambiguous bases while (i < raux->l_seq) { if (raux->read_buf[i] == 4) { ++i; } else { break; } } // Check if there other ambiguous bases within min_seed_len bases of the start of the MEM while ((i < raux->l_seq) && (i - rm.start) < raux->min_seed_len) { if (raux->read_buf[i] == 4) { ++i; break; } ++i; } sh.prev_prev_pivot = sh.prev_pivot; sh.prev_pivot = rm.start; memset(raux->lep, 0, 5 * sizeof(uint64_t)); // break; // zzh } #ifdef PRINT_SMEM ks_introsort(mem_smem_sort_lt_ert, smems->n, smems->a); // Sort SMEMs based on start pos in read. For DEBUG. for (i = 0; i < smems->n; ++i) { // printf("[SMEM]:%d,%d\n", smems->a[i].start, smems->a[i].end); int idx; for (idx = 0; idx < smems->a[i].hitcount; ++idx) { if (smems->a[i].forward || smems->a[i].fetch_leaves) { printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]); } else { printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start - smems->a[i].end_correction)); } } } #endif } /* * This function replaces bwt_smem1() and uses ERT to generate SMEMs * * @param iaux index related parameters * @param raux read related parameters * @param smems list of SMEMs * @param hits list of hits for read */ void get_seeds(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, u64v* hits) { smem_helper_t sh; memset(&sh, 0, sizeof(smem_helper_t)); sh.prevMemStart = raux->l_seq; sh.prevMemEnd = 0; int i = 0, j = 0; sh.prev_pivot = -1; sh.prev_prev_pivot = -1; memset(raux->lep, 0, 5 * sizeof(uint64_t)); while (i < raux->l_seq) { // Begin identifying RMEMs mem_t rm; memset(&rm, 0, sizeof(mem_t)); rm.start = i; rm.forward = 1; rm.hitbeg = hits->n; sh.curr_pivot = rm.start; raux->read_buf = raux->unpacked_queue_buf; rightExtend(iaux, raux, &i, &rm, hits); // Compute LEP. // Lazy expansion of leaf nodes. if (rm.hitcount > 0 && !rm.skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start; int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start; // Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m; int numMatchingBP = 0; // Check for matching bases for (m = 0; m < len; ++m) { if (rseq[m] == raux->unpacked_queue_buf[i+m]) { numMatchingBP++; } else { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); break; } } // Last base of RMEM must have LEP bit set if (m == len) { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); } i += numMatchingBP; } rm.end = i; int rmemLen = rm.end - rm.start; // No left-extension for position 0 in read // rm.start is the current pivot if (rm.start == 0) { if (rmemLen >= raux->min_seed_len) { if (rm.hitcount > 0) { rm.pt.c_pivot = sh.curr_pivot; rm.pt.p_pivot = sh.prev_pivot; rm.pt.pp_pivot = sh.prev_prev_pivot; kv_push(mem_t, *smems, rm); } } else { hits->n -= rm.hitcount; } memset(raux->lep, 0, 5 * sizeof(uint64_t)); } else { hits->n -= rm.hitcount; uint64_t* lep = raux->lep; int seq_len = raux->l_seq; int min_seed_len = raux->min_seed_len; j = rm.end-1; sh.stop_be = 0; int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1); while (j >= min_j) { mem_t m; int be_point; int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len); m.hitbeg = hits->n; if (mem_valid) { be_point = j + 1; if (be_point >= min_seed_len) { int rc_i = seq_len - be_point; raux->read_buf = raux->unpacked_rc_queue_buf; leftExtend(iaux, raux, &rc_i, &m, hits); check_and_add_smem(iaux, raux, &m, &sh, smems, hits); if (sh.stop_be) break; } } j -= 1; } } raux->read_buf = raux->unpacked_queue_buf; // Skip all ambiguous bases while (i < raux->l_seq) { if (raux->read_buf[i] == 4) { ++i; } else { break; } } // Check if there other ambiguous bases within min_seed_len bases of the start of the MEM while ((i < raux->l_seq) && (i - rm.start) < raux->min_seed_len) { if (raux->read_buf[i] == 4) { ++i; break; } ++i; } sh.prev_prev_pivot = sh.prev_pivot; sh.prev_pivot = rm.start; memset(raux->lep, 0, 5 * sizeof(uint64_t)); // break; // zzh } #ifdef PRINT_SMEM ks_introsort(mem_smem_sort_lt_ert, smems->n, smems->a); // Sort SMEMs based on start pos in read. For DEBUG. for (i = 0; i < smems->n; ++i) { // printf("[SMEM]:%d,%d\n", smems->a[i].start, smems->a[i].end); int idx; for (idx = 0; idx < smems->a[i].hitcount; ++idx) { if (smems->a[i].forward || smems->a[i].fetch_leaves) { printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]); } else { printf("[SMEM]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start)); } } } #endif } /* * This function performs reseeding of SMEMs * * @param iaux index related parameters * @param raux read related parameters * @param smems list of SMEMs * @param start pivot position in read * @param limit hit threshold below which tree traversal must stop * @param pt track pivot information to reduce work done during reseeding * @param hits list of hits for every read */ void reseed_prefix(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, int start, int limit, pivot_t* pt, u64v* hits) { smem_helper_t sh; memset(&sh, 0, sizeof(smem_helper_t)); sh.prevMemStart = raux->l_seq; sh.prevMemEnd = 0; int i = start, j = 0; #ifdef PRINT_SMEM int old_n = smems->n; #endif memset(raux->lep, 0, 5 * sizeof(uint64_t)); mem_t rm; memset(&rm, 0, sizeof(mem_t)); rm.start = i; rm.forward = 1; rm.hitbeg = hits->n; sh.prev_pivot = (rm.start >= pt->c_pivot) ? pt->p_pivot : pt->pp_pivot; raux->read_buf = raux->unpacked_queue_buf; raux->limit = limit; rightExtend_wlimit(iaux, raux, &i, &rm, hits); // Compute LEP. // Lazy expansion of leaf nodes. if (rm.hitcount > 0 && !rm.skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start; int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start; // Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m; int numMatchingBP = 0; // Check for matching bases for (m = 0; m < len; ++m) { if (rseq[m] == raux->unpacked_queue_buf[i+m]) { numMatchingBP++; } else { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); break; } } // Last base of RMEM must have LEP bit set if (m == len) { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); } i += numMatchingBP; } rm.end = i; int rmemLen = rm.end - rm.start; if (rm.start == 0) { if (rmemLen >= raux->min_seed_len) { if (rm.hitcount > 0) { kv_push(mem_t, *smems, rm); } } else { hits->n -= rm.hitcount; } memset(raux->lep, 0, 5 * sizeof(uint64_t)); } // Begin left-extension, i.e., right extension on reverse complemented read else { hits->n -= rm.hitcount; uint64_t* lep = raux->lep; int seq_len = raux->l_seq; int min_seed_len = raux->min_seed_len; sh.stop_be = 0; int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1); int max_j = rm.end - 1; j = min_j; sh.prev_pivot = rm.start; sh.mem_end_limit = rm.end; while (j <= max_j) { mem_t m; int be_point; int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len); m.hitbeg = hits->n; int next_j = j + 1; if (mem_valid) { be_point = j + 1; if (be_point >= min_seed_len) { int rc_i = seq_len - be_point; raux->read_buf = raux->unpacked_rc_queue_buf; leftExtend_wlimit(iaux, raux, &rc_i, &m, hits); next_j = check_and_add_smem_prefix_reseed(iaux, raux, &m, &sh, smems, hits); } } j = next_j; } } #ifdef PRINT_SMEM ks_introsort(mem_smem_sort_lt_ert, smems->n - old_n, &smems->a[old_n]); // Debug: Sort SMEMs based on start pos in read. for (i = old_n; i < smems->n; ++i) { int idx; for (idx = 0; idx < smems->a[i].hitcount; ++idx) { if (smems->a[i].forward || smems->a[i].fetch_leaves) { printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]); } else { printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start - smems->a[i].end_correction)); } } } #endif } /* * This function performs reseeding of SMEMs * * @param iaux index related parameters * @param raux read related parameters * @param smems list of SMEMs and their hits * @param start pivot position in read * @param limit hit threshold below which tree traversal must stop * @param pt track pivot information to reduce work done during reseeding * @param hits list of hits for read */ void reseed(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, int start, int limit, pivot_t* pt, u64v* hits) { smem_helper_t sh; memset(&sh, 0, sizeof(smem_helper_t)); sh.prevMemStart = raux->l_seq; sh.prevMemEnd = 0; int i = start, j = 0; #ifdef PRINT_SMEM int old_n = smems->n; #endif memset(raux->lep, 0, 5 * sizeof(uint64_t)); mem_t rm; memset(&rm, 0, sizeof(mem_t)); rm.start = i; rm.forward = 1; rm.hitbeg = hits->n; sh.prev_pivot = (rm.start >= pt->c_pivot) ? pt->p_pivot : pt->pp_pivot; raux->read_buf = raux->unpacked_queue_buf; raux->limit = limit; rightExtend_wlimit(iaux, raux, &i, &rm, hits); //!< Compute LEP. // Lazy expansion of leaf nodes. if (rm.hitcount > 0 && !rm.skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start; int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start; // Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m; int numMatchingBP = 0; // Check for matching bases for (m = 0; m < len; ++m) { if (rseq[m] == raux->unpacked_queue_buf[i+m]) { numMatchingBP++; } else { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); break; } } // Last base of RMEM must have LEP bit set if (m == len) { raux->lep[(i+m-1) >> 6] |= (1ULL << ((i+m-1) & (0x3FULL))); } i += numMatchingBP; } rm.end = i; int rmemLen = rm.end - rm.start; if (rm.start == 0) { if (rmemLen >= raux->min_seed_len) { if (rm.hitcount > 0) { kv_push(mem_t, *smems, rm); } } else { hits->n -= rm.hitcount; } memset(raux->lep, 0, 5 * sizeof(uint64_t)); } // Begin left-extension, i.e., right extension on reverse complemented read else { hits->n -= rm.hitcount; uint64_t* lep = raux->lep; int seq_len = raux->l_seq; int min_seed_len = raux->min_seed_len; j = rm.end-1; sh.stop_be = 0; int min_j = (rm.start > min_seed_len) ? (rm.start-1) : (min_seed_len-1); while (j >= min_j) { mem_t m; int be_point; int mem_valid = init_mem(lep, &m, j, seq_len, min_seed_len); m.hitbeg = hits->n; if (mem_valid) { be_point = j + 1; if (be_point >= min_seed_len) { int rc_i = seq_len - be_point; raux->read_buf = raux->unpacked_rc_queue_buf; leftExtend_wlimit(iaux, raux, &rc_i, &m, hits); check_and_add_smem(iaux, raux, &m, &sh, smems, hits); if (sh.stop_be) break; } } j -= 1; } } #ifdef PRINT_SMEM ks_introsort(mem_smem_sort_lt_ert, smems->n - old_n, &smems->a[old_n]); // Debug: Sort SMEMs based on start pos in read. for (i = old_n; i < smems->n; ++i) { int idx; for (idx = 0; idx < smems->a[i].hitcount; ++idx) { if (smems->a[i].forward || smems->a[i].fetch_leaves) { printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]); } else { printf("[Reseed]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, (iaux->bns->l_pac << 1) - hits->a[smems->a[i].hitbeg + idx] - (smems->a[i].end - smems->a[i].start)); } } } #endif } /* * This function performs the LAST heuristic * * @param iaux index related parameters * @param raux read related parameters * @param smems list of SMEMs * @param limit hit threshold above which tree traversal must stop * @param hits list of hits for read */ void last(index_aux_t* iaux, read_aux_t* raux, mem_v* smems, int limit, u64v* hits) { int i = 0; #ifdef PRINT_SMEM int old_n = smems->n; #endif const uint8_t minSeedLen = raux->min_seed_len + 1; // LAST exits seeding only when seed length >= 20 raux->limit = limit; while (i < raux->l_seq) { // Begin identifying RMEMs mem_t rm; rm.start = i; rm.end = 0; rm.forward = 1; rm.skip_ref_fetch = 0; rm.hitbeg = hits->n; rm.hitcount = 0; raux->read_buf = raux->unpacked_queue_buf; rightExtend_last(iaux, raux, &i, &rm, hits); // Lazy expansion of leaf nodes. if (rm.hitcount > 0 && !rm.skip_ref_fetch) { int64_t len; int64_t start_ref_pos = hits->a[rm.hitbeg] + i - rm.start; int64_t end_ref_pos = hits->a[rm.hitbeg] + raux->l_seq - rm.start; // Fetch reference uint8_t* rseq = get_seq(iaux->bns->l_pac, iaux->pac, start_ref_pos, end_ref_pos, &len, iaux->ref_string, 0); int m, numMatchingBP = 0; // Check for matching bases for (m = 0; m < len; ++m) { int seedLen = (i + m) - rm.start; int match_next_bp = ((seedLen < minSeedLen) || (rm.hitcount >= raux->limit)) ? 1 : 0; if (match_next_bp) { if ((rseq[m] == raux->unpacked_queue_buf[i+m])) { numMatchingBP++; } else { ++i; // Increment i on every mismatch for LAST to match BWA-MEM hits->n -= rm.hitcount; rm.hitcount = 0; break; } } else { break; } } i += numMatchingBP; } rm.end = i; int rmemLen = rm.end - rm.start; if (rmemLen >= minSeedLen) { if (rm.hitcount > 0 && rm.hitcount < raux->limit) { kv_push(mem_t, *smems, rm); } else { hits->n -= rm.hitcount; } } else { hits->n -= rm.hitcount; } int foundN = 0; // Skip all ambiguous bases assert(i > 0); if (raux->read_buf[i-1] == 4) { // Found an 'N' during lookup foundN = 1; } if (!foundN) { // Check if there other ambiguous bases within min_seed_len bases of the start of the MEM while ((i < raux->l_seq) && (i - rm.start) < minSeedLen) { if (raux->read_buf[i] == 4) { ++i; break; } ++i; } } } #ifdef PRINT_SMEM ks_introsort(mem_smem_sort_lt_ert, smems->n - old_n, &smems->a[old_n]); // Debug: Sort SMEMs based on start pos in read. for (i = old_n; i < smems->n; ++i) { int idx; for (idx = 0; idx < smems->a[i].hitcount; ++idx) { printf("[LAST]:%d,%d,%lu\n", smems->a[i].start, smems->a[i].end, hits->a[smems->a[i].hitbeg + idx]); } } #endif }