first attempt to make bwt_gen work for >4GB seq
This commit is contained in:
parent
3114edcb7c
commit
95b1ab7e96
|
|
@ -36,59 +36,29 @@
|
|||
#include "QSufSort.h"
|
||||
|
||||
// Static functions
|
||||
static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
|
||||
const int highestPos, const int numSortedChar);
|
||||
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
|
||||
const int highestPos, const int numSortedChar);
|
||||
static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
|
||||
const int highestPos, const int numSortedChar);
|
||||
static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize);
|
||||
static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
|
||||
const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated);
|
||||
|
||||
// from MiscUtilities.c
|
||||
static unsigned int leadingZero(const unsigned int input) {
|
||||
|
||||
unsigned int l;
|
||||
const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
if (input & 0xFFFF0000) {
|
||||
if (input & 0xFF000000) {
|
||||
l = leadingZero8bit[input >> 24];
|
||||
} else {
|
||||
l = 8 + leadingZero8bit[input >> 16];
|
||||
}
|
||||
} else {
|
||||
if (input & 0x0000FF00) {
|
||||
l = 16 + leadingZero8bit[input >> 8];
|
||||
} else {
|
||||
l = 24 + leadingZero8bit[input];
|
||||
}
|
||||
}
|
||||
return l;
|
||||
|
||||
}
|
||||
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar);
|
||||
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar);
|
||||
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar);
|
||||
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
|
||||
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
|
||||
|
||||
/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
|
||||
n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
|
||||
contents of x[n] is disregarded, the n-th symbol being regarded as
|
||||
end-of-string smaller than all other symbols.*/
|
||||
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
|
||||
const int smallestInputSymbol, const int skipTransform) {
|
||||
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const int skipTransform) {
|
||||
|
||||
int i, j;
|
||||
int s, negatedSortedGroupLength;
|
||||
int numSymbolAggregated;
|
||||
int maxNumInputSymbol;
|
||||
int numSortedPos = 1;
|
||||
int newAlphabetSize;
|
||||
qsint_t i, j;
|
||||
qsint_t s, negatedSortedGroupLength;
|
||||
qsint_t numSymbolAggregated;
|
||||
qsint_t maxNumInputSymbol;
|
||||
qsint_t numSortedPos = 1;
|
||||
qsint_t newAlphabetSize;
|
||||
|
||||
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
||||
|
||||
|
|
@ -102,7 +72,7 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar,
|
|||
numSortedPos = numSymbolAggregated;
|
||||
}
|
||||
|
||||
while ((int)(I[0]) >= -(int)numChar) {
|
||||
while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
|
||||
i = 0;
|
||||
negatedSortedGroupLength = 0;
|
||||
do {
|
||||
|
|
@ -129,9 +99,9 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar,
|
|||
|
||||
}
|
||||
|
||||
void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) {
|
||||
void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) {
|
||||
|
||||
int i;
|
||||
qsint_t i;
|
||||
for (i=0; i<=numChar; i++) {
|
||||
I[V[i]] = i + 1;
|
||||
}
|
||||
|
|
@ -143,21 +113,14 @@ void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int nu
|
|||
quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
|
||||
Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
|
||||
function is based on Program 7.*/
|
||||
static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
|
||||
const int highestPos, const int numSortedChar) {
|
||||
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar) {
|
||||
|
||||
int a, b, c, d;
|
||||
int l, m;
|
||||
int f, v, s, t;
|
||||
int tmp;
|
||||
int numItem;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (lowestPos > highestPos) {
|
||||
fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
qsint_t a, b, c, d;
|
||||
qsint_t l, m;
|
||||
qsint_t f, v, s, t;
|
||||
qsint_t tmp;
|
||||
qsint_t numItem;
|
||||
|
||||
numItem = highestPos - lowestPos + 1;
|
||||
|
||||
|
|
@ -171,7 +134,7 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo
|
|||
a = b = lowestPos;
|
||||
c = d = highestPos;
|
||||
|
||||
while (TRUE) {
|
||||
while (1) {
|
||||
while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
|
||||
if (f == v) {
|
||||
swap(I[a], I[b], tmp);
|
||||
|
|
@ -235,31 +198,17 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo
|
|||
}
|
||||
|
||||
/* Algorithm by Bentley & McIlroy.*/
|
||||
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
|
||||
const int highestPos, const int numSortedChar) {
|
||||
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar) {
|
||||
|
||||
int m;
|
||||
int keyl, keym, keyn;
|
||||
int key1, key2, key3;
|
||||
int s;
|
||||
int numItem;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (lowestPos > highestPos) {
|
||||
fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
qsint_t m;
|
||||
qsint_t keyl, keym, keyn;
|
||||
qsint_t key1, key2, key3;
|
||||
qsint_t s;
|
||||
qsint_t numItem;
|
||||
|
||||
numItem = highestPos - lowestPos + 1;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (numItem <= INSERT_SORT_NUM_ITEM) {
|
||||
fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
m = lowestPos + numItem / 2;
|
||||
|
||||
s = numItem / 8;
|
||||
|
|
@ -282,39 +231,19 @@ static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int l
|
|||
}
|
||||
|
||||
/* Quadratic sorting method to use for small subarrays. */
|
||||
static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
|
||||
const int highestPos, const int numSortedChar) {
|
||||
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar) {
|
||||
|
||||
int i, j;
|
||||
int tmpKey, tmpPos;
|
||||
int numItem;
|
||||
int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
|
||||
int negativeSortedLength;
|
||||
int groupNum;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (lowestPos > highestPos) {
|
||||
fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
qsint_t i, j;
|
||||
qsint_t tmpKey, tmpPos;
|
||||
qsint_t numItem;
|
||||
qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
|
||||
qsint_t negativeSortedLength;
|
||||
qsint_t groupNum;
|
||||
|
||||
numItem = highestPos - lowestPos + 1;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (numItem > INSERT_SORT_NUM_ITEM) {
|
||||
fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i=0; i<numItem; i++) {
|
||||
#ifdef DEBUG
|
||||
if (I[lowestPos + i] < 0) {
|
||||
fprintf(stderr, "QSufSortInsertSortSplit(): I < 0 in unsorted region!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
pos[i] = I[lowestPos + i];
|
||||
key[i] = V[pos[i] + numSortedChar];
|
||||
}
|
||||
|
|
@ -366,12 +295,12 @@ static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const
|
|||
Output: x is V and p is I after the initial sorting stage of the refined
|
||||
suffix sorting algorithm.*/
|
||||
|
||||
static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize) {
|
||||
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) {
|
||||
|
||||
int i, c;
|
||||
int d;
|
||||
int groupNum;
|
||||
int currentIndex;
|
||||
qsint_t i, c;
|
||||
qsint_t d;
|
||||
qsint_t groupNum;
|
||||
qsint_t currentIndex;
|
||||
|
||||
// mark linked list empty
|
||||
for (i=0; i<alphabetSize; i++) {
|
||||
|
|
@ -381,14 +310,14 @@ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int n
|
|||
// insert to linked list
|
||||
for (i=0; i<=numChar; i++) {
|
||||
c = V[i];
|
||||
V[i] = (int)(I[c]);
|
||||
V[i] = (qsint_t)(I[c]);
|
||||
I[c] = i;
|
||||
}
|
||||
|
||||
currentIndex = numChar;
|
||||
for (i=alphabetSize; i>0; i--) {
|
||||
c = I[i-1];
|
||||
d = (int)(V[c]);
|
||||
d = (qsint_t)(V[c]);
|
||||
groupNum = currentIndex;
|
||||
V[c] = groupNum;
|
||||
if (d >= 0) {
|
||||
|
|
@ -424,20 +353,20 @@ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int n
|
|||
Output: Returns an integer j in the range 1...q representing the size of the
|
||||
new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
|
||||
set to the number of old symbols grouped into one. Only x[n] is 0.*/
|
||||
static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
|
||||
const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) {
|
||||
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) {
|
||||
|
||||
int c, i, j;
|
||||
int a; // numSymbolAggregated
|
||||
int mask;
|
||||
int minSymbolInChunk = 0, maxSymbolInChunk = 0;
|
||||
int newAlphabetSize;
|
||||
int maxNumInputSymbol, maxNumBit, maxSymbol;
|
||||
qsint_t c, i, j;
|
||||
qsint_t a; // numSymbolAggregated
|
||||
qsint_t mask;
|
||||
qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
|
||||
qsint_t newAlphabetSize;
|
||||
qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;
|
||||
|
||||
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
||||
|
||||
maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol);
|
||||
maxSymbol = INT_MAX >> maxNumBit;
|
||||
for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
|
||||
maxSymbol = QSINT_MAX >> maxNumBit;
|
||||
|
||||
c = maxNumInputSymbol;
|
||||
for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
|
||||
|
|
@ -449,14 +378,6 @@ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int num
|
|||
mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
|
||||
V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
|
||||
|
||||
#ifdef DEBUG
|
||||
// Section of code for maxSymbolInChunk > numChar removed!
|
||||
if (maxSymbolInChunk > numChar) {
|
||||
fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* bucketing possible, compact alphabet.*/
|
||||
for (i=0; i<=maxSymbolInChunk; i++) {
|
||||
I[i] = 0; /* zero transformation table.*/
|
||||
|
|
|
|||
|
|
@ -29,12 +29,17 @@
|
|||
#ifndef __QSUFSORT_H__
|
||||
#define __QSUFSORT_H__
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define KEY(V, I, p, h) ( V[ I[p] + h ] )
|
||||
#define INSERT_SORT_NUM_ITEM 16
|
||||
|
||||
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
|
||||
const int smallestInputSymbol, const int skipTransform);
|
||||
void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar);
|
||||
typedef int64_t qsint_t;
|
||||
#define QSINT_MAX INT64_MAX
|
||||
|
||||
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const int skipTransform);
|
||||
void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar);
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -25,22 +25,27 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "bwt_gen.h"
|
||||
#include "QSufSort.h"
|
||||
|
||||
static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
|
||||
#define MIN_AVAILABLE_WORD 0x10000
|
||||
|
||||
static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar,
|
||||
unsigned int lastByteLength)
|
||||
{
|
||||
if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
|
||||
fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
|
||||
exit(1);
|
||||
}
|
||||
return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
|
||||
}
|
||||
|
||||
static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
|
||||
static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue)
|
||||
{
|
||||
unsigned int i;
|
||||
bgint_t i;
|
||||
for (i=0; i<length; i++) startAddr[i] = initValue;
|
||||
}
|
||||
|
||||
static void initializeVAL_bg(bgint_t *startAddr, const bgint_t length, const bgint_t initValue)
|
||||
{
|
||||
bgint_t i;
|
||||
for (i=0; i<length; i++) startAddr[i] = initValue;
|
||||
}
|
||||
|
||||
|
|
@ -60,25 +65,25 @@ static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
|
|||
|
||||
}
|
||||
// for BWTIncCreate()
|
||||
static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
|
||||
static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar)
|
||||
{
|
||||
unsigned int numOfOccValue;
|
||||
unsigned int numOfOccIntervalPerMajor;
|
||||
bgint_t numOfOccValue;
|
||||
unsigned numOfOccIntervalPerMajor;
|
||||
numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
||||
numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
|
||||
return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
|
||||
}
|
||||
// for BWTIncCreate()
|
||||
static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
|
||||
static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar)
|
||||
{
|
||||
unsigned int numOfOccValue;
|
||||
bgint_t numOfOccValue;
|
||||
numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
||||
return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
|
||||
}
|
||||
// for BWTIncCreate()
|
||||
static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
|
||||
static bgint_t BWTResidentSizeInWord(const bgint_t numChar) {
|
||||
|
||||
unsigned int numCharRoundUpToOccInterval;
|
||||
bgint_t numCharRoundUpToOccInterval;
|
||||
|
||||
// The $ in BWT at the position of inverseSa0 is not encoded
|
||||
numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
|
||||
|
|
@ -89,13 +94,13 @@ static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
|
|||
|
||||
static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
|
||||
{
|
||||
unsigned int maxBuildSize;
|
||||
bgint_t maxBuildSize;
|
||||
|
||||
if (bwtInc->bwt->textLength == 0) {
|
||||
// initial build
|
||||
// Minus 2 because n+1 entries of seq and rank needed for n char
|
||||
maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
|
||||
/ (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
|
||||
maxBuildSize = (bwtInc->availableWord - 2 * (sizeof(bgint_t) / 4) - OCC_INTERVAL / CHAR_PER_WORD)
|
||||
/ (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4);
|
||||
if (bwtInc->initialMaxBuildSize > 0) {
|
||||
bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
|
||||
} else {
|
||||
|
|
@ -104,9 +109,9 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
|
|||
} else {
|
||||
// Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
|
||||
// Minus numberOfIterationDone because bwt slightly shift to left in each iteration
|
||||
maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
|
||||
maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 * (sizeof(bgint_t) / 4)
|
||||
- bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR)
|
||||
/ 3;
|
||||
/ 3 / (sizeof(bgint_t) / 4);
|
||||
if (maxBuildSize < CHAR_PER_WORD) {
|
||||
fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
|
||||
exit(1);
|
||||
|
|
@ -116,9 +121,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
|
|||
} else {
|
||||
bwtInc->buildSize = maxBuildSize;
|
||||
}
|
||||
if (bwtInc->buildSize < CHAR_PER_WORD) {
|
||||
if (bwtInc->buildSize < CHAR_PER_WORD)
|
||||
bwtInc->buildSize = CHAR_PER_WORD;
|
||||
}
|
||||
}
|
||||
|
||||
if (bwtInc->buildSize < CHAR_PER_WORD) {
|
||||
|
|
@ -128,9 +132,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
|
|||
|
||||
bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
|
||||
|
||||
bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
|
||||
bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);
|
||||
|
||||
bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4);
|
||||
bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4));
|
||||
}
|
||||
|
||||
// for ceilLog2()
|
||||
|
|
@ -186,17 +189,17 @@ static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
|
|||
}
|
||||
|
||||
static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
|
||||
const unsigned int textLength)
|
||||
const bgint_t textLength)
|
||||
{
|
||||
unsigned int i, j, k;
|
||||
unsigned int c;
|
||||
bgint_t i;
|
||||
unsigned int j, k, c;
|
||||
unsigned int bitPerBytePackedChar;
|
||||
unsigned int bitPerWordPackedChar;
|
||||
unsigned int charPerWord;
|
||||
unsigned int charPerByte;
|
||||
unsigned int bytePerIteration;
|
||||
unsigned int byteProcessed = 0;
|
||||
unsigned int wordProcessed = 0;
|
||||
bgint_t byteProcessed = 0;
|
||||
bgint_t wordProcessed = 0;
|
||||
unsigned int mask, shift;
|
||||
|
||||
unsigned int buffer[BITS_IN_WORD];
|
||||
|
|
@ -250,7 +253,7 @@ static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned i
|
|||
output[wordProcessed] = c;
|
||||
}
|
||||
|
||||
BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
|
||||
BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable)
|
||||
{
|
||||
BWT *bwt;
|
||||
|
||||
|
|
@ -258,8 +261,8 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
|
|||
|
||||
bwt->textLength = 0;
|
||||
|
||||
bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
|
||||
initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
|
||||
bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t));
|
||||
initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
|
||||
|
||||
bwt->bwtSizeInWord = 0;
|
||||
|
||||
|
|
@ -272,7 +275,7 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
|
|||
}
|
||||
|
||||
bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
|
||||
bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));
|
||||
bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t));
|
||||
|
||||
bwt->occSizeInWord = 0;
|
||||
bwt->occValue = NULL;
|
||||
|
|
@ -297,8 +300,8 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
|
|||
bwtInc->initialMaxBuildSize = initialMaxBuildSize;
|
||||
bwtInc->incMaxBuildSize = incMaxBuildSize;
|
||||
bwtInc->targetNBit = targetNBit;
|
||||
bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
|
||||
initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
||||
bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t));
|
||||
initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
||||
|
||||
// Build frequently accessed data
|
||||
bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
|
||||
|
|
@ -306,7 +309,8 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
|
|||
bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
|
||||
}
|
||||
|
||||
bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
|
||||
bwtInc->availableWord = (bgint_t)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
|
||||
if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD;
|
||||
if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
|
||||
fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
|
||||
exit(1);
|
||||
|
|
@ -317,14 +321,16 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
|
|||
|
||||
}
|
||||
// for BWTIncConstruct()
|
||||
static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
|
||||
unsigned int* __restrict cumulativeCount, const unsigned int numChar)
|
||||
static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank,
|
||||
bgint_t* __restrict cumulativeCount, const bgint_t numChar)
|
||||
{
|
||||
unsigned int i, j;
|
||||
bgint_t i;
|
||||
unsigned int j;
|
||||
unsigned int c, t;
|
||||
unsigned int packedMask;
|
||||
unsigned int rankIndex;
|
||||
unsigned int lastWord, numCharInLastWord;
|
||||
bgint_t rankIndex;
|
||||
bgint_t lastWord;
|
||||
unsigned int numCharInLastWord;
|
||||
|
||||
lastWord = (numChar - 1) / CHAR_PER_WORD;
|
||||
numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
|
||||
|
|
@ -359,17 +365,17 @@ static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned i
|
|||
}
|
||||
|
||||
|
||||
static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index,
|
||||
unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable)
|
||||
static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index,
|
||||
bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable)
|
||||
{
|
||||
static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
|
||||
0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
|
||||
0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
|
||||
0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
|
||||
|
||||
unsigned int iteration, wordToCount, charToCount;
|
||||
unsigned int i, j, c;
|
||||
unsigned int sum;
|
||||
bgint_t iteration, i;
|
||||
unsigned int wordToCount, charToCount;
|
||||
unsigned int j, c, sum;
|
||||
|
||||
occCount[0] = 0;
|
||||
occCount[1] = 0;
|
||||
|
|
@ -432,13 +438,14 @@ static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigne
|
|||
occCount[3] += sum;
|
||||
}
|
||||
|
||||
static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
|
||||
const unsigned int *cumulativeCount, const unsigned int *packedShift) {
|
||||
static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar,
|
||||
const bgint_t *cumulativeCount, const unsigned int *packedShift) {
|
||||
|
||||
unsigned int i, c, r;
|
||||
unsigned int previousRank, currentRank;
|
||||
unsigned int wordIndex, charIndex;
|
||||
unsigned int inverseSa0;
|
||||
bgint_t i, r;
|
||||
unsigned int c;
|
||||
bgint_t previousRank, currentRank;
|
||||
bgint_t wordIndex, charIndex;
|
||||
bgint_t inverseSa0;
|
||||
|
||||
inverseSa0 = previousRank = relativeRank[0];
|
||||
|
||||
|
|
@ -463,10 +470,10 @@ static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int*
|
|||
}
|
||||
}
|
||||
|
||||
static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
|
||||
static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit,
|
||||
const unsigned int character)
|
||||
{
|
||||
unsigned int occIndexMajor;
|
||||
bgint_t occIndexMajor;
|
||||
|
||||
occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
|
||||
|
||||
|
|
@ -546,46 +553,43 @@ static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned
|
|||
|
||||
}
|
||||
|
||||
unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {
|
||||
|
||||
unsigned int occValue;
|
||||
unsigned int occExplicitIndex, occIndex;
|
||||
bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character)
|
||||
{
|
||||
bgint_t occValue;
|
||||
bgint_t occExplicitIndex, occIndex;
|
||||
|
||||
// $ is supposed to be positioned at inverseSa0 but it is not encoded
|
||||
// therefore index is subtracted by 1 for adjustment
|
||||
if (index > bwt->inverseSa0) {
|
||||
if (index > bwt->inverseSa0)
|
||||
index--;
|
||||
}
|
||||
|
||||
occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding
|
||||
occIndex = occExplicitIndex * OCC_INTERVAL;
|
||||
occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
|
||||
|
||||
if (occIndex == index) {
|
||||
if (occIndex == index)
|
||||
return occValue;
|
||||
}
|
||||
|
||||
if (occIndex < index) {
|
||||
return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
|
||||
} else {
|
||||
return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
|
||||
const unsigned int *packedText, const unsigned int numChar,
|
||||
const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
|
||||
static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq,
|
||||
const unsigned int *packedText, const bgint_t numChar,
|
||||
const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration)
|
||||
{
|
||||
unsigned int saIndex;
|
||||
unsigned int lastWord;
|
||||
bgint_t saIndex;
|
||||
bgint_t lastWord;
|
||||
unsigned int packedMask;
|
||||
unsigned int i, j;
|
||||
unsigned int c, t;
|
||||
unsigned int rankIndex;
|
||||
bgint_t i;
|
||||
unsigned int c, t, j;
|
||||
bgint_t rankIndex;
|
||||
unsigned int shift;
|
||||
unsigned int seqIndexFromStart[ALPHABET_SIZE];
|
||||
unsigned int seqIndexFromEnd[ALPHABET_SIZE];
|
||||
bgint_t seqIndexFromStart[ALPHABET_SIZE];
|
||||
bgint_t seqIndexFromEnd[ALPHABET_SIZE];
|
||||
|
||||
for (i=0; i<ALPHABET_SIZE; i++) {
|
||||
seqIndexFromStart[i] = cumulativeCount[i];
|
||||
|
|
@ -625,16 +629,16 @@ static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict abs
|
|||
return seqIndexFromStart[firstCharInLastIteration];
|
||||
}
|
||||
|
||||
static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
|
||||
static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem)
|
||||
{
|
||||
#define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
|
||||
|
||||
int lowIndex, highIndex, midIndex;
|
||||
int lowPartitionIndex, highPartitionIndex;
|
||||
int lowStack[32], highStack[32];
|
||||
int64_t lowIndex, highIndex, midIndex;
|
||||
int64_t lowPartitionIndex, highPartitionIndex;
|
||||
int64_t lowStack[32], highStack[32];
|
||||
int stackDepth;
|
||||
int i, j;
|
||||
unsigned int tempSeq, tempKey;
|
||||
int64_t i, j;
|
||||
bgint_t tempSeq, tempKey;
|
||||
int numberOfEqualKey;
|
||||
|
||||
if (numItem < 2) return;
|
||||
|
|
@ -803,15 +807,15 @@ static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict
|
|||
}
|
||||
|
||||
|
||||
static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
|
||||
unsigned int* __restrict relativeRank, const unsigned int numItem,
|
||||
unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
|
||||
static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq,
|
||||
bgint_t* __restrict relativeRank, const bgint_t numItem,
|
||||
bgint_t oldInverseSa0, const bgint_t *cumulativeCount)
|
||||
{
|
||||
unsigned int i, c;
|
||||
unsigned int s, r;
|
||||
unsigned int lastRank, lastIndex;
|
||||
unsigned int oldInverseSa0RelativeRank = 0;
|
||||
unsigned int freq;
|
||||
bgint_t i, c;
|
||||
bgint_t s, r;
|
||||
bgint_t lastRank, lastIndex;
|
||||
bgint_t oldInverseSa0RelativeRank = 0;
|
||||
bgint_t freq;
|
||||
|
||||
lastIndex = numItem;
|
||||
lastRank = sortedRank[numItem];
|
||||
|
|
@ -831,14 +835,12 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne
|
|||
|
||||
for (i=numItem; i--;) { // from numItem - 1 to 0
|
||||
r = sortedRank[i];
|
||||
if (r > oldInverseSa0) {
|
||||
if (r > oldInverseSa0)
|
||||
sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt
|
||||
}
|
||||
s = seq[i];
|
||||
if (i < freq) {
|
||||
if (lastIndex >= freq) {
|
||||
if (lastIndex >= freq)
|
||||
lastRank++; // to trigger the group across alphabet boundary to be split
|
||||
}
|
||||
c--;
|
||||
freq = cumulativeCount[c];
|
||||
}
|
||||
|
|
@ -846,10 +848,10 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne
|
|||
relativeRank[s] = lastIndex;
|
||||
} else {
|
||||
if (i == lastIndex - 1) {
|
||||
if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
|
||||
if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) {
|
||||
seq[lastIndex] = seq[lastIndex + 1] - 1;
|
||||
} else {
|
||||
seq[lastIndex] = (unsigned int)-1;
|
||||
seq[lastIndex] = (bgint_t)-1;
|
||||
}
|
||||
}
|
||||
lastIndex = i;
|
||||
|
|
@ -865,11 +867,12 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne
|
|||
|
||||
}
|
||||
|
||||
static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar,
|
||||
const unsigned int *cumulativeCount)
|
||||
static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar,
|
||||
const bgint_t *cumulativeCount)
|
||||
{
|
||||
unsigned int i, c;
|
||||
unsigned int previousRank, currentRank;
|
||||
unsigned int c;
|
||||
bgint_t i;
|
||||
bgint_t previousRank, currentRank;
|
||||
|
||||
previousRank = relativeRank[0];
|
||||
|
||||
|
|
@ -877,20 +880,20 @@ static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank,
|
|||
currentRank = relativeRank[i];
|
||||
c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
|
||||
+ (previousRank >= cumulativeCount[3]);
|
||||
seq[currentRank] = c;
|
||||
insertBwt[currentRank] = c;
|
||||
previousRank = currentRank;
|
||||
}
|
||||
}
|
||||
|
||||
static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
|
||||
unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
|
||||
static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
|
||||
unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt)
|
||||
{
|
||||
unsigned int bitsInWordMinusBitPerChar;
|
||||
unsigned int leftShift, rightShift;
|
||||
unsigned int o;
|
||||
unsigned int oIndex, iIndex, mIndex;
|
||||
unsigned int mWord, mChar, oWord, oChar;
|
||||
unsigned int numInsert;
|
||||
bgint_t leftShift, rightShift;
|
||||
bgint_t o;
|
||||
bgint_t oIndex, iIndex, mIndex;
|
||||
bgint_t mWord, mChar, oWord, oChar;
|
||||
bgint_t numInsert;
|
||||
|
||||
bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
|
||||
|
||||
|
|
@ -997,9 +1000,9 @@ static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* o
|
|||
|
||||
void BWTClearTrailingBwtCode(BWT *bwt)
|
||||
{
|
||||
unsigned int bwtResidentSizeInWord;
|
||||
unsigned int wordIndex, offset;
|
||||
unsigned int i;
|
||||
bgint_t bwtResidentSizeInWord;
|
||||
bgint_t wordIndex, offset;
|
||||
bgint_t i;
|
||||
|
||||
bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
|
||||
|
||||
|
|
@ -1020,18 +1023,18 @@ void BWTClearTrailingBwtCode(BWT *bwt)
|
|||
|
||||
|
||||
void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue,
|
||||
unsigned int* __restrict occValueMajor,
|
||||
const unsigned int textLength, const unsigned int* decodeTable)
|
||||
bgint_t* __restrict occValueMajor,
|
||||
const bgint_t textLength, const unsigned int* decodeTable)
|
||||
{
|
||||
unsigned int numberOfOccValueMajor, numberOfOccValue;
|
||||
bgint_t numberOfOccValueMajor, numberOfOccValue;
|
||||
unsigned int wordBetweenOccValue;
|
||||
unsigned int numberOfOccIntervalPerMajor;
|
||||
bgint_t numberOfOccIntervalPerMajor;
|
||||
unsigned int c;
|
||||
unsigned int i, j;
|
||||
unsigned int occMajorIndex;
|
||||
unsigned int occIndex, bwtIndex;
|
||||
unsigned int sum;
|
||||
unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
|
||||
bgint_t i, j;
|
||||
bgint_t occMajorIndex;
|
||||
bgint_t occIndex, bwtIndex;
|
||||
bgint_t sum; // perhaps unsigned is big enough
|
||||
bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
|
||||
|
||||
wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
|
||||
|
||||
|
|
@ -1231,31 +1234,25 @@ void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restri
|
|||
|
||||
}
|
||||
|
||||
static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
||||
static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar)
|
||||
{
|
||||
unsigned int i;
|
||||
unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
|
||||
bgint_t mergedBwtSizeInWord, mergedOccSizeInWord;
|
||||
unsigned int firstCharInThisIteration;
|
||||
|
||||
unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
|
||||
unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (numChar > bwtInc->buildSize) {
|
||||
fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
bgint_t *relativeRank, *seq, *sortedRank;
|
||||
unsigned int *insertBwt, *mergedBwt;
|
||||
bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
|
||||
|
||||
mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
|
||||
mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
|
||||
|
||||
initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
||||
initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
||||
|
||||
if (bwtInc->bwt->textLength == 0) { // Initial build
|
||||
|
||||
// Set address
|
||||
seq = bwtInc->workingMemory;
|
||||
seq = (bgint_t*)bwtInc->workingMemory;
|
||||
relativeRank = seq + bwtInc->buildSize + 1;
|
||||
mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place
|
||||
|
||||
|
|
@ -1265,7 +1262,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|||
relativeRank[numChar] = 0;
|
||||
|
||||
// Sort suffix
|
||||
QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
|
||||
QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE);
|
||||
newInverseSa0 = relativeRank[0];
|
||||
|
||||
// Clear BWT area
|
||||
|
|
@ -1279,9 +1276,9 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|||
|
||||
} else { // Incremental build
|
||||
// Set address
|
||||
sortedRank = bwtInc->workingMemory;
|
||||
sortedRank = (bgint_t*)bwtInc->workingMemory;
|
||||
seq = sortedRank + bwtInc->buildSize + 1;
|
||||
insertBwt = seq;
|
||||
insertBwt = (unsigned*)seq; // insertBwt and seq share memory
|
||||
relativeRank = seq + bwtInc->buildSize + 1;
|
||||
|
||||
// Store the first character of this iteration
|
||||
|
|
@ -1318,15 +1315,10 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|||
// build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
|
||||
// the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
|
||||
BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
|
||||
#ifdef DEBUG
|
||||
if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
|
||||
fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
assert(relativeRank[numChar] == oldInverseSa0RelativeRank);
|
||||
|
||||
// Sort suffix
|
||||
QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);
|
||||
QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE);
|
||||
|
||||
newInverseSa0RelativeRank = relativeRank[0];
|
||||
newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
|
||||
|
|
@ -1334,7 +1326,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|||
sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt
|
||||
|
||||
// Build BWT
|
||||
BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
|
||||
BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
|
||||
|
||||
// Merge BWT
|
||||
mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord
|
||||
|
|
@ -1349,10 +1341,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|||
bwtInc->bwt->bwtCode = mergedBwt;
|
||||
bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
|
||||
bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
|
||||
if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
|
||||
fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
|
||||
exit(1);
|
||||
}
|
||||
assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord);
|
||||
|
||||
bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
|
||||
|
||||
|
|
@ -1376,14 +1365,14 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|||
}
|
||||
|
||||
BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
|
||||
const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
|
||||
bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize)
|
||||
{
|
||||
|
||||
FILE *packedFile;
|
||||
unsigned int packedFileLen;
|
||||
unsigned int totalTextLength;
|
||||
unsigned int textToLoad, textSizeInByte;
|
||||
unsigned int processedTextLength;
|
||||
bgint_t packedFileLen;
|
||||
bgint_t totalTextLength;
|
||||
bgint_t textToLoad, textSizeInByte;
|
||||
bgint_t processedTextLength;
|
||||
unsigned char lastByteLength;
|
||||
|
||||
BWTInc *bwtInc;
|
||||
|
|
@ -1397,10 +1386,6 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN
|
|||
|
||||
fseek(packedFile, -1, SEEK_END);
|
||||
packedFileLen = ftell(packedFile);
|
||||
if ((int)packedFileLen < 0) {
|
||||
fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
|
||||
exit(1);
|
||||
}
|
||||
fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
|
||||
totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
|
||||
|
||||
|
|
@ -1416,9 +1401,9 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN
|
|||
textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte
|
||||
|
||||
fseek(packedFile, -2, SEEK_CUR);
|
||||
fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
||||
fseek(packedFile, -((long)textSizeInByte), SEEK_CUR);
|
||||
fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
|
||||
fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);
|
||||
fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR);
|
||||
|
||||
ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
|
||||
BWTIncConstruct(bwtInc, textToLoad);
|
||||
|
|
@ -1431,15 +1416,15 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN
|
|||
textToLoad = totalTextLength - processedTextLength;
|
||||
}
|
||||
textSizeInByte = textToLoad / CHAR_PER_BYTE;
|
||||
fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
||||
fseek(packedFile, -((long)textSizeInByte), SEEK_CUR);
|
||||
fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
|
||||
fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
||||
fseek(packedFile, -((long)textSizeInByte), SEEK_CUR);
|
||||
ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
|
||||
BWTIncConstruct(bwtInc, textToLoad);
|
||||
processedTextLength += textToLoad;
|
||||
if (bwtInc->numberOfIterationDone % 10 == 0) {
|
||||
printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
|
||||
bwtInc->numberOfIterationDone, processedTextLength);
|
||||
printf("[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n",
|
||||
(long)bwtInc->numberOfIterationDone, (long)processedTextLength);
|
||||
}
|
||||
}
|
||||
return bwtInc;
|
||||
|
|
@ -1464,7 +1449,7 @@ void BWTIncFree(BWTInc *bwtInc)
|
|||
free(bwtInc);
|
||||
}
|
||||
|
||||
static unsigned int BWTFileSizeInWord(const unsigned int numChar)
|
||||
static bgint_t BWTFileSizeInWord(const bgint_t numChar)
|
||||
{
|
||||
// The $ in BWT at the position of inverseSa0 is not encoded
|
||||
return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
|
||||
|
|
@ -1474,7 +1459,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
|
|||
{
|
||||
FILE *bwtFile;
|
||||
/* FILE *occValueFile; */
|
||||
unsigned int bwtLength;
|
||||
bgint_t bwtLength;
|
||||
|
||||
bwtFile = (FILE*)fopen(bwtFileName, "wb");
|
||||
if (bwtFile == NULL) {
|
||||
|
|
@ -1482,8 +1467,8 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
|
|||
exit(1);
|
||||
}
|
||||
|
||||
fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
|
||||
fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
|
||||
fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile);
|
||||
fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile);
|
||||
bwtLength = BWTFileSizeInWord(bwt->textLength);
|
||||
fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
|
||||
fclose(bwtFile);
|
||||
|
|
|
|||
|
|
@ -27,8 +27,8 @@
|
|||
|
||||
#include <stdint.h>
|
||||
|
||||
//typedef int64_t bgint_t;
|
||||
typedef unsigned bgint_t;
|
||||
typedef uint64_t bgint_t;
|
||||
typedef int64_t sbgint_t;
|
||||
|
||||
#define ALPHABET_SIZE 4
|
||||
#define BIT_PER_CHAR 2
|
||||
|
|
@ -69,20 +69,20 @@ typedef struct BWT {
|
|||
unsigned int *occValue; // Occurrence values stored explicitly
|
||||
bgint_t *occValueMajor; // Occurrence values stored explicitly
|
||||
unsigned int *decodeTable; // For decoding BWT by table lookup
|
||||
unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated
|
||||
unsigned int occSizeInWord; // Temporary variable to hold the memory allocated
|
||||
unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated
|
||||
bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated
|
||||
bgint_t occSizeInWord; // Temporary variable to hold the memory allocated
|
||||
bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated
|
||||
} BWT;
|
||||
|
||||
typedef struct BWTInc {
|
||||
BWT *bwt;
|
||||
unsigned int numberOfIterationDone;
|
||||
unsigned int *cumulativeCountInCurrentBuild;
|
||||
unsigned int availableWord;
|
||||
bgint_t *cumulativeCountInCurrentBuild;
|
||||
bgint_t availableWord;
|
||||
float targetNBit;
|
||||
unsigned int buildSize;
|
||||
unsigned int initialMaxBuildSize;
|
||||
unsigned int incMaxBuildSize;
|
||||
bgint_t buildSize;
|
||||
bgint_t initialMaxBuildSize;
|
||||
bgint_t incMaxBuildSize;
|
||||
unsigned int firstCharInLastIteration;
|
||||
unsigned int *workingMemory;
|
||||
unsigned int *packedText;
|
||||
|
|
|
|||
Loading…
Reference in New Issue