diff --git a/bwt_gen/QSufSort.c b/bwt_gen/QSufSort.c index 5bf35de..92a8594 100644 --- a/bwt_gen/QSufSort.c +++ b/bwt_gen/QSufSort.c @@ -36,59 +36,29 @@ #include "QSufSort.h" // Static functions -static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar); -static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar); -static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar); -static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize); -static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated); - -// from MiscUtilities.c -static unsigned int leadingZero(const unsigned int input) { - - unsigned int l; - const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - if (input & 0xFFFF0000) { - if (input & 0xFF000000) { - l = leadingZero8bit[input >> 24]; - } else { - l = 8 + leadingZero8bit[input >> 16]; - } - } else { - if (input & 0x0000FF00) { - l = 16 + leadingZero8bit[input >> 8]; - } else { - l = 24 + leadingZero8bit[input]; - } - } - return l; - -} +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original contents of x[n] is disregarded, the n-th symbol being regarded as end-of-string smaller than all other symbols.*/ -void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int skipTransform) { +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform) { - int i, j; - int s, negatedSortedGroupLength; - int numSymbolAggregated; - int maxNumInputSymbol; - int numSortedPos = 1; - int newAlphabetSize; + qsint_t i, j; + qsint_t s, negatedSortedGroupLength; + qsint_t numSymbolAggregated; + qsint_t maxNumInputSymbol; + qsint_t numSortedPos = 1; + qsint_t newAlphabetSize; maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; @@ -102,7 +72,7 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, numSortedPos = numSymbolAggregated; } - while ((int)(I[0]) >= -(int)numChar) { + while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { i = 0; negatedSortedGroupLength = 0; do { @@ -129,9 +99,9 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, } -void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) { +void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) { - int i; + qsint_t i; for (i=0; i<=numChar; i++) { I[V[i]] = i + 1; } @@ -143,21 +113,14 @@ void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int nu quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This function is based on Program 7.*/ -static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar) { +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { - int a, b, c, d; - int l, m; - int f, v, s, t; - int tmp; - int numItem; - - #ifdef DEBUG - if (lowestPos > highestPos) { - fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n"); - exit(1); - } - #endif + qsint_t a, b, c, d; + qsint_t l, m; + qsint_t f, v, s, t; + qsint_t tmp; + qsint_t numItem; numItem = highestPos - lowestPos + 1; @@ -171,7 +134,7 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo a = b = lowestPos; c = d = highestPos; - while (TRUE) { + while (1) { while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { if (f == v) { swap(I[a], I[b], tmp); @@ -235,31 +198,17 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo } /* Algorithm by Bentley & McIlroy.*/ -static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar) { +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { - int m; - int keyl, keym, keyn; - int key1, key2, key3; - int s; - int numItem; - - #ifdef DEBUG - if (lowestPos > highestPos) { - fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n"); - exit(1); - } - #endif + qsint_t m; + qsint_t keyl, keym, keyn; + qsint_t key1, key2, key3; + qsint_t s; + qsint_t numItem; numItem = highestPos - lowestPos + 1; - #ifdef DEBUG - if (numItem <= INSERT_SORT_NUM_ITEM) { - fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n"); - exit(1); - } - #endif - m = lowestPos + numItem / 2; s = numItem / 8; @@ -282,39 +231,19 @@ static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int l } /* Quadratic sorting method to use for small subarrays. */ -static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar) { +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { - int i, j; - int tmpKey, tmpPos; - int numItem; - int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; - int negativeSortedLength; - int groupNum; - - #ifdef DEBUG - if (lowestPos > highestPos) { - fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n"); - exit(1); - } - #endif + qsint_t i, j; + qsint_t tmpKey, tmpPos; + qsint_t numItem; + qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; + qsint_t negativeSortedLength; + qsint_t groupNum; numItem = highestPos - lowestPos + 1; - #ifdef DEBUG - if (numItem > INSERT_SORT_NUM_ITEM) { - fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n"); - exit(1); - } - #endif - for (i=0; i0; i--) { c = I[i-1]; - d = (int)(V[c]); + d = (qsint_t)(V[c]); groupNum = currentIndex; V[c] = groupNum; if (d >= 0) { @@ -424,20 +353,20 @@ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int n Output: Returns an integer j in the range 1...q representing the size of the new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is set to the number of old symbols grouped into one. Only x[n] is 0.*/ -static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) { +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) { - int c, i, j; - int a; // numSymbolAggregated - int mask; - int minSymbolInChunk = 0, maxSymbolInChunk = 0; - int newAlphabetSize; - int maxNumInputSymbol, maxNumBit, maxSymbol; + qsint_t c, i, j; + qsint_t a; // numSymbolAggregated + qsint_t mask; + qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; + qsint_t newAlphabetSize; + qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; - maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol); - maxSymbol = INT_MAX >> maxNumBit; + for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; + maxSymbol = QSINT_MAX >> maxNumBit; c = maxNumInputSymbol; for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { @@ -449,14 +378,6 @@ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int num mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ - #ifdef DEBUG - // Section of code for maxSymbolInChunk > numChar removed! - if (maxSymbolInChunk > numChar) { - fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n"); - exit(1); - } - #endif - /* bucketing possible, compact alphabet.*/ for (i=0; i<=maxSymbolInChunk; i++) { I[i] = 0; /* zero transformation table.*/ diff --git a/bwt_gen/QSufSort.h b/bwt_gen/QSufSort.h index 8724d30..6faf9f6 100644 --- a/bwt_gen/QSufSort.h +++ b/bwt_gen/QSufSort.h @@ -29,12 +29,17 @@ #ifndef __QSUFSORT_H__ #define __QSUFSORT_H__ +#include + #define KEY(V, I, p, h) ( V[ I[p] + h ] ) #define INSERT_SORT_NUM_ITEM 16 -void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int skipTransform); -void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar); +typedef int64_t qsint_t; +#define QSINT_MAX INT64_MAX + +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform); +void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar); #endif diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index b187c18..fecbf56 100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -25,22 +25,27 @@ #include #include #include +#include #include "bwt_gen.h" #include "QSufSort.h" -static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, +#define MIN_AVAILABLE_WORD 0x10000 + +static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength) { - if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { - fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); - exit(1); - } return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; } -static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) +static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) { - unsigned int i; + bgint_t i; + for (i=0; ibwt->textLength == 0) { // initial build // Minus 2 because n+1 entries of seq and rank needed for n char - maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD) - / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD; + maxBuildSize = (bwtInc->availableWord - 2 * (sizeof(bgint_t) / 4) - OCC_INTERVAL / CHAR_PER_WORD) + / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); if (bwtInc->initialMaxBuildSize > 0) { bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); } else { @@ -104,9 +109,9 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) } else { // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char // Minus numberOfIterationDone because bwt slightly shift to left in each iteration - maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 + maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 * (sizeof(bgint_t) / 4) - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) - / 3; + / 3 / (sizeof(bgint_t) / 4); if (maxBuildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); exit(1); @@ -116,9 +121,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) } else { bwtInc->buildSize = maxBuildSize; } - if (bwtInc->buildSize < CHAR_PER_WORD) { + if (bwtInc->buildSize < CHAR_PER_WORD) bwtInc->buildSize = CHAR_PER_WORD; - } } if (bwtInc->buildSize < CHAR_PER_WORD) { @@ -128,9 +132,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; - bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1); - bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1); - + bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); + bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); } // for ceilLog2() @@ -186,17 +189,17 @@ static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) } static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, - const unsigned int textLength) + const bgint_t textLength) { - unsigned int i, j, k; - unsigned int c; + bgint_t i; + unsigned int j, k, c; unsigned int bitPerBytePackedChar; unsigned int bitPerWordPackedChar; unsigned int charPerWord; unsigned int charPerByte; unsigned int bytePerIteration; - unsigned int byteProcessed = 0; - unsigned int wordProcessed = 0; + bgint_t byteProcessed = 0; + bgint_t wordProcessed = 0; unsigned int mask, shift; unsigned int buffer[BITS_IN_WORD]; @@ -250,7 +253,7 @@ static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned i output[wordProcessed] = c; } -BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) +BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) { BWT *bwt; @@ -258,8 +261,8 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) bwt->textLength = 0; - bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); - initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); + bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; @@ -272,7 +275,7 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) } bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); - bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int)); + bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); bwt->occSizeInWord = 0; bwt->occValue = NULL; @@ -297,8 +300,8 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; bwtInc->targetNBit = targetNBit; - bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int)); - initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); @@ -306,7 +309,8 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; } - bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); + bwtInc->availableWord = (bgint_t)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); + if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); exit(1); @@ -317,14 +321,16 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, } // for BWTIncConstruct() -static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank, - unsigned int* __restrict cumulativeCount, const unsigned int numChar) +static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, + bgint_t* __restrict cumulativeCount, const bgint_t numChar) { - unsigned int i, j; + bgint_t i; + unsigned int j; unsigned int c, t; unsigned int packedMask; - unsigned int rankIndex; - unsigned int lastWord, numCharInLastWord; + bgint_t rankIndex; + bgint_t lastWord; + unsigned int numCharInLastWord; lastWord = (numChar - 1) / CHAR_PER_WORD; numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; @@ -359,17 +365,17 @@ static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned i } -static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, - unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) +static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, + bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; - unsigned int iteration, wordToCount, charToCount; - unsigned int i, j, c; - unsigned int sum; + bgint_t iteration, i; + unsigned int wordToCount, charToCount; + unsigned int j, c, sum; occCount[0] = 0; occCount[1] = 0; @@ -432,13 +438,14 @@ static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigne occCount[3] += sum; } -static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar, - const unsigned int *cumulativeCount, const unsigned int *packedShift) { +static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, + const bgint_t *cumulativeCount, const unsigned int *packedShift) { - unsigned int i, c, r; - unsigned int previousRank, currentRank; - unsigned int wordIndex, charIndex; - unsigned int inverseSa0; + bgint_t i, r; + unsigned int c; + bgint_t previousRank, currentRank; + bgint_t wordIndex, charIndex; + bgint_t inverseSa0; inverseSa0 = previousRank = relativeRank[0]; @@ -463,10 +470,10 @@ static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* } } -static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, +static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, const unsigned int character) { - unsigned int occIndexMajor; + bgint_t occIndexMajor; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; @@ -546,46 +553,43 @@ static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned } -unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) { - - unsigned int occValue; - unsigned int occExplicitIndex, occIndex; +bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) +{ + bgint_t occValue; + bgint_t occExplicitIndex, occIndex; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment - if (index > bwt->inverseSa0) { + if (index > bwt->inverseSa0) index--; - } occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex = occExplicitIndex * OCC_INTERVAL; occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); - if (occIndex == index) { + if (occIndex == index) return occValue; - } if (occIndex < index) { return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); } else { return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); } - } -static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq, - const unsigned int *packedText, const unsigned int numChar, - const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration) +static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, + const unsigned int *packedText, const bgint_t numChar, + const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) { - unsigned int saIndex; - unsigned int lastWord; + bgint_t saIndex; + bgint_t lastWord; unsigned int packedMask; - unsigned int i, j; - unsigned int c, t; - unsigned int rankIndex; + bgint_t i; + unsigned int c, t, j; + bgint_t rankIndex; unsigned int shift; - unsigned int seqIndexFromStart[ALPHABET_SIZE]; - unsigned int seqIndexFromEnd[ALPHABET_SIZE]; + bgint_t seqIndexFromStart[ALPHABET_SIZE]; + bgint_t seqIndexFromEnd[ALPHABET_SIZE]; for (i=0; i oldInverseSa0) { + if (r > oldInverseSa0) sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt - } s = seq[i]; if (i < freq) { - if (lastIndex >= freq) { + if (lastIndex >= freq) lastRank++; // to trigger the group across alphabet boundary to be split - } c--; freq = cumulativeCount[c]; } @@ -846,10 +848,10 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne relativeRank[s] = lastIndex; } else { if (i == lastIndex - 1) { - if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) { + if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { seq[lastIndex] = seq[lastIndex + 1] - 1; } else { - seq[lastIndex] = (unsigned int)-1; + seq[lastIndex] = (bgint_t)-1; } } lastIndex = i; @@ -865,11 +867,12 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne } -static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar, - const unsigned int *cumulativeCount) +static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, + const bgint_t *cumulativeCount) { - unsigned int i, c; - unsigned int previousRank, currentRank; + unsigned int c; + bgint_t i; + bgint_t previousRank, currentRank; previousRank = relativeRank[0]; @@ -877,20 +880,20 @@ static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, currentRank = relativeRank[i]; c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) + (previousRank >= cumulativeCount[3]); - seq[currentRank] = c; + insertBwt[currentRank] = c; previousRank = currentRank; } } -static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, - unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt) +static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, + unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) { unsigned int bitsInWordMinusBitPerChar; - unsigned int leftShift, rightShift; - unsigned int o; - unsigned int oIndex, iIndex, mIndex; - unsigned int mWord, mChar, oWord, oChar; - unsigned int numInsert; + bgint_t leftShift, rightShift; + bgint_t o; + bgint_t oIndex, iIndex, mIndex; + bgint_t mWord, mChar, oWord, oChar; + bgint_t numInsert; bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; @@ -997,9 +1000,9 @@ static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* o void BWTClearTrailingBwtCode(BWT *bwt) { - unsigned int bwtResidentSizeInWord; - unsigned int wordIndex, offset; - unsigned int i; + bgint_t bwtResidentSizeInWord; + bgint_t wordIndex, offset; + bgint_t i; bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); @@ -1020,18 +1023,18 @@ void BWTClearTrailingBwtCode(BWT *bwt) void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, - unsigned int* __restrict occValueMajor, - const unsigned int textLength, const unsigned int* decodeTable) + bgint_t* __restrict occValueMajor, + const bgint_t textLength, const unsigned int* decodeTable) { - unsigned int numberOfOccValueMajor, numberOfOccValue; + bgint_t numberOfOccValueMajor, numberOfOccValue; unsigned int wordBetweenOccValue; - unsigned int numberOfOccIntervalPerMajor; + bgint_t numberOfOccIntervalPerMajor; unsigned int c; - unsigned int i, j; - unsigned int occMajorIndex; - unsigned int occIndex, bwtIndex; - unsigned int sum; - unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; + bgint_t i, j; + bgint_t occMajorIndex; + bgint_t occIndex, bwtIndex; + bgint_t sum; // perhaps unsigned is big enough + bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; @@ -1231,31 +1234,25 @@ void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restri } -static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) +static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) { unsigned int i; - unsigned int mergedBwtSizeInWord, mergedOccSizeInWord; + bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; unsigned int firstCharInThisIteration; - unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt; - unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; - - #ifdef DEBUG - if (numChar > bwtInc->buildSize) { - fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n"); - exit(1); - } - #endif + bgint_t *relativeRank, *seq, *sortedRank; + unsigned int *insertBwt, *mergedBwt; + bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); - initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); if (bwtInc->bwt->textLength == 0) { // Initial build // Set address - seq = bwtInc->workingMemory; + seq = (bgint_t*)bwtInc->workingMemory; relativeRank = seq + bwtInc->buildSize + 1; mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place @@ -1265,7 +1262,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) relativeRank[numChar] = 0; // Sort suffix - QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE); + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); newInverseSa0 = relativeRank[0]; // Clear BWT area @@ -1279,9 +1276,9 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) } else { // Incremental build // Set address - sortedRank = bwtInc->workingMemory; + sortedRank = (bgint_t*)bwtInc->workingMemory; seq = sortedRank + bwtInc->buildSize + 1; - insertBwt = seq; + insertBwt = (unsigned*)seq; // insertBwt and seq share memory relativeRank = seq + bwtInc->buildSize + 1; // Store the first character of this iteration @@ -1318,15 +1315,10 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); -#ifdef DEBUG - if (relativeRank[numChar] != oldInverseSa0RelativeRank) { - fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n"); - exit(1); - } -#endif + assert(relativeRank[numChar] == oldInverseSa0RelativeRank); // Sort suffix - QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE); + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); newInverseSa0RelativeRank = relativeRank[0]; newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; @@ -1334,7 +1326,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt // Build BWT - BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); + BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); // Merge BWT mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord @@ -1349,10 +1341,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) bwtInc->bwt->bwtCode = mergedBwt; bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; - if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) { - fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n"); - exit(1); - } + assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; @@ -1376,14 +1365,14 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) } BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, - const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) + bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) { FILE *packedFile; - unsigned int packedFileLen; - unsigned int totalTextLength; - unsigned int textToLoad, textSizeInByte; - unsigned int processedTextLength; + bgint_t packedFileLen; + bgint_t totalTextLength; + bgint_t textToLoad, textSizeInByte; + bgint_t processedTextLength; unsigned char lastByteLength; BWTInc *bwtInc; @@ -1397,10 +1386,6 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN fseek(packedFile, -1, SEEK_END); packedFileLen = ftell(packedFile); - if ((int)packedFileLen < 0) { - fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n"); - exit(1); - } fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); @@ -1416,9 +1401,9 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte fseek(packedFile, -2, SEEK_CUR); - fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); - fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); @@ -1431,15 +1416,15 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; - fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); - fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; if (bwtInc->numberOfIterationDone % 10 == 0) { - printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n", - bwtInc->numberOfIterationDone, processedTextLength); + printf("[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n", + (long)bwtInc->numberOfIterationDone, (long)processedTextLength); } } return bwtInc; @@ -1464,7 +1449,7 @@ void BWTIncFree(BWTInc *bwtInc) free(bwtInc); } -static unsigned int BWTFileSizeInWord(const unsigned int numChar) +static bgint_t BWTFileSizeInWord(const bgint_t numChar) { // The $ in BWT at the position of inverseSa0 is not encoded return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; @@ -1474,7 +1459,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o { FILE *bwtFile; /* FILE *occValueFile; */ - unsigned int bwtLength; + bgint_t bwtLength; bwtFile = (FILE*)fopen(bwtFileName, "wb"); if (bwtFile == NULL) { @@ -1482,8 +1467,8 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o exit(1); } - fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile); - fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile); + fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); + fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); bwtLength = BWTFileSizeInWord(bwt->textLength); fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); fclose(bwtFile); diff --git a/bwt_gen/bwt_gen.h b/bwt_gen/bwt_gen.h index fe7fcf9..954c1c0 100644 --- a/bwt_gen/bwt_gen.h +++ b/bwt_gen/bwt_gen.h @@ -27,8 +27,8 @@ #include -//typedef int64_t bgint_t; -typedef unsigned bgint_t; +typedef uint64_t bgint_t; +typedef int64_t sbgint_t; #define ALPHABET_SIZE 4 #define BIT_PER_CHAR 2 @@ -69,20 +69,20 @@ typedef struct BWT { unsigned int *occValue; // Occurrence values stored explicitly bgint_t *occValueMajor; // Occurrence values stored explicitly unsigned int *decodeTable; // For decoding BWT by table lookup - unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated - unsigned int occSizeInWord; // Temporary variable to hold the memory allocated - unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated + bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated } BWT; typedef struct BWTInc { BWT *bwt; unsigned int numberOfIterationDone; - unsigned int *cumulativeCountInCurrentBuild; - unsigned int availableWord; + bgint_t *cumulativeCountInCurrentBuild; + bgint_t availableWord; float targetNBit; - unsigned int buildSize; - unsigned int initialMaxBuildSize; - unsigned int incMaxBuildSize; + bgint_t buildSize; + bgint_t initialMaxBuildSize; + bgint_t incMaxBuildSize; unsigned int firstCharInLastIteration; unsigned int *workingMemory; unsigned int *packedText;