merge bwt_gen.h to bwt_gen.c

This commit is contained in:
Heng Li 2011-10-18 23:48:46 -04:00
parent 95b1ab7e96
commit 573ad0c98f
3 changed files with 88 additions and 130 deletions

View File

@ -32,9 +32,12 @@
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "bwt_gen.h"
#include "QSufSort.h"
/* Smaller of two values. Each argument may be evaluated twice, so do not pass
   expressions with side effects. */
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
/* Median of three values. Every argument is parenthesized so that a compound
   expression is compared as a unit; arguments may be evaluated more than once. */
#define med3(a, b, c) ( (a)<(b) ? ((b)<(c) ? (b) : (a)<(c) ? (c) : (a)) : ((b)>(c) ? (b) : (a)>(c) ? (c) : (a)) )
/* Exchange a and b, using t as scratch storage. The do/while(0) wrapper makes
   the expansion a single statement, so it stays inside an unbraced if/else or
   loop body. Invoke as a statement ending in a semicolon: swap(x, y, tmp); */
#define swap(a, b, t) do { (t) = (a); (a) = (b); (b) = (t); } while (0)
// Static functions
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
@ -51,8 +54,8 @@ static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, c
contents of x[n] is disregarded, the n-th symbol being regarded as
end-of-string smaller than all other symbols.*/
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const int skipTransform) {
const qsint_t smallestInputSymbol, const int skipTransform)
{
qsint_t i, j;
qsint_t s, negatedSortedGroupLength;
qsint_t numSymbolAggregated;
@ -96,16 +99,13 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin
}
numSortedPos *= 2; /* double sorted-depth.*/
}
}
void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) {
void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
{
qsint_t i;
for (i=0; i<=numChar; i++) {
for (i=0; i<=numChar; i++)
I[V[i]] = i + 1;
}
}
/* Sorting routine called for each unsorted group. Sorts the array of integers
@ -149,9 +149,8 @@ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, cons
}
c--;
}
if (b > c) {
if (b > c)
break;
}
swap(I[b], I[c], tmp);
b++;
c--;
@ -173,9 +172,8 @@ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, cons
s = b - a;
t = d - c;
if (s > 0) {
if (s > 0)
QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
}
// Update group number for equal portion
a = lowestPos + s;
@ -186,14 +184,12 @@ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, cons
I[a] = -1;
} else {
// Unsorted group
for (c=a; c<=b; c++) {
for (c=a; c<=b; c++)
V[I[c]] = b;
}
}
if (t > 0) {
if (t > 0)
QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
}
}
@ -232,8 +228,8 @@ static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I,
/* Quadratic sorting method to use for small subarrays. */
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar) {
const qsint_t highestPos, const qsint_t numSortedChar)
{
qsint_t i, j;
qsint_t tmpKey, tmpPos;
qsint_t numItem;
@ -269,9 +265,8 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I
if (key[i-1] == key[i]) {
negativeSortedLength = 0;
} else {
if (negativeSortedLength < 0) {
if (negativeSortedLength < 0)
I[i+lowestPos] = negativeSortedLength;
}
groupNum = i + lowestPos - 1;
negativeSortedLength--;
}
@ -280,10 +275,8 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I
I[lowestPos] = pos[0];
V[I[lowestPos]] = groupNum;
if (negativeSortedLength < 0) {
if (negativeSortedLength < 0)
I[lowestPos] = negativeSortedLength;
}
}
/* Bucketsort for first iteration.
@ -295,17 +288,16 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I
Output: x is V and p is I after the initial sorting stage of the refined
suffix sorting algorithm.*/
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) {
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
{
qsint_t i, c;
qsint_t d;
qsint_t groupNum;
qsint_t currentIndex;
// mark linked list empty
for (i=0; i<alphabetSize; i++) {
for (i=0; i<alphabetSize; i++)
I[i] = -1;
}
// insert to linked list
for (i=0; i<=numChar; i++) {
@ -335,7 +327,6 @@ static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, con
}
currentIndex--;
}
}
/* Transforms the alphabet of x by attempting to aggregate several symbols into
@ -354,8 +345,8 @@ static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, con
new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
set to the number of old symbols grouped into one. Only x[n] is 0.*/
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) {
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
{
qsint_t c, i, j;
qsint_t a; // numSymbolAggregated
qsint_t mask;
@ -379,9 +370,8 @@ static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, c
V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
/* bucketing possible, compact alphabet.*/
for (i=0; i<=maxSymbolInChunk; i++) {
for (i=0; i<=maxSymbolInChunk; i++)
I[i] = 0; /* zero transformation table.*/
}
c = minSymbolInChunk;
for (i=a; i<=numChar; i++) {
I[c] = 1; /* mark used chunk symbol.*/
@ -412,6 +402,4 @@ static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, c
*numSymbolAggregated = a;
return newAlphabetSize;
}

View File

@ -26,11 +26,74 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "bwt_gen.h"
#include <stdint.h>
#include "QSufSort.h"
typedef uint64_t bgint_t;
typedef int64_t sbgint_t;
#define ALPHABET_SIZE 4
#define BIT_PER_CHAR 2
#define CHAR_PER_WORD 16
#define CHAR_PER_BYTE 4
#define BITS_IN_WORD 32
#define BITS_IN_BYTE 8
#define BYTES_IN_WORD 4
#define ALL_ONE_MASK 0xFFFFFFFF
#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536
#define BITS_PER_OCC_VALUE 16
#define OCC_VALUE_PER_WORD 2
#define OCC_INTERVAL 256
#define OCC_INTERVAL_MAJOR 65536
#define TRUE 1
#define FALSE 0
#define BWTINC_INSERT_SORT_NUM_ITEM 7
#define MIN_AVAILABLE_WORD 0x10000
/* Helper macros. Unless noted, arguments may be evaluated more than once, so
   do not pass expressions with side effects. */
/* Mean of two integers computed without forming value1 + value2, so the
   intermediate sum cannot overflow. */
#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) )
/* Median of three values; every argument is parenthesized so that a compound
   expression is compared as a unit. */
#define med3(a, b, c) ( (a)<(b) ? ((b)<(c) ? (b) : (a)<(c) ? (c) : (a)) : ((b)>(c) ? (b) : (a)>(c) ? (c) : (a)) )
/* Exchange a and b, using t as scratch storage. The do/while(0) wrapper makes
   the expansion a single statement, so it stays inside an unbraced if/else or
   loop body. Invoke as a statement ending in a semicolon: swap(x, y, tmp); */
#define swap(a, b, t) do { (t) = (a); (a) = (b); (b) = (t); } while (0)
/* Clear the `offset` most-significant bits of value (shift left, then right). */
#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) )
/* Clear the `offset` least-significant bits of value (shift right, then left). */
#define truncateRight(value, offset) ( (value) >> (offset) << (offset) )
/* True when sum has no bit set outside 0x01010100 (the complement of the
   0xfefefeff mask). The argument is parenthesized so that an expression such
   as `a | b` is masked as a whole. */
#define DNA_OCC_SUM_EXCEPTION(sum) (((sum) & 0xfefefeff) == 0)
/* A BWT together with its occurrence-count tables and the sizes of the
   buffers that hold them. bwt_bwtgen() passes one of these (BWTInc.bwt) to
   BWTSaveBwtCodeAndOcc(). */
typedef struct BWT {
bgint_t textLength; // length of the text
bgint_t inverseSa0; // SA^-1[0], i.e. the inverse suffix array evaluated at 0
bgint_t *cumulativeFreq; // cumulative frequency; NOTE(review): presumably one entry per alphabet symbol (plus a total) -- confirm at allocation
unsigned int *bwtCode; // BWT code, stored in 32-bit words; NOTE(review): presumably BIT_PER_CHAR bits per symbol -- confirm
unsigned int *occValue; // Occurrence values stored explicitly (32-bit words)
bgint_t *occValueMajor; // Occurrence values stored explicitly as 64-bit counts; NOTE(review): presumably the coarser OCC_INTERVAL_MAJOR samples -- confirm
unsigned int *decodeTable; // For decoding BWT by table lookup
bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated; by its name, the size in words for bwtCode
bgint_t occSizeInWord; // Temporary variable to hold the memory allocated; by its name, the size in words for occValue
bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated; by its name, the size in words for occValueMajor
} BWT;
/* Working state for constructing a BWT. bwt_bwtgen() obtains one from
   BWTIncConstructFromPacked(), reads bwt and numberOfIterationDone from it,
   and releases it with BWTIncFree(). Field notes marked NOTE(review) are
   inferred from names only and should be confirmed against the builder. */
typedef struct BWTInc {
BWT *bwt; // the BWT being built; handed to BWTSaveBwtCodeAndOcc() by bwt_bwtgen()
unsigned int numberOfIterationDone; // iteration count that bwt_bwtgen() prints when construction finishes
bgint_t *cumulativeCountInCurrentBuild; // NOTE(review): presumably per-symbol cumulative counts for the batch being merged -- confirm
bgint_t availableWord; // NOTE(review): presumably the number of 32-bit words available in workingMemory -- confirm
float targetNBit; // NOTE(review): presumably the memory target in bits per character (bwt_bwtgen() passes 5.0 as the first numeric argument) -- confirm
bgint_t buildSize; // NOTE(review): presumably the number of characters handled in the current iteration -- confirm
bgint_t initialMaxBuildSize; // NOTE(review): presumably the cap on buildSize for the first iteration -- confirm
bgint_t incMaxBuildSize; // NOTE(review): presumably the cap on buildSize for later iterations -- confirm
unsigned int firstCharInLastIteration; // NOTE(review): by its name, the first character processed in the previous iteration -- confirm
unsigned int *workingMemory; // scratch buffer of 32-bit words; NOTE(review): ownership/lifetime not visible here
unsigned int *packedText; // text buffer of 32-bit words; NOTE(review): presumably BIT_PER_CHAR-packed -- confirm
unsigned char *textBuffer; // byte buffer; NOTE(review): presumably one character per byte before packing -- confirm
unsigned int *packedShift; // NOTE(review): presumably shifted copies of the packed text used while sorting -- confirm
} BWTInc;
static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar,
unsigned int lastByteLength)
{
@ -1477,7 +1540,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
{
BWTInc *bwtInc;
bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
bwtInc = BWTIncConstructFromPacked(fn_pac, 5.0, 10000000, 10000000);
printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
BWTIncFree(bwtInc);

View File

@ -1,93 +0,0 @@
/*
BWTConstruct.h BWT-Index Construction
This module constructs BWT and auxiliary data structures.
Copyright (C) 2004, Wong Chi Kwong.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef BWT_GEN_H
#define BWT_GEN_H
#include <stdint.h>
typedef uint64_t bgint_t;
typedef int64_t sbgint_t;
#define ALPHABET_SIZE 4
#define BIT_PER_CHAR 2
#define CHAR_PER_WORD 16
#define CHAR_PER_BYTE 4
#define BITS_IN_WORD 32
#define BITS_IN_BYTE 8
#define BYTES_IN_WORD 4
#define ALL_ONE_MASK 0xFFFFFFFF
#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536
#define BITS_PER_OCC_VALUE 16
#define OCC_VALUE_PER_WORD 2
#define OCC_INTERVAL 256
#define OCC_INTERVAL_MAJOR 65536
#define TRUE 1
#define FALSE 0
#define BWTINC_INSERT_SORT_NUM_ITEM 7
#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) )
#define med3(a, b, c) ( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
#define swap(a, b, t); t = a; a = b; b = t;
#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) )
#define truncateRight(value, offset) ( (value) >> (offset) << (offset) )
#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0)
typedef struct BWT {
bgint_t textLength; // length of the text
bgint_t inverseSa0; // SA-1[0]
bgint_t *cumulativeFreq; // cumulative frequency
unsigned int *bwtCode; // BWT code
unsigned int *occValue; // Occurrence values stored explicitly
bgint_t *occValueMajor; // Occurrence values stored explicitly
unsigned int *decodeTable; // For decoding BWT by table lookup
bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated
bgint_t occSizeInWord; // Temporary variable to hold the memory allocated
bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated
} BWT;
typedef struct BWTInc {
BWT *bwt;
unsigned int numberOfIterationDone;
bgint_t *cumulativeCountInCurrentBuild;
bgint_t availableWord;
float targetNBit;
bgint_t buildSize;
bgint_t initialMaxBuildSize;
bgint_t incMaxBuildSize;
unsigned int firstCharInLastIteration;
unsigned int *workingMemory;
unsigned int *packedText;
unsigned char *textBuffer;
unsigned int *packedShift;
} BWTInc;
#endif