twirls/MexFunc/SortDedup.cpp

147 lines
3.9 KiB
C++
Raw Normal View History

2023-10-07 04:21:54 +08:00
#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
#include <unordered_set>
#include <ctime>
#include <immintrin.h>
#include <zmmintrin.h>
#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
#include <unordered_set>
#include <set>
#include <fstream>
using std::cout;
using std::endl;
using namespace std;
#define STRING_BUF_SIZE 204800
// Read the strings, convert them to uppercase, and insert them into the set
/**
 * Collects every string stored in a cell array of cell arrays, converts it to
 * upper case and inserts it into sWord.
 *
 * @param pMxArray Outer cell array. Each populated element is read with
 *                 mxGetCell and its own elements are read with mxGetString.
 *                 NOTE(review): element types are not verified here; callers
 *                 are assumed to pass a cell of cells of char arrays.
 * @param sWord    Set that receives the upper-cased strings.
 * @return true when every string was read; false when pMxArray is null or as
 *         soon as mxGetString reports a failure (a message with the 1-based
 *         row/column of the outer cell is printed in that case).
 */
bool ReadInsertWord(const mxArray* pMxArray, std::unordered_set<std::string>& sWord) {
    if (pMxArray == nullptr) {
        return false;
    }
    // Owning buffer: released on every exit path, including the early return below.
    std::vector<char> strBuf(STRING_BUF_SIZE);
    const std::size_t rowNum = mxGetM(pMxArray);
    const std::size_t colNum = mxGetN(pMxArray);
    for (std::size_t i = 0; i < rowNum; ++i) {
        for (std::size_t j = 0; j < colNum; ++j) {
            // Linear index j * rowNum + i addresses element (i, j).
            const mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
            if (pCell == nullptr) {
                continue; // unpopulated outer cell: nothing to read
            }
            const std::size_t childRowNum = mxGetM(pCell);
            const std::size_t childColNum = mxGetN(pCell);
            for (std::size_t ii = 0; ii < childRowNum; ++ii) {
                for (std::size_t jj = 0; jj < childColNum; ++jj) {
                    const mxArray* pChildCell = mxGetCell(pCell, jj * childRowNum + ii);
                    if (pChildCell == nullptr) {
                        continue; // unpopulated inner cell: nothing to read
                    }
                    if (mxGetString(pChildCell, strBuf.data(), strBuf.size()) != 0) {
                        std::cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << std::endl;
                        return false;
                    }
                    std::string str(strBuf.data());
                    // Convert to upper case. Going through unsigned char keeps
                    // toupper well defined for bytes outside the ASCII range.
                    std::transform(str.begin(), str.end(), str.begin(),
                                   [](unsigned char ch) { return static_cast<char>(::toupper(ch)); });
                    sWord.insert(std::move(str));
                }
            }
        }
    }
    return true;
}
/* Entry point */
/*
Inputs:
1. wd: cell array whose elements are cell arrays of strings (presumably the words of the abstracts)
[2]. Path of the file the sorted words are written to (optional)
[3]. flagPrint: whether to print timing information (optional)
Outputs:
1. dic: one-dimensional cell holding the deduplicated, upper-cased, sorted words (only words that start with a letter A-Z are kept)
*/
2023-10-07 04:21:54 +08:00
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
if (nrhs < 1) {
cout << "At least 1 arguments should be given for this function!" << endl;
return;
}
clock_t begin = clock(), mid, finish;
2023-10-07 04:21:54 +08:00
unordered_set<string> usStr;
ReadInsertWord(prhs[0], usStr);
// usStr.insert("A");
// usStr.insert("Z");
string outputPath;
if (nrhs > 1) {
char* strBuf = new char[STRING_BUF_SIZE];
mxGetString(prhs[1], strBuf, STRING_BUF_SIZE);
outputPath = strBuf;
delete[]strBuf;
}
int flagPrint = 0; // <20>Ƿ<EFBFBD><C7B7><EFBFBD>ӡ<EFBFBD><D3A1>Ϣ, 1<><31>ӡ<EFBFBD><D3A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><CFA2>2<EFBFBD><32>ӡ<EFBFBD><D3A1>ϸ<EFBFBD><CFB8>Ϣ
if (nrhs > 2) {
double* pData = (double*)mxGetData(prhs[2]);
flagPrint = (int)pData[0];
}
2023-10-07 04:21:54 +08:00
finish = clock();
if (flagPrint == 2) cout << "Load data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* <20><><EFBFBD><EFBFBD> */
mid = clock();
2023-10-07 04:21:54 +08:00
set<string> sOrderedWord;
for (auto& word : usStr) {
sOrderedWord.insert(word);
}
finish = clock();
if (flagPrint == 2) cout << "Sort and deduplicate time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
2023-10-07 04:21:54 +08:00
/* <20><><EFBFBD>ַ<EFBFBD><D6B7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD> */
if (! outputPath.empty()) {
cout << outputPath << endl;
ofstream ofs(outputPath);
for (auto& word : sOrderedWord) ofs << word << endl;
ofs.close();
}
sOrderedWord.insert("A");
sOrderedWord.insert("Z");
2023-10-07 04:21:54 +08:00
/* д<><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
mid = clock();
2023-10-07 04:21:54 +08:00
if (nlhs > 0) {
int wordSize = 0;
for (auto& word : sOrderedWord) {
if (word[0] >= 'A' && word[0] <= 'Z') {
wordSize++;
}
}
mxArray* pCell = mxCreateCellMatrix(1, wordSize);
int i = 0;
for (auto& word : sOrderedWord) {
if (word[0] >= 'A' && word[0] <= 'Z') {
mxArray* mxStr = mxCreateString(word.c_str());
mxSetCell(pCell, i++, mxStr);
}
}
plhs[0] = pCell; // <20><>ֵ<EFBFBD><D6B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ
}
finish = clock();
if (flagPrint == 2) cout << "Write back data time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
finish = clock();
if (flagPrint)cout << "Deduplicate and Sort word Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}
// Wrapper so the entry point can be called from C++
/**
 * Forwards all four arguments unchanged to mexFunction.
 */
void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
    mexFunction(nlhs, plhs, nrhs, prhs);
}