diff --git a/CommonLib/matlab_io.cpp b/CommonLib/matlab_io.cpp index 77e8513..76593a4 100644 --- a/CommonLib/matlab_io.cpp +++ b/CommonLib/matlab_io.cpp @@ -28,7 +28,7 @@ bool ReadChildString2D(const string& filePath, const string& parentName, const s mxArray* pMxArray = nullptr; mxArray* pCell = nullptr; int rowNum, colNum; - char strBuf[STRING_BUF_SIZE]; + char *strBuf = new char[STRING_BUF_SIZE]; pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件 if (pMatFile == nullptr) { @@ -63,6 +63,7 @@ bool ReadChildString2D(const string& filePath, const string& parentName, const s } } mxDestroyArray(pMxArray); + delete[]strBuf; return true; } diff --git a/MexFunc/MexFunc.vcxproj b/MexFunc/MexFunc.vcxproj index 0a22ae6..6eb53f7 100644 --- a/MexFunc/MexFunc.vcxproj +++ b/MexFunc/MexFunc.vcxproj @@ -119,8 +119,8 @@ - + diff --git a/MexFunc/MexFunc.vcxproj.filters b/MexFunc/MexFunc.vcxproj.filters index 87c8a4e..ef800b8 100644 --- a/MexFunc/MexFunc.vcxproj.filters +++ b/MexFunc/MexFunc.vcxproj.filters @@ -18,7 +18,7 @@ Source Files - + Source Files diff --git a/MexFunc/RandSim.cpp b/MexFunc/RandSim.cpp new file mode 100644 index 0000000..7391229 --- /dev/null +++ b/MexFunc/RandSim.cpp @@ -0,0 +1,381 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + ->std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for (size_t i = 0;i < threads;++i) + workers.emplace_back( + [this] + { + for (;;) + { + std::function task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) +-> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if (stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task]() { (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) + worker.join(); +} + +// 读取一维cell字符串并转换成大写 +inline bool Read1DWord(const mxArray* pMxArray, vector& vStr) { + mxArray* pCell = nullptr; + int rowNum, colNum; + char* strBuf = new char[STRING_BUF_SIZE]; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + vStr.resize(rowNum * colNum); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + return false; + } + vStr[i * colNum + j] = strBuf; + auto& lastStr = vStr[i * colNum + j]; + transform(lastStr.cbegin(), lastStr.cend(), lastStr.begin(), ::toupper); // 转成大写 + } + } + delete[]strBuf; + return true; +} + +// 读取二维cell字符串并转换成大写 +inline bool Read2DWord(const mxArray* pMxArray, vector>& vvStr) { + mxArray* pCell = nullptr; + int rowNum, colNum; + char* strBuf = new char[STRING_BUF_SIZE]; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + int childRowNum = (int)mxGetM(pCell); + int childColNum = (int)mxGetN(pCell); + vvStr.push_back(vector()); + Read1DWord(pCell, vvStr.back()); + } + } + delete[]strBuf; + return true; +} + +// 从txt文件里读取字符串, 并转换成大写 +inline void ReadWordFromFile(const string& filePath, vector>& vvStr) { + filebuf fb; + if (fb.open(filePath.c_str(), ios::in) == NULL) { + cout << "FilePath error: " << filePath << endl; + return; + } + istream ist(&fb); + string lineInfo; + while (getline(ist, lineInfo)) { + int i = 0; + vvStr.push_back(vector()); + vector & vecStr = vvStr.back(); + string tmp; + while (i < lineInfo.length()) { + while (i < lineInfo.length() && lineInfo[i] != ' ') { + tmp += lineInfo[i++]; + } + if (!tmp.empty()) { + transform(tmp.begin(), tmp.end(), tmp.begin(), ::toupper); + vecStr.push_back(tmp); + } + tmp.clear(); + ++i; + } + } + fb.close(); +} + +// 线程参数 +struct TPRandSim { + vector* pvZr; + vector* pvRandPos; + unordered_map* pumDicWordPos; + vector>* pvvWd2; + int wdSize; +}; + +// 多线程入口函数 +void ThreadRandSim(TPRandSim& param) { + vector &vZr = *param.pvZr; + vector &vRandPos = *param.pvRandPos; + unordered_map &umDicWordPos = *param.pumDicWordPos; + vector> &vvWd2 = *param.pvvWd2; + int wdSize = param.wdSize; + + clock_t begin = clock(), finish; + /* 随机模拟 */ + std::random_device rd; + std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd())); + unordered_set usPos; + for (int i = 0; i < wdSize; ++i) { + // cout << i << '\t' << vRandPos[i] << '\t' << vvWd2.size() << endl; + auto& vWd2 = vvWd2[vRandPos[i]]; + usPos.clear(); + for (auto& word : vWd2) { + auto itr = umDicWordPos.find(word); + if (itr != umDicWordPos.end()) { + usPos.insert(itr->second); + } + } + for (auto idx : usPos) { + vZr[idx] += 1; + } + } + finish = clock(); + // cout << "Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + + + +/* 入口函数 */ +/* +三个参数,一个返回值 +输入: +1. wd 文献摘要中的单词,二维cell +2. wd2 包含5w个摘要,每个摘要中包含多个单词,二维cell +3. dicr 由大写字母组成的字典,按字母序排序的,一维cell +4. numThread +5. numLoop +输出: +vr 应该是相关性数值 +*/ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { +//void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + if (nrhs < 1) { + cout << "At least 3 arguments should be given for this function!" << endl; + return; + } + clock_t begin = clock(), mid, finish; + vector vDicr; + vector> vvWd; + vector> vvWd2; // 5w word + + Read2DWord(prhs[0], vvWd); + // Read2DWord(prhs[1], vvWd2); + char* strBuf = new char[STRING_BUF_SIZE]; + mxGetString(prhs[1], strBuf, STRING_BUF_SIZE); + string wd2FilePath(strBuf); + delete[]strBuf; + ReadWordFromFile(wd2FilePath, vvWd2); + Read1DWord(prhs[2], vDicr); + + // char* strBuf = new char[STRING_BUF_SIZE]; + // mxGetString(prhs[1], strBuf, STRING_BUF_SIZE); + // string wd2FilePath(strBuf); + // delete[]strBuf; + // cout << wd2FilePath << endl; + // vector> vvWd3; + // ReadWordFromFile("D:\\Twirls\\gat1\\literatures\\temp\\wd2s.txt", vvWd3); + + //ofstream ofs("d:\\diff.txt"); + //ofs << vvWd2.size() << '\t' << vvWd3.size() << endl; + //for (int i = 0; i < vvWd2.size(); ++i) { + // if (vvWd2[i].size() != vvWd3[i].size()) + // ofs << vvWd2[i].size() << '\t' << vvWd3[i].size() << endl; + // //for (int j = 0; j < vvWd2[i].size(); ++j) { + // // if (vvWd2[i][j] != vvWd3[i][j]) { + // // ofs << i+1 << '\t' << j+1 << '\t' << vvWd2[i][j] << '\t' << vvWd3[i][j] << endl; + // // } + // //} + //} + //ofs.close(); + + int numThread = 1; + int loopNum = 1000; + + if (nrhs > 3) { + double* pNumThread = (double*)mxGetData(prhs[3]); + numThread = (int)pNumThread[0]; + if (numThread < 1) numThread = 1; + } + if (nrhs > 4) { + double* pLoopNum = (double*)mxGetData(prhs[4]); + loopNum = (int)pLoopNum[0]; + if (loopNum < 1000) loopNum = 1000; + } + + /* 统计dicr字典中,每个单词在wd中出现的次数 */ + mid = clock(); + unordered_map umWordPos; + for (int i = 0; i < vDicr.size(); ++i) umWordPos[vDicr[i]] = i; // 记录单词位置 + vector vZs(vDicr.size()); + unordered_set usPos; // 多次出现在wd中的单词,只统计一次,这是原matlab代码的功能,是否需要修改? + for (auto & vWd : vvWd) { + usPos.clear(); + for (auto & word : vWd) { + auto itr = umWordPos.find(word); + if (itr != umWordPos.end()) { + usPos.insert(itr->second); + } + } + for (auto idx : usPos) { + vZs[idx] += 1; + } + } + finish = clock(); + cout << "Calc word occurrence time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + /* 进行随机模拟 */ + mid = clock(); + vector> vvZr(loopNum, vector(vDicr.size(), 0)); // 输出结果 + vector> vvRandPos(numThread, vector(vvWd2.size())); + for (int i = 0; i < vvWd2.size(); ++i) { + for (auto& vRandPos : vvRandPos) { + vRandPos[i] = i; + } + } + + //ThreadPool thPool(numThread); + int tid = 0; + for (int i = 0; i < loopNum; ++i) { + TPRandSim tParam = { &vvZr[i], &vvRandPos[tid++ % numThread], &umWordPos, &vvWd2, vvWd.size()}; + //thPool.enqueue(ThreadRandSim, tParam); + ThreadRandSim(tParam); + } + //thPool.~ThreadPool(); + finish = clock(); + cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + /* 计算vr */ + vector vVr(vDicr.size()); + // 按列计算平均值 + vector vMean(vDicr.size()); + vector vStd(vDicr.size()); + for (int i = 0; i < vvZr.size(); ++i) { + for (int j = 0; j < vvZr[i].size(); ++j) { + vMean[j] += vvZr[i][j]; + } + } + for (auto& val : vMean) { val /= loopNum; } // 均值 + for (int i = 0; i < vvZr.size(); ++i) { + for (int j = 0; j < vvZr[i].size(); ++j) { + const double diff = vvZr[i][j] - vMean[j]; + vStd[j] += diff * diff; + } + } + for (auto& val : vStd) { val = sqrt(val / (loopNum - 1)); } // 均方根 + // 计算vr + for (int i = 0; i < vVr.size(); ++i) { + vVr[i] = (vZs[i] - vMean[i]) / vStd[i]; + } + + // ofstream ofs("d:\\result.txt"); + // int i = 0; + // for (auto& vr : vVr) { + // ofs << vr << endl; + // } + // ofs.close(); + + /* 写入结果 */ + if (nlhs > 0) { + mxArray* pWriteArray = NULL;//matlab格式矩阵 + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(1, vVr.size(), mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)vVr.data(), sizeof(double) * vVr.size()); + plhs[0] = pWriteArray; // 赋值给返回值 + } + finish = clock(); + cout << "Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} \ No newline at end of file diff --git a/MexFunc/SortDedup.cpp b/MexFunc/SortDedup.cpp new file mode 100644 index 0000000..e7623be --- /dev/null +++ b/MexFunc/SortDedup.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 + +// 读取字符串并转换成大写, 插入set +bool ReadInsertWord(const mxArray* pMxArray, unordered_set &sWord) { + mxArray* pCell = nullptr; + int rowNum, colNum; + char* strBuf = new char[STRING_BUF_SIZE]; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + int childRowNum = (int)mxGetM(pCell); + int childColNum = (int)mxGetN(pCell); + for (int ii = 0; ii < childRowNum; ii++) { + for (int jj = 0; jj < childColNum; jj++) { + mxArray* pChildCell = mxGetCell(pCell, jj * childRowNum + ii); + if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + return false; + } + string str(strBuf); + transform(str.cbegin(), str.cend(), str.begin(), ::toupper); // 转成大写 + sWord.insert(str); + } + } + } + } + delete[]strBuf; + return true; +} + +/* 入口函数 */ +// void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + if (nrhs < 1) { + cout << "At least 1 arguments should be given for this function!" << endl; + return; + } + clock_t begin = clock(), finish; + + //set sOrderedWord; + + unordered_set usStr; + ReadInsertWord(prhs[0], usStr); + usStr.insert("A"); + usStr.insert("Z"); + + ///* 排序 */ + set sOrderedWord; + for (auto& word : usStr) { + sOrderedWord.insert(word); + } + + //ofstream ofs("d:\\wd_dict.txt"); + //for (auto& word : sOrderedWord) ofs << word << endl; + //ofs.close(); + + /* 写入结果 */ + if (nlhs > 0) { + int wordSize = 0; + for (auto& word : sOrderedWord) { + if (word[0] >= 'A' && word[0] <= 'Z') { + wordSize++; + } + } + mxArray* pCell = mxCreateCellMatrix(1, wordSize); + int i = 0; + for (auto& word : sOrderedWord) { + if (word[0] >= 'A' && word[0] <= 'Z') { + mxArray* mxStr = mxCreateString(word.c_str()); + mxSetCell(pCell, i++, mxStr); + //ofs << word << endl; + } + } + plhs[0] = pCell; // 赋值给返回值 + } + //ofs.close(); + finish = clock(); + cout << "Deduplicate and Sort word Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} \ No newline at end of file diff --git a/MexFunc/main.cpp b/MexFunc/main.cpp index a26ec9f..d365162 100644 --- a/MexFunc/main.cpp +++ b/MexFunc/main.cpp @@ -8,27 +8,40 @@ using namespace std; int main(int argc, const char** argv) { - string matFile = "D:\\x_large.mat"; + //string matFile = "D:\\x_large.mat"; //string matFile = "D:\\x.mat"; + //string matFile = "D:\\Twirls\\wd_small.mat"; + //string matFile = "D:\\Twirls\\wd.mat"; + clock_t begin = clock(), finish; + string wd2Mat = "D:\\wd2_5w.mat"; + string dicrMat = "D:\\dicr.mat"; + string wdMat = "D:\\wd.mat"; - MATFile* pMatFile = nullptr; - mxArray* prhs[1]; - int rowNum, colNum; - double* matData; - pMatFile = matOpen(matFile.c_str(), "r"); //鎵撳紑.mat鏂囦欢 - if (pMatFile == nullptr) { - cout << "filePath is error!" << endl; - return 1; - } - prhs[0] = matGetVariable(pMatFile, "x"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 + //string dicrMat = "D:\\dicr_large.mat"; + //string wdMat = "D:\\wd_large.mat"; + + MATFile* pwdMat, *pwd2Mat, *pdicrMat; + mxArray* prhs[4]; + + pwdMat = matOpen(wdMat.c_str(), "r"); + pwd2Mat = matOpen(wd2Mat.c_str(), "r"); + pdicrMat = matOpen(dicrMat.c_str(), "r"); + + + prhs[0] = matGetVariable(pwdMat, "wd"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 + prhs[1] = matGetVariable(pwd2Mat, "wd2"); + // prhs[1] = mxCreateString("D:\\Twirls\\gat1\\literatures\\temp\\wd2s.txt"); + prhs[2] = matGetVariable(pdicrMat, "dicr"); mxArray* plhs = (mxArray*)malloc(sizeof(mxArray*)); mxArray** arg = &prhs[0]; - mexFunction(1, &plhs, 1, arg); + mexFunction(1, &plhs, 3, arg); //mexFunction(0, 0, 0, 0); + finish = clock(); + cout << "mexFunction Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; return 0; } diff --git a/RandSim/RandSim.vcxproj b/RandSim/RandSim.vcxproj index 02cfec9..6f7896b 100644 --- a/RandSim/RandSim.vcxproj +++ b/RandSim/RandSim.vcxproj @@ -75,16 +75,24 @@ Disabled _DEBUG;%(PreprocessorDefinitions) + D:\matlab2023a\extern\include;$(SolutionDir) Console false + CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies) + $(OutDir) WIN32;%(PreprocessorDefinitions) + D:\matlab2023a\extern\include;$(SolutionDir) + + CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies) + $(OutDir) + @@ -92,12 +100,18 @@ true true NDEBUG;%(PreprocessorDefinitions) + D:\matlab2023a\extern\include;$(SolutionDir) + D:\matlab2023a\extern\include;$(SolutionDir) Console true true false + CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies) + CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies) + $(OutDir) + $(OutDir)