diff --git a/CommonLib/matlab_io.cpp b/CommonLib/matlab_io.cpp
index 77e8513..76593a4 100644
--- a/CommonLib/matlab_io.cpp
+++ b/CommonLib/matlab_io.cpp
@@ -28,7 +28,7 @@ bool ReadChildString2D(const string& filePath, const string& parentName, const s
mxArray* pMxArray = nullptr;
mxArray* pCell = nullptr;
int rowNum, colNum;
- char strBuf[STRING_BUF_SIZE];
+ char *strBuf = new char[STRING_BUF_SIZE];
pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件
if (pMatFile == nullptr) {
@@ -63,6 +63,7 @@ bool ReadChildString2D(const string& filePath, const string& parentName, const s
}
}
mxDestroyArray(pMxArray);
+ delete[]strBuf;
return true;
}
diff --git a/MexFunc/MexFunc.vcxproj b/MexFunc/MexFunc.vcxproj
index 0a22ae6..6eb53f7 100644
--- a/MexFunc/MexFunc.vcxproj
+++ b/MexFunc/MexFunc.vcxproj
@@ -119,8 +119,8 @@
-
+
diff --git a/MexFunc/MexFunc.vcxproj.filters b/MexFunc/MexFunc.vcxproj.filters
index 87c8a4e..ef800b8 100644
--- a/MexFunc/MexFunc.vcxproj.filters
+++ b/MexFunc/MexFunc.vcxproj.filters
@@ -18,7 +18,7 @@
Source Files
-
+
Source Files
diff --git a/MexFunc/RandSim.cpp b/MexFunc/RandSim.cpp
new file mode 100644
index 0000000..7391229
--- /dev/null
+++ b/MexFunc/RandSim.cpp
@@ -0,0 +1,381 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using std::cout;
+using std::endl;
+using namespace std;
+
+#define STRING_BUF_SIZE 204800
+
+class ThreadPool { // Pool of worker threads that run queued tasks in FIFO order.
+public:
+ ThreadPool(size_t); // Launches the given number of worker threads.
+ template // NOTE(review): no template parameter list is present in this copy; F and Args below are presumably its parameters - confirm against the original file.
+ auto enqueue(F&& f, Args&&... args) // Queues a call of f with args; the returned future delivers the outcome of that call.
+ ->std::future::type>;
+ ~ThreadPool(); // Sets stop, wakes every worker and joins all of them.
+private:
+ // Worker threads; kept so the destructor can join them.
+ std::vector< std::thread > workers;
+ // Pending tasks; workers take from the front, enqueue() adds at the back.
+ std::queue< std::function > tasks;
+
+ // Synchronization: queue_mutex is held around every access to tasks and stop seen in this file.
+ std::mutex queue_mutex;
+ std::condition_variable condition; // Notified when a task is queued (one waiter) and when the pool stops (all waiters).
+ bool stop; // False after construction; set to true by the destructor.
+};
+
+// Constructor: starts `threads` worker threads, each running the task loop below.
+inline ThreadPool::ThreadPool(size_t threads)
+ : stop(false)
+{
+ for (size_t i = 0;i < threads;++i)
+ workers.emplace_back(
+ [this] // worker body; captures the pool by pointer
+ {
+ for (;;)
+ {
+ std::function task;
+
+ { // scope of the queue lock
+ std::unique_lock lock(this->queue_mutex);
+ this->condition.wait(lock, // blocks until the predicate on the next line holds
+ [this] { return this->stop || !this->tasks.empty(); });
+ if (this->stop && this->tasks.empty()) // stopping and nothing left to run: the worker exits
+ return;
+ task = std::move(this->tasks.front()); // take the oldest queued task
+ this->tasks.pop();
+ }
+
+ task(); // runs after the lock scope has closed
+ }
+ }
+ );
+}
+
+// Adds a work item to the pool and returns a future for its result.
+template // NOTE(review): template parameter list absent in this copy (presumably declares F and Args) - confirm against the original file.
+auto ThreadPool::enqueue(F&& f, Args&&... args)
+-> std::future::type>
+{
+ using return_type = typename std::result_of::type;
+
+ auto task = std::make_shared< std::packaged_task >( // shared so the lambda queued below can hold a copy of the pointer
+ std::bind(std::forward(f), std::forward(args)...) // binds the arguments now; the call itself happens on a worker
+ );
+
+ std::future res = task->get_future(); // obtained before the task is queued
+ {
+ std::unique_lock lock(queue_mutex);
+
+ // don't allow enqueueing after stopping the pool
+ if (stop)
+ throw std::runtime_error("enqueue on stopped ThreadPool");
+
+ tasks.emplace([task]() { (*task)(); }); // wrapper that invokes the packaged task
+ }
+ condition.notify_one(); // wake one waiting worker, after the lock scope has closed
+ return res;
+}
+
+// Destructor: marks the pool as stopped, wakes all workers and joins every thread.
+inline ThreadPool::~ThreadPool()
+{
+ { // stop is written while holding queue_mutex
+ std::unique_lock lock(queue_mutex);
+ stop = true;
+ }
+ condition.notify_all(); // every waiting worker re-checks its wait predicate
+ for (std::thread& worker : workers)
+ worker.join(); // returns once that worker has left its loop; workers exit only when the queue is empty
+}
+
+// Reads a MATLAB cell array of strings into vStr and converts every string to upper case.
+//   pMxArray  cell array (rowNum x colNum); every cell is expected to hold a char array.
+//   vStr      output; resized to rowNum * colNum and filled row by row, i.e. cell (i, j)
+//             lands at index i * colNum + j.
+// Returns true when every cell was read. On the first cell that is unpopulated, is not a
+// char array, or does not fit into STRING_BUF_SIZE bytes, a message with the 1-based
+// (row, column) position is printed and false is returned; vStr is then only partly filled.
+inline bool Read1DWord(const mxArray* pMxArray, vector<string>& vStr) {
+    const int rowNum = (int)mxGetM(pMxArray);
+    const int colNum = (int)mxGetN(pMxArray);
+    // Scratch buffer owned by a std::string so it is released on every return path
+    // (the early "return false" used to leak a new[]-allocated buffer).
+    string strBuf(STRING_BUF_SIZE, '\0');
+
+    vStr.resize(rowNum * colNum);
+    for (int i = 0; i < rowNum; ++i) {
+        for (int j = 0; j < colNum; ++j) {
+            // MATLAB stores cells column by column, hence the j * rowNum + i index.
+            const mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
+            // mxGetCell yields a null pointer for a cell that was never assigned; treat
+            // it like any other unreadable cell instead of handing it to mxGetString.
+            if (pCell == nullptr || mxGetString(pCell, &strBuf[0], STRING_BUF_SIZE) != 0) {
+                cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
+                return false;
+            }
+            string& word = vStr[i * colNum + j];
+            word = strBuf.c_str();
+            // toupper() must not see a negative char value, so go through unsigned char.
+            transform(word.begin(), word.end(), word.begin(),
+                [](unsigned char c) { return (char)::toupper(c); });
+        }
+    }
+    return true;
+}
+
+// Reads a two-level MATLAB cell array: every cell of pMxArray is itself a cell array of
+// strings, which is read with Read1DWord (upper-casing each string).
+//   pMxArray  outer cell array (rowNum x colNum).
+//   vvStr     output; one word list is appended per outer cell, visiting the cells row by
+//             row. Existing content of vvStr is kept.
+// Returns true when every inner cell array was read completely, false when Read1DWord
+// failed for at least one of them (that word list is then incomplete). All outer cells are
+// visited either way, so the number of appended lists does not depend on failures.
+inline bool Read2DWord(const mxArray* pMxArray, vector<vector<string>>& vvStr) {
+    const int rowNum = (int)mxGetM(pMxArray);
+    const int colNum = (int)mxGetN(pMxArray);
+    bool allRead = true;
+
+    for (int i = 0; i < rowNum; ++i) {
+        for (int j = 0; j < colNum; ++j) {
+            // MATLAB stores cells column by column, hence the j * rowNum + i index.
+            const mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
+            vvStr.push_back(vector<string>());
+            // A never-assigned outer cell comes back as a null pointer: keep its empty word list.
+            if (pCell == nullptr) continue;
+            if (!Read1DWord(pCell, vvStr.back())) allRead = false;
+        }
+    }
+    return allRead;
+}
+
+// Reads words from a text file and converts them to upper case.
+//   filePath  file to read.
+//   vvStr     output; one word list is appended per line of the file (an empty list for a
+//             line without words). Words are the non-empty pieces between single space
+//             characters; no other character acts as a separator.
+// When the file cannot be opened, "FilePath error: <path>" is printed and vvStr is left
+// untouched.
+inline void ReadWordFromFile(const string& filePath, vector<vector<string>>& vvStr) {
+    ifstream ifs(filePath.c_str());  // closed automatically when ifs goes out of scope
+    if (!ifs.is_open()) {
+        cout << "FilePath error: " << filePath << endl;
+        return;
+    }
+    string lineInfo;
+    while (getline(ifs, lineInfo)) {
+        vvStr.push_back(vector<string>());
+        vector<string>& vecStr = vvStr.back();
+        string::size_type pos = 0;
+        while (pos < lineInfo.length()) {
+            string::size_type end = lineInfo.find(' ', pos);
+            if (end == string::npos) end = lineInfo.length();
+            if (end > pos) {  // skip the empty pieces produced by consecutive spaces
+                string word = lineInfo.substr(pos, end - pos);
+                // toupper() must not see a negative char value (e.g. bytes of non-ASCII
+                // text), so go through unsigned char.
+                transform(word.begin(), word.end(), word.begin(),
+                    [](unsigned char c) { return (char)::toupper(c); });
+                vecStr.push_back(word);
+            }
+            pos = end + 1;
+        }
+    }
+}
+
+// Arguments of one simulation round, handed to ThreadRandSim. All pointers are non-owning.
+struct TPRandSim {
+    vector<int>* pvZr;                          // out: per dictionary word, number of drawn abstracts containing it
+    vector<int>* pvRandPos;                     // in/out: indices into *pvvWd2; shuffled in place every round
+    unordered_map<string, int>* pumDicWordPos;  // in: dictionary word -> its index in the dictionary
+    vector<vector<string>>* pvvWd2;             // in: pool of abstracts, each a list of words
+    int wdSize;                                 // in: number of abstracts drawn per round
+};
+
+// Runs one simulation round (written as a thread entry function; mexFunction calls it directly).
+// Shuffles *pvRandPos, takes the first wdSize indices as a random draw of abstracts from the
+// pool, and for each drawn abstract adds 1 to (*pvZr)[k] for every dictionary word k that
+// occurs in it (a word repeated inside one abstract counts once).
+// Precondition: wdSize <= pvRandPos->size() and every index in *pvRandPos is valid for
+// *pvvWd2; mexFunction checks the pool size before calling.
+void ThreadRandSim(TPRandSim& param) {
+    vector<int>& vZr = *param.pvZr;
+    vector<int>& vRandPos = *param.pvRandPos;
+    unordered_map<string, int>& umDicWordPos = *param.pumDicWordPos;
+    vector<vector<string>>& vvWd2 = *param.pvvWd2;
+    const int wdSize = param.wdSize;
+
+    /* random draw */
+    std::random_device rd;
+    std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd()));
+    unordered_set<int> usPos;  // dictionary positions hit by the current abstract
+    for (int i = 0; i < wdSize; ++i) {
+        const auto& vWd2 = vvWd2[vRandPos[i]];
+        usPos.clear();
+        for (const auto& word : vWd2) {
+            auto itr = umDicWordPos.find(word);
+            if (itr != umDicWordPos.end()) {
+                usPos.insert(itr->second);
+            }
+        }
+        for (auto idx : usPos) {
+            vZr[idx] += 1;
+        }
+    }
+}
+
+
+
+/* MEX entry point */
+/*
+Inputs (prhs):
+1. wd        words of the literature abstracts: 2-D cell, each cell a cell array of strings
+2. wd2       string with the path of a text file holding the abstract pool, one abstract per
+             line, words separated by single spaces (the original note speaks of 50k abstracts)
+3. dicr      dictionary: 1-D cell of strings (per the original note: upper-case words in
+             alphabetical order)
+4. numThread optional; values below 1 become 1. Only selects how many permutation buffers
+             are cycled through, because the rounds run sequentially.
+5. numLoop   optional number of simulation rounds; values below 1000 become 1000
+Output (plhs[0], only when nlhs > 0):
+vr           1 x numel(dicr) double row; vr[i] = (zs[i] - mean_i) / std_i, where zs[i] is the
+             number of abstracts in wd containing dictionary word i, and mean_i / std_i are
+             the mean and sample standard deviation of the same count over the rounds.
+             A word whose count never varies has std_i == 0, so its vr is Inf or NaN.
+All words are compared in upper case.
+*/
+void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
+//void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+    // prhs[0], prhs[1] and prhs[2] are all read below, so three arguments are required.
+    if (nrhs < 3) {
+        cout << "At least 3 arguments should be given for this function!" << endl;
+        return;
+    }
+    const clock_t begin = clock();
+    clock_t mid, finish;
+    vector<string> vDicr;
+    vector<vector<string>> vvWd;
+    vector<vector<string>> vvWd2;  // abstract pool
+
+    Read2DWord(prhs[0], vvWd);
+    string wd2FilePath;
+    {
+        // std::string as scratch buffer: released on every path, including the early return.
+        string pathBuf(STRING_BUF_SIZE, '\0');
+        if (mxGetString(prhs[1], &pathBuf[0], STRING_BUF_SIZE) != 0) {
+            cout << "Argument 2 must be a string holding the wd2 file path!" << endl;
+            return;
+        }
+        wd2FilePath = pathBuf.c_str();
+    }
+    ReadWordFromFile(wd2FilePath, vvWd2);
+    Read1DWord(prhs[2], vDicr);
+
+    // Each round draws vvWd.size() distinct abstracts from the pool; a smaller pool (for
+    // example after a failed file read) would make ThreadRandSim index out of bounds.
+    if (vvWd2.size() < vvWd.size()) {
+        cout << "wd2 holds " << vvWd2.size() << " abstracts, fewer than the "
+             << vvWd.size() << " needed per simulation round!" << endl;
+        return;
+    }
+
+    int numThread = 1;
+    int loopNum = 1000;
+
+    // mxGetScalar converts any numeric class to double; an empty argument keeps the default.
+    if (nrhs > 3 && !mxIsEmpty(prhs[3])) {
+        numThread = (int)mxGetScalar(prhs[3]);
+        if (numThread < 1) numThread = 1;
+    }
+    if (nrhs > 4 && !mxIsEmpty(prhs[4])) {
+        loopNum = (int)mxGetScalar(prhs[4]);
+        if (loopNum < 1000) loopNum = 1000;
+    }
+
+    /* Count, for every dictionary word, the abstracts of wd that contain it */
+    mid = clock();
+    unordered_map<string, int> umWordPos;
+    for (int i = 0; i < (int)vDicr.size(); ++i) umWordPos[vDicr[i]] = i;  // word -> dictionary position
+    vector<int> vZs(vDicr.size());
+    // A word occurring several times in one abstract is counted once. This mirrors the
+    // original MATLAB code; open question from the author: should it be changed?
+    unordered_set<int> usPos;
+    for (const auto& vWd : vvWd) {
+        usPos.clear();
+        for (const auto& word : vWd) {
+            auto itr = umWordPos.find(word);
+            if (itr != umWordPos.end()) {
+                usPos.insert(itr->second);
+            }
+        }
+        for (auto idx : usPos) {
+            vZs[idx] += 1;
+        }
+    }
+    finish = clock();
+    cout << "Calc word occurrence time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
+
+    /* Random simulation */
+    mid = clock();
+    vector<vector<int>> vvZr(loopNum, vector<int>(vDicr.size(), 0));  // one count row per round
+    vector<vector<int>> vvRandPos(numThread, vector<int>(vvWd2.size()));
+    for (int i = 0; i < (int)vvWd2.size(); ++i) {
+        for (auto& vRandPos : vvRandPos) {
+            vRandPos[i] = i;
+        }
+    }
+
+    // Rounds run one after another on the calling thread; dispatching them through
+    // ThreadPool (thPool.enqueue(ThreadRandSim, tParam)) is currently disabled.
+    int tid = 0;
+    for (int i = 0; i < loopNum; ++i) {
+        TPRandSim tParam = { &vvZr[i], &vvRandPos[tid++ % numThread], &umWordPos, &vvWd2, (int)vvWd.size() };
+        ThreadRandSim(tParam);
+    }
+    finish = clock();
+    cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
+
+    /* Compute vr */
+    const size_t dictSize = vDicr.size();
+    vector<double> vVr(dictSize);
+    vector<double> vMean(dictSize);
+    vector<double> vStd(dictSize);
+    // column-wise mean over the rounds
+    for (size_t i = 0; i < vvZr.size(); ++i) {
+        for (size_t j = 0; j < vvZr[i].size(); ++j) {
+            vMean[j] += vvZr[i][j];
+        }
+    }
+    for (auto& val : vMean) { val /= loopNum; }
+    // column-wise sample standard deviation (divisor loopNum - 1)
+    for (size_t i = 0; i < vvZr.size(); ++i) {
+        for (size_t j = 0; j < vvZr[i].size(); ++j) {
+            const double diff = vvZr[i][j] - vMean[j];
+            vStd[j] += diff * diff;
+        }
+    }
+    for (auto& val : vStd) { val = sqrt(val / (loopNum - 1)); }
+    for (size_t i = 0; i < vVr.size(); ++i) {
+        vVr[i] = (vZs[i] - vMean[i]) / vStd[i];
+    }
+
+    /* Write the result */
+    if (nlhs > 0) {
+        mxArray* pWriteArray = mxCreateDoubleMatrix(1, vVr.size(), mxREAL);
+        if (!vVr.empty()) {  // nothing to copy for an empty dictionary
+            memcpy((void*)(mxGetPr(pWriteArray)), (void*)vVr.data(), sizeof(double) * vVr.size());
+        }
+        plhs[0] = pWriteArray;
+    }
+    finish = clock();
+    cout << "Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
+}
\ No newline at end of file
diff --git a/MexFunc/SortDedup.cpp b/MexFunc/SortDedup.cpp
new file mode 100644
index 0000000..e7623be
--- /dev/null
+++ b/MexFunc/SortDedup.cpp
@@ -0,0 +1,109 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using std::cout;
+using std::endl;
+using namespace std;
+
+#define STRING_BUF_SIZE 204800
+
+// Collects every string of a two-level MATLAB cell array into a set, in upper case.
+//   pMxArray  outer cell array; every cell is expected to be a cell array of char arrays.
+//   sWord     output; each string is upper-cased and inserted (existing content is kept).
+// Returns true when every string was read. On the first inner cell that is unpopulated, is
+// not a char array, or does not fit into STRING_BUF_SIZE bytes, a message with the 1-based
+// (row, column) position of the enclosing outer cell is printed and false is returned;
+// strings inserted before that point stay in sWord.
+bool ReadInsertWord(const mxArray* pMxArray, unordered_set<string> &sWord) {
+    const int rowNum = (int)mxGetM(pMxArray);
+    const int colNum = (int)mxGetN(pMxArray);
+    // Scratch buffer owned by a std::string so it is released on every return path
+    // (the early "return false" used to leak a new[]-allocated buffer).
+    string strBuf(STRING_BUF_SIZE, '\0');
+
+    for (int i = 0; i < rowNum; ++i) {
+        for (int j = 0; j < colNum; ++j) {
+            // MATLAB stores cells column by column, hence the j * rowNum + i index.
+            const mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
+            // A never-assigned outer cell comes back as a null pointer and holds no words.
+            if (pCell == nullptr) continue;
+            const int childRowNum = (int)mxGetM(pCell);
+            const int childColNum = (int)mxGetN(pCell);
+            for (int ii = 0; ii < childRowNum; ii++) {
+                for (int jj = 0; jj < childColNum; jj++) {
+                    const mxArray* pChildCell = mxGetCell(pCell, jj * childRowNum + ii);
+                    if (pChildCell == nullptr || mxGetString(pChildCell, &strBuf[0], STRING_BUF_SIZE) != 0) {
+                        cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
+                        return false;
+                    }
+                    string str(strBuf.c_str());
+                    // toupper() must not see a negative char value, so go through unsigned char.
+                    transform(str.begin(), str.end(), str.begin(),
+                        [](unsigned char c) { return (char)::toupper(c); });
+                    sWord.insert(str);
+                }
+            }
+        }
+    }
+    return true;
+}
+
+/* MEX entry point */
+// Builds a sorted, duplicate-free word list from prhs[0] (a two-level cell array of strings,
+// read with ReadInsertWord, which upper-cases every string). The words "A" and "Z" are always
+// added. When an output is requested, plhs[0] becomes a 1 x K cell row holding, in sorted
+// order, the words whose first character lies in 'A'..'Z'. The elapsed time is printed.
+void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+    if (nrhs < 1) {
+        cout << "At least 1 arguments should be given for this function!" << endl;
+        return;
+    }
+    const clock_t startTick = clock();
+
+    // Deduplicate through a hash set first ...
+    unordered_set<string> uniqueWords;
+    ReadInsertWord(prhs[0], uniqueWords);
+    uniqueWords.insert("A");
+    uniqueWords.insert("Z");
+
+    // ... then sort by copying into an ordered set.
+    const set<string> sortedWords(uniqueWords.begin(), uniqueWords.end());
+
+    /* Write the result */
+    if (nlhs > 0) {
+        const auto startsWithCapital = [](const string& w) { return w[0] >= 'A' && w[0] <= 'Z'; };
+        const int keptCount = (int)count_if(sortedWords.begin(), sortedWords.end(), startsWithCapital);
+
+        mxArray* pResult = mxCreateCellMatrix(1, keptCount);
+        int slot = 0;
+        for (const auto& w : sortedWords) {
+            if (!startsWithCapital(w)) continue;
+            mxSetCell(pResult, slot++, mxCreateString(w.c_str()));
+        }
+        plhs[0] = pResult;
+    }
+
+    const clock_t endTick = clock();
+    cout << "Deduplicate and Sort word Total time: " << (double)(endTick - startTick) / CLOCKS_PER_SEC << " s" << endl;
+}
\ No newline at end of file
diff --git a/MexFunc/main.cpp b/MexFunc/main.cpp
index a26ec9f..d365162 100644
--- a/MexFunc/main.cpp
+++ b/MexFunc/main.cpp
@@ -8,27 +8,40 @@ using namespace std;
int main(int argc, const char** argv)
{
- string matFile = "D:\\x_large.mat";
+ //string matFile = "D:\\x_large.mat";
//string matFile = "D:\\x.mat";
+ //string matFile = "D:\\Twirls\\wd_small.mat";
+ //string matFile = "D:\\Twirls\\wd.mat";
+ clock_t begin = clock(), finish;
+ string wd2Mat = "D:\\wd2_5w.mat";
+ string dicrMat = "D:\\dicr.mat";
+ string wdMat = "D:\\wd.mat";
- MATFile* pMatFile = nullptr;
- mxArray* prhs[1];
- int rowNum, colNum;
- double* matData;
- pMatFile = matOpen(matFile.c_str(), "r"); //鎵撳紑.mat鏂囦欢
- if (pMatFile == nullptr) {
- cout << "filePath is error!" << endl;
- return 1;
- }
- prhs[0] = matGetVariable(pMatFile, "x"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃
+ //string dicrMat = "D:\\dicr_large.mat";
+ //string wdMat = "D:\\wd_large.mat";
+
+ MATFile* pwdMat, *pwd2Mat, *pdicrMat;
+ mxArray* prhs[4];
+
+ pwdMat = matOpen(wdMat.c_str(), "r");
+ pwd2Mat = matOpen(wd2Mat.c_str(), "r");
+ pdicrMat = matOpen(dicrMat.c_str(), "r");
+
+
+ prhs[0] = matGetVariable(pwdMat, "wd"); // Fetch the variable named "wd" from the .mat file
+ prhs[1] = matGetVariable(pwd2Mat, "wd2");
+ // prhs[1] = mxCreateString("D:\\Twirls\\gat1\\literatures\\temp\\wd2s.txt");
+ prhs[2] = matGetVariable(pdicrMat, "dicr");
mxArray* plhs = (mxArray*)malloc(sizeof(mxArray*));
mxArray** arg = &prhs[0];
- mexFunction(1, &plhs, 1, arg);
+ mexFunction(1, &plhs, 3, arg);
//mexFunction(0, 0, 0, 0);
+ finish = clock();
+ cout << "mexFunction Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
return 0;
}
diff --git a/RandSim/RandSim.vcxproj b/RandSim/RandSim.vcxproj
index 02cfec9..6f7896b 100644
--- a/RandSim/RandSim.vcxproj
+++ b/RandSim/RandSim.vcxproj
@@ -75,16 +75,24 @@
Disabled
_DEBUG;%(PreprocessorDefinitions)
+ D:\matlab2023a\extern\include;$(SolutionDir)
Console
false
+ CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies)
+ $(OutDir)
WIN32;%(PreprocessorDefinitions)
+ D:\matlab2023a\extern\include;$(SolutionDir)
+
+ CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies)
+ $(OutDir)
+
@@ -92,12 +100,18 @@
true
true
NDEBUG;%(PreprocessorDefinitions)
+ D:\matlab2023a\extern\include;$(SolutionDir)
+ D:\matlab2023a\extern\include;$(SolutionDir)
Console
true
true
false
+ CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies)
+ CommonLib.lib;kernel32.lib;user32.lib;%(AdditionalDependencies)
+ $(OutDir)
+ $(OutDir)