From ca3f99cc98bf0d428408d61cffa20f549fd99987 Mon Sep 17 00:00:00 2001 From: zzh Date: Thu, 12 Oct 2023 12:46:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BA=86=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=E7=86=B5=E5=92=8C=E5=B9=B3=E5=9D=87=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E7=86=B5=EF=BC=88=E7=AC=AC=E4=B8=89=E4=B8=AA=E8=AE=A1?= =?UTF-8?q?=E7=AE=97=E4=BF=A1=E6=81=AF=E7=86=B5=E7=9A=84=E5=87=BD=E6=95=B0?= =?UTF-8?q?=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MexFunc/AllClusterRandSim.cpp | 415 ++++++++++++++++++++++++++++++++ MexFunc/AllEntropyMean.cpp | 312 ++++++++++++++++++++++++ MexFunc/CalcEntropy.cpp | 11 +- MexFunc/ClusterRandSim.cpp | 189 +++++---------- MexFunc/IsWordInDic.cpp | 6 +- MexFunc/MexFunc.vcxproj | 2 +- MexFunc/MexFunc.vcxproj.filters | 2 +- MexFunc/main.cpp | 62 ++++- 8 files changed, 852 insertions(+), 147 deletions(-) create mode 100644 MexFunc/AllClusterRandSim.cpp create mode 100644 MexFunc/AllEntropyMean.cpp diff --git a/MexFunc/AllClusterRandSim.cpp b/MexFunc/AllClusterRandSim.cpp new file mode 100644 index 0000000..c9af047 --- /dev/null +++ b/MexFunc/AllClusterRandSim.cpp @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + ->std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for (size_t i = 0;i < threads;++i) + workers.emplace_back( + [this] + { + for (;;) + { + std::function task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) +-> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if (stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task]() { (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) + worker.join(); +} + +/* 读取一维double数据 */ +void Read1DDouble(const mxArray* pMxArray, vector& vDat) { + int rowNum, colNum; + double* matData; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + // cout << rowNum << " " << colNum << endl; + matData = (double*)mxGetData(pMxArray); //获取指针 + vDat.resize(rowNum * colNum); + for (int i = 0; i < vDat.size(); ++i) vDat[i] = matData[i]; +} + +/* 读取二维double数据 */ +void Read2DDouble(const mxArray* pMxArray, vector>& vvDat) { + int rowNum, colNum; + double* matData; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + vvDat.resize(rowNum); + matData = (double*)mxGetData(pMxArray); //获取指针 + for (int i = 0; i < rowNum; ++i) { + vvDat[i].resize(colNum); + for (int j = 0; j < colNum; ++j) { + vvDat[i][j] = matData[j * rowNum + i]; + } + } +} + +// 将结果写入mxArray, 作为后续的返回值 +mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) { + mxArray* pWriteArray = NULL;//matlab格式矩阵 + int len = rowNum * colNum; + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len); + return pWriteArray; // 赋值给返回值 +} + +#define START_IDX(tid, threadNum, arrLen) (arrLen) * (tid) / (threadNum) +#define END_IDX(tid, threadNum, arrLen) (arrLen) * (tid + 1) / (threadNum) + +// 线程参数 +struct TPRandSim { + vector>* pvvSum; + vector>* pvvSqSum; + vector>* pvvRandPos; + vector* pvIx; + vector* pvCateNum; + vector>* pvvRound; + vector>* pvvX; + int loopNum; + int maxCategoryNum; + int tid; + int numThread; +}; + +// 多线程入口函数 +void ThreadRandSim(TPRandSim param) { + vector>& vvSum = *param.pvvSum; + vector>& vvSqSum = *param.pvvSqSum; + vector>& vvRandPos = *param.pvvRandPos; + vector& vIx = *param.pvIx; + vector& vCateNum = *param.pvCateNum; + vector>& vvRound = *param.pvvRound; + vector>& vvX = *param.pvvX; + int tid = param.tid; + int numThread = param.numThread; + int loopNum = param.loopNum; + int maxCategoryNum = param.maxCategoryNum; + int rowNum = vvX.size(); + int colNum = vvX[0].size(); + + clock_t begin = clock(), mid = clock(), finish; + /* 随机模拟 */ + int startIdx = START_IDX(tid, numThread, loopNum); + int endIdx = END_IDX(tid, numThread, loopNum); + + // if (tid == 0) cout << startIdx << '\t' << endIdx << endl; + + for (int idx = startIdx; idx < endIdx; ++idx) { // 模拟轮次 + // if (tid == 0 && idx % 100 == 0) { + // finish = clock(); + // cout << idx << ": time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl << flush; + // mid = finish; + // } + auto& vRandPos = vvRandPos[idx]; + for (int i = 0; i < rowNum; ++i) { + const int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引 + const int cateIdx = vIx[hRowIdx] - 1; // 聚类的编号 + //if (cateIdx == 0) { + auto& vRound = vvRound[cateIdx]; + for (int j = 0; j < colNum; ++j) { + vRound[j] += vvX[i][j]; + } + //} + } + for (int c = 0; c < maxCategoryNum; ++c) { + auto& vRound = vvRound[c]; + const int numPositive = vCateNum[c]; + for (int j = 0; j < colNum; ++j) { + const double val = vRound[j] / numPositive; + vvSum[c][j] += val; + vvSqSum[c][j] += val * val; + } + } + for (auto& vRound : vvRound) for (auto &val : vRound) val = 0; + } + finish = clock(); + // cout << tid << ": Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush; +} + +/* 多线程进行shuffle操作 */ +struct TPShuffle { + vector>* pvvRandPos; + int loopNum; + int tid; + int numThread; +}; +void ThreadShuffle(TPShuffle param) { + clock_t begin = clock(), finish; + vector>& vvRandPos = *param.pvvRandPos; + int tid = param.tid; + int numThread = param.numThread; + int loopNum = param.loopNum; + std::random_device rd; + int startIdx = START_IDX(tid, numThread, loopNum); + int endIdx = END_IDX(tid, numThread, loopNum); + for (int roundIdx = startIdx; roundIdx < endIdx; ++roundIdx) { + vector& vRandPos = vvRandPos[roundIdx]; + for (int i = 0; i < vRandPos.size(); ++i) vRandPos[i] = i; + std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd())); + } + finish = clock(); + // cout << tid << ": thread shuffle time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush; +} + +/* 入口函数 */ +/* +三个参数,一个返回值 +输入: +1. x 二维数据,double类型,行数为文献数量,列数为字典长度(每个单词在所有文献中出现的次数超过5) +2. h 长度为文献个数,值为1代表该文献属于该知识颗粒(应该是),为0则不属于 +3. numThread +4. loopNum +输出: +vs z score,显著性指数,一维 +ps 与vs长度一致 +*/ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + if (nrhs < 2) { + cout << "At least 2 arguments should be given for this function!" << endl; + return; + } + clock_t begin = clock(), mid, finish; + vector vIx; // 类别 + vector> vvX; + + + Read2DDouble(prhs[0], vvX); + Read1DDouble(prhs[1], vIx); + int rowNum = vvX.size(); + int colNum = vvX[0].size(); + + // 查找最大的类别编号 + int maxCategoryNum = 1; + for (auto category : vIx) if (maxCategoryNum < category) maxCategoryNum = category; + + int numThread = 1; + int loopNum = 1000; + + if (nrhs > 2) { + double* pNumThread = (double*)mxGetData(prhs[2]); + numThread = (int)pNumThread[0]; + if (numThread < 1) numThread = 1; + } + if (nrhs > 3) { + double* pLoopNum = (double*)mxGetData(prhs[3]); + loopNum = (int)pLoopNum[0]; + if (loopNum < 1000) loopNum = 1000; + } + // 线程中用到的数据 + vector vCateNum(maxCategoryNum); // 每个类别包含的个数 + vector> vvTs(maxCategoryNum, vector(colNum)); //记录各个类别的行的行均值 + vector>> vvvSum(numThread, vector>(maxCategoryNum, vector(colNum))); + vector>> vvvSqSum(numThread, vector>(maxCategoryNum, vector(colNum))); + vector>> vvvRound(numThread, vector>(maxCategoryNum, vector(colNum))); + + finish = clock(); + cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush; + // cout << numThread << '\t' << loopNum << endl; + + // maxCategoryNum = 1; + + /* 计算实际分类的均值 */ + mid = clock(); + for (int i = 0; i < rowNum; ++i) { + const int cateIdx = vIx[i] - 1; + vCateNum[cateIdx] ++; + for (int j = 0; j < colNum; ++j) vvTs[cateIdx][j] += vvX[i][j]; + } + for (int i = 0; i < maxCategoryNum; ++i) { + for (int j = 0; j < colNum; ++j) { + vvTs[i][j] /= vCateNum[i]; + } + } + + // for (auto c : vCateNum) cout << c << endl; + + // 先把loopNum次随机shuffle做了 + mid = clock(); + vector> vvRandPos(loopNum, vector(rowNum)); + vector vTHShuffle; + for (int i = 0; i < numThread; ++i) { + TPShuffle param = { &vvRandPos, loopNum, i, numThread }; + vTHShuffle.push_back(std::thread(ThreadShuffle, param)); + } + for (auto& t : vTHShuffle) t.join(); + finish = clock(); + cout << "Shuffle time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush; + + vector vTHRandSim; + for (int i = 0; i < numThread; ++i) { + TPRandSim tParam = { &vvvSum[i], &vvvSqSum[i], &vvRandPos, &vIx, &vCateNum, &vvvRound[i], & vvX, loopNum, maxCategoryNum, i, numThread}; + vTHRandSim.push_back(std::thread(ThreadRandSim, tParam)); + } + for (auto& t : vTHRandSim) t.join(); + + // 合并所有线程的数据 + auto& vvSum = vvvSum[0]; + auto& vvSqSum = vvvSqSum[0]; + for (int t = 1; t < numThread; ++t) { + for (int i = 0; i < maxCategoryNum; ++i) { + for (int j = 0; j < colNum; ++j) { + vvSum[i][j] += vvvSum[t][i][j]; + vvSqSum[i][j] += vvvSqSum[t][i][j]; + } + } + } + + finish = clock(); + cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl << flush; + + /* 计算结果 */ + vector> vvVs(maxCategoryNum, vector(colNum)); + + // 按列计算平均值 + vector> vvMean(maxCategoryNum, vector(colNum)); + vector> vvStd(maxCategoryNum, vector(colNum)); + + for (int c = 0; c < maxCategoryNum; ++c) { + auto& vMean = vvMean[c]; + auto& vStd = vvStd[c]; + auto& vVs = vvVs[c]; + auto& vSum = vvSum[c]; + auto& vSqSum = vvSqSum[c]; + auto& vTs = vvTs[c]; + + for (int i = 0; i < colNum; ++i) vMean[i] = vSum[i] / loopNum; + for (int i = 0; i < colNum; ++i) { + const double meanVal = vSum[i] / loopNum; + vMean[i] = meanVal; // 均值 + const double sqDiff = vSqSum[i] + loopNum * meanVal * meanVal - 2 * meanVal * vSum[i]; + vStd[i] = sqrt(sqDiff / (loopNum - 1)); // 均方根 + } + + // 计算vs + for (int i = 0; i < vVs.size(); ++i) { + vVs[i] = (vTs[i] - vMean[i]) / vStd[i]; + } + } + + // auto& vVs = vvVs[0]; + // ofstream ofs("d:\\result_new.txt"); + // for (int i = 0; i < colNum; ++i) { + // ofs << vVs[i] << endl; + // } + // ofs.close(); + + /* 写入结果 */ + if (nlhs > 0) { // vs + vector vVsData(maxCategoryNum* colNum); + for (int i = 0; i < maxCategoryNum; ++i) { + for (int j = 0; j < colNum; ++j) { + vVsData[j * maxCategoryNum + i] = vvVs[i][j]; + } + } + plhs[0] = writeToMatDouble(vVsData.data(), maxCategoryNum, colNum); + } + + finish = clock(); + cout << "All Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush; +} + +// 供c++调试用 +void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + return mexFunction(nlhs, plhs, nrhs, prhs); +} \ No newline at end of file diff --git a/MexFunc/AllEntropyMean.cpp b/MexFunc/AllEntropyMean.cpp new file mode 100644 index 0000000..03d05d3 --- /dev/null +++ b/MexFunc/AllEntropyMean.cpp @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 +// 将matlab存储方式转换成c存储方式 +#define TRANS_ROW_COL(dst, src, rowNum, colNum) \ + for (int rowI = 0; rowI < rowNum; ++rowI) { \ + for (int colJ = 0; colJ < colNum; ++colJ) { \ + dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI]; \ + } \ + } + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + ->std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for (size_t i = 0;i < threads;++i) + workers.emplace_back( + [this] + { + for (;;) + { + std::function task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F && f, Args&&... args) +-> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if (stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task]() { (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) + worker.join(); +} + +// 读取一维cell字符串并转换成大写 +inline bool ReadWord1DCell(const mxArray* pMxArray, vector& vStr) { + mxArray* pCell = nullptr; + int rowNum, colNum; + char* strBuf = new char[STRING_BUF_SIZE]; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + vStr.resize(rowNum * colNum); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + return false; + } + vStr[i * colNum + j] = strBuf; + auto& lastStr = vStr[i * colNum + j]; + transform(lastStr.cbegin(), lastStr.cend(), lastStr.begin(), ::toupper); // 转成大写 + } + } + delete[]strBuf; + return true; +} + +// 读取二维cell字符串并转换成大写 +inline bool ReadWord2DCell(const mxArray* pMxArray, vector>& vvStr) { + mxArray* pCell = nullptr; + int rowNum, colNum; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + vvStr.push_back(vector()); + ReadWord1DCell(pCell, vvStr.back()); + } + } + return true; +} + +// 读取由一维cell包裹的double数据,每个cell是一个一维的double数组 +inline void ReadDoulbe1DCell(const mxArray* pMxArray, vector >& vvData) { + // 读取fr数值 + int rowNum = (int)mxGetM(pMxArray); + int colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i); + int childRowNum = (int)mxGetM(pCell); + int childColNum = (int)mxGetN(pCell); + vvData.push_back(vector()); + vvData.back().resize(childRowNum * childColNum); + double* pVal = (double*)mxGetData(pCell); //获取数据指针 + TRANS_ROW_COL(vvData.back(), pVal, childRowNum, childColNum); // 行列存储方式转换 + } + } +} + +// 将结果写入mxArray, 作为后续的返回值 +mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) { + mxArray* pWriteArray = NULL;//matlab格式矩阵 + int len = rowNum * colNum; + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len); + return pWriteArray; // 赋值给返回值 +} + +/* 多线程计算信息熵 */ +struct TPEntropyMean { + vector* pvDs; + vector* pvFr; + vector>* pvusAbsWord; + vector* pvHs; + vector* pvHd; +}; + +void ThreadCalcEntropyMean(TPEntropyMean& param) { + vector& vDs = *param.pvDs; // 这一组ds + vector& vFr = *param.pvFr; // frequency + vector>& vusAbsWord = *param.pvusAbsWord; + vector& vHs = *param.pvHs; + vector& vHd = *param.pvHd; + const int numAbs = vusAbsWord.size(); + const int numDsWord = vDs.size(); // 这一组数据中包含的单词数量 + // 检查知识颗粒中的词语是否出现在pubmed摘要的词语中 + for (int i = 0; i < numAbs; ++i) { + for (int j = 0; j < numDsWord; ++j) { + if (vusAbsWord[i].find(vDs[j]) != vusAbsWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过 + vHs[i] -= vFr[j] * log2(vFr[j]); + } + } + vHd[i] = vHs[i] / vusAbsWord[i].size(); + } +} + +/* +输入: +1. ds: 知识颗粒中的信息,应该也是摘要,分割成了字符串,大写的。 +2. frr: ds中每个单词对应的频率。 +3. ws: 文献的摘要,被切割成特定的长度了,字符串已经分割好了。 +[4]. numThread +输出: +1. hs: 信息熵,二维[len(ds)][len(ws)] +2. hd: 每个单词的平均信息熵,维度同hs +*/ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + if (nrhs < 3) { + cout << "At least 3 arguments should be given for this function!" << endl; + return; + } + clock_t begin = clock(), mid, finish; + + vector > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵) + vector > vvFr; // 词汇对应的频率 + ReadWord2DCell(prhs[0], vvDs); + ReadDoulbe1DCell(prhs[1], vvFr); + + vector> vvWs; + ReadWord2DCell(prhs[2], vvWs); // 文献摘要的字符串数组 + + int numThread = 1; + if (nrhs > 3) { + double* pNumThread = (double*)mxGetData(prhs[3]); + numThread = (int)pNumThread[0]; + if (numThread < 1) numThread = 1; + } + + vector> vusAbsWord(vvWs.size()); // 将每篇文章摘要的单词放入hash表 + // 将处理摘要之后的每个词语放入hash表 + for (int i=0; i> vvHs(numGroup, vector(numAbs)); + vector> vvHd(numGroup, vector(numAbs)); + + vector* pvDs; + vector* pvFr; + vector>* pvusAbsWord; + vector* pvHs; + vector* pvHd; + /* 多线程计算信息熵 */ + ThreadPool thPool(numThread); + for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组 + TPEntropyMean tp = { &vvDs[groupIdx], &vvFr[groupIdx], &vusAbsWord, &vvHs[groupIdx], &vvHd[groupIdx] }; + thPool.enqueue(ThreadCalcEntropyMean, tp); + } + thPool.~ThreadPool(); + + // ofstream ofs("d:\\result_hs.txt"); + // for (int i = 0; i < numGroup; ++i) { + // for (int j = 0; j < numAbs; ++j) { + // ofs << vvHs[i][j] << ' '; + // } + // ofs << endl; + // } + // ofs.close(); + + /* 将结果写入返回值 */ + if (nlhs > 0) { + vector vData(numGroup * numAbs); + for (int i = 0; i < numGroup; ++i) for (int j = 0; j < numAbs; ++j) vData[j * numGroup + i] = vvHs[i][j]; + plhs[0] = writeToMatDouble(vData.data(), numGroup, numAbs); + } + + if (nlhs > 1) { + vector vData(numGroup * numAbs); + for (int i = 0; i < numGroup; ++i) for (int j = 0; j < numAbs; ++j) vData[j * numGroup + i] = vvHd[i][j]; + plhs[1] = writeToMatDouble(vData.data(), numGroup, numAbs); + } + + finish = clock(); + cout << "Calc Entropy and Mean Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + +/* 供main调试调用 */ +void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + mexFunction(nlhs, plhs, nrhs, prhs); +} \ No newline at end of file diff --git a/MexFunc/CalcEntropy.cpp b/MexFunc/CalcEntropy.cpp index eeeefd5..bd3f778 100644 --- a/MexFunc/CalcEntropy.cpp +++ b/MexFunc/CalcEntropy.cpp @@ -99,10 +99,11 @@ void GetAbstract(const mxArray* pMxAbs, vector& vAbs) { } /* -nlhs:输出参数数目(Number Left - hand side),等号左边 -plhs:指向输出参数的指针(Point Left - hand side),等号左边 -nrhs:输入参数数目(Number Right - hand side),等号右边 -prhs:指向输入参数的指针(Point Right - hand side),等号右边。要注意prhs是const的指针数组,即不能改变其指向内容。 +输入: +1. abs: 待感知的文献的摘要信息。 +2. G: 知识颗粒,包含该程序需要的热词ds以及对应的频率fr。 +输出: +1. hs: 信息熵,二维[len(知识颗粒)][len(文献)] */ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { //cout << "MexCalcEntropy" << endl; @@ -232,6 +233,6 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { } /* 供main调试调用 */ -void MexCalcEntropy(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { +void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { mexFunction(nlhs, plhs, nrhs, prhs); } \ No newline at end of file diff --git a/MexFunc/ClusterRandSim.cpp b/MexFunc/ClusterRandSim.cpp index 348c3e9..801bb57 100644 --- a/MexFunc/ClusterRandSim.cpp +++ b/MexFunc/ClusterRandSim.cpp @@ -17,9 +17,11 @@ #include #include #include -#include #include #include +#include +#include +#include using std::cout; using std::endl; @@ -27,89 +29,6 @@ using namespace std; #define STRING_BUF_SIZE 204800 -class ThreadPool { -public: - ThreadPool(size_t); - template - auto enqueue(F&& f, Args&&... args) - ->std::future::type>; - ~ThreadPool(); -private: - // need to keep track of threads so we can join them - std::vector< std::thread > workers; - // the task queue - std::queue< std::function > tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// the constructor just launches some amount of workers -inline ThreadPool::ThreadPool(size_t threads) - : stop(false) -{ - for (size_t i = 0;i < threads;++i) - workers.emplace_back( - [this] - { - for (;;) - { - std::function task; - - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait(lock, - [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) - return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - - task(); - } - } - ); -} - -// add new work item to the pool -template -auto ThreadPool::enqueue(F&& f, Args&&... args) --> std::future::type> -{ - using return_type = typename std::result_of::type; - - auto task = std::make_shared< std::packaged_task >( - std::bind(std::forward(f), std::forward(args)...) - ); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) - throw std::runtime_error("enqueue on stopped ThreadPool"); - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; -} - -// the destructor joins all threads -inline ThreadPool::~ThreadPool() -{ - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread& worker : workers) - worker.join(); -} /* 读取一维double数据 */ void Read1DDouble(const mxArray* pMxArray, vector& vDat) { @@ -141,45 +60,68 @@ void Read2DDouble(const mxArray* pMxArray, vector>& vvDat) { } } +// 将结果写入mxArray, 作为后续的返回值 +mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) { + mxArray* pWriteArray = NULL;//matlab格式矩阵 + int len = rowNum * colNum; + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len); + return pWriteArray; // 赋值给返回值 +} + +#define START_IDX(tid, threadNum, arrLen) (arrLen) * (tid) / (threadNum) +#define END_IDX(tid, threadNum, arrLen) (arrLen) * (tid + 1) / (threadNum) + // 线程参数 struct TPRandSim { - vector* pvTr; + vector>* pvvTr; vector* pvRandPos; vector* pvH; vector>* pvvX; int numPositive; + int tid; + int numThread; }; // 多线程入口函数 -void ThreadRandSim(TPRandSim& param) { - vector& vTr = *param.pvTr; +void ThreadRandSim(TPRandSim param) { + vector < vector>& vvTr = *param.pvvTr; vector& vRandPos = *param.pvRandPos; vector>& vvX = *param.pvvX; vector& vH = *param.pvH; int numPositive = param.numPositive; int rowNum = vvX.size(); int colNum = vvX[0].size(); + int tid = param.tid; + int numThread = param.numThread; clock_t begin = clock(), finish; /* 随机模拟 */ std::random_device rd; - std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd())); + int startIdx = START_IDX(tid, numThread, vvTr.size()); + int endIdx = END_IDX(tid, numThread, vvTr.size()); - for (int i = 0; i < rowNum; ++i) { - int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引 - if (vH[hRowIdx] == 1) { - for (int j = 0; j < colNum; ++j) { - vTr[j] += vvX[i][j]; + // cout << tid << '\t' << numThread << '\t' << startIdx << '\t' << endIdx << endl; + + for (int idx = startIdx; idx < endIdx; ++idx) { + auto& vTr = vvTr[idx]; + std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd())); + for (int i = 0; i < rowNum; ++i) { + int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引 + if (vH[hRowIdx] == 1) { + for (int j = 0; j < colNum; ++j) { + vTr[j] += vvX[i][j]; + } } } + for (auto& val : vTr) val /= numPositive; } - for (auto& val : vTr) val /= numPositive; finish = clock(); - // cout << "Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + cout << "Thread Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; } - - /* 入口函数 */ /* 三个参数,一个返回值 @@ -187,12 +129,12 @@ void ThreadRandSim(TPRandSim& param) { 1. x 二维数据,double类型,行数为文献数量,列数为字典长度(每个单词在所有文献中出现的次数超过5) 2. h 长度为文献个数,值为1代表该文献属于该知识颗粒(应该是),为0则不属于 3. numThread +4. loopNum 输出: vs z score,显著性指数,一维 ps 与vs长度一致 */ -void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { -//void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { if (nrhs < 2) { cout << "At least 2 arguments should be given for this function!" << endl; return; @@ -206,7 +148,7 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { int rowNum = vvX.size(); int colNum = vvX[0].size(); - cout << vH.size() << '\t' << vvX.size() << endl; + // cout << vH.size() << '\t' << vvX.size() << endl; int numThread = 1; int loopNum = 1000; @@ -222,6 +164,10 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { if (loopNum < 1000) loopNum = 1000; } + finish = clock(); + cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + // cout << numThread << '\t' << loopNum << endl; + /* 进行随机模拟 */ mid = clock(); vector vTs(colNum); // 初始数据,记录vH中label为1的行的行均值 @@ -236,7 +182,6 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { } for (auto& val : vTs) val /= numPositive; - vector> vvTr(loopNum, vector(colNum, 0)); // 模拟结果 vector> vvRandPos(numThread, vector(rowNum)); for (int i = 0; i < rowNum; ++i) { @@ -245,14 +190,13 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { } } - ThreadPool thPool(numThread); - int tid = 0; - for (int i = 0; i < loopNum; ++i) { - TPRandSim tParam = { &vvTr[i], &vvRandPos[tid++ % numThread], &vH, &vvX, numPositive }; - thPool.enqueue(ThreadRandSim, tParam); - //ThreadRandSim(tParam); + vector vT; + for (int i = 0; i < numThread; ++i) { + TPRandSim tParam = { &vvTr, &vvRandPos[i], &vH, &vvX, numPositive, i, numThread }; + vT.push_back(std::thread(ThreadRandSim, tParam)); } - thPool.~ThreadPool(); + for (auto& t : vT) t.join(); + finish = clock(); cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; @@ -295,29 +239,24 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { vPs[i] = min(vSumGreater[i], vSumLess[i]); } - ofstream ofs("d:\\result.txt"); - for (int i = 0; i < colNum; ++i) { - ofs << vVs[i] << '\t' << vPs[i] << endl; - } - ofs.close(); + // ofstream ofs("d:\\result_2.txt"); + // for (int i = 0; i < colNum; ++i) { + // ofs << vVs[i] << '\t' << vPs[i] << endl; + // } + // ofs.close(); /* 写入结果 */ if (nlhs > 0) { // vs - mxArray* pWriteArray = NULL;//matlab格式矩阵 - //创建一个rowNum*colNum的矩阵 - pWriteArray = mxCreateDoubleMatrix(1, vVs.size(), mxREAL); - //把data的值赋给pWriteArray指针 - memcpy((void*)(mxGetPr(pWriteArray)), (void*)vVs.data(), sizeof(double) * vVs.size()); - plhs[0] = pWriteArray; // 赋值给返回值 + plhs[0] = writeToMatDouble(vVs.data(), 1, vVs.size()); } if (nlhs > 1) { // ps - mxArray* pWriteArray = NULL;//matlab格式矩阵 - //创建一个rowNum*colNum的矩阵 - pWriteArray = mxCreateDoubleMatrix(1, vPs.size(), mxREAL); - //把data的值赋给pWriteArray指针 - memcpy((void*)(mxGetPr(pWriteArray)), (void*)vPs.data(), sizeof(double)* vPs.size()); - plhs[1] = pWriteArray; // 赋值给返回值 + plhs[1] = writeToMatDouble(vPs.data(), 1, vPs.size()); } finish = clock(); cout << "Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + +// 供c++调试用 +void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + return mexFunction(nlhs, plhs, nrhs, prhs); } \ No newline at end of file diff --git a/MexFunc/IsWordInDic.cpp b/MexFunc/IsWordInDic.cpp index a4683f2..385e671 100644 --- a/MexFunc/IsWordInDic.cpp +++ b/MexFunc/IsWordInDic.cpp @@ -75,7 +75,7 @@ inline bool Read2DWord(const mxArray* pMxArray, vector>& vvStr) { } // 将结果写入mxArray, 作为后续的返回值 -mxArray* writeToMat(const double *data, int rowNum, int colNum) { +mxArray* writeToMatDouble(const double *data, int rowNum, int colNum) { mxArray* pWriteArray = NULL;//matlab格式矩阵 int len = rowNum * colNum; //创建一个rowNum*colNum的矩阵 @@ -187,10 +187,10 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { /* 写入结果 */ mid = clock(); if (nlhs > 0) { - plhs[0] = writeToMat(vXSum.data(), 1, vXSum.size()); + plhs[0] = writeToMatDouble(vXSum.data(), 1, vXSum.size()); } if (nlhs > 1) { // xs - plhs[1] = writeToMat(vXsData.data(), rowNum, colNum); + plhs[1] = writeToMatDouble(vXsData.data(), rowNum, colNum); } finish = clock(); cout << "Write result time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; diff --git a/MexFunc/MexFunc.vcxproj b/MexFunc/MexFunc.vcxproj index e983a64..5c7ad9f 100644 --- a/MexFunc/MexFunc.vcxproj +++ b/MexFunc/MexFunc.vcxproj @@ -119,7 +119,7 @@ - + diff --git a/MexFunc/MexFunc.vcxproj.filters b/MexFunc/MexFunc.vcxproj.filters index 9b2cc18..da4d1f8 100644 --- a/MexFunc/MexFunc.vcxproj.filters +++ b/MexFunc/MexFunc.vcxproj.filters @@ -18,7 +18,7 @@ Source Files - + Source Files diff --git a/MexFunc/main.cpp b/MexFunc/main.cpp index 364161e..0096f28 100644 --- a/MexFunc/main.cpp +++ b/MexFunc/main.cpp @@ -31,23 +31,61 @@ int main(int argc, const char** argv) // prhs[2] = matGetVariable(pdicrMat, "dicr"); /* IsWordInDic */ - MATFile* pwdMat, * pdicMat; + // MATFile* pwdMat, * pdicMat; + // mxArray* plhs[4]; + // const mxArray* prhs[4]; + // int nlhs = 2, nrhs = 2; + // pwdMat = matOpen("D:\\wd_large.mat", "r"); + // pdicMat = matOpen("D:\\G_dc_large.mat", "r"); + // prhs[0] = matGetVariable(pwdMat, "wd"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 + // prhs[1] = matGetVariable(pdicMat, "dc"); + + /* ClusterRandSim */ + // mxArray* plhs[4]; + // const mxArray* prhs[4]; + // int nlhs = 2, nrhs = 4; + // MATFile* pMatX = matOpen("D:\\x_large.mat", "r"); + // MATFile* pMatH = matOpen("D:\\h_large.mat", "r"); + // prhs[0] = matGetVariable(pMatX, "x"); + // prhs[1] = matGetVariable(pMatH, "h3"); + // prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL); + // *mxGetPr(prhs[2]) = 1; + // prhs[3] = mxCreateDoubleMatrix(1, 1, mxREAL); + // *mxGetPr(prhs[3]) = 10000; + + + /* AllClusterRandSim */ + // mxArray* plhs[4]; + // const mxArray* prhs[4]; + // int nlhs = 2, nrhs = 4; + // MATFile* pMatX = matOpen("D:\\x_large.mat", "r"); + // MATFile* pMatIx = matOpen("D:\\ix_large.mat", "r"); + // prhs[0] = matGetVariable(pMatX, "x"); + // prhs[1] = matGetVariable(pMatIx, "ix"); + // prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL); + // *mxGetPr(prhs[2]) = 4; + // prhs[3] = mxCreateDoubleMatrix(1, 1, mxREAL); + // *mxGetPr(prhs[3]) = 10000; + + /* AllEntropyMean */ mxArray* plhs[4]; const mxArray* prhs[4]; - int nlhs = 2, nrhs = 2; - pwdMat = matOpen("D:\\wd_large.mat", "r"); - pdicMat = matOpen("D:\\G_dc_large.mat", "r"); - prhs[0] = matGetVariable(pwdMat, "wd"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 - prhs[1] = matGetVariable(pdicMat, "dc"); - - /* */ - //MATFile* pMatX = matOpen("D:\\x_large.mat", "r"); - //MATFile* pMatH = matOpen("D:\\h_large.mat", "r"); - //prhs[0] = matGetVariable(pMatX, "x"); - //prhs[1] = matGetVariable(pMatH, "h"); + int nlhs = 2, nrhs = 4; + MATFile* pMatG = matOpen("D:\\G_large.mat", "r"); + MATFile* pMatWs = matOpen("D:\\ws_large.mat", "r"); + mxArray* pMxG = matGetVariable(pMatG, "G"); + prhs[0] = mxGetField(pMxG, 0, "ds"); + prhs[1] = mxGetField(pMxG, 0, "frr"); + prhs[2] = matGetVariable(pMatWs, "ws"); + prhs[3] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(prhs[3]) = 12; + + + + // 璋冪敤鍑芥暟杩涜娴嬭瘯 finish = clock(); cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;