diff --git a/MexFunc/ClusterRandSim.cpp b/MexFunc/ClusterRandSim.cpp new file mode 100644 index 0000000..348c3e9 --- /dev/null +++ b/MexFunc/ClusterRandSim.cpp @@ -0,0 +1,323 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + ->std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for (size_t i = 0;i < threads;++i) + workers.emplace_back( + [this] + { + for (;;) + { + std::function task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) +-> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if (stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task]() { (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) + worker.join(); +} + +/* 读取一维double数据 */ +void Read1DDouble(const mxArray* pMxArray, vector& vDat) { + int rowNum, colNum; + double* matData; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + // cout << rowNum << " " << colNum << endl; + matData = (double*)mxGetData(pMxArray); //获取指针 + vDat.resize(rowNum * colNum); + for (int i = 0; i < vDat.size(); ++i) vDat[i] = matData[i]; +} + +/* 读取二维double数据 */ +void Read2DDouble(const mxArray* pMxArray, vector>& vvDat) { + int rowNum, colNum; + double* matData; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + vvDat.resize(rowNum); + matData = (double*)mxGetData(pMxArray); //获取指针 + for (int i = 0; i < rowNum; ++i) { + vvDat[i].resize(colNum); + for (int j = 0; j < colNum; ++j) { + vvDat[i][j] = matData[j * rowNum + i]; + } + } +} + +// 线程参数 +struct TPRandSim { + vector* pvTr; + vector* pvRandPos; + vector* pvH; + vector>* pvvX; + int numPositive; +}; + +// 多线程入口函数 +void ThreadRandSim(TPRandSim& param) { + vector& vTr = *param.pvTr; + vector& vRandPos = *param.pvRandPos; + vector>& vvX = *param.pvvX; + vector& vH = *param.pvH; + int numPositive = param.numPositive; + int rowNum = vvX.size(); + int colNum = vvX[0].size(); + + clock_t begin = clock(), finish; + /* 随机模拟 */ + std::random_device rd; + std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd())); + + for (int i = 0; i < rowNum; ++i) { + int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引 + if (vH[hRowIdx] == 1) { + for (int j = 0; j < colNum; ++j) { + vTr[j] += vvX[i][j]; + } + } + } + for (auto& val : vTr) val /= numPositive; + finish = clock(); + // cout << "Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + + + +/* 入口函数 */ +/* +三个参数,一个返回值 +输入: +1. x 二维数据,double类型,行数为文献数量,列数为字典长度(每个单词在所有文献中出现的次数超过5) +2. h 长度为文献个数,值为1代表该文献属于该知识颗粒(应该是),为0则不属于 +3. numThread +输出: +vs z score,显著性指数,一维 +ps 与vs长度一致 +*/ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { +//void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + if (nrhs < 2) { + cout << "At least 2 arguments should be given for this function!" << endl; + return; + } + clock_t begin = clock(), mid, finish; + vector vH; + vector> vvX; + + Read2DDouble(prhs[0], vvX); + Read1DDouble(prhs[1], vH); + int rowNum = vvX.size(); + int colNum = vvX[0].size(); + + cout << vH.size() << '\t' << vvX.size() << endl; + + int numThread = 1; + int loopNum = 1000; + + if (nrhs > 2) { + double* pNumThread = (double*)mxGetData(prhs[2]); + numThread = (int)pNumThread[0]; + if (numThread < 1) numThread = 1; + } + if (nrhs > 3) { + double* pLoopNum = (double*)mxGetData(prhs[3]); + loopNum = (int)pLoopNum[0]; + if (loopNum < 1000) loopNum = 1000; + } + + /* 进行随机模拟 */ + mid = clock(); + vector vTs(colNum); // 初始数据,记录vH中label为1的行的行均值 + int numPositive = 0; + for (int i = 0; i < rowNum; ++i) { + if (vH[i] == 1) { + ++numPositive; + for (int j = 0; j < colNum; ++j) { + vTs[j] += vvX[i][j]; + } + } + } + for (auto& val : vTs) val /= numPositive; + + + vector> vvTr(loopNum, vector(colNum, 0)); // 模拟结果 + vector> vvRandPos(numThread, vector(rowNum)); + for (int i = 0; i < rowNum; ++i) { + for (auto& vRandPos : vvRandPos) { + vRandPos[i] = i; + } + } + + ThreadPool thPool(numThread); + int tid = 0; + for (int i = 0; i < loopNum; ++i) { + TPRandSim tParam = { &vvTr[i], &vvRandPos[tid++ % numThread], &vH, &vvX, numPositive }; + thPool.enqueue(ThreadRandSim, tParam); + //ThreadRandSim(tParam); + } + thPool.~ThreadPool(); + finish = clock(); + cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + /* 计算结果 */ + vector vVs(colNum); + vector vPs(colNum); + // 按列计算平均值 + vector vMean(colNum); + vector vStd(colNum); + for (int i = 0; i < vvTr.size(); ++i) { + for (int j = 0; j < vvTr[i].size(); ++j) { + vMean[j] += vvTr[i][j]; + } + } + for (auto& val : vMean) { val /= loopNum; } // 均值 + for (int i = 0; i < vvTr.size(); ++i) { + for (int j = 0; j < vvTr[i].size(); ++j) { + const double diff = vvTr[i][j] - vMean[j]; + vStd[j] += diff * diff; + } + } + for (auto& val : vStd) { val = sqrt(val / (loopNum - 1)); } // 均方根 + // 计算vs + for (int i = 0; i < vVs.size(); ++i) { + vVs[i] = (vTs[i] - vMean[i]) / vStd[i]; + } + + // 计算ps + vector vSumGreater(colNum); + vector vSumLess(colNum); + for (int i = 0; i < loopNum; ++i) { + for (int j = 0; j < colNum; ++j) { + if (vvTr[i][j] >= vTs[j]) vSumGreater[j] ++; + if (vvTr[i][j] <= vTs[j]) vSumLess[j] ++; + } + } + for (auto& val : vSumGreater) val /= loopNum; + for (auto& val : vSumLess) val /= loopNum; + for (int i = 0; i < colNum; ++i) { + vPs[i] = min(vSumGreater[i], vSumLess[i]); + } + + ofstream ofs("d:\\result.txt"); + for (int i = 0; i < colNum; ++i) { + ofs << vVs[i] << '\t' << vPs[i] << endl; + } + ofs.close(); + + /* 写入结果 */ + if (nlhs > 0) { // vs + mxArray* pWriteArray = NULL;//matlab格式矩阵 + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(1, vVs.size(), mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)vVs.data(), sizeof(double) * vVs.size()); + plhs[0] = pWriteArray; // 赋值给返回值 + } + if (nlhs > 1) { // ps + mxArray* pWriteArray = NULL;//matlab格式矩阵 + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(1, vPs.size(), mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)vPs.data(), sizeof(double)* vPs.size()); + plhs[1] = pWriteArray; // 赋值给返回值 + } + finish = clock(); + cout << "Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} \ No newline at end of file diff --git a/MexFunc/GaussFit.cpp b/MexFunc/GaussFit.cpp index 7b3d7e5..caba97d 100644 --- a/MexFunc/GaussFit.cpp +++ b/MexFunc/GaussFit.cpp @@ -1166,6 +1166,11 @@ T SquareAverage(vector& vVal) { /* 计算向量x和y的相关距离, 向量维度必须相等*/ double CorrelationDistance(vector& vX, vector& vY) { + double xMean = Average(vX); + transform(vX.cbegin(), vX.cend(), vX.begin(), [&](double val) { return val - xMean; }); + double yMean = Average(vY); + transform(vY.cbegin(), vY.cend(), vY.begin(), [&](double val) { return val - yMean; }); + vector vXY(vX.size()); for (int i = 0; i < vXY.size(); ++i) { vXY[i] = vX[i] * vY[i]; diff --git a/MexFunc/IsWordInDic.cpp b/MexFunc/IsWordInDic.cpp new file mode 100644 index 0000000..a4683f2 --- /dev/null +++ b/MexFunc/IsWordInDic.cpp @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 + + +// 读取一维cell字符串并转换成大写 +inline bool Read1DWord(const mxArray* pMxArray, vector& vStr) { + mxArray* pCell = nullptr; + int rowNum, colNum; + char* strBuf = new char[STRING_BUF_SIZE]; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + vStr.resize(rowNum * colNum); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + return false; + } + vStr[i * colNum + j] = strBuf; + auto& lastStr = vStr[i * colNum + j]; + transform(lastStr.cbegin(), lastStr.cend(), lastStr.begin(), ::toupper); // 转成大写 + } + } + delete[]strBuf; + return true; +} + +// 读取二维cell字符串并转换成大写 +inline bool Read2DWord(const mxArray* pMxArray, vector>& vvStr) { + mxArray* pCell = nullptr; + int rowNum, colNum; + char* strBuf = new char[STRING_BUF_SIZE]; + + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + int childRowNum = (int)mxGetM(pCell); + int childColNum = (int)mxGetN(pCell); + vvStr.push_back(vector()); + Read1DWord(pCell, vvStr.back()); + } + } + delete[]strBuf; + return true; +} + +// 将结果写入mxArray, 作为后续的返回值 +mxArray* writeToMat(const double *data, int rowNum, int colNum) { + mxArray* pWriteArray = NULL;//matlab格式矩阵 + int len = rowNum * colNum; + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL); + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len); + return pWriteArray; // 赋值给返回值 +} + +/* 入口函数 */ +/* +三个参数,一个返回值 +输入: +1. wd 文献摘要中的单词,二维cell +2. dic 字典,按字母序排序的,一维cell +3. threshold 保留超过阈值的列 +输出: +x 一维int(double)类型,表示在wd的每一行单词中,dic中是否有单词匹配上(匹配后,对应的坐标为设为1,否则为0),所有匹配的个数(统计每一行的匹配个数) + dic每一个单词在wd所有行中出现的次数 +*/ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + if (nrhs < 2) { + cout << "At least 2 arguments should be given for this function!" << endl; + return; + } + clock_t begin = clock(), mid, finish; + vector vDic; + vector> vvWd; + + Read2DWord(prhs[0], vvWd); + Read1DWord(prhs[1], vDic); + int rowNum = vvWd.size(); + + int threshold = 5; + if (nrhs > 2) { + double* pThreshold = (double*)mxGetData(prhs[2]); + threshold = (int)pThreshold[0]; + if (threshold < 5) threshold = 5; + } + finish = clock(); + cout << "Load data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + + vector vXSum(vDic.size()); + + /* 统计dicr字典中,每个单词在wd中出现的次数 */ + mid = clock(); + unordered_map umWordPos; + for (int i = 0; i < vDic.size(); ++i) umWordPos[vDic[i]] = i; // 记录单词位置 + unordered_set usPos; // 多次出现在wd中的单词,只统计一次,这是原matlab代码的功能,是否需要修改? + vector> vusX(rowNum); // 保存每一行中非零元的坐标 + int row = 0; + for (auto& vWd : vvWd) { + auto& usPos = vusX[row++]; + for (auto& word : vWd) { + auto itr = umWordPos.find(word); + if (itr != umWordPos.end()) { + usPos.insert(itr->second); + } + } + for (auto idx : usPos) { + vXSum[idx] += 1; + } + } + finish = clock(); + cout << "Calc word occurrence time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + /* 计算xs */ + mid = clock(); + int colNum = 0; + vector vColIdx; + for (int i = 0; i < vXSum.size(); ++i) { + if (vXSum[i] >= threshold) { + vColIdx.push_back(i); + } + } + colNum = vColIdx.size(); + + vector vXsData(rowNum * colNum); + + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + if (vusX[i].find(vColIdx[j]) != vusX[i].end()) { + vXsData[j * rowNum + i] = 1; + } + } + } + finish = clock(); + cout << "Calc xs time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + // 测试输出 + // cout << rowNum << '\t' << colNum << endl; + // ofstream ofs1("d:\\result_xsum.txt"); + // for (auto& val : vXSum) { + // ofs1 << val << endl; + // } + // ofs1.close(); + // + // ofstream ofs2("d:\\result_xs.txt"); + // for (int i = 0; i < rowNum; ++i) { + // for (int j = 0; j < colNum; ++j) { + // if (vXsData[j * rowNum + i] > 0) { + // ofs2 << j + 1 << '\t'; + // } + // } + // ofs2 << endl; + // } + // ofs2.close(); + + /* 写入结果 */ + mid = clock(); + if (nlhs > 0) { + plhs[0] = writeToMat(vXSum.data(), 1, vXSum.size()); + } + if (nlhs > 1) { // xs + plhs[1] = writeToMat(vXsData.data(), rowNum, colNum); + } + finish = clock(); + cout << "Write result time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + cout << "Calc word occurrence in Dic Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + +// 供c++调试用 +void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + return mexFunction(nlhs, plhs, nrhs, prhs); +} \ No newline at end of file diff --git a/MexFunc/MexFunc.vcxproj b/MexFunc/MexFunc.vcxproj index 6eb53f7..e983a64 100644 --- a/MexFunc/MexFunc.vcxproj +++ b/MexFunc/MexFunc.vcxproj @@ -119,8 +119,8 @@ + - diff --git a/MexFunc/MexFunc.vcxproj.filters b/MexFunc/MexFunc.vcxproj.filters index ef800b8..9b2cc18 100644 --- a/MexFunc/MexFunc.vcxproj.filters +++ b/MexFunc/MexFunc.vcxproj.filters @@ -18,7 +18,7 @@ Source Files - + Source Files diff --git a/MexFunc/main.cpp b/MexFunc/main.cpp index d365162..364161e 100644 --- a/MexFunc/main.cpp +++ b/MexFunc/main.cpp @@ -13,32 +13,45 @@ int main(int argc, const char** argv) //string matFile = "D:\\Twirls\\wd_small.mat"; //string matFile = "D:\\Twirls\\wd.mat"; clock_t begin = clock(), finish; - string wd2Mat = "D:\\wd2_5w.mat"; - string dicrMat = "D:\\dicr.mat"; - string wdMat = "D:\\wd.mat"; + //string wd2Mat = "D:\\wd2_5w.mat"; + //string dicrMat = "D:\\dicr.mat"; + //string wdMat = "D:\\wd.mat"; - //string dicrMat = "D:\\dicr_large.mat"; + + //string dicMat = "D:\\G_dc_large.mat"; //string wdMat = "D:\\wd_large.mat"; - MATFile* pwdMat, *pwd2Mat, *pdicrMat; - mxArray* prhs[4]; + //MATFile* pwdMat, *pwd2Mat, *pdicMat; + //mxArray* prhs[4]; - pwdMat = matOpen(wdMat.c_str(), "r"); - pwd2Mat = matOpen(wd2Mat.c_str(), "r"); - pdicrMat = matOpen(dicrMat.c_str(), "r"); - - - prhs[0] = matGetVariable(pwdMat, "wd"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 - prhs[1] = matGetVariable(pwd2Mat, "wd2"); + //pwdMat = matOpen(wdMat.c_str(), "r"); + // pwd2Mat = matOpen(wd2Mat.c_str(), "r"); + //pdicMat = matOpen(dicMat.c_str(), "r"); // prhs[1] = mxCreateString("D:\\Twirls\\gat1\\literatures\\temp\\wd2s.txt"); - prhs[2] = matGetVariable(pdicrMat, "dicr"); + // prhs[2] = matGetVariable(pdicrMat, "dicr"); - mxArray* plhs = (mxArray*)malloc(sizeof(mxArray*)); + /* IsWordInDic */ + MATFile* pwdMat, * pdicMat; + mxArray* plhs[4]; + const mxArray* prhs[4]; + int nlhs = 2, nrhs = 2; + pwdMat = matOpen("D:\\wd_large.mat", "r"); + pdicMat = matOpen("D:\\G_dc_large.mat", "r"); + prhs[0] = matGetVariable(pwdMat, "wd"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 + prhs[1] = matGetVariable(pdicMat, "dc"); - mxArray** arg = &prhs[0]; + /* */ + //MATFile* pMatX = matOpen("D:\\x_large.mat", "r"); + //MATFile* pMatH = matOpen("D:\\h_large.mat", "r"); + //prhs[0] = matGetVariable(pMatX, "x"); + //prhs[1] = matGetVariable(pMatH, "h"); - mexFunction(1, &plhs, 3, arg); - //mexFunction(0, 0, 0, 0); + + + finish = clock(); + cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + + mexFunctionWrap(nlhs, &plhs[0], nrhs, &prhs[0]); finish = clock(); cout << "mexFunction Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; diff --git a/MexFunc/mex_func.h b/MexFunc/mex_func.h index fc4a7a7..c6e5c9b 100644 --- a/MexFunc/mex_func.h +++ b/MexFunc/mex_func.h @@ -1,3 +1,4 @@ #pragma once #include -void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs); \ No newline at end of file +// void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs); +void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]); \ No newline at end of file