twirls/MexFunc/ClusterRandSim.cpp

262 lines
7.0 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <string>
#include <unordered_set>
#include <ctime>
#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
#include <unordered_map>
#include <set>
#include <fstream>
#include <random>
#include <cmath>
#include <stdlib.h>
#include <limits.h>
#include <atomic>
using std::cout;
using std::endl;
using namespace std;
#define STRING_BUF_SIZE 204800
/* 读取一维double数据 */
void Read1DDouble(const mxArray* pMxArray, vector<double>& vDat) {
int rowNum, colNum;
double* matData;
rowNum = (int)mxGetM(pMxArray);
colNum = (int)mxGetN(pMxArray);
// cout << rowNum << " " << colNum << endl;
matData = (double*)mxGetData(pMxArray); //获取指针
vDat.resize(rowNum * colNum);
for (int i = 0; i < vDat.size(); ++i) vDat[i] = matData[i];
}
/* 读取二维double数据 */
void Read2DDouble(const mxArray* pMxArray, vector<vector<double>>& vvDat) {
int rowNum, colNum;
double* matData;
rowNum = (int)mxGetM(pMxArray);
colNum = (int)mxGetN(pMxArray);
vvDat.resize(rowNum);
matData = (double*)mxGetData(pMxArray); //获取指针
for (int i = 0; i < rowNum; ++i) {
vvDat[i].resize(colNum);
for (int j = 0; j < colNum; ++j) {
vvDat[i][j] = matData[j * rowNum + i];
}
}
}
// 将结果写入mxArray, 作为后续的返回值
mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) {
mxArray* pWriteArray = NULL;//matlab格式矩阵
int len = rowNum * colNum;
//创建一个rowNum*colNum的矩阵
pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL);
//把data的值赋给pWriteArray指针
memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len);
return pWriteArray; // 赋值给返回值
}
#define START_IDX(tid, threadNum, arrLen) (arrLen) * (tid) / (threadNum)
#define END_IDX(tid, threadNum, arrLen) (arrLen) * (tid + 1) / (threadNum)
// 线程参数
struct TPRandSim {
vector<vector<double>>* pvvTr;
vector<int>* pvRandPos;
vector<double>* pvH;
vector<vector<double>>* pvvX;
int numPositive;
int tid;
int numThread;
};
// 多线程入口函数
void ThreadRandSim(TPRandSim param) {
vector < vector<double>>& vvTr = *param.pvvTr;
vector<int>& vRandPos = *param.pvRandPos;
vector<vector<double>>& vvX = *param.pvvX;
vector<double>& vH = *param.pvH;
int numPositive = param.numPositive;
int rowNum = vvX.size();
int colNum = vvX[0].size();
int tid = param.tid;
int numThread = param.numThread;
clock_t begin = clock(), finish;
/* 随机模拟 */
std::random_device rd;
int startIdx = START_IDX(tid, numThread, vvTr.size());
int endIdx = END_IDX(tid, numThread, vvTr.size());
// cout << tid << '\t' << numThread << '\t' << startIdx << '\t' << endIdx << endl;
for (int idx = startIdx; idx < endIdx; ++idx) {
auto& vTr = vvTr[idx];
std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd()));
for (int i = 0; i < rowNum; ++i) {
int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引
if (vH[hRowIdx] == 1) {
for (int j = 0; j < colNum; ++j) {
vTr[j] += vvX[i][j];
}
}
}
for (auto& val : vTr) val /= numPositive;
}
finish = clock();
cout << "Thread Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}
/* 入口函数 */
/*
三个参数,一个返回值
输入:
1. x 二维数据double类型行数为文献数量列数为字典长度每个单词在所有文献中出现的次数超过5
2. h 长度为文献个数值为1代表该文献属于该知识颗粒应该是为0则不属于
3. numThread
4. loopNum
输出:
vs z score,显著性指数,一维
ps 与vs长度一致
*/
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
if (nrhs < 2) {
cout << "At least 2 arguments should be given for this function!" << endl;
return;
}
clock_t begin = clock(), mid, finish;
vector<double> vH;
vector<vector<double>> vvX;
Read2DDouble(prhs[0], vvX);
Read1DDouble(prhs[1], vH);
int rowNum = vvX.size();
int colNum = vvX[0].size();
// cout << vH.size() << '\t' << vvX.size() << endl;
int numThread = 1;
int loopNum = 1000;
if (nrhs > 2) {
double* pNumThread = (double*)mxGetData(prhs[2]);
numThread = (int)pNumThread[0];
if (numThread < 1) numThread = 1;
}
if (nrhs > 3) {
double* pLoopNum = (double*)mxGetData(prhs[3]);
loopNum = (int)pLoopNum[0];
if (loopNum < 1000) loopNum = 1000;
}
finish = clock();
cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
// cout << numThread << '\t' << loopNum << endl;
/* 进行随机模拟 */
mid = clock();
vector<double> vTs(colNum); // 初始数据记录vH中label为1的行的行均值
int numPositive = 0;
for (int i = 0; i < rowNum; ++i) {
if (vH[i] == 1) {
++numPositive;
for (int j = 0; j < colNum; ++j) {
vTs[j] += vvX[i][j];
}
}
}
for (auto& val : vTs) val /= numPositive;
vector<vector<double>> vvTr(loopNum, vector<double>(colNum, 0)); // 模拟结果
vector<vector<int>> vvRandPos(numThread, vector<int>(rowNum));
for (int i = 0; i < rowNum; ++i) {
for (auto& vRandPos : vvRandPos) {
vRandPos[i] = i;
}
}
vector<std::thread> vT;
for (int i = 0; i < numThread; ++i) {
TPRandSim tParam = { &vvTr, &vvRandPos[i], &vH, &vvX, numPositive, i, numThread };
vT.push_back(std::thread(ThreadRandSim, tParam));
}
for (auto& t : vT) t.join();
finish = clock();
cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
/* 计算结果 */
vector<double> vVs(colNum);
vector<double> vPs(colNum);
// 按列计算平均值
vector<double> vMean(colNum);
vector<double> vStd(colNum);
for (int i = 0; i < vvTr.size(); ++i) {
for (int j = 0; j < vvTr[i].size(); ++j) {
vMean[j] += vvTr[i][j];
}
}
for (auto& val : vMean) { val /= loopNum; } // 均值
for (int i = 0; i < vvTr.size(); ++i) {
for (int j = 0; j < vvTr[i].size(); ++j) {
const double diff = vvTr[i][j] - vMean[j];
vStd[j] += diff * diff;
}
}
for (auto& val : vStd) { val = sqrt(val / (loopNum - 1)); } // 均方根
// 计算vs
for (int i = 0; i < vVs.size(); ++i) {
vVs[i] = (vTs[i] - vMean[i]) / vStd[i];
}
// 计算ps
vector<double> vSumGreater(colNum);
vector<double> vSumLess(colNum);
for (int i = 0; i < loopNum; ++i) {
for (int j = 0; j < colNum; ++j) {
if (vvTr[i][j] >= vTs[j]) vSumGreater[j] ++;
if (vvTr[i][j] <= vTs[j]) vSumLess[j] ++;
}
}
for (auto& val : vSumGreater) val /= loopNum;
for (auto& val : vSumLess) val /= loopNum;
for (int i = 0; i < colNum; ++i) {
vPs[i] = min(vSumGreater[i], vSumLess[i]);
}
// ofstream ofs("d:\\result_2.txt");
// for (int i = 0; i < colNum; ++i) {
// ofs << vVs[i] << '\t' << vPs[i] << endl;
// }
// ofs.close();
/* 写入结果 */
if (nlhs > 0) { // vs
plhs[0] = writeToMatDouble(vVs.data(), 1, vVs.size());
}
if (nlhs > 1) { // ps
plhs[1] = writeToMatDouble(vPs.data(), 1, vPs.size());
}
finish = clock();
cout << "Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}
// 供c++调试用
void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
return mexFunction(nlhs, plhs, nrhs, prhs);
}