完成了计算信息熵和平均信息熵（第三个计算信息熵的函数）

2023-10-12 12:46:17 +08:00 · 2023-10-12 12:46:17 +08:00 · ca3f99cc98
parent 47b0cc187b
commit ca3f99cc98
8 changed files with 852 additions and 147 deletions
--- a/MexFunc/AllClusterRandSim.cpp
+++ b/MexFunc/AllClusterRandSim.cpp
@ -0,0 +1,415 @@
+#include <mex.h>
+#include <mat.h>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <ctime>
+#include <vector>
+#include <queue>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <future>
+#include <functional>
+#include <stdexcept>
+#include <unordered_map>
+#include <set>
+#include <fstream>
+#include <random>
+#include <cmath>
+#include <stdlib.h>
+#include <limits.h>
+#include <atomic>
+
+using std::cout;
+using std::endl;
+using namespace std;
+
+#define STRING_BUF_SIZE 204800
+
+class ThreadPool {
+public:
+	ThreadPool(size_t);
+	template<class F, class... Args>
+	auto enqueue(F&& f, Args&&... args)
+		->std::future<typename std::result_of<F(Args...)>::type>;
+	~ThreadPool();
+private:
+	// need to keep track of threads so we can join them
+	std::vector< std::thread > workers;
+	// the task queue
+	std::queue< std::function<void()> > tasks;
+
+	// synchronization
+	std::mutex queue_mutex;
+	std::condition_variable condition;
+	bool stop;
+};
+
+// the constructor just launches some amount of workers
+inline ThreadPool::ThreadPool(size_t threads)
+	: stop(false)
+{
+	for (size_t i = 0;i < threads;++i)
+		workers.emplace_back(
+			[this]
+			{
+				for (;;)
+				{
+					std::function<void()> task;
+
+					{
+						std::unique_lock<std::mutex> lock(this->queue_mutex);
+						this->condition.wait(lock,
+							[this] { return this->stop || !this->tasks.empty(); });
+						if (this->stop && this->tasks.empty())
+							return;
+						task = std::move(this->tasks.front());
+						this->tasks.pop();
+					}
+
+					task();
+				}
+			}
+			);
+}
+
+// add new work item to the pool
+template<class F, class... Args>
+auto ThreadPool::enqueue(F&& f, Args&&... args)
+-> std::future<typename std::result_of<F(Args...)>::type>
+{
+	using return_type = typename std::result_of<F(Args...)>::type;
+
+	auto task = std::make_shared< std::packaged_task<return_type()> >(
+		std::bind(std::forward<F>(f), std::forward<Args>(args)...)
+		);
+
+	std::future<return_type> res = task->get_future();
+	{
+		std::unique_lock<std::mutex> lock(queue_mutex);
+
+		// don't allow enqueueing after stopping the pool
+		if (stop)
+			throw std::runtime_error("enqueue on stopped ThreadPool");
+
+		tasks.emplace([task]() { (*task)(); });
+	}
+	condition.notify_one();
+	return res;
+}
+
+// the destructor joins all threads
+inline ThreadPool::~ThreadPool()
+{
+	{
+		std::unique_lock<std::mutex> lock(queue_mutex);
+		stop = true;
+	}
+	condition.notify_all();
+	for (std::thread& worker : workers)
+		worker.join();
+}
+
+/* 读取一维double数据 */
+void Read1DDouble(const  mxArray* pMxArray, vector<double>& vDat) {
+	int rowNum, colNum;
+	double* matData;
+
+	rowNum = (int)mxGetM(pMxArray);
+	colNum = (int)mxGetN(pMxArray);
+	// cout << rowNum << " " << colNum << endl;
+	matData = (double*)mxGetData(pMxArray); //获取指针
+	vDat.resize(rowNum * colNum);
+	for (int i = 0; i < vDat.size(); ++i) vDat[i] = matData[i];
+}
+
+/* 读取二维double数据 */
+void Read2DDouble(const  mxArray* pMxArray, vector<vector<double>>& vvDat) {
+	int rowNum, colNum;
+	double* matData;
+
+	rowNum = (int)mxGetM(pMxArray);
+	colNum = (int)mxGetN(pMxArray);
+	vvDat.resize(rowNum);
+	matData = (double*)mxGetData(pMxArray); //获取指针
+	for (int i = 0; i < rowNum; ++i) {
+		vvDat[i].resize(colNum);
+		for (int j = 0; j < colNum; ++j) {
+			vvDat[i][j] = matData[j * rowNum + i];
+		}
+	}
+}
+
+// 将结果写入mxArray, 作为后续的返回值
+mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) {
+	mxArray* pWriteArray = NULL;//matlab格式矩阵
+	int len = rowNum * colNum;
+	//创建一个rowNum*colNum的矩阵  
+	pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL);
+	//把data的值赋给pWriteArray指针
+	memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len);
+	return pWriteArray; // 赋值给返回值
+}
+
+#define START_IDX(tid, threadNum, arrLen) (arrLen) * (tid) / (threadNum)
+#define END_IDX(tid, threadNum, arrLen) (arrLen) * (tid + 1) / (threadNum)
+
+// 线程参数
+struct TPRandSim {
+	vector<vector<double>>* pvvSum;
+	vector<vector<double>>* pvvSqSum;
+	vector<vector<int>>* pvvRandPos;
+	vector<double>* pvIx;
+	vector<int>* pvCateNum;
+	vector<vector<double>>* pvvRound;
+	vector<vector<double>>* pvvX;
+	int loopNum;
+	int maxCategoryNum;
+	int tid;
+	int numThread;
+};
+
+// 多线程入口函数
+void ThreadRandSim(TPRandSim param) {
+	vector<vector<double>>& vvSum = *param.pvvSum;
+	vector<vector<double>>& vvSqSum = *param.pvvSqSum;
+	vector<vector<int>>& vvRandPos = *param.pvvRandPos;
+	vector<double>& vIx = *param.pvIx;
+	vector<int>& vCateNum = *param.pvCateNum;
+	vector<vector<double>>& vvRound = *param.pvvRound;
+	vector<vector<double>>& vvX = *param.pvvX;
+	int tid = param.tid;
+	int numThread = param.numThread;
+	int loopNum = param.loopNum;
+	int maxCategoryNum = param.maxCategoryNum;
+	int rowNum = vvX.size();
+	int colNum = vvX[0].size();
+
+	clock_t begin = clock(), mid = clock(), finish;
+	/* 随机模拟 */
+	int startIdx = START_IDX(tid, numThread, loopNum);
+	int endIdx = END_IDX(tid, numThread, loopNum);
+
+	// if (tid == 0) cout << startIdx << '\t' << endIdx << endl;
+
+	for (int idx = startIdx; idx < endIdx; ++idx) { // 模拟轮次
+		// if (tid == 0 && idx % 100 == 0) {
+		// 	finish = clock();
+		// 	cout << idx << ": time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl << flush;
+		// 	mid = finish;
+		// }
+		auto& vRandPos = vvRandPos[idx];
+		for (int i = 0; i < rowNum; ++i) {
+			const int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引
+			const int cateIdx = vIx[hRowIdx] - 1; // 聚类的编号
+			//if (cateIdx == 0) {
+				auto& vRound = vvRound[cateIdx];
+				for (int j = 0; j < colNum; ++j) {
+					vRound[j] += vvX[i][j];
+				}
+			//}
+		}
+		for (int c = 0; c < maxCategoryNum; ++c) {
+			auto& vRound = vvRound[c];
+			const int numPositive = vCateNum[c];
+			for (int j = 0; j < colNum; ++j) {
+				const double val = vRound[j] / numPositive;
+				vvSum[c][j] += val;
+				vvSqSum[c][j] += val * val;
+			}
+		}
+		for (auto& vRound : vvRound) for (auto &val : vRound) val = 0;
+	}
+	finish = clock();
+	// cout << tid << ": Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush;
+}
+
+/* 多线程进行shuffle操作 */
+struct TPShuffle {
+	vector<vector<int>>* pvvRandPos;
+	int loopNum;
+	int tid;
+	int numThread;
+};
+void ThreadShuffle(TPShuffle param) {
+	clock_t begin = clock(), finish;
+	vector<vector<int>>& vvRandPos = *param.pvvRandPos;
+	int tid = param.tid;
+	int numThread = param.numThread;
+	int loopNum = param.loopNum;
+	std::random_device rd;
+	int startIdx = START_IDX(tid, numThread, loopNum);
+	int endIdx = END_IDX(tid, numThread, loopNum);
+	for (int roundIdx = startIdx; roundIdx < endIdx; ++roundIdx) {
+		vector<int>& vRandPos = vvRandPos[roundIdx];
+		for (int i = 0; i < vRandPos.size(); ++i) vRandPos[i] = i;
+		std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd()));
+	}
+	finish = clock();
+	// cout << tid << ": thread shuffle time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush;
+}
+
+/* 入口函数 */
+/*
+三个参数，一个返回值
+输入：
+1. x 二维数据，double类型，行数为文献数量，列数为字典长度（每个单词在所有文献中出现的次数超过5）
+2. h 长度为文献个数，值为1代表该文献属于该知识颗粒（应该是），为0则不属于
+3. numThread
+4. loopNum
+输出：
+vs z score,显著性指数，一维
+ps 与vs长度一致
+*/
+void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+	if (nrhs < 2) {
+		cout << "At least 2 arguments should be given for this function!" << endl;
+		return;
+	}
+	clock_t begin = clock(), mid, finish;
+	vector<double> vIx; // 类别
+	vector<vector<double>> vvX;
+
+
+	Read2DDouble(prhs[0], vvX);
+	Read1DDouble(prhs[1], vIx);
+	int rowNum = vvX.size();
+	int colNum = vvX[0].size();
+
+	// 查找最大的类别编号
+	int maxCategoryNum = 1;
+	for (auto category : vIx) if (maxCategoryNum < category) maxCategoryNum = category;
+
+	int numThread = 1;
+	int loopNum = 1000;
+
+	if (nrhs > 2) {
+		double* pNumThread = (double*)mxGetData(prhs[2]);
+		numThread = (int)pNumThread[0];
+		if (numThread < 1) numThread = 1;
+	}
+	if (nrhs > 3) {
+		double* pLoopNum = (double*)mxGetData(prhs[3]);
+		loopNum = (int)pLoopNum[0];
+		if (loopNum < 1000) loopNum = 1000;
+	}
+	// 线程中用到的数据
+	vector<int> vCateNum(maxCategoryNum); // 每个类别包含的个数
+	vector<vector<double>> vvTs(maxCategoryNum, vector<double>(colNum)); //记录各个类别的行的行均值
+	vector<vector<vector<double>>> vvvSum(numThread, vector<vector<double>>(maxCategoryNum, vector<double>(colNum)));
+	vector<vector<vector<double>>> vvvSqSum(numThread, vector<vector<double>>(maxCategoryNum, vector<double>(colNum)));
+	vector<vector<vector<double>>> vvvRound(numThread, vector<vector<double>>(maxCategoryNum, vector<double>(colNum)));
+
+	finish = clock();
+	cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush;
+	// cout << numThread << '\t' << loopNum << endl;
+
+	// maxCategoryNum = 1;
+
+	/* 计算实际分类的均值 */
+	mid = clock();
+	for (int i = 0; i < rowNum; ++i) {
+		const int cateIdx = vIx[i] - 1;
+		vCateNum[cateIdx] ++;
+		for (int j = 0; j < colNum; ++j) vvTs[cateIdx][j] += vvX[i][j];
+	}
+	for (int i = 0; i < maxCategoryNum; ++i) {
+		for (int j = 0; j < colNum; ++j) {
+			vvTs[i][j] /= vCateNum[i];
+		}
+	}
+
+	// for (auto c : vCateNum) cout << c << endl;
+
+	// 先把loopNum次随机shuffle做了
+	mid = clock();
+	vector<vector<int>> vvRandPos(loopNum, vector<int>(rowNum));
+	vector<std::thread> vTHShuffle;
+	for (int i = 0; i < numThread; ++i) {
+		TPShuffle param = { &vvRandPos, loopNum, i, numThread };
+		vTHShuffle.push_back(std::thread(ThreadShuffle, param));
+	}
+	for (auto& t : vTHShuffle) t.join();
+	finish = clock();
+	cout << "Shuffle time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush;
+
+	vector<std::thread> vTHRandSim;
+	for (int i = 0; i < numThread; ++i) {
+		TPRandSim tParam = { &vvvSum[i], &vvvSqSum[i], &vvRandPos, &vIx, &vCateNum, &vvvRound[i], & vvX, loopNum, maxCategoryNum, i, numThread};
+		vTHRandSim.push_back(std::thread(ThreadRandSim, tParam));
+	}
+	for (auto& t : vTHRandSim) t.join();
+
+	// 合并所有线程的数据
+	auto& vvSum = vvvSum[0];
+	auto& vvSqSum = vvvSqSum[0];
+	for (int t = 1; t < numThread; ++t) {
+		for (int i = 0; i < maxCategoryNum; ++i) {
+			for (int j = 0; j < colNum; ++j) {
+				vvSum[i][j] += vvvSum[t][i][j];
+				vvSqSum[i][j] += vvvSqSum[t][i][j];
+			}
+		}
+	}
+
+	finish = clock();
+	cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl << flush;
+
+	/* 计算结果 */
+	vector<vector<double>> vvVs(maxCategoryNum, vector<double>(colNum));
+
+	// 按列计算平均值
+	vector<vector<double>> vvMean(maxCategoryNum, vector<double>(colNum));
+	vector<vector<double>> vvStd(maxCategoryNum, vector<double>(colNum));
+
+	for (int c = 0; c < maxCategoryNum; ++c) {
+		auto& vMean = vvMean[c];
+		auto& vStd = vvStd[c];
+		auto& vVs = vvVs[c];
+		auto& vSum = vvSum[c];
+		auto& vSqSum = vvSqSum[c];
+		auto& vTs = vvTs[c];
+
+		for (int i = 0; i < colNum; ++i) vMean[i] = vSum[i] / loopNum;
+		for (int i = 0; i < colNum; ++i) {
+			const double meanVal = vSum[i] / loopNum;
+			vMean[i] = meanVal; // 均值
+			const double sqDiff = vSqSum[i] + loopNum * meanVal * meanVal - 2 * meanVal * vSum[i];
+			vStd[i] = sqrt(sqDiff / (loopNum - 1)); // 均方根
+		}
+
+		// 计算vs
+		for (int i = 0; i < vVs.size(); ++i) {
+			vVs[i] = (vTs[i] - vMean[i]) / vStd[i];
+		}
+	}
+
+	// auto& vVs = vvVs[0];
+	// ofstream ofs("d:\\result_new.txt");
+	// for (int i = 0; i < colNum; ++i) {
+	// 	ofs << vVs[i] << endl;
+	// }
+	// ofs.close();
+
+	/* 写入结果 */
+	if (nlhs > 0) { // vs
+		vector<double> vVsData(maxCategoryNum* colNum);
+		for (int i = 0; i < maxCategoryNum; ++i) {
+			for (int j = 0; j < colNum; ++j) {
+				vVsData[j * maxCategoryNum + i] = vvVs[i][j];
+			}
+		}
+		plhs[0] = writeToMatDouble(vVsData.data(), maxCategoryNum, colNum);
+	}
+
+	finish = clock();
+	cout << "All Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush;
+}
+
+// 供c++调试用
+void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+	return mexFunction(nlhs, plhs, nrhs, prhs);
+}
--- a/MexFunc/AllEntropyMean.cpp
+++ b/MexFunc/AllEntropyMean.cpp
@ -0,0 +1,312 @@
+#include <mex.h>
+#include <mat.h>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <ctime>
+#include <vector>
+#include <queue>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <future>
+#include <functional>
+#include <stdexcept>
+#include <unordered_map>
+#include <set>
+#include <fstream>
+#include <random>
+#include <cmath>
+#include <stdlib.h>
+#include <limits.h>
+#include <atomic>
+using std::cout;
+using std::endl;
+using namespace std;
+
+#define STRING_BUF_SIZE 204800
+// 将matlab存储方式转换成c存储方式
+#define TRANS_ROW_COL(dst, src, rowNum, colNum)						\
+	for (int rowI = 0; rowI < rowNum; ++rowI) {                     \
+		for (int colJ = 0; colJ < colNum; ++colJ) {                 \
+			dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI];	\
+		}                                                           \
+	}
+
+class ThreadPool {
+public:
+	ThreadPool(size_t);
+	template<class F, class... Args>
+	auto enqueue(F&& f, Args&&... args)
+		->std::future<typename std::result_of<F(Args...)>::type>;
+	~ThreadPool();
+private:
+	// need to keep track of threads so we can join them
+	std::vector< std::thread > workers;
+	// the task queue
+	std::queue< std::function<void()> > tasks;
+
+	// synchronization
+	std::mutex queue_mutex;
+	std::condition_variable condition;
+	bool stop;
+};
+
+// the constructor just launches some amount of workers
+inline ThreadPool::ThreadPool(size_t threads)
+	: stop(false)
+{
+	for (size_t i = 0;i < threads;++i)
+		workers.emplace_back(
+			[this]
+			{
+				for (;;)
+				{
+					std::function<void()> task;
+
+					{
+						std::unique_lock<std::mutex> lock(this->queue_mutex);
+						this->condition.wait(lock,
+							[this] { return this->stop || !this->tasks.empty(); });
+						if (this->stop && this->tasks.empty())
+							return;
+						task = std::move(this->tasks.front());
+						this->tasks.pop();
+					}
+
+					task();
+				}
+			}
+			);
+}
+
+// add new work item to the pool
+template<class F, class... Args>
+auto ThreadPool::enqueue(F && f, Args&&... args)
+-> std::future<typename std::result_of<F(Args...)>::type>
+{
+	using return_type = typename std::result_of<F(Args...)>::type;
+
+	auto task = std::make_shared< std::packaged_task<return_type()> >(
+		std::bind(std::forward<F>(f), std::forward<Args>(args)...)
+		);
+
+	std::future<return_type> res = task->get_future();
+	{
+		std::unique_lock<std::mutex> lock(queue_mutex);
+
+		// don't allow enqueueing after stopping the pool
+		if (stop)
+			throw std::runtime_error("enqueue on stopped ThreadPool");
+
+		tasks.emplace([task]() { (*task)(); });
+	}
+	condition.notify_one();
+	return res;
+}
+
+// the destructor joins all threads
+inline ThreadPool::~ThreadPool()
+{
+	{
+		std::unique_lock<std::mutex> lock(queue_mutex);
+		stop = true;
+	}
+	condition.notify_all();
+	for (std::thread& worker : workers)
+		worker.join();
+}
+
+// 读取一维cell字符串并转换成大写
+inline bool ReadWord1DCell(const mxArray* pMxArray, vector<string>& vStr) {
+	mxArray* pCell = nullptr;
+	int rowNum, colNum;
+	char* strBuf = new char[STRING_BUF_SIZE];
+
+	rowNum = (int)mxGetM(pMxArray);
+	colNum = (int)mxGetN(pMxArray);
+	vStr.resize(rowNum * colNum);
+	for (int i = 0; i < rowNum; ++i) {
+		for (int j = 0; j < colNum; ++j) {
+			pCell = mxGetCell(pMxArray, j * rowNum + i);
+			if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) {
+				cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
+				return false;
+			}
+			vStr[i * colNum + j] = strBuf;
+			auto& lastStr = vStr[i * colNum + j];
+			transform(lastStr.cbegin(), lastStr.cend(), lastStr.begin(), ::toupper); // 转成大写
+		}
+	}
+	delete[]strBuf;
+	return true;
+}
+
+// 读取二维cell字符串并转换成大写
+inline bool ReadWord2DCell(const mxArray* pMxArray, vector<vector<string>>& vvStr) {
+	mxArray* pCell = nullptr;
+	int rowNum, colNum;
+
+	rowNum = (int)mxGetM(pMxArray);
+	colNum = (int)mxGetN(pMxArray);
+	for (int i = 0; i < rowNum; ++i) {
+		for (int j = 0; j < colNum; ++j) {
+			pCell = mxGetCell(pMxArray, j * rowNum + i);
+			vvStr.push_back(vector<string>());
+			ReadWord1DCell(pCell, vvStr.back());
+		}
+	}
+	return true;
+}
+
+// 读取由一维cell包裹的double数据，每个cell是一个一维的double数组
+inline void ReadDoulbe1DCell(const mxArray* pMxArray, vector<vector<double> >& vvData) {
+	// 读取fr数值
+	int rowNum = (int)mxGetM(pMxArray);
+	int colNum = (int)mxGetN(pMxArray);
+	for (int i = 0; i < rowNum; ++i) {
+		for (int j = 0; j < colNum; ++j) {
+			mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
+			int childRowNum = (int)mxGetM(pCell);
+			int childColNum = (int)mxGetN(pCell);
+			vvData.push_back(vector<double>());
+			vvData.back().resize(childRowNum * childColNum);
+			double* pVal = (double*)mxGetData(pCell); //获取数据指针
+			TRANS_ROW_COL(vvData.back(), pVal, childRowNum, childColNum); // 行列存储方式转换
+		}
+	}
+}
+
+// 将结果写入mxArray, 作为后续的返回值
+mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) {
+	mxArray* pWriteArray = NULL;//matlab格式矩阵
+	int len = rowNum * colNum;
+	//创建一个rowNum*colNum的矩阵  
+	pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL);
+	//把data的值赋给pWriteArray指针
+	memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len);
+	return pWriteArray; // 赋值给返回值
+}
+
+/* 多线程计算信息熵 */
+struct TPEntropyMean {
+	vector<string>* pvDs;
+	vector<double>* pvFr;
+	vector<unordered_set<string>>* pvusAbsWord;
+	vector<double>* pvHs;
+	vector<double>* pvHd;
+};
+
+void ThreadCalcEntropyMean(TPEntropyMean& param) {
+	vector<string>& vDs = *param.pvDs; // 这一组ds
+	vector<double>& vFr = *param.pvFr; // frequency
+	vector<unordered_set<string>>& vusAbsWord = *param.pvusAbsWord;
+	vector<double>& vHs = *param.pvHs;
+	vector<double>& vHd = *param.pvHd;
+	const int numAbs = vusAbsWord.size();
+	const int numDsWord = vDs.size(); // 这一组数据中包含的单词数量
+	// 检查知识颗粒中的词语是否出现在pubmed摘要的词语中
+	for (int i = 0; i < numAbs; ++i) {
+		for (int j = 0; j < numDsWord; ++j) {
+			if (vusAbsWord[i].find(vDs[j]) != vusAbsWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
+				vHs[i] -= vFr[j] * log2(vFr[j]);
+			}
+		}
+		vHd[i] = vHs[i] / vusAbsWord[i].size();
+	}
+}
+
+/*
+输入：
+1. ds: 知识颗粒中的信息，应该也是摘要，分割成了字符串，大写的。
+2. frr: ds中每个单词对应的频率。
+3. ws: 文献的摘要，被切割成特定的长度了，字符串已经分割好了。
+[4]. numThread
+输出：
+1. hs: 信息熵，二维[len(ds)][len(ws)]
+2. hd: 每个单词的平均信息熵，维度同hs
+*/
+void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+	if (nrhs < 3) {
+		cout << "At least 3 arguments should be given for this function!" << endl;
+		return;
+	}
+	clock_t begin = clock(), mid, finish;
+
+	vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵（词汇矩阵）
+	vector<vector<double> > vvFr; // 词汇对应的频率
+	ReadWord2DCell(prhs[0], vvDs);
+	ReadDoulbe1DCell(prhs[1], vvFr);
+
+	vector<vector<string>> vvWs;
+	ReadWord2DCell(prhs[2], vvWs); // 文献摘要的字符串数组
+
+	int numThread = 1;
+	if (nrhs > 3) {
+		double* pNumThread = (double*)mxGetData(prhs[3]);
+		numThread = (int)pNumThread[0];
+		if (numThread < 1) numThread = 1;
+	}
+
+	vector<unordered_set<string>> vusAbsWord(vvWs.size()); // 将每篇文章摘要的单词放入hash表
+	// 将处理摘要之后的每个词语放入hash表
+	for (int i=0; i<vvWs.size(); ++i) {
+		for (auto& word : vvWs[i]) {
+			vusAbsWord[i].insert(word);
+		}
+	}
+	
+	finish = clock();
+	cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl << flush;
+
+	int numGroup = vvDs.size();
+	int numAbs = vvWs.size(); // 摘要个数
+	// 计算结果
+	vector<vector<double>> vvHs(numGroup, vector<double>(numAbs));
+	vector<vector<double>> vvHd(numGroup, vector<double>(numAbs));
+
+	vector<string>* pvDs;
+	vector<double>* pvFr;
+	vector<unordered_set<string>>* pvusAbsWord;
+	vector<double>* pvHs;
+	vector<double>* pvHd;
+	/* 多线程计算信息熵 */
+	ThreadPool thPool(numThread);
+	for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组
+		TPEntropyMean tp = { &vvDs[groupIdx], &vvFr[groupIdx], &vusAbsWord, &vvHs[groupIdx], &vvHd[groupIdx] };
+		thPool.enqueue(ThreadCalcEntropyMean, tp);
+	}
+	thPool.~ThreadPool();
+
+	// ofstream ofs("d:\\result_hs.txt");
+	// for (int i = 0; i < numGroup; ++i) {
+	// 	for (int j = 0; j < numAbs; ++j) {
+	// 		ofs << vvHs[i][j] << ' ';
+	// 	}
+	// 	ofs << endl;
+	// }
+	// ofs.close();
+
+	/* 将结果写入返回值 */
+	if (nlhs > 0) {
+		vector<double> vData(numGroup * numAbs);
+		for (int i = 0; i < numGroup; ++i) for (int j = 0; j < numAbs; ++j) vData[j * numGroup + i] = vvHs[i][j];
+		plhs[0] = writeToMatDouble(vData.data(), numGroup, numAbs);
+	}
+
+	if (nlhs > 1) {
+		vector<double> vData(numGroup * numAbs);
+		for (int i = 0; i < numGroup; ++i) for (int j = 0; j < numAbs; ++j) vData[j * numGroup + i] = vvHd[i][j];
+		plhs[1] = writeToMatDouble(vData.data(), numGroup, numAbs);
+	}
+
+	finish = clock();
+	cout << "Calc Entropy and Mean Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
+}
+
+/* 供main调试调用 */
+void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+	mexFunction(nlhs, plhs, nrhs, prhs);
+}
--- a/MexFunc/CalcEntropy.cpp
+++ b/MexFunc/CalcEntropy.cpp
@ -99,10 +99,11 @@ void GetAbstract(const mxArray* pMxAbs, vector<string>& vAbs) {
 }

 /*
-nlhs：输出参数数目(Number Left - hand side)，等号左边
-plhs：指向输出参数的指针(Point Left - hand side)，等号左边
-nrhs：输入参数数目(Number Right - hand side)，等号右边
-prhs：指向输入参数的指针(Point Right - hand side)，等号右边。要注意prhs是const的指针数组，即不能改变其指向内容。
+输入：
+1. abs: 待感知的文献的摘要信息。
+2. G: 知识颗粒，包含该程序需要的热词ds以及对应的频率fr。
+输出：
+1. hs: 信息熵，二维[len(知识颗粒)][len(文献)]
 */
 void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
 	//cout << "MexCalcEntropy" << endl;
@ -232,6 +233,6 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
 }

 /* 供main调试调用 */
-void MexCalcEntropy(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
 	mexFunction(nlhs, plhs, nrhs, prhs);
 }
--- a/MexFunc/ClusterRandSim.cpp
+++ b/MexFunc/ClusterRandSim.cpp
@ -17,9 +17,11 @@
 #include <unordered_map>
 #include <set>
 #include <fstream>
-#include <algorithm>
 #include <random>
 #include <cmath>
+#include <stdlib.h>
+#include <limits.h>
+#include <atomic>

 using std::cout;
 using std::endl;
@ -27,89 +29,6 @@ using namespace std;

 #define STRING_BUF_SIZE 204800

-class ThreadPool {
-public:
-	ThreadPool(size_t);
-	template<class F, class... Args>
-	auto enqueue(F&& f, Args&&... args)
-		->std::future<typename std::result_of<F(Args...)>::type>;
-	~ThreadPool();
-private:
-	// need to keep track of threads so we can join them
-	std::vector< std::thread > workers;
-	// the task queue
-	std::queue< std::function<void()> > tasks;
-
-	// synchronization
-	std::mutex queue_mutex;
-	std::condition_variable condition;
-	bool stop;
-};
-
-// the constructor just launches some amount of workers
-inline ThreadPool::ThreadPool(size_t threads)
-	: stop(false)
-{
-	for (size_t i = 0;i < threads;++i)
-		workers.emplace_back(
-			[this]
-			{
-				for (;;)
-				{
-					std::function<void()> task;
-
-					{
-						std::unique_lock<std::mutex> lock(this->queue_mutex);
-						this->condition.wait(lock,
-							[this] { return this->stop || !this->tasks.empty(); });
-						if (this->stop && this->tasks.empty())
-							return;
-						task = std::move(this->tasks.front());
-						this->tasks.pop();
-					}
-
-					task();
-				}
-			}
-			);
-}
-
-// add new work item to the pool
-template<class F, class... Args>
-auto ThreadPool::enqueue(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type>
-{
-	using return_type = typename std::result_of<F(Args...)>::type;
-
-	auto task = std::make_shared< std::packaged_task<return_type()> >(
-		std::bind(std::forward<F>(f), std::forward<Args>(args)...)
-		);
-
-	std::future<return_type> res = task->get_future();
-	{
-		std::unique_lock<std::mutex> lock(queue_mutex);
-
-		// don't allow enqueueing after stopping the pool
-		if (stop)
-			throw std::runtime_error("enqueue on stopped ThreadPool");
-
-		tasks.emplace([task]() { (*task)(); });
-	}
-	condition.notify_one();
-	return res;
-}
-
-// the destructor joins all threads
-inline ThreadPool::~ThreadPool()
-{
-	{
-		std::unique_lock<std::mutex> lock(queue_mutex);
-		stop = true;
-	}
-	condition.notify_all();
-	for (std::thread& worker : workers)
-		worker.join();
-}

 /* 读取一维double数据 */
 void Read1DDouble(const  mxArray* pMxArray, vector<double>& vDat) {
@ -141,45 +60,68 @@ void Read2DDouble(const  mxArray* pMxArray, vector<vector<double>>& vvDat) {
 	}
 }

+// 将结果写入mxArray, 作为后续的返回值
+mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) {
+	mxArray* pWriteArray = NULL;//matlab格式矩阵
+	int len = rowNum * colNum;
+	//创建一个rowNum*colNum的矩阵  
+	pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL);
+	//把data的值赋给pWriteArray指针
+	memcpy((void*)(mxGetPr(pWriteArray)), (void*)data, sizeof(double) * len);
+	return pWriteArray; // 赋值给返回值
+}
+
+#define START_IDX(tid, threadNum, arrLen) (arrLen) * (tid) / (threadNum)
+#define END_IDX(tid, threadNum, arrLen) (arrLen) * (tid + 1) / (threadNum)
+
 // 线程参数
 struct TPRandSim {
-	vector<double>* pvTr;
+	vector<vector<double>>* pvvTr;
 	vector<int>* pvRandPos;
 	vector<double>* pvH;
 	vector<vector<double>>* pvvX;
 	int numPositive;
+	int tid;
+	int numThread;
 };

 // 多线程入口函数
-void ThreadRandSim(TPRandSim& param) {
-	vector<double>& vTr = *param.pvTr;
+void ThreadRandSim(TPRandSim param) {
+	vector < vector<double>>& vvTr = *param.pvvTr;
 	vector<int>& vRandPos = *param.pvRandPos;
 	vector<vector<double>>& vvX = *param.pvvX;
 	vector<double>& vH = *param.pvH;
 	int numPositive = param.numPositive;
 	int rowNum = vvX.size();
 	int colNum = vvX[0].size();
+	int tid = param.tid;
+	int numThread = param.numThread;

 	clock_t begin = clock(), finish;
 	/* 随机模拟 */
 	std::random_device rd;
-	std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd()));
+	int startIdx = START_IDX(tid, numThread, vvTr.size());
+	int endIdx = END_IDX(tid, numThread, vvTr.size());

-	for (int i = 0; i < rowNum; ++i) {
-		int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引
-		if (vH[hRowIdx] == 1) {
-			for (int j = 0; j < colNum; ++j) {
-				vTr[j] += vvX[i][j];
+	// cout << tid << '\t' << numThread << '\t' << startIdx << '\t' << endIdx << endl;
+
+	for (int idx = startIdx; idx < endIdx; ++idx) {
+		auto& vTr = vvTr[idx];
+		std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd()));
+		for (int i = 0; i < rowNum; ++i) {
+			int hRowIdx = vRandPos[i]; // 随机打乱之后的行索引
+			if (vH[hRowIdx] == 1) {
+				for (int j = 0; j < colNum; ++j) {
+					vTr[j] += vvX[i][j];
+				}
 			}
 		}
+		for (auto& val : vTr) val /= numPositive;
 	}
-	for (auto& val : vTr) val /= numPositive;
 	finish = clock();
-	// cout << "Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
+	cout << "Thread Random simulation time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
 }

-
-
 /* 入口函数 */
 /*
 三个参数，一个返回值
@ -187,12 +129,12 @@ void ThreadRandSim(TPRandSim& param) {
 1. x 二维数据，double类型，行数为文献数量，列数为字典长度（每个单词在所有文献中出现的次数超过5）
 2. h 长度为文献个数，值为1代表该文献属于该知识颗粒（应该是），为0则不属于
 3. numThread
+4. loopNum
 输出：
 vs z score,显著性指数，一维
 ps 与vs长度一致
 */
-void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
-//void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
 	if (nrhs < 2) {
 		cout << "At least 2 arguments should be given for this function!" << endl;
 		return;
@ -206,7 +148,7 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
 	int rowNum = vvX.size();
 	int colNum = vvX[0].size();

-	cout << vH.size() << '\t' << vvX.size() << endl;
+	// cout << vH.size() << '\t' << vvX.size() << endl;

 	int numThread = 1;
 	int loopNum = 1000;
@ -222,6 +164,10 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
 		if (loopNum < 1000) loopNum = 1000;
 	}

+	finish = clock();
+	cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
+	// cout << numThread << '\t' << loopNum << endl;
+
 	/* 进行随机模拟 */
 	mid = clock();
 	vector<double> vTs(colNum); // 初始数据，记录vH中label为1的行的行均值
@ -236,7 +182,6 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
 	}
 	for (auto& val : vTs) val /= numPositive;

-
 	vector<vector<double>> vvTr(loopNum, vector<double>(colNum, 0)); // 模拟结果
 	vector<vector<int>> vvRandPos(numThread, vector<int>(rowNum));
 	for (int i = 0; i < rowNum; ++i) {
@ -245,14 +190,13 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
 		}
 	}

-	ThreadPool thPool(numThread);
-	int tid = 0;
-	for (int i = 0; i < loopNum; ++i) {
-		TPRandSim tParam = { &vvTr[i], &vvRandPos[tid++ % numThread], &vH, &vvX, numPositive };
-		thPool.enqueue(ThreadRandSim, tParam);
-		//ThreadRandSim(tParam);
+	vector<std::thread> vT;
+	for (int i = 0; i < numThread; ++i) {
+		TPRandSim tParam = { &vvTr, &vvRandPos[i], &vH, &vvX, numPositive, i, numThread };
+		vT.push_back(std::thread(ThreadRandSim, tParam));
 	}
-	thPool.~ThreadPool();
+	for (auto& t : vT) t.join();
+
 	finish = clock();
 	cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

@ -295,29 +239,24 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
 		vPs[i] = min(vSumGreater[i], vSumLess[i]);
 	}

-	ofstream ofs("d:\\result.txt");
-	for (int i = 0; i < colNum; ++i) {
-		ofs << vVs[i] << '\t' << vPs[i] << endl;
-	}
-	ofs.close();
+	// ofstream ofs("d:\\result_2.txt");
+	// for (int i = 0; i < colNum; ++i) {
+	// 	ofs << vVs[i] << '\t' << vPs[i] << endl;
+	// }
+	// ofs.close();

 	/* 写入结果 */
 	if (nlhs > 0) { // vs
-		mxArray* pWriteArray = NULL;//matlab格式矩阵
-		//创建一个rowNum*colNum的矩阵  
-		pWriteArray = mxCreateDoubleMatrix(1, vVs.size(), mxREAL);
-		//把data的值赋给pWriteArray指针
-		memcpy((void*)(mxGetPr(pWriteArray)), (void*)vVs.data(), sizeof(double) * vVs.size());
-		plhs[0] = pWriteArray; // 赋值给返回值
+		plhs[0] = writeToMatDouble(vVs.data(), 1, vVs.size());
 	}
 	if (nlhs > 1) { // ps
-		mxArray* pWriteArray = NULL;//matlab格式矩阵
-		//创建一个rowNum*colNum的矩阵  
-		pWriteArray = mxCreateDoubleMatrix(1, vPs.size(), mxREAL);
-		//把data的值赋给pWriteArray指针
-		memcpy((void*)(mxGetPr(pWriteArray)), (void*)vPs.data(), sizeof(double)* vPs.size());
-		plhs[1] = pWriteArray; // 赋值给返回值
+		plhs[1] = writeToMatDouble(vPs.data(), 1, vPs.size());
 	}
 	finish = clock();
 	cout << "Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
 }
+
+// 供c++调试用
+void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
+	return mexFunction(nlhs, plhs, nrhs, prhs);
+}
--- a/MexFunc/IsWordInDic.cpp
+++ b/MexFunc/IsWordInDic.cpp
@ -75,7 +75,7 @@ inline bool Read2DWord(const mxArray* pMxArray, vector<vector<string>>& vvStr) {
 }

 // 将结果写入mxArray, 作为后续的返回值
-mxArray* writeToMat(const double *data, int rowNum, int colNum) {
+mxArray* writeToMatDouble(const double *data, int rowNum, int colNum) {
 	mxArray* pWriteArray = NULL;//matlab格式矩阵
 	int len = rowNum * colNum;
 	//创建一个rowNum*colNum的矩阵  
@ -187,10 +187,10 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
 	/* 写入结果 */
 	mid = clock();
 	if (nlhs > 0) {
-		plhs[0] = writeToMat(vXSum.data(), 1, vXSum.size());
+		plhs[0] = writeToMatDouble(vXSum.data(), 1, vXSum.size());
 	}
 	if (nlhs > 1) { // xs
-		plhs[1] = writeToMat(vXsData.data(), rowNum, colNum);
+		plhs[1] = writeToMatDouble(vXsData.data(), rowNum, colNum);
 	}
 	finish = clock();
 	cout << "Write result time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
--- a/MexFunc/MexFunc.vcxproj
+++ b/MexFunc/MexFunc.vcxproj
@ -119,7 +119,7 @@
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
-    <ClCompile Include="IsWordInDic.cpp" />
+    <ClCompile Include="AllEntropyMean.cpp" />
    <ClCompile Include="main.cpp" />
  </ItemGroup>
  <ItemGroup>
--- a/MexFunc/MexFunc.vcxproj.filters
+++ b/MexFunc/MexFunc.vcxproj.filters
@ -18,7 +18,7 @@
    <ClCompile Include="main.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
-    <ClCompile Include="IsWordInDic.cpp">
+    <ClCompile Include="AllEntropyMean.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
--- a/MexFunc/main.cpp
+++ b/MexFunc/main.cpp
@ -31,23 +31,61 @@ int main(int argc, const char** argv)
 	// 	prhs[2] = matGetVariable(pdicrMat, "dicr");

 	/* IsWordInDic  */
-	MATFile* pwdMat, * pdicMat;
+	// MATFile* pwdMat, * pdicMat;
+	// mxArray* plhs[4];
+	// const mxArray* prhs[4];
+	// int nlhs = 2, nrhs = 2;
+	// pwdMat = matOpen("D:\\wd_large.mat", "r");
+	// pdicMat = matOpen("D:\\G_dc_large.mat", "r");
+	// prhs[0] = matGetVariable(pwdMat, "wd"); //获取.mat文件里面名为matrixName的矩阵
+	// prhs[1] = matGetVariable(pdicMat, "dc");
+
+	/* ClusterRandSim */
+	// mxArray* plhs[4];
+	// const mxArray* prhs[4];
+	// int nlhs = 2, nrhs = 4;
+	// MATFile* pMatX = matOpen("D:\\x_large.mat", "r");
+	// MATFile* pMatH = matOpen("D:\\h_large.mat", "r");
+	// prhs[0] = matGetVariable(pMatX, "x");
+	// prhs[1] = matGetVariable(pMatH, "h3");
+	// prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL);
+	// *mxGetPr(prhs[2]) = 1;
+	// prhs[3] = mxCreateDoubleMatrix(1, 1, mxREAL);
+	// *mxGetPr(prhs[3]) = 10000;
+
+
+	/* AllClusterRandSim */
+	// mxArray* plhs[4];
+	// const mxArray* prhs[4];
+	// int nlhs = 2, nrhs = 4;
+	// MATFile* pMatX = matOpen("D:\\x_large.mat", "r");
+	// MATFile* pMatIx = matOpen("D:\\ix_large.mat", "r");
+	// prhs[0] = matGetVariable(pMatX, "x");
+	// prhs[1] = matGetVariable(pMatIx, "ix");
+	// prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL);
+	// *mxGetPr(prhs[2]) = 4;
+	// prhs[3] = mxCreateDoubleMatrix(1, 1, mxREAL);
+	// *mxGetPr(prhs[3]) = 10000;
+
+	/* AllEntropyMean */
 	mxArray* plhs[4];
 	const mxArray* prhs[4];
-	int nlhs = 2, nrhs = 2;
-	pwdMat = matOpen("D:\\wd_large.mat", "r");
-	pdicMat = matOpen("D:\\G_dc_large.mat", "r");
-	prhs[0] = matGetVariable(pwdMat, "wd"); //获取.mat文件里面名为matrixName的矩阵
-	prhs[1] = matGetVariable(pdicMat, "dc");
-
-	/*  */
-	//MATFile* pMatX = matOpen("D:\\x_large.mat", "r");
-	//MATFile* pMatH = matOpen("D:\\h_large.mat", "r");
-	//prhs[0] = matGetVariable(pMatX, "x");
-	//prhs[1] = matGetVariable(pMatH, "h");
+	int nlhs = 2, nrhs = 4;
+	MATFile* pMatG = matOpen("D:\\G_large.mat", "r");
+	MATFile* pMatWs = matOpen("D:\\ws_large.mat", "r");
+	mxArray* pMxG = matGetVariable(pMatG, "G");
+	prhs[0] = mxGetField(pMxG, 0, "ds");
+	prhs[1] = mxGetField(pMxG, 0, "frr");
+	prhs[2] = matGetVariable(pMatWs, "ws");
+	prhs[3] = mxCreateDoubleMatrix(1, 1, mxREAL);
+	*mxGetPr(prhs[3]) = 12;



+
+
+
+	// 调用函数进行测试
 	finish = clock();
 	cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;