// twirls/MexFunc/CalcEntropy.cpp

#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <string>
#include <unordered_set>
#include <ctime>
#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
#include <unordered_map>
#include <set>
#include <fstream>
#include <random>
#include <cmath>
#include <stdlib.h>
#include <limits.h>
#include <atomic>
using std::cout;
using std::endl;
using namespace std;

#define STRING_BUF_SIZE 204800

/* Helpers for reading a two-level cell array (cells nested inside a cell), used for the ds strings and the fr numeric values. */
// OUTER_FOR_BEGIN opens a loop over every cell of the cell array `pMxArray`.
// It expects `pMxArray`, `rowNum` and `colNum` to be declared by the enclosing scope and
// overwrites rowNum/colNum with the dimensions of pMxArray.
// Inside the loop it provides:
//   i, j         - row and column of the current outer cell
//   pCell        - the cell at column-major linear index j * rowNum + i
//   childRowNum,
//   childColNum  - dimensions of pCell
// The macro leaves two `for` bodies open; they must be closed with OUTER_FOR_END.
#define OUTER_FOR_BEGIN                                           \
	rowNum = (int)mxGetM(pMxArray);                               \
	colNum = (int)mxGetN(pMxArray);                               \
    for (int i = 0; i < rowNum; ++i) {                            \
        for (int j = 0; j < colNum; ++j) {                        \
            mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i); \
            int childRowNum = (int)mxGetM(pCell);                 \
            int childColNum = (int)mxGetN(pCell);

// OUTER_FOR_END closes the two loops opened by OUTER_FOR_BEGIN.
#define OUTER_FOR_END              \
		}                          \
    }

// INNTER_FOR_BEGIN opens a loop over every element of the inner cell array `pCell`.
// It relies on `pCell`, `childRowNum` and `childColNum`, which OUTER_FOR_BEGIN declares.
// Inside the loop it provides:
//   ii, jj      - row and column of the current inner element
//   pChildCell  - the element at column-major linear index jj * childRowNum + ii
// The macro leaves two `for` bodies open; they must be closed with INNTER_FOR_END.
#define INNTER_FOR_BEGIN						    \
    for (int ii = 0; ii < childRowNum; ii++) {		\
        for (int jj = 0; jj < childColNum; jj++) {	\
            mxArray *pChildCell = mxGetCell(pCell, jj * childRowNum + ii);
// INNTER_FOR_END closes the two loops opened by INNTER_FOR_BEGIN.
#define INNTER_FOR_END                  \
		}                               \
	}
// Convert MATLAB's storage layout into C's storage layout.
// Copies a rowNum x colNum matrix from `src`, read with column-major indexing
// (colJ * rowNum + rowI), into `dst`, written with row-major indexing
// (rowI * colNum + colJ). `dst` must already have room for rowNum * colNum elements.
// The arguments are pasted into the expansion without parentheses and more than once,
// so pass plain variable names rather than expressions.
#define TRANS_ROW_COL(dst, src, rowNum, colNum)						\
	for (int rowI = 0; rowI < rowNum; ++rowI) {                     \
		for (int colJ = 0; colJ < colNum; ++colJ) {                 \
			dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI];	\
		}                                                           \
	}

// Fixed-size pool of worker threads that execute queued tasks in FIFO order.
//
// Tasks are submitted with enqueue(), which returns a std::future for the
// task's result. Destroying the pool lets the workers finish every task that
// is still queued and then joins them.
class ThreadPool {
public:
	// Starts the given number of worker threads.
	ThreadPool(size_t);
	// Queues f(args...) for execution and returns a future for its result.
	// Throws std::runtime_error if the pool has already been told to stop.
	template<class F, class... Args>
	auto enqueue(F&& f, Args&&... args)
		->std::future<typename std::result_of<F(Args...)>::type>;
	// Signals the workers to stop and waits for all of them to exit.
	~ThreadPool();
private:
	// Body executed by each worker thread.
	void workerLoop();

	// worker threads, kept so that the destructor can join them
	std::vector< std::thread > workers;
	// pending tasks, oldest first
	std::queue< std::function<void()> > tasks;

	// queue_mutex guards `tasks` and `stop`; `condition` wakes waiting workers
	std::mutex queue_mutex;
	std::condition_variable condition;
	bool stop;
};

// Each worker repeatedly takes the oldest task and runs it outside the lock.
// A worker exits only once a stop was requested AND the queue is empty, so
// tasks queued before destruction are still executed.
inline void ThreadPool::workerLoop()
{
	while (true)
	{
		std::function<void()> job;

		{
			std::unique_lock<std::mutex> guard(queue_mutex);
			// sleep until there is work to do or the pool is shutting down
			while (!stop && tasks.empty())
				condition.wait(guard);
			// the wait only ends with an empty queue when stop was requested
			if (tasks.empty())
				return;
			job = std::move(tasks.front());
			tasks.pop();
		}

		job();
	}
}

// The constructor only launches the requested number of workers.
inline ThreadPool::ThreadPool(size_t threads)
	: stop(false)
{
	for (size_t launched = 0; launched < threads; ++launched)
		workers.emplace_back(&ThreadPool::workerLoop, this);
}

// Adds a new work item to the pool.
template<class F, class... Args>
auto ThreadPool::enqueue(F && f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type>
{
	typedef typename std::result_of<F(Args...)>::type result_t;
	typedef std::packaged_task<result_t()> packaged_t;

	// The packaged task is held through a shared_ptr so that the copyable
	// std::function stored in the queue can own it.
	std::shared_ptr<packaged_t> job = std::make_shared<packaged_t>(
		std::bind(std::forward<F>(f), std::forward<Args>(args)...));

	std::future<result_t> pending = job->get_future();
	{
		std::lock_guard<std::mutex> guard(queue_mutex);

		// refuse new work once the pool is stopping
		if (stop)
			throw std::runtime_error("enqueue on stopped ThreadPool");

		tasks.emplace([job]() { (*job)(); });
	}
	condition.notify_one();
	return pending;
}

// The destructor raises the stop flag, wakes every worker and joins them all.
inline ThreadPool::~ThreadPool()
{
	{
		std::lock_guard<std::mutex> guard(queue_mutex);
		stop = true;
	}
	condition.notify_all();
	for (size_t idx = 0; idx < workers.size(); ++idx)
		workers[idx].join();
}

// Converts a two-dimensional (row, col) index into the one-dimensional offset
// of a row-major matrix that has colNum columns.
inline int Get1DIndex(int colNum, int row, int col) {
	const int rowStart = colNum * row;
	return rowStart + col;
}

// 读取G结构体中的ds和fr
void GetFrDs(const mxArray* pMxParent, vector<vector<string> >& vvDs, vector<vector<double> >& vvFr) {
	// 读取ds字符串
	int rowNum, colNum;
	char *strBuf = new char[STRING_BUF_SIZE];
	mxArray* pMxArray = mxGetField(pMxParent, 0, "ds"); // ds
	OUTER_FOR_BEGIN
		vvDs.push_back(vector<string>());
		vvDs.back().resize(childRowNum * childColNum);
		INNTER_FOR_BEGIN
			if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) {
				cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
				delete[]strBuf;
				return;
			}
			vvDs.back()[ii * childColNum + jj] = strBuf;
			auto& lastStr = vvDs.back()[ii * childColNum + jj];
			transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // 转成大写
		INNTER_FOR_END
	OUTER_FOR_END

	// 读取fr数值
	pMxArray = mxGetField(pMxParent, 0, "fr"); // fr
	OUTER_FOR_BEGIN
		vvFr.push_back(vector<double>());
		vvFr.back().resize(childRowNum * childColNum);
		double* pVal = (double*)mxGetData(pCell); //获取指针
		TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // 行列存储方式转换
	OUTER_FOR_END
	delete[]strBuf;
}

/* 读取abs */
void GetAbstract(const mxArray* pMxAbs, vector<string>& vAbs) {
	int rowNum = (int)mxGetM(pMxAbs);
	int colNum = (int)mxGetN(pMxAbs);
	char *strBuf = new char[STRING_BUF_SIZE];

	vAbs.resize(rowNum * colNum);
	for (int i = 0; i < rowNum; ++i) {
		for (int j = 0; j < colNum; ++j) {
			mxArray* pCell = mxGetCell(pMxAbs, j * rowNum + i);
			if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) {
				cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
				delete[]strBuf;
				return;
			}
			vAbs[i * colNum + j] = strBuf;
		}
	}
	delete[]strBuf;
}

// Stores an array of strings as a one-dimensional (1 x N) cell array.
// Element k of the returned cell array is a MATLAB string created from vStr[k].
mxArray* writeToMatString1DCell(vector<string>& vStr) {
	const size_t count = vStr.size();
	mxArray* pRow = mxCreateCellMatrix(1, count);
	size_t slot = 0;
	for (const string& text : vStr) {
		mxSetCell(pRow, slot, mxCreateString(text.c_str()));
		++slot;
	}
	return pRow;
}

// Stores a two-level array of strings as a 1 x N cell array whose element k
// is the cell array built by writeToMatString1DCell from vvStr[k].
mxArray* writeToMatString2DCell(vector<vector<string>>& vvStr) {
	mxArray* pOuter = mxCreateCellMatrix(1, vvStr.size());
	size_t slot = 0;
	for (vector<string>& row : vvStr) {
		mxArray* pInner = writeToMatString1DCell(row);
		mxSetCell(pOuter, slot, pInner);
		++slot;
	}
	return pOuter;
}

// Writes a result into a new mxArray that is later used as a return value.
//
// data   - rowNum * colNum values, already in MATLAB's column-major order
// rowNum - number of rows of the created matrix
// colNum - number of columns of the created matrix
//
// Returns a real double matrix of size rowNum x colNum holding a copy of data.
// Nothing is copied when the matrix is empty or when data is NULL, so no
// NULL pointer is ever handed to the copy routine.
mxArray* writeToMatDouble(const double* data, int rowNum, int colNum) {
	// Create the rowNum x colNum matrix in MATLAB format.
	mxArray* pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL);
	// The element count is computed in size_t so that large matrices do not overflow int.
	const size_t len = (rowNum > 0 && colNum > 0)
		? static_cast<size_t>(rowNum) * static_cast<size_t>(colNum)
		: 0;
	double* pDst = mxGetPr(pWriteArray);
	// Copy the values of data into the new matrix.
	if (len > 0 && data != NULL && pDst != NULL) {
		std::copy(data, data + len, pDst);
	}
	return pWriteArray;
}

/* Multi-threaded computation of the information entropy. */

// Work item handed to ThreadCalcEntropy: the data of one group plus the
// destination of its results. All pointers are borrowed, never owned.
struct TPEntropy {
	std::vector<std::string>* pvDs;                             // words of this group
	std::vector<double>* pvFr;                                  // frequency of each word; normalized in place
	std::vector<std::unordered_set<std::string>>* pvusAbsWord;  // word set of every abstract
	double* pHs;                                                // one accumulator per abstract
};

// Computes the entropy of one group against every abstract.
//
// The frequencies are first rescaled in place so that the largest one becomes
// 0.368 (presumably chosen as ~1/e, where -p*log2(p) peaks - TODO confirm).
// Then, for every abstract i, each word j of the group that occurs in the
// abstract's word set contributes  -p_j * log2(p_j)  to param.pHs[i], where
// p_j is the rescaled frequency. Results are accumulated onto the values
// already stored in pHs.
//
// Edge cases:
//   - an empty frequency list, or a largest frequency of 0, contributes nothing
//     (instead of dereferencing end() or dividing by zero);
//   - only the first min(#words, #frequencies) entries are used, so a length
//     mismatch between ds and fr cannot read out of bounds;
//   - a frequency that is not positive contributes 0 rather than NaN.
void ThreadCalcEntropy(TPEntropy& param) {
	std::vector<std::string>& vDs = *param.pvDs; // words of this group
	std::vector<double>& vFr = *param.pvFr;      // frequency
	const std::vector<std::unordered_set<std::string>>& vusAbsWord = *param.pvusAbsWord;
	double* hs = param.pHs;

	if (vFr.empty()) return;

	// Find the highest word frequency.
	const double maxFr = *std::max_element(vFr.begin(), vFr.end());
	if (maxFr == 0.0) return;

	// Normalize the fr values into the range (0, 0.368].
	const double normalMax = 0.368;
	for (double& frVal : vFr) frVal = frVal * normalMax / maxFr;

	// Number of words of this group that have a frequency.
	const std::size_t numDsWord = std::min(vDs.size(), vFr.size());

	// The contribution of a word does not depend on the abstract, so compute it once.
	std::vector<double> vTerm(numDsWord, 0.0);
	for (std::size_t j = 0; j < numDsWord; ++j) {
		const double p = vFr[j];
		vTerm[j] = (p > 0.0) ? p * std::log2(p) : 0.0;
	}

	// For every abstract, sum the contributions of the group's words it contains.
	const std::size_t numAbs = vusAbsWord.size();
	for (std::size_t i = 0; i < numAbs; ++i) {
		const std::unordered_set<std::string>& absWords = vusAbsWord[i];
		for (std::size_t j = 0; j < numDsWord; ++j) {
			if (absWords.find(vDs[j]) != absWords.end()) { // word j occurs in abstract i
				hs[i] -= vTerm[j];
			}
		}
	}
}

/*
输入：
1. abs: 待感知的文献的摘要信息。
2. G: 知识颗粒，包含该程序需要的热词ds以及对应的频率fr。
输出：
1. hs: 信息熵，二维[len(知识颗粒)][len(文献)]
*/
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
	if (nrhs < 2) {
		cout << "At least 2 arguments should be given for this function!" << endl;
		return;
	}
	clock_t begin = clock(), mid, finish;
	vector<string> vAbstract; // 读取abs1, 然后分割成一个一个的单词
	GetAbstract(prhs[0], vAbstract);

	vector<vector<string>> vvDs; // 每个知识颗粒的ds矩阵（词汇矩阵）
	vector<vector<double>> vvFr; // 词汇对应的频率
	GetFrDs(prhs[1], vvDs, vvFr);

	int numThread = 1; // 是否打印信息, 1打印简单信息，2打印详细信息
	if (nrhs > 2) {
		double* pData = (double*)mxGetData(prhs[2]);
		numThread = (int)pData[0];
		if (numThread < 1) numThread = 1;
	}

	int flagPrint = 0; // 是否打印信息, 1打印简单信息，2打印详细信息
	if (nrhs > 3) {
		double* pData = (double*)mxGetData(prhs[3]);
		flagPrint = (int)pData[0];
	}
	finish = clock();
	if (flagPrint == 2) cout << "Load data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
	/* 将摘要信息分割成一个一个的词汇 */
	mid = clock();
	unordered_set<char> usWordChars; // 能组成单词的字符，要不要考虑数字？原版matlab是提取了数字的
	for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
	for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
	for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
	usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
	vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
	vector<unordered_set<string> > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表
	for (int i = 0; i < vAbstract.size(); i++) {
		auto& strAbs = vAbstract[i];
		// 遍历摘要字符串的每一个字符，取出每一个单词
		vector<string>& vWord = vvWordMtx[i];
		if (strAbs.size() == 0) continue; // 摘要信息为空，跳过（一般不会出现这个情况）
		int wordStartPos = 0;
		while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
			wordStartPos++;
		for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
			if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符
				vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
				wordStartPos = curPos + 1; // 找下一个词语起始位置
				while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
					wordStartPos++;
				curPos = wordStartPos; // 循环会自动加1
			}
		}
		// 将处理摘要之后的每个词语放入hash表
		for (auto& word : vWord) {
			string upWord(word);
			transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
			vusAbsWord[i].insert(upWord);
		}
	}
	finish = clock();
	if (flagPrint == 2) cout << "Split abstract time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

	// 存放结果，用一维数组存放二维数据
	mid = clock();
	vector<double> hs;
	// vector<double> hr;
	const int numLiterature = vusAbsWord.size(); // pubmed 文件中包含的文献数量
	const int numGroup = vvDs.size(); // ds包含的组数
	hs.resize(numGroup * numLiterature);
	// hr.resize(numLiterature * numGroup);
	// 并行, 没有计算hr
	ThreadPool thPool(numThread);
	for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) {
		TPEntropy tp = { &vvDs[groupIdx], &vvFr[groupIdx], &vusAbsWord, &hs[groupIdx * numLiterature] };
		thPool.enqueue(ThreadCalcEntropy, tp);
	}
	thPool.~ThreadPool();

//  // 串行
//	for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组
//		vector<string>& vDs = vvDs[groupIdx]; // 这一组ds
//		vector<double>& vFr = vvFr[groupIdx]; // frequency
//		const int numWord = vDs.size(); // 这一组数据中包含的单词数量
//		vector<vector<int> > vX(numLiterature, vector<int>(numWord, 0));
//		// 检查知识颗粒中的词语是否出现在pubmed摘要的词语中
//		for (int i = 0; i < numLiterature; ++i) {
//			for (int j = 0; j < numWord; ++j) {
//				if (vusAbsWord[i].find(vDs[j]) != vusAbsWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
//					vX[i][j] = 1;
//				}
//			}
//		}
//
//		// 找词汇的最高频率
//		double maxFr = *max_element(vFr.begin(), vFr.end());
//		// 将fr的数值规范化到（0，0.368）之间
//		const double normalMax = 0.368;
//		for (auto& frVal : vFr) frVal = frVal * normalMax / maxFr;
//		maxFr = normalMax;
//		// 对每个知识颗粒每一组数据，计算信息熵
//		for (int i = 0; i < numLiterature; ++i) {
//			for (int j = 0; j < numWord; ++j) {
//				if (vX[i][j] == 1) {
//					hs[Get1DIndex(numLiterature, groupIdx, i)] -= vFr[j] * log2(vFr[j]);
//				}
//			}
//		}
//
//		// 找最高频词汇所在的索引位置
//		vector<int> vMaxPos;
//		int idx = 0;
//		for_each(vFr.begin(), vFr.end(), [&idx, maxFr, &vMaxPos](double val) {
//			if (val == maxFr) vMaxPos.push_back(idx);
//			idx++;
//			});
//
//		for (int i = 0; i < numLiterature; ++i) {
//			int cumulateX = 0; // 计算在最高频词汇处，x值的累加结果
//			for (int j = 0; j < vMaxPos.size(); ++j) cumulateX += vX[i][vMaxPos[j]];
//			if (cumulateX == vMaxPos.size()) { // 如果频率最高的词汇都出现在了文献中
//				hr[Get1DIndex(numGroup, i, groupIdx)] = 1; // 应该是表示知识颗粒的这一组数据跟这篇文献相关性比较高
//			}
//		}
//	}
	finish = clock();
	if (flagPrint == 2) cout << "Calc entropy time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

	/* 将结果写入返回值 */
	mid = clock();
	if (nlhs > 0) {
		int datasize = numGroup * numLiterature;
		vector<double> vData(datasize);
		for (int i = 0; i < numGroup; i++) for (int j = 0; j < numLiterature; j++)
			vData[j * numGroup + i] = hs[i * numLiterature + j];
		plhs[0] = writeToMatDouble(vData.data(), numGroup, numLiterature);
	}
	if (nlhs > 1) { // 将ws写入结果
		plhs[1] = writeToMatString2DCell(vvWordMtx);
	}
	finish = clock();
	if (flagPrint == 2) cout << "Write back data time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

	finish = clock();
	if(flagPrint) cout << "CalcEntropy Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}

/* Entry point for debugging: lets a stand-alone main() call mexFunction with the same arguments. */
void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
	return mexFunction(nlhs, plhs, nrhs, prhs);
}