twirls/CppRun/calc_entropy.cpp

331 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*********************************************************************************************
Description: 检查每个文献的摘要部分是否包含设定的高频词汇,并用这些高频词汇计算摘要的信息熵
Copyright : All right reserved by ZheYuan.BJ
Author : Zhang Zhonghai
Date : 2023/09/20
***********************************************************************************************/
#include <iostream>
#include <fstream>
#include <sstream>
#include <filesystem>
#include <vector>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <algorithm>
#include <omp.h>
#include <time.h>
#include <vector>
#include <queue>
#include <thread>
#include <cctype>
#include <cmath>
#ifdef _WIN32
#include <io.h>
#include <process.h>
#define F_OK 0
#else
#include <unistd.h>
#endif
#include <mat.h>
#include "common.h"
#include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h"
using namespace std;
using std::cout;
using std::vector;
namespace fs = std::filesystem;
#include "common.h"
#include "CommonLib/matlab_io.h"
using namespace std;
// Loop scaffolding for visiting every knowledge-particle file.
// FOREACH_PARTICLE_START opens two nested directory loops (each entry of `parrentDir`, then each
// entry inside that child) plus an `if` that only admits files whose name ends with `wordMatSuffix`;
// FOREACH_PARTICLE_END closes those three scopes. Statements written between the two macros run once
// per matching file.
// The expansion site must have `parrentDir` and `wordMatSuffix` in scope; the enclosed statements can
// use `childDir`, `file` and `fileName`.
// NOTE(review): fs::directory_iterator(childDir) is applied to every entry of the parent directory, so
// a plain file sitting next to the particle directories presumably makes it throw -- confirm.
#define FOREACH_PARTICLE_START \
for (auto &childDir : fs::directory_iterator(parrentDir)) { \
for (auto &file : fs::directory_iterator(childDir)) { \
const string &fileName = file.path().filename().string(); \
auto rPos = fileName.rfind(wordMatSuffix); \
if (rPos != string::npos && fileName.size() - rPos == wordMatSuffix.size()) {
#define FOREACH_PARTICLE_END \
} \
} \
}
/*
 * Loop scaffolding for reading data wrapped in two levels of cell arrays (used for the ds strings
 * and the fr numeric values).
 *
 * OUTER_FOR_BEGIN / OUTER_FOR_END walk every element of the cell array `pMxArray`.
 *   - The expansion site must declare `pMxArray`, `pCell`, `rowNum` and `colNum`.
 *   - Elements are visited row by row (i outer, j inner) and fetched with the column-major linear
 *     index j * rowNum + i.
 *   - Inside the loop `pCell` is the current element, `childRowNum` / `childColNum` are its dimensions.
 *   - OUTER_FOR_END closes both loops and then calls mxDestroyArray(pMxArray).
 *     NOTE(review): in this file pMxArray comes from mxGetField; destroying a field of a struct that
 *     is still alive looks unsafe -- confirm against the MATLAB C Matrix API documentation.
 */
#define OUTER_FOR_BEGIN \
rowNum = (int)mxGetM(pMxArray); \
colNum = (int)mxGetN(pMxArray); \
for (int i = 0; i < rowNum; ++i) { \
for (int j = 0; j < colNum; ++j) { \
pCell = mxGetCell(pMxArray, j * rowNum + i); \
int childRowNum = (int)mxGetM(pCell); \
int childColNum = (int)mxGetN(pCell);
#define OUTER_FOR_END \
} \
} \
mxDestroyArray(pMxArray);
// INNTER_FOR_BEGIN / INNTER_FOR_END walk every element of the inner cell array `pCell`; they are meant
// to be used between OUTER_FOR_BEGIN and OUTER_FOR_END because they rely on `childRowNum` and
// `childColNum`. Inside the loop `pChildCell` is the current inner element (fetched with the
// column-major index jj * childRowNum + ii) and `ii` / `jj` are its row / column.
#define INNTER_FOR_BEGIN \
for (int ii = 0; ii < childRowNum; ii++) { \
for (int jj = 0; jj < childColNum; jj++) { \
mxArray *pChildCell = mxGetCell(pCell, jj * childRowNum + ii);
#define INNTER_FOR_END \
} \
}
// Converts MATLAB storage order into C storage order: copies a rowNum x colNum matrix from `src`,
// read column-major (src[col * rowNum + row]), into `dst`, written row-major (dst[row * colNum + col]).
// `dst` and `src` only need to support operator[]; `dst` must already hold rowNum * colNum elements.
// The macro arguments are pasted without parentheses, so pass plain identifiers or simple expressions.
#define TRANS_ROW_COL(dst, src, rowNum, colNum) \
for (int rowI = 0; rowI < rowNum; ++rowI) { \
for (int colJ = 0; colJ < colNum; ++colJ) { \
dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI]; \
} \
}
// 读取ds和fr信息
bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vector<vector<double> >&vvFr) {
MATFile* pMatFile = nullptr;
mxArray* pMxArray = nullptr;
mxArray* pCell = nullptr;
int rowNum, colNum;
char *strBuf = new char[STRING_BUF_SIZE];
const string& parrentName = "G";
const string& firstChildName = "ds";
const string& secondChildName = "fr";
pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件
if (pMatFile == nullptr) {
cout << "filePath is error!" << endl;
return false;
}
mxArray* pMxG = matGetVariable(pMatFile, parrentName.c_str()); //获取G变量
// 读取ds字符串
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
OUTER_FOR_BEGIN
vvDs.push_back(vector<string>());
vvDs.back().resize(childRowNum * childColNum);
INNTER_FOR_BEGIN
if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) {
cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
delete[]strBuf;
return false;
}
vvDs.back()[ii * childColNum + jj] = strBuf;
auto& lastStr = vvDs.back()[ii * childColNum + jj];
transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // 转成大写
INNTER_FOR_END
OUTER_FOR_END
// 读取fr数值
pMxArray = mxGetField(pMxG, 0, secondChildName.c_str()); // fr
OUTER_FOR_BEGIN
vvFr.push_back(vector<double>());
vvFr.back().resize(childRowNum * childColNum);
double *pVal = (double*)mxGetData(pCell); //获取指针
TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // 行列存储方式转换
OUTER_FOR_END
// 没考虑完全哪些数据需要mxDestroyArray可能会有内存泄漏
delete[]strBuf;
return true;
}
// Flattens a (row, col) position of a row-major matrix that has colNum columns into the offset of
// that element inside the backing one-dimensional array.
inline int Get1DIndex(int colNum, int row, int col) {
    const int rowStart = colNum * row; // offset of the first element of the requested row
    return rowStart + col;
}
/* Processing of a single knowledge particle */
// Arguments handed to ThreadProcessEntropy for one knowledge particle.
// Field order matters: instances are created with brace initialization.
struct ThreadParamEntropy {
    // .mat file from which the ds / fr data of the particle is read
    std::filesystem::path matFilePath;
    // .mat file that receives the hs / hr results
    std::filesystem::path outFilePath;
    // one upper-cased word set per literature abstract; not owned by this struct
    std::vector<std::unordered_set<std::string>>* pvusWord;
};
// Computes, for one knowledge particle, the entropy matrix hs and the relevance matrix hr against
// every literature abstract, and writes both into the particle's output .mat file.
//
// param.matFilePath - .mat file holding struct G with the word groups (ds) and frequencies (fr).
// param.outFilePath - .mat file to create; receives the variables "hs" and "hr".
// param.pvusWord    - per-literature sets of upper-cased abstract words (read only).
//
// For every group g and literature i:
//   hs(g, i) = sum over the group's words found in abstract i of -p * log2(p), where p is the word's
//              frequency rescaled so that the group's largest frequency becomes 0.368
//              (0.368 is close to 1/e, where -p * log2(p) peaks -- presumably the reason for the value);
//   hr(i, g) = 1 when every word carrying the group's highest frequency occurs in abstract i, else 0.
// hs is stored row-major as numGroup x numLiterature, hr as numLiterature x numGroup.
//
// On failure (input unreadable, inconsistent ds / fr, output file not creatable) a message is printed
// and nothing is written. Groups that are empty or whose ds / fr sizes differ keep all-zero results.
void ThreadProcessEntropy(ThreadParamEntropy& param) {
    const fs::path& matFilePath = param.matFilePath;
    const fs::path& outFilePath = param.outFilePath;
    if (param.pvusWord == nullptr) {
        cout << "No abstract words supplied for " << matFilePath.string() << endl;
        return;
    }
    const vector<unordered_set<string> >& vusWord = *param.pvusWord;

    vector<vector<string> > vvDs; // word matrix of every group of this knowledge particle
    vector<vector<double> > vvFr; // frequencies matching the words
    // Read ds and fr from struct G; a partial read would leave vvDs and vvFr out of step.
    if (!ReadInfoFromMat(matFilePath.string(), vvDs, vvFr)) {
        cout << "Failed to read ds/fr from " << matFilePath.string() << endl;
        return;
    }
    if (vvDs.size() != vvFr.size()) {
        cout << "ds and fr group counts differ in " << matFilePath.string() << endl;
        return;
    }

    const int numLiterature = static_cast<int>(vusWord.size()); // literature count of the pubmed file
    const int numGroup = static_cast<int>(vvDs.size());         // number of groups in ds
    // Results: two-dimensional data kept in flat, zero-initialized arrays.
    vector<double> hs(static_cast<size_t>(numGroup) * numLiterature, 0.0);
    vector<double> hr(static_cast<size_t>(numLiterature) * numGroup, 0.0);
    const double normalMax = 0.368; // frequencies are rescaled into [0, normalMax]

    for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) {
        const vector<string>& vDs = vvDs[groupIdx]; // words of this group
        const vector<double>& vFr = vvFr[groupIdx]; // their frequencies
        const size_t numWord = vDs.size();
        if (numWord == 0) continue; // nothing to score
        if (vFr.size() != numWord) {
            cout << "ds/fr size mismatch in group " << groupIdx + 1 << " of " << matFilePath.string() << endl;
            continue;
        }

        const double rawMax = *max_element(vFr.begin(), vFr.end());

        // Per-word data that does not depend on the literature:
        //  - isMaxWord: whether the word carries the highest frequency. Decided on the raw values,
        //    because value * normalMax / rawMax is not guaranteed to round to exactly normalMax.
        //  - term: p * log2(p) for the rescaled frequency p; non-positive frequencies contribute 0
        //    (the usual 0 * log 0 = 0 convention) instead of NaN.
        vector<char> isMaxWord(numWord, 0);
        vector<double> term(numWord, 0.0);
        for (size_t j = 0; j < numWord; ++j) {
            isMaxWord[j] = (vFr[j] == rawMax) ? 1 : 0;
            if (rawMax > 0.0 && vFr[j] > 0.0) {
                const double p = vFr[j] * normalMax / rawMax;
                term[j] = p * log2(p);
            }
        }

        for (int i = 0; i < numLiterature; ++i) {
            const unordered_set<string>& words = vusWord[i];
            double entropy = 0.0;
            bool allMaxPresent = true; // do all top-frequency words occur in this abstract?
            for (size_t j = 0; j < numWord; ++j) {
                if (words.find(vDs[j]) != words.end()) {
                    entropy -= term[j];
                } else if (isMaxWord[j]) {
                    allMaxPresent = false;
                }
            }
            hs[Get1DIndex(numLiterature, groupIdx, i)] = entropy;
            if (allMaxPresent) {
                // This group is considered strongly related to this literature.
                hr[Get1DIndex(numGroup, i, groupIdx)] = 1;
            }
        }
    }

    /* Write hs and hr into the directory of the knowledge particle. */
    MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
    if (pMatFile == nullptr) {
        cout << "Cannot create output mat file: " << outFilePath.string() << endl;
        return;
    }
    SaveMtxDouble(hs.data(), pMatFile, "hs", numGroup, numLiterature);
    SaveMtxDouble(hr.data(), pMatFile, "hr", numLiterature, numGroup);
    matClose(pMatFile);
}
/* 程序入口 */
// 运行例子CalcEntropy d:\Twirls\runtime\ALS_test abs2class.mat d:\Twirls\runtime\pubmed_files\pubmed-multiplesc-set.mat hx_info.mat 12 word.mat
void CalcEntropy(int argc, const char** argv) {
// argv
// 1. 知识颗粒的父目录名称
// 2. 包含高频词汇信息的mat文件的后缀
// 3. 包含处理后的pubmed文献信息的mat文件路径
// 4. 存放输出结果的mat文件名每个知识颗粒目录中生成一个结果文件
// 5. 线程数量(可选)
if (argc < 5) {
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
return;
}
clock_t begin, finish;
string parrentDir(argv[1]); // 知识颗粒的父目录名称
string wordMatSuffix(argv[2]); // 高频词汇矩阵对应的mat文件的后缀名可以是全文件名可以是文件名后缀必须保证唯一
fs::path outFileName(argv[4]);
int numThread = 1;
if (argc > 5) numThread = atoi(argv[5]);
if (numThread < 1) numThread = 1;
/* 读入处理后的pubmed文献信息的mat文件只读入摘要信息即变量abs1 */
vector<string> vAbstract;
int rowNum, colNum;
ReadMtxString(argv[3], "abs1", vAbstract, &rowNum, &colNum);
if (vAbstract.size() == 0) { // 摘要信息为空,出错
cout << "PubMed Abstract info is null!" << endl;
return;
}
/* 将摘要信息分割成一个一个的词汇 */
begin = clock();
unordered_set<char> usWordChars; // 能组成单词的字符要不要考虑数字原版matlab是提取了数字的
for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
vector<unordered_set<string> > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表
for (int i = 0; i < vAbstract.size(); i++) {
auto& strAbs = vAbstract[i];
// 遍历摘要字符串的每一个字符,取出每一个单词
vector<string>& vWord = vvWordMtx[i];
if (strAbs.size() == 0) continue; // 摘要信息为空,跳过(一般不会出现这个情况)
int wordStartPos = 0;
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
wordStartPos++;
for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符
vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
wordStartPos = curPos + 1; // 找下一个词语起始位置
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
wordStartPos++;
curPos = wordStartPos; // 循环会自动加1
}
}
// 将处理摘要之后的每个词语放入hash表
for (auto& word : vWord) {
string upWord(word);
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
vusAbsWord[i].insert(upWord);
}
}
finish = clock();
cout << "read abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 将分割结果写入mat文件 */
begin = clock();
if (argc > 6) {
MATFile* pMatFile = matOpen(argv[6], "w");
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
for (int i = 0; i < vvWordMtx.size(); ++i) {
mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size());
for (int j = 0; j < vvWordMtx[i].size(); ++j) {
mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str());
mxSetCell(pChildCellMtx, j, mxStr);
}
mxSetCell(pCellMtx, i, pChildCellMtx);
}
matPutVariable(pMatFile, "wd", pCellMtx);
matClose(pMatFile);
mxDestroyArray(pCellMtx);
}
finish = clock();
cout << "write abstract word time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 遍历所有的知识颗粒目录,逐一进行处理 */
begin = clock();
//ThreadPool thPool(numThread);
// 查看知识颗粒数量
int numKnowledgeParticle = 0;
FOREACH_PARTICLE_START
numKnowledgeParticle++;
FOREACH_PARTICLE_END
// 遍历每个知识颗粒,逐一进行处理
vector<ThreadParamEntropy> vTP;
for (int round = 0; round < 1; ++round) { // 测试用
int i = 0;
FOREACH_PARTICLE_START
//ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
//thPool.enqueue(ThreadProcessData, tParam);
vTP.push_back({ file, childDir / outFileName, &vusAbsWord });
i++;
FOREACH_PARTICLE_END
}
kt_for(numThread, ThreadProcessEntropy, vTP);
// synchronize
//thPool.~ThreadPool();
finish = clock();
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}