/*********************************************************************************************
 Description: For each article, check whether its abstract contains the configured
              high-frequency words, and use those words to compute the information
              entropy of the abstract.

 Copyright  : All rights reserved by ZheYuan.BJ

 Author     : Zhang Zhonghai

 Date       : 2023/09/20
***********************************************************************************************/
|
||
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <queue>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <omp.h>
#include <time.h>

#ifdef _WIN32
#include <io.h>
#include <process.h>
#define F_OK 0
#else
#include <unistd.h>
#endif

#include <mat.h>

#include "common.h"
#include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h"

using namespace std;
using std::cout;
using std::vector;
namespace fs = std::filesystem;
|
||
|
||
// Iterate over the knowledge particles: for every sub-directory of `parrentDir`, visit the
// files directly inside it whose file name ends with `wordMatSuffix`.
//
// The statements written between FOREACH_PARTICLE_START and FOREACH_PARTICLE_END run once per
// matching file and may use `childDir` (the sub-directory entry), `file` (the matching file
// entry) and `fileName`. `parrentDir` and `wordMatSuffix` must be in scope where the macro is
// expanded.
//
// Entries of `parrentDir` that are not directories are skipped: constructing a
// fs::directory_iterator on a regular file would throw fs::filesystem_error.
#define FOREACH_PARTICLE_START                                                                \
    for (auto &childDir : fs::directory_iterator(parrentDir)) {                               \
        if (!childDir.is_directory()) continue;                                               \
        for (auto &file : fs::directory_iterator(childDir)) {                                 \
            const string &fileName = file.path().filename().string();                         \
            auto rPos = fileName.rfind(wordMatSuffix);                                        \
            if (rPos != string::npos && fileName.size() - rPos == wordMatSuffix.size()) {

#define FOREACH_PARTICLE_END \
            }                \
        }                    \
    }
|
||
|
||
/*
 * Walk a MATLAB cell array whose cells are themselves arrays (used to read the `ds` strings
 * and the `fr` values).
 *
 * OUTER_FOR_BEGIN expects `pMxArray`, `pCell`, `rowNum` and `colNum` to be declared where the
 * macro is expanded. It stores the dimensions of `pMxArray` in `rowNum`/`colNum` and then
 * visits the cells row by row. Inside the loop body `i`/`j` are the current row/column,
 * `pCell` is the current cell and `childRowNum`/`childColNum` are that cell's dimensions.
 *
 * OUTER_FOR_END closes both loops and then calls mxDestroyArray(pMxArray).
 * NOTE(review): the caller in this file obtains `pMxArray` from mxGetField - confirm that
 * destroying it here is intended.
 */
#define OUTER_FOR_BEGIN                                            \
    rowNum = static_cast<int>(mxGetM(pMxArray));                   \
    colNum = static_cast<int>(mxGetN(pMxArray));                   \
    for (int i = 0; i < rowNum; i++) {                             \
        for (int j = 0; j < colNum; j++) {                         \
            pCell = mxGetCell(pMxArray, i + j * rowNum);           \
            int childRowNum = static_cast<int>(mxGetM(pCell));     \
            int childColNum = static_cast<int>(mxGetN(pCell));

#define OUTER_FOR_END \
        }             \
    }                 \
    mxDestroyArray(pMxArray);
|
||
|
||
// Visit every cell of `pCell` row by row. `childRowNum` and `childColNum` must hold its
// dimensions (OUTER_FOR_BEGIN declares them). Inside the loop body `ii`/`jj` are the current
// row/column and `pChildCell` is the cell fetched with the linear index jj * childRowNum + ii.
#define INNTER_FOR_BEGIN                                                       \
    for (int ii = 0; ii < childRowNum; ++ii) {                                 \
        for (int jj = 0; jj < childColNum; ++jj) {                             \
            mxArray *pChildCell = mxGetCell(pCell, ii + jj * childRowNum);

#define INNTER_FOR_END \
        }              \
    }
|
||
// Copy a rowNum x colNum matrix from MATLAB's column-major layout (`src`) into C's row-major
// layout (`dst`): dst[r * colNum + c] = src[c * rowNum + r].
//
// `dst` and `src` only need to support operator[] (raw pointers, arrays, std::vector, ...).
// Every macro argument is parenthesised so that expressions such as `n + 1` expand correctly,
// and the body is wrapped in do { } while (0) so the macro behaves as a single statement (it
// is safe inside an unbraced if/else). Invoke it with a trailing semicolon. The arguments are
// evaluated more than once, so they must not have side effects.
#define TRANS_ROW_COL(dst, src, rowNum, colNum)                                        \
    do {                                                                               \
        for (int rowI = 0; rowI < (rowNum); ++rowI) {                                  \
            for (int colJ = 0; colJ < (colNum); ++colJ) {                              \
                (dst)[rowI * (colNum) + colJ] = (src)[colJ * (rowNum) + rowI];         \
            }                                                                          \
        }                                                                              \
    } while (0)
|
||
|
||
// 读取ds和fr信息
|
||
bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vector<vector<double> >&vvFr) {
|
||
|
||
MATFile* pMatFile = nullptr;
|
||
mxArray* pMxArray = nullptr;
|
||
mxArray* pCell = nullptr;
|
||
int rowNum, colNum;
|
||
char strBuf[STRING_BUF_SIZE];
|
||
const string& parrentName = "G";
|
||
const string& firstChildName = "ds";
|
||
const string& secondChildName = "fr";
|
||
|
||
pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件
|
||
if (pMatFile == nullptr) {
|
||
cout << "filePath is error!" << endl;
|
||
return false;
|
||
}
|
||
mxArray* pMxG = matGetVariable(pMatFile, parrentName.c_str()); //获取G变量
|
||
|
||
// 读取ds字符串
|
||
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
|
||
OUTER_FOR_BEGIN
|
||
// cout << childRowNum << '\t' << childColNum << endl;
|
||
vvDs.push_back(vector<string>());
|
||
vvDs.back().resize(childRowNum * childColNum);
|
||
INNTER_FOR_BEGIN
|
||
if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) {
|
||
cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
|
||
return false;
|
||
}
|
||
vvDs.back()[ii * childColNum + jj] = strBuf;
|
||
auto& lastStr = vvDs.back()[ii * childColNum + jj];
|
||
transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // 转成大写
|
||
INNTER_FOR_END
|
||
OUTER_FOR_END
|
||
|
||
// 读取fr数值
|
||
pMxArray = mxGetField(pMxG, 0, secondChildName.c_str()); // fr
|
||
OUTER_FOR_BEGIN
|
||
vvFr.push_back(vector<double>());
|
||
vvFr.back().resize(childRowNum * childColNum);
|
||
double *pVal = (double*)mxGetData(pCell); //获取指针
|
||
TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // 行列存储方式转换
|
||
OUTER_FOR_END
|
||
|
||
// 没考虑完全哪些数据需要mxDestroyArray,可能会有内存泄漏
|
||
return true;
|
||
}
|
||
|
||
// Map the (row, col) position of a row-major matrix that has `colNum` columns to the offset
// of that element in the flat array storing the matrix.
inline int Get1DIndex(int colNum, int row, int col) {
    const int rowOffset = colNum * row;
    return col + rowOffset;
}
|
||
|
||
/* Arguments for one worker task (ThreadProcessData); each task processes one knowledge particle. */
struct ThreadParam {
    std::filesystem::path matFilePath;  // .mat file the particle's ds/fr data is read from
    std::filesystem::path outFilePath;  // .mat file the hs1/hr1 results are written to
    // Word set of every abstract, one set per article. Not owned: the pointee must outlive
    // the task. Defaults to nullptr so that a default-constructed parameter never carries an
    // indeterminate pointer.
    std::vector<std::unordered_set<std::string> >* pvusWord = nullptr;
};
|
||
void ThreadProcessData(const ThreadParam& param) {
|
||
const fs::path& matFilePath = param.matFilePath;
|
||
const fs::path& outFilePath = param.outFilePath;
|
||
vector <unordered_set<string> >& vusWord = *param.pvusWord;
|
||
|
||
// 存放结果,用一维数组存放二维数据
|
||
vector<double> hs;
|
||
vector<double> hr;
|
||
|
||
vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵)
|
||
vector<vector<double> > vvFr; // 词汇对应的频率
|
||
|
||
// cout << matFilePath.string() << endl;
|
||
// 读取G结构体中的ds和fr信息
|
||
ReadInfoFromMat(matFilePath.string(), vvDs, vvFr);
|
||
// res.vvEntropy.push_back(vvFr[0]);
|
||
// cout << vvDs.size() << '\t' << vvDs[0].size() << endl;
|
||
const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量
|
||
const int numGroup = vvDs.size(); // ds包含的组数
|
||
hs.resize(numGroup * numLiterature);
|
||
hr.resize(numLiterature * numGroup);
|
||
|
||
for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组
|
||
vector<string>& vDs = vvDs[groupIdx]; // 这一组ds
|
||
vector<double>& vFr = vvFr[groupIdx]; // frequency
|
||
const int numWord = vDs.size(); // 这一组数据中包含的单词数量
|
||
vector<vector<int> > vX(numLiterature, vector<int>(numWord, 0));
|
||
// 检查知识颗粒中的词语是否出现在pubmed摘要的词语中
|
||
for (int i= 0; i < numLiterature; ++i) {
|
||
for (int j = 0; j < numWord; ++j) {
|
||
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
|
||
vX[i][j] = 1;
|
||
if (groupIdx == 1 && i == 2) {
|
||
// cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 找词汇的最高频率
|
||
double maxFr = *max_element(vFr.begin(), vFr.end());
|
||
// 将fr的数值规范化到(0,0.368)之间
|
||
const double normalMax = 0.368;
|
||
for (auto& frVal : vFr) frVal = frVal * normalMax / maxFr;
|
||
maxFr = normalMax;
|
||
// 对每个知识颗粒每一组数据,计算信息熵
|
||
for (int i = 0; i < numLiterature; ++i) {
|
||
for (int j = 0; j < numWord; ++j) {
|
||
if (vX[i][j] == 1) {
|
||
hs[Get1DIndex(numLiterature, groupIdx, i)] -= vFr[j] * log2(vFr[j]);
|
||
}
|
||
}
|
||
}
|
||
|
||
// 找最高频词汇所在的索引位置
|
||
vector<int> vMaxPos;
|
||
int idx = 0;
|
||
for_each(vFr.begin(), vFr.end(), [&idx, maxFr, &vMaxPos](double val) {
|
||
if (val == maxFr) vMaxPos.push_back(idx);
|
||
idx++;
|
||
});
|
||
|
||
for (int i = 0; i < numLiterature; ++i) {
|
||
int cumulateX = 0; // 计算在最高频词汇处,x值的累加结果
|
||
for (int j = 0; j < vMaxPos.size(); ++j) cumulateX += vX[i][vMaxPos[j]];
|
||
if (cumulateX == vMaxPos.size()) { // 如果频率最高的词汇都出现在了文献中
|
||
hr[Get1DIndex(numGroup,i, groupIdx)] = 1; // 应该是表示知识颗粒的这一组数据跟这篇文献相关性比较高
|
||
}
|
||
}
|
||
}
|
||
/* 将结果(hs和hr)写入每个知识颗粒的目录内 */
|
||
MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
|
||
SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature);
|
||
SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup);
|
||
matClose(pMatFile);
|
||
}
|
||
|
||
/* 程序入口 */
|
||
// 运行例子:CalcEntropy d:\Twirls\runtime\ALS_test abs2class.mat d:\Twirls\runtime\pubmed_files\pubmed-multiplesc-set.mat hx_info.mat 12 word.mat
|
||
void CalcEntropy(int argc, const char** argv) {
|
||
// argv
|
||
// 1. 知识颗粒的父目录名称
|
||
// 2. 包含高频词汇信息的mat文件的后缀
|
||
// 3. 包含处理后的pubmed文献信息的mat文件路径
|
||
// 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件)
|
||
// 5. 线程数量(可选)
|
||
if (argc < 5) {
|
||
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
|
||
return;
|
||
}
|
||
clock_t begin, finish;
|
||
string parrentDir(argv[1]); // 知识颗粒的父目录名称
|
||
string wordMatSuffix(argv[2]); // 高频词汇矩阵对应的mat文件的后缀名(可以是全文件名,可以是文件名后缀,必须保证唯一)
|
||
fs::path outFileName(argv[4]);
|
||
int numThread = 1;
|
||
if (argc >= 5) numThread = atoi(argv[5]);
|
||
if (numThread < 1) numThread = 1;
|
||
// cout << "thread num: " << numThread << endl;
|
||
|
||
/* 读入处理后的pubmed文献信息的mat文件,只读入摘要信息,即变量abs1 */
|
||
vector<string> vAbstract;
|
||
int rowNum, colNum;
|
||
ReadMtxString(argv[3], "abs1", vAbstract, &rowNum, &colNum);
|
||
if (vAbstract.size() == 0) { // 摘要信息为空,出错
|
||
cout << "PubMed Abstract info is null!" << endl;
|
||
return;
|
||
}
|
||
/* 将摘要信息分割成一个一个的词汇 */
|
||
begin = clock();
|
||
unordered_set<char> usWordChars; // 能组成单词的字符,要不要考虑数字?原版matlab是提取了数字的
|
||
for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
|
||
for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
|
||
for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
|
||
usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
|
||
vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
|
||
vector<unordered_set<string> > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表
|
||
for (int i = 0; i < vAbstract.size(); i++) {
|
||
auto& strAbs = vAbstract[i];
|
||
// 遍历摘要字符串的每一个字符,取出每一个单词
|
||
vector<string>& vWord = vvWordMtx[i];
|
||
if (strAbs.size() == 0) continue; // 摘要信息为空,跳过(一般不会出现这个情况)
|
||
int wordStartPos = 0;
|
||
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
|
||
wordStartPos++;
|
||
for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
|
||
if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符
|
||
vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
|
||
wordStartPos = curPos + 1; // 找下一个词语起始位置
|
||
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
|
||
wordStartPos++;
|
||
curPos = wordStartPos; // 循环会自动加1
|
||
}
|
||
}
|
||
// 将处理摘要之后的每个词语放入hash表
|
||
for (auto& word : vWord) {
|
||
string upWord(word);
|
||
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
|
||
// cout << upWord << endl;
|
||
vusAbsWord[i].insert(upWord);
|
||
}
|
||
}
|
||
/* 将分割结果写入mat文件 */
|
||
if (argc >= 6) {
|
||
MATFile* pMatFile = matOpen(argv[6], "w");
|
||
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
|
||
for (int i = 0; i < vvWordMtx.size(); ++i) {
|
||
mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size());
|
||
for (int j = 0; j < vvWordMtx[i].size(); ++j) {
|
||
mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str());
|
||
mxSetCell(pChildCellMtx, j, mxStr);
|
||
}
|
||
mxSetCell(pCellMtx, i, pChildCellMtx);
|
||
}
|
||
matPutVariable(pMatFile, "wd1", pCellMtx);
|
||
matClose(pMatFile);
|
||
mxDestroyArray(pCellMtx);
|
||
}
|
||
finish = clock();
|
||
cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
//auto & vTest = vvWordMtx[0];
|
||
//cout << vTest.size() << endl;
|
||
//for (auto& str : vTest) cout << str << endl;
|
||
|
||
|
||
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
||
begin = clock();
|
||
ThreadPool thPool(numThread);
|
||
// ThreadPool thPool(24);
|
||
// 查看知识颗粒数量
|
||
int numKnowledgeParticle = 0;
|
||
FOREACH_PARTICLE_START
|
||
numKnowledgeParticle++;
|
||
FOREACH_PARTICLE_END
|
||
|
||
// 遍历每个知识颗粒,逐一进行处理
|
||
for (int round = 0; round < 1; ++round) { // 测试用
|
||
int i = 0;
|
||
FOREACH_PARTICLE_START
|
||
ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
|
||
thPool.enqueue(ThreadProcessData, tParam);
|
||
i++;
|
||
FOREACH_PARTICLE_END
|
||
}
|
||
|
||
// synchronize
|
||
thPool.~ThreadPool();
|
||
finish = clock();
|
||
|
||
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
//ofstream ofs("test_out.txt");
|
||
//for (auto& item : vEntropyResult) {
|
||
// auto& vvEntropy = item.vvEntropy;
|
||
// auto& vVal = vvEntropy[0];
|
||
// for (auto& val : vVal) ofs << val << ' ';
|
||
// ofs << endl;
|
||
//}
|
||
//ofs.close();
|
||
} |