301 lines
11 KiB
C++
301 lines
11 KiB
C++
|
|
/*********************************************************************************************
|
|||
|
|
Description: For each literature abstract, check whether it contains the configured high-frequency words, and compute the information entropy of the abstract from those words.
|
|||
|
|
|
|||
|
|
Copyright : All right reserved by ZheYuan.BJ
|
|||
|
|
|
|||
|
|
Author : Zhang Zhonghai
|
|||
|
|
Date : 2023/09/20
|
|||
|
|
***********************************************************************************************/
|
|||
|
|
#include <time.h>

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <queue>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <omp.h>

#ifdef _WIN32
#include <io.h>
#include <process.h>
#define F_OK 0
#else
#include <unistd.h>
#endif

#include <mat.h>

#include "common.h"
#include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h"
|
|||
|
|
using namespace std;
|
|||
|
|
using std::cout;
|
|||
|
|
using std::vector;
|
|||
|
|
namespace fs = std::filesystem;
|
|||
|
|
#include "common.h"
|
|||
|
|
#include "CommonLib/matlab_io.h"
|
|||
|
|
using namespace std;
|
|||
|
|
|
|||
|
|
// Loop helpers for visiting every knowledge-particle file.
|
|||
|
|
// Opens a loop over every entry <parrentDir>/<childDir>/<file> whose file name
// ends with wordMatSuffix.
//
// Expects `parrentDir` and `wordMatSuffix` (std::string) in the enclosing
// scope. Inside the loop body `childDir` and `file` are the current directory
// entries and `fileName` is the base name of `file`.
// Must be closed with FOREACH_PARTICLE_END.
//
// Entries of parrentDir that are not directories are skipped: building a
// directory_iterator on a regular file throws std::filesystem::filesystem_error.
#define FOREACH_PARTICLE_START \
    for (auto &childDir : std::filesystem::directory_iterator(parrentDir)) { \
        if (!childDir.is_directory()) continue; \
        for (auto &file : std::filesystem::directory_iterator(childDir)) { \
            const std::string &fileName = file.path().filename().string(); \
            auto rPos = fileName.rfind(wordMatSuffix); \
            if (rPos != std::string::npos && fileName.size() - rPos == wordMatSuffix.size()) {

// Closes the three scopes opened by FOREACH_PARTICLE_START.
#define FOREACH_PARTICLE_END \
            } \
        } \
    }
|
|||
|
|
|
|||
|
|
/* Loop helpers for walking the nested cell arrays of the ds (strings) and fr (values) fields. */
|
|||
|
|
// Opens a row-by-row walk over the cell array `pMxArray`.
//
// Expects in the enclosing scope: `pMxArray` (the cell array), `pCell`
// (mxArray pointer) and `rowNum` / `colNum` (int). Inside the loop body `i` /
// `j` are the current row / column, `pCell` is the element at linear index
// j * rowNum + i, and `childRowNum` / `childColNum` are that element's
// dimensions (both 0 when mxGetCell returned a null pointer).
// A null `pMxArray` yields zero iterations. Close with OUTER_FOR_END.
#define OUTER_FOR_BEGIN \
    rowNum = (pMxArray != nullptr) ? (int)mxGetM(pMxArray) : 0; \
    colNum = (pMxArray != nullptr) ? (int)mxGetN(pMxArray) : 0; \
    for (int i = 0; i < rowNum; ++i) { \
        for (int j = 0; j < colNum; ++j) { \
            pCell = mxGetCell(pMxArray, j * rowNum + i); \
            int childRowNum = (pCell != nullptr) ? (int)mxGetM(pCell) : 0; \
            int childColNum = (pCell != nullptr) ? (int)mxGetN(pCell) : 0;

// Closes the two loops opened by OUTER_FOR_BEGIN.
// It deliberately does NOT call mxDestroyArray(pMxArray): the only caller
// passes a pointer obtained from mxGetField, which is owned by the parent
// struct array and must be released by destroying that parent instead.
#define OUTER_FOR_END \
        } \
    }
|
|||
|
|
|
|||
|
|
// Opens a row-by-row walk over the cell array currently held in `pCell`.
//
// Expects `pCell`, `childRowNum` and `childColNum` in the enclosing scope
// (OUTER_FOR_BEGIN provides them). Inside the loop body `ii` / `jj` are the
// current row / column and `pChildCell` is the element at linear index
// jj * childRowNum + ii. Close with INNTER_FOR_END.
#define INNTER_FOR_BEGIN \
    for (int ii = 0; ii < childRowNum; ++ii) { \
        for (int jj = 0; jj < childColNum; ++jj) { \
            mxArray *pChildCell = mxGetCell(pCell, jj * childRowNum + ii);

// Closes the two loops opened by INNTER_FOR_BEGIN.
#define INNTER_FOR_END \
        } \
    }
|
|||
|
|
// Convert MATLAB (column-major) storage into C (row-major) storage.
|
|||
|
|
// Copies a rowNum x colNum matrix from `src`, indexed column by column
// (src[col * rowNum + row]), into `dst`, indexed row by row
// (dst[row * colNum + col]). `dst` must already hold rowNum * colNum elements.
//
// Arguments are parenthesized and the dimensions are evaluated once, so
// expressions such as `r + 1` are safe. The do/while(0) wrapper makes the
// macro a single statement: use it with a trailing semicolon.
#define TRANS_ROW_COL(dst, src, rowNum, colNum) \
    do { \
        const int trcRows_ = (rowNum); \
        const int trcCols_ = (colNum); \
        for (int rowI = 0; rowI < trcRows_; ++rowI) { \
            for (int colJ = 0; colJ < trcCols_; ++colJ) { \
                (dst)[rowI * trcCols_ + colJ] = (src)[colJ * trcRows_ + rowI]; \
            } \
        } \
    } while (0)
|
|||
|
|
|
|||
|
|
// Read the ds and fr information.
|
|||
|
|
bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vector<vector<double> >&vvFr) {
|
|||
|
|
|
|||
|
|
MATFile* pMatFile = nullptr;
|
|||
|
|
mxArray* pMxArray = nullptr;
|
|||
|
|
mxArray* pCell = nullptr;
|
|||
|
|
int rowNum, colNum;
|
|||
|
|
char strBuf[STRING_BUF_SIZE];
|
|||
|
|
const string& parrentName = "G";
|
|||
|
|
const string& firstChildName = "ds";
|
|||
|
|
const string& secondChildName = "fr";
|
|||
|
|
|
|||
|
|
pMatFile = matOpen(filePath.c_str(), "r"); //<2F><><EFBFBD><EFBFBD>.mat<61>ļ<EFBFBD>
|
|||
|
|
if (pMatFile == nullptr) {
|
|||
|
|
cout << "filePath is error!" << endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
mxArray* pMxG = matGetVariable(pMatFile, parrentName.c_str()); //<2F><>ȡG<C8A1><47><EFBFBD><EFBFBD>
|
|||
|
|
|
|||
|
|
// <20><>ȡds<64>ַ<EFBFBD><D6B7><EFBFBD>
|
|||
|
|
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
|
|||
|
|
OUTER_FOR_BEGIN
|
|||
|
|
// cout << childRowNum << '\t' << childColNum << endl;
|
|||
|
|
vvDs.push_back(vector<string>());
|
|||
|
|
vvDs.back().resize(childRowNum * childColNum);
|
|||
|
|
INNTER_FOR_BEGIN
|
|||
|
|
if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) {
|
|||
|
|
cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
vvDs.back()[ii * childColNum + jj] = strBuf;
|
|||
|
|
auto& lastStr = vvDs.back()[ii * childColNum + jj];
|
|||
|
|
transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // ת<>ɴ<EFBFBD>д
|
|||
|
|
INNTER_FOR_END
|
|||
|
|
OUTER_FOR_END
|
|||
|
|
|
|||
|
|
// <20><>ȡfr<66><72>ֵ
|
|||
|
|
pMxArray = mxGetField(pMxG, 0, secondChildName.c_str()); // fr
|
|||
|
|
OUTER_FOR_BEGIN
|
|||
|
|
vvFr.push_back(vector<double>());
|
|||
|
|
vvFr.back().resize(childRowNum * childColNum);
|
|||
|
|
double *pVal = (double*)mxGetData(pCell); //<2F><>ȡָ<C8A1><D6B8>
|
|||
|
|
TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // <20><><EFBFBD>д洢<D0B4><E6B4A2>ʽת<CABD><D7AA>
|
|||
|
|
OUTER_FOR_END
|
|||
|
|
|
|||
|
|
// û<><C3BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȫ<EFBFBD><C8AB>Щ<EFBFBD><D0A9><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ҪmxDestroyArray<61><79><EFBFBD><EFBFBD><EFBFBD>ܻ<EFBFBD><DCBB><EFBFBD><EFBFBD>ڴ<EFBFBD>й©
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/* Processing of a single knowledge particle. */
|
|||
|
|
/* Result computed for a single knowledge-particle .mat file. */
struct EntropyResult {
    using Matrix = std::vector<std::vector<double> >;

    Matrix vvEntropy;       // information entropy, sized [word group][literature]
    Matrix vvTransEntropy;  // transposed layout, sized [literature][word group]
};
|
|||
|
|
struct ThreadParam { // <20>̲߳<DFB3><CCB2><EFBFBD>
|
|||
|
|
fs::path matFilePath;
|
|||
|
|
vector<unordered_set<string> >* pvusWord;
|
|||
|
|
EntropyResult* pRes;
|
|||
|
|
};
|
|||
|
|
void ThreadProcessData(const ThreadParam& param) {
|
|||
|
|
const fs::path& matFilePath = param.matFilePath;
|
|||
|
|
EntropyResult& res = *param.pRes;
|
|||
|
|
vector <unordered_set<string> >& vusWord = *param.pvusWord;
|
|||
|
|
|
|||
|
|
// <20><><EFBFBD>Ž<EFBFBD><C5BD><EFBFBD>
|
|||
|
|
auto& hs = res.vvEntropy;
|
|||
|
|
auto& hr = res.vvTransEntropy;
|
|||
|
|
|
|||
|
|
vector<vector<string> > vvDs; // ÿ<><C3BF>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ds<64><73><EFBFBD>ʻ<F3A3A8B4><CABB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
vector<vector<double> > vvFr; // <20>ʻ<EFBFBD><CABB><EFBFBD>Ӧ<EFBFBD><D3A6>Ƶ<EFBFBD><C6B5>
|
|||
|
|
|
|||
|
|
// cout << matFilePath.string() << endl;
|
|||
|
|
// <20><>ȡG<C8A1>ṹ<EFBFBD><E1B9B9><EFBFBD>е<EFBFBD>ds<64><73>fr<66><72>Ϣ
|
|||
|
|
ReadInfoFromMat(matFilePath.string(), vvDs, vvFr);
|
|||
|
|
// res.vvEntropy.push_back(vvFr[0]);
|
|||
|
|
// cout << vvDs.size() << '\t' << vvDs[0].size() << endl;
|
|||
|
|
const int numLiterature = vusWord.size(); // pubmed <20>ļ<EFBFBD><C4BC>а<EFBFBD><D0B0><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
const int numGroup = vvDs.size(); // ds<64><73><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
hs.resize(numGroup);
|
|||
|
|
hr.resize(numLiterature);
|
|||
|
|
for (int i = 0; i < numGroup; ++i) hs[i].resize(numLiterature); // resize<7A><65><EFBFBD>Զ<EFBFBD><D4B6><EFBFBD>ʼ<EFBFBD><CABC>
|
|||
|
|
for (int i = 0; i < numLiterature; ++i) hr[i].resize(numGroup);
|
|||
|
|
for (int groupIdx = 0; groupIdx < vvDs.size(); ++groupIdx) { // <20><><EFBFBD><EFBFBD>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF><D2BB>
|
|||
|
|
vector<string>& vDs = vvDs[groupIdx]; // <20><>һ<EFBFBD><D2BB>ds
|
|||
|
|
vector<double>& vFr = vvFr[groupIdx]; // frequency
|
|||
|
|
const int numWord = vDs.size(); // <20><>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>а<EFBFBD><D0B0><EFBFBD><EFBFBD>ĵ<EFBFBD><C4B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
vector<vector<int> > vX(numLiterature, vector<int>(numWord, 0));
|
|||
|
|
// <20><><EFBFBD><EFBFBD>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD>еĴ<D0B5><C4B4><EFBFBD><EFBFBD>Ƿ<EFBFBD><C7B7><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>pubmedժҪ<D5AA>Ĵ<EFBFBD><C4B4><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
for (int i= 0; i < numLiterature; ++i) {
|
|||
|
|
for (int j = 0; j < numWord; ++j) {
|
|||
|
|
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // <20><>һ<EFBFBD>鵥<EFBFBD><E9B5A5><EFBFBD>е<EFBFBD>j<EFBFBD><6A><EFBFBD><EFBFBD>λ<EFBFBD>õĵ<C3B5><C4B5><EFBFBD><EFBFBD>ڵ<EFBFBD>i<EFBFBD><69><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>г<EFBFBD><D0B3>ֹ<EFBFBD>
|
|||
|
|
vX[i][j] = 1;
|
|||
|
|
// <20><>ÿ<EFBFBD><C3BF>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD>ÿһ<C3BF><D2BB><EFBFBD><EFBFBD><EFBFBD>ݣ<EFBFBD><DDA3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><CFA2>
|
|||
|
|
hs[groupIdx][i] -= vFr[j] * log2(vFr[j]);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// cout << vX[0][0] << endl;
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
for (int i = 0; i < numLiterature; ++i) {
|
|||
|
|
|
|||
|
|
if (vX[groupIdx][i] == 1) {
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD>ʻ<EFBFBD><CABB><EFBFBD><EFBFBD>ڵ<EFBFBD><DAB5><EFBFBD><EFBFBD><EFBFBD>λ<EFBFBD><CEBB>
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/* Entry point of the entropy calculation. */
|
|||
|
|
void CalcEntropy(int argc, const char** argv) {
|
|||
|
|
// argv
|
|||
|
|
// 1. ֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD>ĸ<EFBFBD>Ŀ¼<C4BF><C2BC><EFBFBD><EFBFBD>
|
|||
|
|
// 2. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD>ʻ<EFBFBD><CABB><EFBFBD>Ϣ<EFBFBD><CFA2>mat<61>ļ<EFBFBD><C4BC>ĺ<EFBFBD>
|
|||
|
|
// 3. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>pubmed<65><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><CFA2>mat<61>ļ<EFBFBD>·<EFBFBD><C2B7>
|
|||
|
|
// 4. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>mat<61>ļ<EFBFBD><C4BC>ĺ<EFBFBD><EFBFBD><D7BA>ÿ<EFBFBD><C3BF>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD>Ŀ¼<C4BF><C2BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
|
|||
|
|
// 5. <20>߳<EFBFBD><DFB3><EFBFBD><EFBFBD><EFBFBD>(<28><>ѡ)
|
|||
|
|
if (argc < 5) {
|
|||
|
|
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number])!" << endl;
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
clock_t begin, finish;
|
|||
|
|
string parrentDir(argv[1]); // ֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD>ĸ<EFBFBD>Ŀ¼<C4BF><C2BC><EFBFBD><EFBFBD>
|
|||
|
|
string wordMatSuffix(argv[2]); // <20><>Ƶ<EFBFBD>ʻ<EFBFBD><CABB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ӧ<EFBFBD><D3A6>mat<61>ļ<EFBFBD><C4BC>ĺ<EFBFBD><EFBFBD><D7BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȫ<EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><D7BA><EFBFBD><EFBFBD><EFBFBD>뱣֤Ψһ<CEA8><D2BB>
|
|||
|
|
int numThread = 1;
|
|||
|
|
if (argc >= 5) numThread = atoi(argv[5]);
|
|||
|
|
if (numThread < 1) numThread = 1;
|
|||
|
|
// cout << "thread num: " << numThread << endl;
|
|||
|
|
|
|||
|
|
/* <20><><EFBFBD>봦<EFBFBD><EBB4A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>pubmed<65><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><CFA2>mat<61>ļ<EFBFBD><C4BC><EFBFBD>ֻ<EFBFBD><D6BB><EFBFBD><EFBFBD>ժҪ<D5AA><D2AA>Ϣ<EFBFBD><CFA2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>abs1 */
|
|||
|
|
vector<string> vAbstract;
|
|||
|
|
int rowNum, colNum;
|
|||
|
|
ReadMtxString(argv[3], "abs1", vAbstract, &rowNum, &colNum);
|
|||
|
|
if (vAbstract.size() == 0) { // ժҪ<D5AA><D2AA>ϢΪ<CFA2>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
cout << "PubMed Abstract info is null!" << endl;
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
// <20><>ժҪ<D5AA><D2AA>Ϣ<EFBFBD>ָ<EFBFBD><D6B8><EFBFBD>һ<EFBFBD><D2BB>һ<EFBFBD><D2BB><EFBFBD>Ĵʻ<C4B4>
|
|||
|
|
begin = clock();
|
|||
|
|
unordered_set<char> usWordChars; // <20><><EFBFBD><EFBFBD><EFBFBD>ɵ<EFBFBD><C9B5>ʵ<EFBFBD><CAB5>ַ<EFBFBD><D6B7><EFBFBD>Ҫ<EFBFBD><D2AA>Ҫ<EFBFBD><D2AA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֣<EFBFBD>ԭ<EFBFBD><D4AD>matlab<61><62><EFBFBD><EFBFBD>ȡ<EFBFBD><C8A1><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD>
|
|||
|
|
for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
|
|||
|
|
for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
|
|||
|
|
vector<vector<string> > vvWordMtx(vAbstract.size()); // <20><>ʼ<EFBFBD><CABC>СΪ<D0A1><CEAA><EFBFBD>µĸ<C2B5><C4B8><EFBFBD>
|
|||
|
|
vector<unordered_set<string> > vusAbsWord(vAbstract.size()); // <20><>ÿƪ<C3BF><C6AA><EFBFBD><EFBFBD>ժҪ<D5AA>ĵ<EFBFBD><C4B5>ʷ<EFBFBD><CAB7><EFBFBD>hash<73><68>
|
|||
|
|
for (int i = 0; i < vAbstract.size(); i++) {
|
|||
|
|
auto& strAbs = vAbstract[i];
|
|||
|
|
// <20><><EFBFBD><EFBFBD>ժҪ<D5AA>ַ<EFBFBD><D6B7><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF><D2BB><EFBFBD>ַ<EFBFBD><D6B7><EFBFBD>ȡ<EFBFBD><C8A1>ÿһ<C3BF><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
vector<string>& vWord = vvWordMtx[i];
|
|||
|
|
if (strAbs.size() == 0) continue; // ժҪ<D5AA><D2AA>ϢΪ<CFA2>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD>㲻<EFBFBD><E3B2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
int wordStartPos = 0;
|
|||
|
|
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
|
|||
|
|
wordStartPos++;
|
|||
|
|
for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
|
|||
|
|
if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // <20>ҵ<EFBFBD><D2B5>˷ָ<CBB7><D6B8><EFBFBD>
|
|||
|
|
vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
|
|||
|
|
wordStartPos = curPos + 1; // <20><><EFBFBD><EFBFBD>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʼλ<CABC><CEBB>
|
|||
|
|
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
|
|||
|
|
wordStartPos++;
|
|||
|
|
curPos = wordStartPos; // ѭ<><D1AD><EFBFBD><EFBFBD><EFBFBD>Զ<EFBFBD><D4B6><EFBFBD>1
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ժҪ֮<D2AA><D6AE><EFBFBD><EFBFBD>ÿ<EFBFBD><C3BF><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>hash<73><68>
|
|||
|
|
for (auto& word : vWord) {
|
|||
|
|
string upWord(word);
|
|||
|
|
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
|
|||
|
|
// cout << upWord << endl;
|
|||
|
|
vusAbsWord[i].insert(upWord);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
finish = clock();
|
|||
|
|
cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
|||
|
|
//auto & vTest = vvWordMtx[0];
|
|||
|
|
//cout << vTest.size() << endl;
|
|||
|
|
//for (auto& str : vTest) cout << str << endl;
|
|||
|
|
|
|||
|
|
|
|||
|
|
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD>Ŀ¼<C4BF><C2BC><EFBFBD><EFBFBD>һ<EFBFBD><D2BB><EFBFBD>д<EFBFBD><D0B4><EFBFBD> */
|
|||
|
|
begin = clock();
|
|||
|
|
// ThreadPool thPool(numThread);
|
|||
|
|
ThreadPool thPool(24);
|
|||
|
|
// <20>鿴֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
int numKnowledgeParticle = 0;
|
|||
|
|
FOREACH_PARTICLE_START
|
|||
|
|
numKnowledgeParticle++;
|
|||
|
|
FOREACH_PARTICLE_END
|
|||
|
|
|
|||
|
|
vector<EntropyResult> vEntropyResult(numKnowledgeParticle); // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>н<EFBFBD><D0BD><EFBFBD>
|
|||
|
|
// <20><><EFBFBD><EFBFBD>ÿ<EFBFBD><C3BF>֪ʶ<D6AA><CAB6><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><D2BB><EFBFBD>д<EFBFBD><D0B4><EFBFBD>
|
|||
|
|
for (int round = 0; round < 1; ++round) { // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|||
|
|
int i = 0;
|
|||
|
|
FOREACH_PARTICLE_START
|
|||
|
|
ThreadParam tParam = { file, &vusAbsWord, &vEntropyResult[i] };
|
|||
|
|
thPool.enqueue(ThreadProcessData, tParam);
|
|||
|
|
i++;
|
|||
|
|
FOREACH_PARTICLE_END
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// synchronize
|
|||
|
|
thPool.~ThreadPool();
|
|||
|
|
finish = clock();
|
|||
|
|
|
|||
|
|
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
|||
|
|
/* <20>ϲ<EFBFBD><CFB2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
|||
|
|
//ofstream ofs("test_out.txt");
|
|||
|
|
//for (auto& item : vEntropyResult) {
|
|||
|
|
// auto& vvEntropy = item.vvEntropy;
|
|||
|
|
// auto& vVal = vvEntropy[0];
|
|||
|
|
// for (auto& val : vVal) ofs << val << ' ';
|
|||
|
|
// ofs << endl;
|
|||
|
|
//}
|
|||
|
|
//ofs.close();
|
|||
|
|
}
|