添加了kthread,并适配c++和windows

This commit is contained in:
zzh 2023-09-27 10:27:19 +08:00
parent 31f0382cc1
commit 0c73318fb7
7 changed files with 254 additions and 65 deletions

View File

@ -112,6 +112,7 @@
</Text> </Text>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="kthread.h" />
<ClInclude Include="matlab_io.h" /> <ClInclude Include="matlab_io.h" />
<ClInclude Include="thread_pool.h" /> <ClInclude Include="thread_pool.h" />
</ItemGroup> </ItemGroup>

View File

@ -28,6 +28,9 @@
<ClInclude Include="thread_pool.h"> <ClInclude Include="thread_pool.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="kthread.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="matlab_io.cpp"> <ClCompile Include="matlab_io.cpp">

145
CommonLib/kthread.h 100644
View File

@ -0,0 +1,145 @@
#ifndef KTHREAD_H
#define KTHREAD_H
#include <stdlib.h>
#include <limits.h>
#include <thread>
#include <vector>
#include <atomic>
using std::atomic;
using std::thread;
using std::vector;
/************
* kt_for() *
************/
/// Index-based callback for kt_for(): called as func(items, index, tid),
/// where `items` is the whole vector being processed, `index` is the position
/// of the element to handle and `tid` is the number of the calling worker
/// (always 0 when kt_for() runs serially).
template <class T>
using FuncType3Arg = void (*)(std::vector<T>& items, long index, int tid);

/// Element-based callback for kt_for(): called as func(item) with a mutable
/// reference to one element of the vector being processed.
template <class T>
using FuncType1Arg = void (*)(T& item);
// Shared per-run state; declared here so a worker slot can point back at it.
template <typename T>
struct kt_for_t;

/// Per-thread slot used by kt_for().
template <class T>
struct ktf_worker_t
{
    /// Back-pointer to the state shared by all workers of the same run.
    kt_for_t<T>* t;
    /// Next index this slot hands out. It is advanced with fetch_add both by
    /// the owning worker and by other workers that steal from this slot.
    std::atomic<long> i;
};
/// State shared by every worker of a single kt_for() run.
///
/// All members carry a default so that a default-constructed object never
/// holds indeterminate values; in particular each kt_for() overload only
/// fills in one of the two callback pointers.
template <typename T>
struct kt_for_t
{
    /// Number of workers; also the stride by which a worker's index advances.
    int n_threads = 0;
    /// Number of elements to process; an index >= n means "nothing left".
    long n = 0;
    /// Array of n_threads worker slots.
    ktf_worker_t<T>* w = nullptr;
    /// Element-based callback (used by ktf_worker_1_arg).
    FuncType1Arg<T> func1Arg = nullptr;
    /// Index-based callback (used by ktf_worker_3_arg).
    FuncType3Arg<T> func3Arg = nullptr;
    /// The vector whose elements are being processed.
    std::vector<T>* data = nullptr;
};
/// Claim one index from the worker slot whose counter is currently lowest.
///
/// @param t  shared run state; t->w must point at t->n_threads worker slots.
/// @return   the claimed index, or -1 when the claimed index is past the end
///           of the data (t->n) or when no slot could be selected.
template <class T>
static inline long steal_work(kt_for_t<T>* t)
{
    int victim = -1;
    long lowest = LONG_MAX;
    for (int k = 0; k < t->n_threads; ++k)
    {
        // Load once so the value compared is the value recorded.
        const long current = t->w[k].i.load();
        if (current < lowest)
        {
            lowest = current;
            victim = k;
        }
    }
    // No slot selected (no workers, or every counter is at LONG_MAX):
    // report "no work" instead of indexing t->w[-1].
    if (victim < 0)
        return -1;
    // Another thread may have advanced the counter since it was inspected, so
    // the index actually claimed is whatever fetch_add returns.
    const long claimed = t->w[victim].i.fetch_add(t->n_threads);
    return claimed >= t->n ? -1 : claimed;
}
template <class T>
static void ktf_worker_1_arg(void* data)
{
ktf_worker_t<T>* w = (ktf_worker_t<T> *)data;
long i;
for (;;)
{
i = w->i.fetch_add(w->t->n_threads);
if (i >= w->t->n)
break;
w->t->func1Arg(( * w->t->data)[i]);
}
while ((i = steal_work<T>(w->t)) >= 0)
w->t->func1Arg((*w->t->data)[i]);
}
template <class T>
static void ktf_worker_3_arg(void* data)
{
ktf_worker_t<T>* w = (ktf_worker_t<T> *)data;
long i;
for (;;)
{
i = w->i.fetch_add(w->t->n_threads);
if (i >= w->t->n)
break;
w->t->func3Arg(*w->t->data, i, w - w->t->w);
}
while ((i = steal_work<T>(w->t)) >= 0)
w->t->func3Arg(*w->t->data, i, w - w->t->w);
}
template <typename T>
void kt_for(int n_threads, FuncType3Arg<T> func, vector<T>& vData)
{
const long n = (long)vData.size();
if (n_threads > 1)
{
int i;
kt_for_t<T> t;
t.func3Arg = func, t.data = &vData, t.n_threads = n_threads, t.n = n;
t.w = (ktf_worker_t<T> *)alloca(n_threads * sizeof(ktf_worker_t<T>));
vector<thread> vThread;
for (i = 0; i < n_threads; ++i)
t.w[i].t = &t, t.w[i].i.store(i);
for (i = 0; i < n_threads; ++i)
vThread.push_back(thread(ktf_worker_3_arg<T>, &t.w[i]));
for (i = 0; i < n_threads; ++i)
vThread[i].join();
}
else
{
long j;
for (j = 0; j < n; ++j)
func(vData, j, 0);
}
}
template <typename T>
void kt_for(int n_threads, FuncType1Arg<T> func, vector<T>& vData)
{
const long n = (long)vData.size();
if (n_threads > 1)
{
int i;
kt_for_t<T> t;
t.func1Arg = func, t.data = &vData, t.n_threads = n_threads, t.n = n;
t.w = (ktf_worker_t<T> *)alloca(n_threads * sizeof(ktf_worker_t<T>));
vector<thread> vThread;
for (i = 0; i < n_threads; ++i)
t.w[i].t = &t, t.w[i].i.store(i);
for (i = 0; i < n_threads; ++i)
vThread.push_back(thread(ktf_worker_1_arg<T>, &t.w[i]));
for (i = 0; i < n_threads; ++i)
vThread[i].join();
}
else
{
long j;
for (j = 0; j < n; ++j)
func(vData[j]);
}
}
#endif

View File

@ -33,6 +33,7 @@
#include "common.h" #include "common.h"
#include "CommonLib/thread_pool.h" #include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h" #include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h"
using namespace std; using namespace std;
using std::cout; using std::cout;
using std::vector; using std::vector;
@ -106,7 +107,6 @@ bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vect
// 读取ds字符串 // 读取ds字符串
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
OUTER_FOR_BEGIN OUTER_FOR_BEGIN
// cout << childRowNum << '\t' << childColNum << endl;
vvDs.push_back(vector<string>()); vvDs.push_back(vector<string>());
vvDs.back().resize(childRowNum * childColNum); vvDs.back().resize(childRowNum * childColNum);
INNTER_FOR_BEGIN INNTER_FOR_BEGIN
@ -144,7 +144,7 @@ struct ThreadParam { //
fs::path outFilePath; fs::path outFilePath;
vector<unordered_set<string> >* pvusWord; vector<unordered_set<string> >* pvusWord;
}; };
void ThreadProcessData(const ThreadParam& param) { void ThreadProcessData(ThreadParam& param) {
const fs::path& matFilePath = param.matFilePath; const fs::path& matFilePath = param.matFilePath;
const fs::path& outFilePath = param.outFilePath; const fs::path& outFilePath = param.outFilePath;
vector <unordered_set<string> >& vusWord = *param.pvusWord; vector <unordered_set<string> >& vusWord = *param.pvusWord;
@ -156,11 +156,8 @@ void ThreadProcessData(const ThreadParam& param) {
vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵词汇矩阵 vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵词汇矩阵
vector<vector<double> > vvFr; // 词汇对应的频率 vector<vector<double> > vvFr; // 词汇对应的频率
// cout << matFilePath.string() << endl;
// 读取G结构体中的ds和fr信息 // 读取G结构体中的ds和fr信息
ReadInfoFromMat(matFilePath.string(), vvDs, vvFr); ReadInfoFromMat(matFilePath.string(), vvDs, vvFr);
// res.vvEntropy.push_back(vvFr[0]);
// cout << vvDs.size() << '\t' << vvDs[0].size() << endl;
const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量 const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量
const int numGroup = vvDs.size(); // ds包含的组数 const int numGroup = vvDs.size(); // ds包含的组数
hs.resize(numGroup * numLiterature); hs.resize(numGroup * numLiterature);
@ -176,9 +173,6 @@ void ThreadProcessData(const ThreadParam& param) {
for (int j = 0; j < numWord; ++j) { for (int j = 0; j < numWord; ++j) {
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过 if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
vX[i][j] = 1; vX[i][j] = 1;
if (groupIdx == 1 && i == 2) {
// cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl;
}
} }
} }
} }
@ -216,8 +210,8 @@ void ThreadProcessData(const ThreadParam& param) {
} }
/* 将结果hs和hr写入每个知识颗粒的目录内 */ /* 将结果hs和hr写入每个知识颗粒的目录内 */
MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w"); MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature); SaveMtxDouble(hs.data(), pMatFile, "hs", numGroup, numLiterature);
SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup); SaveMtxDouble(hr.data(), pMatFile, "hr", numLiterature, numGroup);
matClose(pMatFile); matClose(pMatFile);
} }
@ -228,7 +222,7 @@ void CalcEntropy(int argc, const char** argv) {
// 1. 知识颗粒的父目录名称 // 1. 知识颗粒的父目录名称
// 2. 包含高频词汇信息的mat文件的后缀 // 2. 包含高频词汇信息的mat文件的后缀
// 3. 包含处理后的pubmed文献信息的mat文件路径 // 3. 包含处理后的pubmed文献信息的mat文件路径
// 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件) // 4. 存放输出结果的mat文件(每个知识颗粒目录中生成一个结果文件)
// 5. 线程数量(可选) // 5. 线程数量(可选)
if (argc < 5) { if (argc < 5) {
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl; cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
@ -241,7 +235,6 @@ void CalcEntropy(int argc, const char** argv) {
int numThread = 1; int numThread = 1;
if (argc >= 5) numThread = atoi(argv[5]); if (argc >= 5) numThread = atoi(argv[5]);
if (numThread < 1) numThread = 1; if (numThread < 1) numThread = 1;
// cout << "thread num: " << numThread << endl;
/* 读入处理后的pubmed文献信息的mat文件只读入摘要信息即变量abs1 */ /* 读入处理后的pubmed文献信息的mat文件只读入摘要信息即变量abs1 */
vector<string> vAbstract; vector<string> vAbstract;
@ -281,11 +274,13 @@ void CalcEntropy(int argc, const char** argv) {
for (auto& word : vWord) { for (auto& word : vWord) {
string upWord(word); string upWord(word);
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper); transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
// cout << upWord << endl;
vusAbsWord[i].insert(upWord); vusAbsWord[i].insert(upWord);
} }
} }
finish = clock();
cout << "read abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 将分割结果写入mat文件 */ /* 将分割结果写入mat文件 */
begin = clock();
if (argc >= 6) { if (argc >= 6) {
MATFile* pMatFile = matOpen(argv[6], "w"); MATFile* pMatFile = matOpen(argv[6], "w");
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size()); mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
@ -297,21 +292,16 @@ void CalcEntropy(int argc, const char** argv) {
} }
mxSetCell(pCellMtx, i, pChildCellMtx); mxSetCell(pCellMtx, i, pChildCellMtx);
} }
matPutVariable(pMatFile, "wd1", pCellMtx); matPutVariable(pMatFile, "wd", pCellMtx);
matClose(pMatFile); matClose(pMatFile);
mxDestroyArray(pCellMtx); mxDestroyArray(pCellMtx);
} }
finish = clock(); finish = clock();
cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "write abstract word time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
//auto & vTest = vvWordMtx[0];
//cout << vTest.size() << endl;
//for (auto& str : vTest) cout << str << endl;
/* 遍历所有的知识颗粒目录,逐一进行处理 */ /* 遍历所有的知识颗粒目录,逐一进行处理 */
begin = clock(); begin = clock();
ThreadPool thPool(numThread); //ThreadPool thPool(numThread);
// ThreadPool thPool(24);
// 查看知识颗粒数量 // 查看知识颗粒数量
int numKnowledgeParticle = 0; int numKnowledgeParticle = 0;
FOREACH_PARTICLE_START FOREACH_PARTICLE_START
@ -319,27 +309,21 @@ void CalcEntropy(int argc, const char** argv) {
FOREACH_PARTICLE_END FOREACH_PARTICLE_END
// 遍历每个知识颗粒,逐一进行处理 // 遍历每个知识颗粒,逐一进行处理
vector<ThreadParam> vTP;
for (int round = 0; round < 1; ++round) { // 测试用 for (int round = 0; round < 1; ++round) { // 测试用
int i = 0; int i = 0;
FOREACH_PARTICLE_START FOREACH_PARTICLE_START
ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord }; //ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
thPool.enqueue(ThreadProcessData, tParam); //thPool.enqueue(ThreadProcessData, tParam);
vTP.push_back({ file, childDir / outFileName, &vusAbsWord });
i++; i++;
FOREACH_PARTICLE_END FOREACH_PARTICLE_END
} }
kt_for(numThread, ThreadProcessData, vTP);
// synchronize // synchronize
thPool.~ThreadPool(); //thPool.~ThreadPool();
finish = clock(); finish = clock();
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
//ofstream ofs("test_out.txt");
//for (auto& item : vEntropyResult) {
// auto& vvEntropy = item.vvEntropy;
// auto& vVal = vvEntropy[0];
// for (auto& val : vVal) ofs << val << ' ';
// ofs << endl;
//}
//ofs.close();
} }

View File

@ -27,6 +27,7 @@ int main(int argc, const char** argv) {
} }
else if (string(argv[1]) == "CalcEntropy") { else if (string(argv[1]) == "CalcEntropy") {
/* 计算信息熵 */ /* 计算信息熵 */
cout << "CalcEntropy" << endl;
CalcEntropy(argc - 1, argv + 1); CalcEntropy(argc - 1, argv + 1);
} }
finish = clock(); finish = clock();

View File

@ -14,7 +14,9 @@
#include <unordered_map> #include <unordered_map>
#include <mat.h> #include <mat.h>
#include "common.h" #include "common.h"
#include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h" #include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h"
using namespace std; using namespace std;
/* 将结果写入mat文件 */ /* 将结果写入mat文件 */
@ -35,7 +37,7 @@ bool SavePubmed(const string& matPath,
} }
// 创建结构体数据 // 创建结构体数据
mxArray* mxStruct = mxCreateStructMatrix(1, 1, vTgName.size(), vTgChars.data()); mxArray* mxStruct = mxCreateStructMatrix(1, 1, (int)vTgName.size(), vTgChars.data());
// 创建cell matrix // 创建cell matrix
unordered_map<string, mxArray*> ummxCellMtx; unordered_map<string, mxArray*> ummxCellMtx;
for (auto & tgName : vTgName) { for (auto & tgName : vTgName) {
@ -68,9 +70,45 @@ bool SavePubmed(const string& matPath,
return true; return true;
} }
/* 处理一篇文章 */
// Per-article work item handed to ThreadProcessArticle.
struct ThreadParam {
    // Output: tag name -> accumulated content for this article.
    std::unordered_map<std::string, std::string>* pumTagContent;
    // Full tag of every input line (indexed by line).
    std::vector<std::string>* pvLineTag;
    // Tag names; each gets an (initially empty) entry in the output map.
    std::vector<std::string>* pvTgName;
    // First line index belonging to this article.
    int paperStartIdx;
    // One past the last line index belonging to this article.
    int paperEndIdx;
    // Lookup from a line's full tag to its tag name.
    std::unordered_map<std::string, std::string>* pumFullTagToTag;
    // Text content of every input line (indexed by line).
    std::vector<std::string>* pvStrPubmedTxt;
};
// Build the tag -> content map for one article.
// Every tag name in *param.pvTgName is first set to an empty string; then,
// for each line in [paperStartIdx, paperEndIdx), the line's text is appended
// to the entry of the tag its full tag maps to. Lines whose full tag is not
// in the lookup table are skipped.
void ThreadProcessArticle(ThreadParam& param) {
    std::unordered_map<std::string, std::string>& tagContent = *param.pumTagContent;
    std::vector<std::string>& lineTags = *param.pvLineTag;
    std::vector<std::string>& tagNames = *param.pvTgName;
    std::unordered_map<std::string, std::string>& fullToTag = *param.pumFullTagToTag;
    std::vector<std::string>& lineTexts = *param.pvStrPubmedTxt;

    // Start every known tag with a fresh, empty string.
    for (const std::string& name : tagNames) {
        tagContent[name] = "";
    }

    // Walk the lines of this article and collect their text per tag.
    for (int line = param.paperStartIdx; line < param.paperEndIdx; ++line) {
        auto hit = fullToTag.find(lineTags[line]);
        if (hit == fullToTag.end()) {
            continue; // unknown tag: ignore this line
        }
        tagContent[hit->second].append(lineTexts[line]);
    }
}
// 命令行参数示例 // 命令行参数示例
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat // ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12
/* /*
pubmed txttagtag pubmed txttagtag
1. pubmed tags, tags'-'' 'tag 1. pubmed tags, tags'-'' 'tag
@ -81,11 +119,11 @@ bool SavePubmed(const string& matPath,
void ProcessPubmedTxt(int argc, const char** argv) { void ProcessPubmedTxt(int argc, const char** argv) {
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件 // argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
// //
if (argc != 4) { if (argc < 4) {
cout << "This program should take 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat)!" << endl; cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl;
return; return;
} }
clock_t begin, finish;
int rowNum, colNum; int rowNum, colNum;
vector<string> vTg; vector<string> vTg;
vector<string> vTgName; vector<string> vTgName;
@ -94,6 +132,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
/* 读取pubmed tags */ /* 读取pubmed tags */
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum); ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
/* 1. 去掉tags里的'-'和' '字符得到纯净的tag */ /* 1. 去掉tags里的'-'和' '字符得到纯净的tag */
begin = clock();
vTgName = vTg; vTgName = vTg;
for (int i = 0; i < vTg.size(); ++i) { for (int i = 0; i < vTg.size(); ++i) {
int pos = 0; int pos = 0;
@ -105,6 +144,8 @@ void ProcessPubmedTxt(int argc, const char** argv) {
vTgName[i].resize(pos); vTgName[i].resize(pos);
umFullTagToTag[vTg[i]] = vTgName[i]; umFullTagToTag[vTg[i]] = vTgName[i];
} }
finish = clock();
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 2. 读取pubmed txt文件先读入后处理 */ /* 2. 读取pubmed txt文件先读入后处理 */
ifstream ifsPubmedTxt(argv[2]); ifstream ifsPubmedTxt(argv[2]);
@ -117,6 +158,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
int curPos = 0; int curPos = 0;
vPaperStartIdx.push_back(curPos); // 添加初始索引 vPaperStartIdx.push_back(curPos); // 添加初始索引
const int FULL_TAG_LEN = 5; const int FULL_TAG_LEN = 5;
begin = clock();
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符 while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格 while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
if (strLine.size() == 0) { // 新的paper if (strLine.size() == 0) { // 新的paper
@ -135,31 +177,27 @@ void ProcessPubmedTxt(int argc, const char** argv) {
} }
} }
vPaperStartIdx.push_back(curPos); // 比文章多1最后一个记录结束位置 vPaperStartIdx.push_back(curPos); // 比文章多1最后一个记录结束位置
finish = clock();
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 处理每一篇文章 */ /* 处理每一篇文章 */
for (int i = 0; i < vPaperStartIdx.size() - 1; ++i) { int numThread = 1;
int startIdx = vPaperStartIdx[i]; if (argc >= 5) numThread = atoi(argv[4]);
int endIdx = vPaperStartIdx[i + 1]; if (numThread < 1) numThread = 1;
unordered_map<string, string> umTagContent; ThreadPool thPool(numThread);
vumPaperTagVal.resize(vPaperStartIdx.size()-1);
for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) { vector<thread> vT;
umTagContent[vTgName[tgIdx]] = ""; // 对每一个tag设置一个新的string vector<ThreadParam> vTP(vPaperStartIdx.size() - 1);
} begin = clock();
for (int idx = startIdx; idx < endIdx; ++idx) { // 遍历当前文章的每一个tag内容 for (int i = 0; i < vTP.size(); ++i) {
string& fullTag = vLineTag[idx]; vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
auto tagItr = umFullTagToTag.find(fullTag);
if (tagItr != umFullTagToTag.end()) { // 找到tag了
const string& tag = tagItr->second;
string& tagContent = umTagContent[tag];
tagContent.append(vStrPubmedTxt[idx]);
}
}
vumPaperTagVal.push_back(umTagContent);
} }
kt_for(numThread, ThreadProcessArticle, vTP);
// cout << "文件个数:" << vumPaperTagVal.size() << endl; finish = clock();
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 去除没有摘要的文章 */ /* 去除没有摘要的文章 */
begin = clock();
const string abstractTag = "AB"; const string abstractTag = "AB";
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) { for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
if ((*itr)[abstractTag].size() == 0) { if ((*itr)[abstractTag].size() == 0) {
@ -169,8 +207,11 @@ void ProcessPubmedTxt(int argc, const char** argv) {
itr++; itr++;
} }
} }
finish = clock();
cout << "remove no AB Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 根据PMID去除冗余 */ /* 根据PMID去除冗余 */
begin = clock();
unordered_map<string, int> umPMID; unordered_map<string, int> umPMID;
const string pmidTag = "PMID"; const string pmidTag = "PMID";
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) { for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
@ -183,16 +224,25 @@ void ProcessPubmedTxt(int argc, const char** argv) {
itr++; itr++;
} }
} }
finish = clock();
cout << "remove duplication Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 将title和abstract合并赋值给abstract */ /* 将title和abstract合并赋值给abstract */
begin = clock();
const string titleTag = "TI"; const string titleTag = "TI";
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) { for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) {
string& abstractStr = (*itr)[abstractTag]; string& abstractStr = (*itr)[abstractTag];
abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大 abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大
} }
finish = clock();
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
// 关闭txt文件
ifsPubmedTxt.close(); ifsPubmedTxt.close();
/* 将处理后的数据写入mat文件mat中的变量名称分别为Tx和abs1 */ /* 将处理后的数据写入mat文件mat中的变量名称分别为Tx和abs1 */
begin = clock();
SavePubmed(argv[3], vTgName, vumPaperTagVal); SavePubmed(argv[3], vTgName, vumPaperTagVal);
finish = clock();
cout << "write to MAT Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
} }

View File

@ -30,8 +30,9 @@
#endif #endif
#include <mat.h> #include <mat.h>
#include "gmm.h" #include "gmm.h"
#include "CommonLib/thread_pool.h" // #include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h" #include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h"
using namespace std; using namespace std;
using std::cout; using std::cout;
using std::vector; using std::vector;
@ -144,7 +145,9 @@ struct ThreadParam {
fs::path matFilePath; fs::path matFilePath;
fs::path outFilePath; fs::path outFilePath;
}; };
void ThreadProcessData(const ThreadParam& param) { //void ThreadProcessData(vector<ThreadParam>& vTP, long idx, int tid) {
void ThreadProcessData(ThreadParam& param) {
//const ThreadParam& param = vTP[idx];
const fs::path& matFilePath = param.matFilePath; const fs::path& matFilePath = param.matFilePath;
const fs::path& outFilePath = param.outFilePath; const fs::path& outFilePath = param.outFilePath;
double* hs = nullptr; double* hs = nullptr;
@ -192,10 +195,10 @@ int main(int argc, const char** argv) {
int numThread = 1; int numThread = 1;
if (argc >= 4) numThread = atoi(argv[4]); if (argc >= 4) numThread = atoi(argv[4]);
if (numThread < 1) numThread = 1; if (numThread < 1) numThread = 1;
ThreadPool thPool(numThread); //ThreadPool thPool(numThread);
clock_t begin, finish; clock_t begin, finish;
begin = clock(); begin = clock();
vector<ThreadParam> vTP;
/* 遍历所有的知识颗粒目录,逐一进行处理 */ /* 遍历所有的知识颗粒目录,逐一进行处理 */
for (auto& childDir : fs::directory_iterator(parrentDir)) { for (auto& childDir : fs::directory_iterator(parrentDir)) {
fs::path outFilePath = childDir / outFileName; fs::path outFilePath = childDir / outFileName;
@ -203,12 +206,14 @@ int main(int argc, const char** argv) {
const string& fileName = file.path().filename().string(); const string& fileName = file.path().filename().string();
auto rPos = fileName.rfind(hsMatSuffix); auto rPos = fileName.rfind(hsMatSuffix);
if (rPos != string::npos && fileName.size() - rPos == hsMatSuffix.size()) { if (rPos != string::npos && fileName.size() - rPos == hsMatSuffix.size()) {
ThreadParam tParam = { file, outFilePath }; //ThreadParam tParam = { file, outFilePath };
thPool.enqueue(ThreadProcessData, tParam); //thPool.enqueue(ThreadProcessData, tParam);
vTP.push_back({ file, outFilePath });
} }
} }
} }
thPool.~ThreadPool(); kt_for(numThread, ThreadProcessData, vTP);
//thPool.~ThreadPool();
finish = clock(); finish = clock();
cout << "GMM Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "GMM Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
return 0; return 0;