添加了kthread,并适配c++和windows
This commit is contained in:
parent
31f0382cc1
commit
0c73318fb7
|
|
@ -112,6 +112,7 @@
|
|||
</Text>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="kthread.h" />
|
||||
<ClInclude Include="matlab_io.h" />
|
||||
<ClInclude Include="thread_pool.h" />
|
||||
</ItemGroup>
|
||||
|
|
|
|||
|
|
@ -28,6 +28,9 @@
|
|||
<ClInclude Include="thread_pool.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="kthread.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="matlab_io.cpp">
|
||||
|
|
|
|||
|
|
@ -0,0 +1,145 @@
|
|||
#ifndef KTHREAD_H
|
||||
#define KTHREAD_H
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
|
||||
using std::atomic;
|
||||
using std::thread;
|
||||
using std::vector;
|
||||
|
||||
/************
|
||||
* kt_for() *
|
||||
************/
|
||||
|
||||
/// Signature of a kt_for() callback that receives the whole container, the index of the
/// element to process, and the zero-based id of the worker making the call.
template <typename T>
using FuncType3Arg = void (*)(std::vector<T>&, long, int);

/// Signature of a kt_for() callback that receives only the element to process.
template <typename T>
using FuncType1Arg = void (*)(T&);

template <class T>
struct kt_for_t;

/// State belonging to a single worker.
template <typename T>
struct ktf_worker_t
{
    kt_for_t<T>* t = nullptr;  // job description shared by all workers
    std::atomic<long> i{0};    // next index this worker claims; advanced by n_threads per claim
};

/// Description of one kt_for() run, shared by every worker of that run.
template <typename T>
struct kt_for_t
{
    int n_threads = 0;  // number of workers; also the stride between indices of one worker
    long n = 0;         // number of elements in *data

    ktf_worker_t<T>* w = nullptr;        // array of n_threads workers
    FuncType1Arg<T> func1Arg = nullptr;  // set only by the one-argument kt_for() overload
    FuncType3Arg<T> func3Arg = nullptr;  // set only by the three-argument kt_for() overload
    std::vector<T>* data = nullptr;      // elements being processed
};

/// Claims one index from the worker whose counter is currently the smallest.
///
/// @param t  job whose workers are inspected
/// @return   the claimed index, or -1 when the claimed value is past the end of the data
///           (or when there is no worker to steal from)
template <class T>
static inline long steal_work(kt_for_t<T>* t)
{
    int min_i = -1;
    long min = LONG_MAX;
    for (int i = 0; i < t->n_threads; ++i)
    {
        // Load once so the comparison and the recorded minimum use the same value.
        const long cur = t->w[i].i.load();
        if (min > cur)
        {
            min = cur;
            min_i = i;
        }
    }
    if (min_i < 0)
        return -1;  // no candidate found: never index w[-1]
    const long k = t->w[min_i].i.fetch_add(t->n_threads);
    return k >= t->n ? -1 : k;
}

/// Shared worker loop: first consume this worker's own strided indices, then keep
/// stealing indices from the other workers until steal_work() reports none are left.
///
/// @param w       the worker running the loop
/// @param invoke  callable taking the claimed index (long)
template <class T, class Invoke>
static inline void ktf_run_worker(ktf_worker_t<T>* w, Invoke invoke)
{
    kt_for_t<T>* const job = w->t;
    for (;;)
    {
        const long own = w->i.fetch_add(job->n_threads);
        if (own >= job->n)
            break;
        invoke(own);
    }
    long stolen;
    while ((stolen = steal_work<T>(job)) >= 0)
        invoke(stolen);
}

/// Thread entry point for the one-argument callback form.
/// @param data  pointer to the ktf_worker_t<T> this thread runs
template <class T>
static void ktf_worker_1_arg(void* data)
{
    ktf_worker_t<T>* const w = static_cast<ktf_worker_t<T>*>(data);
    kt_for_t<T>* const job = w->t;
    ktf_run_worker(w, [job](long i) { job->func1Arg((*job->data)[i]); });
}

/// Thread entry point for the three-argument callback form.
/// @param data  pointer to the ktf_worker_t<T> this thread runs
template <class T>
static void ktf_worker_3_arg(void* data)
{
    ktf_worker_t<T>* const w = static_cast<ktf_worker_t<T>*>(data);
    kt_for_t<T>* const job = w->t;
    // The worker id is the worker's position in the shared worker array.
    const int tid = static_cast<int>(w - job->w);
    ktf_run_worker(w, [job, tid](long i) { job->func3Arg(*job->data, i, tid); });
}

/// Creates the workers for @p t, runs @p worker on one thread per worker and joins them all.
///
/// The worker array lives in a std::vector, so every atomic counter is properly constructed
/// and no stack allocation proportional to n_threads is needed. If starting a thread throws,
/// the threads already started are joined before the exception is rethrown, so no joinable
/// std::thread is ever destroyed.
///
/// @param t       job with func*, data, n_threads and n already filled in
/// @param worker  thread entry point (ktf_worker_1_arg<T> or ktf_worker_3_arg<T>)
template <typename T>
static inline void ktf_launch(kt_for_t<T>& t, void (*worker)(void*))
{
    const std::size_t count = static_cast<std::size_t>(t.n_threads);
    std::vector<ktf_worker_t<T>> workers(count);
    t.w = workers.data();
    for (int k = 0; k < t.n_threads; ++k)
    {
        workers[k].t = &t;
        workers[k].i.store(k);  // worker k starts at index k and strides by n_threads
    }

    std::vector<std::thread> vThread;
    vThread.reserve(count);
    try
    {
        for (std::size_t k = 0; k < count; ++k)
            vThread.emplace_back(worker, static_cast<void*>(&workers[k]));
    }
    catch (...)
    {
        for (std::thread& th : vThread)
            th.join();
        throw;
    }
    for (std::thread& th : vThread)
        th.join();
}

/// Calls func(vData, index, worker_id) once for every index of @p vData.
///
/// With n_threads > 1 the indices are distributed over n_threads threads and the call returns
/// after all of them have been joined; otherwise the elements are processed in order on the
/// calling thread with worker id 0.
///
/// @param n_threads  number of worker threads to use
/// @param func       callback invoked for each index; must be safe to call concurrently
/// @param vData      elements to process
template <typename T>
void kt_for(int n_threads, FuncType3Arg<T> func, std::vector<T>& vData)
{
    const long n = static_cast<long>(vData.size());
    if (n_threads > 1)
    {
        kt_for_t<T> t;
        t.func3Arg = func;
        t.data = &vData;
        t.n_threads = n_threads;
        t.n = n;
        ktf_launch<T>(t, &ktf_worker_3_arg<T>);
    }
    else
    {
        for (long j = 0; j < n; ++j)
            func(vData, j, 0);
    }
}

/// Calls func(element) once for every element of @p vData.
///
/// With n_threads > 1 the elements are distributed over n_threads threads and the call returns
/// after all of them have been joined; otherwise the elements are processed in order on the
/// calling thread.
///
/// @param n_threads  number of worker threads to use
/// @param func       callback invoked for each element; must be safe to call concurrently
/// @param vData      elements to process
template <typename T>
void kt_for(int n_threads, FuncType1Arg<T> func, std::vector<T>& vData)
{
    const long n = static_cast<long>(vData.size());
    if (n_threads > 1)
    {
        kt_for_t<T> t;
        t.func1Arg = func;
        t.data = &vData;
        t.n_threads = n_threads;
        t.n = n;
        ktf_launch<T>(t, &ktf_worker_1_arg<T>);
    }
    else
    {
        for (long j = 0; j < n; ++j)
            func(vData[j]);
    }
}
|
||||
|
||||
#endif
|
||||
|
|
@ -33,6 +33,7 @@
|
|||
#include "common.h"
|
||||
#include "CommonLib/thread_pool.h"
|
||||
#include "CommonLib/matlab_io.h"
|
||||
#include "CommonLib/kthread.h"
|
||||
using namespace std;
|
||||
using std::cout;
|
||||
using std::vector;
|
||||
|
|
@ -106,7 +107,6 @@ bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vect
|
|||
// 读取ds字符串
|
||||
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
|
||||
OUTER_FOR_BEGIN
|
||||
// cout << childRowNum << '\t' << childColNum << endl;
|
||||
vvDs.push_back(vector<string>());
|
||||
vvDs.back().resize(childRowNum * childColNum);
|
||||
INNTER_FOR_BEGIN
|
||||
|
|
@ -144,7 +144,7 @@ struct ThreadParam { //
|
|||
fs::path outFilePath;
|
||||
vector<unordered_set<string> >* pvusWord;
|
||||
};
|
||||
void ThreadProcessData(const ThreadParam& param) {
|
||||
void ThreadProcessData(ThreadParam& param) {
|
||||
const fs::path& matFilePath = param.matFilePath;
|
||||
const fs::path& outFilePath = param.outFilePath;
|
||||
vector <unordered_set<string> >& vusWord = *param.pvusWord;
|
||||
|
|
@ -156,11 +156,8 @@ void ThreadProcessData(const ThreadParam& param) {
|
|||
vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵)
|
||||
vector<vector<double> > vvFr; // 词汇对应的频率
|
||||
|
||||
// cout << matFilePath.string() << endl;
|
||||
// 读取G结构体中的ds和fr信息
|
||||
ReadInfoFromMat(matFilePath.string(), vvDs, vvFr);
|
||||
// res.vvEntropy.push_back(vvFr[0]);
|
||||
// cout << vvDs.size() << '\t' << vvDs[0].size() << endl;
|
||||
const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量
|
||||
const int numGroup = vvDs.size(); // ds包含的组数
|
||||
hs.resize(numGroup * numLiterature);
|
||||
|
|
@ -176,9 +173,6 @@ void ThreadProcessData(const ThreadParam& param) {
|
|||
for (int j = 0; j < numWord; ++j) {
|
||||
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
|
||||
vX[i][j] = 1;
|
||||
if (groupIdx == 1 && i == 2) {
|
||||
// cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -216,8 +210,8 @@ void ThreadProcessData(const ThreadParam& param) {
|
|||
}
|
||||
/* 将结果(hs和hr)写入每个知识颗粒的目录内 */
|
||||
MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
|
||||
SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature);
|
||||
SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup);
|
||||
SaveMtxDouble(hs.data(), pMatFile, "hs", numGroup, numLiterature);
|
||||
SaveMtxDouble(hr.data(), pMatFile, "hr", numLiterature, numGroup);
|
||||
matClose(pMatFile);
|
||||
}
|
||||
|
||||
|
|
@ -228,7 +222,7 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
// 1. 知识颗粒的父目录名称
|
||||
// 2. 包含高频词汇信息的mat文件的后缀
|
||||
// 3. 包含处理后的pubmed文献信息的mat文件路径
|
||||
// 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件)
|
||||
// 4. 存放输出结果的mat文件名(每个知识颗粒目录中生成一个结果文件)
|
||||
// 5. 线程数量(可选)
|
||||
if (argc < 5) {
|
||||
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
|
||||
|
|
@ -241,7 +235,6 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
int numThread = 1;
|
||||
if (argc >= 5) numThread = atoi(argv[5]);
|
||||
if (numThread < 1) numThread = 1;
|
||||
// cout << "thread num: " << numThread << endl;
|
||||
|
||||
/* 读入处理后的pubmed文献信息的mat文件,只读入摘要信息,即变量abs1 */
|
||||
vector<string> vAbstract;
|
||||
|
|
@ -281,11 +274,13 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
for (auto& word : vWord) {
|
||||
string upWord(word);
|
||||
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
|
||||
// cout << upWord << endl;
|
||||
vusAbsWord[i].insert(upWord);
|
||||
}
|
||||
}
|
||||
finish = clock();
|
||||
cout << "read abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
/* 将分割结果写入mat文件 */
|
||||
begin = clock();
|
||||
if (argc >= 6) {
|
||||
MATFile* pMatFile = matOpen(argv[6], "w");
|
||||
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
|
||||
|
|
@ -297,21 +292,16 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
}
|
||||
mxSetCell(pCellMtx, i, pChildCellMtx);
|
||||
}
|
||||
matPutVariable(pMatFile, "wd1", pCellMtx);
|
||||
matPutVariable(pMatFile, "wd", pCellMtx);
|
||||
matClose(pMatFile);
|
||||
mxDestroyArray(pCellMtx);
|
||||
}
|
||||
finish = clock();
|
||||
cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
//auto & vTest = vvWordMtx[0];
|
||||
//cout << vTest.size() << endl;
|
||||
//for (auto& str : vTest) cout << str << endl;
|
||||
|
||||
cout << "write abstract word time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
||||
begin = clock();
|
||||
ThreadPool thPool(numThread);
|
||||
// ThreadPool thPool(24);
|
||||
//ThreadPool thPool(numThread);
|
||||
// 查看知识颗粒数量
|
||||
int numKnowledgeParticle = 0;
|
||||
FOREACH_PARTICLE_START
|
||||
|
|
@ -319,27 +309,21 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
FOREACH_PARTICLE_END
|
||||
|
||||
// 遍历每个知识颗粒,逐一进行处理
|
||||
vector<ThreadParam> vTP;
|
||||
for (int round = 0; round < 1; ++round) { // 测试用
|
||||
int i = 0;
|
||||
FOREACH_PARTICLE_START
|
||||
ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
|
||||
thPool.enqueue(ThreadProcessData, tParam);
|
||||
//ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
|
||||
//thPool.enqueue(ThreadProcessData, tParam);
|
||||
vTP.push_back({ file, childDir / outFileName, &vusAbsWord });
|
||||
i++;
|
||||
FOREACH_PARTICLE_END
|
||||
}
|
||||
kt_for(numThread, ThreadProcessData, vTP);
|
||||
|
||||
// synchronize
|
||||
thPool.~ThreadPool();
|
||||
//thPool.~ThreadPool();
|
||||
finish = clock();
|
||||
|
||||
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
//ofstream ofs("test_out.txt");
|
||||
//for (auto& item : vEntropyResult) {
|
||||
// auto& vvEntropy = item.vvEntropy;
|
||||
// auto& vVal = vvEntropy[0];
|
||||
// for (auto& val : vVal) ofs << val << ' ';
|
||||
// ofs << endl;
|
||||
//}
|
||||
//ofs.close();
|
||||
}
|
||||
|
|
@ -27,6 +27,7 @@ int main(int argc, const char** argv) {
|
|||
}
|
||||
else if (string(argv[1]) == "CalcEntropy") {
|
||||
/* 计算信息熵 */
|
||||
cout << "CalcEntropy" << endl;
|
||||
CalcEntropy(argc - 1, argv + 1);
|
||||
}
|
||||
finish = clock();
|
||||
|
|
|
|||
|
|
@ -14,7 +14,9 @@
|
|||
#include <unordered_map>
|
||||
#include <mat.h>
|
||||
#include "common.h"
|
||||
#include "CommonLib/thread_pool.h"
|
||||
#include "CommonLib/matlab_io.h"
|
||||
#include "CommonLib/kthread.h"
|
||||
using namespace std;
|
||||
|
||||
/* 将结果写入mat文件 */
|
||||
|
|
@ -35,7 +37,7 @@ bool SavePubmed(const string& matPath,
|
|||
}
|
||||
|
||||
// 创建结构体数据
|
||||
mxArray* mxStruct = mxCreateStructMatrix(1, 1, vTgName.size(), vTgChars.data());
|
||||
mxArray* mxStruct = mxCreateStructMatrix(1, 1, (int)vTgName.size(), vTgChars.data());
|
||||
// 创建cell matrix
|
||||
unordered_map<string, mxArray*> ummxCellMtx;
|
||||
for (auto & tgName : vTgName) {
|
||||
|
|
@ -68,9 +70,45 @@ bool SavePubmed(const string& matPath,
|
|||
|
||||
return true;
|
||||
}
|
||||
/* 处理一篇文章 */
|
||||
/// Arguments for processing one article on a worker thread.
/// Every pointer is non-owning. Members keep their original order so existing positional
/// brace-initialization keeps working; the defaults make a default-constructed instance
/// well-defined instead of holding indeterminate pointers and indices.
struct ThreadParam {
    std::unordered_map<std::string, std::string>* pumTagContent = nullptr;    // output: tag name -> text collected for this article
    std::vector<std::string>* pvLineTag = nullptr;                            // full tag of every input line
    std::vector<std::string>* pvTgName = nullptr;                             // tag names that get an entry in the output
    int paperStartIdx = 0;                                                    // first line index of the article (inclusive)
    int paperEndIdx = 0;                                                      // end line index of the article (exclusive)
    std::unordered_map<std::string, std::string>* pumFullTagToTag = nullptr;  // full tag -> tag name
    std::vector<std::string>* pvStrPubmedTxt = nullptr;                       // text of every input line
};
|
||||
|
||||
//void ThreadProcessArticle(vector<ThreadParam>& vTP, long idx, int tid) {
|
||||
/* Process one article: reset the content of every known tag, then append the text of each
   line in [paperStartIdx, paperEndIdx) to the content of the tag that line belongs to.
   Lines whose full tag is not present in the full-tag map are ignored. */
void ThreadProcessArticle(ThreadParam& param) {
    auto& contentByTag = *param.pumTagContent;
    auto& lineTags = *param.pvLineTag;
    auto& tagNames = *param.pvTgName;
    auto& fullTagToTag = *param.pumFullTagToTag;
    auto& lineTexts = *param.pvStrPubmedTxt;

    // Give every tag an empty string to start from.
    for (const auto& tagName : tagNames) {
        contentByTag[tagName] = "";
    }

    // Walk the lines of this article and collect their text under the matching tag.
    const int lastLine = param.paperEndIdx;
    for (int line = param.paperStartIdx; line < lastLine; ++line) {
        const auto found = fullTagToTag.find(lineTags[line]);
        if (found == fullTagToTag.end()) {
            continue;  // unknown tag: skip this line
        }
        contentByTag[found->second].append(lineTexts[line]);
    }
}
|
||||
|
||||
// 命令行参数示例
|
||||
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat
|
||||
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12
|
||||
/*
|
||||
pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开
|
||||
1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag
|
||||
|
|
@ -81,11 +119,11 @@ bool SavePubmed(const string& matPath,
|
|||
void ProcessPubmedTxt(int argc, const char** argv) {
|
||||
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
|
||||
//
|
||||
if (argc != 4) {
|
||||
cout << "This program should take 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat)!" << endl;
|
||||
if (argc < 4) {
|
||||
cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
clock_t begin, finish;
|
||||
int rowNum, colNum;
|
||||
vector<string> vTg;
|
||||
vector<string> vTgName;
|
||||
|
|
@ -94,6 +132,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
|||
/* 读取pubmed tags */
|
||||
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
|
||||
/* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */
|
||||
begin = clock();
|
||||
vTgName = vTg;
|
||||
for (int i = 0; i < vTg.size(); ++i) {
|
||||
int pos = 0;
|
||||
|
|
@ -105,6 +144,8 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
|||
vTgName[i].resize(pos);
|
||||
umFullTagToTag[vTg[i]] = vTgName[i];
|
||||
}
|
||||
finish = clock();
|
||||
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
/* 2. 读取pubmed txt文件,先读入后处理 */
|
||||
ifstream ifsPubmedTxt(argv[2]);
|
||||
|
|
@ -117,6 +158,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
|||
int curPos = 0;
|
||||
vPaperStartIdx.push_back(curPos); // 添加初始索引
|
||||
const int FULL_TAG_LEN = 5;
|
||||
begin = clock();
|
||||
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
|
||||
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
|
||||
if (strLine.size() == 0) { // 新的paper
|
||||
|
|
@ -135,31 +177,27 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
|||
}
|
||||
}
|
||||
vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置
|
||||
finish = clock();
|
||||
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
/* 处理每一篇文章 */
|
||||
for (int i = 0; i < vPaperStartIdx.size() - 1; ++i) {
|
||||
int startIdx = vPaperStartIdx[i];
|
||||
int endIdx = vPaperStartIdx[i + 1];
|
||||
unordered_map<string, string> umTagContent;
|
||||
|
||||
for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) {
|
||||
umTagContent[vTgName[tgIdx]] = ""; // 对每一个tag,设置一个新的string
|
||||
}
|
||||
for (int idx = startIdx; idx < endIdx; ++idx) { // 遍历当前文章的每一个tag内容
|
||||
string& fullTag = vLineTag[idx];
|
||||
auto tagItr = umFullTagToTag.find(fullTag);
|
||||
if (tagItr != umFullTagToTag.end()) { // 找到tag了
|
||||
const string& tag = tagItr->second;
|
||||
string& tagContent = umTagContent[tag];
|
||||
tagContent.append(vStrPubmedTxt[idx]);
|
||||
}
|
||||
}
|
||||
vumPaperTagVal.push_back(umTagContent);
|
||||
int numThread = 1;
|
||||
if (argc >= 5) numThread = atoi(argv[4]);
|
||||
if (numThread < 1) numThread = 1;
|
||||
ThreadPool thPool(numThread);
|
||||
vumPaperTagVal.resize(vPaperStartIdx.size()-1);
|
||||
vector<thread> vT;
|
||||
vector<ThreadParam> vTP(vPaperStartIdx.size() - 1);
|
||||
begin = clock();
|
||||
for (int i = 0; i < vTP.size(); ++i) {
|
||||
vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
|
||||
}
|
||||
|
||||
// cout << "文件个数:" << vumPaperTagVal.size() << endl;
|
||||
kt_for(numThread, ThreadProcessArticle, vTP);
|
||||
finish = clock();
|
||||
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
/* 去除没有摘要的文章 */
|
||||
begin = clock();
|
||||
const string abstractTag = "AB";
|
||||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
||||
if ((*itr)[abstractTag].size() == 0) {
|
||||
|
|
@ -169,8 +207,11 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
|||
itr++;
|
||||
}
|
||||
}
|
||||
finish = clock();
|
||||
cout << "remove no AB Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
/* 根据PMID,去除冗余 */
|
||||
begin = clock();
|
||||
unordered_map<string, int> umPMID;
|
||||
const string pmidTag = "PMID";
|
||||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
||||
|
|
@ -183,16 +224,25 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
|||
itr++;
|
||||
}
|
||||
}
|
||||
finish = clock();
|
||||
cout << "remove duplication Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
/* 将title和abstract合并,赋值给abstract */
|
||||
begin = clock();
|
||||
const string titleTag = "TI";
|
||||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) {
|
||||
string& abstractStr = (*itr)[abstractTag];
|
||||
abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大
|
||||
}
|
||||
finish = clock();
|
||||
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
|
||||
// 关闭txt文件
|
||||
ifsPubmedTxt.close();
|
||||
|
||||
/* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */
|
||||
begin = clock();
|
||||
SavePubmed(argv[3], vTgName, vumPaperTagVal);
|
||||
finish = clock();
|
||||
cout << "write to MAT Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
}
|
||||
19
GMM/main.cpp
19
GMM/main.cpp
|
|
@ -30,8 +30,9 @@
|
|||
#endif
|
||||
#include <mat.h>
|
||||
#include "gmm.h"
|
||||
#include "CommonLib/thread_pool.h"
|
||||
// #include "CommonLib/thread_pool.h"
|
||||
#include "CommonLib/matlab_io.h"
|
||||
#include "CommonLib/kthread.h"
|
||||
using namespace std;
|
||||
using std::cout;
|
||||
using std::vector;
|
||||
|
|
@ -144,7 +145,9 @@ struct ThreadParam {
|
|||
fs::path matFilePath;
|
||||
fs::path outFilePath;
|
||||
};
|
||||
void ThreadProcessData(const ThreadParam& param) {
|
||||
//void ThreadProcessData(vector<ThreadParam>& vTP, long idx, int tid) {
|
||||
void ThreadProcessData(ThreadParam& param) {
|
||||
//const ThreadParam& param = vTP[idx];
|
||||
const fs::path& matFilePath = param.matFilePath;
|
||||
const fs::path& outFilePath = param.outFilePath;
|
||||
double* hs = nullptr;
|
||||
|
|
@ -192,10 +195,10 @@ int main(int argc, const char** argv) {
|
|||
int numThread = 1;
|
||||
if (argc >= 4) numThread = atoi(argv[4]);
|
||||
if (numThread < 1) numThread = 1;
|
||||
ThreadPool thPool(numThread);
|
||||
//ThreadPool thPool(numThread);
|
||||
clock_t begin, finish;
|
||||
begin = clock();
|
||||
|
||||
vector<ThreadParam> vTP;
|
||||
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
||||
for (auto& childDir : fs::directory_iterator(parrentDir)) {
|
||||
fs::path outFilePath = childDir / outFileName;
|
||||
|
|
@ -203,12 +206,14 @@ int main(int argc, const char** argv) {
|
|||
const string& fileName = file.path().filename().string();
|
||||
auto rPos = fileName.rfind(hsMatSuffix);
|
||||
if (rPos != string::npos && fileName.size() - rPos == hsMatSuffix.size()) {
|
||||
ThreadParam tParam = { file, outFilePath };
|
||||
thPool.enqueue(ThreadProcessData, tParam);
|
||||
//ThreadParam tParam = { file, outFilePath };
|
||||
//thPool.enqueue(ThreadProcessData, tParam);
|
||||
vTP.push_back({ file, outFilePath });
|
||||
}
|
||||
}
|
||||
}
|
||||
thPool.~ThreadPool();
|
||||
kt_for(numThread, ThreadProcessData, vTP);
|
||||
//thPool.~ThreadPool();
|
||||
finish = clock();
|
||||
cout << "GMM Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
return 0;
|
||||
|
|
|
|||
Loading…
Reference in New Issue