解决了一个很诡异的bug:ThreadParam可能是由于命名的问题,导致vector内存错误

This commit is contained in:
zzh 2023-09-27 15:13:13 +08:00
parent 0c73318fb7
commit 215e1d3dea
1 changed file with 51 additions and 39 deletions

View File

@ -12,13 +12,16 @@
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include <unordered_map> #include <unordered_map>
#include <filesystem>
#include <mat.h> #include <mat.h>
#include "common.h" #include "common.h"
#include "CommonLib/thread_pool.h" #include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h" #include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h" #include "CommonLib/kthread.h"
namespace fs = std::filesystem;
using std::cout;
using std::vector;
using namespace std; using namespace std;
/* 将结果写入mat文件 */ /* 将结果写入mat文件 */
/* 将数据写入mat文件中用给定的名称命名 */ /* 将数据写入mat文件中用给定的名称命名 */
bool SavePubmed(const string& matPath, bool SavePubmed(const string& matPath,
@ -71,19 +74,17 @@ bool SavePubmed(const string& matPath,
return true; return true;
} }
/* 处理一篇文章 */ /* 处理一篇文章 */
struct ThreadParam { // 线程参数 struct ThreadParamPubmed { // 线程参数
unordered_map<string, string> *pumTagContent; unordered_map<string, string> *pumTagContent;
vector<string>* pvLineTag; vector<string> *pvLineTag;
vector<string>* pvTgName; vector<string> *pvTgName;
int paperStartIdx; int paperStartIdx;
int paperEndIdx; int paperEndIdx;
unordered_map<string, string>* pumFullTagToTag; unordered_map<string, string> *pumFullTagToTag;
vector<string>* pvStrPubmedTxt; vector<string> *pvStrPubmedTxt;
}; };
//void ThreadProcessArticle(vector<ThreadParam>& vTP, long idx, int tid) { void ThreadProcessArticle(ThreadParamPubmed& param) {
void ThreadProcessArticle(ThreadParam& param) {
//ThreadParam& param = vTP[idx];
unordered_map<string, string>& umTagContent = *param.pumTagContent; unordered_map<string, string>& umTagContent = *param.pumTagContent;
vector<string>& vLineTag = *param.pvLineTag; vector<string>& vLineTag = *param.pvLineTag;
vector<string>& vTgName = *param.pvTgName; vector<string>& vTgName = *param.pvTgName;
@ -108,7 +109,7 @@ void ThreadProcessArticle(ThreadParam& param) {
} }
// 命令行参数示例 // 命令行参数示例
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12 // ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\pubmed_files d:\pubmed_txt.mat 12
/* /*
pubmed txttagtag pubmed txttagtag
1. 读取pubmed tags, 去掉tags里的'-'和' '字符得到纯净的tag 1. 读取pubmed tags, 去掉tags里的'-'和' '字符得到纯净的tag
@ -117,10 +118,10 @@ void ThreadProcessArticle(ThreadParam& param) {
4. mat 4. mat
*/ */
void ProcessPubmedTxt(int argc, const char** argv) { void ProcessPubmedTxt(int argc, const char** argv) {
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件 // argv 1.pubmed tag.mat文件; 2.pubmed txt文件父目录; 3.pubmed out.mat输出文件; 4.Thread number
// //
if (argc < 4) { if (argc < 4) {
cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl; cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed txt parent dir; 3. pubmed out.mat; [4. thread num])!" << endl;
return; return;
} }
clock_t begin, finish; clock_t begin, finish;
@ -128,7 +129,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
vector<string> vTg; vector<string> vTg;
vector<string> vTgName; vector<string> vTgName;
vector<unordered_map<string, string> > vumPaperTagVal; vector<unordered_map<string, string> > vumPaperTagVal;
unordered_map<string, string> umFullTagToTag; // 完整tag与tag的映射如“PMID- ”“PMID” unordered_map<string, string> umFullTagToTag; // 完整tag与tag的映射如“PMID- ”:“PMID”
/* 读取pubmed tags */ /* 读取pubmed tags */
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum); ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
/* 1. 去掉tags里的'-'和' '字符得到纯净的tag */ /* 1. 去掉tags里的'-'和' '字符得到纯净的tag */
@ -148,7 +149,8 @@ void ProcessPubmedTxt(int argc, const char** argv) {
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
/* 2. 读取pubmed txt文件先读入后处理 */ /* 2. 读取pubmed txt文件先读入后处理 */
ifstream ifsPubmedTxt(argv[2]); string parentDir(argv[2]);
string txtSuffix(".txt");
vector<string> vStrPubmedTxt; vector<string> vStrPubmedTxt;
vector<string> vLineTag; vector<string> vLineTag;
vector<int> vPaperStartIdx; vector<int> vPaperStartIdx;
@ -159,39 +161,52 @@ void ProcessPubmedTxt(int argc, const char** argv) {
vPaperStartIdx.push_back(curPos); // 添加初始索引 vPaperStartIdx.push_back(curPos); // 添加初始索引
const int FULL_TAG_LEN = 5; const int FULL_TAG_LEN = 5;
begin = clock(); begin = clock();
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符 for (auto &file : fs::directory_iterator(parentDir)) { // 遍历目录里的每一个txt文件
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格 const string &fileName = file.path().filename().string();
if (strLine.size() == 0) { // 新的paper auto rPos = fileName.rfind(txtSuffix);
vPaperStartIdx.push_back(curPos); if (rPos != string::npos && fileName.size() - rPos == txtSuffix.size()){
continue; ifstream ifsPubmedTxt(file.path().string());
} while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
fullTag = strLine.substr(0, 5); while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的 if (strLine.size() == 0) { // 新的paper
string& lastTagConteng = vStrPubmedTxt.back(); vPaperStartIdx.push_back(curPos);
lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格 continue;
} }
else { fullTag = strLine.substr(0, 5);
vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN)); if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的
vLineTag.push_back(fullTag); string& lastTagConteng = vStrPubmedTxt.back();
curPos++; lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格
}
else {
vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN));
vLineTag.push_back(fullTag);
curPos++;
}
}
vPaperStartIdx.push_back(curPos); // 比文章多1最后一个记录结束位置
ifsPubmedTxt.close();
} }
} }
vPaperStartIdx.push_back(curPos); // 比文章多1最后一个记录结束位置
finish = clock(); finish = clock();
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
cout << "paper num: " << vPaperStartIdx.size() - 1 << endl;
/* 处理每一篇文章 */ /* 处理每一篇文章 */
int numThread = 1; int numThread = 1;
if (argc >= 5) numThread = atoi(argv[4]); if (argc >= 5) numThread = atoi(argv[4]);
if (numThread < 1) numThread = 1; if (numThread < 1) numThread = 1;
ThreadPool thPool(numThread); // ThreadPool thPool(numThread);
vumPaperTagVal.resize(vPaperStartIdx.size()-1); vumPaperTagVal.resize(vPaperStartIdx.size()-1);
vector<thread> vT; vector<ThreadParamPubmed> vTP(vumPaperTagVal.size());
vector<ThreadParam> vTP(vPaperStartIdx.size() - 1);
begin = clock(); begin = clock();
for (int i = 0; i < vTP.size(); ++i) { for (int i = 0; i < vumPaperTagVal.size(); ++i) {
vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt }; vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
// ThreadParamPubmed tp = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
// ThreadProcessArticle(tp);
// thPool.enqueue(ThreadProcessArticle, tp);
} }
// thPool.~ThreadPool();
kt_for(numThread, ThreadProcessArticle, vTP); kt_for(numThread, ThreadProcessArticle, vTP);
finish = clock(); finish = clock();
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
@ -237,9 +252,6 @@ void ProcessPubmedTxt(int argc, const char** argv) {
finish = clock(); finish = clock();
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
// 关闭txt文件
ifsPubmedTxt.close();
/* 将处理后的数据写入mat文件mat中的变量名称分别为Tx和abs1 */ /* 将处理后的数据写入mat文件mat中的变量名称分别为Tx和abs1 */
begin = clock(); begin = clock();
SavePubmed(argv[3], vTgName, vumPaperTagVal); SavePubmed(argv[3], vTgName, vumPaperTagVal);