/*********************************************************************************************
  Description: Process the txt file exported from PubMed and store the result in a MAT file

  Copyright  : All rights reserved by ZheYuan.BJ

  Author     : Zhang Zhonghai

  Date       : 2023/09/18
***********************************************************************************************/
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include <mat.h>

#include "common.h"
#include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h"

using namespace std;

/* 将结果写入mat文件 */
|
||
/* 将数据写入mat文件中,用给定的名称命名 */
|
||
bool SavePubmed(const string& matPath,
|
||
const vector<string>& vTgName,
|
||
vector<unordered_map<string, string> >& vumPaperTagVal)
|
||
{
|
||
MATFile* pMatFile = matOpen(matPath.c_str(), "w"); //打开.mat文件
|
||
if (pMatFile == nullptr) {
|
||
cout << "filePath is error! " << matPath << endl;
|
||
return false;
|
||
}
|
||
|
||
vector<const char*> vTgChars;
|
||
for (auto& strTg : vTgName) {
|
||
vTgChars.push_back(strTg.c_str());
|
||
}
|
||
|
||
// 创建结构体数据
|
||
mxArray* mxStruct = mxCreateStructMatrix(1, 1, (int)vTgName.size(), vTgChars.data());
|
||
// 创建cell matrix
|
||
unordered_map<string, mxArray*> ummxCellMtx;
|
||
for (auto & tgName : vTgName) {
|
||
ummxCellMtx[tgName] = mxCreateCellMatrix(1, vumPaperTagVal.size());
|
||
}
|
||
|
||
// 遍历每一篇文章
|
||
for (int i = 0; i < vumPaperTagVal.size(); ++i) {
|
||
auto& umTagVal = vumPaperTagVal[i];
|
||
// 遍历文章的每一个tag
|
||
for (auto& tgName : vTgName) {
|
||
mxArray* mxStr = mxCreateString(umTagVal[tgName].c_str());
|
||
mxArray* pMxArr = ummxCellMtx[tgName];
|
||
mxSetCell(pMxArr, i, mxStr);
|
||
}
|
||
}
|
||
|
||
// 将cell matrix赋值给struct matrix
|
||
for (auto& tgName : vTgName) {
|
||
mxArray* pMxArr = ummxCellMtx[tgName];
|
||
mxSetField(mxStruct, 0, tgName.c_str(), pMxArr);
|
||
}
|
||
|
||
// 将结构体写入mat,并命名为Tx
|
||
matPutVariable(pMatFile, "Tx", mxStruct);
|
||
// 将abstract信息写入mat,并命名为abs1
|
||
matPutVariable(pMatFile, "abs1", ummxCellMtx["AB"]);
|
||
|
||
matClose(pMatFile);
|
||
|
||
return true;
|
||
}
|
||
/*
 * Work item describing one article for ThreadProcessArticle.
 *
 * All pointers are non-owning and must be non-null when the item is processed.
 * The member order is relied upon by brace initialisation at the call site.
 */
struct ThreadParam {
    std::unordered_map<std::string, std::string>* pumTagContent;    // output: tag name -> merged content of this article
    std::vector<std::string>* pvLineTag;                            // full tag of every parsed entry
    std::vector<std::string>* pvTgName;                             // tag names to pre-create in the output
    int paperStartIdx;                                              // first entry of this article (inclusive)
    int paperEndIdx;                                                // end of this article (exclusive)
    std::unordered_map<std::string, std::string>* pumFullTagToTag;  // full tag -> tag name
    std::vector<std::string>* pvStrPubmedTxt;                       // content of every parsed entry
};

/*
 * Process a single article.
 *
 * Every name in *pvTgName is first set to an empty string in *pumTagContent.
 * Then, for each entry in [paperStartIdx, paperEndIdx) whose full tag is a key
 * of *pumFullTagToTag, the entry content is appended to the string stored
 * under the mapped tag name. Entries with an unknown tag are ignored.
 *
 * The index range is clamped to the entries actually available, so a range
 * that runs past the end of the input vectors is truncated rather than read
 * out of bounds.
 */
void ThreadProcessArticle(ThreadParam& param) {
    std::unordered_map<std::string, std::string>& umTagContent = *param.pumTagContent;
    const std::vector<std::string>& vLineTag = *param.pvLineTag;
    const std::vector<std::string>& vTgName = *param.pvTgName;
    const std::unordered_map<std::string, std::string>& umFullTagToTag = *param.pumFullTagToTag;
    const std::vector<std::string>& vStrPubmedTxt = *param.pvStrPubmedTxt;

    // Give every tag an (empty) entry, so later code can index any tag name.
    for (const std::string& tgName : vTgName) {
        umTagContent[tgName].clear();
    }

    // Clamp [start, end) to the range valid for both input vectors.
    const std::size_t entryCount = std::min(vLineTag.size(), vStrPubmedTxt.size());
    const std::size_t first = param.paperStartIdx > 0 ? static_cast<std::size_t>(param.paperStartIdx) : 0;
    const std::size_t last = param.paperEndIdx > 0
        ? std::min(static_cast<std::size_t>(param.paperEndIdx), entryCount)
        : 0;

    for (std::size_t idx = first; idx < last; ++idx) {
        const auto tagItr = umFullTagToTag.find(vLineTag[idx]);
        if (tagItr == umFullTagToTag.end()) {
            continue;  // not a tag we were asked to extract
        }
        umTagContent[tagItr->second].append(vStrPubmedTxt[idx]);
    }
}

// 命令行参数示例
|
||
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12
|
||
/*
|
||
pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开
|
||
1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag
|
||
2. 读取pubmed txt文件, 对所有文献根据tag进行拆分,同一个文献中,合并同一个tag的所有内容,空格隔开
|
||
3. 去除一些冗余等内容, 将title和abstract合并,赋值给abstract
|
||
4. 将结果写入mat文件
|
||
*/
|
||
void ProcessPubmedTxt(int argc, const char** argv) {
|
||
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
|
||
//
|
||
if (argc < 4) {
|
||
cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl;
|
||
return;
|
||
}
|
||
clock_t begin, finish;
|
||
int rowNum, colNum;
|
||
vector<string> vTg;
|
||
vector<string> vTgName;
|
||
vector<unordered_map<string, string> > vumPaperTagVal;
|
||
unordered_map<string, string> umFullTagToTag; // 完整tag与tag的映射,如“PMID- ”:“PMID”
|
||
/* 读取pubmed tags */
|
||
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
|
||
/* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */
|
||
begin = clock();
|
||
vTgName = vTg;
|
||
for (int i = 0; i < vTg.size(); ++i) {
|
||
int pos = 0;
|
||
for (int j = 0; j < vTg[i].size(); ++j) {
|
||
if (vTg[i][j] != ' ' && vTg[i][j] != '-') { // 去掉tag中的空格和'-'字符,生成tag name
|
||
vTgName[i][pos++] = vTg[i][j];
|
||
}
|
||
}
|
||
vTgName[i].resize(pos);
|
||
umFullTagToTag[vTg[i]] = vTgName[i];
|
||
}
|
||
finish = clock();
|
||
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
/* 2. 读取pubmed txt文件,先读入后处理 */
|
||
ifstream ifsPubmedTxt(argv[2]);
|
||
vector<string> vStrPubmedTxt;
|
||
vector<string> vLineTag;
|
||
vector<int> vPaperStartIdx;
|
||
string blankTag = " "; // 5个空格
|
||
string strLine;
|
||
string fullTag;
|
||
int curPos = 0;
|
||
vPaperStartIdx.push_back(curPos); // 添加初始索引
|
||
const int FULL_TAG_LEN = 5;
|
||
begin = clock();
|
||
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
|
||
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
|
||
if (strLine.size() == 0) { // 新的paper
|
||
vPaperStartIdx.push_back(curPos);
|
||
continue;
|
||
}
|
||
fullTag = strLine.substr(0, 5);
|
||
if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的
|
||
string& lastTagConteng = vStrPubmedTxt.back();
|
||
lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格
|
||
}
|
||
else {
|
||
vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN));
|
||
vLineTag.push_back(fullTag);
|
||
curPos++;
|
||
}
|
||
}
|
||
vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置
|
||
finish = clock();
|
||
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
/* 处理每一篇文章 */
|
||
int numThread = 1;
|
||
if (argc >= 5) numThread = atoi(argv[4]);
|
||
if (numThread < 1) numThread = 1;
|
||
ThreadPool thPool(numThread);
|
||
vumPaperTagVal.resize(vPaperStartIdx.size()-1);
|
||
vector<thread> vT;
|
||
vector<ThreadParam> vTP(vPaperStartIdx.size() - 1);
|
||
begin = clock();
|
||
for (int i = 0; i < vTP.size(); ++i) {
|
||
vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
|
||
}
|
||
kt_for(numThread, ThreadProcessArticle, vTP);
|
||
finish = clock();
|
||
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
/* 去除没有摘要的文章 */
|
||
begin = clock();
|
||
const string abstractTag = "AB";
|
||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
||
if ((*itr)[abstractTag].size() == 0) {
|
||
itr = vumPaperTagVal.erase(itr);
|
||
}
|
||
else {
|
||
itr++;
|
||
}
|
||
}
|
||
finish = clock();
|
||
cout << "remove no AB Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
/* 根据PMID,去除冗余 */
|
||
begin = clock();
|
||
unordered_map<string, int> umPMID;
|
||
const string pmidTag = "PMID";
|
||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
||
if (umPMID.find((*itr)[pmidTag]) != umPMID.end()) {
|
||
// out << "duplicate " << (*itr)[pmidTag] << endl;
|
||
itr = vumPaperTagVal.erase(itr);
|
||
}
|
||
else {
|
||
umPMID[(*itr)[pmidTag]] = 1;
|
||
itr++;
|
||
}
|
||
}
|
||
finish = clock();
|
||
cout << "remove duplication Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
/* 将title和abstract合并,赋值给abstract */
|
||
begin = clock();
|
||
const string titleTag = "TI";
|
||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) {
|
||
string& abstractStr = (*itr)[abstractTag];
|
||
abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大
|
||
}
|
||
finish = clock();
|
||
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
// 关闭txt文件
|
||
ifsPubmedTxt.close();
|
||
|
||
/* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */
|
||
begin = clock();
|
||
SavePubmed(argv[3], vTgName, vumPaperTagVal);
|
||
finish = clock();
|
||
cout << "write to MAT Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
} |