twirls/CppRun/process_pubmed_txt.cpp

181 lines
6.2 KiB
C++
Raw Normal View History

/*********************************************************************************************
Description: Process a PubMed txt export and save the result to a mat file
Copyright : All rights reserved by ZheYuan.BJ
Author : Zhang Zhonghai
Date : 2023/09/18
***********************************************************************************************/
#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <mat.h>
#include "common.h"
#include "CommonLib/matlab_io.h"
using namespace std;
/* 将结果写入mat文件 */
/* 将数据写入mat文件中用给定的名称命名 */
bool SavePubmed(const string& matPath,
const vector<string> &vTgName,
const vector<unordered_map<string, string> >& vumPaperTagVal)
{
MATFile* pMatFile = matOpen(matPath.c_str(), "r"); //打开.mat文件
if (pMatFile == nullptr) {
cerr << "filePath is error! " << matPath << endl;
return false;
}
vector<const char*> vTgChars;
for (auto strTg : vTgName) {
vTgChars.push_back(strTg.c_str());
}
// 创建结构体数据
mxArray* mxStruct = mxCreateStructMatrix(1, 1, vTgName.size(), vTgChars.data());
return true;
}
/*
pubmed txttagtag
1. pubmed tags, tags'-'' 'tag
2. pubmed txt, tagtag
3. titleabstractabstract
4. mat
*/
void ProcessPubmedTxt(int argc, const char** argv) {
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
//
// cout << argc << '\t' << argv[1] << endl;
int rowNum, colNum;
vector<string> vTg;
vector<string> vTgName;
// unordered_map<string, vector<string> > umTagVal;
vector<unordered_map<string, string> > vumPaperTagVal;
unordered_map<string, string> umFullTagToTag; // 完整tag与tag的映射如“PMID- ”“PMID”
/* 读取pubmed tags */
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
/* 1. 去掉tags里的'-'和' '字符得到纯净的tag */
vTgName = vTg;
for (int i = 0; i < vTg.size(); ++i) {
// cout << vTg[i] << '\t';
int pos = 0;
for (int j = 0; j < vTg[i].size(); ++j) {
if (vTg[i][j] != ' ' && vTg[i][j] != '-') { // 去掉tag中的空格和'-'字符生成tag name
vTgName[i][pos++] = vTg[i][j];
}
}
vTgName[i].resize(pos);
umFullTagToTag[vTg[i]] = vTgName[i];
// cout << vTg[i].size() << '\t' << vTgName[i].size() << endl;
}
/* 2. 读取pubmed txt文件先读入后处理 */
ifstream ifsPubmedTxt(argv[2]);
vector<string> vStrPubmedTxt;
vector<string> vLineTag;
vector<int> vPaperStartIdx;
string blankTag = " "; // 5个空格
string strLine;
string fullTag;
int curPos = 0;
vPaperStartIdx.push_back(curPos); // 添加初始索引
const int FULL_TAG_LEN = 5;
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
if (strLine.size() == 0) { // 新的paper
vPaperStartIdx.push_back(curPos);
continue;
}
fullTag = strLine.substr(0, 5);
if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的
string& lastTagConteng = vStrPubmedTxt.back();
lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格
}
else {
vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN));
vLineTag.push_back(fullTag);
curPos++;
}
// cout << strLine << endl;
}
// cout << vStrPubmedTxt.size() << endl;
vPaperStartIdx.push_back(curPos); // 比文章多1最后一个记录结束位置
/* 处理每一篇文章 */
ofstream testOfs("pubmed_test-1.txt");
for (int i = 0; i < vPaperStartIdx.size() - 1; ++i) {
int startIdx = vPaperStartIdx[i];
int endIdx = vPaperStartIdx[i + 1];
unordered_map<string, string> umTagContent;
for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) {
umTagContent[vTgName[tgIdx]] = ""; // 对每一个tag设置一个新的string
}
for (int idx = startIdx; idx < endIdx; ++idx) { // 遍历当前文章的每一个tag内容
string& fullTag = vLineTag[idx];
auto tagItr = umFullTagToTag.find(fullTag);
if (tagItr != umFullTagToTag.end()) { // 找到tag了
const string& tag = tagItr->second;
string& tagContent = umTagContent[tag];
tagContent.append(vStrPubmedTxt[idx]);
}
}
vumPaperTagVal.push_back(umTagContent);
}
cout << "文件个数:" << vumPaperTagVal.size() << endl;
/* 去除没有摘要的文章 */
const string abstractTag = "AB";
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
if ((*itr)[abstractTag].size() == 0) {
itr = vumPaperTagVal.erase(itr);
}
else {
itr++;
}
}
/* 根据PMID去除冗余 */
unordered_map<string, int> umPMID;
const string pmidTag = "PMID";
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
if (umPMID.find((*itr)[pmidTag]) != umPMID.end()) {
cout << "duplicate " << (*itr)[pmidTag] << endl;
itr = vumPaperTagVal.erase(itr);
}
else {
umPMID[(*itr)[pmidTag]] = 1;
itr++;
}
}
/* 将title和abstract合并赋值给abstract */
const string titleTag = "TI";
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) {
string& abstractStr = (*itr)[abstractTag];
abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大
}
//for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) {
// for (int i = 0; i < vumPaperTagVal.size(); ++i) {
// testOfs << vumPaperTagVal[i][vTgName[tgIdx]] << endl;
// }
//}
for (int i = 0; i < vumPaperTagVal.size(); ++i) {
testOfs << vumPaperTagVal[i][abstractTag] << endl;
}
testOfs.close();
cout << "文件个数:" << vumPaperTagVal.size() << endl;
// for (auto num : vPaperStartIdx) cout << num << endl;
ifsPubmedTxt.close();
/* 将处理后的数据写入mat文件mat中的变量名称分别为Tx和abs1 */
SavePubmed(argv[3], vTgName, vumPaperTagVal);
}