修改了IsWordInDic,添加了阈值参数,并计算了xs

This commit is contained in:
zzh 2023-10-08 15:09:11 +08:00
parent 18a5415e59
commit 47b0cc187b
7 changed files with 567 additions and 21 deletions

View File

@ -0,0 +1,323 @@
#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <string>
#include <unordered_set>
#include <ctime>
#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
#include <unordered_map>
#include <set>
#include <fstream>
#include <algorithm>
#include <random>
#include <cmath>
using std::cout;
using std::endl;
using namespace std;
#define STRING_BUF_SIZE 204800
/*
 * Fixed-size pool of worker threads that execute queued tasks in FIFO order.
 *
 * Tasks are submitted with enqueue(), which returns a std::future for the
 * task's result. The destructor sets the stop flag, wakes every worker and
 * joins them; workers only exit once the queue is empty, so all tasks that
 * were queued before destruction are still executed.
 *
 * The pool is neither copyable nor assignable (it owns threads and a mutex).
 */
class ThreadPool {
public:
    // Launches `threads` workers; each one sleeps until a task is queued or
    // the pool is stopped.
    ThreadPool(std::size_t threads)
        : stop(false)
    {
        for (std::size_t i = 0; i < threads; ++i) {
            workers.emplace_back([this] { this->workerLoop(); });
        }
    }

    ThreadPool(const ThreadPool&) = delete;
    ThreadPool& operator=(const ThreadPool&) = delete;

    // Queues f(args...) for execution on a worker thread.
    //
    // The callable and its arguments are stored with std::bind, i.e. they are
    // copied/moved into the task. The result type is taken from the bound
    // call itself instead of std::result_of, which is deprecated in C++17 and
    // removed in C++20.
    //
    // Returns a future holding the task's result (or its exception).
    // Throws std::runtime_error if the pool has already been stopped.
    template<class F, class... Args>
    auto enqueue(F&& f, Args&&... args)
        -> std::future<decltype(std::bind(std::forward<F>(f), std::forward<Args>(args)...)())>
    {
        using return_type = decltype(std::bind(std::forward<F>(f), std::forward<Args>(args)...)());

        // packaged_task is move-only while std::function requires a copyable
        // callable, so the task is shared through a shared_ptr.
        auto task = std::make_shared< std::packaged_task<return_type()> >(
            std::bind(std::forward<F>(f), std::forward<Args>(args)...)
        );
        std::future<return_type> res = task->get_future();
        {
            std::lock_guard<std::mutex> lock(queue_mutex);
            // don't allow enqueueing after stopping the pool
            if (stop)
                throw std::runtime_error("enqueue on stopped ThreadPool");
            tasks.emplace([task]() { (*task)(); });
        }
        condition.notify_one();
        return res;
    }

    // Stops the pool, lets the workers drain the queue and joins them.
    ~ThreadPool()
    {
        {
            std::lock_guard<std::mutex> lock(queue_mutex);
            stop = true;
        }
        condition.notify_all();
        for (std::thread& worker : workers) {
            // join() on a thread that is not joinable throws, which must
            // never happen inside a destructor.
            if (worker.joinable())
                worker.join();
        }
    }

private:
    // Body of every worker thread: pop and run tasks until the pool is
    // stopped AND the queue is empty.
    void workerLoop()
    {
        for (;;)
        {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(queue_mutex);
                condition.wait(lock,
                    [this] { return stop || !tasks.empty(); });
                if (stop && tasks.empty())
                    return;
                task = std::move(tasks.front());
                tasks.pop();
            }
            // Run the task outside the lock so other workers can proceed.
            task();
        }
    }

    // need to keep track of threads so we can join them
    std::vector< std::thread > workers;
    // the task queue
    std::queue< std::function<void()> > tasks;
    // synchronization
    std::mutex queue_mutex;
    std::condition_variable condition;
    bool stop;
};
/*
 * Copies the contents of a MATLAB array into a flat vector of doubles.
 *
 * pMxArray: source array; its data buffer is read as double
 *           (NOTE(review): the element class is not verified here - callers
 *           must pass a double array).
 * vDat:     receives mxGetM * mxGetN values in the array's storage order.
 *           If the array exposes no data buffer the values are left at zero.
 */
void Read1DDouble(const mxArray* pMxArray, vector<double>& vDat) {
    // Keep the element count in size_t: the dimensions were previously
    // truncated to int and multiplied in int, which overflows for big arrays.
    const size_t count = (size_t)mxGetM(pMxArray) * (size_t)mxGetN(pMxArray);
    const double* matData = (const double*)mxGetData(pMxArray); // raw data pointer

    vDat.assign(count, 0.0);
    if (matData == nullptr) return; // nothing to copy
    std::copy(matData, matData + count, vDat.begin());
}
/*
 * Copies a MATLAB matrix into a row-indexed vector of vectors.
 *
 * pMxArray: source matrix; its data buffer is read as double
 *           (NOTE(review): the element class is not verified here - callers
 *           must pass a double array).
 * vvDat:    resized to mxGetM rows of mxGetN columns; vvDat[i][j] receives
 *           the element at linear index j * rowNum + i of the data buffer.
 *           If the array exposes no data buffer the values are left at zero.
 */
void Read2DDouble(const mxArray* pMxArray, vector<vector<double>>& vvDat) {
    // size_t keeps both the dimensions and the linear index j * rowNum + i
    // from overflowing int on large matrices.
    const size_t rowNum = (size_t)mxGetM(pMxArray);
    const size_t colNum = (size_t)mxGetN(pMxArray);
    const double* matData = (const double*)mxGetData(pMxArray); // raw data pointer

    vvDat.assign(rowNum, vector<double>(colNum));
    if (matData == nullptr) return; // nothing to copy

    for (size_t i = 0; i < rowNum; ++i) {
        vector<double>& row = vvDat[i];
        for (size_t j = 0; j < colNum; ++j) {
            row[j] = matData[j * rowNum + i];
        }
    }
}
// Arguments of one random-simulation run (see ThreadRandSim).
// All pointers are non-owning; the pointed-to vectors must outlive the call.
struct TPRandSim {
    vector<double>* pvTr;            // out: per-column result, accumulated in place
    vector<int>* pvRandPos;          // in/out: row-index permutation, reshuffled by each run
    vector<double>* pvH;             // in: row labels; rows with label 1 are selected
    vector<vector<double>>* pvvX;    // in: data matrix, indexed [row][column]
    int numPositive;                 // in: divisor applied to the accumulated column sums
};

// Thread entry point: runs one random simulation.
//
// Shuffles *param.pvRandPos, then for every row i of the matrix adds row i to
// *param.pvTr whenever the label at the shuffled position vRandPos[i] equals
// 1, and finally divides every entry of *param.pvTr by param.numPositive.
//
// The permutation buffer is modified, so two runs must not use the same
// buffer concurrently. An empty matrix leaves *param.pvTr untouched.
void ThreadRandSim(TPRandSim& param) {
    vector<double>& vTr = *param.pvTr;
    vector<int>& vRandPos = *param.pvRandPos;
    vector<vector<double>>& vvX = *param.pvvX;
    vector<double>& vH = *param.pvH;
    const int numPositive = param.numPositive;

    // Guard before touching vvX[0]: indexing an empty vector is undefined.
    if (vvX.empty()) return;
    const size_t rowNum = vvX.size();
    const size_t colNum = vvX[0].size();

    /* random simulation */
    std::random_device rd;
    std::shuffle(vRandPos.begin(), vRandPos.end(), std::default_random_engine(rd()));
    for (size_t i = 0; i < rowNum; ++i) {
        const int hRowIdx = vRandPos[i]; // label index after shuffling
        if (vH[hRowIdx] != 1) continue;
        const vector<double>& row = vvX[i];
        for (size_t j = 0; j < colNum; ++j) {
            vTr[j] += row[j];
        }
    }
    for (auto& val : vTr) val /= numPositive;
}
/* MEX entry point: random-shuffle simulation on the column means of x.
 *
 * Inputs (prhs):
 *   0. x         - rowNum-by-colNum matrix, read as double.
 *   1. h         - label vector, read as double; needs at least one entry per
 *                  row of x. Rows whose label equals 1 are the selected rows.
 *   2. numThread - optional worker-thread count (default 1, values below 1
 *                  become 1, never more than loopNum).
 *   3. loopNum   - optional number of simulations (default 1000, values below
 *                  1000 become 1000).
 * Outputs (plhs):
 *   0. vs - 1-by-colNum: (observed mean - simulated mean) / simulated std,
 *           i.e. a z score per column.
 *   1. ps - 1-by-colNum: min(fraction of simulations >= observed,
 *                            fraction of simulations <= observed).
 * Side effects: prints timing information to cout and writes "vs<TAB>ps" per
 * column to d:\result.txt.
 *
 * NOTE: prhs is declared as mxArray** here, not as const mxArray*[].
 */
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) {
    if (nrhs < 2) {
        cout << "At least 2 arguments should be given for this function!" << endl;
        return;
    }
    clock_t begin = clock(), mid, finish;
    vector<double> vH;
    vector<vector<double>> vvX;
    Read2DDouble(prhs[0], vvX);
    Read1DDouble(prhs[1], vH);
    cout << vH.size() << '\t' << vvX.size() << endl;

    // Validate the shapes before vvX[0] and vH[i] are indexed below.
    if (vvX.empty() || vvX[0].empty()) {
        cout << "Input x must be a non-empty matrix!" << endl;
        return;
    }
    if (vH.size() < vvX.size()) {
        cout << "Input h must contain one label for every row of x!" << endl;
        return;
    }
    const int rowNum = (int)vvX.size();
    const int colNum = (int)vvX[0].size();

    int numThread = 1;
    int loopNum = 1000;
    if (nrhs > 2) {
        double* pNumThread = (double*)mxGetData(prhs[2]);
        numThread = (int)pNumThread[0];
        if (numThread < 1) numThread = 1;
    }
    if (nrhs > 3) {
        double* pLoopNum = (double*)mxGetData(prhs[3]);
        loopNum = (int)pLoopNum[0];
        if (loopNum < 1000) loopNum = 1000;
    }
    // More workers than simulations would only create idle threads.
    if (numThread > loopNum) numThread = loopNum;

    /* run the random simulation */
    mid = clock();
    // Observed statistic: per-column mean over the rows labelled 1.
    vector<double> vTs(colNum);
    int numPositive = 0;
    for (int i = 0; i < rowNum; ++i) {
        if (vH[i] == 1) {
            ++numPositive;
            for (int j = 0; j < colNum; ++j) {
                vTs[j] += vvX[i][j];
            }
        }
    }
    // If no label equals 1 this divides by zero and yields NaN results.
    for (auto& val : vTs) val /= numPositive;

    vector<vector<double>> vvTr(loopNum, vector<double>(colNum, 0)); // simulation results
    // One permutation buffer per worker task, initialised to 0..rowNum-1.
    vector<vector<int>> vvRandPos(numThread, vector<int>(rowNum));
    for (auto& vRandPos : vvRandPos) {
        for (int i = 0; i < rowNum; ++i) {
            vRandPos[i] = i;
        }
    }
    {
        // The pool lives in its own scope: its destructor drains the queue and
        // joins the workers exactly once. (Calling the destructor explicitly
        // made it run a second time at scope exit.)
        ThreadPool thPool(numThread);
        for (int t = 0; t < numThread; ++t) {
            // Task t runs simulations t, t + numThread, ... sequentially, so
            // simulation i still uses buffer i % numThread but no buffer is
            // ever shuffled by two threads at the same time.
            thPool.enqueue([&, t]() {
                for (int i = t; i < loopNum; i += numThread) {
                    TPRandSim tParam = { &vvTr[i], &vvRandPos[t], &vH, &vvX, numPositive };
                    ThreadRandSim(tParam);
                }
            });
        }
    }
    finish = clock();
    cout << "Random simulation time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

    /* compute the results */
    vector<double> vVs(colNum);
    vector<double> vPs(colNum);
    // Per-column mean of the simulated statistics.
    vector<double> vMean(colNum);
    vector<double> vStd(colNum);
    for (const auto& vTr : vvTr) {
        for (int j = 0; j < colNum; ++j) {
            vMean[j] += vTr[j];
        }
    }
    for (auto& val : vMean) { val /= loopNum; }
    // Per-column sample standard deviation (divisor loopNum - 1).
    for (const auto& vTr : vvTr) {
        for (int j = 0; j < colNum; ++j) {
            const double diff = vTr[j] - vMean[j];
            vStd[j] += diff * diff;
        }
    }
    for (auto& val : vStd) { val = sqrt(val / (loopNum - 1)); }
    // vs: distance of the observed mean from the simulated mean, in std units.
    for (int j = 0; j < colNum; ++j) {
        vVs[j] = (vTs[j] - vMean[j]) / vStd[j];
    }
    // ps: smaller of the two tail fractions of the simulated distribution.
    vector<double> vSumGreater(colNum);
    vector<double> vSumLess(colNum);
    for (int i = 0; i < loopNum; ++i) {
        for (int j = 0; j < colNum; ++j) {
            if (vvTr[i][j] >= vTs[j]) vSumGreater[j]++;
            if (vvTr[i][j] <= vTs[j]) vSumLess[j]++;
        }
    }
    for (auto& val : vSumGreater) val /= loopNum;
    for (auto& val : vSumLess) val /= loopNum;
    for (int j = 0; j < colNum; ++j) {
        vPs[j] = min(vSumGreater[j], vSumLess[j]);
    }
    ofstream ofs("d:\\result.txt");
    for (int j = 0; j < colNum; ++j) {
        ofs << vVs[j] << '\t' << vPs[j] << '\n';
    }
    ofs.close();

    /* write the results */
    if (nlhs > 0) { // vs
        // 1-by-colNum MATLAB matrix filled from vVs
        mxArray* pWriteArray = mxCreateDoubleMatrix(1, vVs.size(), mxREAL);
        memcpy((void*)(mxGetPr(pWriteArray)), (void*)vVs.data(), sizeof(double) * vVs.size());
        plhs[0] = pWriteArray;
    }
    if (nlhs > 1) { // ps
        // 1-by-colNum MATLAB matrix filled from vPs
        mxArray* pWriteArray = mxCreateDoubleMatrix(1, vPs.size(), mxREAL);
        memcpy((void*)(mxGetPr(pWriteArray)), (void*)vPs.data(), sizeof(double) * vPs.size());
        plhs[1] = pWriteArray;
    }
    finish = clock();
    cout << "Cluster Random simulation Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}

View File

@ -1166,6 +1166,11 @@ T SquareAverage(vector<T>& vVal) {
/* 计算向量x和y的相关距离, 向量维度必须相等*/
double CorrelationDistance(vector<double>& vX, vector<double>& vY) {
double xMean = Average(vX);
transform(vX.cbegin(), vX.cend(), vX.begin(), [&](double val) { return val - xMean; });
double yMean = Average(vY);
transform(vY.cbegin(), vY.cend(), vY.begin(), [&](double val) { return val - yMean; });
vector<double> vXY(vX.size());
for (int i = 0; i < vXY.size(); ++i) {
vXY[i] = vX[i] * vY[i];

View File

@ -0,0 +1,204 @@
#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <string>
#include <unordered_set>
#include <ctime>
#include <vector>
#include <queue>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <future>
#include <functional>
#include <stdexcept>
#include <unordered_map>
#include <set>
#include <fstream>
#include <algorithm>
#include <random>
#include <cmath>
using std::cout;
using std::endl;
using namespace std;
#define STRING_BUF_SIZE 204800
// Reads a cell array of strings into vStr, converting every string to upper case.
//
// pMxArray: cell array of size rowNum x colNum whose cells hold strings.
// vStr:     resized to rowNum * colNum; the cell at (i, j) is stored at index
//           i * colNum + j. A cell for which mxGetCell returns a null pointer
//           is stored as an empty string.
// Returns false (after printing the 1-based cell position) as soon as
// mxGetString fails for a cell, e.g. because the string does not fit into
// STRING_BUF_SIZE characters; vStr is then only partially filled.
inline bool Read1DWord(const mxArray* pMxArray, vector<string>& vStr) {
    const size_t rowNum = (size_t)mxGetM(pMxArray);
    const size_t colNum = (size_t)mxGetN(pMxArray);
    // Owned by a vector so the buffer is released on every return path
    // (the early "return false" used to leak a raw new[] buffer).
    vector<char> strBuf(STRING_BUF_SIZE);
    vStr.resize(rowNum * colNum);
    for (size_t i = 0; i < rowNum; ++i) {
        for (size_t j = 0; j < colNum; ++j) {
            string& word = vStr[i * colNum + j];
            const mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
            if (pCell == nullptr) {
                // No array stored in this cell: nothing to convert.
                word.clear();
                continue;
            }
            if (mxGetString(pCell, strBuf.data(), STRING_BUF_SIZE) != 0) {
                cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
                return false;
            }
            word = strBuf.data();
            // toupper must not be given a negative char value, so go through
            // unsigned char (matters for bytes outside the ASCII range).
            transform(word.cbegin(), word.cend(), word.begin(),
                [](unsigned char c) { return (char)::toupper(c); }); // to upper case
        }
    }
    return true;
}
// Reads a cell array whose cells are themselves cell arrays of strings,
// converting every string to upper case (via Read1DWord).
//
// pMxArray: outer cell array of size rowNum x colNum.
// vvStr:    one vector of words is APPENDED per outer cell, visiting the cells
//           row by row (i outer loop, j inner loop). Existing content is kept.
//           An outer cell for which mxGetCell returns a null pointer yields
//           an empty word list.
// Returns false if Read1DWord failed for at least one cell; the remaining
// cells are still read so that vvStr always gets one entry per outer cell.
inline bool Read2DWord(const mxArray* pMxArray, vector<vector<string>>& vvStr) {
    const size_t rowNum = (size_t)mxGetM(pMxArray);
    const size_t colNum = (size_t)mxGetN(pMxArray);
    bool allRead = true;
    vvStr.reserve(vvStr.size() + rowNum * colNum);
    for (size_t i = 0; i < rowNum; ++i) {
        for (size_t j = 0; j < colNum; ++j) {
            const mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i);
            vvStr.push_back(vector<string>());
            if (pCell == nullptr) continue; // nothing stored in this cell
            if (!Read1DWord(pCell, vvStr.back())) {
                allRead = false; // remember the failure instead of dropping it
            }
        }
    }
    return allRead;
}
// Creates a rowNum-by-colNum real double MATLAB matrix and copies
// rowNum * colNum values from data into it, to be used as a return value.
//
// data:   source buffer with at least rowNum * colNum doubles, in the storage
//         order expected by the matrix (copied verbatim).
// Returns the newly created mxArray (whatever mxCreateDoubleMatrix returned).
mxArray* writeToMat(const double *data, int rowNum, int colNum) {
    // rowNum-by-colNum matrix in MATLAB format
    mxArray* pWriteArray = mxCreateDoubleMatrix(rowNum, colNum, mxREAL);
    // Element count in size_t: the int product overflowed for large matrices.
    const size_t len = (size_t)rowNum * (size_t)colNum;
    // Skip the copy for empty matrices / missing buffers; memcpy must not be
    // handed null pointers, not even with a length of zero.
    if (pWriteArray != nullptr && data != nullptr && len > 0) {
        memcpy(mxGetPr(pWriteArray), data, sizeof(double) * len);
    }
    return pWriteArray;
}
/* MEX entry point: counts dictionary words in wd and builds the indicator matrix xs.
 *
 * Inputs (prhs):
 *   0. wd        - cell array whose cells are cell arrays of strings (one word
 *                  list per cell); every cell becomes one row of the result.
 *   1. dic       - cell array of strings (the dictionary).
 *   2. threshold - optional minimum occurrence count (default 5, values below
 *                  5 become 5).
 *   All words are upper-cased while being read, so matching ignores case.
 * Outputs (plhs):
 *   0. 1-by-numel(dic) vector: for every dictionary word, the number of wd
 *      rows that contain it (a word repeated inside one row counts once).
 *   1. xs - rowNum-by-colNum matrix of 0/1 values with one column for every
 *      dictionary word whose count is >= threshold (in dictionary order);
 *      xs(i, j) is 1 when row i of wd contains that word.
 * Side effects: prints timing information to cout.
 */
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
    if (nrhs < 2) {
        cout << "At least 2 arguments should be given for this function!" << endl;
        return;
    }
    clock_t begin = clock(), mid, finish;
    vector<string> vDic;
    vector<vector<string>> vvWd;
    // The readers report failure through their return value; do not continue
    // with partially read data.
    if (!Read2DWord(prhs[0], vvWd) || !Read1DWord(prhs[1], vDic)) {
        cout << "Failed to read the input cell arrays!" << endl;
        return;
    }
    const size_t rowNum = vvWd.size();
    int threshold = 5;
    if (nrhs > 2) {
        double* pThreshold = (double*)mxGetData(prhs[2]);
        threshold = (int)pThreshold[0];
        if (threshold < 5) threshold = 5;
    }
    finish = clock();
    cout << "Load data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

    vector<double> vXSum(vDic.size());
    /* count in how many wd rows each dictionary word occurs */
    mid = clock();
    unordered_map<string, int> umWordPos; // word -> position in the dictionary
    umWordPos.reserve(vDic.size());
    for (size_t i = 0; i < vDic.size(); ++i) umWordPos[vDic[i]] = (int)i;
    // Dictionary positions found in each row. A set is used so that a word
    // repeated within one row is counted once (behaviour of the original
    // MATLAB code, according to the previous comment here).
    vector<unordered_set<int>> vusX(rowNum);
    for (size_t row = 0; row < rowNum; ++row) {
        unordered_set<int>& usPos = vusX[row];
        for (const auto& word : vvWd[row]) {
            auto itr = umWordPos.find(word);
            if (itr != umWordPos.end()) {
                usPos.insert(itr->second);
            }
        }
        for (int idx : usPos) {
            vXSum[idx] += 1;
        }
    }
    finish = clock();
    cout << "Calc word occurrence time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

    /* compute xs */
    mid = clock();
    // Map every dictionary position to its xs column, or -1 when the word
    // does not reach the threshold.
    vector<int> vColOfDic(vXSum.size(), -1);
    int colNum = 0;
    for (size_t i = 0; i < vXSum.size(); ++i) {
        if (vXSum[i] >= threshold) {
            vColOfDic[i] = colNum++;
        }
    }
    // size_t arithmetic: rowNum * colNum easily exceeds the int range.
    vector<double> vXsData(rowNum * (size_t)colNum);
    for (size_t i = 0; i < rowNum; ++i) {
        // Visit only the words present in the row instead of probing the set
        // once per column; the resulting matrix is the same.
        for (int idx : vusX[i]) {
            const int col = vColOfDic[idx];
            if (col >= 0) {
                vXsData[(size_t)col * rowNum + i] = 1;
            }
        }
    }
    finish = clock();
    cout << "Calc xs time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;

    /* write the results */
    mid = clock();
    if (nlhs > 0) {
        plhs[0] = writeToMat(vXSum.data(), 1, (int)vXSum.size());
    }
    if (nlhs > 1) { // xs
        plhs[1] = writeToMat(vXsData.data(), (int)rowNum, colNum);
    }
    finish = clock();
    cout << "Write result time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl;
    cout << "Calc word occurrence in Dic Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}
// Plain forwarding wrapper around mexFunction, meant for debugging from C++.
void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
    mexFunction(nlhs, plhs, nrhs, prhs);
}

View File

@ -119,8 +119,8 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="IsWordInDic.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="RandSim.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />

View File

@ -18,7 +18,7 @@
<ClCompile Include="main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="RandSim.cpp">
<ClCompile Include="IsWordInDic.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>

View File

@ -13,32 +13,45 @@ int main(int argc, const char** argv)
//string matFile = "D:\\Twirls\\wd_small.mat";
//string matFile = "D:\\Twirls\\wd.mat";
clock_t begin = clock(), finish;
string wd2Mat = "D:\\wd2_5w.mat";
string dicrMat = "D:\\dicr.mat";
string wdMat = "D:\\wd.mat";
//string wd2Mat = "D:\\wd2_5w.mat";
//string dicrMat = "D:\\dicr.mat";
//string wdMat = "D:\\wd.mat";
//string dicrMat = "D:\\dicr_large.mat";
//string dicMat = "D:\\G_dc_large.mat";
//string wdMat = "D:\\wd_large.mat";
MATFile* pwdMat, *pwd2Mat, *pdicrMat;
mxArray* prhs[4];
//MATFile* pwdMat, *pwd2Mat, *pdicMat;
//mxArray* prhs[4];
pwdMat = matOpen(wdMat.c_str(), "r");
pwd2Mat = matOpen(wd2Mat.c_str(), "r");
pdicrMat = matOpen(dicrMat.c_str(), "r");
prhs[0] = matGetVariable(pwdMat, "wd"); //获取.mat文件里面名为matrixName的矩阵
prhs[1] = matGetVariable(pwd2Mat, "wd2");
//pwdMat = matOpen(wdMat.c_str(), "r");
// pwd2Mat = matOpen(wd2Mat.c_str(), "r");
//pdicMat = matOpen(dicMat.c_str(), "r");
// prhs[1] = mxCreateString("D:\\Twirls\\gat1\\literatures\\temp\\wd2s.txt");
prhs[2] = matGetVariable(pdicrMat, "dicr");
// prhs[2] = matGetVariable(pdicrMat, "dicr");
mxArray* plhs = (mxArray*)malloc(sizeof(mxArray*));
/* IsWordInDic */
MATFile* pwdMat, * pdicMat;
mxArray* plhs[4];
const mxArray* prhs[4];
int nlhs = 2, nrhs = 2;
pwdMat = matOpen("D:\\wd_large.mat", "r");
pdicMat = matOpen("D:\\G_dc_large.mat", "r");
prhs[0] = matGetVariable(pwdMat, "wd"); //获取.mat文件里面名为matrixName的矩阵
prhs[1] = matGetVariable(pdicMat, "dc");
mxArray** arg = &prhs[0];
/* */
//MATFile* pMatX = matOpen("D:\\x_large.mat", "r");
//MATFile* pMatH = matOpen("D:\\h_large.mat", "r");
//prhs[0] = matGetVariable(pMatX, "x");
//prhs[1] = matGetVariable(pMatH, "h");
mexFunction(1, &plhs, 3, arg);
//mexFunction(0, 0, 0, 0);
finish = clock();
cout << "Load Data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
mexFunctionWrap(nlhs, &plhs[0], nrhs, &prhs[0]);
finish = clock();
cout << "mexFunction Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

View File

@ -1,3 +1,4 @@
#pragma once
#include <mat.h>
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs);
// void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs);
void mexFunctionWrap(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]);