diff --git a/CommonLib/CommonLib.vcxproj b/CommonLib/CommonLib.vcxproj index 8836c8c..cdba565 100644 --- a/CommonLib/CommonLib.vcxproj +++ b/CommonLib/CommonLib.vcxproj @@ -107,9 +107,7 @@ - - false - + diff --git a/CommonLib/CommonLib.vcxproj.filters b/CommonLib/CommonLib.vcxproj.filters index 7d4df5e..1380e3e 100644 --- a/CommonLib/CommonLib.vcxproj.filters +++ b/CommonLib/CommonLib.vcxproj.filters @@ -17,9 +17,7 @@ - - - + diff --git a/CommonLib/matlab_io.cpp b/CommonLib/matlab_io.cpp index 4831c3b..77e8513 100644 --- a/CommonLib/matlab_io.cpp +++ b/CommonLib/matlab_io.cpp @@ -14,6 +14,92 @@ #include "matlab_io.h" using namespace std; +// 将matlab存储方式转换成c存储方式 +#define TRANS_ROW_COL(dst, src, rowNum, colNum) \ + for (int rowI = 0; rowI < rowNum; ++rowI) { \ + for (int colJ = 0; colJ < colNum; ++colJ) { \ + dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI]; \ + } \ + } + +/* 读取结构体中的二维字符串矩阵(一维的cell,每个cell又有一层cell,每个cell是一个字符串)*/ +bool ReadChildString2D(const string& filePath, const string& parentName, const string& selfName, vector >& vvStr) { + MATFile* pMatFile = nullptr; + mxArray* pMxArray = nullptr; + mxArray* pCell = nullptr; + int rowNum, colNum; + char strBuf[STRING_BUF_SIZE]; + + pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件 + if (pMatFile == nullptr) { + cout << "filePath is error!" << endl; + return false; + } + mxArray* pMxParent = matGetVariable(pMatFile, parentName.c_str()); //获取G变量 + + // 读取字符串 + pMxArray = mxGetField(pMxParent, 0, selfName.c_str()); // ds + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + int childRowNum = (int)mxGetM(pCell); + int childColNum = (int)mxGetN(pCell); + vvStr.push_back(vector()); + vvStr.back().resize(childRowNum * childColNum); + for (int ii = 0; ii < childRowNum; ii++) { + for (int jj = 0; jj < childColNum; jj++) { + mxArray* pChildCell = mxGetCell(pCell, jj * childRowNum + ii); + if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + return false; + } + vvStr.back()[ii * childColNum + jj] = strBuf; + // auto& lastStr = vvStr.back()[ii * childColNum + jj]; + // transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // 转成大写 + } + } + } + } + mxDestroyArray(pMxArray); + return true; +} + +/* 读取结构体中的二维double矩阵(一维的cell,每个cell又有一层cell,每个cell是一个字符串)*/ +bool ReadChildDouble2D(const string& filePath, const string& parentName, const string& selfName, vector >& vvDouble) { + MATFile* pMatFile = nullptr; + mxArray* pMxArray = nullptr; + mxArray* pCell = nullptr; + int rowNum, colNum; + + pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件 + if (pMatFile == nullptr) { + cout << "filePath is error!" << endl; + return false; + } + mxArray* pMxParent = matGetVariable(pMatFile, parentName.c_str()); //获取G变量 + + // 读取double数据 + pMxArray = mxGetField(pMxParent, 0, selfName.c_str()); // ds + rowNum = (int)mxGetM(pMxArray); + colNum = (int)mxGetN(pMxArray); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + pCell = mxGetCell(pMxArray, j * rowNum + i); + int childRowNum = (int)mxGetM(pCell); + int childColNum = (int)mxGetN(pCell); + + vvDouble.push_back(vector()); + vvDouble.back().resize(childRowNum * childColNum); + double* pVal = (double*)mxGetData(pCell); //获取指针 + TRANS_ROW_COL(vvDouble.back(), pVal, childRowNum, childColNum); // 行列存储方式转换 + } + } + mxDestroyArray(pMxArray); + return true; +} + /* 读取字符串矩阵 */ bool ReadMtxString(const string& filePath, const string& mtxName, vector& vStr, int* pRowNum, int* pColNum) { diff --git a/CommonLib/matlab_io.h b/CommonLib/matlab_io.h index 37cab9d..e160dff 100644 --- a/CommonLib/matlab_io.h +++ b/CommonLib/matlab_io.h @@ -15,7 +15,7 @@ #include using namespace std; -#define STRING_BUF_SIZE 10240 +#define STRING_BUF_SIZE 204800 typedef double T; /* 注意参数的区别,读取时候直接传递文件路径名,写入的时候需要提供文件指针, @@ -31,4 +31,11 @@ T* ReadMtxDouble(const string& filePath, const string& mtxName, int* pRowNum, in /* 将数据写入mat文件中,用给定的名称命名 */ bool SaveMtxDouble(T* src, MATFile* pMatFile, string matrixName, int rowNum, int colNum); +/* 读取结构体中的字符串数据 */ +bool ReadChildString2D(const string& filePath, const string& parentName, const string& selfName, vector >& vvStr); + +/* 读取结构体中的二维double矩阵(一维的cell,每个cell又有一层cell,每个cell是一维double数组)*/ +bool ReadChildDouble2D(const string& filePath, const string& parentName, const string& selfName, vector >& vvDouble); + + #endif \ No newline at end of file diff --git a/CppRun/CppRun.vcxproj b/CppRun/CppRun.vcxproj index 63d4327..4ca5721 100644 --- a/CppRun/CppRun.vcxproj +++ b/CppRun/CppRun.vcxproj @@ -129,9 +129,7 @@ - - false - + diff --git a/CppRun/CppRun.vcxproj.filters b/CppRun/CppRun.vcxproj.filters index e5360c6..d65e643 100644 --- a/CppRun/CppRun.vcxproj.filters +++ b/CppRun/CppRun.vcxproj.filters @@ -33,8 +33,6 @@ - - - + \ No newline at end of file diff --git a/CppRun/calc_entropy.cpp b/CppRun/calc_entropy.cpp index ded1596..d00d3cc 100644 --- a/CppRun/calc_entropy.cpp +++ b/CppRun/calc_entropy.cpp @@ -92,7 +92,7 @@ bool ReadInfoFromMat(const string & filePath, vector >&vvDs, vect mxArray* pMxArray = nullptr; mxArray* pCell = nullptr; int rowNum, colNum; - char strBuf[STRING_BUF_SIZE]; + char *strBuf = new char[STRING_BUF_SIZE]; const string& parrentName = "G"; const string& firstChildName = "ds"; const string& secondChildName = "fr"; @@ -112,6 +112,7 @@ bool ReadInfoFromMat(const string & filePath, vector >&vvDs, vect INNTER_FOR_BEGIN if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) { cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + delete[]strBuf; return false; } vvDs.back()[ii * childColNum + jj] = strBuf; @@ -127,9 +128,10 @@ bool ReadInfoFromMat(const string & filePath, vector >&vvDs, vect vvFr.back().resize(childRowNum * childColNum); double *pVal = (double*)mxGetData(pCell); //获取指针 TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // 行列存储方式转换 - OUTER_FOR_END + OUTER_FOR_END - // 没考虑完全哪些数据需要mxDestroyArray,可能会有内存泄漏 + // 没考虑完全哪些数据需要mxDestroyArray,可能会有内存泄漏 + delete[]strBuf; return true; } @@ -233,7 +235,7 @@ void CalcEntropy(int argc, const char** argv) { string wordMatSuffix(argv[2]); // 高频词汇矩阵对应的mat文件的后缀名(可以是全文件名,可以是文件名后缀,必须保证唯一) fs::path outFileName(argv[4]); int numThread = 1; - if (argc >= 5) numThread = atoi(argv[5]); + if (argc > 5) numThread = atoi(argv[5]); if (numThread < 1) numThread = 1; /* 读入处理后的pubmed文献信息的mat文件,只读入摘要信息,即变量abs1 */ @@ -281,7 +283,7 @@ void CalcEntropy(int argc, const char** argv) { cout << "read abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 将分割结果写入mat文件 */ begin = clock(); - if (argc >= 6) { + if (argc > 6) { MATFile* pMatFile = matOpen(argv[6], "w"); mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size()); for (int i = 0; i < vvWordMtx.size(); ++i) { diff --git a/GMM/GMM.vcxproj b/GMM/GMM.vcxproj index abfb3da..f615ea3 100644 --- a/GMM/GMM.vcxproj +++ b/GMM/GMM.vcxproj @@ -127,9 +127,7 @@ - - false - + diff --git a/GMM/GMM.vcxproj.filters b/GMM/GMM.vcxproj.filters index c29d2e2..5711398 100644 --- a/GMM/GMM.vcxproj.filters +++ b/GMM/GMM.vcxproj.filters @@ -36,8 +36,6 @@ - - - + \ No newline at end of file diff --git a/GMM/main.cpp b/GMM/main.cpp index 4f5b4e2..8f134cf 100644 --- a/GMM/main.cpp +++ b/GMM/main.cpp @@ -128,6 +128,11 @@ T SquareAverage(vector& vVal) { /* 计算向量x和y的相关距离, 向量维度必须相等*/ double CorrelationDistance(vector& vX, vector& vY) { + double xMean = Average(vX); + transform(vX.cbegin(), vX.cend(), vX.begin(), [&](double val) { return val - xMean; }); + double yMean = Average(vY); + transform(vY.cbegin(), vY.cend(), vY.begin(), [&](double val) { return val - yMean; }); + vector vXY(vX.size()); for (int i = 0; i < vXY.size(); ++i) { vXY[i] = vX[i] * vY[i]; diff --git a/MexFunc/CalcEntropy.cpp b/MexFunc/CalcEntropy.cpp new file mode 100644 index 0000000..eeeefd5 --- /dev/null +++ b/MexFunc/CalcEntropy.cpp @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include +#include +#include +#include +using std::cout; +using std::endl; +using namespace std; + +#define STRING_BUF_SIZE 204800 + +/* 读取二层cell包裹的字符串,和数值,ds,fr */ +#define OUTER_FOR_BEGIN \ + rowNum = (int)mxGetM(pMxArray); \ + colNum = (int)mxGetN(pMxArray); \ + for (int i = 0; i < rowNum; ++i) { \ + for (int j = 0; j < colNum; ++j) { \ + mxArray* pCell = mxGetCell(pMxArray, j * rowNum + i); \ + int childRowNum = (int)mxGetM(pCell); \ + int childColNum = (int)mxGetN(pCell); + +#define OUTER_FOR_END \ + } \ + } + +#define INNTER_FOR_BEGIN \ + for (int ii = 0; ii < childRowNum; ii++) { \ + for (int jj = 0; jj < childColNum; jj++) { \ + mxArray *pChildCell = mxGetCell(pCell, jj * childRowNum + ii); +#define INNTER_FOR_END \ + } \ + } +// 将matlab存储方式转换成c存储方式 +#define TRANS_ROW_COL(dst, src, rowNum, colNum) \ + for (int rowI = 0; rowI < rowNum; ++rowI) { \ + for (int colJ = 0; colJ < colNum; ++colJ) { \ + dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI]; \ + } \ + } +// 将二维索引转成一维的索引 +inline int Get1DIndex(int colNum, int row, int col) { + return row * colNum + col; +} + +// 读取G结构体中的ds和fr +void GetFrDs(const mxArray* pMxParent, vector >& vvDs, vector >& vvFr) { + // 读取ds字符串 + int rowNum, colNum; + char *strBuf = new char[STRING_BUF_SIZE]; + mxArray* pMxArray = mxGetField(pMxParent, 0, "ds"); // ds + OUTER_FOR_BEGIN + vvDs.push_back(vector()); + vvDs.back().resize(childRowNum * childColNum); + INNTER_FOR_BEGIN + if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + delete[]strBuf; + return; + } + vvDs.back()[ii * childColNum + jj] = strBuf; + auto& lastStr = vvDs.back()[ii * childColNum + jj]; + transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // 转成大写 + INNTER_FOR_END + OUTER_FOR_END + + // 读取fr数值 + pMxArray = mxGetField(pMxParent, 0, "fr"); // fr + OUTER_FOR_BEGIN + vvFr.push_back(vector()); + vvFr.back().resize(childRowNum * childColNum); + double* pVal = (double*)mxGetData(pCell); //获取指针 + TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // 行列存储方式转换 + OUTER_FOR_END + delete[]strBuf; +} + +/* 读取abs */ +void GetAbstract(const mxArray* pMxAbs, vector& vAbs) { + int rowNum = (int)mxGetM(pMxAbs); + int colNum = (int)mxGetN(pMxAbs); + char *strBuf = new char[STRING_BUF_SIZE]; + + vAbs.resize(rowNum * colNum); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + mxArray* pCell = mxGetCell(pMxAbs, j * rowNum + i); + if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) { + cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; + delete[]strBuf; + return; + } + vAbs[i * colNum + j] = strBuf; + } + } + delete[]strBuf; +} + +/* +nlhs:输出参数数目(Number Left - hand side),等号左边 +plhs:指向输出参数的指针(Point Left - hand side),等号左边 +nrhs:输入参数数目(Number Right - hand side),等号右边 +prhs:指向输入参数的指针(Point Right - hand side),等号右边。要注意prhs是const的指针数组,即不能改变其指向内容。 +*/ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + //cout << "MexCalcEntropy" << endl; + //cout << nlhs << '\t' << nrhs << endl; + if (nrhs != 2) { + cout << "2 arguments should be given for this function!" << endl; + return; + } + clock_t begin, finish; + begin = clock(); + vector > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵) + vector > vvFr; // 词汇对应的频率 + GetFrDs(prhs[1], vvDs, vvFr); + + vector vAbstract; // 读取abs1, 然后分割成一个一个的单词 + GetAbstract(prhs[0], vAbstract); + + /* 将摘要信息分割成一个一个的词汇 */ + // begin = clock(); + unordered_set usWordChars; // 能组成单词的字符,要不要考虑数字?原版matlab是提取了数字的 + for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z + for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z + for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9 + usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-'); + vector > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数 + vector > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表 + for (int i = 0; i < vAbstract.size(); i++) { + auto& strAbs = vAbstract[i]; + // 遍历摘要字符串的每一个字符,取出每一个单词 + vector& vWord = vvWordMtx[i]; + if (strAbs.size() == 0) continue; // 摘要信息为空,跳过(一般不会出现这个情况) + int wordStartPos = 0; + while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end()) + wordStartPos++; + for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) { + if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符 + vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos)); + wordStartPos = curPos + 1; // 找下一个词语起始位置 + while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end()) + wordStartPos++; + curPos = wordStartPos; // 循环会自动加1 + } + } + // 将处理摘要之后的每个词语放入hash表 + for (auto& word : vWord) { + string upWord(word); + transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper); + vusAbsWord[i].insert(upWord); + } + } + // finish = clock(); + // cout << "Split abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + + // 存放结果,用一维数组存放二维数据 + vector hs; + vector hr; + const int numLiterature = vusAbsWord.size(); // pubmed 文件中包含的文献数量 + const int numGroup = vvDs.size(); // ds包含的组数 + hs.resize(numGroup * numLiterature); + hr.resize(numLiterature * numGroup); + + for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组 + vector& vDs = vvDs[groupIdx]; // 这一组ds + vector& vFr = vvFr[groupIdx]; // frequency + const int numWord = vDs.size(); // 这一组数据中包含的单词数量 + vector > vX(numLiterature, vector(numWord, 0)); + // 检查知识颗粒中的词语是否出现在pubmed摘要的词语中 + for (int i = 0; i < numLiterature; ++i) { + for (int j = 0; j < numWord; ++j) { + if (vusAbsWord[i].find(vDs[j]) != vusAbsWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过 + vX[i][j] = 1; + } + } + } + + // 找词汇的最高频率 + double maxFr = *max_element(vFr.begin(), vFr.end()); + // 将fr的数值规范化到(0,0.368)之间 + const double normalMax = 0.368; + for (auto& frVal : vFr) frVal = frVal * normalMax / maxFr; + maxFr = normalMax; + // 对每个知识颗粒每一组数据,计算信息熵 + for (int i = 0; i < numLiterature; ++i) { + for (int j = 0; j < numWord; ++j) { + if (vX[i][j] == 1) { + hs[Get1DIndex(numLiterature, groupIdx, i)] -= vFr[j] * log2(vFr[j]); + } + } + } + + // 找最高频词汇所在的索引位置 + vector vMaxPos; + int idx = 0; + for_each(vFr.begin(), vFr.end(), [&idx, maxFr, &vMaxPos](double val) { + if (val == maxFr) vMaxPos.push_back(idx); + idx++; + }); + + for (int i = 0; i < numLiterature; ++i) { + int cumulateX = 0; // 计算在最高频词汇处,x值的累加结果 + for (int j = 0; j < vMaxPos.size(); ++j) cumulateX += vX[i][vMaxPos[j]]; + if (cumulateX == vMaxPos.size()) { // 如果频率最高的词汇都出现在了文献中 + hr[Get1DIndex(numGroup, i, groupIdx)] = 1; // 应该是表示知识颗粒的这一组数据跟这篇文献相关性比较高 + } + } + } + + /* 将结果写入返回值 */ + if (nlhs > 0) { + int datasize = numGroup * numLiterature; + double* mtxData = new double[datasize];//待存储数据转为double格式 + mxArray* pWriteArray = NULL;//matlab格式矩阵 + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(numGroup, numLiterature, mxREAL); + for (int i = 0; i < numGroup; i++) { + for (int j = 0; j < numLiterature; j++) { + mtxData[j * numGroup + i] = hs[i * numLiterature + j]; + } + } + //把data的值赋给pWriteArray指针 + memcpy((void*)(mxGetPr(pWriteArray)), (void*)mtxData, sizeof(double) * datasize); + plhs[0] = pWriteArray; // 赋值给返回值 + delete[]mtxData; + } + finish = clock(); + // cout << "CalcEntropy Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + +/* 供main调试调用 */ +void MexCalcEntropy(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { + mexFunction(nlhs, plhs, nrhs, prhs); +} \ No newline at end of file diff --git a/MexFunc/CorrelationDist.cpp b/MexFunc/CorrelationDist.cpp new file mode 100644 index 0000000..7746785 --- /dev/null +++ b/MexFunc/CorrelationDist.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "CommonLib/kthread.h" +#include "CommonLib/thread_pool.h" +using std::cout; +using std::endl; +using namespace std; + +// 线程参数 +struct TPCorDist { + vector>* pvvX; + vector* pvDist; + vector* pvSq; + int rowIdxStart; + int rowIdxEnd; + int rowNum; + int colNum; +}; + +// 线程函数 +void ThreadCalcDist(TPCorDist& param) { + vector>& vvX = *param.pvvX; + vector& vDist = *param.pvDist; + vector& vSq = *param.pvSq; + int rowIdxStart = param.rowIdxStart; + int rowIdxEnd = param.rowIdxEnd; + int rowNum = param.rowNum; + int colNum = param.colNum; + double uv = 0; + clock_t begin = clock(), finish; + __m256 vec_zero = _mm256_set1_ps(0); + for (int i = rowIdxStart; i < rowIdxEnd; ++i) { + const int baseIdx = i * (rowNum * 2 - i - 1) / 2; + for (int cur = i + 1, idx = 0; cur < rowNum; ++cur, ++idx) { + double uv = 0; + //for (int j = 0; j < colNum; ++j) { uv += vvX[i][j] * vvX[cur][j]; } + __m256 vec_uv = vec_zero; + int colNum8 = colNum / 8 * 8; + int remain = colNum - colNum8; + for (int j = 0; j < colNum8; j += 8) { + __m256 vec_u = _mm256_loadu_ps(vvX[i].data() + j); + __m256 vec_v = _mm256_loadu_ps(vvX[cur].data() + j); + vec_uv = _mm256_add_ps(_mm256_mul_ps(vec_u, vec_v), vec_uv); + } + for (int j = colNum8; j < colNum; ++j) { + uv += vvX[i][j] * vvX[cur][j]; + } + float* pVec = (float*)&vec_uv; + for (int j = 0; j < 8; ++j) uv += pVec[j]; + + uv /= colNum; + const double dist = abs(1.0 - uv / sqrt(vSq[i] * vSq[cur])); + vDist[baseIdx + idx] = dist; + } + } + finish = clock(); + //cout << rowIdxEnd << " Thread time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} + +/* 入口函数 */ +void mexFunction(int nlhs, mxArray* plhs[], int nrhs, mxArray** prhs) { + //cout << "WordSplit" << endl; + //cout << nlhs << '\t' << nrhs << endl; + //if (nrhs < 1) { + // cout << "At least 1 arguments should be given for this function!" << endl; + // return; + //} + clock_t begin = clock(), mid, finish; + + int rowNum = (int)mxGetM(prhs[0]); + int colNum = (int)mxGetN(prhs[0]); + double* pData = (double*)mxGetData(prhs[0]); + // + //int numThread = 1; + //if (nrhs > 1) { + // double* pNumThread = (double*)mxGetData(prhs[1]); + // numThread = (int)*pNumThread; + // if (numThread < 1) numThread = 1; + //} + + + //for (int i = 0; i < rowNum; ++i) { + // for (int j = 0; j < colNum; ++j) { + // cout << pData[j * rowNum + i] << '\t'; + // } + // cout << endl; + //} + +// int rowNum = 2; +// int colNum = 5; +// vector pData = { 1,1,2,2,3,9,4,4,5,4 }; + + cout << rowNum << '\t' << colNum << endl; + + /* 计算每一行的平均数 */ + mid = clock(); + vector vMean(rowNum); + for (int j = 0; j < colNum; ++j) { + for (int i = 0; i < rowNum; ++i) { + vMean[i] += pData[j * rowNum + i]; + } + } + for (int i = 0; i < rowNum; ++i) { + vMean[i] /= colNum; + } + finish = clock(); + cout << "计算平均数: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + /* 减去平均值, 计算平方 */ + mid = clock(); + vector> vvX(rowNum, vector(colNum)); + vector vSq(rowNum); + for (int i = 0; i < rowNum; ++i) { + for (int j = 0; j < colNum; ++j) { + vvX[i][j] = pData[j * rowNum + i] - vMean[i]; + vSq[i] += vvX[i][j] * vvX[i][j]; + } + } + for (auto& val : vSq) { val /= colNum; } + finish = clock(); + cout << "计算平方: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + /* 计算相关距离 */ +// clock_t mid0 = clock(); +// const int distSize = rowNum * (rowNum - 1) / 2; +// const int row2 = 2 * rowNum; +// vector vDist(distSize, 0.0); +// //__m256d vec_zero = _mm256_set1_pd(0); +// __m256 vec_zero = _mm256_set1_ps(0); +// for (int i = 0; i < rowNum; ++i) { +// if (i % 100 == 0) { +// finish = clock(); +// cout << "time " << i << ": " << (double)(finish - mid) / CLOCKS_PER_SEC << " s; " << +// (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +// mid = clock(); +// } +// const int baseIdx = i * (row2 - i - 1) / 2; +// for (int cur = i + 1, idx = 0; cur < rowNum; ++cur, ++idx) { +// double uv = 0; +// //for (int j = 0; j < colNum; ++j) { uv += vvX[i][j] * vvX[cur][j]; } +// //__m256d vec_uv = vec_zero; +// //int colNum4 = colNum / 4 * 4; +// //int remain = colNum - colNum4; +// //for (int j = 0; j < colNum4; j += 4) { +// // __m256d vec_u = _mm256_loadu_pd(vvX[i].data() + j); +// // __m256d vec_v = _mm256_loadu_pd(vvX[cur].data() + j); +// // vec_uv = _mm256_add_pd(_mm256_mul_pd(vec_u, vec_v), vec_uv); +// //} +// //for (int j = colNum4; j < colNum; ++j) { +// // uv += vvX[i][j] * vvX[cur][j]; +// //} +// //double* pVec = (double*)&vec_uv; +// //for (int j = 0; j < 4; ++j) uv += pVec[j]; +// +// __m256 vec_uv = vec_zero; +// int colNum8 = colNum / 8 * 8; +// int remain = colNum - colNum8; +// for (int j = 0; j < colNum8; j += 8) { +// __m256 vec_u = _mm256_loadu_ps(vvX[i].data() + j); +// __m256 vec_v = _mm256_loadu_ps(vvX[cur].data() + j); +// vec_uv = _mm256_add_ps(_mm256_mul_ps(vec_u, vec_v), vec_uv); +// } +// for (int j = colNum8; j < colNum; ++j) { +// uv += vvX[i][j] * vvX[cur][j]; +// } +// float* pVec = (float*)&vec_uv; +// for (int j = 0; j < 8; ++j) uv += pVec[j]; +// +// uv /= colNum; +// const double dist = abs(1.0 - uv / sqrt(vSq[i] * vSq[cur])); +// vDist[baseIdx + idx] = dist; +// } +// } +// finish = clock(); +// cout << "计算相关距离: " << (double)(finish - mid0) / CLOCKS_PER_SEC << " s" << endl; +// for (auto& val : vDist) {cout << val << endl;} + + /* 多线程计算相关距离 */ + const int distSize = rowNum * (rowNum - 1) / 2; + const int row2 = 2 * rowNum; + vector vDist(distSize, 0.0); + mid = clock(); + vector vTP; + ThreadPool thPool(6); + int span = 32; + int rowNumSpan = rowNum / span * span; + int i = 0; + for (i = 0; i < rowNumSpan; i += span) { + // vTP.push_back({&vvX, &vDist, &vSq, i, rowNum, colNum}); + TPCorDist tp = { &vvX, &vDist, &vSq, i, i + span, rowNum, colNum }; + thPool.enqueue(ThreadCalcDist, tp); + //vTP.push_back(tp); + } + if (i < rowNum) { + TPCorDist tp = { &vvX, &vDist, &vSq, i, rowNum, rowNum, colNum }; + thPool.enqueue(ThreadCalcDist, tp); + //vTP.push_back(tp); + } + thPool.~ThreadPool(); + //kt_for(6, ThreadCalcDist, vTP); + + finish = clock(); + cout << "多线程计算相关距离: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + // for (auto& val : vDist) {cout << val << endl;} + + /* 写入结果 */ + if (nlhs > 0) { // b + mxArray* pWriteArray = NULL; + //创建一个rowNum*colNum的矩阵 + pWriteArray = mxCreateDoubleMatrix(1, distSize, mxREAL); + memcpy((void*)(mxGetPr(pWriteArray)), (void*)vDist.data(), sizeof(double) * 6); + plhs[0] = pWriteArray; // 赋值给返回值 + } + + finish = clock(); + cout << "Correlation Dist Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; +} \ No newline at end of file diff --git a/MexFunc/GaussFit.cpp b/MexFunc/GaussFit.cpp new file mode 100644 index 0000000..7b3d7e5 --- /dev/null +++ b/MexFunc/GaussFit.cpp @@ -0,0 +1,1256 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using std::cout; +using std::endl; +using namespace std; +#define M_PI 3.1415926535897932384626433832795 + + +class KMeans +{ +public: + enum InitMode + { + InitRandom, + InitManual, + InitUniform, + }; + + KMeans(int dimNum = 1, int clusterNum = 1); + ~KMeans(); + + void SetMean(int i, const double* u) { memcpy(m_means[i], u, sizeof(double) * m_dimNum); } + void SetInitMode(int i) { m_initMode = i; } + void SetMaxIterNum(int i) { m_maxIterNum = i; } + void SetEndError(double f) { m_endError = f; } + + double* GetMean(int i) { return m_means[i]; } + int GetInitMode() { return m_initMode; } + int GetMaxIterNum() { return m_maxIterNum; } + double GetEndError() { return m_endError; } + + + /* SampleFile: ... + LabelFile: