From 69a64179997b56ba29804286cd70b81fd92eeea6 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Thu, 1 Aug 2024 13:14:44 +0100
Subject: [PATCH 01/11] add: calc trQtQ and diagB (serial)

---
 include/gwmodelpp/GWRBasic.h | 18 +++++++
 src/gwmodelpp/GWRBasic.cpp   | 99 ++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index 30665d2..f18f489 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -51,6 +51,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     typedef arma::mat (GWRBasic::*FitCoreCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
     typedef arma::mat (GWRBasic::*FitCoreSHatCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&, arma::vec&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
     typedef arma::mat (GWRBasic::*FitCoreCVCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef double (GWRBasic::*TrQtQCalculator)();
+    typedef double (GWRBasic::*TrQtQCoreCalculator)();
+    typedef arma::vec (GWRBasic::*DiagBCalculator)(arma::uword);
+    typedef arma::vec (GWRBasic::*DiagBCoreCalculator)(arma::uword);
 
     typedef double (GWRBasic::*BandwidthSelectionCriterionCalculator)(BandwidthWeight*);        //!< \~english Declaration of criterion calculator for bandwidth selection. \~chinese 带宽优选指标计算函数声明。
     typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector<std::size_t>&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。
@@ -508,6 +512,12 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      */
     arma::mat fitBase();
 
+    double calcTrQtQBase();
+
+    arma::vec calcDiagBBase(arma::uword i)
+    {
+        return (this->*mCalcDiagBCoreFunction)(i);
+    }
 
 private:
 
@@ -517,6 +527,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     arma::mat fitCoreCVSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
+    double calcTrQtQCoreSerial();
+
+    arma::vec calcDiagBCoreSerial(arma::uword i);
+
 #ifdef ENABLE_OPENMP
 
     /**
@@ -772,6 +786,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     arma::cube mC;//!< \~english All \f$S\f$ matrices. \~chinese 所有 \f$C\f$ 矩阵。
     bool mStoreS = false; //!< \~english Whether to save S \~chinese 是否保存 S 矩阵
     bool mStoreC = false; //!< \~english Whether to save C \~chinese 是否保存 C 矩阵
+    TrQtQCalculator mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
+    TrQtQCoreCalculator mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
+    DiagBCalculator mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
+    DiagBCoreCalculator mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
 };
 
 }
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 12720c7..67669fd 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -416,6 +416,105 @@ double gwm::GWRBasic::indepVarsSelectionCriterion(const vector<size_t>& indepVar
         return DBL_MAX;
     }
 }
+double GWRBasic::calcTrQtQBase()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows;
+    if (isStoreS())
+    {
+        mat EmS = eye(nDp, nDp) - mS;
+        mat Q = trans(EmS) * EmS;
+        trQtQ = sum(diagvec(trans(Q) * Q));
+    }
+    else
+    {
+        trQtQ = (this->*mCalcTrQtQCoreFUnction)();
+    }
+    return trQtQ;
+}
+
+double GWRBasic::calcTrQtQCoreSerial()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
+    mat wspan(1, nVar, fill::ones);
+    for (uword i = 0; i < nDp; i++)
+    {
+        vec wi = mSpatialWeight.weightVector(i);
+        mat xtwi = trans(mX % (wi * wspan));
+        try
+        {
+            mat xtwxR = inv_sympd(xtwi * mX);
+            mat ci = xtwxR * xtwi;
+            mat si = mX.row(i) * inv(xtwi * mX) * xtwi;
+            vec pi = -trans(si);
+            pi(i) += 1.0;
+            double qi = sum(pi % pi);
+            trQtQ += qi * qi;
+            for (arma::uword j = i + 1; j < nDp; j++)
+            {
+                vec wj = mSpatialWeight.weightVector(j);
+                mat xtwj = trans(mX % (wj * wspan));
+                try {
+                    mat sj = mX.row(j) * inv_sympd(xtwj * mX) * xtwj;
+                    vec pj = -trans(sj);
+                    pj(j) += 1.0;
+                    double qj = sum(pi % pj);
+                    trQtQ += qj * qj * 2.0;
+                }
+                catch (const std::exception& e)
+                {
+                    return DBL_MAX;
+                }
+            }
+        }
+        catch(const std::exception& e)
+        {
+            return DBL_MAX;
+        }
+        
+    }
+    return trQtQ;
+}
+
+vec GWRBasic::calcDiagBCoreSerial(uword i)
+{
+    arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
+    vec diagB(nDp, fill::zeros), c(nDp, fill::zeros);
+    mat ek = eye(nVar, nVar);
+    mat wspan(1, nVar, fill::ones);
+    for (arma::uword j = 0; j < nDp; j++)
+    {
+        vec w = mSpatialWeight.weightVector(j);
+        mat xtw = trans(mX % (w * wspan));
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            c += C.col(i);
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+    for (arma::uword k = 0; k < nDp; k++)
+    {
+        vec w = mSpatialWeight.weightVector(k);
+        mat xtw = trans(mX % (w * wspan));
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            vec b = C.col(i);
+            diagB += (b % b - (1.0 / nDp) * (b % c));
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+    diagB = 1.0 / nDp * diagB;
+    return { sum(diagB), sum(diagB % diagB) };
+}
 
 #ifdef ENABLE_OPENMP
 mat GWRBasic::predictOmp(const mat& locations, const mat& x, const vec& y)

From d4573b3b7a139a5fc1dbcad51cb0684f1a05b8cc Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Thu, 1 Aug 2024 13:16:02 +0100
Subject: [PATCH 02/11] add: f test base

---
 include/gwmodelpp/GWRBasic.h | 22 +++++++++++++
 src/gwmodelpp/GWRBasic.cpp   | 62 ++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index f18f489..9f89e1d 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -51,6 +51,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     typedef arma::mat (GWRBasic::*FitCoreCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
     typedef arma::mat (GWRBasic::*FitCoreSHatCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&, arma::vec&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
     typedef arma::mat (GWRBasic::*FitCoreCVCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef void (GWRBasic::*FTestCalculator)();
     typedef double (GWRBasic::*TrQtQCalculator)();
     typedef double (GWRBasic::*TrQtQCoreCalculator)();
     typedef arma::vec (GWRBasic::*DiagBCalculator)(arma::uword);
@@ -59,6 +60,14 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     typedef double (GWRBasic::*BandwidthSelectionCriterionCalculator)(BandwidthWeight*);        //!< \~english Declaration of criterion calculator for bandwidth selection. \~chinese 带宽优选指标计算函数声明。
     typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector<std::size_t>&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。
 
+    struct FTestResult
+    {
+        double s = 0.0;
+        double df1 = 0.0;
+        double df2 = 0.0;
+        double p = 0.0;
+    };
+
 private:
 
     /**
@@ -400,6 +409,11 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     arma::mat fit() override;
 
+    void fTest()
+    {
+        (this->*mFTestFunction)();
+    }
+
 public:     // Implement IVariableSelectable
     Status getCriterion(const std::vector<size_t>& variables, double& criterion) override
     {
@@ -512,6 +526,8 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      */
     arma::mat fitBase();
 
+    void fTestBase();
+
     double calcTrQtQBase();
 
     arma::vec calcDiagBBase(arma::uword i)
@@ -786,6 +802,12 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     arma::cube mC;//!< \~english All \f$S\f$ matrices. \~chinese 所有 \f$C\f$ 矩阵。
     bool mStoreS = false; //!< \~english Whether to save S \~chinese 是否保存 S 矩阵
     bool mStoreC = false; //!< \~english Whether to save C \~chinese 是否保存 C 矩阵
+
+    FTestResult mF1Test;
+    FTestResult mF2Test;
+    std::vector<FTestResult> mF3Test;
+    FTestResult mF4Test;
+    FTestCalculator mFTestFunction = &GWRBasic::fTestBase;
     TrQtQCalculator mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
     TrQtQCoreCalculator mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
     DiagBCalculator mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 67669fd..e3b336f 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -1,3 +1,4 @@
+#include <gsl/gsl_cdf.h>
 #include "GWRBasic.h"
 #include "BandwidthSelector.h"
 #include "VariableForwardSelector.h"
@@ -416,6 +417,67 @@ double gwm::GWRBasic::indepVarsSelectionCriterion(const vector<size_t>& indepVar
         return DBL_MAX;
     }
 }
+
+void gwm::GWRBasic::fTestBase()
+{
+    double v1 = mSHat(0), v2 = mSHat(1);
+    double nDp = double(mCoords.n_rows), nVar = double(mX.n_cols);
+    double RSSg = RSS(mX, mY, mBetas);
+    vec betaOls = solve(mX, mY);
+    vec residualOls = mY - mX * betaOls;
+    double RSSo = sum(residualOls % residualOls);
+    double DFo = nDp - nVar;
+    double delta1 = 1.0 * nDp - 2.0 * v1 + v2;
+    double sigma21 = RSSg / delta1;
+    double lDelta1 = sum(mQDiag), lDelta2 = (this->*mCalcTrQtQFunction)();
+    //=========
+    // F1 Test
+    //=========
+    mF1Test.s = (RSSg/lDelta1)/(RSSo/DFo);
+    mF1Test.df1 = lDelta1 * lDelta1 / lDelta2;
+    mF1Test.df2 = DFo;
+    mF1Test.p = gsl_cdf_fdist_P(mF1Test.s, mF1Test.df1, mF1Test.df2);
+    //=========
+    // F2 Test
+    //=========
+    mF2Test.s = ((RSSo-RSSg)/(DFo-lDelta1))/(RSSo/DFo);
+    mF2Test.df1 = (DFo-lDelta1) * (DFo-lDelta1) / (DFo - 2 * lDelta1 + lDelta2);
+    mF2Test.df2 = DFo;
+    mF2Test.p = gsl_cdf_fdist_Q(mF2Test.s, mF2Test.df1, mF2Test.df2);
+    //=========
+    // F3 Test
+    //=========
+    mF3Test.resize(mX.n_cols);
+    vec vk2(nVar, arma::fill::zeros);
+    for (int i = 0; i < nVar; i++)
+    {
+        vec betasi = mBetas.col(i);
+        vec betasJndp = vec(nDp, fill::ones) * (sum(betasi) * 1.0 / nDp);
+        vk2(i) = (1.0 / nDp) * det(trans(betasi - betasJndp) * betasi);
+    }
+
+    for (uword i = 0; i < mX.n_cols; i++)
+    {
+        vec diagB = (this->*mCalcDiagBFunction)(i);
+        double g1 = diagB(0);
+        double g2 = diagB(1);
+        double numdf = g1 * g1 / g2;
+        FTestResult f3i;
+        f3i.s = (vk2(i) / g1) / sigma21;
+        f3i.df1 = numdf;
+        f3i.df2 = mF1Test.df1;
+        f3i.p = gsl_cdf_fdist_Q(f3i.s, numdf, mF1Test.df1);
+        mF3Test[i] = f3i;
+    }
+    //=========
+    // F4 Test
+    //=========
+    mF4Test.s = RSSg / RSSo;
+    mF4Test.df1 = delta1;
+    mF4Test.df2 = DFo;
+    mF4Test.p = gsl_cdf_fdist_P(mF4Test.s, mF4Test.df1, mF4Test.df2);
+}
+
 double GWRBasic::calcTrQtQBase()
 {
     double trQtQ = 0.0;

From a27ff1ac843658ed9b49dff1c8bd98e2eaf3e596 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Thu, 1 Aug 2024 14:43:31 +0100
Subject: [PATCH 03/11] add: multithreading implementation

---
 include/gwmodelpp/GWRBasic.h |  4 ++
 src/gwmodelpp/GWRBasic.cpp   | 97 ++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index 9f89e1d..ed9b050 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -623,6 +623,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      */
     arma::mat fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 
+    double calcTrQtQCoreOmp();
+
+    arma::vec calcDiagBCoreOmp(arma::uword i);
+
 #endif
 
 #ifdef ENABLE_CUDA
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index e3b336f..dbf11ed 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -743,6 +743,91 @@ arma::mat GWRBasic::fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const
     shat = sum(shat_all, 1);
     return betas.t();
 }
+
+double GWRBasic::calcTrQtQCoreOmp()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
+    mat wspan(1, nVar, fill::ones);
+#pragma omp parallel for num_threads(mOmpThreadNum)
+    for (uword i = 0; i < nDp; i++)
+    {
+        vec wi = mSpatialWeight.weightVector(i);
+        mat xtwi = trans(mX % (wi * wspan));
+        try
+        {
+            mat xtwxR = inv_sympd(xtwi * mX);
+            mat ci = xtwxR * xtwi;
+            mat si = mX.row(i) * inv(xtwi * mX) * xtwi;
+            vec pi = -trans(si);
+            pi(i) += 1.0;
+            double qi = sum(pi % pi);
+            trQtQ += qi * qi;
+            for (arma::uword j = i + 1; j < nDp; j++)
+            {
+                vec wj = mSpatialWeight.weightVector(j);
+                mat xtwj = trans(mX % (wj * wspan));
+                try {
+                    mat sj = mX.row(j) * inv_sympd(xtwj * mX) * xtwj;
+                    vec pj = -trans(sj);
+                    pj(j) += 1.0;
+                    double qj = sum(pi % pj);
+                    trQtQ += qj * qj * 2.0;
+                }
+                catch (const std::exception& e)
+                {
+                    return DBL_MAX;
+                }
+            }
+        }
+        catch(const std::exception& e)
+        {
+            return DBL_MAX;
+        }
+        
+    }
+    return trQtQ;
+}
+
+vec GWRBasic::calcDiagBCoreOmp(uword i)
+{
+    arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
+    vec diagB(nDp, fill::zeros), c(nDp, fill::zeros);
+    mat ek = eye(nVar, nVar);
+    mat wspan(1, nVar, fill::ones);
+    for (arma::uword j = 0; j < nDp; j++)
+    {
+        vec w = mSpatialWeight.weightVector(j);
+        mat xtw = trans(mX % (w * wspan));
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            c += C.col(i);
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+#pragma omp parallel for num_threads(mOmpThreadNum)
+    for (arma::uword k = 0; k < nDp; k++)
+    {
+        vec w = mSpatialWeight.weightVector(k);
+        mat xtw = trans(mX % (w * wspan));
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            vec b = C.col(i);
+            diagB += (b % b - (1.0 / nDp) * (b % c));
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+    diagB = 1.0 / nDp * diagB;
+    return { sum(diagB), sum(diagB % diagB) };
+}
 #endif
 
 #ifdef ENABLE_CUDA
@@ -1248,6 +1333,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreSerial;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
+            mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
 #ifdef ENABLE_OPENMP
         case ParallelType::OpenMP:
@@ -1256,6 +1343,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreOmp;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVOmp;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatOmp;
+            mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreOmp;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreOmp;
             break;
 #endif // ENABLE_OPENMP
 #ifdef ENABLE_CUDA
@@ -1272,6 +1361,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreSerial;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
+            mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
         }
 #ifdef ENABLE_MPI
@@ -1279,11 +1370,17 @@ void GWRBasic::setParallelType(const ParallelType& type)
         {
             mFitFunction = &GWRBasic::fitMpi;
             mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionMpi;
+            mFTestFunction = &GWRBasic::fTestBase;
+            mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
+            mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
         }
         else
         {
             mFitFunction = &GWRBasic::fitBase;
             mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterion;
+            mFTestFunction = &GWRBasic::fTestBase;
+            mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
+            mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
         }
 #endif
         setBandwidthSelectionCriterion(mBandwidthSelectionCriterion);

From 5c388f7fe07309d7701f2f05374765428ee736ee Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Thu, 1 Aug 2024 16:48:50 +0100
Subject: [PATCH 04/11] fix: return in omp

---
 src/gwmodelpp/GWRBasic.cpp | 88 +++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 40 deletions(-)

diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index dbf11ed..d3ab9c1 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -749,44 +749,46 @@ double GWRBasic::calcTrQtQCoreOmp()
     double trQtQ = 0.0;
     uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
     mat wspan(1, nVar, fill::ones);
+    int flag = true;
 #pragma omp parallel for num_threads(mOmpThreadNum)
     for (uword i = 0; i < nDp; i++)
     {
-        vec wi = mSpatialWeight.weightVector(i);
-        mat xtwi = trans(mX % (wi * wspan));
-        try
-        {
-            mat xtwxR = inv_sympd(xtwi * mX);
-            mat ci = xtwxR * xtwi;
-            mat si = mX.row(i) * inv(xtwi * mX) * xtwi;
-            vec pi = -trans(si);
-            pi(i) += 1.0;
-            double qi = sum(pi % pi);
-            trQtQ += qi * qi;
-            for (arma::uword j = i + 1; j < nDp; j++)
+        if (flag) {
+            vec wi = mSpatialWeight.weightVector(i);
+            mat xtwi = trans(mX % (wi * wspan));
+            try
             {
-                vec wj = mSpatialWeight.weightVector(j);
-                mat xtwj = trans(mX % (wj * wspan));
-                try {
-                    mat sj = mX.row(j) * inv_sympd(xtwj * mX) * xtwj;
-                    vec pj = -trans(sj);
-                    pj(j) += 1.0;
-                    double qj = sum(pi % pj);
-                    trQtQ += qj * qj * 2.0;
-                }
-                catch (const std::exception& e)
+                mat xtwxR = inv_sympd(xtwi * mX);
+                mat ci = xtwxR * xtwi;
+                mat si = mX.row(i) * inv(xtwi * mX) * xtwi;
+                vec pi = -trans(si);
+                pi(i) += 1.0;
+                double qi = sum(pi % pi);
+                trQtQ += qi * qi;
+                for (arma::uword j = i + 1; j < nDp; j++)
                 {
-                    return DBL_MAX;
+                    vec wj = mSpatialWeight.weightVector(j);
+                    mat xtwj = trans(mX % (wj * wspan));
+                    try {
+                        mat sj = mX.row(j) * inv_sympd(xtwj * mX) * xtwj;
+                        vec pj = -trans(sj);
+                        pj(j) += 1.0;
+                        double qj = sum(pi % pj);
+                        trQtQ += qj * qj * 2.0;
+                    }
+                    catch (const std::exception& e)
+                    {
+                        flag = false;
+                    }
                 }
             }
+            catch(const std::exception& e)
+            {
+                flag = false;
+            }
         }
-        catch(const std::exception& e)
-        {
-            return DBL_MAX;
-        }
-        
     }
-    return trQtQ;
+    return flag ? trQtQ : DBL_MAX;
 }
 
 vec GWRBasic::calcDiagBCoreOmp(uword i)
@@ -809,22 +811,28 @@ vec GWRBasic::calcDiagBCoreOmp(uword i)
             return { DBL_MAX, DBL_MAX };
         }
     }
+    int flag = true;
 #pragma omp parallel for num_threads(mOmpThreadNum)
     for (arma::uword k = 0; k < nDp; k++)
     {
-        vec w = mSpatialWeight.weightVector(k);
-        mat xtw = trans(mX % (w * wspan));
-        try
-        {
-            mat C = trans(xtw) * inv_sympd(xtw * mX);
-            vec b = C.col(i);
-            diagB += (b % b - (1.0 / nDp) * (b % c));
-        }
-        catch (const std::exception& e)
-        {
-            return { DBL_MAX, DBL_MAX };
+        if (flag) {
+            vec w = mSpatialWeight.weightVector(k);
+            mat xtw = trans(mX % (w * wspan));
+            try
+            {
+                mat C = trans(xtw) * inv_sympd(xtw * mX);
+                vec b = C.col(i);
+                diagB += (b % b - (1.0 / nDp) * (b % c));
+            }
+            catch (const std::exception& e)
+            {
+                flag = false;
+            }
         }
     }
+    if (!flag) {
+        return { DBL_MAX, DBL_MAX };
+    }
     diagB = 1.0 / nDp * diagB;
     return { sum(diagB), sum(diagB % diagB) };
 }

From a759c695c8a4577d8f6d7df4502b296de1498b92 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Thu, 1 Aug 2024 17:34:25 +0100
Subject: [PATCH 05/11] add: mat quad mpi

---
 include/gwmodelpp/utils/armampi.h |  4 ++-
 src/gwmodelpp/GWRMultiscale.cpp   |  2 +-
 src/gwmodelpp/utils/armampi.cpp   | 19 +++++++++++-
 test/mpi/testMpiMatMul.cpp        | 51 ++++++++++++++++++++++++++++++-
 4 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/include/gwmodelpp/utils/armampi.h b/include/gwmodelpp/utils/armampi.h
index d1866a0..9382261 100644
--- a/include/gwmodelpp/utils/armampi.h
+++ b/include/gwmodelpp/utils/armampi.h
@@ -7,4 +7,6 @@
 #define GWM_MPI_UWORD MPI_UNSIGNED_LONG_LONG
 #endif  // ARMA_32BIT_WORD
 
-void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np, const size_t range);
+void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np);
+
+void mat_quad_mpi(arma::mat& a, arma::mat& aTa, const int ip, const int np);
diff --git a/src/gwmodelpp/GWRMultiscale.cpp b/src/gwmodelpp/GWRMultiscale.cpp
index 3e03f4a..41a2a55 100644
--- a/src/gwmodelpp/GWRMultiscale.cpp
+++ b/src/gwmodelpp/GWRMultiscale.cpp
@@ -954,7 +954,7 @@ vec GWRMultiscale::fitVarMpi(const size_t var)
     if (mHasHatMatrix)
     {
         mat SArrayi = mSArray.slice(var) - mS0;
-        mat_mul_mpi(S, SArrayi, mSArray.slice(var), mWorkerId, mWorkerNum, mWorkRangeSize);
+        mat_mul_mpi(S, SArrayi, mSArray.slice(var), mWorkerId, mWorkerNum);
         mSArray.slice(var) += S;
         mS0 = mSArray.slice(var) - SArrayi;
     }
diff --git a/src/gwmodelpp/utils/armampi.cpp b/src/gwmodelpp/utils/armampi.cpp
index 024db9b..97594af 100644
--- a/src/gwmodelpp/utils/armampi.cpp
+++ b/src/gwmodelpp/utils/armampi.cpp
@@ -5,7 +5,7 @@
 using namespace std;
 using namespace arma;
 
-void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np, const size_t range)
+void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np)
 {
     arma::uword m = a.n_rows, n = b.n_cols;
     int k = (int) b.n_rows;
@@ -23,3 +23,20 @@ void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np, const size_
         MPI_Reduce(ci.memptr(), c.memptr(), ci.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD);
     }
 }
+
+void mat_quad_mpi(mat& a, mat& aTa, const int ip, const int np)
+{
+    arma::uword m = a.n_cols, n = a.n_rows;
+    int k = (int) a.n_rows;
+    arma::Col<int> a_rows(np, arma::fill::zeros);
+    MPI_Allgather(&k, 1, MPI_INT, a_rows.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    aTa = mat(n, m, arma::fill::zeros);
+    mat a_buf;
+    for (int pi = 0; pi < np; pi++)
+    {
+        arma::Col<int> a_disp = arma::cumsum(a_rows) - a_rows;
+        a_buf = a.cols(a_disp(pi), a_disp(pi) + a_rows(pi) - 1);
+        mat aTai = a_buf.t() * a;
+        MPI_Reduce(aTai.memptr(), aTa.memptr(), aTai.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD);
+    }
+}
diff --git a/test/mpi/testMpiMatMul.cpp b/test/mpi/testMpiMatMul.cpp
index dc70b5c..c2979a4 100644
--- a/test/mpi/testMpiMatMul.cpp
+++ b/test/mpi/testMpiMatMul.cpp
@@ -32,7 +32,56 @@ TEST_CASE("mat mul mpi")
     A = A_all.rows(from, to - 1);
     B = B_all.rows(from, to - 1);
     // printf("process %d begin mat mul\n", ip);
-    REQUIRE_NOTHROW(mat_mul_mpi(A, B, C, ip, np, size));
+    REQUIRE_NOTHROW(mat_mul_mpi(A, B, C, ip, np));
+    // printf("process %d end mat mul\n", ip);
+
+    if (ip == 0)
+    {
+        mat C_gather(n, n);
+        C_gather.rows(from, to - 1) = C;
+        uvec received(np, fill::zeros);
+        received(0) = 1;
+        auto bufsize = size * n;
+        double *buf = new double[size * n];
+        while (!all(received == 1))
+        {
+            MPI_Status status;
+            MPI_Recv(buf, bufsize, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
+            received(status.MPI_SOURCE) = 1;
+            uword ifrom = status.MPI_SOURCE * size, ito = min(ifrom + size, n), irange = ito - ifrom; 
+            C_gather.rows(ifrom, ito - 1) = mat(buf, irange, n);
+        }
+        REQUIRE(approx_equal(C_gather, C_all, "absdiff", 1e-6));
+    }
+    else
+    {
+        MPI_Send(C.memptr(), C.n_elem, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+    }
+}
+
+TEST_CASE("mat quad mpi")
+{
+    int ip, np;
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+    MPI_Comm_rank(MPI_COMM_WORLD, &ip);
+
+    uword n = 100;
+    uword size = n / np + (n % np == 0 ? 0 : 1);
+    uword from = ip * size, to = min(from + size, n);
+
+    // printf("range from %llu to %llu\n", from, to);
+
+    mat A_all, C_all;
+    A_all = mat(n, n, arma::fill::randn);
+    if (ip == 0)
+    {
+        C_all = A_all.t() * A_all;
+    }
+
+    mat A, C;
+    A = A_all.rows(from, to - 1);
+    // printf("process %d begin mat mul\n", ip);
+    REQUIRE_NOTHROW(mat_quad_mpi(A, C, ip, np));
     // printf("process %d end mat mul\n", ip);
 
     if (ip == 0)

From d4fcd5c1808e5718bf202b0d4259d35ee0e186fe Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Thu, 1 Aug 2024 17:51:28 +0100
Subject: [PATCH 06/11] add: mat trace mpi

---
 include/gwmodelpp/utils/armampi.h |  2 ++
 src/gwmodelpp/utils/armampi.cpp   | 13 +++++++++++
 test/mpi/testMpiMatMul.cpp        | 36 +++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/include/gwmodelpp/utils/armampi.h b/include/gwmodelpp/utils/armampi.h
index 9382261..9fa1758 100644
--- a/include/gwmodelpp/utils/armampi.h
+++ b/include/gwmodelpp/utils/armampi.h
@@ -10,3 +10,5 @@
 void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np);
 
 void mat_quad_mpi(arma::mat& a, arma::mat& aTa, const int ip, const int np);
+
+double mat_trace_mpi(arma::mat& a, const int ip, const int np);
diff --git a/src/gwmodelpp/utils/armampi.cpp b/src/gwmodelpp/utils/armampi.cpp
index 97594af..69c3f1b 100644
--- a/src/gwmodelpp/utils/armampi.cpp
+++ b/src/gwmodelpp/utils/armampi.cpp
@@ -40,3 +40,16 @@ void mat_quad_mpi(mat& a, mat& aTa, const int ip, const int np)
         MPI_Reduce(aTai.memptr(), aTa.memptr(), aTai.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD);
     }
 }
+
+double mat_trace_mpi(arma::mat& a, const int ip, const int np)
+{
+    int k = (int) a.n_rows;
+    arma::Col<int> a_rows(np, arma::fill::zeros);
+    MPI_Allgather(&k, 1, MPI_INT, a_rows.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    arma::Col<int> a_disp = arma::cumsum(a_rows) - a_rows;
+    mat ai = a.cols(a_disp(ip), a_disp(ip) + a_rows(ip) - 1);
+    double ti = trace(ai);
+    double tr = 0.0;
+    MPI_Allreduce(&ti, &tr, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    return tr;
+}
diff --git a/test/mpi/testMpiMatMul.cpp b/test/mpi/testMpiMatMul.cpp
index c2979a4..2bb7299 100644
--- a/test/mpi/testMpiMatMul.cpp
+++ b/test/mpi/testMpiMatMul.cpp
@@ -108,6 +108,42 @@ TEST_CASE("mat quad mpi")
     }
 }
 
+TEST_CASE("mat trace mpi")
+{
+    int ip, np;
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+    MPI_Comm_rank(MPI_COMM_WORLD, &ip);
+
+    uword n = 100;
+    uword size = n / np + (n % np == 0 ? 0 : 1);
+    uword from = ip * size, to = min(from + size, n);
+
+    // printf("range from %llu to %llu\n", from, to);
+
+    mat A_all;
+    double trA_all;
+    A_all = mat(n, n, arma::fill::randn);
+    if (ip == 0)
+    {
+        trA_all = trace(A_all);
+    }
+
+    mat A;
+    double trA = 0.0;
+    A = A_all.rows(from, to - 1);
+    // printf("process %d begin mat mul\n", ip);
+    REQUIRE_NOTHROW(([&]()
+    {
+        trA = mat_trace_mpi(A, ip, np);
+    })());
+    // printf("process %d end mat mul\n", ip);
+
+    if (ip == 0)
+    {
+        REQUIRE_THAT(trA, Catch::Matchers::WithinAbs(trA_all, 1e-6));
+    }
+}
+
 int main(int argc, char *argv[])
 {
     MPI_Init(&argc, &argv);

From 8fffcd6e289ed60d383fea31ce586248664ea38e Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Fri, 2 Aug 2024 11:42:46 +0100
Subject: [PATCH 07/11] add: MPI version of FTest

---
 include/gwmodelpp/GWRBasic.h |  22 ++++---
 src/gwmodelpp/GWRBasic.cpp   | 112 ++++++++++++++++++-----------------
 2 files changed, 72 insertions(+), 62 deletions(-)

diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index ed9b050..395b48c 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -55,7 +55,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     typedef double (GWRBasic::*TrQtQCalculator)();
     typedef double (GWRBasic::*TrQtQCoreCalculator)();
     typedef arma::vec (GWRBasic::*DiagBCalculator)(arma::uword);
-    typedef arma::vec (GWRBasic::*DiagBCoreCalculator)(arma::uword);
+    typedef arma::vec (GWRBasic::*DiagBCoreCalculator)(arma::uword, const arma::vec&);
 
     typedef double (GWRBasic::*BandwidthSelectionCriterionCalculator)(BandwidthWeight*);        //!< \~english Declaration of criterion calculator for bandwidth selection. \~chinese 带宽优选指标计算函数声明。
     typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector<std::size_t>&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。
@@ -68,6 +68,8 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
         double p = 0.0;
     };
 
+    using FTestResultCombine = std::tuple<FTestResult, FTestResult, std::vector<FTestResult>, FTestResult>;
+
 private:
 
     /**
@@ -401,6 +403,11 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     void setStoreC(bool flag) { mStoreC = flag; }
 
+    FTestResultCombine fTestResults()
+    {
+        return std::make_tuple(mF1Test, mF2Test, mF3Test, mF4Test);
+    }
+
 public:     // Implement Algorithm
     bool isValid() override;
 
@@ -530,10 +537,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     double calcTrQtQBase();
 
-    arma::vec calcDiagBBase(arma::uword i)
-    {
-        return (this->*mCalcDiagBCoreFunction)(i);
-    }
+    arma::vec calcDiagBBase(arma::uword i);
 
 private:
 
@@ -545,7 +549,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     double calcTrQtQCoreSerial();
 
-    arma::vec calcDiagBCoreSerial(arma::uword i);
+    arma::vec calcDiagBCoreSerial(arma::uword i, const arma::vec& c);
 
 #ifdef ENABLE_OPENMP
 
@@ -625,7 +629,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     double calcTrQtQCoreOmp();
 
-    arma::vec calcDiagBCoreOmp(arma::uword i);
+    arma::vec calcDiagBCoreOmp(arma::uword i, const arma::vec& c);
 
 #endif
 
@@ -712,6 +716,8 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     double bandwidthSizeCriterionCVMpi(BandwidthWeight* bandwidthWeight);
     double bandwidthSizeCriterionAICMpi(BandwidthWeight* bandwidthWeight);
     arma::mat fitMpi();
+    double calcTrQtQMpi();
+    arma::vec calcDiagBMpi(arma::uword i);
 #endif // ENABLE_MPI
 
 public:     // Implement IParallelizable
@@ -813,7 +819,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     FTestResult mF4Test;
     FTestCalculator mFTestFunction = &GWRBasic::fTestBase;
     TrQtQCalculator mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
-    TrQtQCoreCalculator mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
+    TrQtQCoreCalculator mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
     DiagBCalculator mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
     DiagBCoreCalculator mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
 };
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index d3ab9c1..97dfd8e 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -490,7 +490,7 @@ double GWRBasic::calcTrQtQBase()
     }
     else
     {
-        trQtQ = (this->*mCalcTrQtQCoreFUnction)();
+        trQtQ = (this->*mCalcTrQtQCoreFunction)();
     }
     return trQtQ;
 }
@@ -500,7 +500,8 @@ double GWRBasic::calcTrQtQCoreSerial()
     double trQtQ = 0.0;
     uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
     mat wspan(1, nVar, fill::ones);
-    for (uword i = 0; i < nDp; i++)
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         vec wi = mSpatialWeight.weightVector(i);
         mat xtwi = trans(mX % (wi * wspan));
@@ -539,16 +540,14 @@ double GWRBasic::calcTrQtQCoreSerial()
     return trQtQ;
 }
 
-vec GWRBasic::calcDiagBCoreSerial(uword i)
+arma::vec GWRBasic::calcDiagBBase(arma::uword i)
 {
     arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
-    vec diagB(nDp, fill::zeros), c(nDp, fill::zeros);
-    mat ek = eye(nVar, nVar);
-    mat wspan(1, nVar, fill::ones);
+    vec c(nDp, fill::zeros);
     for (arma::uword j = 0; j < nDp; j++)
     {
         vec w = mSpatialWeight.weightVector(j);
-        mat xtw = trans(mX % (w * wspan));
+        mat xtw = trans(mX.each_col() % w);
         try
         {
             mat C = trans(xtw) * inv_sympd(xtw * mX);
@@ -559,7 +558,16 @@ vec GWRBasic::calcDiagBCoreSerial(uword i)
             return { DBL_MAX, DBL_MAX };
         }
     }
-    for (arma::uword k = 0; k < nDp; k++)
+    return (this->*mCalcDiagBCoreFunction)(i);
+}
+
+vec GWRBasic::calcDiagBCoreSerial(uword i, const vec& c)
+{
+    arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
+    vec diagB(nDp, fill::zeros)
+    mat ek = eye(nVar, nVar);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (arma::uword k = workRange.first; k < workRange.second; k++)
     {
         vec w = mSpatialWeight.weightVector(k);
         mat xtw = trans(mX % (w * wspan));
@@ -749,9 +757,10 @@ double GWRBasic::calcTrQtQCoreOmp()
     double trQtQ = 0.0;
     uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
     mat wspan(1, nVar, fill::ones);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     int flag = true;
 #pragma omp parallel for num_threads(mOmpThreadNum)
-    for (uword i = 0; i < nDp; i++)
+    for (uword i = workRange.first; i < workRange.second; i++)
     {
         if (flag) {
             vec wi = mSpatialWeight.weightVector(i);
@@ -791,33 +800,19 @@ double GWRBasic::calcTrQtQCoreOmp()
     return flag ? trQtQ : DBL_MAX;
 }
 
-vec GWRBasic::calcDiagBCoreOmp(uword i)
+vec GWRBasic::calcDiagBCoreOmp(uword i, const vec& c)
 {
     arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
-    vec diagB(nDp, fill::zeros), c(nDp, fill::zeros);
+    vec diagB(nDp, fill::zeros);
     mat ek = eye(nVar, nVar);
-    mat wspan(1, nVar, fill::ones);
-    for (arma::uword j = 0; j < nDp; j++)
-    {
-        vec w = mSpatialWeight.weightVector(j);
-        mat xtw = trans(mX % (w * wspan));
-        try
-        {
-            mat C = trans(xtw) * inv_sympd(xtw * mX);
-            c += C.col(i);
-        }
-        catch (const std::exception& e)
-        {
-            return { DBL_MAX, DBL_MAX };
-        }
-    }
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     int flag = true;
 #pragma omp parallel for num_threads(mOmpThreadNum)
-    for (arma::uword k = 0; k < nDp; k++)
+    for (arma::uword k = workRange.first; k < workRange.second; k++)
     {
         if (flag) {
             vec w = mSpatialWeight.weightVector(k);
-            mat xtw = trans(mX % (w * wspan));
+            mat xtw = trans(mX.each_col() % w);
             try
             {
                 mat C = trans(xtw) * inv_sympd(xtw * mX);
@@ -830,7 +825,8 @@ vec GWRBasic::calcDiagBCoreOmp(uword i)
             }
         }
     }
-    if (!flag) {
+    if (!flag)
+    {
         return { DBL_MAX, DBL_MAX };
     }
     diagB = 1.0 / nDp * diagB;
@@ -1274,26 +1270,34 @@ arma::mat gwm::GWRBasic::fitMpi()
     return mBetas;
 }
 
-// double GWRBasic::indepVarsSelectionCriterion(const mat& x, const vec& y, vec& shat)
-// {
-//     try
-//     {
-//         mat S;
-//         mat betas = (this->*mFitCoreSHatFunction)(x, y, mSpatialWeight, shat, S);
-//         GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
-//         if (mStatus == Status::Success)
-//         {
-//             double rss = GWRBase::RSS(x, y, betas);
-//             return rss;
-//         }
-//         else return DBL_MAX;
-//     }
-//     catch(const std::exception& e)
-//     {
-//         return DBL_MAX;
-//     }
-    
-// }
+double GWRBasic::calcTrQtQMpi()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows;
+    if (isStoreS())
+    {
+        std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+        mat EmS = eye(nDp, nDp).eval().rows(workRange.first, workRange.second - 1) - mS;
+        mat Q, QtQ;
+        mat_quad_mpi(EmS, Q, mWorkerId, mWorkerNum);
+        mat_quad_mpi(Q, QtQ, mWorkerId, mWorkerNum);
+        trQtQ = mat_trace_mpi(QtQ, mWorkerId, mWorkerNum);
+    }
+    else
+    {
+        double trQtQi = (this->*mCalcTrQtQCoreFunction)();
+        MPI_Allreduce(&trQtQi, &trQtQ, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+    return trQtQ;
+}
+
+vec GWRBasic::calcDiagBMpi(uword i)
+{
+    vec diagBi = calcDiagBBase(i);
+    vec diagB;
+    MPI_Allreduce(diagBi.memptr(), diagB.memptr(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    return diagB;
+}
 #endif // ENABLE_MPI
 
 void GWRBasic::setBandwidthSelectionCriterion(const BandwidthSelectionCriterionType& criterion)
@@ -1341,7 +1345,7 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreSerial;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
-            mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
             mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
 #ifdef ENABLE_OPENMP
@@ -1351,7 +1355,7 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreOmp;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVOmp;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatOmp;
-            mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreOmp;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreOmp;
             mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreOmp;
             break;
 #endif // ENABLE_OPENMP
@@ -1369,7 +1373,7 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreSerial;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
-            mCalcTrQtQCoreFUnction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
             mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
         }
@@ -1379,8 +1383,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitFunction = &GWRBasic::fitMpi;
             mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionMpi;
             mFTestFunction = &GWRBasic::fTestBase;
-            mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
-            mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
+            mCalcTrQtQFunction = &GWRBasic::calcTrQtQMpi;
+            mCalcDiagBFunction = &GWRBasic::calcDiagBMpi;
         }
         else
         {

From 29c885312d6eeef19724253b6f85da73217e2bc3 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Fri, 2 Aug 2024 11:46:17 +0100
Subject: [PATCH 08/11] fix: remove ek in diag B calculator

---
 src/gwmodelpp/GWRBasic.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 97dfd8e..81d6491 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -558,22 +558,21 @@ arma::vec GWRBasic::calcDiagBBase(arma::uword i)
             return { DBL_MAX, DBL_MAX };
         }
     }
-    return (this->*mCalcDiagBCoreFunction)(i);
+    return (this->*mCalcDiagBCoreFunction)(i, c);
 }
 
 vec GWRBasic::calcDiagBCoreSerial(uword i, const vec& c)
 {
     arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
     vec diagB(nDp, fill::zeros)
-    mat ek = eye(nVar, nVar);
     std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     for (arma::uword k = workRange.first; k < workRange.second; k++)
     {
         vec w = mSpatialWeight.weightVector(k);
-        mat xtw = trans(mX % (w * wspan));
+        mat xtw = (mX.each_col() % w).t();
         try
         {
-            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            mat C = xtw.t() * inv_sympd(xtw * mX);
             vec b = C.col(i);
             diagB += (b % b - (1.0 / nDp) * (b % c));
         }
@@ -804,7 +803,6 @@ vec GWRBasic::calcDiagBCoreOmp(uword i, const vec& c)
 {
     arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
     vec diagB(nDp, fill::zeros);
-    mat ek = eye(nVar, nVar);
     std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     int flag = true;
 #pragma omp parallel for num_threads(mOmpThreadNum)

From 811731689dbb7b7e0f2316bd37f59835dd1e0da3 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Fri, 2 Aug 2024 12:33:18 +0100
Subject: [PATCH 09/11] add: test for GWR f test

---
 include/gwmodelpp/GWRBasic.h |  5 +++
 src/gwmodelpp/GWRBasic.cpp   | 18 ++++++---
 test/testGWRBasic.cpp        | 73 ++++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index 395b48c..5e6905f 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -403,6 +403,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     void setStoreC(bool flag) { mStoreC = flag; }
 
+    bool isDoFTest() { return mIsDoFTest; };
+
+    void setIsDoFtest(bool value) { mIsDoFTest = value; }
+
     FTestResultCombine fTestResults()
     {
         return std::make_tuple(mF1Test, mF2Test, mF3Test, mF4Test);
@@ -813,6 +817,7 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     bool mStoreS = false; //!< \~english Whether to save S \~chinese 是否保存 S 矩阵
     bool mStoreC = false; //!< \~english Whether to save C \~chinese 是否保存 C 矩阵
 
+    bool mIsDoFTest = false;
     FTestResult mF1Test;
     FTestResult mF2Test;
     std::vector<FTestResult> mF3Test;
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 81d6491..3906913 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -159,6 +159,12 @@ mat GWRBasic::fit()
 
     GWM_LOG_STAGE("Model fitting");
     mBetas = (this->*mFitFunction)();
+
+    if (mIsDoFTest)
+    {
+        GWM_LOG_STAGE("F Test");
+        (this->*mFTestFunction)();
+    }
     
 #ifdef ENABLE_CUDA
     if (mParallelType & ParallelType::CUDA)
@@ -423,11 +429,11 @@ void gwm::GWRBasic::fTestBase()
     double v1 = mSHat(0), v2 = mSHat(1);
     double nDp = double(mCoords.n_rows), nVar = double(mX.n_cols);
     double RSSg = RSS(mX, mY, mBetas);
-    vec betaOls = solve(mX, mY);
+    vec betaOls = (mX.t() * mX).i() * (mX.t() * mY);
     vec residualOls = mY - mX * betaOls;
     double RSSo = sum(residualOls % residualOls);
     double DFo = nDp - nVar;
-    double delta1 = 1.0 * nDp - 2.0 * v1 + v2;
+    double delta1 = nDp - 2.0 * v1 + v2;
     double sigma21 = RSSg / delta1;
     double lDelta1 = sum(mQDiag), lDelta2 = (this->*mCalcTrQtQFunction)();
     //=========
@@ -542,7 +548,7 @@ double GWRBasic::calcTrQtQCoreSerial()
 
 arma::vec GWRBasic::calcDiagBBase(arma::uword i)
 {
-    arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
+    arma::uword nDp = mX.n_rows;
     vec c(nDp, fill::zeros);
     for (arma::uword j = 0; j < nDp; j++)
     {
@@ -563,8 +569,8 @@ arma::vec GWRBasic::calcDiagBBase(arma::uword i)
 
 vec GWRBasic::calcDiagBCoreSerial(uword i, const vec& c)
 {
-    arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
-    vec diagB(nDp, fill::zeros)
+    arma::uword nDp = mX.n_rows;
+    vec diagB(nDp, fill::zeros);
     std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     for (arma::uword k = workRange.first; k < workRange.second; k++)
     {
@@ -801,7 +807,7 @@ double GWRBasic::calcTrQtQCoreOmp()
 
 vec GWRBasic::calcDiagBCoreOmp(uword i, const vec& c)
 {
-    arma::uword nDp = mX.n_rows, nVar = mX.n_cols;
+    arma::uword nDp = mX.n_rows;
     vec diagB(nDp, fill::zeros);
     std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     int flag = true;
diff --git a/test/testGWRBasic.cpp b/test/testGWRBasic.cpp
index b1e76f0..f490ab9 100644
--- a/test/testGWRBasic.cpp
+++ b/test/testGWRBasic.cpp
@@ -1,6 +1,7 @@
 #define CATCH_CONFIG_MAIN
 #include <catch2/catch_all.hpp>
 
+#include <array>
 #include <vector>
 #include <string>
 #include <armadillo>
@@ -20,6 +21,11 @@ using namespace std;
 using namespace arma;
 using namespace gwm;
 
+array<double, 4> convFTestArray(GWRBasic::FTestResult f)
+{
+    return { f.s, f.df1, f.df2, f.p };
+}
+
 TEST_CASE("BasicGWR: LondonHP")
 {
     mat londonhp100_coord, londonhp100_data;
@@ -259,6 +265,73 @@ TEST_CASE("BasicGWR: LondonHP")
         REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.678982114793865, 1e-8));
     }
 
+    SECTION("F test | adaptive bandwidth | no bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setParallelType(parallel);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::OpenMP)
+        {
+            algorithm.setOmpThreadNum(omp_get_num_threads());
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(64);
+        }
+#endif // ENABLE_CUDA
+        algorithm.setIsDoFtest(true);
+        REQUIRE_NOTHROW(algorithm.fit());
+        auto results = algorithm.fTestResults();
+        auto f1 = convFTestArray(get<0>(results));
+        auto f2 = convFTestArray(get<1>(results));
+        vector<array<double, 4>> f3;
+        for (auto &&i : get<2>(results))
+        {
+            f3.push_back(convFTestArray(i));
+        }
+        auto f4 = convFTestArray(get<3>(results));
+        REQUIRE_THAT(f1[0], Catch::Matchers::WithinAbs(0.9342, 1e-3));
+        REQUIRE_THAT(f1[1], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f1[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+        REQUIRE_THAT(f1[3], Catch::Matchers::WithinAbs(0.3710, 1e-3));
+        REQUIRE_THAT(f2[0], Catch::Matchers::WithinAbs(1.9762, 1e-3));
+        REQUIRE_THAT(f2[1], Catch::Matchers::WithinAbs(13.1571, 1e-3));
+        REQUIRE_THAT(f2[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+        REQUIRE_THAT(f2[3], Catch::Matchers::WithinAbs(0.0303, 1e-3));
+        REQUIRE_THAT(f4[0], Catch::Matchers::WithinAbs(0.8752, 1e-3));
+        REQUIRE_THAT(f4[1], Catch::Matchers::WithinAbs(89.9377, 1e-3));
+        REQUIRE_THAT(f4[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+        REQUIRE_THAT(f4[3], Catch::Matchers::WithinAbs(0.2619, 1e-3));
+        REQUIRE_THAT(f3[0][0], Catch::Matchers::WithinAbs(0.4655, 1e-3));
+        REQUIRE_THAT(f3[0][1], Catch::Matchers::WithinAbs(27.1298, 1e-3));
+        REQUIRE_THAT(f3[0][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[0][3], Catch::Matchers::WithinAbs(0.9872, 1e-3));
+        REQUIRE_THAT(f3[1][0], Catch::Matchers::WithinAbs(0.5022, 1e-3));
+        REQUIRE_THAT(f3[1][1], Catch::Matchers::WithinAbs(18.3315, 1e-3));
+        REQUIRE_THAT(f3[1][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[1][3], Catch::Matchers::WithinAbs(0.9526, 1e-3));
+        REQUIRE_THAT(f3[2][0], Catch::Matchers::WithinAbs(0.6415, 1e-3));
+        REQUIRE_THAT(f3[2][1], Catch::Matchers::WithinAbs(29.6827, 1e-3));
+        REQUIRE_THAT(f3[2][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[2][3], Catch::Matchers::WithinAbs(0.9151, 1e-3));
+        REQUIRE_THAT(f3[3][0], Catch::Matchers::WithinAbs(0.3019, 1e-3));
+        REQUIRE_THAT(f3[3][1], Catch::Matchers::WithinAbs(24.7164, 1e-3));
+        REQUIRE_THAT(f3[3][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[3][3], Catch::Matchers::WithinAbs(0.9994, 1e-3));
+    }
 }
 
 

From 24404242c0387e66081bb9788c518d308808b869 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Fri, 2 Aug 2024 13:12:54 +0100
Subject: [PATCH 10/11] fix: MPI problems

---
 src/gwmodelpp/GWRBasic.cpp   | 49 +++++++++++++++-------
 test/mpi/testGWRBasicMpi.cpp | 78 ++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 3906913..4c56b1d 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -465,8 +465,7 @@ void gwm::GWRBasic::fTestBase()
     for (uword i = 0; i < mX.n_cols; i++)
     {
         vec diagB = (this->*mCalcDiagBFunction)(i);
-        double g1 = diagB(0);
-        double g2 = diagB(1);
+        double g1 = diagB(0), g2 = diagB(1);
         double numdf = g1 * g1 / g2;
         FTestResult f3i;
         f3i.s = (vk2(i) / g1) / sigma21;
@@ -564,7 +563,9 @@ arma::vec GWRBasic::calcDiagBBase(arma::uword i)
             return { DBL_MAX, DBL_MAX };
         }
     }
-    return (this->*mCalcDiagBCoreFunction)(i, c);
+    vec diagB = (this->*mCalcDiagBCoreFunction)(i, c);
+    diagB = 1.0 / nDp * diagB;
+    return { sum(diagB), sum(diagB % diagB) };
 }
 
 vec GWRBasic::calcDiagBCoreSerial(uword i, const vec& c)
@@ -584,11 +585,11 @@ vec GWRBasic::calcDiagBCoreSerial(uword i, const vec& c)
         }
         catch (const std::exception& e)
         {
-            return { DBL_MAX, DBL_MAX };
+            diagB.fill(DBL_MAX);
+            return diagB;
         }
     }
-    diagB = 1.0 / nDp * diagB;
-    return { sum(diagB), sum(diagB % diagB) };
+    return diagB;
 }
 
 #ifdef ENABLE_OPENMP
@@ -808,20 +809,21 @@ double GWRBasic::calcTrQtQCoreOmp()
 vec GWRBasic::calcDiagBCoreOmp(uword i, const vec& c)
 {
     arma::uword nDp = mX.n_rows;
-    vec diagB(nDp, fill::zeros);
+    mat diagB_all(nDp, mOmpThreadNum, fill::zeros);
     std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
     int flag = true;
 #pragma omp parallel for num_threads(mOmpThreadNum)
     for (arma::uword k = workRange.first; k < workRange.second; k++)
     {
         if (flag) {
+            int thread = omp_get_thread_num();
             vec w = mSpatialWeight.weightVector(k);
             mat xtw = trans(mX.each_col() % w);
             try
             {
                 mat C = trans(xtw) * inv_sympd(xtw * mX);
                 vec b = C.col(i);
-                diagB += (b % b - (1.0 / nDp) * (b % c));
+                diagB_all.col(thread) += (b % b - (1.0 / nDp) * (b % c));
             }
             catch (const std::exception& e)
             {
@@ -831,10 +833,10 @@ vec GWRBasic::calcDiagBCoreOmp(uword i, const vec& c)
     }
     if (!flag)
     {
-        return { DBL_MAX, DBL_MAX };
+        diagB_all.fill(DBL_MAX);
     }
-    diagB = 1.0 / nDp * diagB;
-    return { sum(diagB), sum(diagB % diagB) };
+    vec diagB = sum(diagB_all, 1);
+    return diagB;
 }
 #endif
 
@@ -1297,10 +1299,27 @@ double GWRBasic::calcTrQtQMpi()
 
 vec GWRBasic::calcDiagBMpi(uword i)
 {
-    vec diagBi = calcDiagBBase(i);
-    vec diagB;
-    MPI_Allreduce(diagBi.memptr(), diagB.memptr(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    return diagB;
+    arma::uword nDp = mX.n_rows;
+    vec c(nDp, fill::zeros);
+    for (arma::uword j = 0; j < nDp; j++)
+    {
+        vec w = mSpatialWeight.weightVector(j);
+        mat xtw = trans(mX.each_col() % w);
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            c += C.col(i);
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+    vec diagBi = (this->*mCalcDiagBCoreFunction)(i, c);
+    vec diagB(nDp, arma::fill::zeros);
+    MPI_Allreduce(diagBi.memptr(), diagB.memptr(), int(nDp), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    diagB = 1.0 / double(nDp) * diagB;
+    return { sum(diagB), sum(diagB % diagB) };
 }
 #endif // ENABLE_MPI
 
diff --git a/test/mpi/testGWRBasicMpi.cpp b/test/mpi/testGWRBasicMpi.cpp
index 2c44061..d435540 100644
--- a/test/mpi/testGWRBasicMpi.cpp
+++ b/test/mpi/testGWRBasicMpi.cpp
@@ -21,6 +21,11 @@ using namespace std;
 using namespace arma;
 using namespace gwm;
 
+array<double, 4> convFTestArray(GWRBasic::FTestResult f)
+{
+    return { f.s, f.df1, f.df2, f.p };
+}
+
 TEST_CASE("BasicGWR: LondonHP")
 {
     int iProcess, nProcess;
@@ -303,6 +308,79 @@ TEST_CASE("BasicGWR: LondonHP")
         }
     }
 
+    SECTION("F test | adaptive bandwidth | no bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::OpenMP)
+        {
+            algorithm.setOmpThreadNum(omp_get_num_threads());
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(64);
+        }
+#endif // ENABLE_CUDA
+        algorithm.setIsDoFtest(true);
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            auto results = algorithm.fTestResults();
+            auto f1 = convFTestArray(get<0>(results));
+            auto f2 = convFTestArray(get<1>(results));
+            vector<array<double, 4>> f3;
+            for (auto &&i : get<2>(results))
+            {
+                f3.push_back(convFTestArray(i));
+            }
+            auto f4 = convFTestArray(get<3>(results));
+            REQUIRE_THAT(f1[0], Catch::Matchers::WithinAbs(0.9342, 1e-3));
+            REQUIRE_THAT(f1[1], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f1[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+            REQUIRE_THAT(f1[3], Catch::Matchers::WithinAbs(0.3710, 1e-3));
+            REQUIRE_THAT(f2[0], Catch::Matchers::WithinAbs(1.9762, 1e-3));
+            REQUIRE_THAT(f2[1], Catch::Matchers::WithinAbs(13.1571, 1e-3));
+            REQUIRE_THAT(f2[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+            REQUIRE_THAT(f2[3], Catch::Matchers::WithinAbs(0.0303, 1e-3));
+            REQUIRE_THAT(f4[0], Catch::Matchers::WithinAbs(0.8752, 1e-3));
+            REQUIRE_THAT(f4[1], Catch::Matchers::WithinAbs(89.9377, 1e-3));
+            REQUIRE_THAT(f4[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+            REQUIRE_THAT(f4[3], Catch::Matchers::WithinAbs(0.2619, 1e-3));
+            REQUIRE_THAT(f3[0][0], Catch::Matchers::WithinAbs(0.4655, 1e-3));
+            REQUIRE_THAT(f3[0][1], Catch::Matchers::WithinAbs(27.1298, 1e-3));
+            REQUIRE_THAT(f3[0][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[0][3], Catch::Matchers::WithinAbs(0.9872, 1e-3));
+            REQUIRE_THAT(f3[1][0], Catch::Matchers::WithinAbs(0.5022, 1e-3));
+            REQUIRE_THAT(f3[1][1], Catch::Matchers::WithinAbs(18.3315, 1e-3));
+            REQUIRE_THAT(f3[1][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[1][3], Catch::Matchers::WithinAbs(0.9526, 1e-3));
+            REQUIRE_THAT(f3[2][0], Catch::Matchers::WithinAbs(0.6415, 1e-3));
+            REQUIRE_THAT(f3[2][1], Catch::Matchers::WithinAbs(29.6827, 1e-3));
+            REQUIRE_THAT(f3[2][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[2][3], Catch::Matchers::WithinAbs(0.9151, 1e-3));
+            REQUIRE_THAT(f3[3][0], Catch::Matchers::WithinAbs(0.3019, 1e-3));
+            REQUIRE_THAT(f3[3][1], Catch::Matchers::WithinAbs(24.7164, 1e-3));
+            REQUIRE_THAT(f3[3][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[3][3], Catch::Matchers::WithinAbs(0.9994, 1e-3));
+        }
+    }
+
 }
 
 int main(int argc, char *argv[])

From 4de2600d92649aaf468fe99240430fba6f881058 Mon Sep 17 00:00:00 2001
From: HPDell <huyg0180110559@outlook.com>
Date: Fri, 2 Aug 2024 13:54:46 +0100
Subject: [PATCH 11/11] edit: use serial implementation under CUDA parallel by
 default

---
 src/gwmodelpp/GWRBasic.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 4c56b1d..acd771c 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -1389,6 +1389,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreCuda;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVCuda;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatCuda;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
 #endif // ENABLE_CUDA
         default: