From 19bd79f88136229a3b139b2da7bf42a46a8176e0 Mon Sep 17 00:00:00 2001
From: Alex Rosen
Date: Tue, 10 Feb 2026 14:12:02 +0200
Subject: [PATCH 1/2] Fix EI/PI acquisition stability for tiny std

Avoid NaN gradients in gaussian_ei/gaussian_pi for extreme tails by
masking unstable regions, and add a regression test.

Co-authored-by: Cursor
---
 skopt/acquisition.py            | 14 ++++++++++++--
 skopt/tests/test_acquisition.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/skopt/acquisition.py b/skopt/acquisition.py
index 065dd7e..9378be5 100644
--- a/skopt/acquisition.py
+++ b/skopt/acquisition.py
@@ -225,7 +225,12 @@ def gaussian_pi(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    mask = std > 0
+    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
+    # can become numerically unstable (inf * 0 -> NaN) for very small std.
+    # See Connecteam fork fix (ct-0.9.1).
+    # mask = std > 0
+    with np.errstate(divide="ignore", invalid="ignore"):
+        mask = np.abs((y_opt - xi - mu) / std) < 35.0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     values[mask] = norm.cdf(scaled)
@@ -308,7 +313,12 @@ def gaussian_ei(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    mask = std > 0
+    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
+    # can become numerically unstable (inf * 0 -> NaN) for very small std.
+    # See Connecteam fork fix (ct-0.9.1).
+    # mask = std > 0
+    with np.errstate(divide="ignore", invalid="ignore"):
+        mask = np.abs((y_opt - xi - mu) / std) < 35.0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     cdf = norm.cdf(scaled)
diff --git a/skopt/tests/test_acquisition.py b/skopt/tests/test_acquisition.py
index 4902d53..42ca727 100644
--- a/skopt/tests/test_acquisition.py
+++ b/skopt/tests/test_acquisition.py
@@ -23,6 +23,27 @@ def predict(self, X, return_std=True):
         return np.zeros(X.shape[0]), np.ones(X.shape[0])
 
 
+class TinyStdSurrogateWithGrad:
+    """Minimal surrogate to exercise EI/PI numerical stability.
+
+    For extremely small but positive std, the EI/PI gradient expressions can
+    overflow/underflow and produce NaNs (inf * 0). We assert gradients remain
+    finite.
+    """
+
+    def predict(self, X, return_std=True, return_mean_grad=False, return_std_grad=False):
+        X = np.asarray(X)
+        mu = np.zeros(X.shape[0])
+        std = np.full(X.shape[0], 1e-160)
+
+        if return_mean_grad and return_std_grad:
+            mu_grad = np.zeros((X.shape[0], X.shape[1]))
+            std_grad = np.ones((X.shape[0], X.shape[1]))
+            return mu, std, mu_grad, std_grad
+
+        return mu, std
+
+
 # This is used to test that given constant acquisition values at
 # different points, acquisition functions "EIps" and "PIps"
 # prefer candidate points that take lesser time.
@@ -94,6 +115,18 @@ def test_acquisition_api():
         assert_raises(ValueError, method, rng.rand(10), gpr)
 
 
+@pytest.mark.fast_test
+@pytest.mark.parametrize("acq", [gaussian_pi, gaussian_ei])
+def test_ei_pi_gradients_are_finite_for_tiny_positive_std(acq):
+    X = np.zeros((1, 1))
+    model = TinyStdSurrogateWithGrad()
+
+    values, grad = acq(X, model, y_opt=1.0, xi=0.0, return_grad=True)
+
+    assert np.isfinite(values).all()
+    assert np.isfinite(grad).all()
+
+
 def check_gradient_correctness(X_new, model, acq_func, y_opt):
     def num_grad_func(x):
         return gaussian_acquisition_1D(x, model, y_opt, acq_func=acq_func)[0]

From 44e84fb34f254c238b9d65ed0e5c3e51739c914e Mon Sep 17 00:00:00 2001
From: Alex Rosen
Date: Tue, 10 Feb 2026 14:16:52 +0200
Subject: [PATCH 2/2] Keep EI/PI values and zero gradients in extreme tails

Preserve acquisition values for std>0 while returning a zero gradient
when |scaled| is extreme, avoiding NaNs from inf*0 in analytic
gradients.

Co-authored-by: Cursor
---
 skopt/acquisition.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/skopt/acquisition.py b/skopt/acquisition.py
index 9378be5..544591d 100644
--- a/skopt/acquisition.py
+++ b/skopt/acquisition.py
@@ -225,12 +225,7 @@ def gaussian_pi(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
-    # can become numerically unstable (inf * 0 -> NaN) for very small std.
-    # See Connecteam fork fix (ct-0.9.1).
-    # mask = std > 0
-    with np.errstate(divide="ignore", invalid="ignore"):
-        mask = np.abs((y_opt - xi - mu) / std) < 35.0
+    mask = std > 0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     values[mask] = norm.cdf(scaled)
@@ -239,6 +234,12 @@ def gaussian_pi(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         if not np.all(mask):
             return values, np.zeros_like(std_grad)
 
+        # In extreme tails, norm.pdf(scaled) underflows to 0 while improve_grad
+        # can overflow, leading to NaNs via inf * 0. In this regime, the true
+        # gradient tends to 0, so we return a zero gradient.
+        if np.any(np.abs(scaled) > 35.0):
+            return values, np.zeros_like(std_grad)
+
         # Substitute (y_opt - xi - mu) / sigma = t and apply chain rule.
         # improve_grad is the gradient of t wrt x.
         improve_grad = -mu_grad * std - std_grad * improve
@@ -313,12 +314,7 @@ def gaussian_ei(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
-    # can become numerically unstable (inf * 0 -> NaN) for very small std.
-    # See Connecteam fork fix (ct-0.9.1).
-    # mask = std > 0
-    with np.errstate(divide="ignore", invalid="ignore"):
-        mask = np.abs((y_opt - xi - mu) / std) < 35.0
+    mask = std > 0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     cdf = norm.cdf(scaled)
@@ -331,6 +327,12 @@ def gaussian_ei(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         if not np.all(mask):
             return values, np.zeros_like(std_grad)
 
+        # In extreme tails, norm.pdf(scaled) underflows to 0 while improve_grad
+        # can overflow, leading to NaNs via inf * 0. In this regime, the true
+        # gradient tends to 0, so we return a zero gradient.
+        if np.any(np.abs(scaled) > 35.0):
+            return values, np.zeros_like(std_grad)
+
         # Substitute (y_opt - xi - mu) / sigma = t and apply chain rule.
         # improve_grad is the gradient of t wrt x.
         improve_grad = -mu_grad * std - std_grad * improve