From 19bd79f88136229a3b139b2da7bf42a46a8176e0 Mon Sep 17 00:00:00 2001
From: Alex Rosen
Date: Tue, 10 Feb 2026 14:12:02 +0200
Subject: [PATCH 1/2] Fix EI/PI acquisition stability for tiny std

Avoid NaN gradients in gaussian_ei/gaussian_pi for extreme tails by
masking unstable regions, and add a regression test.

Co-authored-by: Cursor
---
 skopt/acquisition.py            | 14 ++++++++++++--
 skopt/tests/test_acquisition.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/skopt/acquisition.py b/skopt/acquisition.py
index 065dd7e..9378be5 100644
--- a/skopt/acquisition.py
+++ b/skopt/acquisition.py
@@ -225,7 +225,12 @@ def gaussian_pi(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    mask = std > 0
+    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
+    # can become numerically unstable (inf * 0 -> NaN) for very small std.
+    # See Connecteam fork fix (ct-0.9.1).
+    # mask = std > 0
+    with np.errstate(divide="ignore", invalid="ignore"):
+        mask = np.abs((y_opt - xi - mu) / std) < 35.0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     values[mask] = norm.cdf(scaled)
@@ -308,7 +313,12 @@ def gaussian_ei(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    mask = std > 0
+    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
+    # can become numerically unstable (inf * 0 -> NaN) for very small std.
+    # See Connecteam fork fix (ct-0.9.1).
+    # mask = std > 0
+    with np.errstate(divide="ignore", invalid="ignore"):
+        mask = np.abs((y_opt - xi - mu) / std) < 35.0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     cdf = norm.cdf(scaled)
diff --git a/skopt/tests/test_acquisition.py b/skopt/tests/test_acquisition.py
index 4902d53..42ca727 100644
--- a/skopt/tests/test_acquisition.py
+++ b/skopt/tests/test_acquisition.py
@@ -23,6 +23,27 @@ def predict(self, X, return_std=True):
         return np.zeros(X.shape[0]), np.ones(X.shape[0])
 
 
+class TinyStdSurrogateWithGrad:
+    """Minimal surrogate to exercise EI/PI numerical stability.
+
+    For extremely small but positive std, the EI/PI gradient expressions can
+    overflow/underflow and produce NaNs (inf * 0). We assert gradients remain
+    finite.
+    """
+
+    def predict(self, X, return_std=True, return_mean_grad=False, return_std_grad=False):
+        X = np.asarray(X)
+        mu = np.zeros(X.shape[0])
+        std = np.full(X.shape[0], 1e-160)
+
+        if return_mean_grad and return_std_grad:
+            mu_grad = np.zeros((X.shape[0], X.shape[1]))
+            std_grad = np.ones((X.shape[0], X.shape[1]))
+            return mu, std, mu_grad, std_grad
+
+        return mu, std
+
+
 # This is used to test that given constant acquisition values at
 # different points, acquisition functions "EIps" and "PIps"
 # prefer candidate points that take lesser time.
@@ -94,6 +115,18 @@ def test_acquisition_api():
         assert_raises(ValueError, method, rng.rand(10), gpr)
 
 
+@pytest.mark.fast_test
+@pytest.mark.parametrize("acq", [gaussian_pi, gaussian_ei])
+def test_ei_pi_gradients_are_finite_for_tiny_positive_std(acq):
+    X = np.zeros((1, 1))
+    model = TinyStdSurrogateWithGrad()
+
+    values, grad = acq(X, model, y_opt=1.0, xi=0.0, return_grad=True)
+
+    assert np.isfinite(values).all()
+    assert np.isfinite(grad).all()
+
+
 def check_gradient_correctness(X_new, model, acq_func, y_opt):
     def num_grad_func(x):
         return gaussian_acquisition_1D(x, model, y_opt, acq_func=acq_func)[0]

From 44e84fb34f254c238b9d65ed0e5c3e51739c914e Mon Sep 17 00:00:00 2001
From: Alex Rosen
Date: Tue, 10 Feb 2026 14:16:52 +0200
Subject: [PATCH 2/2] Keep EI/PI values and zero gradients in extreme tails

Preserve acquisition values for std>0 while returning a zero gradient
when |scaled| is extreme, avoiding NaNs from inf*0 in analytic
gradients.

Co-authored-by: Cursor
---
 skopt/acquisition.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/skopt/acquisition.py b/skopt/acquisition.py
index 9378be5..544591d 100644
--- a/skopt/acquisition.py
+++ b/skopt/acquisition.py
@@ -225,12 +225,7 @@ def gaussian_pi(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
-    # can become numerically unstable (inf * 0 -> NaN) for very small std.
-    # See Connecteam fork fix (ct-0.9.1).
-    # mask = std > 0
-    with np.errstate(divide="ignore", invalid="ignore"):
-        mask = np.abs((y_opt - xi - mu) / std) < 35.0
+    mask = std > 0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     values[mask] = norm.cdf(scaled)
@@ -239,6 +234,12 @@ def gaussian_pi(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         if not np.all(mask):
             return values, np.zeros_like(std_grad)
 
+        # In extreme tails, norm.pdf(scaled) underflows to 0 while improve_grad
+        # can overflow, leading to NaNs via inf * 0. In this regime, the true
+        # gradient tends to 0, so we return a zero gradient.
+        if np.any(np.abs(scaled) > 35.0):
+            return values, np.zeros_like(std_grad)
+
         # Substitute (y_opt - xi - mu) / sigma = t and apply chain rule.
         # improve_grad is the gradient of t wrt x.
         improve_grad = -mu_grad * std - std_grad * improve
@@ -313,12 +314,7 @@ def gaussian_ei(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         )
 
     values = np.zeros_like(mu)
-    # NOTE: We avoid extreme tails of the Gaussian where the analytic gradient
-    # can become numerically unstable (inf * 0 -> NaN) for very small std.
-    # See Connecteam fork fix (ct-0.9.1).
-    # mask = std > 0
-    with np.errstate(divide="ignore", invalid="ignore"):
-        mask = np.abs((y_opt - xi - mu) / std) < 35.0
+    mask = std > 0
     improve = y_opt - xi - mu[mask]
     scaled = improve / std[mask]
     cdf = norm.cdf(scaled)
@@ -331,6 +327,12 @@ def gaussian_ei(X, model, y_opt=0.0, xi=0.01, return_grad=False):
         if not np.all(mask):
             return values, np.zeros_like(std_grad)
 
+        # In extreme tails, norm.pdf(scaled) underflows to 0 while improve_grad
+        # can overflow, leading to NaNs via inf * 0. In this regime, the true
+        # gradient tends to 0, so we return a zero gradient.
+        if np.any(np.abs(scaled) > 35.0):
+            return values, np.zeros_like(std_grad)
+
         # Substitute (y_opt - xi - mu) / sigma = t and apply chain rule.
         # improve_grad is the gradient of t wrt x.
         improve_grad = -mu_grad * std - std_grad * improve