From d5807568fbc9a53c6cc069b5cc2c3067befc3c31 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Thu, 29 Jan 2026 21:37:34 +0530 Subject: [PATCH 01/21] refactor(evaluators)!: reorganize evaluators into flat folder structure - Restructure evaluators into peer directories (regex, list, json, sql, galileo_luna2) - Split each evaluator into config.py and evaluator.py - Move Evaluator, EvaluatorMetadata, registry from models to evaluators package - Rename luna2 to galileo_luna2 following provider_evaluatorname convention - Move discovery and factory from engine to evaluators package - Update engine to delegate to evaluators package - Organize tests to mirror source structure (tests/json/, tests/sql/) - Fix SDK __all__: remove duplicate "control", remove non-existent tool exports - Update documentation with correct import paths - Remove stale TODO comments and add docstrings to empty __init__.py BREAKING CHANGE: Evaluator, EvaluatorMetadata, register_evaluator now imported from agent_control_evaluators instead of agent_control_models --- docs/OVERVIEW.md | 3 +- docs/REFERENCE.md | 8 +- engine/src/agent_control_engine/__init__.py | 6 +- engine/src/agent_control_engine/core.py | 2 +- engine/tests/conftest.py | 5 +- engine/tests/test_core.py | 18 +- engine/tests/test_discovery.py | 22 +- engine/tests/test_evaluator_integrations.py | 25 +- engine/tests/test_evaluators.py | 49 +- evaluators/README.md | 2 +- evaluators/pyproject.toml | 10 +- .../src/agent_control_evaluators/__init__.py | 37 +- .../src/agent_control_evaluators/_base.py | 115 ++-- .../agent_control_evaluators/_discovery.py | 8 +- .../src/agent_control_evaluators/_factory.py | 32 +- .../src/agent_control_evaluators/_registry.py | 87 +++ .../builtin/__init__.py | 11 - .../{luna2 => galileo_luna2}/__init__.py | 0 .../{luna2 => galileo_luna2}/client.py | 0 .../{luna2 => galileo_luna2}/config.py | 6 +- .../{luna2 => galileo_luna2}/evaluator.py | 13 +- .../agent_control_evaluators/json/__init__.py | 6 + 
.../agent_control_evaluators/json/config.py | 237 +++++++ .../{builtin/json.py => json/evaluator.py} | 51 +- .../agent_control_evaluators/list/__init__.py | 6 + .../agent_control_evaluators/list/config.py | 26 + .../{builtin/list.py => list/evaluator.py} | 12 +- .../regex/__init__.py | 6 + .../agent_control_evaluators/regex/config.py | 23 + .../{builtin/regex.py => regex/evaluator.py} | 12 +- .../agent_control_evaluators/sql/__init__.py | 6 + .../agent_control_evaluators/sql/config.py | 187 ++++++ .../{builtin/sql.py => sql/evaluator.py} | 30 +- evaluators/tests/json/__init__.py | 0 evaluators/tests/{ => json}/test_json.py | 3 +- evaluators/tests/sql/__init__.py | 0 evaluators/tests/{ => sql}/test_sql.py | 4 +- evaluators/tests/test_base.py | 3 +- examples/galileo/luna2_demo.py | 2 +- models/src/agent_control_models/__init__.py | 30 +- models/src/agent_control_models/controls.py | 597 +----------------- sdks/python/src/agent_control/__init__.py | 51 +- .../src/agent_control/evaluators/__init__.py | 4 +- .../src/agent_control/evaluators/base.py | 6 +- sdks/python/tests/test_evaluators.py | 8 +- sdks/python/tests/test_luna2_evaluator.py | 153 ++--- server/src/agent_control_server/config.py | 2 - .../endpoints/__init__.py | 1 + 48 files changed, 921 insertions(+), 1004 deletions(-) rename models/src/agent_control_models/evaluator.py => evaluators/src/agent_control_evaluators/_base.py (64%) rename engine/src/agent_control_engine/discovery.py => evaluators/src/agent_control_evaluators/_discovery.py (95%) rename engine/src/agent_control_engine/evaluators.py => evaluators/src/agent_control_evaluators/_factory.py (72%) create mode 100644 evaluators/src/agent_control_evaluators/_registry.py delete mode 100644 evaluators/src/agent_control_evaluators/builtin/__init__.py rename evaluators/src/agent_control_evaluators/{luna2 => galileo_luna2}/__init__.py (100%) rename evaluators/src/agent_control_evaluators/{luna2 => galileo_luna2}/client.py (100%) rename 
evaluators/src/agent_control_evaluators/{luna2 => galileo_luna2}/config.py (96%) rename evaluators/src/agent_control_evaluators/{luna2 => galileo_luna2}/evaluator.py (96%) create mode 100644 evaluators/src/agent_control_evaluators/json/__init__.py create mode 100644 evaluators/src/agent_control_evaluators/json/config.py rename evaluators/src/agent_control_evaluators/{builtin/json.py => json/evaluator.py} (91%) create mode 100644 evaluators/src/agent_control_evaluators/list/__init__.py create mode 100644 evaluators/src/agent_control_evaluators/list/config.py rename evaluators/src/agent_control_evaluators/{builtin/list.py => list/evaluator.py} (94%) create mode 100644 evaluators/src/agent_control_evaluators/regex/__init__.py create mode 100644 evaluators/src/agent_control_evaluators/regex/config.py rename evaluators/src/agent_control_evaluators/{builtin/regex.py => regex/evaluator.py} (89%) create mode 100644 evaluators/src/agent_control_evaluators/sql/__init__.py create mode 100644 evaluators/src/agent_control_evaluators/sql/config.py rename evaluators/src/agent_control_evaluators/{builtin/sql.py => sql/evaluator.py} (98%) create mode 100644 evaluators/tests/json/__init__.py rename evaluators/tests/{ => json}/test_json.py (99%) create mode 100644 evaluators/tests/sql/__init__.py rename evaluators/tests/{ => sql}/test_sql.py (99%) diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index a7c64929..06f1b836 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -435,7 +435,8 @@ Every evaluator implements the `Evaluator` base class: ```python from typing import Any from pydantic import BaseModel -from agent_control_models import EvaluatorResult, Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator class MyEvaluatorConfig(BaseModel): diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 598769bd..282ca4c3 100644 --- a/docs/REFERENCE.md 
+++ b/docs/REFERENCE.md @@ -483,12 +483,8 @@ You can create custom evaluators to extend Agent Control with your own detection ```python from typing import Any from pydantic import BaseModel -from agent_control_models import ( - EvaluatorResult, - Evaluator, - EvaluatorMetadata, - register_evaluator, -) +from agent_control_models import EvaluatorResult +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator class MyEvaluatorConfig(BaseModel): diff --git a/engine/src/agent_control_engine/__init__.py b/engine/src/agent_control_engine/__init__.py index 08b8b9b2..b70e2307 100644 --- a/engine/src/agent_control_engine/__init__.py +++ b/engine/src/agent_control_engine/__init__.py @@ -1,8 +1,10 @@ """Agent Control Engine - Rule execution logic and evaluator system.""" -from .discovery import ( +from agent_control_evaluators import ( + clear_evaluator_cache, discover_evaluators, ensure_evaluators_discovered, + get_evaluator_instance, list_evaluators, reset_evaluator_discovery, ) @@ -10,8 +12,10 @@ __version__ = "0.1.0" __all__ = [ + "clear_evaluator_cache", "discover_evaluators", "ensure_evaluators_discovered", + "get_evaluator_instance", "list_evaluators", "reset_evaluator_discovery", ] diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index 8de40680..f3c2ca23 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -12,6 +12,7 @@ from typing import Any, Literal, Protocol import re2 +from agent_control_evaluators import get_evaluator_instance from agent_control_models import ( ControlDefinition, ControlMatch, @@ -20,7 +21,6 @@ EvaluatorResult, ) -from .evaluators import get_evaluator_instance from .selectors import select_data logger = logging.getLogger(__name__) diff --git a/engine/tests/conftest.py b/engine/tests/conftest.py index 5cd71b1d..0c669091 100644 --- a/engine/tests/conftest.py +++ b/engine/tests/conftest.py @@ -2,9 +2,8 @@ import pytest -from 
agent_control_engine.discovery import reset_evaluator_discovery -from agent_control_engine.evaluators import clear_evaluator_cache -from agent_control_models import clear_evaluators +from agent_control_engine import clear_evaluator_cache, reset_evaluator_discovery +from agent_control_evaluators import clear_evaluators @pytest.fixture(autouse=True) diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py index b470d421..dad8fcbe 100644 --- a/engine/tests/test_core.py +++ b/engine/tests/test_core.py @@ -11,17 +11,15 @@ from typing import Any import pytest +from agent_control_engine import clear_evaluator_cache from agent_control_engine.core import ControlEngine, _compile_regex -from agent_control_engine.evaluators import clear_evaluator_cache +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator from agent_control_models import ( ControlDefinition, EvaluationRequest, - Evaluator, - EvaluatorConfig, - EvaluatorMetadata, EvaluatorResult, + EvaluatorSpec, Step, - register_evaluator, ) from pydantic import BaseModel @@ -208,7 +206,7 @@ def make_control( execution=execution, scope=scope, selector=selector or {"path": "*"}, - evaluator=EvaluatorConfig( + evaluator=EvaluatorSpec( name=evaluator, config={"value": config_value}, ), @@ -1057,7 +1055,7 @@ def test_invalid_step_name_regex_rejected(self): execution="server", scope={"step_types": ["tool"], "stages": ["pre"], "step_name_regex": "("}, selector={"path": "input"}, - evaluator=EvaluatorConfig(name="test-allow", config={"value": "x"}), + evaluator=EvaluatorSpec(name="test-allow", config={"value": "x"}), action={"decision": "log"}, ) @@ -1094,7 +1092,7 @@ async def test_evaluator_timeout_is_enforced(self): execution="server", scope={"step_types": ["llm"], "stages": ["pre"]}, selector={"path": "input"}, - evaluator=EvaluatorConfig( + evaluator=EvaluatorSpec( name="test-timeout", config={"value": "t1", "timeout_ms": 100}, ), @@ -1152,7 +1150,7 @@ async def 
test_timeout_does_not_affect_fast_evaluators(self): execution="server", scope={"step_types": ["llm"], "stages": ["pre"]}, selector={"path": "input"}, - evaluator=EvaluatorConfig( + evaluator=EvaluatorSpec( name="test-timeout", config={"value": "slow", "timeout_ms": 100}, ), @@ -1298,7 +1296,7 @@ def make_control_with_execution( execution=execution, scope=scope, selector={"path": path}, - evaluator=EvaluatorConfig( + evaluator=EvaluatorSpec( name=evaluator, config={"value": config_value}, ), diff --git a/engine/tests/test_discovery.py b/engine/tests/test_discovery.py index 6e0af44f..920aa366 100644 --- a/engine/tests/test_discovery.py +++ b/engine/tests/test_discovery.py @@ -5,16 +5,20 @@ from pydantic import BaseModel -from agent_control_engine import discover_evaluators, ensure_evaluators_discovered, list_evaluators -from agent_control_engine.discovery import reset_evaluator_discovery -from agent_control_models import ( +from agent_control_engine import ( + discover_evaluators, + ensure_evaluators_discovered, + list_evaluators, + reset_evaluator_discovery, +) +from agent_control_evaluators import ( Evaluator, EvaluatorMetadata, - EvaluatorResult, clear_evaluators, get_evaluator, register_evaluator, ) +from agent_control_models import EvaluatorResult class TestDiscoverEvaluators: @@ -28,7 +32,7 @@ def test_discover_evaluators_loads_builtins(self) -> None: assert "regex" in evaluators assert "list" in evaluators - @patch("agent_control_engine.discovery.entry_points") + @patch("agent_control_evaluators._discovery.entry_points") def test_discover_evaluators_loads_entry_points( self, mock_entry_points: MagicMock ) -> None: @@ -62,7 +66,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: # Count only includes entry-point registrations (not built-ins loaded via import) assert count >= 1 - @patch("agent_control_engine.discovery.entry_points") + @patch("agent_control_evaluators._discovery.entry_points") def test_discover_evaluators_handles_load_error( self, 
mock_entry_points: MagicMock ) -> None: @@ -97,7 +101,7 @@ def test_ensure_evaluators_discovered_triggers_discovery(self) -> None: assert "regex" in evaluators assert "list" in evaluators - def test_reset_discovery_allows_rediscovery(self) -> None: + def test_reset_evaluator_discovery_allows_rediscovery(self) -> None: """Test that reset_evaluator_discovery allows discovery to run again.""" discover_evaluators() evaluators1 = list_evaluators() @@ -112,7 +116,7 @@ def test_reset_discovery_allows_rediscovery(self) -> None: assert "regex" in evaluators2 assert "list" in evaluators2 - @patch("agent_control_engine.discovery.entry_points") + @patch("agent_control_evaluators._discovery.entry_points") def test_discover_evaluators_skips_unavailable( self, mock_entry_points: MagicMock ) -> None: @@ -148,7 +152,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: assert "unavailable-evaluator" not in evaluators assert count == 0 - @patch("agent_control_engine.discovery.entry_points") + @patch("agent_control_evaluators._discovery.entry_points") def test_discover_evaluators_registers_available( self, mock_entry_points: MagicMock ) -> None: diff --git a/engine/tests/test_evaluator_integrations.py b/engine/tests/test_evaluator_integrations.py index cdf3bc65..3bfadd9e 100644 --- a/engine/tests/test_evaluator_integrations.py +++ b/engine/tests/test_evaluator_integrations.py @@ -8,14 +8,9 @@ # Import to ensure built-in evaluators are registered import agent_control_evaluators # noqa: F401 import pytest -from agent_control_engine.evaluators import get_evaluator_instance -from agent_control_models import ( - Evaluator, - EvaluatorConfig, - EvaluatorMetadata, - EvaluatorResult, - register_evaluator, -) +from agent_control_engine import get_evaluator_instance +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult, EvaluatorSpec from pydantic import BaseModel @@ -91,7 +86,7 @@ def 
register_mock(self): async def test_evaluate_matched(self): """Test evaluation when threshold exceeded.""" # Given: Mock evaluator with threshold 0.5 - config = EvaluatorConfig(name="test-mock-evaluator", config={"threshold": 0.5}) + config = EvaluatorSpec(name="test-mock-evaluator", config={"threshold": 0.5}) evaluator = get_evaluator_instance(config) # When: Evaluating value above threshold @@ -107,7 +102,7 @@ async def test_evaluate_matched(self): async def test_evaluate_not_matched(self): """Test evaluation when below threshold.""" # Given: Mock evaluator with threshold 0.9 - config = EvaluatorConfig(name="test-mock-evaluator", config={"threshold": 0.9}) + config = EvaluatorSpec(name="test-mock-evaluator", config={"threshold": 0.9}) evaluator = get_evaluator_instance(config) # When: Evaluating value below threshold @@ -120,7 +115,7 @@ async def test_evaluate_not_matched(self): async def test_multiple_evaluations(self): """Test multiple evaluations with same evaluator.""" # Given: Mock evaluator with threshold 0.5 - config = EvaluatorConfig(name="test-mock-evaluator", config={"threshold": 0.5}) + config = EvaluatorSpec(name="test-mock-evaluator", config={"threshold": 0.5}) evaluator = get_evaluator_instance(config) # When: Evaluating multiple values @@ -193,7 +188,7 @@ async def test_regex_case_sensitive_by_default(self): Then: Only exact case matches """ # Given: Regex for "SECRET" without flags - config = EvaluatorConfig( + config = EvaluatorSpec( name="regex", config={"pattern": "SECRET"} ) @@ -219,7 +214,7 @@ async def test_regex_ignorecase_flag(self): Then: All cases match """ # Given: Regex for "SECRET" with IGNORECASE flag - config = EvaluatorConfig( + config = EvaluatorSpec( name="regex", config={"pattern": "SECRET", "flags": ["IGNORECASE"]} ) @@ -247,7 +242,7 @@ async def test_regex_short_i_flag(self): Then: All cases match """ # Given: Regex with short "I" flag - config = EvaluatorConfig( + config = EvaluatorSpec( name="regex", config={"pattern": 
"password", "flags": ["I"]} ) @@ -272,7 +267,7 @@ async def test_regex_ignorecase_lowercase_flag(self): Then: All cases match """ # Given: Regex with lowercase flag variant - config = EvaluatorConfig( + config = EvaluatorSpec( name="regex", config={"pattern": "admin", "flags": ["ignorecase"]} ) diff --git a/engine/tests/test_evaluators.py b/engine/tests/test_evaluators.py index 49bb5c61..019ab8db 100644 --- a/engine/tests/test_evaluators.py +++ b/engine/tests/test_evaluators.py @@ -1,16 +1,17 @@ """Tests for unified evaluator factory.""" import pytest -from agent_control_engine import list_evaluators -from agent_control_engine.evaluators import ( +from agent_control_engine import ( clear_evaluator_cache, get_evaluator_instance, + list_evaluators, ) -from agent_control_models import ( - EvaluatorConfig, +from agent_control_models import EvaluatorSpec +from agent_control_evaluators import ( + ListEvaluator, + RegexEvaluator, RegexEvaluatorConfig, ) -from agent_control_evaluators import ListEvaluator, RegexEvaluator class TestRegexEvaluator: @@ -20,7 +21,7 @@ class TestRegexEvaluator: async def test_basic_match(self): """Test regex matches SSN pattern.""" # Given: A regex evaluator with SSN pattern - config = EvaluatorConfig(name="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) + config = EvaluatorSpec(name="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) evaluator = get_evaluator_instance(config) # When: Evaluating text containing SSN @@ -34,7 +35,7 @@ async def test_basic_match(self): async def test_no_match(self): """Test regex doesn't match when pattern not found.""" # Given: A regex evaluator with SSN pattern - config = EvaluatorConfig(name="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) + config = EvaluatorSpec(name="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) evaluator = get_evaluator_instance(config) # When: Evaluating text without pattern @@ -48,7 +49,7 @@ async def test_no_match(self): async def test_non_string_input(self): """Test non-string 
input is converted to string.""" # Given: A regex evaluator - config = EvaluatorConfig(name="regex", config={"pattern": r"123"}) + config = EvaluatorSpec(name="regex", config={"pattern": r"123"}) evaluator = get_evaluator_instance(config) # When: Evaluating non-string input @@ -61,7 +62,7 @@ async def test_non_string_input(self): async def test_none_input(self): """Test handling of None input.""" # Given: A regex evaluator - config = EvaluatorConfig(name="regex", config={"pattern": r".*"}) + config = EvaluatorSpec(name="regex", config={"pattern": r".*"}) evaluator = get_evaluator_instance(config) # When: Evaluating None @@ -82,7 +83,7 @@ def test_invalid_regex_pattern(self): async def test_empty_pattern_matches_everything(self): """Test empty pattern matches everything.""" # Given: A regex evaluator with empty pattern - config = EvaluatorConfig(name="regex", config={"pattern": ""}) + config = EvaluatorSpec(name="regex", config={"pattern": ""}) evaluator = get_evaluator_instance(config) # When: Evaluating any text @@ -99,7 +100,7 @@ class TestListEvaluator: async def test_any_match(self): """Test list evaluator with any/match logic.""" # Given: A list evaluator with blocklist items - config = EvaluatorConfig( + config = EvaluatorSpec( name="list", config={"values": ["bad", "evil"], "logic": "any", "match_on": "match"}, ) @@ -114,7 +115,7 @@ async def test_any_match(self): async def test_any_no_match(self): """Test list evaluator as allowlist (any/no_match).""" # Given: A list evaluator as allowlist - config = EvaluatorConfig( + config = EvaluatorSpec( name="list", config={"values": ["safe", "ok"], "logic": "any", "match_on": "no_match"}, ) @@ -129,7 +130,7 @@ async def test_any_no_match(self): async def test_all_match(self): """Test list evaluator with all/match logic.""" # Given: A list evaluator with all/match logic - config = EvaluatorConfig( + config = EvaluatorSpec( name="list", config={"values": ["valid1", "valid2"], "logic": "all", "match_on": "match"}, ) @@ 
-144,7 +145,7 @@ async def test_all_match(self): async def test_case_insensitive(self): """Test case-insensitive matching.""" # Given: A case-insensitive list evaluator - config = EvaluatorConfig( + config = EvaluatorSpec( name="list", config={"values": ["MixedCase"], "case_sensitive": False, "match_on": "match"}, ) @@ -161,7 +162,7 @@ class TestGetEvaluatorInstance: def test_get_evaluator_instance_returns_correct_type(self): """Test factory returns correct evaluator type.""" # Given: An evaluator config - config = EvaluatorConfig(name="regex", config={"pattern": "abc"}) + config = EvaluatorSpec(name="regex", config={"pattern": "abc"}) # When: Getting evaluator evaluator = get_evaluator_instance(config) @@ -172,7 +173,7 @@ def test_get_evaluator_instance_returns_correct_type(self): def test_get_evaluator_instance_unknown_evaluator(self): """Test error when evaluator not found.""" # Given: Config for nonexistent evaluator - config = EvaluatorConfig(name="nonexistent", config={}) + config = EvaluatorSpec(name="nonexistent", config={}) # When/Then: Should raise ValueError with pytest.raises(ValueError, match="not found"): @@ -202,7 +203,7 @@ def teardown_method(self): def test_evaluator_cache_hit(self): """Test that same config returns same cached instance.""" # Given: An evaluator config - config = EvaluatorConfig(name="regex", config={"pattern": "test"}) + config = EvaluatorSpec(name="regex", config={"pattern": "test"}) # When: First call creates instance evaluator1 = get_evaluator_instance(config) @@ -215,8 +216,8 @@ def test_evaluator_cache_hit(self): def test_evaluator_cache_miss_different_config(self): """Test that different configs return different instances.""" # Given: Two different configs - config1 = EvaluatorConfig(name="regex", config={"pattern": "test1"}) - config2 = EvaluatorConfig(name="regex", config={"pattern": "test2"}) + config1 = EvaluatorSpec(name="regex", config={"pattern": "test1"}) + config2 = EvaluatorSpec(name="regex", config={"pattern": 
"test2"}) # When: Getting evaluators evaluator1 = get_evaluator_instance(config1) @@ -228,8 +229,8 @@ def test_evaluator_cache_miss_different_config(self): def test_evaluator_cache_miss_different_evaluator(self): """Test that same config but different evaluators return different instances.""" # Given: Two configs with different evaluators - config1 = EvaluatorConfig(name="regex", config={"pattern": "bad"}) - config2 = EvaluatorConfig(name="list", config={"values": ["bad"]}) + config1 = EvaluatorSpec(name="regex", config={"pattern": "bad"}) + config2 = EvaluatorSpec(name="list", config={"values": ["bad"]}) # When: Getting evaluators evaluator1 = get_evaluator_instance(config1) @@ -243,8 +244,8 @@ def test_evaluator_cache_miss_different_evaluator(self): def test_evaluator_cache_clear_all(self): """Test that clear_evaluator_cache clears all entries.""" # Given: Two cached evaluators - config1 = EvaluatorConfig(name="regex", config={"pattern": "test1"}) - config2 = EvaluatorConfig(name="list", config={"values": ["test"]}) + config1 = EvaluatorSpec(name="regex", config={"pattern": "test1"}) + config2 = EvaluatorSpec(name="list", config={"values": ["test"]}) evaluator1a = get_evaluator_instance(config1) evaluator2a = get_evaluator_instance(config2) @@ -270,7 +271,7 @@ def test_cache_size_is_clamped_to_minimum(self): When: Module is imported Then: The value should be at least 1 (MIN_CACHE_SIZE) """ - from agent_control_engine.evaluators import EVALUATOR_CACHE_SIZE, MIN_CACHE_SIZE + from agent_control_evaluators._factory import EVALUATOR_CACHE_SIZE, MIN_CACHE_SIZE assert EVALUATOR_CACHE_SIZE >= MIN_CACHE_SIZE assert MIN_CACHE_SIZE == 1 diff --git a/evaluators/README.md b/evaluators/README.md index aa806cef..3a0d8db0 100644 --- a/evaluators/README.md +++ b/evaluators/README.md @@ -11,7 +11,7 @@ Evaluator implementations for agent-control. 
## Optional Evaluators -- **luna2** - Galileo Luna-2 integration (requires `luna2` extra) +- **galileo-luna2** - Galileo Luna-2 integration (requires `luna2` extra) ## Installation diff --git a/evaluators/pyproject.toml b/evaluators/pyproject.toml index fde84fa0..4e606f57 100644 --- a/evaluators/pyproject.toml +++ b/evaluators/pyproject.toml @@ -20,11 +20,11 @@ all = ["httpx>=0.24.0"] dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] [project.entry-points."agent_control.evaluators"] -regex = "agent_control_evaluators.builtin.regex:RegexEvaluator" -list = "agent_control_evaluators.builtin.list:ListEvaluator" -json = "agent_control_evaluators.builtin.json:JSONEvaluator" -sql = "agent_control_evaluators.builtin.sql:SQLEvaluator" -luna2 = "agent_control_evaluators.luna2.evaluator:Luna2Evaluator" +regex = "agent_control_evaluators.regex:RegexEvaluator" +list = "agent_control_evaluators.list:ListEvaluator" +json = "agent_control_evaluators.json:JSONEvaluator" +sql = "agent_control_evaluators.sql:SQLEvaluator" +luna2 = "agent_control_evaluators.galileo_luna2:Luna2Evaluator" [build-system] requires = ["hatchling"] diff --git a/evaluators/src/agent_control_evaluators/__init__.py b/evaluators/src/agent_control_evaluators/__init__.py index ef9ba126..32515ac8 100644 --- a/evaluators/src/agent_control_evaluators/__init__.py +++ b/evaluators/src/agent_control_evaluators/__init__.py @@ -14,19 +14,52 @@ Their schemas are registered via initAgent for validation purposes. 
""" -from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator +# Core infrastructure - export from _base and _registry +from agent_control_evaluators._base import Evaluator, EvaluatorConfig, EvaluatorMetadata +from agent_control_evaluators._discovery import ( + discover_evaluators, + ensure_evaluators_discovered, + list_evaluators, + reset_evaluator_discovery, +) +from agent_control_evaluators._factory import clear_evaluator_cache, get_evaluator_instance +from agent_control_evaluators._registry import ( + clear_evaluators, + get_all_evaluators, + get_evaluator, + register_evaluator, +) # Import built-in evaluators to auto-register them -from .builtin import JSONEvaluator, ListEvaluator, RegexEvaluator, SQLEvaluator +from agent_control_evaluators.json import JSONEvaluator, JSONEvaluatorConfig +from agent_control_evaluators.list import ListEvaluator, ListEvaluatorConfig +from agent_control_evaluators.regex import RegexEvaluator, RegexEvaluatorConfig +from agent_control_evaluators.sql import SQLEvaluator, SQLEvaluatorConfig __version__ = "0.1.0" __all__ = [ + # Core infrastructure "Evaluator", + "EvaluatorConfig", "EvaluatorMetadata", "register_evaluator", + "get_evaluator", + "get_all_evaluators", + "clear_evaluators", + "discover_evaluators", + "ensure_evaluators_discovered", + "reset_evaluator_discovery", + "list_evaluators", + "get_evaluator_instance", + "clear_evaluator_cache", + # Built-in evaluators "RegexEvaluator", + "RegexEvaluatorConfig", "ListEvaluator", + "ListEvaluatorConfig", "JSONEvaluator", + "JSONEvaluatorConfig", "SQLEvaluator", + "SQLEvaluatorConfig", ] diff --git a/models/src/agent_control_models/evaluator.py b/evaluators/src/agent_control_evaluators/_base.py similarity index 64% rename from models/src/agent_control_models/evaluator.py rename to evaluators/src/agent_control_evaluators/_base.py index f1234c60..3fb53554 100644 --- a/models/src/agent_control_models/evaluator.py +++ 
b/evaluators/src/agent_control_evaluators/_base.py @@ -1,4 +1,4 @@ -"""Evaluator system base classes and registry.""" +"""Evaluator base classes and metadata.""" from __future__ import annotations @@ -7,16 +7,36 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict -from .controls import EvaluatorResult +from agent_control_models import EvaluatorResult if TYPE_CHECKING: from typing import Self logger = logging.getLogger(__name__) -ConfigT = TypeVar("ConfigT", bound=BaseModel) + +class EvaluatorConfig(BaseModel): + """Base class for typed evaluator configurations. + + All evaluator config classes should extend this to ensure consistent + behavior and enable type checking. + + Example: + ```python + from agent_control_evaluators import EvaluatorConfig + + class MyEvaluatorConfig(EvaluatorConfig): + pattern: str + threshold: float = 0.5 + ``` + """ + + model_config = ConfigDict(extra="forbid") + + +ConfigT = TypeVar("ConfigT", bound=EvaluatorConfig) @dataclass @@ -24,7 +44,7 @@ class EvaluatorMetadata: """Metadata about an evaluator. Attributes: - name: Unique evaluator name (e.g., "regex", "galileo-luna2") + name: Unique evaluator name (e.g., "regex", "galileo_luna2") version: Evaluator version string description: Human-readable description requires_api_key: Whether the evaluator requires an API key @@ -38,7 +58,7 @@ class EvaluatorMetadata: timeout_ms: int = 10000 -class Evaluator(ABC, Generic[ConfigT]): # noqa: UP046 +class Evaluator(ABC, Generic[ConfigT]): """Base class for all evaluators (built-in, external, or custom). 
All evaluators follow the same pattern: @@ -74,9 +94,15 @@ async def evaluate(self, data): Example: ```python - from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator - - class MyConfig(BaseModel): + from agent_control_evaluators import ( + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + register_evaluator, + ) + from agent_control_models import EvaluatorResult + + class MyConfig(EvaluatorConfig): threshold: float = 0.5 @register_evaluator @@ -98,7 +124,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: """ metadata: ClassVar[EvaluatorMetadata] - config_model: ClassVar[type[BaseModel]] + config_model: ClassVar[type[EvaluatorConfig]] def __init__(self, config: ConfigT) -> None: """Initialize evaluator with validated config. @@ -151,72 +177,3 @@ def is_available(cls) -> bool: True if evaluator can be used, False otherwise """ return True - - -# ============================================================================= -# Evaluator Registry -# ============================================================================= - -_EVALUATOR_REGISTRY: dict[str, type[Evaluator[Any]]] = {} - - -def register_evaluator( - evaluator_class: type[Evaluator[Any]], -) -> type[Evaluator[Any]]: - """Register an evaluator class by its metadata name. - - Can be used as a decorator or called directly. Respects the evaluator's - is_available() method - evaluators with unavailable dependencies are - silently skipped. 
- - Args: - evaluator_class: Evaluator class to register - - Returns: - The same evaluator class (for decorator usage) - - Raises: - ValueError: If evaluator name already registered - """ - name = evaluator_class.metadata.name - - # Check if evaluator dependencies are satisfied - if not evaluator_class.is_available(): - logger.debug(f"Evaluator '{name}' not available (is_available=False), skipping") - return evaluator_class - - if name in _EVALUATOR_REGISTRY: - # Allow re-registration of same class (e.g., during hot reload) - if _EVALUATOR_REGISTRY[name] is evaluator_class: - return evaluator_class - raise ValueError(f"Evaluator '{name}' is already registered") - - _EVALUATOR_REGISTRY[name] = evaluator_class - logger.debug(f"Registered evaluator: {name} v{evaluator_class.metadata.version}") - return evaluator_class - - -def get_evaluator(name: str) -> type[Evaluator[Any]] | None: - """Get a registered evaluator by name. - - Args: - name: Evaluator name to look up - - Returns: - Evaluator class if found, None otherwise - """ - return _EVALUATOR_REGISTRY.get(name) - - -def get_all_evaluators() -> dict[str, type[Evaluator[Any]]]: - """Get all registered evaluators. - - Returns: - Dictionary mapping evaluator names to evaluator classes - """ - return dict(_EVALUATOR_REGISTRY) - - -def clear_evaluators() -> None: - """Clear all registered evaluators. 
Useful for testing.""" - _EVALUATOR_REGISTRY.clear() diff --git a/engine/src/agent_control_engine/discovery.py b/evaluators/src/agent_control_evaluators/_discovery.py similarity index 95% rename from engine/src/agent_control_engine/discovery.py rename to evaluators/src/agent_control_evaluators/_discovery.py index dd4563f4..a733c2b4 100644 --- a/engine/src/agent_control_engine/discovery.py +++ b/evaluators/src/agent_control_evaluators/_discovery.py @@ -5,15 +5,17 @@ import logging import threading from importlib.metadata import entry_points -from typing import Any +from typing import TYPE_CHECKING, Any -from agent_control_models import ( - Evaluator, +from agent_control_evaluators._registry import ( get_all_evaluators, get_evaluator, register_evaluator, ) +if TYPE_CHECKING: + from agent_control_evaluators._base import Evaluator + logger = logging.getLogger(__name__) _DISCOVERY_COMPLETE = False diff --git a/engine/src/agent_control_engine/evaluators.py b/evaluators/src/agent_control_evaluators/_factory.py similarity index 72% rename from engine/src/agent_control_engine/evaluators.py rename to evaluators/src/agent_control_evaluators/_factory.py index c9c43717..94af326d 100644 --- a/engine/src/agent_control_engine/evaluators.py +++ b/evaluators/src/agent_control_evaluators/_factory.py @@ -1,14 +1,18 @@ -"""Unified evaluator factory using evaluator registry with caching.""" +"""Evaluator factory with instance caching.""" + +from __future__ import annotations import json import logging import os from collections import OrderedDict -from typing import Any +from typing import TYPE_CHECKING, Any -from agent_control_models import Evaluator, EvaluatorConfig +from agent_control_evaluators._discovery import list_evaluators -from .discovery import list_evaluators +if TYPE_CHECKING: + from agent_control_evaluators._base import Evaluator + from agent_control_models import EvaluatorSpec logger = logging.getLogger(__name__) @@ -42,8 +46,8 @@ def _config_hash(config: dict[str, Any]) 
-> str: return json.dumps(config, sort_keys=True, default=str) -def get_evaluator_instance(evaluator_config: EvaluatorConfig) -> Evaluator[Any]: - """Get or create a cached evaluator instance from configuration. +def get_evaluator_instance(evaluator_spec: EvaluatorSpec) -> Evaluator[Any]: + """Get or create a cached evaluator instance from specification. Uses LRU caching to reuse evaluator instances with the same config. Cache key is: {evaluator_name}:{config_hash} @@ -54,7 +58,7 @@ def get_evaluator_instance(evaluator_config: EvaluatorConfig) -> Evaluator[Any]: docstring for details on safe patterns. Args: - evaluator_config: The evaluator configuration with evaluator name and config + evaluator_spec: The evaluator specification with name and config Returns: Evaluator instance (cached or new) @@ -63,27 +67,27 @@ def get_evaluator_instance(evaluator_config: EvaluatorConfig) -> Evaluator[Any]: ValueError: If evaluator not found """ # Build cache key - cache_key = f"{evaluator_config.name}:{_config_hash(evaluator_config.config)}" + cache_key = f"{evaluator_spec.name}:{_config_hash(evaluator_spec.config)}" # Check cache if cache_key in _EVALUATOR_CACHE: # Move to end (most recently used) _EVALUATOR_CACHE.move_to_end(cache_key) - logger.debug(f"Cache hit for evaluator: {evaluator_config.name}") + logger.debug(f"Cache hit for evaluator: {evaluator_spec.name}") return _EVALUATOR_CACHE[cache_key] # Cache miss - create new instance evaluators = list_evaluators() - evaluator_cls = evaluators.get(evaluator_config.name) + evaluator_cls = evaluators.get(evaluator_spec.name) if evaluator_cls is None: raise ValueError( - f"Evaluator '{evaluator_config.name}' not found. " + f"Evaluator '{evaluator_spec.name}' not found. 
" f"Available evaluators: {', '.join(evaluators.keys())}" ) - logger.debug(f"Cache miss, creating evaluator: {evaluator_config.name}") - instance = evaluator_cls.from_dict(evaluator_config.config) + logger.debug(f"Cache miss, creating evaluator: {evaluator_spec.name}") + instance = evaluator_cls.from_dict(evaluator_spec.config) # Evict oldest if cache is full while len(_EVALUATOR_CACHE) >= EVALUATOR_CACHE_SIZE: @@ -98,5 +102,3 @@ def get_evaluator_instance(evaluator_config: EvaluatorConfig) -> Evaluator[Any]: def clear_evaluator_cache() -> None: """Clear all cached evaluator instances. Useful for testing.""" _EVALUATOR_CACHE.clear() - - diff --git a/evaluators/src/agent_control_evaluators/_registry.py b/evaluators/src/agent_control_evaluators/_registry.py new file mode 100644 index 00000000..bd93e63e --- /dev/null +++ b/evaluators/src/agent_control_evaluators/_registry.py @@ -0,0 +1,87 @@ +"""Evaluator registry for registration and lookup.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from agent_control_evaluators._base import Evaluator + +logger = logging.getLogger(__name__) + +# ============================================================================= +# Evaluator Registry +# ============================================================================= + +_EVALUATOR_REGISTRY: dict[str, type[Evaluator[Any]]] = {} + + +def register_evaluator( + evaluator_class: type[Evaluator[Any]], +) -> type[Evaluator[Any]]: + """Register an evaluator class by its metadata name. + + Can be used as a decorator or called directly. Respects the evaluator's + is_available() method - evaluators with unavailable dependencies are + silently skipped. 
+ + Args: + evaluator_class: Evaluator class to register + + Returns: + The same evaluator class (for decorator usage) + + Raises: + ValueError: If evaluator name already registered with different class + + Example: + ```python + @register_evaluator + class MyEvaluator(Evaluator[MyConfig]): + metadata = EvaluatorMetadata(name="my-evaluator", ...) + ... + ``` + """ + name = evaluator_class.metadata.name + + # Check if evaluator dependencies are satisfied + if not evaluator_class.is_available(): + logger.debug(f"Evaluator '{name}' not available (is_available=False), skipping") + return evaluator_class + + if name in _EVALUATOR_REGISTRY: + # Allow re-registration of same class (e.g., during hot reload) + if _EVALUATOR_REGISTRY[name] is evaluator_class: + return evaluator_class + raise ValueError(f"Evaluator '{name}' is already registered") + + _EVALUATOR_REGISTRY[name] = evaluator_class + logger.debug(f"Registered evaluator: {name} v{evaluator_class.metadata.version}") + return evaluator_class + + +def get_evaluator(name: str) -> type[Evaluator[Any]] | None: + """Get a registered evaluator by name. + + Args: + name: Evaluator name to look up + + Returns: + Evaluator class if found, None otherwise + """ + return _EVALUATOR_REGISTRY.get(name) + + +def get_all_evaluators() -> dict[str, type[Evaluator[Any]]]: + """Get all registered evaluators. + + Returns: + Dictionary mapping evaluator names to evaluator classes + """ + return dict(_EVALUATOR_REGISTRY) + + +def clear_evaluators() -> None: + """Clear all registered evaluators. Useful for testing.""" + _EVALUATOR_REGISTRY.clear() diff --git a/evaluators/src/agent_control_evaluators/builtin/__init__.py b/evaluators/src/agent_control_evaluators/builtin/__init__.py deleted file mode 100644 index 6b82f363..00000000 --- a/evaluators/src/agent_control_evaluators/builtin/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Built-in evaluators for agent-control. 
- -These evaluators are automatically registered when this module is imported. -""" - -from .json import JSONEvaluator -from .list import ListEvaluator -from .regex import RegexEvaluator -from .sql import SQLEvaluator - -__all__ = ["JSONEvaluator", "ListEvaluator", "RegexEvaluator", "SQLEvaluator"] diff --git a/evaluators/src/agent_control_evaluators/luna2/__init__.py b/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py similarity index 100% rename from evaluators/src/agent_control_evaluators/luna2/__init__.py rename to evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py diff --git a/evaluators/src/agent_control_evaluators/luna2/client.py b/evaluators/src/agent_control_evaluators/galileo_luna2/client.py similarity index 100% rename from evaluators/src/agent_control_evaluators/luna2/client.py rename to evaluators/src/agent_control_evaluators/galileo_luna2/client.py diff --git a/evaluators/src/agent_control_evaluators/luna2/config.py b/evaluators/src/agent_control_evaluators/galileo_luna2/config.py similarity index 96% rename from evaluators/src/agent_control_evaluators/luna2/config.py rename to evaluators/src/agent_control_evaluators/galileo_luna2/config.py index 44e4563a..52e76759 100644 --- a/evaluators/src/agent_control_evaluators/luna2/config.py +++ b/evaluators/src/agent_control_evaluators/galileo_luna2/config.py @@ -2,7 +2,9 @@ from typing import Any, Literal, Union -from pydantic import BaseModel, Field, model_validator +from pydantic import Field, model_validator + +from agent_control_evaluators._base import EvaluatorConfig # Supported Luna-2 metrics Luna2Metric = Literal[ @@ -20,7 +22,7 @@ Luna2Operator = Literal["gt", "lt", "gte", "lte", "eq", "contains", "any"] -class Luna2EvaluatorConfig(BaseModel): +class Luna2EvaluatorConfig(EvaluatorConfig): """Configuration for Luna-2 evaluator. 
Two stage types are supported: diff --git a/evaluators/src/agent_control_evaluators/luna2/evaluator.py b/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py similarity index 96% rename from evaluators/src/agent_control_evaluators/luna2/evaluator.py rename to evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py index 5efaf321..24e25266 100644 --- a/evaluators/src/agent_control_evaluators/luna2/evaluator.py +++ b/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py @@ -8,14 +8,11 @@ import os from typing import Any -from agent_control_models import ( - Evaluator, - EvaluatorMetadata, - EvaluatorResult, - register_evaluator, -) +from agent_control_models import EvaluatorResult -from .config import Luna2EvaluatorConfig +from agent_control_evaluators._base import Evaluator, EvaluatorMetadata +from agent_control_evaluators._registry import register_evaluator +from agent_control_evaluators.galileo_luna2.config import Luna2EvaluatorConfig logger = logging.getLogger(__name__) @@ -67,7 +64,7 @@ class Luna2Evaluator(Evaluator[Luna2EvaluatorConfig]): Example: ```python - from agent_control_evaluators.luna2 import Luna2Evaluator, Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator, Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", diff --git a/evaluators/src/agent_control_evaluators/json/__init__.py b/evaluators/src/agent_control_evaluators/json/__init__.py new file mode 100644 index 00000000..a24322f9 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/json/__init__.py @@ -0,0 +1,6 @@ +"""JSON validation evaluator.""" + +from agent_control_evaluators.json.config import JSONEvaluatorConfig +from agent_control_evaluators.json.evaluator import JSONEvaluator + +__all__ = ["JSONEvaluator", "JSONEvaluatorConfig"] diff --git a/evaluators/src/agent_control_evaluators/json/config.py b/evaluators/src/agent_control_evaluators/json/config.py new file mode 100644 index 
00000000..06e8b760 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/json/config.py @@ -0,0 +1,237 @@ +"""Configuration for JSON validation evaluator.""" + +from typing import Any, Literal + +import re2 +from pydantic import Field, field_validator, model_validator + +from agent_control_evaluators._base import EvaluatorConfig + + +class JSONEvaluatorConfig(EvaluatorConfig): + """Configuration for JSON validation evaluator. + + Multiple validation checks can be combined. Checks are evaluated in this order (fail-fast): + 1. JSON syntax/validity (always - ensures data is valid JSON) + 2. JSON Schema validation (if schema provided) - comprehensive structure validation + 3. Required fields check (if required_fields provided) - ensures critical fields exist + 4. Type checking (if field_types provided) - validates field types are correct + 5. Field constraints (if field_constraints provided) - validates ranges, enums, string length + 6. Pattern matching (if field_patterns provided) - validates field values match patterns + """ + + # Validation Options (all optional, can be combined) + json_schema: dict[str, Any] | None = Field( + default=None, description="JSON Schema specification (Draft 7 or later)" + ) + + required_fields: list[str] | None = Field( + default=None, + description="List of field paths that must be present (dot notation)", + ) + + field_types: dict[str, str] | None = Field( + default=None, + description=( + "Map of field paths to expected JSON types " + "(string, number, integer, boolean, array, object, null)" + ), + ) + + field_constraints: dict[str, dict[str, Any]] | None = Field( + default=None, + description="Field-level constraints: numeric ranges (min/max), enums, string length", + ) + + field_patterns: dict[str, str | dict[str, Any]] | None = Field( + default=None, + description=( + "Map of field paths to RE2 regex patterns. 
" + "Can be string (pattern only) or dict with 'pattern' and optional 'flags'" + ), + ) + + # Validation Behavior + allow_extra_fields: bool = Field( + default=True, + description="If False, fail if extra fields exist beyond those specified in field_types", + ) + + allow_null_required: bool = Field( + default=False, + description=( + "If True, required fields can be present but null. " + "If False, null is treated as missing" + ), + ) + + pattern_match_logic: Literal["all", "any"] = Field( + default="all", + description=( + "For field_patterns: 'all' requires all patterns to match, " + "'any' requires at least one" + ), + ) + + case_sensitive_enums: bool = Field( + default=True, + description="If False, enum value matching is case-insensitive", + ) + + # Error Handling + allow_invalid_json: bool = Field( + default=False, + description=( + "If True, treat invalid JSON as non-match and allow. " + "If False, block invalid JSON" + ), + ) + + @field_validator("json_schema") + @classmethod + def validate_json_schema(cls, v: dict[str, Any] | None) -> dict[str, Any] | None: + """Ensure the JSON schema itself is valid.""" + if v is None: + return v + from jsonschema import Draft7Validator + + Draft7Validator.check_schema(v) + return v + + @field_validator("field_types") + @classmethod + def validate_type_names(cls, v: dict[str, str] | None) -> dict[str, str] | None: + """Ensure type names are valid JSON types.""" + if v is None: + return v + valid_types = { + "string", + "number", + "integer", + "boolean", + "array", + "object", + "null", + } + for path, type_name in v.items(): + if type_name not in valid_types: + raise ValueError(f"Invalid type '{type_name}' for field '{path}'") + return v + + @field_validator("field_patterns") + @classmethod + def validate_patterns( + cls, v: dict[str, str | dict[str, Any]] | None + ) -> dict[str, str | dict[str, Any]] | None: + """Validate all regex patterns compile.""" + if v is None: + return v + + for path, pattern_config in 
v.items(): + # Support both string (simple) and dict (with flags) formats + if isinstance(pattern_config, str): + pattern = pattern_config + flags = None + elif isinstance(pattern_config, dict): + if "pattern" not in pattern_config: + raise ValueError( + f"Pattern config for field '{path}' must have 'pattern' key" + ) + pattern = pattern_config["pattern"] + flags = pattern_config.get("flags") + + # Validate flags if provided + if flags is not None: + if not isinstance(flags, list): + raise ValueError(f"Flags for field '{path}' must be a list") + valid_flags = {"IGNORECASE"} + for flag in flags: + if flag not in valid_flags: + raise ValueError( + f"Invalid flag '{flag}' for field '{path}'. " + f"Valid flags: {valid_flags}" + ) + else: + raise ValueError( + f"Pattern for field '{path}' must be string or dict" + ) + + # Validate pattern compiles + try: + re2.compile(pattern) + except re2.error as e: + raise ValueError(f"Invalid regex for field '{path}': {e}") from e + + return v + + @field_validator("field_constraints") + @classmethod + def validate_constraints( + cls, v: dict[str, dict[str, Any]] | None + ) -> dict[str, dict[str, Any]] | None: + """Validate constraint definitions.""" + if v is None: + return v + + for field_path, constraints in v.items(): + # Must have at least one constraint type + valid_keys = {"type", "min", "max", "enum", "min_length", "max_length"} + if not any(k in constraints for k in valid_keys): + raise ValueError( + f"Constraint for '{field_path}' must specify at least one constraint" + ) + + # Validate numeric constraints + if "min" in constraints or "max" in constraints: + if "type" in constraints and constraints["type"] not in ( + "number", + "integer", + ): + raise ValueError( + f"min/max constraints require type 'number' or 'integer' for '{field_path}'" + ) + + # Validate enum + if "enum" in constraints: + if ( + not isinstance(constraints["enum"], list) + or len(constraints["enum"]) == 0 + ): + raise ValueError( + f"enum constraint 
must be a non-empty list for '{field_path}'" + ) + + # Validate string length + if "min_length" in constraints or "max_length" in constraints: + if "min_length" in constraints and not isinstance( + constraints["min_length"], int + ): + raise ValueError( + f"min_length must be an integer for '{field_path}'" + ) + if "max_length" in constraints and not isinstance( + constraints["max_length"], int + ): + raise ValueError( + f"max_length must be an integer for '{field_path}'" + ) + + return v + + @model_validator(mode="after") + def validate_has_checks(self): + """Ensure at least one validation check is configured.""" + if not any( + [ + self.json_schema, + self.field_types, + self.required_fields, + self.field_constraints, + self.field_patterns, + ] + ): + raise ValueError( + "At least one validation check must be configured: " + "json_schema, field_types, required_fields, field_constraints, or field_patterns" + ) + return self diff --git a/evaluators/src/agent_control_evaluators/builtin/json.py b/evaluators/src/agent_control_evaluators/json/evaluator.py similarity index 91% rename from evaluators/src/agent_control_evaluators/builtin/json.py rename to evaluators/src/agent_control_evaluators/json/evaluator.py index 5e3eb15f..38600f37 100644 --- a/evaluators/src/agent_control_evaluators/builtin/json.py +++ b/evaluators/src/agent_control_evaluators/json/evaluator.py @@ -5,15 +5,13 @@ from typing import Any import re2 -from agent_control_models import ( - Evaluator, - EvaluatorMetadata, - EvaluatorResult, - JSONEvaluatorConfig, - register_evaluator, -) +from agent_control_models import EvaluatorResult from jsonschema import Draft7Validator +from agent_control_evaluators._base import Evaluator, EvaluatorMetadata +from agent_control_evaluators._registry import register_evaluator +from agent_control_evaluators.json.config import JSONEvaluatorConfig + @register_evaluator class JSONEvaluator(Evaluator[JSONEvaluatorConfig]): @@ -27,12 +25,6 @@ class 
JSONEvaluator(Evaluator[JSONEvaluatorConfig]): 5. Field constraints (if configured) - Validate ranges, enums, string length 6. Pattern matching (if configured) - Validate field value patterns - This order ensures: - - Fast failure on basic issues (invalid JSON, missing required fields) - - Type is validated before checking value constraints - - Clear error messages indicating which check failed - - Developers can easily understand and predict validation behavior - Example configs: # JSON Schema validation {"json_schema": {"type": "object", "required": ["id", "name"]}} @@ -46,9 +38,6 @@ class JSONEvaluator(Evaluator[JSONEvaluatorConfig]): # Field constraints - numeric ranges {"field_constraints": {"score": {"min": 0.0, "max": 1.0}}} - # Field constraints - enums - {"field_constraints": {"status": {"enum": ["active", "inactive"]}}} - # Pattern matching {"field_patterns": {"email": "^[a-z0-9._%+-]+@[a-z0-9.-]+\\\\.[a-z]+$"}} """ @@ -100,14 +89,6 @@ def __init__(self, config: JSONEvaluatorConfig) -> None: async def evaluate(self, data: Any) -> EvaluatorResult: """Evaluate JSON data against all configured validation checks. - Evaluation order (fail-fast from simple to complex): - 1. JSON syntax/validity - 2. JSON Schema (if configured) - 3. Required fields (if configured) - 4. Type checking (if configured) - 5. Field constraints (if configured) - 6. Pattern matching (if configured) - Note: Validation is offloaded to a thread executor to avoid blocking the event loop for large payloads, since all validation logic is synchronous. """ @@ -299,10 +280,7 @@ def _check_required(self, data: dict | list) -> EvaluatorResult | None: ) def _check_constraints(self, data: dict | list) -> EvaluatorResult | None: - """Validate field constraints (ranges, enums, string length). - - Returns error result or None. 
- """ + """Validate field constraints (ranges, enums, string length).""" if not isinstance(data, dict): return EvaluatorResult( matched=True, @@ -339,9 +317,7 @@ def _check_constraints(self, data: dict | list) -> EvaluatorResult | None: # Enum constraints if "enum" in constraints: - # Case-insensitive matching if configured if self.config.case_sensitive_enums: - # Case-sensitive (default behavior) if value not in constraints["enum"]: allowed = ", ".join(str(v) for v in constraints["enum"][:5]) errors.append( @@ -349,8 +325,6 @@ def _check_constraints(self, data: dict | list) -> EvaluatorResult | None: ) continue else: - # Case-insensitive matching - # Convert to lowercase for comparison (only for strings) if isinstance(value, str): value_lower = value.lower() enum_lower = [ @@ -365,7 +339,6 @@ def _check_constraints(self, data: dict | list) -> EvaluatorResult | None: ) continue else: - # Non-string values: exact match only if value not in constraints["enum"]: allowed = ", ".join(str(v) for v in constraints["enum"][:5]) errors.append( @@ -490,24 +463,14 @@ def _get_json_type(self, value: Any) -> str: def _get_all_paths( self, data: dict, prefix: str = "", leaves_only: bool = False ) -> set[str]: - """Recursively get all field paths in nested dict. - - Args: - data: The dictionary to traverse - prefix: Current path prefix for nested traversal - leaves_only: If True, only return paths to leaf values (non-dict values). - This avoids flagging parent containers as extra fields. 
- """ + """Recursively get all field paths in nested dict.""" paths = set() for key, value in data.items(): path = f"{prefix}.{key}" if prefix else key if isinstance(value, dict): - # Recurse into nested dicts paths.update(self._get_all_paths(value, path, leaves_only)) - # Only add container path if not leaves_only if not leaves_only: paths.add(path) else: - # Always add leaf paths (non-dict values) paths.add(path) return paths diff --git a/evaluators/src/agent_control_evaluators/list/__init__.py b/evaluators/src/agent_control_evaluators/list/__init__.py new file mode 100644 index 00000000..ff7ad17e --- /dev/null +++ b/evaluators/src/agent_control_evaluators/list/__init__.py @@ -0,0 +1,6 @@ +"""List evaluator for value matching.""" + +from agent_control_evaluators.list.config import ListEvaluatorConfig +from agent_control_evaluators.list.evaluator import ListEvaluator + +__all__ = ["ListEvaluator", "ListEvaluatorConfig"] diff --git a/evaluators/src/agent_control_evaluators/list/config.py b/evaluators/src/agent_control_evaluators/list/config.py new file mode 100644 index 00000000..3ba7efbf --- /dev/null +++ b/evaluators/src/agent_control_evaluators/list/config.py @@ -0,0 +1,26 @@ +"""Configuration for list evaluator.""" + +from typing import Literal + +from pydantic import Field + +from agent_control_evaluators._base import EvaluatorConfig + + +class ListEvaluatorConfig(EvaluatorConfig): + """Configuration for list evaluator.""" + + values: list[str | int | float] = Field( + ..., description="List of values to match against" + ) + logic: Literal["any", "all"] = Field( + "any", description="Matching logic: any item matches vs all items match" + ) + match_on: Literal["match", "no_match"] = Field( + "match", description="Trigger rule on match or no match" + ) + match_mode: Literal["exact", "contains"] = Field( + "exact", + description="'exact' for full string match, 'contains' for keyword/substring match", + ) + case_sensitive: bool = Field(False, description="Whether 
matching is case sensitive") diff --git a/evaluators/src/agent_control_evaluators/builtin/list.py b/evaluators/src/agent_control_evaluators/list/evaluator.py similarity index 94% rename from evaluators/src/agent_control_evaluators/builtin/list.py rename to evaluators/src/agent_control_evaluators/list/evaluator.py index 227e448f..7c0612ed 100644 --- a/evaluators/src/agent_control_evaluators/builtin/list.py +++ b/evaluators/src/agent_control_evaluators/list/evaluator.py @@ -4,13 +4,11 @@ from typing import Any import re2 -from agent_control_models import ( - Evaluator, - EvaluatorMetadata, - EvaluatorResult, - ListEvaluatorConfig, - register_evaluator, -) +from agent_control_models import EvaluatorResult + +from agent_control_evaluators._base import Evaluator, EvaluatorMetadata +from agent_control_evaluators._registry import register_evaluator +from agent_control_evaluators.list.config import ListEvaluatorConfig @register_evaluator diff --git a/evaluators/src/agent_control_evaluators/regex/__init__.py b/evaluators/src/agent_control_evaluators/regex/__init__.py new file mode 100644 index 00000000..8a03bcae --- /dev/null +++ b/evaluators/src/agent_control_evaluators/regex/__init__.py @@ -0,0 +1,6 @@ +"""Regex evaluator for pattern matching.""" + +from agent_control_evaluators.regex.config import RegexEvaluatorConfig +from agent_control_evaluators.regex.evaluator import RegexEvaluator + +__all__ = ["RegexEvaluator", "RegexEvaluatorConfig"] diff --git a/evaluators/src/agent_control_evaluators/regex/config.py b/evaluators/src/agent_control_evaluators/regex/config.py new file mode 100644 index 00000000..ed325096 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/regex/config.py @@ -0,0 +1,23 @@ +"""Configuration for regex evaluator.""" + +import re2 +from pydantic import Field, field_validator + +from agent_control_evaluators._base import EvaluatorConfig + + +class RegexEvaluatorConfig(EvaluatorConfig): + """Configuration for regex evaluator.""" + + pattern: str = 
Field(..., description="Regular expression pattern (RE2 syntax)") + flags: list[str] | None = Field(default=None, description="Regex flags (e.g., ['IGNORECASE'])") + + @field_validator("pattern") + @classmethod + def validate_pattern(cls, v: str) -> str: + """Validate that the pattern is a valid RE2 regex.""" + try: + re2.compile(v) + except re2.error as e: + raise ValueError(f"Invalid regex pattern: {e}") from e + return v diff --git a/evaluators/src/agent_control_evaluators/builtin/regex.py b/evaluators/src/agent_control_evaluators/regex/evaluator.py similarity index 89% rename from evaluators/src/agent_control_evaluators/builtin/regex.py rename to evaluators/src/agent_control_evaluators/regex/evaluator.py index 7c3d04ae..2348be0f 100644 --- a/evaluators/src/agent_control_evaluators/builtin/regex.py +++ b/evaluators/src/agent_control_evaluators/regex/evaluator.py @@ -3,13 +3,11 @@ from typing import Any import re2 -from agent_control_models import ( - Evaluator, - EvaluatorMetadata, - EvaluatorResult, - RegexEvaluatorConfig, - register_evaluator, -) +from agent_control_models import EvaluatorResult + +from agent_control_evaluators._base import Evaluator, EvaluatorMetadata +from agent_control_evaluators._registry import register_evaluator +from agent_control_evaluators.regex.config import RegexEvaluatorConfig @register_evaluator diff --git a/evaluators/src/agent_control_evaluators/sql/__init__.py b/evaluators/src/agent_control_evaluators/sql/__init__.py new file mode 100644 index 00000000..3f7402e2 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/sql/__init__.py @@ -0,0 +1,6 @@ +"""SQL validation evaluator.""" + +from agent_control_evaluators.sql.config import SQLEvaluatorConfig +from agent_control_evaluators.sql.evaluator import SQLEvaluator + +__all__ = ["SQLEvaluator", "SQLEvaluatorConfig"] diff --git a/evaluators/src/agent_control_evaluators/sql/config.py b/evaluators/src/agent_control_evaluators/sql/config.py new file mode 100644 index 
00000000..b6eb32c9 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/sql/config.py @@ -0,0 +1,187 @@ +"""Configuration for SQL validation evaluator.""" + +import warnings +from typing import Any, Literal + +from pydantic import Field, model_validator + +from agent_control_evaluators._base import EvaluatorConfig + + +class SQLEvaluatorConfig(EvaluatorConfig): + """Configuration for comprehensive SQL control evaluator. + + Validates SQL query strings using AST-based analysis via sqlglot. + Controls are evaluated in order: + syntax → multi-statement → operations → tables/schemas → columns → limits. + """ + + # Multi-Statement + allow_multi_statements: bool = Field( + default=True, + description=( + "Whether to allow multiple SQL statements in a single query. " + "Set to False to prevent queries like 'SELECT x; DROP TABLE y' " + "(SQL injection prevention)." + ), + ) + max_statements: int | None = Field( + default=None, + description=( + "Maximum number of statements allowed (e.g., 2 allows up to 2 statements). " + "Only applicable when allow_multi_statements=True." + ), + ) + + # Operations + blocked_operations: list[str] | None = Field( + default=None, + description=( + "SQL operations to block (e.g., ['DROP', 'DELETE', 'TRUNCATE']). " + "Cannot be used with allowed_operations." + ), + ) + allowed_operations: list[str] | None = Field( + default=None, + description=( + "SQL operations to allow (e.g., ['SELECT'] for read-only). " + "Cannot be used with blocked_operations." + ), + ) + block_ddl: bool = Field( + default=False, + description="Block all DDL operations (CREATE, ALTER, DROP, TRUNCATE, RENAME, COMMENT).", + ) + block_dcl: bool = Field( + default=False, + description="Block all DCL operations (GRANT, REVOKE).", + ) + + # Table/Schema Access + allowed_tables: list[str] | None = Field( + default=None, + description="Table names allowed (allowlist mode). 
Cannot be used with blocked_tables.", + ) + blocked_tables: list[str] | None = Field( + default=None, + description="Table names to block (blocklist mode). Cannot be used with allowed_tables.", + ) + allowed_schemas: list[str] | None = Field( + default=None, + description="Schema names allowed (allowlist mode). Cannot be used with blocked_schemas.", + ) + blocked_schemas: list[str] | None = Field( + default=None, + description="Schema names to block (blocklist mode). Cannot be used with allowed_schemas.", + ) + + # Column Presence + required_columns: list[str] | None = Field( + default=None, + description=( + "Columns that must be present in the query " + "(e.g., ['tenant_id'] for multi-tenant security)." + ), + ) + column_presence_logic: Literal["any", "all"] = Field( + default="any", + description="Matching logic for required_columns: 'any' or 'all'.", + ) + column_context: Literal["select", "where"] | None = Field( + default=None, + description="Where required columns must appear: 'select', 'where', or None (anywhere).", + ) + column_context_scope: Literal["top_level", "all"] = Field( + default="all", + description=( + "Scope for column_context checking. " + "'top_level': Only check top-level clause. " + "'all': Check all clauses including subqueries." 
+ ), + ) + + # Limits + require_limit: bool = Field( + default=False, + description="Require SELECT queries to have a LIMIT clause.", + ) + max_limit: int | None = Field( + default=None, + description="Maximum allowed LIMIT value.", + ) + max_result_window: int | None = Field( + default=None, + description="Maximum value of (LIMIT + OFFSET) for pagination control.", + ) + + # Options + case_sensitive: bool = Field( + default=False, + description="Whether table/column/schema name matching is case sensitive.", + ) + dialect: Literal["postgres", "mysql", "tsql", "oracle", "sqlite"] = Field( + default="postgres", + description="SQL dialect to use for parsing.", + ) + + # Query Complexity Limits + max_subquery_depth: int | None = Field( + default=None, + description="Maximum nesting depth for subqueries.", + ) + max_joins: int | None = Field( + default=None, + description="Maximum number of JOIN operations in a single query.", + ) + max_union_count: int | None = Field( + default=None, + description="Maximum number of UNION/INTERSECT/EXCEPT operations.", + ) + + @model_validator(mode="after") + def validate_config(self): + """Validate configuration constraints.""" + # Validate operation restrictions + if self.blocked_operations and self.allowed_operations: + raise ValueError( + "Cannot specify both blocked_operations and allowed_operations" + ) + + # Validate table restrictions + if self.allowed_tables and self.blocked_tables: + raise ValueError("Cannot specify both allowed_tables and blocked_tables") + + # Validate schema restrictions + if self.allowed_schemas and self.blocked_schemas: + raise ValueError( + "Cannot specify both allowed_schemas and blocked_schemas" + ) + + # Validate limit controls + if self.max_limit is not None and self.max_limit <= 0: + raise ValueError("max_limit must be a positive integer") + + # Validate multi-statement controls + if not self.allow_multi_statements and self.max_statements is not None: + raise ValueError( + "max_statements is only 
applicable when allow_multi_statements=True" + ) + + if self.max_statements is not None and self.max_statements <= 0: + raise ValueError("max_statements must be a positive integer") + + # Validate column controls + if self.column_context and not self.required_columns: + warnings.warn( + "column_context is set but required_columns is empty - " + "column_context will be ignored" + ) + + # Validate LIMIT controls + if self.max_limit and not self.require_limit: + warnings.warn( + "max_limit is set but require_limit is False - " + "max_limit only enforced if LIMIT clause exists" + ) + + return self diff --git a/evaluators/src/agent_control_evaluators/builtin/sql.py b/evaluators/src/agent_control_evaluators/sql/evaluator.py similarity index 98% rename from evaluators/src/agent_control_evaluators/builtin/sql.py rename to evaluators/src/agent_control_evaluators/sql/evaluator.py index b0bc5ebb..09c1af58 100644 --- a/evaluators/src/agent_control_evaluators/builtin/sql.py +++ b/evaluators/src/agent_control_evaluators/sql/evaluator.py @@ -10,15 +10,13 @@ from typing import Any import sqlglot -from agent_control_models import ( - Evaluator, - EvaluatorMetadata, - EvaluatorResult, - SQLEvaluatorConfig, - register_evaluator, -) +from agent_control_models import EvaluatorResult from sqlglot import exp +from agent_control_evaluators._base import Evaluator, EvaluatorMetadata +from agent_control_evaluators._registry import register_evaluator +from agent_control_evaluators.sql.config import SQLEvaluatorConfig + logger = logging.getLogger(__name__) @@ -332,12 +330,12 @@ def _is_top_level_select(self, select_node: exp.Select) -> bool: A SELECT is considered top-level if it's not nested inside another SELECT. 
This properly handles cases like: - - Simple SELECT: top-level ✓ - - SELECT in subquery: not top-level ✗ - - SELECT inside CTE definition: not top-level ✗ (has outer SELECT as ancestor) - - Main SELECT with CTEs: top-level ✓ (the outer SELECT, not the CTE body) - - SELECT in CREATE VIEW: not top-level ✗ - - SELECT in INSERT SELECT: not top-level ✗ + - Simple SELECT: top-level + - SELECT in subquery: not top-level + - SELECT inside CTE definition: not top-level (has outer SELECT as ancestor) + - Main SELECT with CTEs: top-level (the outer SELECT, not the CTE body) + - SELECT in CREATE VIEW: not top-level + - SELECT in INSERT SELECT: not top-level Args: select_node: SELECT node to check @@ -619,9 +617,9 @@ def _calculate_subquery_depth(self, node: exp.Expression) -> int: """Calculate maximum subquery nesting depth recursively. Depth is the number of nested SELECT layers: - - SELECT ... FROM table → depth 0 - - SELECT ... FROM (SELECT ...) → depth 1 - - SELECT ... FROM (SELECT ... FROM (SELECT ...)) → depth 2 + - SELECT ... FROM table -> depth 0 + - SELECT ... FROM (SELECT ...) -> depth 1 + - SELECT ... FROM (SELECT ... 
FROM (SELECT ...)) -> depth 2 Args: node: Current AST node diff --git a/evaluators/tests/json/__init__.py b/evaluators/tests/json/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/tests/test_json.py b/evaluators/tests/json/test_json.py similarity index 99% rename from evaluators/tests/test_json.py rename to evaluators/tests/json/test_json.py index 77d52243..10908159 100644 --- a/evaluators/tests/test_json.py +++ b/evaluators/tests/json/test_json.py @@ -1,8 +1,7 @@ """Tests for JSON validation evaluator.""" import pytest -from agent_control_models import JSONEvaluatorConfig -from agent_control_evaluators.builtin.json import JSONEvaluator +from agent_control_evaluators.json import JSONEvaluator, JSONEvaluatorConfig class TestJSONParsing: diff --git a/evaluators/tests/sql/__init__.py b/evaluators/tests/sql/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/tests/test_sql.py b/evaluators/tests/sql/test_sql.py similarity index 99% rename from evaluators/tests/test_sql.py rename to evaluators/tests/sql/test_sql.py index 1c2c4826..d8f1be34 100644 --- a/evaluators/tests/test_sql.py +++ b/evaluators/tests/sql/test_sql.py @@ -5,8 +5,8 @@ import pytest from pydantic import ValidationError -from agent_control_models import EvaluatorResult, SQLEvaluatorConfig -from agent_control_evaluators.builtin.sql import SQLEvaluator +from agent_control_models import EvaluatorResult +from agent_control_evaluators.sql import SQLEvaluator, SQLEvaluatorConfig class TestEvaluatorResultValidator: diff --git a/evaluators/tests/test_base.py b/evaluators/tests/test_base.py index e5bdc5da..ef94bee2 100644 --- a/evaluators/tests/test_base.py +++ b/evaluators/tests/test_base.py @@ -8,7 +8,8 @@ from pydantic import BaseModel -from agent_control_models import EvaluatorResult, Evaluator, EvaluatorMetadata +from agent_control_evaluators import Evaluator, EvaluatorMetadata +from agent_control_models import EvaluatorResult class 
MockConfig(BaseModel): diff --git a/examples/galileo/luna2_demo.py b/examples/galileo/luna2_demo.py index 0452400d..82989558 100644 --- a/examples/galileo/luna2_demo.py +++ b/examples/galileo/luna2_demo.py @@ -37,7 +37,7 @@ # Import our direct API client (no SDK required) try: - from agent_control_evaluators.luna2.client import ( + from agent_control_evaluators.galileo_luna2.client import ( GalileoProtectClient, Payload, ) diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py index c828524c..2903b7ab 100644 --- a/models/src/agent_control_models/__init__.py +++ b/models/src/agent_control_models/__init__.py @@ -18,12 +18,8 @@ ControlMatch, ControlScope, ControlSelector, - EvaluatorConfig, EvaluatorResult, - JSONEvaluatorConfig, - ListEvaluatorConfig, - RegexEvaluatorConfig, - SQLEvaluatorConfig, + EvaluatorSpec, ) from .errors import ( ERROR_TITLES, @@ -41,14 +37,6 @@ EvaluationResponse, EvaluationResult, ) -from .evaluator import ( - Evaluator, - EvaluatorMetadata, - clear_evaluators, - get_all_evaluators, - get_evaluator, - register_evaluator, -) from .health import HealthResponse from .observability import ( BatchEventsRequest, @@ -108,21 +96,9 @@ "ControlMatch", "ControlScope", "ControlSelector", - "EvaluatorConfig", + "EvaluatorSpec", "EvaluatorResult", - # Evaluator configs - "JSONEvaluatorConfig", - "ListEvaluatorConfig", - "RegexEvaluatorConfig", - "SQLEvaluatorConfig", - # Evaluator system - "Evaluator", - "EvaluatorMetadata", - "register_evaluator", - "get_evaluator", - "get_all_evaluators", - "clear_evaluators", - # Error models (RFC 7807 / Kubernetes / GitHub-style) + # Error models "ProblemDetail", "ErrorCode", "ErrorReason", diff --git a/models/src/agent_control_models/controls.py b/models/src/agent_control_models/controls.py index a9b9ccab..7ae4be4e 100644 --- a/models/src/agent_control_models/controls.py +++ b/models/src/agent_control_models/controls.py @@ -1,6 +1,5 @@ """Control definition models for 
agent protection.""" -import warnings from typing import Any, Literal, Self from uuid import uuid4 @@ -153,586 +152,16 @@ def validate_stages( # ============================================================================= -# Evaluator Config Models (used by evaluator implementations) +# Unified Evaluator Spec (used in API) # ============================================================================= -class RegexEvaluatorConfig(BaseModel): - """Configuration for regex evaluator.""" - - pattern: str = Field(..., description="Regular expression pattern") - flags: list[str] | None = Field(default=None, description="Regex flags") - - @field_validator("pattern") - @classmethod - def validate_pattern(cls, v: str) -> str: - """Validate that the pattern is a valid regex.""" - try: - re2.compile(v) - except re2.error as e: - raise ValueError(f"Invalid regex pattern: {e}") from e - return v - - -class ListEvaluatorConfig(BaseModel): - """Configuration for list evaluator.""" - - values: list[str | int | float] = Field( - ..., description="List of values to match against" - ) - logic: Literal["any", "all"] = Field( - "any", description="Matching logic: any item matches vs all items match" - ) - match_on: Literal["match", "no_match"] = Field( - "match", description="Trigger rule on match or no match" - ) - match_mode: Literal["exact", "contains"] = Field( - "exact", - description="'exact' for full string match, 'contains' for keyword/substring match", - ) - case_sensitive: bool = Field(False, description="Whether matching is case sensitive") - - -class JSONEvaluatorConfig(BaseModel): - """Configuration for JSON validation evaluator. - - Multiple validation checks can be combined. Checks are evaluated in this order (fail-fast): - 1. JSON syntax/validity (always - ensures data is valid JSON) - 2. JSON Schema validation (if schema provided) - comprehensive structure validation - 3. Required fields check (if required_fields provided) - ensures critical fields exist - 4. 
Type checking (if field_types provided) - validates field types are correct - 5. Field constraints (if field_constraints provided) - validates ranges, enums, string length - 6. Pattern matching (if field_patterns provided) - validates field values match patterns - - This order makes sense because: - - Check syntax first (can't do anything with invalid JSON) - - Check schema next (comprehensive structural validation) - - Check required fields (fail fast if missing critical fields) - - Check types (verify data types before checking constraints) - - Check constraints (validate value ranges/enums after type is confirmed) - - Check patterns last (most specific regex validation) - """ - - # Validation Options (all optional, can be combined) - json_schema: dict[str, Any] | None = Field( - default=None, description="JSON Schema specification (Draft 7 or later)" - ) - - required_fields: list[str] | None = Field( - default=None, - description="List of field paths that must be present (dot notation)", - ) - - field_types: dict[str, str] | None = Field( - default=None, - description=( - "Map of field paths to expected JSON types " - "(string, number, integer, boolean, array, object, null)" - ), - ) - - field_constraints: dict[str, dict[str, Any]] | None = Field( - default=None, - description="Field-level constraints: numeric ranges (min/max), enums, string length", - ) - - field_patterns: dict[str, str | dict[str, Any]] | None = Field( - default=None, - description=( - "Map of field paths to RE2 regex patterns. " - "Can be string (pattern only) or dict with 'pattern' and optional 'flags'" - ), - ) - - # Validation Behavior - allow_extra_fields: bool = Field( - default=True, - description="If False, fail if extra fields exist beyond those specified in field_types", - ) - - allow_null_required: bool = Field( - default=False, - description=( - "If True, required fields can be present but null. 
" - "If False, null is treated as missing" - ), - ) - - pattern_match_logic: Literal["all", "any"] = Field( - default="all", - description=( - "For field_patterns: 'all' requires all patterns to match, " - "'any' requires at least one" - ), - ) - - case_sensitive_enums: bool = Field( - default=True, - description="If False, enum value matching is case-insensitive", - ) - - # Error Handling - allow_invalid_json: bool = Field( - default=False, - description=( - "If True, treat invalid JSON as non-match and allow. " - "If False, block invalid JSON" - ), - ) - - @field_validator("json_schema") - @classmethod - def validate_json_schema(cls, v: dict[str, Any] | None) -> dict[str, Any] | None: - """Ensure the JSON schema itself is valid.""" - if v is None: - return v - from jsonschema import Draft7Validator - - Draft7Validator.check_schema(v) - return v - - @field_validator("field_types") - @classmethod - def validate_type_names(cls, v: dict[str, str] | None) -> dict[str, str] | None: - """Ensure type names are valid JSON types.""" - if v is None: - return v - valid_types = { - "string", - "number", - "integer", - "boolean", - "array", - "object", - "null", - } - for path, type_name in v.items(): - if type_name not in valid_types: - raise ValueError(f"Invalid type '{type_name}' for field '{path}'") - return v - - @field_validator("field_patterns") - @classmethod - def validate_patterns( - cls, v: dict[str, str | dict[str, Any]] | None - ) -> dict[str, str | dict[str, Any]] | None: - """Validate all regex patterns compile.""" - if v is None: - return v - - for path, pattern_config in v.items(): - # Support both string (simple) and dict (with flags) formats - if isinstance(pattern_config, str): - pattern = pattern_config - flags = None - elif isinstance(pattern_config, dict): - if "pattern" not in pattern_config: - raise ValueError( - f"Pattern config for field '{path}' must have 'pattern' key" - ) - pattern = pattern_config["pattern"] - flags = pattern_config.get("flags") 
- - # Validate flags if provided - if flags is not None: - if not isinstance(flags, list): - raise ValueError(f"Flags for field '{path}' must be a list") - valid_flags = {"IGNORECASE"} - for flag in flags: - if flag not in valid_flags: - raise ValueError( - f"Invalid flag '{flag}' for field '{path}'. " - f"Valid flags: {valid_flags}" - ) - else: - raise ValueError( - f"Pattern for field '{path}' must be string or dict" - ) - - # Validate pattern compiles - try: - re2.compile(pattern) - except re2.error as e: - raise ValueError(f"Invalid regex for field '{path}': {e}") from e - - return v - - @field_validator("field_constraints") - @classmethod - def validate_constraints( - cls, v: dict[str, dict[str, Any]] | None - ) -> dict[str, dict[str, Any]] | None: - """Validate constraint definitions.""" - if v is None: - return v - - for field_path, constraints in v.items(): - # Must have at least one constraint type - valid_keys = {"type", "min", "max", "enum", "min_length", "max_length"} - if not any(k in constraints for k in valid_keys): - raise ValueError( - f"Constraint for '{field_path}' must specify at least one constraint" - ) - - # Validate numeric constraints - if "min" in constraints or "max" in constraints: - if "type" in constraints and constraints["type"] not in ( - "number", - "integer", - ): - raise ValueError( - f"min/max constraints require type 'number' or 'integer' for '{field_path}'" - ) - - # Validate enum - if "enum" in constraints: - if ( - not isinstance(constraints["enum"], list) - or len(constraints["enum"]) == 0 - ): - raise ValueError( - f"enum constraint must be a non-empty list for '{field_path}'" - ) - - # Validate string length - if "min_length" in constraints or "max_length" in constraints: - if "min_length" in constraints and not isinstance( - constraints["min_length"], int - ): - raise ValueError( - f"min_length must be an integer for '{field_path}'" - ) - if "max_length" in constraints and not isinstance( - constraints["max_length"], int 
- ): - raise ValueError( - f"max_length must be an integer for '{field_path}'" - ) - - return v - - @model_validator(mode="after") - def validate_has_checks(self) -> Self: - """Ensure at least one validation check is configured.""" - if not any( - [ - self.json_schema, - self.field_types, - self.required_fields, - self.field_constraints, - self.field_patterns, - ] - ): - raise ValueError( - "At least one validation check must be configured: " - "json_schema, field_types, required_fields, field_constraints, or field_patterns" - ) - return self - - -class SQLEvaluatorConfig(BaseModel): - """Configuration for comprehensive SQL control evaluator. - - Validates SQL query strings using AST-based analysis via sqlglot. - Controls are evaluated in order: - syntax → multi-statement → operations → tables/schemas → columns → limits. - """ - - # Multi-Statement - allow_multi_statements: bool = Field( - default=True, - description=( - "Whether to allow multiple SQL statements in a single query. " - "Set to False to prevent queries like 'SELECT x; DROP TABLE y' " - "(SQL injection prevention). " - "When False, queries with multiple statements are blocked. " - "Cannot be used with max_statements (use one or the other)." - ), - ) - max_statements: int | None = Field( - default=None, - description=( - "Maximum number of statements allowed (e.g., 2 allows up to 2 statements). " - "Only applicable when allow_multi_statements=True. " - "Must be a positive integer. " - "Use this to allow controlled multi-statement queries while preventing abuse." - ), - ) - - # Operations - blocked_operations: list[str] | None = Field( - default=None, - description=( - "SQL operations to block (e.g., ['DROP', 'DELETE', 'TRUNCATE']). " - "Cannot be used with allowed_operations. " - "Use this for blocklist mode where most operations are allowed except specific ones." 
- ), - ) - allowed_operations: list[str] | None = Field( - default=None, - description=( - "SQL operations to allow (e.g., ['SELECT'] for read-only). " - "Cannot be used with blocked_operations. " - "When set, all operations NOT in this list are blocked (allowlist mode). " - "Can be combined with block_ddl/block_dcl for stricter control " - "(e.g., allowed_operations=['SELECT'] + block_ddl=True enforces both)." - ), - ) - block_ddl: bool = Field( - default=False, - description=( - "Block all DDL operations (CREATE, ALTER, DROP, TRUNCATE, RENAME, COMMENT). " - "Adds DDL operations to the blocklist. " - "Can be combined with either blocked_operations or allowed_operations (but not both, " - "since those are mutually exclusive). " - "Example: allowed_operations=['SELECT'] + block_ddl=True = read-only with DDL blocked." - ), - ) - block_dcl: bool = Field( - default=False, - description=( - "Block all DCL operations (GRANT, REVOKE). " - "Adds DCL operations to the blocklist. " - "Can be combined with either blocked_operations or allowed_operations (but not both, " - "since those are mutually exclusive). " - "Useful for preventing privilege escalation even with allowed operations." - ), - ) - - # Table/Schema Access - allowed_tables: list[str] | None = Field( - default=None, - description=( - "Table names allowed (e.g., ['users', 'orders']). " - "Cannot be used with blocked_tables. " - "When set, all tables NOT in this list are blocked (allowlist mode). " - "Case sensitivity controlled by case_sensitive field." - ), - ) - blocked_tables: list[str] | None = Field( - default=None, - description=( - "Table names to block (e.g., ['sensitive_data', 'admin_users']). " - "Cannot be used with allowed_tables. " - "Use this for blocklist mode where most tables are allowed except specific ones. " - "Case sensitivity controlled by case_sensitive field." 
- ), - ) - allowed_schemas: list[str] | None = Field( - default=None, - description=( - "Schema names allowed (e.g., ['public', 'analytics']). " - "Cannot be used with blocked_schemas. " - "When set, all schemas NOT in this list are blocked (allowlist mode). " - "Case sensitivity controlled by case_sensitive field." - ), - ) - blocked_schemas: list[str] | None = Field( - default=None, - description=( - "Schema names to block (e.g., ['system', 'admin', 'internal']). " - "Cannot be used with allowed_schemas. " - "Use this for blocklist mode where most schemas are allowed except specific ones. " - "Case sensitivity controlled by case_sensitive field." - ), - ) - - # Column Presence - required_columns: list[str] | None = Field( - default=None, - description=( - "Columns that must be present in the query " - "(e.g., ['tenant_id'] for multi-tenant security). " - "Use with column_presence_logic to control 'any' vs 'all' matching. " - "Use with column_context to restrict where columns must appear " - "(WHERE, SELECT, or anywhere). " - "Case sensitivity controlled by case_sensitive field." - ), - ) - column_presence_logic: Literal["any", "all"] = Field( - default="any", - description=( - "Matching logic for required_columns. " - "'any': at least one required column must be present (OR logic). " - "'all': all required columns must be present (AND logic). " - "Only applicable when required_columns is set." - ), - ) - column_context: Literal["select", "where"] | None = Field( - default=None, - description=( - "Where required columns must appear. " - "'select': columns must appear in SELECT clause. " - "'where': columns must appear in WHERE clause (common for tenant_id filtering). " - "None: columns can appear anywhere in the query. " - "Only applicable when required_columns is set." - ), - ) - column_context_scope: Literal["top_level", "all"] = Field( - default="all", - description=( - "Scope for column_context checking when set to 'where' or 'select'. 
" - "top_level: Only check columns in the top-level WHERE/SELECT clause " - "(recommended for multi-tenant RLS security). " - "all: Check columns in all WHERE/SELECT clauses including subqueries " - "(default for backward compatibility). " - "Only applies when column_context is 'where' or 'select'. " - "For multi-tenant security, use column_context='where' with " - "column_context_scope='top_level' to ensure tenant filtering is in " - "the outer query, not just subqueries." - ), - ) - - # Limits - require_limit: bool = Field( - default=False, - description=( - "Require SELECT queries to have a LIMIT clause. " - "Prevents accidentally pulling millions of rows. " - "Only applies to SELECT queries; INSERT/UPDATE/DELETE are unaffected. " - "Combine with max_limit to enforce a maximum value." - ), - ) - max_limit: int | None = Field( - default=None, - description=( - "Maximum allowed LIMIT value (e.g., 1000 prevents LIMIT 10000). " - "Only applies to SELECT queries with a LIMIT clause. " - "Must be a positive integer. " - "If LIMIT value cannot be determined (e.g., LIMIT ALL), behavior depends on fail_safe." - ), - ) - max_result_window: int | None = Field( - default=None, - description=( - "Maximum value of (LIMIT + OFFSET) for pagination control. " - "Prevents deep pagination attacks where large OFFSET values can " - "cause expensive queries. Similar to Elasticsearch's max_result_window. " - "Example: max_result_window=10000 allows 'LIMIT 100 OFFSET 9900' " - "but blocks 'LIMIT 10 OFFSET 10000'. " - "None (default): No limit on pagination depth. " - "Recommended: Set to 10000 or similar value to prevent abuse." - ), - ) - - # Options - case_sensitive: bool = Field( - default=False, - description=( - "Whether table/column/schema name matching is case sensitive. " - "False (default): 'Users' matches 'users'. " - "True: 'Users' does NOT match 'users'. " - "Applies to allowed_tables, blocked_tables, allowed_schemas, " - "blocked_schemas, and required_columns." 
- ), - ) - dialect: Literal["postgres", "mysql", "tsql", "oracle", "sqlite"] = Field( - default="postgres", - description=( - "SQL dialect to use for parsing. " - "Affects how sqlglot interprets SQL syntax. " - "postgres (default): Standard SQL, case-insensitive identifiers, " - "quoted with \", most ANSI-compliant. " - "mysql: MySQL-specific syntax, case-sensitive on Unix/Linux, " - "backtick-quoted identifiers. " - "tsql: T-SQL/SQL Server, bracket-quoted identifiers [like_this], " - "supports CAST differently. " - "oracle: Oracle-specific syntax, quoted identifiers with \", " - "supports -- comments. " - "sqlite: Lightweight SQL, double-quoted identifiers, supports " - "AUTOINCREMENT and datetime functions. " - "Choose based on the target database system." - ), - ) - - # Query Complexity Limits (Issue #13) - max_subquery_depth: int | None = Field( - default=None, - description=( - "Maximum nesting depth for subqueries. " - "Prevents DoS via deeply nested queries like SELECT FROM (SELECT FROM (SELECT...)). " - "None (default): No limit. " - "Recommended: 5-10 for typical applications." - ), - ) - max_joins: int | None = Field( - default=None, - description=( - "Maximum number of JOIN operations in a single query. " - "Prevents cartesian product attacks and expensive multi-way joins. " - "None (default): No limit. " - "Recommended: 10-20 depending on use case." - ), - ) - max_union_count: int | None = Field( - default=None, - description=( - "Maximum number of UNION/UNION ALL/INTERSECT/EXCEPT operations. " - "Prevents DoS via massive UNION chains. " - "None (default): No limit. " - "Recommended: 10-50 depending on use case." 
- ), - ) - - @model_validator(mode="after") - def validate_config(self) -> Self: - """Validate configuration constraints.""" - # Validate operation restrictions - if self.blocked_operations and self.allowed_operations: - raise ValueError( - "Cannot specify both blocked_operations and allowed_operations" - ) - - # Validate table restrictions - if self.allowed_tables and self.blocked_tables: - raise ValueError("Cannot specify both allowed_tables and blocked_tables") - - # Validate schema restrictions - if self.allowed_schemas and self.blocked_schemas: - raise ValueError( - "Cannot specify both allowed_schemas and blocked_schemas" - ) - - # Validate limit controls - if self.max_limit is not None and self.max_limit <= 0: - raise ValueError("max_limit must be a positive integer") - - # Validate multi-statement controls - if not self.allow_multi_statements and self.max_statements is not None: - raise ValueError( - "max_statements is only applicable when allow_multi_statements=True" - ) - - if self.max_statements is not None and self.max_statements <= 0: - raise ValueError("max_statements must be a positive integer") - - # Validate column controls - if self.column_context and not self.required_columns: - warnings.warn( - "column_context is set but required_columns is empty - " - "column_context will be ignored" - ) - - # Validate LIMIT controls - if self.max_limit and not self.require_limit: - warnings.warn( - "max_limit is set but require_limit is False - " - "max_limit only enforced if LIMIT clause exists" - ) - - return self - - -# ============================================================================= -# Unified Evaluator Config (used in API) -# ============================================================================= - - -class EvaluatorConfig(BaseModel): - """Evaluator configuration. See GET /evaluators for available evaluators and schemas. +class EvaluatorSpec(BaseModel): + """Evaluator specification. 
See GET /evaluators for available evaluators and schemas. Evaluator reference formats: - - Built-in: "regex", "list" + - Built-in: "regex", "list", "json", "sql" + - External: "galileo-luna2" (requires agent-control-evaluators[luna2]) - Agent-scoped: "my-agent:my-evaluator" (validated in endpoint, not here) """ @@ -762,11 +191,17 @@ def validate_evaluator_config(self) -> Self: return self # Built-in evaluators: validate config against evaluator's config_model - from .evaluator import get_evaluator + # This import is optional - evaluators package may not be installed + try: + from agent_control_evaluators import get_evaluator + + evaluator_cls = get_evaluator(self.name) + if evaluator_cls: + evaluator_cls.config_model(**self.config) + except ImportError: + # Evaluators package not installed - skip validation + pass - evaluator_cls = get_evaluator(self.name) - if evaluator_cls: - evaluator_cls.config_model(**self.config) # If evaluator not found, allow it (might be a server-side registered evaluator) return self @@ -802,7 +237,7 @@ class ControlDefinition(BaseModel): selector: ControlSelector = Field(..., description="What data to select from the payload") # How to check (unified evaluator-based system) - evaluator: EvaluatorConfig = Field(..., description="How to evaluate the selected data") + evaluator: EvaluatorSpec = Field(..., description="How to evaluate the selected data") # What to do action: ControlAction = Field(..., description="What action to take when control matches") diff --git a/sdks/python/src/agent_control/__init__.py b/sdks/python/src/agent_control/__init__.py index 11990d11..2ff847e7 100644 --- a/sdks/python/src/agent_control/__init__.py +++ b/sdks/python/src/agent_control/__init__.py @@ -44,6 +44,19 @@ async def process(input: str) -> str: import httpx +if TYPE_CHECKING: + from agent_control_models import ( + Agent, + ControlAction, + ControlDefinition, + ControlSelector, + EvaluationRequest, + EvaluationResult, + EvaluatorSpec, + Step, + 
StepSchema, + ) + from . import agents, controls, evaluation, evaluators, policies # Import client and operations modules @@ -87,7 +100,7 @@ async def process(input: str) -> str: ControlSelector, EvaluationRequest, EvaluationResult, - EvaluatorConfig, + EvaluatorSpec, Step, StepSchema, ) @@ -104,7 +117,7 @@ class ControlSelector: class ControlAction: pass - class EvaluatorConfig: + class EvaluatorSpec: pass class Agent: # runtime fallback @@ -1012,15 +1025,6 @@ async def main(): return await policies.list_policy_controls(client, policy_id) -# Note: The @control decorator is imported from control_decorators.py -# It applies server-defined policies to agent functions. -# See: from agent_control import control - - -# ============================================================================ -# Exports -# ============================================================================ - __all__ = [ # Initialization "init", @@ -1033,56 +1037,38 @@ async def main(): # SDK Logging "get_logger", - # Agent management "get_agent", "list_agents", - # Control management "create_control", "list_controls", "get_control", "delete_control", "update_control", - - # Decorator (server-side policy evaluation) - "control", - - # Control Decorator + # Decorator "control", "ControlViolationError", - # Client "AgentControlClient", - # Operation modules "agents", "policies", "controls", "evaluation", "evaluators", - # Policy-Control management "add_control_to_policy", "remove_control_from_policy", "list_policy_controls", - - - # Tool inference utilities - "tool", - "extract_tools_from_functions", - "tools_from_module", - # Local evaluation "check_evaluation_with_local", - # Tracing "get_trace_and_span_ids", "get_current_trace_id", "get_current_span_id", "with_trace", "is_otel_available", - # Observability "init_observability", "add_event", @@ -1093,8 +1079,7 @@ async def main(): "get_log_config", "log_control_evaluation", "LogConfig", - - # Models (if available) + # Models (re-exported when 
available) "Agent", "Step", "StepSchema", @@ -1103,7 +1088,7 @@ async def main(): "ControlDefinition", "ControlSelector", "ControlAction", - "EvaluatorConfig", + "EvaluatorSpec", ] try: diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py index 6a34537c..4c18102b 100644 --- a/sdks/python/src/agent_control/evaluators/__init__.py +++ b/sdks/python/src/agent_control/evaluators/__init__.py @@ -22,7 +22,7 @@ ensure_evaluators_discovered, list_evaluators, ) -from agent_control_models import register_evaluator +from agent_control_evaluators import register_evaluator from .base import Evaluator, EvaluatorMetadata @@ -37,7 +37,7 @@ # Optionally export Luna-2 types when available try: - from agent_control_evaluators.luna2 import ( # noqa: F401 + from agent_control_evaluators.galileo_luna2 import ( # noqa: F401 LUNA2_AVAILABLE, Luna2Evaluator, Luna2EvaluatorConfig, diff --git a/sdks/python/src/agent_control/evaluators/base.py b/sdks/python/src/agent_control/evaluators/base.py index 38236d81..33c23f9f 100644 --- a/sdks/python/src/agent_control/evaluators/base.py +++ b/sdks/python/src/agent_control/evaluators/base.py @@ -1,9 +1,9 @@ """Base classes for agent_control evaluators. -Re-exports from agent_control_models for convenience. +Re-exports from agent_control_evaluators for convenience. """ -# Re-export from the models package (where they're defined) -from agent_control_models import Evaluator, EvaluatorMetadata +# Re-export from the evaluators package (where they're now defined) +from agent_control_evaluators import Evaluator, EvaluatorMetadata __all__ = ["Evaluator", "EvaluatorMetadata"] diff --git a/sdks/python/tests/test_evaluators.py b/sdks/python/tests/test_evaluators.py index 7fcfb5b0..777a11d5 100644 --- a/sdks/python/tests/test_evaluators.py +++ b/sdks/python/tests/test_evaluators.py @@ -4,7 +4,7 @@ requiring actual evaluator implementations or external services. 
Evaluators take config at __init__, evaluate() only takes data. -Registry is now in agent_control_models, discovery in agent_control_engine. +Registry, base classes, and discovery are in agent_control_evaluators. """ import pytest @@ -19,8 +19,8 @@ list_evaluators, register_evaluator, ) -from agent_control_models import clear_evaluators -from agent_control_engine.discovery import reset_evaluator_discovery +from agent_control_evaluators import clear_evaluators +from agent_control_engine import reset_evaluator_discovery from agent_control_models.controls import EvaluatorResult @@ -236,7 +236,7 @@ def test_discover_evaluators_only_runs_once(self): # Second call should return 0 (already discovered) assert count2 == 0 - @patch("agent_control_engine.discovery.entry_points") + @patch("agent_control_evaluators._discovery.entry_points") def test_discover_evaluators_loads_entry_points(self, mock_entry_points): """Test loading evaluators via entry points.""" mock_ep = MagicMock() diff --git a/sdks/python/tests/test_luna2_evaluator.py b/sdks/python/tests/test_luna2_evaluator.py index 6c4f4742..7b30c63b 100644 --- a/sdks/python/tests/test_luna2_evaluator.py +++ b/sdks/python/tests/test_luna2_evaluator.py @@ -13,7 +13,8 @@ import pytest from pydantic import ValidationError -from agent_control_models import EvaluatorResult, Evaluator +from agent_control_evaluators import Evaluator +from agent_control_models import EvaluatorResult def create_mock_protect_response( @@ -23,7 +24,7 @@ def create_mock_protect_response( execution_time: float = 100.0, ) -> MagicMock: """Create a mock ProtectResponse object for testing.""" - from agent_control_evaluators.luna2.client import ProtectResponse, TraceMetadata + from agent_control_evaluators.galileo_luna2.client import ProtectResponse, TraceMetadata return ProtectResponse( status=status, @@ -44,7 +45,7 @@ class TestLuna2EvaluatorConfig: def test_local_stage_config_valid(self): """Test valid local stage configuration.""" - from 
agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -62,7 +63,7 @@ def test_local_stage_config_valid(self): def test_local_stage_config_with_numeric_target(self): """Test local stage configuration with numeric target_value.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -76,7 +77,7 @@ def test_local_stage_config_with_numeric_target(self): def test_central_stage_config_valid(self): """Test valid central stage configuration.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="central", @@ -90,7 +91,7 @@ def test_central_stage_config_valid(self): def test_local_stage_requires_metric(self): """Test local stage requires metric field.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="metric.*required"): Luna2EvaluatorConfig( @@ -101,7 +102,7 @@ def test_local_stage_requires_metric(self): def test_local_stage_requires_operator(self): """Test local stage requires operator field.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="operator.*required"): Luna2EvaluatorConfig( @@ -112,7 +113,7 @@ def test_local_stage_requires_operator(self): def test_local_stage_requires_target_value(self): """Test local stage requires target_value field.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig with 
pytest.raises(ValidationError, match="target_value.*required"): Luna2EvaluatorConfig( @@ -123,7 +124,7 @@ def test_local_stage_requires_target_value(self): def test_central_stage_requires_stage_name(self): """Test central stage requires stage_name field.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="stage_name.*required"): Luna2EvaluatorConfig( @@ -133,7 +134,7 @@ def test_central_stage_requires_stage_name(self): def test_timeout_ms_validation(self): """Test timeout_ms must be within valid range.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig # Too low with pytest.raises(ValidationError): @@ -161,7 +162,7 @@ def test_timeout_ms_validation(self): def test_on_error_validation(self): """Test on_error must be 'allow' or 'deny'.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig config_allow = Luna2EvaluatorConfig( stage_type="central", @@ -186,7 +187,7 @@ def test_on_error_validation(self): def test_metric_validation(self): """Test metric must be a valid Luna2 metric.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig # Valid metrics valid_metrics = [ @@ -217,7 +218,7 @@ def test_metric_validation(self): def test_operator_validation(self): """Test operator must be a valid Luna2 operator.""" - from agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig valid_operators = ["gt", "lt", "gte", "lte", "eq", "contains", "any"] for op in valid_operators: @@ -239,7 +240,7 @@ def test_operator_validation(self): def test_model_dump(self): """Test config can be dumped to dict.""" - from 
agent_control_evaluators.luna2 import Luna2EvaluatorConfig + from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -263,10 +264,10 @@ class TestLuna2EvaluatorInheritance: """Tests for Luna-2 evaluator inheritance.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_evaluator_extends_base(self): """Test Luna2Evaluator extends Evaluator.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator assert issubclass(Luna2Evaluator, Evaluator) @@ -275,36 +276,36 @@ class TestLuna2EvaluatorImport: """Tests for Luna-2 evaluator import and initialization.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_luna2_evaluator_import_success(self): """Test importing Luna-2 evaluator with dependencies available.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator assert Luna2Evaluator is not None assert Luna2Evaluator.metadata.name == "galileo-luna2" assert Luna2Evaluator.metadata.version == "2.0.0" - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", False) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", False) def test_luna2_evaluator_is_available_false_without_httpx(self): """Test that is_available() returns False when httpx is not installed.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator # When httpx is not available, is_available() should return False assert 
Luna2Evaluator.is_available() is False - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_luna2_evaluator_is_available_true_with_httpx(self): """Test that is_available() returns True when httpx is installed.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator # When httpx is available, is_available() should return True assert Luna2Evaluator.is_available() is True - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @patch.dict(os.environ, {}, clear=True) def test_luna2_evaluator_init_without_api_key_raises_error(self): """Test that initializing without API key raises ValueError.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -321,10 +322,10 @@ class TestLuna2EvaluatorMetadata: """Tests for Luna-2 evaluator metadata.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_metadata_fields(self): """Test Luna-2 evaluator metadata fields.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator metadata = Luna2Evaluator.metadata @@ -335,10 +336,10 @@ def test_metadata_fields(self): assert Luna2Evaluator.config_model is not None @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_config_schema_supported_metrics(self): """Test config schema includes 
all supported metrics.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator schema = Luna2Evaluator.config_model.model_json_schema() # Pydantic uses anyOf with const for Literal types @@ -364,12 +365,12 @@ class TestLuna2EvaluatorLocalStage: """Tests for Luna-2 evaluator with local stages.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_triggered(self): """Test local stage evaluation when rule is triggered.""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient # Create mock response with triggered status mock_response = create_mock_protect_response( @@ -404,12 +405,12 @@ async def test_local_stage_triggered(self): assert result.metadata["status"] == "triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_not_triggered(self): """Test local stage evaluation when rule is not triggered.""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient mock_response = create_mock_protect_response( status="not_triggered", @@ -439,12 +440,12 @@ async def test_local_stage_not_triggered(self): assert result.metadata["status"] 
== "not_triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_with_timeout_ms(self): """Test local stage respects timeout_ms configuration.""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient mock_response = create_mock_protect_response() @@ -476,12 +477,12 @@ class TestLuna2EvaluatorCentralStage: """Tests for Luna-2 evaluator with central stages.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_central_stage_evaluation(self): """Test central stage evaluation.""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient mock_response = create_mock_protect_response( status="triggered", @@ -509,12 +510,12 @@ async def test_central_stage_evaluation(self): assert result.metadata["status"] == "triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_central_stage_without_version(self): """Test central stage without pinned version.""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from 
agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient mock_response = create_mock_protect_response(trace_id="trace-latest") @@ -542,10 +543,10 @@ class TestLuna2EvaluatorPayloadPreparation: """Tests for payload preparation logic.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_input_metric_payload(self): """Test payload for input metrics uses _prepare_payload correctly.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -562,10 +563,10 @@ def test_input_metric_payload(self): assert payload.output == "" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_output_metric_payload(self): """Test payload for output metrics uses _prepare_payload correctly.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -582,10 +583,10 @@ def test_output_metric_payload(self): assert payload.output == "llm output text" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_payload_field_override(self): """Test explicit payload_field configuration.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { 
"stage_type": "central", @@ -605,12 +606,12 @@ class TestLuna2EvaluatorErrorHandling: """Tests for error handling in Luna-2 evaluator.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_error_with_fail_open(self): """Test error handling with fail open (default).""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -635,12 +636,12 @@ async def test_error_with_fail_open(self): assert result.metadata["fallback_action"] == "allow" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_error_with_fail_closed(self): """Test error handling with fail closed.""" - from agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -665,12 +666,12 @@ async def test_error_with_fail_closed(self): assert result.metadata["fallback_action"] == "deny" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_empty_response_handling(self): """Test handling of empty/None response.""" - from 
agent_control_evaluators.luna2 import Luna2Evaluator - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -697,10 +698,10 @@ class TestLuna2EvaluatorTimeoutHelper: """Tests for timeout helper method.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_get_timeout_from_config(self): """Test timeout conversion from config.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -714,10 +715,10 @@ def test_get_timeout_from_config(self): assert evaluator.get_timeout_seconds() == 5.0 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_get_timeout_from_default(self): """Test timeout uses metadata default.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -735,10 +736,10 @@ class TestLuna2EvaluatorNumericTargetValue: """Tests for numeric target_value handling.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_numeric_target_value_float(self): """Test evaluator accepts float target_value.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { 
"stage_type": "local", @@ -751,10 +752,10 @@ def test_numeric_target_value_float(self): assert evaluator._get_numeric_target_value() == 0.5 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_numeric_target_value_int(self): """Test evaluator accepts int target_value.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -767,10 +768,10 @@ def test_numeric_target_value_int(self): assert evaluator._get_numeric_target_value() == 1 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) def test_string_target_value_converts_to_float(self): """Test evaluator converts string target_value to float.""" - from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.galileo_luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -788,7 +789,7 @@ class TestGalileoProtectClient: def test_client_init_with_api_key(self): """Test client initialization with API key.""" - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient() @@ -796,7 +797,7 @@ def test_client_init_with_api_key(self): def test_client_init_without_api_key_raises(self): """Test client raises error without API key.""" - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient with patch.dict(os.environ, {}, clear=True): with pytest.raises(ValueError, 
match="GALILEO_API_KEY"): @@ -804,7 +805,7 @@ def test_client_init_without_api_key_raises(self): def test_derive_api_url_from_console_url(self): """Test API URL derivation from console URL.""" - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient( @@ -814,7 +815,7 @@ def test_derive_api_url_from_console_url(self): def test_derive_api_url_default(self): """Test default API URL.""" - from agent_control_evaluators.luna2.client import GalileoProtectClient + from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient() @@ -826,14 +827,14 @@ class TestPayloadDataClasses: def test_payload_to_dict(self): """Test Payload.to_dict() method.""" - from agent_control_evaluators.luna2.client import Payload + from agent_control_evaluators.galileo_luna2.client import Payload payload = Payload(input="test input", output="test output") assert payload.to_dict() == {"input": "test input", "output": "test output"} def test_rule_to_dict(self): """Test Rule.to_dict() method.""" - from agent_control_evaluators.luna2.client import Rule + from agent_control_evaluators.galileo_luna2.client import Rule rule = Rule(metric="input_toxicity", operator="gt", target_value=0.5) assert rule.to_dict() == { @@ -844,7 +845,7 @@ def test_rule_to_dict(self): def test_ruleset_to_dict(self): """Test Ruleset.to_dict() method.""" - from agent_control_evaluators.luna2.client import PassthroughAction, Rule, Ruleset + from agent_control_evaluators.galileo_luna2.client import PassthroughAction, Rule, Ruleset ruleset = Ruleset( rules=[Rule(metric="input_toxicity", operator="gt", target_value=0.5)], @@ -858,7 +859,7 @@ def test_ruleset_to_dict(self): def test_protect_response_from_dict(self): """Test 
ProtectResponse.from_dict() method.""" - from agent_control_evaluators.luna2.client import ProtectResponse + from agent_control_evaluators.galileo_luna2.client import ProtectResponse data = { "status": "triggered", diff --git a/server/src/agent_control_server/config.py b/server/src/agent_control_server/config.py index 2dceb9dc..3776b7fa 100644 --- a/server/src/agent_control_server/config.py +++ b/server/src/agent_control_server/config.py @@ -95,8 +95,6 @@ def get_url(self) -> str: class Settings(BaseSettings): """Server configuration settings.""" - # TODO: Clean this up since we may want to connect to pg, etc., so - # database_url may have to go model_config = SettingsConfigDict( env_file=".env", diff --git a/server/src/agent_control_server/endpoints/__init__.py b/server/src/agent_control_server/endpoints/__init__.py index e69de29b..ef925220 100644 --- a/server/src/agent_control_server/endpoints/__init__.py +++ b/server/src/agent_control_server/endpoints/__init__.py @@ -0,0 +1 @@ +"""API endpoint routers for the Agent Control server.""" From 33ae401a22abcd76ae778c87d8a1af45f92858f8 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Thu, 29 Jan 2026 22:08:41 +0530 Subject: [PATCH 02/21] refactor(evaluators): extend EvaluatorConfig from project's BaseModel EvaluatorConfig now extends agent_control_models.base.BaseModel instead of pydantic.BaseModel directly, inheriting standard model behavior: - populate_by_name, use_enum_values, validate_assignment - to_dict(), to_json(), from_dict(), from_json() helpers - extra="ignore" for forward compatibility --- evaluators/src/agent_control_evaluators/_base.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/evaluators/src/agent_control_evaluators/_base.py b/evaluators/src/agent_control_evaluators/_base.py index 3fb53554..b6e7048d 100644 --- a/evaluators/src/agent_control_evaluators/_base.py +++ b/evaluators/src/agent_control_evaluators/_base.py @@ -7,9 +7,8 @@ from dataclasses import dataclass from 
typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar -from pydantic import BaseModel, ConfigDict - from agent_control_models import EvaluatorResult +from agent_control_models.base import BaseModel if TYPE_CHECKING: from typing import Self @@ -20,8 +19,8 @@ class EvaluatorConfig(BaseModel): """Base class for typed evaluator configurations. - All evaluator config classes should extend this to ensure consistent - behavior and enable type checking. + Extends the project's BaseModel to ensure consistent behavior + and enable type checking across all evaluator configs. Example: ```python @@ -33,7 +32,7 @@ class MyEvaluatorConfig(EvaluatorConfig): ``` """ - model_config = ConfigDict(extra="forbid") + pass ConfigT = TypeVar("ConfigT", bound=EvaluatorConfig) From 2b3713e54d81eccddc1403bde890a2644d77d1d3 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Thu, 29 Jan 2026 22:23:11 +0530 Subject: [PATCH 03/21] test(evaluators): use EvaluatorConfig in test_base.py MockConfig Update MockConfig to extend EvaluatorConfig instead of pydantic's BaseModel for consistency with the new evaluator config pattern. 
--- evaluators/tests/test_base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/evaluators/tests/test_base.py b/evaluators/tests/test_base.py index ef94bee2..776a8d01 100644 --- a/evaluators/tests/test_base.py +++ b/evaluators/tests/test_base.py @@ -6,13 +6,11 @@ import pytest from typing import Any -from pydantic import BaseModel - -from agent_control_evaluators import Evaluator, EvaluatorMetadata +from agent_control_evaluators import Evaluator, EvaluatorConfig, EvaluatorMetadata from agent_control_models import EvaluatorResult -class MockConfig(BaseModel): +class MockConfig(EvaluatorConfig): """Config model for mock evaluator.""" should_match: bool = False From 5fb2f7dffb9e6a779c1ac60316386f7980eb584f Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Thu, 29 Jan 2026 23:18:24 +0530 Subject: [PATCH 04/21] docs: update documentation for evaluator refactoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix EvaluatorConfig → EvaluatorSpec in examples and models README - Fix luna2 → galileo-luna2 with proper config in customer_support example - Fix luna2/ → galileo_luna2/ path in galileo example - Rename luna2 entry point to galileo-luna2 in pyproject.toml - Update CONTRIBUTING.md with flat directory structure and correct imports - Update AGENTS.md with correct register_evaluator import source - Add ensure_evaluators_discovered() call in EvaluatorSpec validation - Remove stale TODOs in sdks/python/tests/conftest.py - Add docstrings to evaluators/tests/{json,sql}/__init__.py --- AGENTS.md | 2 +- CONTRIBUTING.md | 46 +++++++++++++-------- evaluators/pyproject.toml | 2 +- evaluators/tests/json/__init__.py | 1 + evaluators/tests/sql/__init__.py | 1 + examples/README.md | 4 +- examples/customer_support_agent/README.md | 7 +++- examples/galileo/README.md | 2 +- models/README.md | 2 +- models/src/agent_control_models/controls.py | 4 +- sdks/python/tests/conftest.py | 4 -- 11 files changed, 44 
insertions(+), 31 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 32d05971..04df435a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -68,7 +68,7 @@ All testing guidance (including “behavior changes require tests”) lives in ` - Add a new evaluator: 1) implement evaluator class extending `Evaluator` in `evaluators/src/agent_control_evaluators/` - 2) use `@register_evaluator` decorator (from `agent_control_models`) + 2) use `@register_evaluator` decorator (from `agent_control_evaluators`) 3) add entry point in `evaluators/pyproject.toml` for auto-discovery 4) add tests in the evaluators package 5) evaluator is automatically available to server and SDK via `discover_evaluators()` diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index be940723..1ef4360c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -221,14 +221,23 @@ Extensible evaluators for custom detection logic. # Location evaluators/src/agent_control_evaluators/ -# Key directories -├── builtin/ # Built-in evaluators -│ ├── regex.py # RegexEvaluator - pattern matching -│ └── list.py # ListEvaluator - value matching -└── luna2/ # Galileo Luna-2 integration - ├── evaluator.py # Luna2Evaluator implementation - ├── config.py # Luna2Config model - └── client.py # Direct HTTP client (no SDK dependency) +# Key directories (flat structure - each evaluator is a peer directory) +├── regex/ # Regex pattern matching evaluator +│ ├── config.py # RegexEvaluatorConfig +│ └── evaluator.py # RegexEvaluator implementation +├── list/ # List value matching evaluator +│ ├── config.py # ListEvaluatorConfig +│ └── evaluator.py # ListEvaluator implementation +├── json/ # JSON validation evaluator +│ ├── config.py # JsonEvaluatorConfig +│ └── evaluator.py # JsonEvaluator implementation +├── sql/ # SQL validation evaluator +│ ├── config.py # SqlEvaluatorConfig +│ └── evaluator.py # SqlEvaluator implementation +└── galileo_luna2/ # Galileo Luna-2 integration + ├── config.py # Luna2EvaluatorConfig + ├── evaluator.py # Luna2Evaluator 
implementation + └── client.py # Direct HTTP client (no SDK dependency) ``` **Adding a new evaluator:** @@ -251,8 +260,8 @@ evaluators/src/agent_control_evaluators/ 3. **Implement evaluator (`evaluator.py`):** ```python from typing import Any - from agent_control_models import ( - EvaluatorResult, + from agent_control_models import EvaluatorResult + from agent_control_evaluators import ( Evaluator, EvaluatorMetadata, register_evaluator, @@ -466,10 +475,11 @@ test: add control set integration tests ### Add a built-in evaluator (regex/list style) -1. Add evaluator class in `evaluators/src/agent_control_evaluators/builtin/` -2. Add config model in `models/src/agent_control_models/controls.py` -3. Register with `@register_evaluator` decorator -4. Add comprehensive tests in `evaluators/tests/` +1. Create evaluator directory in `evaluators/src/agent_control_evaluators/my_evaluator/` +2. Add `config.py` with your config model extending `EvaluatorConfig` +3. Add `evaluator.py` with your evaluator class using `@register_evaluator` decorator +4. Add entry point in `evaluators/pyproject.toml` +5. 
Add comprehensive tests in `evaluators/tests/` ### Update shared models @@ -499,11 +509,11 @@ test: add control set integration tests | Task | Location | |------|----------| -| Evaluator base class | `agent_control_models.Evaluator` | -| Evaluator metadata | `agent_control_models.EvaluatorMetadata` | +| Evaluator base class | `agent_control_evaluators.Evaluator` | +| Evaluator metadata | `agent_control_evaluators.EvaluatorMetadata` | | Evaluator result | `agent_control_models.EvaluatorResult` | -| Register decorator | `@agent_control_models.register_evaluator` | -| Built-in evaluators | `evaluators/src/agent_control_evaluators/builtin/` | +| Register decorator | `@agent_control_evaluators.register_evaluator` | +| Built-in evaluators | `evaluators/src/agent_control_evaluators/{regex,list,json,sql}/` | | Evaluator tests | `evaluators/tests/` | **Evaluator config model fields:** diff --git a/evaluators/pyproject.toml b/evaluators/pyproject.toml index 4e606f57..c197c9e4 100644 --- a/evaluators/pyproject.toml +++ b/evaluators/pyproject.toml @@ -24,7 +24,7 @@ regex = "agent_control_evaluators.regex:RegexEvaluator" list = "agent_control_evaluators.list:ListEvaluator" json = "agent_control_evaluators.json:JSONEvaluator" sql = "agent_control_evaluators.sql:SQLEvaluator" -luna2 = "agent_control_evaluators.galileo_luna2:Luna2Evaluator" +galileo-luna2 = "agent_control_evaluators.galileo_luna2:Luna2Evaluator" [build-system] requires = ["hatchling"] diff --git a/evaluators/tests/json/__init__.py b/evaluators/tests/json/__init__.py index e69de29b..5f848dd5 100644 --- a/evaluators/tests/json/__init__.py +++ b/evaluators/tests/json/__init__.py @@ -0,0 +1 @@ +"""Tests for the JSON evaluator.""" diff --git a/evaluators/tests/sql/__init__.py b/evaluators/tests/sql/__init__.py index e69de29b..541fa0d1 100644 --- a/evaluators/tests/sql/__init__.py +++ b/evaluators/tests/sql/__init__.py @@ -0,0 +1 @@ +"""Tests for the SQL evaluator.""" diff --git a/examples/README.md b/examples/README.md 
index 88febbc6..6fa2fe6f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -131,7 +131,7 @@ from agent_control import ( ControlSelector, ControlScope, ControlAction, - EvaluatorConfig, + EvaluatorSpec, controls, ) @@ -146,7 +146,7 @@ async with AgentControlClient() as client: execution="server", scope=ControlScope(step_types=["llm"], stages=["post"]), selector=ControlSelector(path="output"), - evaluator=EvaluatorConfig( + evaluator=EvaluatorSpec( name="regex", config={"pattern": r"\b\d{3}-\d{2}-\d{4}\b"} ), diff --git a/examples/customer_support_agent/README.md b/examples/customer_support_agent/README.md index 98e88360..8b7d3ba0 100644 --- a/examples/customer_support_agent/README.md +++ b/examples/customer_support_agent/README.md @@ -302,9 +302,12 @@ scope: selector: path: input evaluator: - name: luna2 + name: galileo-luna2 config: - threshold: 0.8 + stage_type: local + metric: input_toxicity + operator: gt + target_value: 0.8 action: decision: deny message: "Inappropriate content detected" diff --git a/examples/galileo/README.md b/examples/galileo/README.md index 0336284e..2f801c1e 100644 --- a/examples/galileo/README.md +++ b/examples/galileo/README.md @@ -84,5 +84,5 @@ Testing toxicity detection with Central Stage... - [Galileo Protect Overview](https://v2docs.galileo.ai/concepts/protect/overview) - [Luna-2 Python API Reference](https://v2docs.galileo.ai/sdk-api/python/reference/protect) -- [Agent Control Luna-2 Evaluator](../../evaluators/src/agent_control_evaluators/luna2/) +- [Agent Control Luna-2 Evaluator](../../evaluators/src/agent_control_evaluators/galileo_luna2/) diff --git a/models/README.md b/models/README.md index df73c129..cfb52396 100644 --- a/models/README.md +++ b/models/README.md @@ -157,7 +157,7 @@ Complete control specification. 
- `execution` (str): Execution mode ("server" or "local") - `scope` (ControlScope): When to apply the control - `selector` (ControlSelector): What data to evaluate -- `evaluator` (EvaluatorConfig): How to evaluate +- `evaluator` (EvaluatorSpec): How to evaluate - `action` (ControlAction): What to do on match #### EvaluationRequest diff --git a/models/src/agent_control_models/controls.py b/models/src/agent_control_models/controls.py index 7ae4be4e..8a47b91f 100644 --- a/models/src/agent_control_models/controls.py +++ b/models/src/agent_control_models/controls.py @@ -193,8 +193,10 @@ def validate_evaluator_config(self) -> Self: # Built-in evaluators: validate config against evaluator's config_model # This import is optional - evaluators package may not be installed try: - from agent_control_evaluators import get_evaluator + from agent_control_evaluators import ensure_evaluators_discovered, get_evaluator + # Ensure entry points are loaded before looking up evaluator + ensure_evaluators_discovered() evaluator_cls = get_evaluator(self.name) if evaluator_cls: evaluator_cls.config_model(**self.config) diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py index d5bd090f..b1201214 100644 --- a/sdks/python/tests/conftest.py +++ b/sdks/python/tests/conftest.py @@ -175,8 +175,6 @@ async def test_policy( yield result - # TODO: Add cleanup when delete endpoint is available - @pytest_asyncio.fixture async def test_control( @@ -196,8 +194,6 @@ async def test_control( yield result - # TODO: Add cleanup when delete endpoint is available - @pytest.fixture def sample_steps() -> list[dict[str, Any]]: From 49c3206505b44793a6360ce14650a5dba536bda6 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Fri, 30 Jan 2026 18:27:05 +0530 Subject: [PATCH 05/21] refactor(evaluators)!: standardize naming conventions and remove legacy API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING CHANGE: `parse_evaluator_ref()` removed, use 
`parse_evaluator_ref_full()` or `is_agent_scoped()` instead. Evaluator naming conventions: - Built-in: "regex", "list", "json", "sql" (no namespace) - External: "galileo/luna2" (slash separator) - Agent-scoped: "my-agent:custom" (colon separator) Changes: - Rename galileo-luna2 → galileo/luna2 throughout codebase - Add ParsedEvaluatorRef dataclass with type detection - Remove deprecated parse_evaluator_ref() tuple API - Migrate endpoints to use parse_evaluator_ref_full() and is_agent_scoped() - Standardize XEvaluator + XEvaluatorConfig naming in docs - Fix pre-existing lint issues (import sorting, Union syntax, unused imports) --- CONTRIBUTING.md | 406 +++++++++++++----- README.md | 2 +- docs/OVERVIEW.md | 33 +- docs/REFERENCE.md | 31 +- evaluators/README.md | 4 +- evaluators/pyproject.toml | 2 +- .../src/agent_control_evaluators/__init__.py | 19 +- .../src/agent_control_evaluators/_base.py | 4 +- .../src/agent_control_evaluators/_factory.py | 3 +- .../galileo_luna2/__init__.py | 4 +- .../galileo_luna2/config.py | 6 +- .../galileo_luna2/evaluator.py | 2 +- .../agent_control_evaluators/sql/config.py | 2 +- examples/customer_support_agent/README.md | 2 +- models/src/agent_control_models/controls.py | 2 +- sdks/python/tests/test_luna2_evaluator.py | 7 +- .../agent_control_server/endpoints/agents.py | 24 +- .../endpoints/controls.py | 39 +- .../endpoints/evaluator_configs.py | 5 +- .../services/evaluator_utils.py | 100 ++++- server/tests/test_evaluator_utils.py | 122 +++--- ui/src/core/evaluators/luna2/index.ts | 2 +- ui/tests/fixtures.ts | 2 +- 23 files changed, 572 insertions(+), 251 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1ef4360c..bebd73e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -222,105 +222,219 @@ Extensible evaluators for custom detection logic. 
evaluators/src/agent_control_evaluators/ # Key directories (flat structure - each evaluator is a peer directory) -├── regex/ # Regex pattern matching evaluator +# Built-in evaluators (no namespace, core dependencies only) +├── regex/ # Type name: "regex" │ ├── config.py # RegexEvaluatorConfig │ └── evaluator.py # RegexEvaluator implementation -├── list/ # List value matching evaluator +├── list/ # Type name: "list" │ ├── config.py # ListEvaluatorConfig │ └── evaluator.py # ListEvaluator implementation -├── json/ # JSON validation evaluator +├── json/ # Type name: "json" │ ├── config.py # JsonEvaluatorConfig │ └── evaluator.py # JsonEvaluator implementation -├── sql/ # SQL validation evaluator +├── sql/ # Type name: "sql" │ ├── config.py # SqlEvaluatorConfig │ └── evaluator.py # SqlEvaluator implementation -└── galileo_luna2/ # Galileo Luna-2 integration +# +# External evaluators (namespaced, optional dependencies) +└── galileo_luna2/ # Type name: "galileo/luna2" (folder uses underscore) ├── config.py # Luna2EvaluatorConfig ├── evaluator.py # Luna2Evaluator implementation └── client.py # Direct HTTP client (no SDK dependency) ``` -**Adding a new evaluator:** - -1. **Create evaluator directory:** - ```bash - mkdir evaluators/src/agent_control_evaluators/my_evaluator/ - ``` - -2. **Define configuration model (`config.py`):** - ```python - from pydantic import BaseModel, Field - - class MyEvaluatorConfig(BaseModel): - """Configuration for MyEvaluator.""" - threshold: float = Field(0.5, ge=0.0, le=1.0) - api_endpoint: str = Field(default="https://api.example.com") - ``` - -3. 
**Implement evaluator (`evaluator.py`):** - ```python - from typing import Any - from agent_control_models import EvaluatorResult - from agent_control_evaluators import ( - Evaluator, - EvaluatorMetadata, - register_evaluator, - ) - from .config import MyEvaluatorConfig - - @register_evaluator - class MyEvaluator(Evaluator[MyEvaluatorConfig]): - """My custom evaluator.""" - - metadata = EvaluatorMetadata( - name="my-evaluator", - version="1.0.0", - description="Custom detection logic", - requires_api_key=False, - timeout_ms=5000, - ) - config_model = MyEvaluatorConfig - - def __init__(self, config: MyEvaluatorConfig) -> None: - super().__init__(config) - # Initialize any clients or resources - - async def evaluate(self, data: Any) -> EvaluatorResult: - # Your detection logic here - score = await self._analyze(str(data)) - - return EvaluatorResult( - matched=score > self.config.threshold, - confidence=score, - message=f"Analysis score: {score:.2f}", - metadata={"score": score}, - ) - ``` - -4. **Export in `__init__.py`:** - ```python - from .config import MyEvaluatorConfig - from .evaluator import MyEvaluator - - __all__ = ["MyEvaluator", "MyEvaluatorConfig"] - ``` - -5. **Add optional dependencies in `evaluators/pyproject.toml`:** - ```toml - [project.optional-dependencies] - my-evaluator = ["httpx>=0.24.0"] # Add your dependencies - all = ["httpx>=0.24.0", ...] # Include in 'all' extra - ``` - -6. **Add tests in `evaluators/tests/`** - -**Evaluator Best Practices:** -- Use Pydantic for config validation -- Make API calls async with httpx -- Return confidence scores (0.0-1.0) -- Include helpful metadata for debugging -- Handle errors gracefully (respect `on_error` config) -- Avoid storing request-scoped state (evaluators are cached) +> **Note:** Folder names use `snake_case` (Python convention), but type names in metadata +> use `provider/name` format with slash for external evaluators. 
+ +**Creating a new evaluator:** + +Choose the appropriate type based on your use case: + +| Type | When to Use | Name Format | +|------|-------------|-------------| +| Built-in | Core functionality, no external deps | `my-evaluator` | +| External | External provider integration, optional deps | `provider/name` | +| Agent-scoped | Custom logic deployed with agent | `my-agent:custom` | + +### Creating a Third-Party Evaluator (Recommended for External Providers) + +This example creates a external evaluator `acme/toxicity`: + +**1. Create evaluator directory:** +```bash +mkdir -p evaluators/src/agent_control_evaluators/acme_toxicity/ +touch evaluators/src/agent_control_evaluators/acme_toxicity/__init__.py +``` + +**2. Define configuration model (`config.py`):** +```python +from pydantic import Field +from agent_control_evaluators import EvaluatorConfig + + +class AcmeToxicityEvaluatorConfig(EvaluatorConfig): + """Configuration for Acme Toxicity evaluator.""" + + threshold: float = Field( + default=0.7, + ge=0.0, + le=1.0, + description="Score threshold for triggering (0.0-1.0)", + ) + categories: list[str] = Field( + default_factory=lambda: ["hate", "violence"], + description="Toxicity categories to check", + ) + timeout_ms: int = Field( + default=5000, + ge=100, + le=30000, + description="API timeout in milliseconds", + ) +``` + +**3. Implement evaluator (`evaluator.py`):** +```python +from typing import Any + +from agent_control_models import EvaluatorResult + +from agent_control_evaluators._base import Evaluator, EvaluatorMetadata +from agent_control_evaluators._registry import register_evaluator +from agent_control_evaluators.acme_toxicity.config import AcmeToxicityEvaluatorConfig + +# Check optional dependency +try: + import httpx + ACME_AVAILABLE = True +except ImportError: + ACME_AVAILABLE = False + + +@register_evaluator +class AcmeToxicityEvaluator(Evaluator[AcmeToxicityEvaluatorConfig]): + """Acme Toxicity detection evaluator. 
+ + Calls the Acme API to detect toxic content in text. + + Example config: + {"threshold": 0.8, "categories": ["hate", "harassment"]} + """ + + metadata = EvaluatorMetadata( + name="acme/toxicity", # <-- External provider with slash + version="1.0.0", + description="Acme toxicity detection API", + requires_api_key=True, + timeout_ms=5000, + ) + config_model = AcmeToxicityConfig + + @classmethod + def is_available(cls) -> bool: + """Check if httpx dependency is installed.""" + return ACME_AVAILABLE + + def __init__(self, config: AcmeToxicityEvaluatorConfig) -> None: + super().__init__(config) + # Pre-compile or initialize resources here (will be cached) + self._client: httpx.AsyncClient | None = None + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Evaluate text for toxicity.""" + if data is None: + return EvaluatorResult( + matched=False, + confidence=1.0, + message="No data to evaluate", + ) + + text = str(data) + + try: + score = await self._call_api(text) + matched = score >= self.config.threshold + + return EvaluatorResult( + matched=matched, + confidence=score, + message=f"Toxicity score: {score:.2f}", + metadata={ + "score": score, + "threshold": self.config.threshold, + "categories": self.config.categories, + }, + ) + except Exception as e: + # Return error result (fail-open by default) + return EvaluatorResult( + matched=False, + confidence=0.0, + message=f"Evaluation failed: {e}", + error=f"{type(e).__name__}: {str(e)[:200]}", + ) + + async def _call_api(self, text: str) -> float: + """Call Acme API and return toxicity score.""" + # Implementation details... + pass +``` + +**4. Export in `__init__.py`:** +```python +from agent_control_evaluators.acme_toxicity.config import AcmeToxicityEvaluatorConfig +from agent_control_evaluators.acme_toxicity.evaluator import AcmeToxicityEvaluator + +__all__ = ["AcmeToxicityEvaluator", "AcmeToxicityEvaluatorConfig"] +``` + +**5. 
Register entry point in `evaluators/pyproject.toml`:** +```toml +[project.optional-dependencies] +acme = ["httpx>=0.24.0"] # Your dependencies +all = ["httpx>=0.24.0"] # Include in 'all' extra + +[project.entry-points."agent_control.evaluators"] +regex = "agent_control_evaluators.regex:RegexEvaluator" +list = "agent_control_evaluators.list:ListEvaluator" +# ... existing entries ... +"acme/toxicity" = "agent_control_evaluators.acme_toxicity:AcmeToxicityEvaluator" +``` + +**6. Add tests in `evaluators/tests/acme_toxicity/`** + +### Creating a Built-in Evaluator + +For evaluators with no external dependencies (to be included in core), follow the same pattern but: +- Use simple name: `name="my-evaluator"` (no slash) +- No `is_available()` override needed +- Import directly in `evaluators/src/agent_control_evaluators/__init__.py` for auto-registration: + ```python + from agent_control_evaluators.my_evaluator import MyEvaluator, MyEvaluatorConfig + ``` + +### Evaluator Best Practices + +**Thread Safety & Caching:** +- Evaluator instances are **cached and reused** across requests +- **DO NOT** store mutable request-scoped state on `self` +- Use local variables in `evaluate()` for request-specific data +- Initialize immutable resources in `__init__()` (compiled patterns, clients) + +**Error Handling:** +- Set `error` field for evaluator failures (API errors, timeouts) +- Return `matched=False` when `error` is set (fail-open) +- DO NOT set `error` for validation failures (bad input is a valid "matched" result) + +**Performance:** +- Pre-compile patterns in `__init__()` +- Use `asyncio.to_thread()` for CPU-bound work (see SQL evaluator) +- Respect `timeout_ms` config for external API calls + +**Config Validation:** +- Extend `EvaluatorConfig` (not plain `BaseModel`) +- Use Pydantic validators for complex rules +- Provide sensible defaults with `Field(default=...)` --- @@ -465,21 +579,18 @@ test: add control set integration tests ### Add a new evaluator -1. 
Create evaluator directory in `evaluators/src/agent_control_evaluators/` -2. Implement `Evaluator` interface (see Evaluators section above) -3. Add `@register_evaluator` decorator to your evaluator class -4. Add optional dependencies in `evaluators/pyproject.toml` -5. Export from `evaluators/src/agent_control_evaluators/__init__.py` -6. Add tests in `evaluators/tests/` -7. Update `docs/OVERVIEW.md` with usage examples - -### Add a built-in evaluator (regex/list style) +See the **Evaluators** section above for detailed instructions. Summary: -1. Create evaluator directory in `evaluators/src/agent_control_evaluators/my_evaluator/` -2. Add `config.py` with your config model extending `EvaluatorConfig` -3. Add `evaluator.py` with your evaluator class using `@register_evaluator` decorator -4. Add entry point in `evaluators/pyproject.toml` -5. Add comprehensive tests in `evaluators/tests/` +1. Decide on evaluator type (built-in vs external) +2. Create directory: `evaluators/src/agent_control_evaluators/my_evaluator/` +3. Add `config.py` extending `EvaluatorConfig` +4. Add `evaluator.py` with `@register_evaluator` decorator +5. Add entry point in `evaluators/pyproject.toml`: + - Built-in: `my-evaluator = "..."` + - External: `"provider/name" = "..."` +6. Add optional dependencies if needed +7. Add tests in `evaluators/tests/` +8. 
Update `docs/OVERVIEW.md` with usage examples ### Update shared models @@ -505,28 +616,115 @@ test: add control set integration tests --- +## Evaluator Naming Conventions + +### Terminology + +There are three distinct concepts related to evaluators: + +| Concept | Definition | Example | +|---------|------------|---------| +| **Evaluator Type** | An implementation class with `evaluate()` method | `RegexEvaluator`, `Luna2Evaluator` | +| **Evaluator Schema** | Metadata about a custom type (name + JSON Schema for config validation) | Registered via `initAgent` | +| **Evaluator Config** | A saved configuration template (type + specific config values) | Stored via `/evaluator-configs` API | + +### Evaluator Type Name Formats + +Evaluator type names identify evaluator implementations. The format indicates the evaluator's origin: + +| Format | Origin | Examples | +|--------|--------|----------| +| `name` | Built-in (first-party, no dependencies) | `regex`, `list`, `json`, `sql` | +| `provider/name` | External (external providers, optional deps) | `galileo/luna2`, `nvidia/nemo` | +| `agent:name` | Agent-scoped (custom code deployed with agent) | `my-agent:pii-detector` | + +**Parsing rules:** +```python +if ":" in name: # Agent-scoped (split on first ":") + agent, evaluator = name.split(":", 1) +elif "/" in name: # External provider (split on first "/") + provider, evaluator = name.split("/", 1) +else: # Built-in + evaluator = name +``` + +### Built-in vs Third-Party Evaluators + +**Built-in evaluators** (`regex`, `list`, `json`, `sql`): +- No namespace prefix +- Core dependencies only (included in base package) +- Imported and registered automatically on package import + +**External evaluators** (`galileo/luna2`): +- Use `provider/name` format with slash separator +- Have optional dependencies (install via extras: `pip install agent-control-evaluators[luna2]`) +- Discovered via Python entry points (not auto-imported) + +### Agent-Scoped Evaluators + +Agent-scoped 
evaluators (`my-agent:pii-detector`) are custom evaluator types that: +1. Are **implemented in the agent's code** (not in the evaluators package) +2. Have their **schema registered via `initAgent`** for config validation +3. Are **server-only** (SDK cannot run them locally) + +``` +Agent Code Server Database +┌─────────────────────┐ ┌─────────────────────────────┐ +│ @register_evaluator │ initAgent │ Agent: "my-agent" │ +│ class PIIDetector │ ─────────► │ Schemas: [{ │ +│ ... │ │ name: "pii-detector", │ +└─────────────────────┘ │ config_schema: {...} │ + │ }] │ + └─────────────────────────────┘ +``` + +Controls reference them as `my-agent:pii-detector` (the `:` indicates agent scope). + +### Folder and File Naming + +| Item | Convention | Example | +|------|------------|---------| +| Folder name | `snake_case` (Python package) | `galileo_luna2/` | +| Entry point key | Same as type name | `"galileo/luna2"` | +| Metadata name | Same as type name | `name="galileo/luna2"` | + +> **Note:** In code, use "provider" as the type identifier. In user-facing docs, +> use "external" as the descriptive term. 
+ +--- + ## Evaluator Development Quick Reference | Task | Location | |------|----------| | Evaluator base class | `agent_control_evaluators.Evaluator` | +| Config base class | `agent_control_evaluators.EvaluatorConfig` | | Evaluator metadata | `agent_control_evaluators.EvaluatorMetadata` | | Evaluator result | `agent_control_models.EvaluatorResult` | | Register decorator | `@agent_control_evaluators.register_evaluator` | | Built-in evaluators | `evaluators/src/agent_control_evaluators/{regex,list,json,sql}/` | +| External evaluators | `evaluators/src/agent_control_evaluators/galileo_luna2/` | | Evaluator tests | `evaluators/tests/` | +**Naming convention quick reference:** +``` +Built-in: regex, list, json, sql +External: galileo/luna2, nvidia/nemo +Agent-scoped: my-agent:pii-detector +``` + **Evaluator config model fields:** ```python -from pydantic import BaseModel, Field +from pydantic import Field +from agent_control_evaluators import EvaluatorConfig -class MyConfig(BaseModel): +class MyEvaluatorConfig(EvaluatorConfig): # Required field pattern: str = Field(..., description="Pattern to match") - + # Optional with default threshold: float = Field(0.5, ge=0.0, le=1.0) - + # List field values: list[str] = Field(default_factory=list) ``` diff --git a/README.md b/README.md index 1e773b67..7e7af095 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ Controls are defined via the API or dashboard. Each control specifies what to ch "scope": { "step_names": ["process_user_message"], "stages": ["pre"] }, "selector": { "path": "input" }, "evaluator": { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "input_toxicity", "operator": "gt", diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index 06f1b836..25c5eb42 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -236,7 +236,7 @@ Flexible value matching with multiple modes and logic options. --- -### 3. Luna-2 Evaluator (`galileo-luna2`) +### 3. 
Luna-2 Evaluator (`galileo/luna2`) AI-powered detection using Galileo's Luna-2 small language models. Provides real-time, low-latency evaluation for complex patterns that can't be caught with regex or lists. @@ -267,7 +267,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real ```json // Block toxic inputs (score > 0.5) { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -278,7 +278,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real // Block prompt injection attempts { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "prompt_injection", "operator": "gt", @@ -289,7 +289,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real // Flag potential hallucinations (warn but allow) { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "hallucination", "operator": "gt", @@ -299,7 +299,7 @@ AI-powered detection using Galileo's Luna-2 small language models. 
Provides real // Using a central stage (pre-defined server-side rules) { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "stage_type": "central", "stage_name": "production-safety", @@ -434,23 +434,28 @@ Every evaluator implements the `Evaluator` base class: ```python from typing import Any -from pydantic import BaseModel + from agent_control_models import EvaluatorResult -from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_evaluators import ( + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + register_evaluator, +) -class MyEvaluatorConfig(BaseModel): +class MyEvaluatorConfig(EvaluatorConfig): """Configuration schema for your evaluator.""" threshold: float = 0.5 custom_option: str = "default" @register_evaluator -class MyCustomEvaluator(Evaluator[MyEvaluatorConfig]): +class MyEvaluator(Evaluator[MyEvaluatorConfig]): """Your custom evaluator.""" metadata = EvaluatorMetadata( - name="my-custom-evaluator", + name="my-evaluator", version="1.0.0", description="Detects custom patterns using proprietary logic", requires_api_key=True, # Set to True if you need credentials @@ -527,7 +532,7 @@ except ImportError: AVAILABLE = False @register_evaluator -class MyEvaluator(Evaluator[MyConfig]): +class MyEvaluator(Evaluator[MyEvaluatorConfig]): @classmethod def is_available(cls) -> bool: return AVAILABLE @@ -552,7 +557,7 @@ Here's how a partner might integrate their content moderation API: ```python @register_evaluator -class ContentModerationEvaluator(Evaluator[ContentModConfig]): +class ContentModerationEvaluator(Evaluator[ContentModerationEvaluatorConfig]): """Integration with Acme Content Moderation API.""" metadata = EvaluatorMetadata( @@ -562,9 +567,9 @@ class ContentModerationEvaluator(Evaluator[ContentModConfig]): requires_api_key=True, timeout_ms=3000, ) - config_model = ContentModConfig + config_model = ContentModerationEvaluatorConfig - def __init__(self, config: ContentModConfig) -> None: + 
def __init__(self, config: ContentModerationEvaluatorConfig) -> None: super().__init__(config) self.client = AcmeClient(api_key=os.getenv("ACME_API_KEY")) diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 282ca4c3..81e9989c 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -385,7 +385,7 @@ Flexible value matching with multiple modes and logic options. AI-powered detection using Galileo's Luna-2 small language models. Provides real-time, low-latency evaluation for complex patterns that can't be caught with regex or lists. -**Evaluator name**: `galileo-luna2` +**Evaluator name**: `galileo/luna2` **Installation**: Luna-2 requires an optional dependency: @@ -429,7 +429,7 @@ pip install agent-control-evaluators[luna2] ```json // Block toxic inputs (score > 0.5) { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -440,7 +440,7 @@ pip install agent-control-evaluators[luna2] // Block prompt injection attempts { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "prompt_injection", "operator": "gt", @@ -451,7 +451,7 @@ pip install agent-control-evaluators[luna2] // Flag potential hallucinations (warn but allow) { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "metric": "hallucination", "operator": "gt", @@ -461,7 +461,7 @@ pip install agent-control-evaluators[luna2] // Using central stage (pre-defined in Galileo) { - "name": "galileo-luna2", + "name": "galileo/luna2", "config": { "stage_type": "central", "stage_name": "production-safety", @@ -482,23 +482,28 @@ You can create custom evaluators to extend Agent Control with your own detection ```python from typing import Any -from pydantic import BaseModel + from agent_control_models import EvaluatorResult -from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_evaluators import ( + Evaluator, + EvaluatorConfig, + EvaluatorMetadata, + register_evaluator, 
+) -class MyEvaluatorConfig(BaseModel): +class MyEvaluatorConfig(EvaluatorConfig): """Configuration schema for your evaluator.""" threshold: float = 0.5 custom_option: str = "default" @register_evaluator -class MyCustomEvaluator(Evaluator[MyEvaluatorConfig]): +class MyEvaluator(Evaluator[MyEvaluatorConfig]): """Your custom evaluator.""" metadata = EvaluatorMetadata( - name="my-custom-evaluator", + name="my-evaluator", version="1.0.0", description="Detects custom patterns using proprietary logic", requires_api_key=True, @@ -535,14 +540,14 @@ class MyCustomEvaluator(Evaluator[MyEvaluatorConfig]): ```toml [project.entry-points."agent_control.evaluators"] -my-evaluator = "my_package.evaluator:MyCustomEvaluator" +my-evaluator = "my_package.evaluator:MyEvaluator" ``` **Optional Dependencies**: Override `is_available()` if your evaluator has optional dependencies: ```python @register_evaluator -class MyEvaluator(Evaluator[MyConfig]): +class MyEvaluator(Evaluator[MyEvaluatorConfig]): @classmethod def is_available(cls) -> bool: try: @@ -962,6 +967,6 @@ make alembic-upgrade 4. Verify the metric name is valid 5. Check `on_error` setting if failures are silently allowed -**Evaluator Not Found**: If `galileo-luna2` doesn't appear in `list_evaluators()`: +**Evaluator Not Found**: If `galileo/luna2` doesn't appear in `list_evaluators()`: - Verify `httpx` is installed (Luna-2's `is_available()` returns `False` without it) - Check server logs for evaluator discovery messages diff --git a/evaluators/README.md b/evaluators/README.md index 3a0d8db0..96de8396 100644 --- a/evaluators/README.md +++ b/evaluators/README.md @@ -9,9 +9,9 @@ Evaluator implementations for agent-control. 
- **json** - JSON schema validation - **sql** - SQL query validation using sqlglot -## Optional Evaluators +## External Evaluators -- **galileo-luna2** - Galileo Luna-2 integration (requires `luna2` extra) +- **galileo/luna2** - Galileo Luna-2 integration (requires `luna2` extra) ## Installation diff --git a/evaluators/pyproject.toml b/evaluators/pyproject.toml index c197c9e4..95529979 100644 --- a/evaluators/pyproject.toml +++ b/evaluators/pyproject.toml @@ -24,7 +24,7 @@ regex = "agent_control_evaluators.regex:RegexEvaluator" list = "agent_control_evaluators.list:ListEvaluator" json = "agent_control_evaluators.json:JSONEvaluator" sql = "agent_control_evaluators.sql:SQLEvaluator" -galileo-luna2 = "agent_control_evaluators.galileo_luna2:Luna2Evaluator" +"galileo/luna2" = "agent_control_evaluators.galileo_luna2:Luna2Evaluator" [build-system] requires = ["hatchling"] diff --git a/evaluators/src/agent_control_evaluators/__init__.py b/evaluators/src/agent_control_evaluators/__init__.py index 32515ac8..ea84eaf5 100644 --- a/evaluators/src/agent_control_evaluators/__init__.py +++ b/evaluators/src/agent_control_evaluators/__init__.py @@ -4,11 +4,20 @@ Built-in evaluators (regex, list, json, sql) are registered automatically on import. 
Available evaluators: - - regex: Regular expression matching (built-in) - - list: List-based value matching (built-in) - - json: JSON validation (built-in) - - sql: SQL query validation (built-in) - - galileo-luna2: Galileo Luna-2 runtime protection (pip install agent-control-evaluators[luna2]) + Built-in (no namespace): + - regex: Regular expression matching + - list: List-based value matching + - json: JSON validation + - sql: SQL query validation + + External (provider/name format): + - galileo/luna2: Galileo Luna-2 runtime protection + (pip install agent-control-evaluators[luna2]) + +Naming convention: + - Built-in: "regex", "list", "json", "sql" + - External: "provider/name" (e.g., "galileo/luna2") + - Agent-scoped: "agent:name" (custom code deployed with agent) Custom evaluators are Evaluator classes deployed with the engine. Their schemas are registered via initAgent for validation purposes. diff --git a/evaluators/src/agent_control_evaluators/_base.py b/evaluators/src/agent_control_evaluators/_base.py index b6e7048d..b548e60b 100644 --- a/evaluators/src/agent_control_evaluators/_base.py +++ b/evaluators/src/agent_control_evaluators/_base.py @@ -43,7 +43,7 @@ class EvaluatorMetadata: """Metadata about an evaluator. Attributes: - name: Unique evaluator name (e.g., "regex", "galileo_luna2") + name: Unique evaluator name (e.g., "regex", "galileo/luna2") version: Evaluator version string description: Human-readable description requires_api_key: Whether the evaluator requires an API key @@ -57,7 +57,7 @@ class EvaluatorMetadata: timeout_ms: int = 10000 -class Evaluator(ABC, Generic[ConfigT]): +class Evaluator(ABC, Generic[ConfigT]): # noqa: UP046 - need Python 3.10 compat """Base class for all evaluators (built-in, external, or custom). 
All evaluators follow the same pattern: diff --git a/evaluators/src/agent_control_evaluators/_factory.py b/evaluators/src/agent_control_evaluators/_factory.py index 94af326d..772903df 100644 --- a/evaluators/src/agent_control_evaluators/_factory.py +++ b/evaluators/src/agent_control_evaluators/_factory.py @@ -11,9 +11,10 @@ from agent_control_evaluators._discovery import list_evaluators if TYPE_CHECKING: - from agent_control_evaluators._base import Evaluator from agent_control_models import EvaluatorSpec + from agent_control_evaluators._base import Evaluator + logger = logging.getLogger(__name__) # Configuration diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py b/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py index 1ead7014..60558600 100644 --- a/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py +++ b/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py @@ -26,9 +26,9 @@ "LUNA2_AVAILABLE", ] -# Export client classes when available +# Export client classes when available (added to __all__ below) if LUNA2_AVAILABLE: - from .client import ( + from .client import ( # noqa: F401 GalileoProtectClient, PassthroughAction, Payload, diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/config.py b/evaluators/src/agent_control_evaluators/galileo_luna2/config.py index 52e76759..f24d221e 100644 --- a/evaluators/src/agent_control_evaluators/galileo_luna2/config.py +++ b/evaluators/src/agent_control_evaluators/galileo_luna2/config.py @@ -1,6 +1,6 @@ """Configuration models for Luna-2 evaluator.""" -from typing import Any, Literal, Union +from typing import Any, Literal from pydantic import Field, model_validator @@ -67,9 +67,9 @@ class Luna2EvaluatorConfig(EvaluatorConfig): default=None, description="Comparison operator (required for local stage)", ) - target_value: Union[str, float, int, None] = Field( + target_value: str | float | int | None = Field( default=None, - description="Target 
value for comparison (required for local stage). Can be string or number.", + description="Target value for comparison (required for local stage).", ) # Central stage fields diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py b/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py index 24e25266..48ca3fe5 100644 --- a/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py +++ b/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py @@ -84,7 +84,7 @@ class Luna2Evaluator(Evaluator[Luna2EvaluatorConfig]): """ metadata = EvaluatorMetadata( - name="galileo-luna2", + name="galileo/luna2", version="2.0.0", description="Galileo Luna-2 enterprise runtime protection (direct API)", requires_api_key=True, diff --git a/evaluators/src/agent_control_evaluators/sql/config.py b/evaluators/src/agent_control_evaluators/sql/config.py index b6eb32c9..c8ccb3ba 100644 --- a/evaluators/src/agent_control_evaluators/sql/config.py +++ b/evaluators/src/agent_control_evaluators/sql/config.py @@ -1,7 +1,7 @@ """Configuration for SQL validation evaluator.""" import warnings -from typing import Any, Literal +from typing import Literal from pydantic import Field, model_validator diff --git a/examples/customer_support_agent/README.md b/examples/customer_support_agent/README.md index 8b7d3ba0..0517135d 100644 --- a/examples/customer_support_agent/README.md +++ b/examples/customer_support_agent/README.md @@ -302,7 +302,7 @@ scope: selector: path: input evaluator: - name: galileo-luna2 + name: galileo/luna2 config: stage_type: local metric: input_toxicity diff --git a/models/src/agent_control_models/controls.py b/models/src/agent_control_models/controls.py index 8a47b91f..47ab4d09 100644 --- a/models/src/agent_control_models/controls.py +++ b/models/src/agent_control_models/controls.py @@ -161,7 +161,7 @@ class EvaluatorSpec(BaseModel): Evaluator reference formats: - Built-in: "regex", "list", "json", "sql" - - External: "galileo-luna2" 
(requires agent-control-evaluators[luna2]) + - External: "galileo/luna2" (requires agent-control-evaluators[luna2]) - Agent-scoped: "my-agent:my-evaluator" (validated in endpoint, not here) """ diff --git a/sdks/python/tests/test_luna2_evaluator.py b/sdks/python/tests/test_luna2_evaluator.py index 7b30c63b..a16d6586 100644 --- a/sdks/python/tests/test_luna2_evaluator.py +++ b/sdks/python/tests/test_luna2_evaluator.py @@ -11,10 +11,9 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest -from pydantic import ValidationError - from agent_control_evaluators import Evaluator from agent_control_models import EvaluatorResult +from pydantic import ValidationError def create_mock_protect_response( @@ -282,7 +281,7 @@ def test_luna2_evaluator_import_success(self): from agent_control_evaluators.galileo_luna2 import Luna2Evaluator assert Luna2Evaluator is not None - assert Luna2Evaluator.metadata.name == "galileo-luna2" + assert Luna2Evaluator.metadata.name == "galileo/luna2" assert Luna2Evaluator.metadata.version == "2.0.0" @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", False) @@ -329,7 +328,7 @@ def test_metadata_fields(self): metadata = Luna2Evaluator.metadata - assert metadata.name == "galileo-luna2" + assert metadata.name == "galileo/luna2" assert metadata.requires_api_key is True assert metadata.timeout_ms == 10000 # Config schema is now from config_model diff --git a/server/src/agent_control_server/endpoints/agents.py b/server/src/agent_control_server/endpoints/agents.py index a22ea011..5ba569d2 100644 --- a/server/src/agent_control_server/endpoints/agents.py +++ b/server/src/agent_control_server/endpoints/agents.py @@ -43,7 +43,10 @@ policy_controls, ) from ..services.controls import list_controls_for_agent, list_controls_for_policy -from ..services.evaluator_utils import parse_evaluator_ref, validate_config_against_schema +from ..services.evaluator_utils import ( + parse_evaluator_ref_full, + 
validate_config_against_schema, +) from ..services.schema_compat import ( check_schema_compatibility, format_compatibility_error, @@ -111,36 +114,37 @@ async def _validate_policy_controls_for_agent( if not evaluator_name: continue - agent_name, eval_name = parse_evaluator_ref(evaluator_name) - if agent_name is None: - continue # Built-in evaluator, already validated at control creation + parsed = parse_evaluator_ref_full(evaluator_name) + if parsed.type != "agent": + continue # Built-in/external evaluator, already validated at control creation # Agent-scoped evaluator - check if target matches this agent - if agent_name != agent.name: + if parsed.namespace != agent.name: errors.append( f"Control '{control.name}' references evaluator '{evaluator_name}' " - f"which belongs to agent '{agent_name}', not '{agent.name}'" + f"which belongs to agent '{parsed.namespace}', not '{agent.name}'" ) continue # Check if evaluator exists on this agent - if eval_name not in agent_evaluators: + if parsed.local_name not in agent_evaluators: errors.append( - f"Control '{control.name}' references evaluator '{eval_name}' " + f"Control '{control.name}' references evaluator '{parsed.local_name}' " f"which is not registered with agent '{agent.name}'. " f"Register it via initAgent or use a different evaluator." 
) continue # Validate config against schema - registered_ev = agent_evaluators[eval_name] + registered_ev = agent_evaluators[parsed.local_name] config = evaluator_cfg.get("config", {}) if registered_ev.config_schema: try: validate_config_against_schema(config, registered_ev.config_schema) except JSONSchemaValidationError as e: errors.append( - f"Control '{control.name}' invalid config for '{eval_name}': {e.message}" + f"Control '{control.name}' invalid config for " + f"'{parsed.local_name}': {e.message}" ) return errors diff --git a/server/src/agent_control_server/endpoints/controls.py b/server/src/agent_control_server/endpoints/controls.py index 5198d24a..e887a6c7 100644 --- a/server/src/agent_control_server/endpoints/controls.py +++ b/server/src/agent_control_server/endpoints/controls.py @@ -31,7 +31,10 @@ ) from ..logging_utils import get_logger from ..models import Agent, AgentData, Control, Policy, policy_controls -from ..services.evaluator_utils import parse_evaluator_ref, validate_config_against_schema +from ..services.evaluator_utils import ( + parse_evaluator_ref_full, + validate_config_against_schema, +) # Pagination constants _DEFAULT_PAGINATION_LIMIT = 20 @@ -252,20 +255,20 @@ async def set_control_data( # Validate evaluator config evaluator_ref = request.data.evaluator.name - agent_name, eval_name = parse_evaluator_ref(evaluator_ref) + parsed = parse_evaluator_ref_full(evaluator_ref) - if agent_name is not None: + if parsed.type == "agent": # Agent-scoped evaluator: validate against agent's registered schema agent_result = await db.execute( - select(Agent).where(Agent.name == agent_name) + select(Agent).where(Agent.name == parsed.namespace) ) agent = agent_result.scalars().first() if agent is None: raise NotFoundError( error_code=ErrorCode.AGENT_NOT_FOUND, - detail=f"Agent '{agent_name}' not found", + detail=f"Agent '{parsed.namespace}' not found", resource="Agent", - resource_id=agent_name, + resource_id=parsed.namespace, hint=( "Ensure the agent 
exists before creating controls " "that reference its evaluators." @@ -277,7 +280,7 @@ async def set_control_data( except ValidationError as e: raise APIValidationError( error_code=ErrorCode.CORRUPTED_DATA, - detail=f"Agent '{agent_name}' has invalid data", + detail=f"Agent '{parsed.namespace}' has invalid data", resource="Agent", errors=[ ValidationErrorItem( @@ -291,14 +294,17 @@ async def set_control_data( ) evaluator = next( - (e for e in (agent_data.evaluators or []) if e.name == eval_name), + (e for e in (agent_data.evaluators or []) if e.name == parsed.local_name), None, ) if evaluator is None: available = [e.name for e in (agent_data.evaluators or [])] raise APIValidationError( error_code=ErrorCode.EVALUATOR_NOT_FOUND, - detail=f"Evaluator '{eval_name}' is not registered with agent '{agent_name}'", + detail=( + f"Evaluator '{parsed.local_name}' is not registered " + f"with agent '{parsed.namespace}'" + ), resource="Evaluator", hint=( f"Register it via initAgent first. " @@ -309,7 +315,10 @@ async def set_control_data( resource="Control", field="data.evaluator.name", code="evaluator_not_found", - message=f"Evaluator '{eval_name}' not found on agent '{agent_name}'", + message=( + f"Evaluator '{parsed.local_name}' not found " + f"on agent '{parsed.namespace}'" + ), value=evaluator_ref, ) ], @@ -324,7 +333,7 @@ async def set_control_data( except JSONSchemaValidationError as e: raise APIValidationError( error_code=ErrorCode.INVALID_CONFIG, - detail=f"Config validation failed for evaluator '{agent_name}:{eval_name}'", + detail=f"Config validation failed for evaluator '{evaluator_ref}'", resource="Control", hint="Check the evaluator's config schema for required fields and types.", errors=[ @@ -337,15 +346,15 @@ async def set_control_data( ], ) else: - # Built-in or server-side evaluator: validate if registered - evaluator_cls = list_evaluators().get(eval_name) + # Built-in or external evaluator: validate if registered + evaluator_cls = 
list_evaluators().get(parsed.name) if evaluator_cls is not None: try: evaluator_cls.config_model(**request.data.evaluator.config) except ValidationError as e: raise APIValidationError( error_code=ErrorCode.INVALID_CONFIG, - detail=f"Config validation failed for evaluator '{eval_name}'", + detail=f"Config validation failed for evaluator '{parsed.name}'", resource="Control", hint="Check the evaluator's config schema for required fields and types.", errors=[ @@ -364,7 +373,7 @@ async def set_control_data( except TypeError as e: raise APIValidationError( error_code=ErrorCode.INVALID_CONFIG, - detail=f"Invalid config parameters for evaluator '{eval_name}'", + detail=f"Invalid config parameters for evaluator '{parsed.name}'", resource="Control", hint="Check the evaluator's config schema for valid parameter names.", errors=[ diff --git a/server/src/agent_control_server/endpoints/evaluator_configs.py b/server/src/agent_control_server/endpoints/evaluator_configs.py index e297cc71..b3aed487 100644 --- a/server/src/agent_control_server/endpoints/evaluator_configs.py +++ b/server/src/agent_control_server/endpoints/evaluator_configs.py @@ -22,7 +22,7 @@ from ..errors import APIValidationError, ConflictError, DatabaseError, NotFoundError from ..logging_utils import get_logger from ..models import EvaluatorConfigDB -from ..services.evaluator_utils import parse_evaluator_ref +from ..services.evaluator_utils import is_agent_scoped _logger = get_logger(__name__) @@ -46,8 +46,7 @@ def _to_item(config: EvaluatorConfigDB) -> EvaluatorConfigItem: def _ensure_not_agent_scoped(evaluator: str) -> None: - agent_name, _ = parse_evaluator_ref(evaluator) - if agent_name is not None: + if is_agent_scoped(evaluator): raise APIValidationError( error_code=ErrorCode.VALIDATION_ERROR, detail="Agent-scoped evaluators are not supported for evaluator configs", diff --git a/server/src/agent_control_server/services/evaluator_utils.py b/server/src/agent_control_server/services/evaluator_utils.py index 
cb162f51..a39074d7 100644 --- a/server/src/agent_control_server/services/evaluator_utils.py +++ b/server/src/agent_control_server/services/evaluator_utils.py @@ -1,29 +1,105 @@ -"""Utilities for working with evaluator references.""" +"""Utilities for working with evaluator references. + +Evaluator Type Name Formats: + - Built-in: "regex", "list", "json", "sql" + - External: "galileo/luna2", "nvidia/nemo" (slash separator) + - Agent-scoped: "my-agent:pii-detector" (colon separator) + +The key distinction is: + - Built-in and external evaluators are global (available to all agents) + - Agent-scoped evaluators are custom implementations deployed with a specific agent +""" import json +from dataclasses import dataclass from functools import lru_cache -from typing import Any +from typing import Any, Literal from jsonschema_rs import validator_for -def parse_evaluator_ref(evaluator_ref: str) -> tuple[str | None, str]: - """Parse evaluator reference into (agent_name, evaluator_name). +@dataclass +class ParsedEvaluatorRef: + """Parsed evaluator reference with type information. + + Attributes: + type: The evaluator category ("builtin", "external", or "agent") + name: The full evaluator name (e.g., "regex", "galileo/luna2", "my-agent:pii") + namespace: For external evaluators, the provider name; for agent-scoped, the agent name + local_name: The evaluator name without namespace prefix + """ + + type: Literal["builtin", "external", "agent"] + name: str + namespace: str | None + local_name: str + - Built-in evaluators have no prefix, agent-scoped evaluators use {agent}:{name} format. +def parse_evaluator_ref_full(evaluator_ref: str) -> ParsedEvaluatorRef: + """Parse evaluator reference into structured form with type detection. 
+ + Determines the evaluator type based on the name format: + - Contains ":" → agent-scoped (split on first ":") + - Contains "/" → external (split on first "/") + - Otherwise → built-in Args: - evaluator_ref: Evaluator reference string (e.g., "regex" or "my-agent:pii-detector") + evaluator_ref: Evaluator reference string Returns: - Tuple of (agent_name, evaluator_name): - - (None, "regex") for built-in evaluators - - ("my-agent", "pii-detector") for agent-scoped evaluators + ParsedEvaluatorRef with type, namespace, and local name + + Examples: + >>> parse_evaluator_ref_full("regex") + ParsedEvaluatorRef(type="builtin", name="regex", ...) + + >>> parse_evaluator_ref_full("galileo/luna2") + ParsedEvaluatorRef(type="external", namespace="galileo", ...) + + >>> parse_evaluator_ref_full("my-agent:pii-detector") + ParsedEvaluatorRef(type="agent", namespace="my-agent", ...) """ if ":" in evaluator_ref: - agent, name = evaluator_ref.split(":", 1) - return agent, name - return None, evaluator_ref + # Agent-scoped: "my-agent:pii-detector" + agent, local_name = evaluator_ref.split(":", 1) + return ParsedEvaluatorRef( + type="agent", + name=evaluator_ref, + namespace=agent, + local_name=local_name, + ) + elif "/" in evaluator_ref: + # External: "galileo/luna2" + provider, local_name = evaluator_ref.split("/", 1) + return ParsedEvaluatorRef( + type="external", + name=evaluator_ref, + namespace=provider, + local_name=local_name, + ) + else: + # Built-in: "regex" + return ParsedEvaluatorRef( + type="builtin", + name=evaluator_ref, + namespace=None, + local_name=evaluator_ref, + ) + + +def is_agent_scoped(evaluator_ref: str) -> bool: + """Check if an evaluator reference is agent-scoped. + + Agent-scoped evaluators use the "agent:name" format and reference + custom implementations deployed with a specific agent. 
+ + Args: + evaluator_ref: Evaluator reference string + + Returns: + True if agent-scoped, False for built-in or external evaluators + """ + return ":" in evaluator_ref def _canonicalize_schema(schema: dict[str, Any]) -> str: diff --git a/server/tests/test_evaluator_utils.py b/server/tests/test_evaluator_utils.py index 7d605b48..f7f21c1d 100644 --- a/server/tests/test_evaluator_utils.py +++ b/server/tests/test_evaluator_utils.py @@ -3,67 +3,83 @@ import pytest from agent_control_server.services.evaluator_utils import ( - parse_evaluator_ref, + is_agent_scoped, + parse_evaluator_ref_full, validate_config_against_schema, ) -class TestParseEvaluatorRef: - """Tests for parse_evaluator_ref function.""" +class TestParseEvaluatorRefFull: + """Tests for parse_evaluator_ref_full function (full three-way parsing).""" def test_builtin_evaluator(self) -> None: - """Given a built-in evaluator name, when parsing, then returns None for agent.""" - # When: - agent, name = parse_evaluator_ref("regex") - - # Then: - assert agent is None - assert name == "regex" + """Given a built-in evaluator, when parsing full, then type is builtin.""" + # When + result = parse_evaluator_ref_full("regex") + + # Then + assert result.type == "builtin" + assert result.name == "regex" + assert result.namespace is None + assert result.local_name == "regex" + + def test_external_evaluator(self) -> None: + """Given an external evaluator, when parsing full, then type is external.""" + # When + result = parse_evaluator_ref_full("galileo/luna2") + + # Then + assert result.type == "external" + assert result.name == "galileo/luna2" + assert result.namespace == "galileo" + assert result.local_name == "luna2" def test_agent_scoped_evaluator(self) -> None: - """Given an agent-scoped reference, when parsing, then returns both parts.""" - # When: - agent, name = parse_evaluator_ref("my-agent:pii-detector") - - # Then: - assert agent == "my-agent" - assert name == "pii-detector" - - def test_multiple_colons(self) -> 
None: - """Given a reference with multiple colons, when parsing, then splits on first colon only.""" - # When: - agent, name = parse_evaluator_ref("my-agent:complex:name") - - # Then: - assert agent == "my-agent" - assert name == "complex:name" - - def test_empty_string(self) -> None: - """Given an empty string, when parsing, then returns None agent and empty name.""" - # When: - agent, name = parse_evaluator_ref("") - - # Then: - assert agent is None - assert name == "" - - def test_list_evaluator(self) -> None: - """Given the list built-in evaluator, when parsing, then returns None for agent.""" - # When: - agent, name = parse_evaluator_ref("list") - - # Then: - assert agent is None - assert name == "list" - - def test_agent_name_with_hyphens(self) -> None: - """Given an agent name with hyphens, when parsing, then handles correctly.""" - # When: - agent, name = parse_evaluator_ref("my-cool-agent:my-eval") - - # Then: - assert agent == "my-cool-agent" - assert name == "my-eval" + """Given an agent-scoped evaluator, when parsing full, then type is agent.""" + # When + result = parse_evaluator_ref_full("my-agent:pii-detector") + + # Then + assert result.type == "agent" + assert result.name == "my-agent:pii-detector" + assert result.namespace == "my-agent" + assert result.local_name == "pii-detector" + + def test_external_with_nested_path(self) -> None: + """Given an external evaluator with nested path, when parsing, splits on first slash.""" + # When + result = parse_evaluator_ref_full("acme/safety/toxicity") + + # Then + assert result.type == "external" + assert result.namespace == "acme" + assert result.local_name == "safety/toxicity" + + def test_agent_scoped_with_slash_in_name(self) -> None: + """Given agent-scoped with slash in name, when parsing, then colon takes precedence.""" + # When - colon should be detected before slash + result = parse_evaluator_ref_full("my-agent:vendor/eval") + + # Then + assert result.type == "agent" + assert result.namespace == 
"my-agent" + assert result.local_name == "vendor/eval" + + +class TestIsAgentScoped: + """Tests for is_agent_scoped helper function.""" + + def test_builtin_not_agent_scoped(self) -> None: + """Given a built-in evaluator, when checking, then returns False.""" + assert is_agent_scoped("regex") is False + + def test_external_not_agent_scoped(self) -> None: + """Given an external evaluator, when checking, then returns False.""" + assert is_agent_scoped("galileo/luna2") is False + + def test_agent_scoped_returns_true(self) -> None: + """Given an agent-scoped evaluator, when checking, then returns True.""" + assert is_agent_scoped("my-agent:pii-detector") is True class TestValidateConfigAgainstSchema: diff --git a/ui/src/core/evaluators/luna2/index.ts b/ui/src/core/evaluators/luna2/index.ts index bcbe904a..c9be82c8 100644 --- a/ui/src/core/evaluators/luna2/index.ts +++ b/ui/src/core/evaluators/luna2/index.ts @@ -33,7 +33,7 @@ const numberOrNull = (value: number | ""): number | null => * prompt injection, PII detection, and more. */ export const luna2Evaluator: EvaluatorDefinition = { - id: "galileo-luna2", + id: "galileo/luna2", displayName: "Galileo Luna-2", initialValues: { diff --git a/ui/tests/fixtures.ts b/ui/tests/fixtures.ts index 1c2685da..05881c3c 100644 --- a/ui/tests/fixtures.ts +++ b/ui/tests/fixtures.ts @@ -240,7 +240,7 @@ const evaluatorsResponse: EvaluatorsResponse = { required: ["schema"], }, }, - "galileo-luna2": { + "galileo/luna2": { name: "Galileo Luna-2", version: "1.0.0", description: "AI-powered content moderation using Galileo Luna-2", From ee4208d9a9b7e479e86da1d531ac0006cfc5ca14 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Fri, 30 Jan 2026 19:14:32 +0530 Subject: [PATCH 06/21] docs: fix typo in CONTRIBUTING.md evaluator example Change config_model = AcmeToxicityConfig to AcmeToxicityEvaluatorConfig to match the class definition used in the example. 
--- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bebd73e9..c530b459 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -328,7 +328,7 @@ class AcmeToxicityEvaluator(Evaluator[AcmeToxicityEvaluatorConfig]): requires_api_key=True, timeout_ms=5000, ) - config_model = AcmeToxicityConfig + config_model = AcmeToxicityEvaluatorConfig @classmethod def is_available(cls) -> bool: From d00b2e2ce167c220f1efcf7a181682e62b628564 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Fri, 30 Jan 2026 20:17:41 +0530 Subject: [PATCH 07/21] chore: sync pyproject.toml versions and metadata across packages - Bump evaluators and engine versions from 0.1.0 to 2.1.0 - Align requires-python to >=3.12 (was >=3.10 in evaluators) - Standardize license to Apache-2.0 (was MIT in evaluators) - Align authors format to "Agent Control Team" - Bump pydantic minimum to >=2.12.4 in evaluators --- engine/pyproject.toml | 2 +- evaluators/pyproject.toml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/engine/pyproject.toml b/engine/pyproject.toml index 2b84bb23..8363e52c 100644 --- a/engine/pyproject.toml +++ b/engine/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agent-control-engine" -version = "0.1.0" +version = "2.1.0" description = "Control execution engine for Agent Control" requires-python = ">=3.12" dependencies = [ diff --git a/evaluators/pyproject.toml b/evaluators/pyproject.toml index 95529979..372853e2 100644 --- a/evaluators/pyproject.toml +++ b/evaluators/pyproject.toml @@ -1,14 +1,14 @@ [project] name = "agent-control-evaluators" -version = "0.1.0" +version = "2.1.0" description = "Evaluator implementations for agent-control" readme = "README.md" -requires-python = ">=3.10" -license = { text = "MIT" } -authors = [{ name = "Galileo", email = "support@rungalileo.io" }] +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "Agent Control Team" }] 
dependencies = [ "agent-control-models", - "pydantic>=2.0.0", + "pydantic>=2.12.4", "google-re2>=1.1", "jsonschema>=4.0.0", "sqlglot[rs]>=20.0.0", From 9cc9cbb3221c2f70db07c56559935353b524ac06 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 18:29:33 +0530 Subject: [PATCH 08/21] refactor(evaluators)!: reorganize into builtin + extra tiers Split evaluators into two packages: - builtin (`agent-control-evaluators`): Core infrastructure + regex, list, json, sql - extra/galileo (`agent-control-evaluator-galileo`): Luna2 evaluator (calls external API) BREAKING CHANGES: - Luna2 import path changed from `agent_control_evaluators.galileo_luna2` to `agent_control_evaluator_galileo.luna2` - External evaluator names use dot notation instead of slash (e.g., `galileo.luna2` instead of `galileo/luna2`) - SDK and server now depend on `agent-control-evaluators` as a runtime dependency (not vendored) to avoid duplicate module conflicts Key changes: - Move builtin evaluators to `evaluators/builtin/` - Create `evaluators/extra/galileo/` as separate package - Add entry points for plugin discovery (`agent_control.evaluators`) - Update workspace to include only builtin (extras excluded for perf) - Add CI workflow for testing extra packages - Add template scaffold for creating new evaluator packages - Server build script no longer vendors evaluators --- .github/workflows/test-extras.yml | 52 ++++++ AGENTS.md | 17 +- Makefile | 29 +++- evaluators/README.md | 23 --- evaluators/builtin/Makefile | 33 ++++ evaluators/builtin/README.md | 48 ++++++ evaluators/{ => builtin}/pyproject.toml | 9 +- .../src/agent_control_evaluators/__init__.py | 11 +- .../src/agent_control_evaluators/_base.py | 0 .../agent_control_evaluators/_discovery.py | 0 .../src/agent_control_evaluators/_factory.py | 0 .../src/agent_control_evaluators/_registry.py | 0 .../agent_control_evaluators/json/__init__.py | 0 .../agent_control_evaluators/json/config.py | 2 +- .../json/evaluator.py | 22 +++ 
.../agent_control_evaluators/list/__init__.py | 0 .../agent_control_evaluators/list/config.py | 0 .../list/evaluator.py | 0 .../src/agent_control_evaluators/py.typed | 0 .../regex/__init__.py | 0 .../agent_control_evaluators/regex/config.py | 0 .../regex/evaluator.py | 0 .../agent_control_evaluators/sql/__init__.py | 0 .../agent_control_evaluators/sql/config.py | 2 +- .../agent_control_evaluators/sql/evaluator.py | 12 +- evaluators/{ => builtin}/tests/__init__.py | 0 .../{ => builtin}/tests/json/__init__.py | 0 .../{ => builtin}/tests/json/test_json.py | 0 .../{ => builtin}/tests/sql/__init__.py | 0 .../{ => builtin}/tests/sql/test_sql.py | 0 evaluators/{ => builtin}/tests/test_base.py | 0 evaluators/extra/galileo/README.md | 60 +++++++ evaluators/extra/galileo/pyproject.toml | 32 ++++ .../__init__.py | 31 ++++ .../luna2}/__init__.py | 12 +- .../luna2}/client.py | 0 .../luna2}/config.py | 2 +- .../luna2}/evaluator.py | 12 +- evaluators/extra/galileo/tests/__init__.py | 0 .../galileo}/tests/test_luna2_evaluator.py | 156 +++++++++--------- evaluators/extra/template/README.md | 100 +++++++++++ .../extra/template/pyproject.toml.template | 30 ++++ models/src/agent_control_models/controls.py | 2 +- pyproject.toml | 11 +- scripts/build.py | 21 +-- sdks/python/pyproject.toml | 11 +- .../src/agent_control/evaluators/__init__.py | 2 +- sdks/python/tests/test_luna2_smoke.py | 19 +++ server/pyproject.toml | 18 +- .../services/evaluator_utils.py | 8 +- server/tests/test_controls_additional.py | 2 +- server/tests/test_evaluator_utils.py | 22 +-- 52 files changed, 627 insertions(+), 184 deletions(-) create mode 100644 .github/workflows/test-extras.yml delete mode 100644 evaluators/README.md create mode 100644 evaluators/builtin/Makefile create mode 100644 evaluators/builtin/README.md rename evaluators/{ => builtin}/pyproject.toml (81%) rename evaluators/{ => builtin}/src/agent_control_evaluators/__init__.py (87%) rename evaluators/{ => 
builtin}/src/agent_control_evaluators/_base.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/_discovery.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/_factory.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/_registry.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/json/__init__.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/json/config.py (99%) rename evaluators/{ => builtin}/src/agent_control_evaluators/json/evaluator.py (96%) rename evaluators/{ => builtin}/src/agent_control_evaluators/list/__init__.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/list/config.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/list/evaluator.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/py.typed (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/regex/__init__.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/regex/config.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/regex/evaluator.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/sql/__init__.py (100%) rename evaluators/{ => builtin}/src/agent_control_evaluators/sql/config.py (99%) rename evaluators/{ => builtin}/src/agent_control_evaluators/sql/evaluator.py (99%) rename evaluators/{ => builtin}/tests/__init__.py (100%) rename evaluators/{ => builtin}/tests/json/__init__.py (100%) rename evaluators/{ => builtin}/tests/json/test_json.py (100%) rename evaluators/{ => builtin}/tests/sql/__init__.py (100%) rename evaluators/{ => builtin}/tests/sql/test_sql.py (100%) rename evaluators/{ => builtin}/tests/test_base.py (100%) create mode 100644 evaluators/extra/galileo/README.md create mode 100644 evaluators/extra/galileo/pyproject.toml create mode 100644 evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py rename 
evaluators/{src/agent_control_evaluators/galileo_luna2 => extra/galileo/src/agent_control_evaluator_galileo/luna2}/__init__.py (75%) rename evaluators/{src/agent_control_evaluators/galileo_luna2 => extra/galileo/src/agent_control_evaluator_galileo/luna2}/client.py (100%) rename evaluators/{src/agent_control_evaluators/galileo_luna2 => extra/galileo/src/agent_control_evaluator_galileo/luna2}/config.py (98%) rename evaluators/{src/agent_control_evaluators/galileo_luna2 => extra/galileo/src/agent_control_evaluator_galileo/luna2}/evaluator.py (96%) create mode 100644 evaluators/extra/galileo/tests/__init__.py rename {sdks/python => evaluators/extra/galileo}/tests/test_luna2_evaluator.py (80%) create mode 100644 evaluators/extra/template/README.md create mode 100644 evaluators/extra/template/pyproject.toml.template create mode 100644 sdks/python/tests/test_luna2_smoke.py diff --git a/.github/workflows/test-extras.yml b/.github/workflows/test-extras.yml new file mode 100644 index 00000000..dbb2ea33 --- /dev/null +++ b/.github/workflows/test-extras.yml @@ -0,0 +1,52 @@ +name: Test Extras + +on: + push: + paths: + # Trigger on extra changes + - 'evaluators/extra/**' + # Also trigger on core changes that could break extras + - 'evaluators/builtin/**' + - 'models/**' + - 'engine/**' + - 'server/**' + - 'sdks/python/**' + pull_request: + paths: + - 'evaluators/extra/**' + - 'evaluators/builtin/**' + - 'models/**' + - 'engine/**' + - 'server/**' + - 'sdks/python/**' + +jobs: + test-galileo: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup uv and Python + uses: astral-sh/setup-uv@v3 + with: + python-version: "3.12" + + - name: Sync workspace + run: make sync + + - name: Install galileo extra + run: cd evaluators/extra/galileo && uv pip install -e . 
+ + - name: Lint galileo + run: cd evaluators/extra/galileo && uv run ruff check --config ../../../pyproject.toml src/ + + - name: Typecheck galileo + run: cd evaluators/extra/galileo && uv run mypy --config-file ../../../pyproject.toml src/ + + - name: Test galileo + run: cd evaluators/extra/galileo && uv run pytest + + - name: Verify SDK integration + run: | + cd sdks/python + uv run pytest tests/test_luna2_smoke.py diff --git a/AGENTS.md b/AGENTS.md index 04df435a..e10f7f93 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,7 +26,8 @@ Forwarded targets: - `engine/`: **control evaluation engine and evaluator system** — all evaluation logic, evaluator discovery, and evaluator orchestration lives here (`engine/src/agent_control_engine/`) - `server/`: FastAPI server (`server/src/agent_control_server/`) - `sdks/python/`: Python SDK — uses engine for evaluation (`sdks/python/src/agent_control/`) -- `evaluators/`: evaluator implementations (`evaluators/src/agent_control_evaluators/`) +- `evaluators/builtin/`: builtin evaluator implementations (`evaluators/builtin/src/agent_control_evaluators/`) +- `evaluators/extra/`: optional evaluator packages (e.g., `evaluators/extra/galileo/`) - `ui/`: Nextjs based web app to manage agent controls - `examples/`: runnable examples (ruff has relaxed import rules here) @@ -66,13 +67,19 @@ All testing guidance (including “behavior changes require tests”) lives in ` 4) add SDK wrapper in `sdks/python/src/agent_control/` 5) add tests (server + SDK) and update docs/examples if user-facing -- Add a new evaluator: - 1) implement evaluator class extending `Evaluator` in `evaluators/src/agent_control_evaluators/` +- Add a new builtin evaluator: + 1) implement evaluator class extending `Evaluator` in `evaluators/builtin/src/agent_control_evaluators/` 2) use `@register_evaluator` decorator (from `agent_control_evaluators`) - 3) add entry point in `evaluators/pyproject.toml` for auto-discovery - 4) add tests in the evaluators package + 3) add entry 
point in `evaluators/builtin/pyproject.toml` for auto-discovery + 4) add tests in the evaluators/builtin package 5) evaluator is automatically available to server and SDK via `discover_evaluators()` +- Add an external evaluator package: + 1) copy `evaluators/extra/template/` as a starting point + 2) implement evaluator class extending `Evaluator` from `agent_control_evaluators` + 3) add entry point using `org.name` format (e.g., `galileo.luna2`) + 4) package is discovered automatically when installed alongside agent-control + ## Git/PR workflow - Branch naming: `feature/...`, `fix/...`, `refactor/...` diff --git a/Makefile b/Makefile index 13bc5626..68b2bb86 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,18 @@ -.PHONY: help sync test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush +.PHONY: help sync test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build # Workspace package names PACK_MODELS := agent-control-models PACK_SERVER := agent-control-server PACK_SDK := agent-control PACK_ENGINE := agent-control-engine +PACK_EVALUATORS := agent-control-evaluators # Directories MODELS_DIR := models SERVER_DIR := server SDK_DIR := sdks/python ENGINE_DIR := engine +EVALUATORS_DIR := evaluators/builtin help: @echo "Agent Control - Makefile commands" @@ -56,7 +58,7 @@ sync: # Test # --------------------------- -test: server-test engine-test sdk-test +test: server-test engine-test sdk-test evaluators-test # Run tests, lint, and typecheck check: test lint typecheck @@ -65,17 +67,17 @@ check: test lint typecheck # Quality # --------------------------- -lint: engine-lint +lint: engine-lint evaluators-lint uv run --package 
$(PACK_MODELS) ruff check --config pyproject.toml models/src uv run --package $(PACK_SERVER) ruff check --config pyproject.toml server/src uv run --package $(PACK_SDK) ruff check --config pyproject.toml sdks/python/src -lint-fix: engine-lint-fix +lint-fix: engine-lint-fix evaluators-lint-fix uv run --package $(PACK_MODELS) ruff check --config pyproject.toml --fix models/src uv run --package $(PACK_SERVER) ruff check --config pyproject.toml --fix server/src uv run --package $(PACK_SDK) ruff check --config pyproject.toml --fix sdks/python/src -typecheck: engine-typecheck +typecheck: engine-typecheck evaluators-typecheck uv run --package $(PACK_MODELS) mypy --config-file pyproject.toml models/src uv run --package $(PACK_SERVER) mypy --config-file pyproject.toml server/src uv run --package $(PACK_SDK) mypy --config-file pyproject.toml sdks/python/src @@ -84,7 +86,7 @@ typecheck: engine-typecheck # Build / Publish # --------------------------- -build: build-models build-server build-sdk engine-build +build: build-models build-server build-sdk engine-build evaluators-build build-models: cd $(MODELS_DIR) && uv build @@ -130,6 +132,21 @@ engine-%: sdk-%: $(MAKE) -C $(SDK_DIR) $(patsubst sdk-%,%,$@) +evaluators-test: + $(MAKE) -C $(EVALUATORS_DIR) test + +evaluators-lint: + $(MAKE) -C $(EVALUATORS_DIR) lint + +evaluators-lint-fix: + $(MAKE) -C $(EVALUATORS_DIR) lint-fix + +evaluators-typecheck: + $(MAKE) -C $(EVALUATORS_DIR) typecheck + +evaluators-build: + $(MAKE) -C $(EVALUATORS_DIR) build + .PHONY: server-% server-%: $(MAKE) -C $(SERVER_DIR) $(patsubst server-%,%,$@) diff --git a/evaluators/README.md b/evaluators/README.md deleted file mode 100644 index 96de8396..00000000 --- a/evaluators/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# agent-control-evaluators - -Evaluator implementations for agent-control. 
- -## Built-in Evaluators - -- **regex** - Pattern matching using regular expressions -- **list** - Value matching against allow/deny lists -- **json** - JSON schema validation -- **sql** - SQL query validation using sqlglot - -## External Evaluators - -- **galileo/luna2** - Galileo Luna-2 integration (requires `luna2` extra) - -## Installation - -```bash -pip install agent-control-evaluators - -# With Luna-2 support -pip install agent-control-evaluators[luna2] -``` diff --git a/evaluators/builtin/Makefile b/evaluators/builtin/Makefile new file mode 100644 index 00000000..37abbdd1 --- /dev/null +++ b/evaluators/builtin/Makefile @@ -0,0 +1,33 @@ +.PHONY: help sync test lint lint-fix typecheck build publish + +PACKAGE := agent-control-evaluators + +help: + @echo "Agent Control Evaluators - Makefile commands" + @echo "" + @echo " make test - run pytest" + @echo " make lint - run ruff check" + @echo " make lint-fix - run ruff check --fix" + @echo " make typecheck - run mypy" + @echo " make build - build package" + +sync: + uv sync + +test: + uv run pytest --cov=src --cov-report=xml:../../coverage-evaluators.xml -q + +lint: + uv run ruff check --config ../../pyproject.toml src/ + +lint-fix: + uv run ruff check --config ../../pyproject.toml --fix src/ + +typecheck: + uv run mypy --config-file ../../pyproject.toml src/ + +build: + uv build + +publish: + uv publish diff --git a/evaluators/builtin/README.md b/evaluators/builtin/README.md new file mode 100644 index 00000000..4e558ea7 --- /dev/null +++ b/evaluators/builtin/README.md @@ -0,0 +1,48 @@ +# Agent Control Evaluators + +Built-in evaluators for agent-control. 
+
+## Installation
+
+```bash
+pip install agent-control-evaluators
+```
+
+## Available Evaluators
+
+| Name | Description |
+|------|-------------|
+| `regex` | Regular expression pattern matching |
+| `list` | List-based value matching (allow/deny) |
+| `json` | JSON validation (schema, required fields, types) |
+| `sql` | SQL query validation |
+
+## Usage
+
+Evaluators are automatically discovered via Python entry points:
+
+```python
+from agent_control_evaluators import discover_evaluators, list_evaluators
+
+# Load all available evaluators
+discover_evaluators()
+
+# See what's available
+print(list_evaluators())
+# {'regex': <class 'RegexEvaluator'>, 'list': ..., 'json': ..., 'sql': ...}
+```
+
+## External Evaluators
+
+Additional evaluators are available via separate packages:
+
+- `agent-control-evaluator-galileo` - Galileo Luna2 evaluator
+
+Install convenience extras:
+```bash
+pip install agent-control-evaluators[galileo]
+```
+
+## Creating Custom Evaluators
+
+See [AGENTS.md](../../AGENTS.md) for guidance on creating new evaluators.
diff --git a/evaluators/pyproject.toml b/evaluators/builtin/pyproject.toml similarity index 81% rename from evaluators/pyproject.toml rename to evaluators/builtin/pyproject.toml index 372853e2..2b0e8edd 100644 --- a/evaluators/pyproject.toml +++ b/evaluators/builtin/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "agent-control-evaluators" -version = "2.1.0" -description = "Evaluator implementations for agent-control" +version = "3.0.0" +description = "Builtin evaluators for agent-control" readme = "README.md" requires-python = ">=3.12" license = { text = "Apache-2.0" } @@ -15,8 +15,8 @@ dependencies = [ ] [project.optional-dependencies] -luna2 = ["httpx>=0.24.0"] -all = ["httpx>=0.24.0"] +# NOTE: galileo extra commented out during local dev - package not yet on PyPI +# galileo = ["agent-control-evaluator-galileo>=3.0.0"] dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] [project.entry-points."agent_control.evaluators"] @@ -24,7 +24,6 @@ regex = "agent_control_evaluators.regex:RegexEvaluator" list = "agent_control_evaluators.list:ListEvaluator" json = "agent_control_evaluators.json:JSONEvaluator" sql = "agent_control_evaluators.sql:SQLEvaluator" -"galileo/luna2" = "agent_control_evaluators.galileo_luna2:Luna2Evaluator" [build-system] requires = ["hatchling"] diff --git a/evaluators/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py similarity index 87% rename from evaluators/src/agent_control_evaluators/__init__.py rename to evaluators/builtin/src/agent_control_evaluators/__init__.py index ea84eaf5..db1389de 100644 --- a/evaluators/src/agent_control_evaluators/__init__.py +++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py @@ -1,6 +1,6 @@ """Agent Control Evaluators. -This package contains evaluator implementations for agent-control. +This package contains builtin evaluator implementations for agent-control. Built-in evaluators (regex, list, json, sql) are registered automatically on import. 
Available evaluators: @@ -10,15 +10,12 @@ - json: JSON validation - sql: SQL query validation - External (provider/name format): - - galileo/luna2: Galileo Luna-2 runtime protection - (pip install agent-control-evaluators[luna2]) - Naming convention: - Built-in: "regex", "list", "json", "sql" - - External: "provider/name" (e.g., "galileo/luna2") + - External: "provider.name" (e.g., "galileo.luna2") - Agent-scoped: "agent:name" (custom code deployed with agent) +External evaluators are installed via separate packages (e.g., agent-control-evaluator-galileo). Custom evaluators are Evaluator classes deployed with the engine. Their schemas are registered via initAgent for validation purposes. """ @@ -45,7 +42,7 @@ from agent_control_evaluators.regex import RegexEvaluator, RegexEvaluatorConfig from agent_control_evaluators.sql import SQLEvaluator, SQLEvaluatorConfig -__version__ = "0.1.0" +__version__ = "3.0.0" __all__ = [ # Core infrastructure diff --git a/evaluators/src/agent_control_evaluators/_base.py b/evaluators/builtin/src/agent_control_evaluators/_base.py similarity index 100% rename from evaluators/src/agent_control_evaluators/_base.py rename to evaluators/builtin/src/agent_control_evaluators/_base.py diff --git a/evaluators/src/agent_control_evaluators/_discovery.py b/evaluators/builtin/src/agent_control_evaluators/_discovery.py similarity index 100% rename from evaluators/src/agent_control_evaluators/_discovery.py rename to evaluators/builtin/src/agent_control_evaluators/_discovery.py diff --git a/evaluators/src/agent_control_evaluators/_factory.py b/evaluators/builtin/src/agent_control_evaluators/_factory.py similarity index 100% rename from evaluators/src/agent_control_evaluators/_factory.py rename to evaluators/builtin/src/agent_control_evaluators/_factory.py diff --git a/evaluators/src/agent_control_evaluators/_registry.py b/evaluators/builtin/src/agent_control_evaluators/_registry.py similarity index 100% rename from 
evaluators/src/agent_control_evaluators/_registry.py rename to evaluators/builtin/src/agent_control_evaluators/_registry.py diff --git a/evaluators/src/agent_control_evaluators/json/__init__.py b/evaluators/builtin/src/agent_control_evaluators/json/__init__.py similarity index 100% rename from evaluators/src/agent_control_evaluators/json/__init__.py rename to evaluators/builtin/src/agent_control_evaluators/json/__init__.py diff --git a/evaluators/src/agent_control_evaluators/json/config.py b/evaluators/builtin/src/agent_control_evaluators/json/config.py similarity index 99% rename from evaluators/src/agent_control_evaluators/json/config.py rename to evaluators/builtin/src/agent_control_evaluators/json/config.py index 06e8b760..fc891430 100644 --- a/evaluators/src/agent_control_evaluators/json/config.py +++ b/evaluators/builtin/src/agent_control_evaluators/json/config.py @@ -219,7 +219,7 @@ def validate_constraints( return v @model_validator(mode="after") - def validate_has_checks(self): + def validate_has_checks(self) -> "JSONEvaluatorConfig": """Ensure at least one validation check is configured.""" if not any( [ diff --git a/evaluators/src/agent_control_evaluators/json/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/json/evaluator.py similarity index 96% rename from evaluators/src/agent_control_evaluators/json/evaluator.py rename to evaluators/builtin/src/agent_control_evaluators/json/evaluator.py index 38600f37..24fcfff1 100644 --- a/evaluators/src/agent_control_evaluators/json/evaluator.py +++ b/evaluators/builtin/src/agent_control_evaluators/json/evaluator.py @@ -53,6 +53,10 @@ class JSONEvaluator(Evaluator[JSONEvaluatorConfig]): ) config_model = JSONEvaluatorConfig + # Instance variables (typed to support None when feature not configured) + _schema_validator: Draft7Validator | None + _compiled_patterns: dict[str, Any] | None + def __init__(self, config: JSONEvaluatorConfig) -> None: super().__init__(config) @@ -103,6 +107,9 @@ def 
_evaluate_sync(self, data: Any) -> EvaluatorResult: if parse_error: return self._handle_parse_error(parse_error) + # Type narrowing: if no parse error, parsed_data is guaranteed to be valid + assert parsed_data is not None + # 2. JSON Schema Validation (comprehensive structure check) if self._schema_validator: schema_result = self._check_schema(parsed_data) @@ -178,6 +185,9 @@ def _handle_parse_error(self, error: str) -> EvaluatorResult: def _check_schema(self, data: dict | list) -> EvaluatorResult | None: """Validate against JSON Schema. Returns error result or None.""" + if not self._schema_validator: + return None + errors = list(self._schema_validator.iter_errors(data)) if not errors: @@ -202,6 +212,9 @@ def _check_schema(self, data: dict | list) -> EvaluatorResult | None: def _check_types(self, data: dict | list) -> EvaluatorResult | None: """Validate field types. Returns error result or None.""" + if not self.config.field_types: + return None + if not isinstance(data, dict): return EvaluatorResult( matched=True, @@ -252,6 +265,9 @@ def _check_types(self, data: dict | list) -> EvaluatorResult | None: def _check_required(self, data: dict | list) -> EvaluatorResult | None: """Validate required fields are present. Returns error result or None.""" + if not self.config.required_fields: + return None + if not isinstance(data, dict): return EvaluatorResult( matched=True, @@ -281,6 +297,9 @@ def _check_required(self, data: dict | list) -> EvaluatorResult | None: def _check_constraints(self, data: dict | list) -> EvaluatorResult | None: """Validate field constraints (ranges, enums, string length).""" + if not self.config.field_constraints: + return None + if not isinstance(data, dict): return EvaluatorResult( matched=True, @@ -377,6 +396,9 @@ def _check_constraints(self, data: dict | list) -> EvaluatorResult | None: def _check_patterns(self, data: dict | list) -> EvaluatorResult | None: """Validate field values match patterns. 
Returns error result or None.""" + if not self._compiled_patterns: + return None + if not isinstance(data, dict): return EvaluatorResult( matched=True, diff --git a/evaluators/src/agent_control_evaluators/list/__init__.py b/evaluators/builtin/src/agent_control_evaluators/list/__init__.py similarity index 100% rename from evaluators/src/agent_control_evaluators/list/__init__.py rename to evaluators/builtin/src/agent_control_evaluators/list/__init__.py diff --git a/evaluators/src/agent_control_evaluators/list/config.py b/evaluators/builtin/src/agent_control_evaluators/list/config.py similarity index 100% rename from evaluators/src/agent_control_evaluators/list/config.py rename to evaluators/builtin/src/agent_control_evaluators/list/config.py diff --git a/evaluators/src/agent_control_evaluators/list/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/list/evaluator.py similarity index 100% rename from evaluators/src/agent_control_evaluators/list/evaluator.py rename to evaluators/builtin/src/agent_control_evaluators/list/evaluator.py diff --git a/evaluators/src/agent_control_evaluators/py.typed b/evaluators/builtin/src/agent_control_evaluators/py.typed similarity index 100% rename from evaluators/src/agent_control_evaluators/py.typed rename to evaluators/builtin/src/agent_control_evaluators/py.typed diff --git a/evaluators/src/agent_control_evaluators/regex/__init__.py b/evaluators/builtin/src/agent_control_evaluators/regex/__init__.py similarity index 100% rename from evaluators/src/agent_control_evaluators/regex/__init__.py rename to evaluators/builtin/src/agent_control_evaluators/regex/__init__.py diff --git a/evaluators/src/agent_control_evaluators/regex/config.py b/evaluators/builtin/src/agent_control_evaluators/regex/config.py similarity index 100% rename from evaluators/src/agent_control_evaluators/regex/config.py rename to evaluators/builtin/src/agent_control_evaluators/regex/config.py diff --git 
a/evaluators/src/agent_control_evaluators/regex/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/regex/evaluator.py similarity index 100% rename from evaluators/src/agent_control_evaluators/regex/evaluator.py rename to evaluators/builtin/src/agent_control_evaluators/regex/evaluator.py diff --git a/evaluators/src/agent_control_evaluators/sql/__init__.py b/evaluators/builtin/src/agent_control_evaluators/sql/__init__.py similarity index 100% rename from evaluators/src/agent_control_evaluators/sql/__init__.py rename to evaluators/builtin/src/agent_control_evaluators/sql/__init__.py diff --git a/evaluators/src/agent_control_evaluators/sql/config.py b/evaluators/builtin/src/agent_control_evaluators/sql/config.py similarity index 99% rename from evaluators/src/agent_control_evaluators/sql/config.py rename to evaluators/builtin/src/agent_control_evaluators/sql/config.py index c8ccb3ba..91038b55 100644 --- a/evaluators/src/agent_control_evaluators/sql/config.py +++ b/evaluators/builtin/src/agent_control_evaluators/sql/config.py @@ -139,7 +139,7 @@ class SQLEvaluatorConfig(EvaluatorConfig): ) @model_validator(mode="after") - def validate_config(self): + def validate_config(self) -> "SQLEvaluatorConfig": """Validate configuration constraints.""" # Validate operation restrictions if self.blocked_operations and self.allowed_operations: diff --git a/evaluators/src/agent_control_evaluators/sql/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py similarity index 99% rename from evaluators/src/agent_control_evaluators/sql/evaluator.py rename to evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py index 09c1af58..99760f2d 100644 --- a/evaluators/src/agent_control_evaluators/sql/evaluator.py +++ b/evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py @@ -286,7 +286,7 @@ def _is_in_top_level_select( ) -> bool: """Check if column is in top-level SELECT expressions (not subqueries).""" # Walk up from column to see if we're 
in the top-level SELECT - current = column + current: exp.Expression | None = column while current: # If we hit the top-level SELECT, check if we're in its expressions if current is top_level_select: @@ -303,7 +303,7 @@ def _is_in_top_level_select( def _is_in_select_clause(self, column: exp.Column) -> bool: """Check if column is in any SELECT clause (including subqueries).""" - current = column.parent + current: exp.Expression | None = column.parent while current: # Check if this column is in a SELECT's expressions if isinstance(current, exp.Select): @@ -318,7 +318,7 @@ def _is_descendant_of( self, node: exp.Expression, potential_ancestor: exp.Expression ) -> bool: """Check if node is a descendant of potential_ancestor.""" - current = node + current: exp.Expression | None = node while current: if current is potential_ancestor: return True @@ -447,7 +447,7 @@ def _check_limits( offset_node = select_node.find(exp.Offset) offset_value = 0 if offset_node: - offset_value = self._extract_offset_value(offset_node) + offset_value = self._extract_offset_value(offset_node) or 0 # Check LIMIT value (skip if indeterminate) if limit_value is not None: @@ -1097,6 +1097,10 @@ def _check_columns( Returns: EvaluatorResult if required columns are missing, None otherwise """ + # Early return if no required columns (caller should check, but be defensive) + if not self._required_columns: + return None + # Collect columns from all analyses based on context and scope columns = [] for analysis in analyses: diff --git a/evaluators/tests/__init__.py b/evaluators/builtin/tests/__init__.py similarity index 100% rename from evaluators/tests/__init__.py rename to evaluators/builtin/tests/__init__.py diff --git a/evaluators/tests/json/__init__.py b/evaluators/builtin/tests/json/__init__.py similarity index 100% rename from evaluators/tests/json/__init__.py rename to evaluators/builtin/tests/json/__init__.py diff --git a/evaluators/tests/json/test_json.py 
b/evaluators/builtin/tests/json/test_json.py similarity index 100% rename from evaluators/tests/json/test_json.py rename to evaluators/builtin/tests/json/test_json.py diff --git a/evaluators/tests/sql/__init__.py b/evaluators/builtin/tests/sql/__init__.py similarity index 100% rename from evaluators/tests/sql/__init__.py rename to evaluators/builtin/tests/sql/__init__.py diff --git a/evaluators/tests/sql/test_sql.py b/evaluators/builtin/tests/sql/test_sql.py similarity index 100% rename from evaluators/tests/sql/test_sql.py rename to evaluators/builtin/tests/sql/test_sql.py diff --git a/evaluators/tests/test_base.py b/evaluators/builtin/tests/test_base.py similarity index 100% rename from evaluators/tests/test_base.py rename to evaluators/builtin/tests/test_base.py diff --git a/evaluators/extra/galileo/README.md b/evaluators/extra/galileo/README.md new file mode 100644 index 00000000..8bc8aa3d --- /dev/null +++ b/evaluators/extra/galileo/README.md @@ -0,0 +1,60 @@ +# Agent Control Evaluator - Galileo + +Galileo Luna2 evaluator for agent-control. 
+ +## Installation + +```bash +pip install agent-control-evaluator-galileo +``` + +Or via the convenience extra from the main evaluators package: +```bash +pip install agent-control-evaluators[galileo] +``` + +## Available Evaluators + +| Name | Description | +|------|-------------| +| `galileo.luna2` | Galileo Luna-2 runtime protection | + +## Configuration + +Set the `GALILEO_API_KEY` environment variable: +```bash +export GALILEO_API_KEY=your-api-key +``` + +## Usage + +Once installed, the evaluator is automatically discovered: + +```python +from agent_control_evaluators import discover_evaluators, get_evaluator + +discover_evaluators() +Luna2Evaluator = get_evaluator("galileo.luna2") +``` + +Or import directly: + +```python +from agent_control_evaluator_galileo.luna2 import Luna2Evaluator, Luna2EvaluatorConfig + +config = Luna2EvaluatorConfig( + stage_type="local", + metric="input_toxicity", + operator="gt", + target_value=0.5, + galileo_project="my-project", +) + +evaluator = Luna2Evaluator(config) +result = await evaluator.evaluate("some text") +``` + +## Documentation + +- [Galileo Protect Overview](https://v2docs.galileo.ai/concepts/protect/overview) +- [Galileo Python SDK Reference](https://v2docs.galileo.ai/sdk-api/python/reference/protect) diff --git a/evaluators/extra/galileo/pyproject.toml b/evaluators/extra/galileo/pyproject.toml new file mode 100644 index 00000000..7342c207 --- /dev/null +++ b/evaluators/extra/galileo/pyproject.toml @@ -0,0 +1,32 @@ +[project] +name = "agent-control-evaluator-galileo" +version = "3.0.0" +description = "Galileo Luna2 evaluator for agent-control" +readme = "README.md" +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "Agent Control Team" }] +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", + "httpx>=0.24.0", + "pydantic>=2.12.4", +] + +[project.optional-dependencies] +dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] + 
+[project.entry-points."agent_control.evaluators"] +"galileo.luna2" = "agent_control_evaluator_galileo.luna2:Luna2Evaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_galileo"] + +# For local dev, use override to resolve from workspace +[tool.uv.sources] +agent-control-evaluators = { path = "../../builtin", editable = true } +agent-control-models = { path = "../../../models", editable = true } diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py new file mode 100644 index 00000000..e6f23b3c --- /dev/null +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py @@ -0,0 +1,31 @@ +"""Agent Control Evaluator - Galileo. + +This package provides Galileo evaluators for agent-control. + +Available evaluators: + - galileo.luna2: Galileo Luna-2 runtime protection + +Installation: + pip install agent-control-evaluator-galileo + +Or via the agent-control-evaluators convenience extra: + pip install agent-control-evaluators[galileo] +""" + +from agent_control_evaluator_galileo.luna2 import ( + LUNA2_AVAILABLE, + Luna2Evaluator, + Luna2EvaluatorConfig, + Luna2Metric, + Luna2Operator, +) + +__version__ = "3.0.0" + +__all__ = [ + "Luna2Evaluator", + "Luna2EvaluatorConfig", + "Luna2Metric", + "Luna2Operator", + "LUNA2_AVAILABLE", +] diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py similarity index 75% rename from evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py rename to evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py index 60558600..6e2f4080 100644 --- a/evaluators/src/agent_control_evaluators/galileo_luna2/__init__.py +++ 
b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py @@ -4,7 +4,7 @@ using direct HTTP API calls (no SDK dependency required). Installation: - pip install agent-control-evaluators[luna2] + pip install agent-control-evaluator-galileo Environment Variables: GALILEO_API_KEY: Your Galileo API key (required) @@ -15,8 +15,12 @@ https://v2docs.galileo.ai/sdk-api/python/reference/protect """ -from .config import Luna2EvaluatorConfig, Luna2Metric, Luna2Operator -from .evaluator import LUNA2_AVAILABLE, Luna2Evaluator +from agent_control_evaluator_galileo.luna2.config import ( + Luna2EvaluatorConfig, + Luna2Metric, + Luna2Operator, +) +from agent_control_evaluator_galileo.luna2.evaluator import LUNA2_AVAILABLE, Luna2Evaluator __all__ = [ "Luna2EvaluatorConfig", @@ -28,7 +32,7 @@ # Export client classes when available (added to __all__ below) if LUNA2_AVAILABLE: - from .client import ( # noqa: F401 + from agent_control_evaluator_galileo.luna2.client import ( # noqa: F401 GalileoProtectClient, PassthroughAction, Payload, diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/client.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py similarity index 100% rename from evaluators/src/agent_control_evaluators/galileo_luna2/client.py rename to evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/config.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py similarity index 98% rename from evaluators/src/agent_control_evaluators/galileo_luna2/config.py rename to evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py index f24d221e..5f158e03 100644 --- a/evaluators/src/agent_control_evaluators/galileo_luna2/config.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py @@ -4,7 +4,7 @@ from pydantic import Field, model_validator -from 
agent_control_evaluators._base import EvaluatorConfig +from agent_control_evaluators import EvaluatorConfig # Supported Luna-2 metrics Luna2Metric = Literal[ diff --git a/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py similarity index 96% rename from evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py rename to evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py index 48ca3fe5..ab037be8 100644 --- a/evaluators/src/agent_control_evaluators/galileo_luna2/evaluator.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py @@ -10,9 +10,9 @@ from agent_control_models import EvaluatorResult -from agent_control_evaluators._base import Evaluator, EvaluatorMetadata -from agent_control_evaluators._registry import register_evaluator -from agent_control_evaluators.galileo_luna2.config import Luna2EvaluatorConfig +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator + +from agent_control_evaluator_galileo.luna2.config import Luna2EvaluatorConfig logger = logging.getLogger(__name__) @@ -64,7 +64,7 @@ class Luna2Evaluator(Evaluator[Luna2EvaluatorConfig]): Example: ```python - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator, Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator, Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -84,8 +84,8 @@ class Luna2Evaluator(Evaluator[Luna2EvaluatorConfig]): """ metadata = EvaluatorMetadata( - name="galileo/luna2", - version="2.0.0", + name="galileo.luna2", + version="3.0.0", description="Galileo Luna-2 enterprise runtime protection (direct API)", requires_api_key=True, timeout_ms=10000, diff --git a/evaluators/extra/galileo/tests/__init__.py b/evaluators/extra/galileo/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/sdks/python/tests/test_luna2_evaluator.py b/evaluators/extra/galileo/tests/test_luna2_evaluator.py similarity index 80% rename from sdks/python/tests/test_luna2_evaluator.py rename to evaluators/extra/galileo/tests/test_luna2_evaluator.py index a16d6586..3b34c620 100644 --- a/sdks/python/tests/test_luna2_evaluator.py +++ b/evaluators/extra/galileo/tests/test_luna2_evaluator.py @@ -23,7 +23,7 @@ def create_mock_protect_response( execution_time: float = 100.0, ) -> MagicMock: """Create a mock ProtectResponse object for testing.""" - from agent_control_evaluators.galileo_luna2.client import ProtectResponse, TraceMetadata + from agent_control_evaluator_galileo.luna2.client import ProtectResponse, TraceMetadata return ProtectResponse( status=status, @@ -44,7 +44,7 @@ class TestLuna2EvaluatorConfig: def test_local_stage_config_valid(self): """Test valid local stage configuration.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -62,7 +62,7 @@ def test_local_stage_config_valid(self): def test_local_stage_config_with_numeric_target(self): """Test local stage configuration with numeric target_value.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -76,7 +76,7 @@ def test_local_stage_config_with_numeric_target(self): def test_central_stage_config_valid(self): """Test valid central stage configuration.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="central", @@ -90,7 +90,7 @@ def test_central_stage_config_valid(self): def test_local_stage_requires_metric(self): """Test local stage requires metric field.""" - from 
agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="metric.*required"): Luna2EvaluatorConfig( @@ -101,7 +101,7 @@ def test_local_stage_requires_metric(self): def test_local_stage_requires_operator(self): """Test local stage requires operator field.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="operator.*required"): Luna2EvaluatorConfig( @@ -112,7 +112,7 @@ def test_local_stage_requires_operator(self): def test_local_stage_requires_target_value(self): """Test local stage requires target_value field.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="target_value.*required"): Luna2EvaluatorConfig( @@ -123,7 +123,7 @@ def test_local_stage_requires_target_value(self): def test_central_stage_requires_stage_name(self): """Test central stage requires stage_name field.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="stage_name.*required"): Luna2EvaluatorConfig( @@ -133,7 +133,7 @@ def test_central_stage_requires_stage_name(self): def test_timeout_ms_validation(self): """Test timeout_ms must be within valid range.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig # Too low with pytest.raises(ValidationError): @@ -161,7 +161,7 @@ def test_timeout_ms_validation(self): def test_on_error_validation(self): """Test on_error must be 'allow' or 'deny'.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + 
from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig config_allow = Luna2EvaluatorConfig( stage_type="central", @@ -186,7 +186,7 @@ def test_on_error_validation(self): def test_metric_validation(self): """Test metric must be a valid Luna2 metric.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig # Valid metrics valid_metrics = [ @@ -217,7 +217,7 @@ def test_metric_validation(self): def test_operator_validation(self): """Test operator must be a valid Luna2 operator.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig valid_operators = ["gt", "lt", "gte", "lte", "eq", "contains", "any"] for op in valid_operators: @@ -239,7 +239,7 @@ def test_operator_validation(self): def test_model_dump(self): """Test config can be dumped to dict.""" - from agent_control_evaluators.galileo_luna2 import Luna2EvaluatorConfig + from agent_control_evaluator_galileo.luna2 import Luna2EvaluatorConfig config = Luna2EvaluatorConfig( stage_type="local", @@ -263,10 +263,10 @@ class TestLuna2EvaluatorInheritance: """Tests for Luna-2 evaluator inheritance.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_evaluator_extends_base(self): """Test Luna2Evaluator extends Evaluator.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator assert issubclass(Luna2Evaluator, Evaluator) @@ -275,36 +275,36 @@ class TestLuna2EvaluatorImport: """Tests for Luna-2 evaluator import and initialization.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", 
True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_luna2_evaluator_import_success(self): """Test importing Luna-2 evaluator with dependencies available.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator assert Luna2Evaluator is not None - assert Luna2Evaluator.metadata.name == "galileo/luna2" - assert Luna2Evaluator.metadata.version == "2.0.0" + assert Luna2Evaluator.metadata.name == "galileo.luna2" + assert Luna2Evaluator.metadata.version == "3.0.0" - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", False) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", False) def test_luna2_evaluator_is_available_false_without_httpx(self): """Test that is_available() returns False when httpx is not installed.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator # When httpx is not available, is_available() should return False assert Luna2Evaluator.is_available() is False - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_luna2_evaluator_is_available_true_with_httpx(self): """Test that is_available() returns True when httpx is installed.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator # When httpx is available, is_available() should return True assert Luna2Evaluator.is_available() is True - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @patch.dict(os.environ, {}, clear=True) def test_luna2_evaluator_init_without_api_key_raises_error(self): """Test that initializing without API key raises 
ValueError.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -321,24 +321,24 @@ class TestLuna2EvaluatorMetadata: """Tests for Luna-2 evaluator metadata.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_metadata_fields(self): """Test Luna-2 evaluator metadata fields.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator metadata = Luna2Evaluator.metadata - assert metadata.name == "galileo/luna2" + assert metadata.name == "galileo.luna2" assert metadata.requires_api_key is True assert metadata.timeout_ms == 10000 # Config schema is now from config_model assert Luna2Evaluator.config_model is not None @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_config_schema_supported_metrics(self): """Test config schema includes all supported metrics.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator schema = Luna2Evaluator.config_model.model_json_schema() # Pydantic uses anyOf with const for Literal types @@ -364,12 +364,12 @@ class TestLuna2EvaluatorLocalStage: """Tests for Luna-2 evaluator with local stages.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_triggered(self): """Test local stage evaluation 
when rule is triggered.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient # Create mock response with triggered status mock_response = create_mock_protect_response( @@ -404,12 +404,12 @@ async def test_local_stage_triggered(self): assert result.metadata["status"] == "triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_not_triggered(self): """Test local stage evaluation when rule is not triggered.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response( status="not_triggered", @@ -439,12 +439,12 @@ async def test_local_stage_not_triggered(self): assert result.metadata["status"] == "not_triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_with_timeout_ms(self): """Test local stage respects timeout_ms configuration.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import 
GalileoProtectClient mock_response = create_mock_protect_response() @@ -476,12 +476,12 @@ class TestLuna2EvaluatorCentralStage: """Tests for Luna-2 evaluator with central stages.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_central_stage_evaluation(self): """Test central stage evaluation.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response( status="triggered", @@ -509,12 +509,12 @@ async def test_central_stage_evaluation(self): assert result.metadata["status"] == "triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_central_stage_without_version(self): """Test central stage without pinned version.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response(trace_id="trace-latest") @@ -542,10 +542,10 @@ class TestLuna2EvaluatorPayloadPreparation: """Tests for payload preparation logic.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + 
@patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_input_metric_payload(self): """Test payload for input metrics uses _prepare_payload correctly.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -562,10 +562,10 @@ def test_input_metric_payload(self): assert payload.output == "" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_output_metric_payload(self): """Test payload for output metrics uses _prepare_payload correctly.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -582,10 +582,10 @@ def test_output_metric_payload(self): assert payload.output == "llm output text" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_payload_field_override(self): """Test explicit payload_field configuration.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "central", @@ -605,12 +605,12 @@ class TestLuna2EvaluatorErrorHandling: """Tests for error handling in Luna-2 evaluator.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_error_with_fail_open(self): """Test error handling with fail open (default).""" - from 
agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -635,12 +635,12 @@ async def test_error_with_fail_open(self): assert result.metadata["fallback_action"] == "allow" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_error_with_fail_closed(self): """Test error handling with fail closed.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -665,12 +665,12 @@ async def test_error_with_fail_closed(self): assert result.metadata["fallback_action"] == "deny" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_empty_response_handling(self): """Test handling of empty/None response.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -697,10 +697,10 @@ class TestLuna2EvaluatorTimeoutHelper: """Tests for timeout helper method.""" 
@patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_get_timeout_from_config(self): """Test timeout conversion from config.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -714,10 +714,10 @@ def test_get_timeout_from_config(self): assert evaluator.get_timeout_seconds() == 5.0 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_get_timeout_from_default(self): """Test timeout uses metadata default.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -735,10 +735,10 @@ class TestLuna2EvaluatorNumericTargetValue: """Tests for numeric target_value handling.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_numeric_target_value_float(self): """Test evaluator accepts float target_value.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -751,10 +751,10 @@ def test_numeric_target_value_float(self): assert evaluator._get_numeric_target_value() == 0.5 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def 
test_numeric_target_value_int(self): """Test evaluator accepts int target_value.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -767,10 +767,10 @@ def test_numeric_target_value_int(self): assert evaluator._get_numeric_target_value() == 1 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_evaluators.galileo_luna2.evaluator.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluator_galileo.luna2.evaluator.LUNA2_AVAILABLE", True) def test_string_target_value_converts_to_float(self): """Test evaluator converts string target_value to float.""" - from agent_control_evaluators.galileo_luna2 import Luna2Evaluator + from agent_control_evaluator_galileo.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -788,7 +788,7 @@ class TestGalileoProtectClient: def test_client_init_with_api_key(self): """Test client initialization with API key.""" - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient() @@ -796,7 +796,7 @@ def test_client_init_with_api_key(self): def test_client_init_without_api_key_raises(self): """Test client raises error without API key.""" - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient with patch.dict(os.environ, {}, clear=True): with pytest.raises(ValueError, match="GALILEO_API_KEY"): @@ -804,7 +804,7 @@ def test_client_init_without_api_key_raises(self): def test_derive_api_url_from_console_url(self): """Test API URL derivation from console URL.""" - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2.client import 
GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient( @@ -814,7 +814,7 @@ def test_derive_api_url_from_console_url(self): def test_derive_api_url_default(self): """Test default API URL.""" - from agent_control_evaluators.galileo_luna2.client import GalileoProtectClient + from agent_control_evaluator_galileo.luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient() @@ -826,14 +826,14 @@ class TestPayloadDataClasses: def test_payload_to_dict(self): """Test Payload.to_dict() method.""" - from agent_control_evaluators.galileo_luna2.client import Payload + from agent_control_evaluator_galileo.luna2.client import Payload payload = Payload(input="test input", output="test output") assert payload.to_dict() == {"input": "test input", "output": "test output"} def test_rule_to_dict(self): """Test Rule.to_dict() method.""" - from agent_control_evaluators.galileo_luna2.client import Rule + from agent_control_evaluator_galileo.luna2.client import Rule rule = Rule(metric="input_toxicity", operator="gt", target_value=0.5) assert rule.to_dict() == { @@ -844,7 +844,7 @@ def test_rule_to_dict(self): def test_ruleset_to_dict(self): """Test Ruleset.to_dict() method.""" - from agent_control_evaluators.galileo_luna2.client import PassthroughAction, Rule, Ruleset + from agent_control_evaluator_galileo.luna2.client import PassthroughAction, Rule, Ruleset ruleset = Ruleset( rules=[Rule(metric="input_toxicity", operator="gt", target_value=0.5)], @@ -858,7 +858,7 @@ def test_ruleset_to_dict(self): def test_protect_response_from_dict(self): """Test ProtectResponse.from_dict() method.""" - from agent_control_evaluators.galileo_luna2.client import ProtectResponse + from agent_control_evaluator_galileo.luna2.client import ProtectResponse data = { "status": "triggered", diff --git a/evaluators/extra/template/README.md b/evaluators/extra/template/README.md new 
file mode 100644 index 00000000..cd52e07b --- /dev/null +++ b/evaluators/extra/template/README.md @@ -0,0 +1,100 @@ +# Evaluator Package Template + +This template provides a starting point for creating new evaluator packages for agent-control. + +## Setup + +1. Copy this template: `cp -r template/ {{ORG}}/` +2. Replace placeholders in pyproject.toml.template: + - `{{ORG}}`: Your organization name (e.g., `acme`) + - `{{EVALUATOR}}`: Evaluator name (e.g., `toxicity`) + - `{{CLASS}}`: Python class name (e.g., `ToxicityEvaluator`) + - `{{AUTHOR}}`: Author name +3. Rename to `pyproject.toml` +4. Create your evaluator in `src/agent_control_evaluator_{{ORG}}/` +5. Register via entry point in pyproject.toml + +## Directory Structure + +``` +{{ORG}}/ +├── pyproject.toml +├── src/agent_control_evaluator_{{ORG}}/ +│ ├── __init__.py +│ └── {{EVALUATOR}}/ +│ ├── __init__.py +│ ├── config.py # Extends EvaluatorConfig +│ └── evaluator.py # Extends Evaluator, uses @register_evaluator +└── tests/ + ├── __init__.py + └── test_{{EVALUATOR}}.py +``` + +## Entry Point Naming Convention + +Use `org.evaluator_name` format (e.g., `acme.toxicity`). + +This naming convention: +- Uses dots (`.`) as separators for external evaluators +- Distinguishes from built-in evaluators (no namespace) and agent-scoped evaluators (colon separator) + +## Implementation Pattern + +Your evaluator should: + +1. **Extend `EvaluatorConfig`** for configuration: + +```python +from agent_control_evaluators import EvaluatorConfig + +class MyEvaluatorConfig(EvaluatorConfig): + threshold: float = 0.5 + # ... other config fields +``` + +2. 
**Extend `Evaluator` and use `@register_evaluator`**: + +```python +from typing import Any +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult +@register_evaluator +class MyEvaluator(Evaluator[MyEvaluatorConfig]): + metadata = EvaluatorMetadata( + name="myorg.myevaluator", # Must match entry point + version="1.0.0", + description="My custom evaluator", + ) + config_model = MyEvaluatorConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + # Your evaluation logic here + return EvaluatorResult( + matched=..., + confidence=..., + message=..., + ) +``` + +## Testing + +Run tests with: +```bash +cd evaluators/extra/{{ORG}} +uv run pytest +``` + +## Publishing + +Build and publish your package: +```bash +uv build +uv publish +``` + +Once published, users can install via: +```bash +pip install agent-control-evaluator-{{ORG}} +``` + +The evaluator will be automatically discovered via entry points when used alongside agent-control. 
diff --git a/evaluators/extra/template/pyproject.toml.template b/evaluators/extra/template/pyproject.toml.template new file mode 100644 index 00000000..8a054e29 --- /dev/null +++ b/evaluators/extra/template/pyproject.toml.template @@ -0,0 +1,30 @@ +[project] +name = "agent-control-evaluator-{{ORG}}" +version = "3.0.0" +description = "{{ORG}} evaluators for agent-control" +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "{{AUTHOR}}" }] +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", + # Add your package-specific dependencies here +] + +[project.optional-dependencies] +dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] + +[project.entry-points."agent_control.evaluators"] +# Format: "org.evaluator_name" = "package.module:Class" +"{{ORG}}.{{EVALUATOR}}" = "agent_control_evaluator_{{ORG}}.{{EVALUATOR}}:{{CLASS}}" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_{{ORG}}"] + +[tool.uv.sources] +agent-control-evaluators = { path = "../../builtin", editable = true } +agent-control-models = { path = "../../../models", editable = true } diff --git a/models/src/agent_control_models/controls.py b/models/src/agent_control_models/controls.py index 47ab4d09..ab9023de 100644 --- a/models/src/agent_control_models/controls.py +++ b/models/src/agent_control_models/controls.py @@ -161,7 +161,7 @@ class EvaluatorSpec(BaseModel): Evaluator reference formats: - Built-in: "regex", "list", "json", "sql" - - External: "galileo/luna2" (requires agent-control-evaluators[luna2]) + - External: "galileo.luna2" (requires agent-control-evaluators[galileo]) - Agent-scoped: "my-agent:my-evaluator" (validated in endpoint, not here) """ diff --git a/pyproject.toml b/pyproject.toml index 0385e930..c673f66e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,14 @@ description = "Agent Control - protect your AI 
agents with controls" requires-python = ">=3.12" [tool.uv.workspace] -members = ["models", "server", "sdks/python", "engine", "evaluators"] +members = [ + "models", + "server", + "sdks/python", + "engine", + "evaluators/builtin", + # NOTE: evaluators/extra/* excluded - install separately when needed +] [tool.uv] # Require resolution to be compatible with the following environments so that CI and local dev @@ -61,6 +68,8 @@ version_toml = [ "models/pyproject.toml:project.version", "sdks/python/pyproject.toml:project.version", "server/pyproject.toml:project.version", + "evaluators/builtin/pyproject.toml:project.version", + "evaluators/extra/galileo/pyproject.toml:project.version", ] version_source = "tag" commit_message = "chore(release): v{version}" diff --git a/scripts/build.py b/scripts/build.py index caf4ae3e..1a5340c8 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -127,7 +127,11 @@ def build_sdk() -> None: def build_server() -> None: - """Build agent-control-server with vendored packages.""" + """Build agent-control-server with vendored packages. + + Note: evaluators are NOT vendored - server uses agent-control-evaluators as a + runtime dependency to avoid duplicate module conflicts with galileo extras. 
+ """ version = get_global_version() server_dir = ROOT / "server" server_src = server_dir / "src" @@ -135,7 +139,7 @@ def build_server() -> None: print(f"Building agent-control-server v{version}") # Clean previous builds and vendored code - for pkg in ["agent_control_models", "agent_control_engine", "agent_control_evaluators"]: + for pkg in ["agent_control_models", "agent_control_engine"]: target = server_src / pkg if target.exists(): shutil.rmtree(target) @@ -144,7 +148,7 @@ def build_server() -> None: if dist_dir.exists(): shutil.rmtree(dist_dir) - # Copy vendored packages + # Copy vendored packages (models and engine only, NOT evaluators) shutil.copytree( ROOT / "models" / "src" / "agent_control_models", server_src / "agent_control_models", @@ -153,10 +157,6 @@ def build_server() -> None: ROOT / "engine" / "src" / "agent_control_engine", server_src / "agent_control_engine", ) - shutil.copytree( - ROOT / "evaluators" / "src" / "agent_control_evaluators", - server_src / "agent_control_evaluators", - ) # Inject bundle metadata for conflict detection inject_bundle_metadata( @@ -169,11 +169,6 @@ def build_server() -> None: "agent-control-server", version, ) - inject_bundle_metadata( - server_src / "agent_control_evaluators" / "__init__.py", - "agent-control-server", - version, - ) # Set version set_package_version(server_dir / "pyproject.toml", version) @@ -183,7 +178,7 @@ def build_server() -> None: print(f" Built agent-control-server v{version}") finally: # Clean up vendored code (don't commit it) - for pkg in ["agent_control_models", "agent_control_engine", "agent_control_evaluators"]: + for pkg in ["agent_control_models", "agent_control_engine"]: target = server_src / pkg if target.exists(): shutil.rmtree(target) diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 40580251..4efff631 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -4,6 +4,8 @@ version = "4.0.0" description = "Python SDK for Agent Control - protect 
your AI agents with controls" requires-python = ">=3.12" # Note: agent-control-models and agent-control-engine are bundled at build time +# Note: agent-control-evaluators is a runtime dependency (NOT vendored) to avoid +# duplicate module conflict when galileo extras are installed dependencies = [ "httpx>=0.26.0", "pydantic>=2.5.0", @@ -11,6 +13,7 @@ dependencies = [ "docstring-parser>=0.15", # For @tool decorator schema inference "google-re2>=1.1", # For engine (bundled) "jsonschema>=4.0.0", # For models/engine (bundled) + "agent-control-evaluators>=3.0.0", # NOT vendored - avoid conflict with galileo ] authors = [ {name = "Agent Control Team"} @@ -32,10 +35,10 @@ Documentation = "https://github.com/yourusername/agent-control#readme" Repository = "https://github.com/yourusername/agent-control" [project.optional-dependencies] -# Optional: Luna-2 evaluator requires additional dependencies -luna2 = [ - "httpx>=0.24.0", -] +# NOTE: luna2 extra commented out during local dev - package not yet on PyPI +# luna2 = [ +# "agent-control-evaluator-galileo>=3.0.0", +# ] [dependency-groups] dev = [ diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py index 4c18102b..ee77851a 100644 --- a/sdks/python/src/agent_control/evaluators/__init__.py +++ b/sdks/python/src/agent_control/evaluators/__init__.py @@ -37,7 +37,7 @@ # Optionally export Luna-2 types when available try: - from agent_control_evaluators.galileo_luna2 import ( # noqa: F401 + from agent_control_evaluator_galileo.luna2 import ( # type: ignore[import-not-found] # noqa: F401 LUNA2_AVAILABLE, Luna2Evaluator, Luna2EvaluatorConfig, diff --git a/sdks/python/tests/test_luna2_smoke.py b/sdks/python/tests/test_luna2_smoke.py new file mode 100644 index 00000000..f6c3cdfe --- /dev/null +++ b/sdks/python/tests/test_luna2_smoke.py @@ -0,0 +1,19 @@ +"""Smoke test for Luna2 SDK exports.""" + +import pytest + + +def test_luna2_exports_available_when_installed(): + 
"""Verify SDK re-exports Luna2 types when package installed.""" + try: + from agent_control.evaluators import ( + LUNA2_AVAILABLE, + Luna2Evaluator, + Luna2EvaluatorConfig, + ) + + assert LUNA2_AVAILABLE is True + assert Luna2Evaluator is not None + assert Luna2EvaluatorConfig is not None + except ImportError: + pytest.skip("agent-control-evaluator-galileo not installed") diff --git a/server/pyproject.toml b/server/pyproject.toml index 52a0adcb..26b42e16 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -3,7 +3,9 @@ name = "agent-control-server" version = "4.0.0" description = "Server for Agent Control - manage and evaluate controls for AI agents" requires-python = ">=3.12" -# Note: agent-control-models, agent-control-engine, and agent-control-evaluators are bundled at build time +# Note: agent-control-models and agent-control-engine are bundled at build time +# Note: agent-control-evaluators is a runtime dependency (NOT vendored) to avoid +# duplicate module conflict when galileo extras are installed dependencies = [ "fastapi>=0.109.0", "starlette-exporter>=0.23.0", @@ -18,7 +20,7 @@ dependencies = [ "jsonschema>=4.25.1", "jsonschema-rs>=0.22.0", "google-re2>=1.1", # For engine (bundled) - "sqlglot[rs]>=20.0.0", # For SQL evaluator (bundled) + "agent-control-evaluators>=3.0.0", # NOT vendored - avoid conflict with galileo ] authors = [ {name = "Agent Control Team"} @@ -27,9 +29,10 @@ readme = "README.md" license = {text = "Apache-2.0"} [project.optional-dependencies] -luna2 = [ - "httpx>=0.24.0", -] +# NOTE: luna2 extra commented out during local dev - package not yet on PyPI +# luna2 = [ +# "agent-control-evaluator-galileo>=3.0.0", +# ] [dependency-groups] dev = [ @@ -55,8 +58,9 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -# Note: agent_control_models, agent_control_engine, and agent_control_evaluators are copied by scripts/build.py -packages = ["src/agent_control_server", 
"src/agent_control_models", "src/agent_control_engine", "src/agent_control_evaluators"] +# Note: agent_control_models and agent_control_engine are copied by scripts/build.py +# Note: agent_control_evaluators is a runtime dep, not vendored +packages = ["src/agent_control_server", "src/agent_control_models", "src/agent_control_engine"] [tool.pytest.ini_options] asyncio_mode = "auto" diff --git a/server/src/agent_control_server/services/evaluator_utils.py b/server/src/agent_control_server/services/evaluator_utils.py index a39074d7..6d5dbbb5 100644 --- a/server/src/agent_control_server/services/evaluator_utils.py +++ b/server/src/agent_control_server/services/evaluator_utils.py @@ -2,7 +2,7 @@ Evaluator Type Name Formats: - Built-in: "regex", "list", "json", "sql" - - External: "galileo/luna2", "nvidia/nemo" (slash separator) + - External: "galileo.luna2", "nvidia.nemo" (dot separator) - Agent-scoped: "my-agent:pii-detector" (colon separator) The key distinction is: @@ -68,9 +68,9 @@ def parse_evaluator_ref_full(evaluator_ref: str) -> ParsedEvaluatorRef: namespace=agent, local_name=local_name, ) - elif "/" in evaluator_ref: - # External: "galileo/luna2" - provider, local_name = evaluator_ref.split("/", 1) + elif "." 
in evaluator_ref: + # External: "galileo.luna2" + provider, local_name = evaluator_ref.split(".", 1) return ParsedEvaluatorRef( type="external", name=evaluator_ref, diff --git a/server/tests/test_controls_additional.py b/server/tests/test_controls_additional.py index 1ae79040..271f04b1 100644 --- a/server/tests/test_controls_additional.py +++ b/server/tests/test_controls_additional.py @@ -12,7 +12,7 @@ from agent_control_server.models import Control -from agent_control_models.controls import RegexEvaluatorConfig +from agent_control_evaluators import RegexEvaluatorConfig from agent_control_server.endpoints import controls as controls_module from agent_control_server.models import Control diff --git a/server/tests/test_evaluator_utils.py b/server/tests/test_evaluator_utils.py index f7f21c1d..706dcd53 100644 --- a/server/tests/test_evaluator_utils.py +++ b/server/tests/test_evaluator_utils.py @@ -26,11 +26,11 @@ def test_builtin_evaluator(self) -> None: def test_external_evaluator(self) -> None: """Given an external evaluator, when parsing full, then type is external.""" # When - result = parse_evaluator_ref_full("galileo/luna2") + result = parse_evaluator_ref_full("galileo.luna2") # Then assert result.type == "external" - assert result.name == "galileo/luna2" + assert result.name == "galileo.luna2" assert result.namespace == "galileo" assert result.local_name == "luna2" @@ -46,24 +46,24 @@ def test_agent_scoped_evaluator(self) -> None: assert result.local_name == "pii-detector" def test_external_with_nested_path(self) -> None: - """Given an external evaluator with nested path, when parsing, splits on first slash.""" + """Given an external evaluator with nested path, when parsing, splits on first dot.""" # When - result = parse_evaluator_ref_full("acme/safety/toxicity") + result = parse_evaluator_ref_full("acme.safety.toxicity") # Then assert result.type == "external" assert result.namespace == "acme" - assert result.local_name == "safety/toxicity" + assert 
result.local_name == "safety.toxicity" - def test_agent_scoped_with_slash_in_name(self) -> None: - """Given agent-scoped with slash in name, when parsing, then colon takes precedence.""" - # When - colon should be detected before slash - result = parse_evaluator_ref_full("my-agent:vendor/eval") + def test_agent_scoped_with_dot_in_name(self) -> None: + """Given agent-scoped with dot in name, when parsing, then colon takes precedence.""" + # When - colon should be detected before dot + result = parse_evaluator_ref_full("my-agent:vendor.eval") # Then assert result.type == "agent" assert result.namespace == "my-agent" - assert result.local_name == "vendor/eval" + assert result.local_name == "vendor.eval" class TestIsAgentScoped: @@ -75,7 +75,7 @@ def test_builtin_not_agent_scoped(self) -> None: def test_external_not_agent_scoped(self) -> None: """Given an external evaluator, when checking, then returns False.""" - assert is_agent_scoped("galileo/luna2") is False + assert is_agent_scoped("galileo.luna2") is False def test_agent_scoped_returns_true(self) -> None: """Given an agent-scoped evaluator, when checking, then returns True.""" From b5e4e44a6bd0c84f212bc4a0048eb5f4ea4808e4 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 18:42:35 +0530 Subject: [PATCH 09/21] fix(ci): add ruff/mypy to galileo dev deps and fix workflow - Add ruff and mypy to galileo package dev dependencies - Update CI workflow to use `uv sync --extra dev` instead of `uv pip install` - Use `uv run --extra dev` to ensure dev tools are available - Update template with same dev dependencies --- .github/workflows/test-extras.yml | 10 +++++----- evaluators/extra/galileo/pyproject.toml | 7 ++++++- evaluators/extra/template/pyproject.toml.template | 7 ++++++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-extras.yml b/.github/workflows/test-extras.yml index dbb2ea33..8b577775 100644 --- a/.github/workflows/test-extras.yml +++ 
b/.github/workflows/test-extras.yml @@ -34,17 +34,17 @@ jobs: - name: Sync workspace run: make sync - - name: Install galileo extra - run: cd evaluators/extra/galileo && uv pip install -e . + - name: Install galileo extra with dev dependencies + run: cd evaluators/extra/galileo && uv sync --extra dev - name: Lint galileo - run: cd evaluators/extra/galileo && uv run ruff check --config ../../../pyproject.toml src/ + run: cd evaluators/extra/galileo && uv run --extra dev ruff check --config ../../../pyproject.toml src/ - name: Typecheck galileo - run: cd evaluators/extra/galileo && uv run mypy --config-file ../../../pyproject.toml src/ + run: cd evaluators/extra/galileo && uv run --extra dev mypy --config-file ../../../pyproject.toml src/ - name: Test galileo - run: cd evaluators/extra/galileo && uv run pytest + run: cd evaluators/extra/galileo && uv run --extra dev pytest - name: Verify SDK integration run: | diff --git a/evaluators/extra/galileo/pyproject.toml b/evaluators/extra/galileo/pyproject.toml index 7342c207..f14f7d14 100644 --- a/evaluators/extra/galileo/pyproject.toml +++ b/evaluators/extra/galileo/pyproject.toml @@ -14,7 +14,12 @@ dependencies = [ ] [project.optional-dependencies] -dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "ruff>=0.1.0", + "mypy>=1.8.0", +] [project.entry-points."agent_control.evaluators"] "galileo.luna2" = "agent_control_evaluator_galileo.luna2:Luna2Evaluator" diff --git a/evaluators/extra/template/pyproject.toml.template b/evaluators/extra/template/pyproject.toml.template index 8a054e29..484864bb 100644 --- a/evaluators/extra/template/pyproject.toml.template +++ b/evaluators/extra/template/pyproject.toml.template @@ -12,7 +12,12 @@ dependencies = [ ] [project.optional-dependencies] -dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "ruff>=0.1.0", + "mypy>=1.8.0", +] 
[project.entry-points."agent_control.evaluators"] # Format: "org.evaluator_name" = "package.module:Class" From dc9fa1c08a94e331b6d674d645b6178904eabef4 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 18:46:28 +0530 Subject: [PATCH 10/21] style(galileo): fix import sorting order --- .../src/agent_control_evaluator_galileo/luna2/config.py | 3 +-- .../src/agent_control_evaluator_galileo/luna2/evaluator.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py index 5f158e03..aced94ab 100644 --- a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py @@ -2,9 +2,8 @@ from typing import Any, Literal -from pydantic import Field, model_validator - from agent_control_evaluators import EvaluatorConfig +from pydantic import Field, model_validator # Supported Luna-2 metrics Luna2Metric = Literal[ diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py index ab037be8..18137115 100644 --- a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py @@ -8,9 +8,8 @@ import os from typing import Any -from agent_control_models import EvaluatorResult - from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult from agent_control_evaluator_galileo.luna2.config import Luna2EvaluatorConfig From 3b1a1edb7c6bf0fe00ab1a4244fa5d4e51314f2c Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 18:48:33 +0530 Subject: [PATCH 11/21] chore(hooks): add galileo extras to pre-push checks --- 
.githooks/pre-push | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.githooks/pre-push b/.githooks/pre-push index 9141a52c..c4ac972d 100755 --- a/.githooks/pre-push +++ b/.githooks/pre-push @@ -14,3 +14,12 @@ make lint echo "pre-push: running make typecheck" make typecheck + +# Check extras if they exist and have changes +if [ -d "evaluators/extra/galileo" ]; then + echo "pre-push: checking evaluators/extra/galileo" + cd evaluators/extra/galileo + uv run --extra dev ruff check --config ../../../pyproject.toml src/ + uv run --extra dev mypy --config-file ../../../pyproject.toml src/ + cd "$REPO_ROOT" +fi From d8e7025f4db5a78faf47eb4802c11108a446a6e4 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 18:48:45 +0530 Subject: [PATCH 12/21] chore(ci): remove test-extras workflow (covered by pre-push hook) --- .github/workflows/test-extras.yml | 52 ------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 .github/workflows/test-extras.yml diff --git a/.github/workflows/test-extras.yml b/.github/workflows/test-extras.yml deleted file mode 100644 index 8b577775..00000000 --- a/.github/workflows/test-extras.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Test Extras - -on: - push: - paths: - # Trigger on extra changes - - 'evaluators/extra/**' - # Also trigger on core changes that could break extras - - 'evaluators/builtin/**' - - 'models/**' - - 'engine/**' - - 'server/**' - - 'sdks/python/**' - pull_request: - paths: - - 'evaluators/extra/**' - - 'evaluators/builtin/**' - - 'models/**' - - 'engine/**' - - 'server/**' - - 'sdks/python/**' - -jobs: - test-galileo: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Setup uv and Python - uses: astral-sh/setup-uv@v3 - with: - python-version: "3.12" - - - name: Sync workspace - run: make sync - - - name: Install galileo extra with dev dependencies - run: cd evaluators/extra/galileo && uv sync --extra dev - - - name: Lint galileo - run: cd 
evaluators/extra/galileo && uv run --extra dev ruff check --config ../../../pyproject.toml src/ - - - name: Typecheck galileo - run: cd evaluators/extra/galileo && uv run --extra dev mypy --config-file ../../../pyproject.toml src/ - - - name: Test galileo - run: cd evaluators/extra/galileo && uv run --extra dev pytest - - - name: Verify SDK integration - run: | - cd sdks/python - uv run pytest tests/test_luna2_smoke.py From e6dd37168f34c67f6289b324a19209e249d99eaf Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 18:49:54 +0530 Subject: [PATCH 13/21] fix(galileo): fix api_key type annotation for mypy --- .../agent_control_evaluator_galileo/luna2/client.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py index 192fd868..363abc56 100644 --- a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py @@ -207,16 +207,16 @@ def __init__( Raises: ValueError: If no API key is provided or found in environment. """ - self.api_key = api_key or os.getenv("GALILEO_API_KEY") - self.console_url = ( - console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai" - ) - - if not self.api_key: + resolved_api_key = api_key or os.getenv("GALILEO_API_KEY") + if not resolved_api_key: raise ValueError( "GALILEO_API_KEY is required. " "Set it as an environment variable or pass it to the constructor." 
) + self.api_key: str = resolved_api_key + self.console_url = ( + console_url or os.getenv("GALILEO_CONSOLE_URL") or "https://console.galileo.ai" + ) # Derive API base URL from console URL # console.galileo.ai -> api.galileo.ai From 88b5c57232f92593cb61b190817f27123846afa8 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 19:02:18 +0530 Subject: [PATCH 14/21] docs: update evaluator naming from slash to dot notation Update all documentation and code references from `galileo/luna2` to `galileo.luna2` to match the actual implementation. The dot separator is used for external evaluators to distinguish from agent-scoped evaluators (which use colon). Files updated: - docs/OVERVIEW.md, docs/REFERENCE.md - evaluator examples - CONTRIBUTING.md - naming convention docs - README.md, examples/ - usage examples - UI evaluator definition and test fixtures - Server evaluator_utils.py docstrings - Evaluator _base.py docstring --- CONTRIBUTING.md | 28 +++++++++---------- README.md | 2 +- docs/OVERVIEW.md | 10 +++---- docs/REFERENCE.md | 12 ++++---- .../src/agent_control_evaluators/_base.py | 2 +- examples/customer_support_agent/README.md | 2 +- .../services/evaluator_utils.py | 6 ++-- ui/src/core/evaluators/luna2/index.ts | 2 +- ui/tests/fixtures.ts | 2 +- 9 files changed, 33 insertions(+), 33 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c530b459..534da7e0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -237,14 +237,14 @@ evaluators/src/agent_control_evaluators/ │ └── evaluator.py # SqlEvaluator implementation # # External evaluators (namespaced, optional dependencies) -└── galileo_luna2/ # Type name: "galileo/luna2" (folder uses underscore) +└── galileo_luna2/ # Type name: "galileo.luna2" (folder uses underscore) ├── config.py # Luna2EvaluatorConfig ├── evaluator.py # Luna2Evaluator implementation └── client.py # Direct HTTP client (no SDK dependency) ``` > **Note:** Folder names use `snake_case` (Python convention), but type names in 
metadata -> use `provider/name` format with slash for external evaluators. +> use `provider.name` format with dot separator for external evaluators. **Creating a new evaluator:** @@ -253,12 +253,12 @@ Choose the appropriate type based on your use case: | Type | When to Use | Name Format | |------|-------------|-------------| | Built-in | Core functionality, no external deps | `my-evaluator` | -| External | External provider integration, optional deps | `provider/name` | +| External | External provider integration, optional deps | `provider.name` | | Agent-scoped | Custom logic deployed with agent | `my-agent:custom` | ### Creating a Third-Party Evaluator (Recommended for External Providers) -This example creates a external evaluator `acme/toxicity`: +This example creates a external evaluator `acme.toxicity`: **1. Create evaluator directory:** ```bash @@ -322,7 +322,7 @@ class AcmeToxicityEvaluator(Evaluator[AcmeToxicityEvaluatorConfig]): """ metadata = EvaluatorMetadata( - name="acme/toxicity", # <-- External provider with slash + name="acme.toxicity", # <-- External provider with dot version="1.0.0", description="Acme toxicity detection API", requires_api_key=True, @@ -398,7 +398,7 @@ all = ["httpx>=0.24.0"] # Include in 'all' extra regex = "agent_control_evaluators.regex:RegexEvaluator" list = "agent_control_evaluators.list:ListEvaluator" # ... existing entries ... -"acme/toxicity" = "agent_control_evaluators.acme_toxicity:AcmeToxicityEvaluator" +"acme.toxicity" = "agent_control_evaluators.acme_toxicity:AcmeToxicityEvaluator" ``` **6. Add tests in `evaluators/tests/acme_toxicity/`** @@ -635,15 +635,15 @@ Evaluator type names identify evaluator implementations. 
The format indicates th | Format | Origin | Examples | |--------|--------|----------| | `name` | Built-in (first-party, no dependencies) | `regex`, `list`, `json`, `sql` | -| `provider/name` | External (external providers, optional deps) | `galileo/luna2`, `nvidia/nemo` | +| `provider.name` | External (external providers, optional deps) | `galileo.luna2`, `nvidia.nemo` | | `agent:name` | Agent-scoped (custom code deployed with agent) | `my-agent:pii-detector` | **Parsing rules:** ```python if ":" in name: # Agent-scoped (split on first ":") agent, evaluator = name.split(":", 1) -elif "/" in name: # External provider (split on first "/") - provider, evaluator = name.split("/", 1) +elif "." in name: # External provider (split on first ".") + provider, evaluator = name.split(".", 1) else: # Built-in evaluator = name ``` @@ -655,8 +655,8 @@ else: # Built-in - Core dependencies only (included in base package) - Imported and registered automatically on package import -**External evaluators** (`galileo/luna2`): -- Use `provider/name` format with slash separator +**External evaluators** (`galileo.luna2`): +- Use `provider.name` format with dot separator - Have optional dependencies (install via extras: `pip install agent-control-evaluators[luna2]`) - Discovered via Python entry points (not auto-imported) @@ -685,8 +685,8 @@ Controls reference them as `my-agent:pii-detector` (the `:` indicates agent scop | Item | Convention | Example | |------|------------|---------| | Folder name | `snake_case` (Python package) | `galileo_luna2/` | -| Entry point key | Same as type name | `"galileo/luna2"` | -| Metadata name | Same as type name | `name="galileo/luna2"` | +| Entry point key | Same as type name | `"galileo.luna2"` | +| Metadata name | Same as type name | `name="galileo.luna2"` | > **Note:** In code, use "provider" as the type identifier. In user-facing docs, > use "external" as the descriptive term. 
@@ -709,7 +709,7 @@ Controls reference them as `my-agent:pii-detector` (the `:` indicates agent scop **Naming convention quick reference:** ``` Built-in: regex, list, json, sql -External: galileo/luna2, nvidia/nemo +External: galileo.luna2, nvidia.nemo Agent-scoped: my-agent:pii-detector ``` diff --git a/README.md b/README.md index 7e7af095..b8cd997b 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,7 @@ Controls are defined via the API or dashboard. Each control specifies what to ch "scope": { "step_names": ["process_user_message"], "stages": ["pre"] }, "selector": { "path": "input" }, "evaluator": { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "input_toxicity", "operator": "gt", diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index 25c5eb42..0b832279 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -236,7 +236,7 @@ Flexible value matching with multiple modes and logic options. --- -### 3. Luna-2 Evaluator (`galileo/luna2`) +### 3. Luna-2 Evaluator (`galileo.luna2`) AI-powered detection using Galileo's Luna-2 small language models. Provides real-time, low-latency evaluation for complex patterns that can't be caught with regex or lists. @@ -267,7 +267,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real ```json // Block toxic inputs (score > 0.5) { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -278,7 +278,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real // Block prompt injection attempts { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "prompt_injection", "operator": "gt", @@ -289,7 +289,7 @@ AI-powered detection using Galileo's Luna-2 small language models. 
Provides real // Flag potential hallucinations (warn but allow) { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "hallucination", "operator": "gt", @@ -299,7 +299,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real // Using a central stage (pre-defined server-side rules) { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "stage_type": "central", "stage_name": "production-safety", diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 81e9989c..53328e13 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -385,7 +385,7 @@ Flexible value matching with multiple modes and logic options. AI-powered detection using Galileo's Luna-2 small language models. Provides real-time, low-latency evaluation for complex patterns that can't be caught with regex or lists. -**Evaluator name**: `galileo/luna2` +**Evaluator name**: `galileo.luna2` **Installation**: Luna-2 requires an optional dependency: @@ -429,7 +429,7 @@ pip install agent-control-evaluators[luna2] ```json // Block toxic inputs (score > 0.5) { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -440,7 +440,7 @@ pip install agent-control-evaluators[luna2] // Block prompt injection attempts { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "prompt_injection", "operator": "gt", @@ -451,7 +451,7 @@ pip install agent-control-evaluators[luna2] // Flag potential hallucinations (warn but allow) { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "metric": "hallucination", "operator": "gt", @@ -461,7 +461,7 @@ pip install agent-control-evaluators[luna2] // Using central stage (pre-defined in Galileo) { - "name": "galileo/luna2", + "name": "galileo.luna2", "config": { "stage_type": "central", "stage_name": "production-safety", @@ -967,6 +967,6 @@ make alembic-upgrade 4. Verify the metric name is valid 5. 
Check `on_error` setting if failures are silently allowed -**Evaluator Not Found**: If `galileo/luna2` doesn't appear in `list_evaluators()`: +**Evaluator Not Found**: If `galileo.luna2` doesn't appear in `list_evaluators()`: - Verify `httpx` is installed (Luna-2's `is_available()` returns `False` without it) - Check server logs for evaluator discovery messages diff --git a/evaluators/builtin/src/agent_control_evaluators/_base.py b/evaluators/builtin/src/agent_control_evaluators/_base.py index b548e60b..f5e6fc77 100644 --- a/evaluators/builtin/src/agent_control_evaluators/_base.py +++ b/evaluators/builtin/src/agent_control_evaluators/_base.py @@ -43,7 +43,7 @@ class EvaluatorMetadata: """Metadata about an evaluator. Attributes: - name: Unique evaluator name (e.g., "regex", "galileo/luna2") + name: Unique evaluator name (e.g., "regex", "galileo.luna2") version: Evaluator version string description: Human-readable description requires_api_key: Whether the evaluator requires an API key diff --git a/examples/customer_support_agent/README.md b/examples/customer_support_agent/README.md index 0517135d..8eeedd49 100644 --- a/examples/customer_support_agent/README.md +++ b/examples/customer_support_agent/README.md @@ -302,7 +302,7 @@ scope: selector: path: input evaluator: - name: galileo/luna2 + name: galileo.luna2 config: stage_type: local metric: input_toxicity diff --git a/server/src/agent_control_server/services/evaluator_utils.py b/server/src/agent_control_server/services/evaluator_utils.py index 6d5dbbb5..1af769bb 100644 --- a/server/src/agent_control_server/services/evaluator_utils.py +++ b/server/src/agent_control_server/services/evaluator_utils.py @@ -24,7 +24,7 @@ class ParsedEvaluatorRef: Attributes: type: The evaluator category ("builtin", "external", or "agent") - name: The full evaluator name (e.g., "regex", "galileo/luna2", "my-agent:pii") + name: The full evaluator name (e.g., "regex", "galileo.luna2", "my-agent:pii") namespace: For external evaluators, the 
provider name; for agent-scoped, the agent name local_name: The evaluator name without namespace prefix """ @@ -40,7 +40,7 @@ def parse_evaluator_ref_full(evaluator_ref: str) -> ParsedEvaluatorRef: Determines the evaluator type based on the name format: - Contains ":" → agent-scoped (split on first ":") - - Contains "/" → external (split on first "/") + - Contains "." → external (split on first ".") - Otherwise → built-in Args: @@ -53,7 +53,7 @@ def parse_evaluator_ref_full(evaluator_ref: str) -> ParsedEvaluatorRef: >>> parse_evaluator_ref_full("regex") ParsedEvaluatorRef(type="builtin", name="regex", ...) - >>> parse_evaluator_ref_full("galileo/luna2") + >>> parse_evaluator_ref_full("galileo.luna2") ParsedEvaluatorRef(type="external", namespace="galileo", ...) >>> parse_evaluator_ref_full("my-agent:pii-detector") diff --git a/ui/src/core/evaluators/luna2/index.ts b/ui/src/core/evaluators/luna2/index.ts index c9be82c8..c42e6015 100644 --- a/ui/src/core/evaluators/luna2/index.ts +++ b/ui/src/core/evaluators/luna2/index.ts @@ -33,7 +33,7 @@ const numberOrNull = (value: number | ""): number | null => * prompt injection, PII detection, and more. 
*/ export const luna2Evaluator: EvaluatorDefinition = { - id: "galileo/luna2", + id: "galileo.luna2", displayName: "Galileo Luna-2", initialValues: { diff --git a/ui/tests/fixtures.ts b/ui/tests/fixtures.ts index 05881c3c..82395efe 100644 --- a/ui/tests/fixtures.ts +++ b/ui/tests/fixtures.ts @@ -240,7 +240,7 @@ const evaluatorsResponse: EvaluatorsResponse = { required: ["schema"], }, }, - "galileo/luna2": { + "galileo.luna2": { name: "Galileo Luna-2", version: "1.0.0", description: "AI-powered content moderation using Galileo Luna-2", From 32f6f70d419127747e8cd761cee8ff1b483ccea1 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 19:43:37 +0530 Subject: [PATCH 15/21] fix: SQL evaluator LIMIT/OFFSET bypass and update docs for new package structure Bug fixes: - fix(sql): use args.get() instead of find() for LIMIT/OFFSET to prevent subquery clauses from being attributed to outer queries - fix(engine): bump dependency floor to >=3.0.0 for models and evaluators Documentation updates: - Update evaluators directory structure to reflect builtin/extra tiers - Update external evaluator example to use separate package pattern - Show both direct install and extras syntax for Luna-2 - Fix all outdated path references to use evaluators/builtin/ Package config: - Add TODO comments for commented-out extras (tracking PyPI publish) --- CONTRIBUTING.md | 234 ++++++++---------- docs/OVERVIEW.md | 10 +- docs/REFERENCE.md | 12 +- engine/pyproject.toml | 4 +- evaluators/builtin/pyproject.toml | 3 +- .../agent_control_evaluators/sql/evaluator.py | 8 +- sdks/python/pyproject.toml | 3 +- server/pyproject.toml | 3 +- 8 files changed, 137 insertions(+), 140 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 534da7e0..59b79f57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -218,33 +218,31 @@ make test Extensible evaluators for custom detection logic. 
```bash -# Location -evaluators/src/agent_control_evaluators/ - -# Key directories (flat structure - each evaluator is a peer directory) -# Built-in evaluators (no namespace, core dependencies only) -├── regex/ # Type name: "regex" -│ ├── config.py # RegexEvaluatorConfig -│ └── evaluator.py # RegexEvaluator implementation -├── list/ # Type name: "list" -│ ├── config.py # ListEvaluatorConfig -│ └── evaluator.py # ListEvaluator implementation -├── json/ # Type name: "json" -│ ├── config.py # JsonEvaluatorConfig -│ └── evaluator.py # JsonEvaluator implementation -├── sql/ # Type name: "sql" -│ ├── config.py # SqlEvaluatorConfig -│ └── evaluator.py # SqlEvaluator implementation -# -# External evaluators (namespaced, optional dependencies) -└── galileo_luna2/ # Type name: "galileo.luna2" (folder uses underscore) - ├── config.py # Luna2EvaluatorConfig - ├── evaluator.py # Luna2Evaluator implementation - └── client.py # Direct HTTP client (no SDK dependency) -``` - -> **Note:** Folder names use `snake_case` (Python convention), but type names in metadata -> use `provider.name` format with dot separator for external evaluators. 
+evaluators/ +├── builtin/ # agent-control-evaluators package +│ ├── pyproject.toml +│ ├── src/agent_control_evaluators/ +│ │ ├── _base.py # Evaluator, EvaluatorConfig, EvaluatorMetadata +│ │ ├── _registry.py # register_evaluator, get_evaluator +│ │ ├── _discovery.py # Entry point discovery +│ │ ├── _factory.py # Instance caching +│ │ ├── regex/ # Type name: "regex" +│ │ ├── list/ # Type name: "list" +│ │ ├── json/ # Type name: "json" +│ │ └── sql/ # Type name: "sql" +│ └── tests/ +│ +└── extra/ # External evaluator packages + ├── galileo/ # agent-control-evaluator-galileo package + │ ├── pyproject.toml # Separate package with own entry points + │ ├── src/agent_control_evaluator_galileo/ + │ │ └── luna2/ # Type name: "galileo.luna2" + │ └── tests/ + └── template/ # Template for new external evaluators +``` + +> **Note:** Built-in evaluators live in the `builtin/` package. External evaluators are +> separate packages under `extra/`, each with their own `pyproject.toml` and entry points. **Creating a new evaluator:** @@ -256,17 +254,49 @@ Choose the appropriate type based on your use case: | External | External provider integration, optional deps | `provider.name` | | Agent-scoped | Custom logic deployed with agent | `my-agent:custom` | -### Creating a Third-Party Evaluator (Recommended for External Providers) +### Creating an External Evaluator Package (Recommended for External Providers) -This example creates a external evaluator `acme.toxicity`: +External evaluators live in their own packages under `evaluators/extra/`. This example +creates an `acme.toxicity` evaluator as a separate package. -**1. Create evaluator directory:** +**1. Copy the template and set up the package:** ```bash -mkdir -p evaluators/src/agent_control_evaluators/acme_toxicity/ -touch evaluators/src/agent_control_evaluators/acme_toxicity/__init__.py +cp -r evaluators/extra/template evaluators/extra/acme +cd evaluators/extra/acme +``` + +**2. 
Create `pyproject.toml`** (from the template): +```toml +[project] +name = "agent-control-evaluator-acme" +version = "1.0.0" +description = "Acme toxicity evaluator for agent-control" +requires-python = ">=3.12" +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", + "httpx>=0.24.0", # Your external dependencies +] + +[project.entry-points."agent_control.evaluators"] +"acme.toxicity" = "agent_control_evaluator_acme.toxicity:AcmeToxicityEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_acme"] ``` -**2. Define configuration model (`config.py`):** +**3. Create directory structure:** +```bash +mkdir -p src/agent_control_evaluator_acme/toxicity +touch src/agent_control_evaluator_acme/__init__.py +touch src/agent_control_evaluator_acme/toxicity/__init__.py +``` + +**4. Define configuration model (`toxicity/config.py`):** ```python from pydantic import Field from agent_control_evaluators import EvaluatorConfig @@ -285,44 +315,25 @@ class AcmeToxicityEvaluatorConfig(EvaluatorConfig): default_factory=lambda: ["hate", "violence"], description="Toxicity categories to check", ) - timeout_ms: int = Field( - default=5000, - ge=100, - le=30000, - description="API timeout in milliseconds", - ) ``` -**3. Implement evaluator (`evaluator.py`):** +**5. 
Implement evaluator (`toxicity/evaluator.py`):** ```python from typing import Any +import httpx +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator from agent_control_models import EvaluatorResult -from agent_control_evaluators._base import Evaluator, EvaluatorMetadata -from agent_control_evaluators._registry import register_evaluator -from agent_control_evaluators.acme_toxicity.config import AcmeToxicityEvaluatorConfig - -# Check optional dependency -try: - import httpx - ACME_AVAILABLE = True -except ImportError: - ACME_AVAILABLE = False +from agent_control_evaluator_acme.toxicity.config import AcmeToxicityEvaluatorConfig @register_evaluator class AcmeToxicityEvaluator(Evaluator[AcmeToxicityEvaluatorConfig]): - """Acme Toxicity detection evaluator. - - Calls the Acme API to detect toxic content in text. - - Example config: - {"threshold": 0.8, "categories": ["hate", "harassment"]} - """ + """Acme Toxicity detection evaluator.""" metadata = EvaluatorMetadata( - name="acme.toxicity", # <-- External provider with dot + name="acme.toxicity", # <-- External provider: org.name format version="1.0.0", description="Acme toxicity detection API", requires_api_key=True, @@ -330,88 +341,61 @@ class AcmeToxicityEvaluator(Evaluator[AcmeToxicityEvaluatorConfig]): ) config_model = AcmeToxicityEvaluatorConfig - @classmethod - def is_available(cls) -> bool: - """Check if httpx dependency is installed.""" - return ACME_AVAILABLE - - def __init__(self, config: AcmeToxicityEvaluatorConfig) -> None: - super().__init__(config) - # Pre-compile or initialize resources here (will be cached) - self._client: httpx.AsyncClient | None = None - async def evaluate(self, data: Any) -> EvaluatorResult: """Evaluate text for toxicity.""" if data is None: - return EvaluatorResult( - matched=False, - confidence=1.0, - message="No data to evaluate", - ) - - text = str(data) + return EvaluatorResult(matched=False, confidence=1.0, message="No data") try: - score = 
await self._call_api(text) - matched = score >= self.config.threshold - + score = await self._call_api(str(data)) return EvaluatorResult( - matched=matched, + matched=score >= self.config.threshold, confidence=score, message=f"Toxicity score: {score:.2f}", - metadata={ - "score": score, - "threshold": self.config.threshold, - "categories": self.config.categories, - }, ) except Exception as e: - # Return error result (fail-open by default) return EvaluatorResult( matched=False, confidence=0.0, message=f"Evaluation failed: {e}", - error=f"{type(e).__name__}: {str(e)[:200]}", + error=str(e), ) async def _call_api(self, text: str) -> float: """Call Acme API and return toxicity score.""" - # Implementation details... + # Your implementation here pass ``` -**4. Export in `__init__.py`:** +**6. Export in `toxicity/__init__.py`:** ```python -from agent_control_evaluators.acme_toxicity.config import AcmeToxicityEvaluatorConfig -from agent_control_evaluators.acme_toxicity.evaluator import AcmeToxicityEvaluator +from agent_control_evaluator_acme.toxicity.config import AcmeToxicityEvaluatorConfig +from agent_control_evaluator_acme.toxicity.evaluator import AcmeToxicityEvaluator __all__ = ["AcmeToxicityEvaluator", "AcmeToxicityEvaluatorConfig"] ``` -**5. Register entry point in `evaluators/pyproject.toml`:** -```toml -[project.optional-dependencies] -acme = ["httpx>=0.24.0"] # Your dependencies -all = ["httpx>=0.24.0"] # Include in 'all' extra - -[project.entry-points."agent_control.evaluators"] -regex = "agent_control_evaluators.regex:RegexEvaluator" -list = "agent_control_evaluators.list:ListEvaluator" -# ... existing entries ... -"acme.toxicity" = "agent_control_evaluators.acme_toxicity:AcmeToxicityEvaluator" +**7. Add tests in `tests/`** and publish: +```bash +uv run pytest +uv build && uv publish ``` -**6. 
Add tests in `evaluators/tests/acme_toxicity/`** +Once published, users install via `pip install agent-control-evaluator-acme` and the +evaluator is automatically discovered via entry points ### Creating a Built-in Evaluator -For evaluators with no external dependencies (to be included in core), follow the same pattern but: -- Use simple name: `name="my-evaluator"` (no slash) -- No `is_available()` override needed -- Import directly in `evaluators/src/agent_control_evaluators/__init__.py` for auto-registration: - ```python - from agent_control_evaluators.my_evaluator import MyEvaluator, MyEvaluatorConfig - ``` +For evaluators with no external dependencies (to be included in core): + +1. Create directory: `evaluators/builtin/src/agent_control_evaluators/my_evaluator/` +2. Add `config.py` extending `EvaluatorConfig` +3. Add `evaluator.py` with `@register_evaluator` and simple name: `name="my-evaluator"` +4. Add entry point in `evaluators/builtin/pyproject.toml` +5. Import in `evaluators/builtin/src/agent_control_evaluators/__init__.py` for auto-registration: + ```python + from agent_control_evaluators.my_evaluator import MyEvaluator, MyEvaluatorConfig + ``` ### Evaluator Best Practices @@ -533,7 +517,8 @@ Update `version` in respective `pyproject.toml` files: - `server/pyproject.toml` - `sdks/python/pyproject.toml` - `engine/pyproject.toml` -- `evaluators/pyproject.toml` +- `evaluators/builtin/pyproject.toml` +- `evaluators/extra/galileo/pyproject.toml` (and other external packages) --- @@ -581,16 +566,17 @@ test: add control set integration tests See the **Evaluators** section above for detailed instructions. Summary: -1. Decide on evaluator type (built-in vs external) -2. Create directory: `evaluators/src/agent_control_evaluators/my_evaluator/` -3. Add `config.py` extending `EvaluatorConfig` -4. Add `evaluator.py` with `@register_evaluator` decorator -5. 
Add entry point in `evaluators/pyproject.toml`: - - Built-in: `my-evaluator = "..."` - - External: `"provider/name" = "..."` -6. Add optional dependencies if needed -7. Add tests in `evaluators/tests/` -8. Update `docs/OVERVIEW.md` with usage examples +**Built-in evaluator:** +1. Create directory: `evaluators/builtin/src/agent_control_evaluators/my_evaluator/` +2. Add `config.py` extending `EvaluatorConfig` +3. Add `evaluator.py` with `@register_evaluator` decorator +4. Add entry point in `evaluators/builtin/pyproject.toml` +5. Add tests in `evaluators/builtin/tests/` + +**External evaluator (separate package):** +1. Copy template: `cp -r evaluators/extra/template evaluators/extra/myorg` +2. Create package with own `pyproject.toml` and entry points +3. Add tests and publish to PyPI ### Update shared models @@ -657,7 +643,7 @@ else: # Built-in **External evaluators** (`galileo.luna2`): - Use `provider.name` format with dot separator -- Have optional dependencies (install via extras: `pip install agent-control-evaluators[luna2]`) +- Are separate packages (e.g., `pip install agent-control-evaluator-galileo` or `pip install agent-control-evaluators[galileo]`) - Discovered via Python entry points (not auto-imported) ### Agent-Scoped Evaluators @@ -702,9 +688,9 @@ Controls reference them as `my-agent:pii-detector` (the `:` indicates agent scop | Evaluator metadata | `agent_control_evaluators.EvaluatorMetadata` | | Evaluator result | `agent_control_models.EvaluatorResult` | | Register decorator | `@agent_control_evaluators.register_evaluator` | -| Built-in evaluators | `evaluators/src/agent_control_evaluators/{regex,list,json,sql}/` | -| External evaluators | `evaluators/src/agent_control_evaluators/galileo_luna2/` | -| Evaluator tests | `evaluators/tests/` | +| Built-in evaluators | `evaluators/builtin/src/agent_control_evaluators/{regex,list,json,sql}/` | +| External evaluators | `evaluators/extra/galileo/` (separate packages) | +| Evaluator tests | 
`evaluators/builtin/tests/` or `evaluators/extra/*/tests/` | **Naming convention quick reference:** ``` diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index 0b832279..27be6e2d 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -594,11 +594,13 @@ class ContentModerationEvaluator(Evaluator[ContentModerationEvaluatorConfig]): # Core SDK pip install agent-control -# Server (with Luna-2 support) -pip install agent-control-server[luna2] +# Server +pip install agent-control-server -# Or install everything -pip install agent-control-server[all] +# With Luna-2 evaluator support (Galileo) +pip install agent-control-evaluator-galileo +# Or via convenience extra: +pip install agent-control-server[luna2] ``` ### Quick Start diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 53328e13..4468149c 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -387,10 +387,14 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real **Evaluator name**: `galileo.luna2` -**Installation**: Luna-2 requires an optional dependency: +**Installation**: Luna-2 is available as a separate package: ```bash -pip install agent-control-evaluators[luna2] +# Direct install +pip install agent-control-evaluator-galileo + +# Or via convenience extra +pip install agent-control-evaluators[galileo] ``` **Requirements**: Set `GALILEO_API_KEY` environment variable where evaluations run (on the server for server-side controls, or in the client environment for local controls). @@ -961,12 +965,12 @@ make alembic-upgrade ### Luna-2 Evaluator Errors -1. Ensure `httpx` is installed: `pip install agent-control-evaluators[luna2]` +1. Ensure the Galileo package is installed: `pip install agent-control-evaluator-galileo` (or `pip install agent-control-evaluators[galileo]`) 2. Ensure `GALILEO_API_KEY` is set 3. Check network connectivity to Galileo API 4. Verify the metric name is valid 5. 
Check `on_error` setting if failures are silently allowed **Evaluator Not Found**: If `galileo.luna2` doesn't appear in `list_evaluators()`: -- Verify `httpx` is installed (Luna-2's `is_available()` returns `False` without it) +- Verify the Galileo package is installed - Check server logs for evaluator discovery messages diff --git a/engine/pyproject.toml b/engine/pyproject.toml index 8363e52c..2dc4c15b 100644 --- a/engine/pyproject.toml +++ b/engine/pyproject.toml @@ -4,8 +4,8 @@ version = "2.1.0" description = "Control execution engine for Agent Control" requires-python = ">=3.12" dependencies = [ - "agent-control-models>=0.1.0", - "agent-control-evaluators>=0.1.0", + "agent-control-models>=3.0.0", + "agent-control-evaluators>=3.0.0", "google-re2>=1.1", ] authors = [ diff --git a/evaluators/builtin/pyproject.toml b/evaluators/builtin/pyproject.toml index 2b0e8edd..6569b079 100644 --- a/evaluators/builtin/pyproject.toml +++ b/evaluators/builtin/pyproject.toml @@ -15,7 +15,8 @@ dependencies = [ ] [project.optional-dependencies] -# NOTE: galileo extra commented out during local dev - package not yet on PyPI +# TODO: Uncomment galileo extra once agent-control-evaluator-galileo is published to PyPI +# Tracking: This enables `pip install agent-control-evaluators[galileo]` convenience install # galileo = ["agent-control-evaluator-galileo>=3.0.0"] dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] diff --git a/evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py index 99760f2d..a4f27e6e 100644 --- a/evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py +++ b/evaluators/builtin/src/agent_control_evaluators/sql/evaluator.py @@ -412,7 +412,9 @@ def _check_limits( for analysis in analyses: # Use pre-computed SELECT nodes for select_node in analysis.select_nodes: - limit_node = select_node.find(exp.Limit) + # Use args.get() to get direct LIMIT child, not find() which searches descendants 
+ # This prevents a subquery's LIMIT from being attributed to the outer query + limit_node = select_node.args.get("limit") # Check if LIMIT is required but missing if self.config.require_limit and not limit_node: @@ -443,8 +445,8 @@ def _check_limits( if limit_node: limit_value = self._extract_limit_value(limit_node) - # Extract OFFSET value if present - offset_node = select_node.find(exp.Offset) + # Extract OFFSET value if present (use args.get() for direct child only) + offset_node = select_node.args.get("offset") offset_value = 0 if offset_node: offset_value = self._extract_offset_value(offset_node) or 0 diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 4efff631..3343aebd 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -35,7 +35,8 @@ Documentation = "https://github.com/yourusername/agent-control#readme" Repository = "https://github.com/yourusername/agent-control" [project.optional-dependencies] -# NOTE: luna2 extra commented out during local dev - package not yet on PyPI +# TODO: Uncomment luna2 extra once agent-control-evaluator-galileo is published to PyPI +# Tracking: This enables `pip install agent-control[luna2]` convenience install # luna2 = [ # "agent-control-evaluator-galileo>=3.0.0", # ] diff --git a/server/pyproject.toml b/server/pyproject.toml index 26b42e16..627d57c7 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -29,7 +29,8 @@ readme = "README.md" license = {text = "Apache-2.0"} [project.optional-dependencies] -# NOTE: luna2 extra commented out during local dev - package not yet on PyPI +# TODO: Uncomment luna2 extra once agent-control-evaluator-galileo is published to PyPI +# Tracking: This enables `pip install agent-control-server[luna2]` convenience install # luna2 = [ # "agent-control-evaluator-galileo>=3.0.0", # ] From 3464063ecb2a9c5570db9536847488f9b5dfb2e5 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Tue, 3 Feb 2026 19:59:41 +0530 Subject: [PATCH 16/21] 
fix(engine): sync __version__ with pyproject.toml (2.1.0) --- engine/src/agent_control_engine/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/src/agent_control_engine/__init__.py b/engine/src/agent_control_engine/__init__.py index b70e2307..fde72164 100644 --- a/engine/src/agent_control_engine/__init__.py +++ b/engine/src/agent_control_engine/__init__.py @@ -9,7 +9,7 @@ reset_evaluator_discovery, ) -__version__ = "0.1.0" +__version__ = "2.1.0" __all__ = [ "clear_evaluator_cache", From 3fbd87070d9c44a2133c129bca00930543c8f769 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Wed, 4 Feb 2026 15:35:05 +0530 Subject: [PATCH 17/21] refactor: remove __version__ from all packages Version is now single-sourced from pyproject.toml only. Use importlib.metadata.version("package-name") for runtime access. --- engine/src/agent_control_engine/__init__.py | 2 -- .../builtin/src/agent_control_evaluators/__init__.py | 2 -- .../src/agent_control_evaluator_galileo/__init__.py | 2 -- models/src/agent_control_models/__init__.py | 2 -- sdks/python/src/agent_control/__init__.py | 8 -------- server/src/agent_control_server/__init__.py | 3 --- 6 files changed, 19 deletions(-) diff --git a/engine/src/agent_control_engine/__init__.py b/engine/src/agent_control_engine/__init__.py index fde72164..8fb341f5 100644 --- a/engine/src/agent_control_engine/__init__.py +++ b/engine/src/agent_control_engine/__init__.py @@ -9,8 +9,6 @@ reset_evaluator_discovery, ) -__version__ = "2.1.0" - __all__ = [ "clear_evaluator_cache", "discover_evaluators", diff --git a/evaluators/builtin/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py index db1389de..5ccf1c86 100644 --- a/evaluators/builtin/src/agent_control_evaluators/__init__.py +++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py @@ -42,8 +42,6 @@ from agent_control_evaluators.regex import RegexEvaluator, RegexEvaluatorConfig from 
agent_control_evaluators.sql import SQLEvaluator, SQLEvaluatorConfig -__version__ = "3.0.0" - __all__ = [ # Core infrastructure "Evaluator", diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py index e6f23b3c..f4b05e5f 100644 --- a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py @@ -20,8 +20,6 @@ Luna2Operator, ) -__version__ = "3.0.0" - __all__ = [ "Luna2Evaluator", "Luna2EvaluatorConfig", diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py index 2903b7ab..2a07d389 100644 --- a/models/src/agent_control_models/__init__.py +++ b/models/src/agent_control_models/__init__.py @@ -1,7 +1,5 @@ """Agent Control Models - Shared data models for server and SDK.""" -__version__ = "0.1.0" - from .agent import ( BUILTIN_STEP_TYPES, STEP_TYPE_LLM, diff --git a/sdks/python/src/agent_control/__init__.py b/sdks/python/src/agent_control/__init__.py index 2ff847e7..7bd8c785 100644 --- a/sdks/python/src/agent_control/__init__.py +++ b/sdks/python/src/agent_control/__init__.py @@ -37,8 +37,6 @@ async def process(input: str) -> str: import os from collections.abc import Callable from datetime import UTC, datetime -from importlib.metadata import PackageNotFoundError -from importlib.metadata import version as get_version from typing import TYPE_CHECKING, Any, Literal, TypeVar from uuid import UUID @@ -1090,9 +1088,3 @@ async def main(): "ControlAction", "EvaluatorSpec", ] - -try: - __version__ = get_version("agent-control-sdk") -except PackageNotFoundError: - # Package not installed (e.g., running from source without install) - __version__ = "0.0.0.dev" diff --git a/server/src/agent_control_server/__init__.py b/server/src/agent_control_server/__init__.py index 6f1f59a2..6a120d48 100644 --- 
a/server/src/agent_control_server/__init__.py +++ b/server/src/agent_control_server/__init__.py @@ -1,4 +1 @@ """Agent Control Server - Server component for agent protection system.""" - -__version__ = "0.1.0" - From 2f2ce327e41ba4ef8b215944c07102336f53fd79 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Wed, 4 Feb 2026 16:11:13 +0530 Subject: [PATCH 18/21] fix(json): include field_constraints and field_patterns in extra-fields allow-list When allow_extra_fields=false, fields referenced only in field_constraints or field_patterns were incorrectly flagged as "extra fields". Now all four config options (required_fields, field_types, field_constraints, field_patterns) contribute to the allow-list. Also fixes README to use direct package install (agent-control-evaluator-galileo) instead of non-existent [galileo] extra, with TODO for enabling it post-publish. --- evaluators/builtin/README.md | 7 +- .../json/evaluator.py | 6 +- evaluators/builtin/tests/json/test_json.py | 74 +++++++++++++++++++ 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/evaluators/builtin/README.md b/evaluators/builtin/README.md index 4e558ea7..c384688c 100644 --- a/evaluators/builtin/README.md +++ b/evaluators/builtin/README.md @@ -38,11 +38,14 @@ Additional evaluators are available via separate packages: - `agent-control-evaluator-galileo` - Galileo Luna2 evaluator -Install convenience extras: ```bash -pip install agent-control-evaluators[galileo] +pip install agent-control-evaluator-galileo ``` + + ## Creating Custom Evaluators See [AGENTS.md](../../AGENTS.md) for guidance on creating new evaluators. 
diff --git a/evaluators/builtin/src/agent_control_evaluators/json/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/json/evaluator.py index 24fcfff1..35bbf169 100644 --- a/evaluators/builtin/src/agent_control_evaluators/json/evaluator.py +++ b/evaluators/builtin/src/agent_control_evaluators/json/evaluator.py @@ -242,10 +242,14 @@ def _check_types(self, data: dict | list) -> EvaluatorResult | None: # Get only leaf paths to avoid flagging parent containers actual_paths = self._get_all_paths(data, leaves_only=True) - # Include both field_types and required_fields as allowed paths + # Include all explicitly referenced fields as allowed paths specified_paths = set(self.config.field_types.keys()) if self.config.required_fields: specified_paths.update(self.config.required_fields) + if self.config.field_constraints: + specified_paths.update(self.config.field_constraints.keys()) + if self.config.field_patterns: + specified_paths.update(self.config.field_patterns.keys()) extra_paths = actual_paths - specified_paths if extra_paths: diff --git a/evaluators/builtin/tests/json/test_json.py b/evaluators/builtin/tests/json/test_json.py index 10908159..f1120fa5 100644 --- a/evaluators/builtin/tests/json/test_json.py +++ b/evaluators/builtin/tests/json/test_json.py @@ -310,6 +310,80 @@ async def test_strict_mode_top_level_extra_field_still_detected(self): assert result.matched is True # Failed assert "Extra fields not allowed" in result.message + @pytest.mark.asyncio + async def test_strict_mode_allows_field_constraints_fields(self): + """Test that fields in field_constraints are not flagged as extra. + + Regression test: field_constraints fields should be in the allow-list + when allow_extra_fields=False. 
+ """ + evaluator = JSONEvaluator( + JSONEvaluatorConfig( + field_types={"id": "string"}, + field_constraints={"score": {"min": 0.0, "max": 1.0}}, + allow_extra_fields=False, + ) + ) + # Should pass: "score" is referenced in field_constraints + result = await evaluator.evaluate({"id": "123", "score": 0.5}) + assert result.matched is False # Validation passed + assert "passed" in result.message.lower() + + @pytest.mark.asyncio + async def test_strict_mode_allows_field_patterns_fields(self): + """Test that fields in field_patterns are not flagged as extra. + + Regression test: field_patterns fields should be in the allow-list + when allow_extra_fields=False. + """ + evaluator = JSONEvaluator( + JSONEvaluatorConfig( + field_types={"id": "string"}, + field_patterns={"email": r"^.+@.+$"}, + allow_extra_fields=False, + ) + ) + # Should pass: "email" is referenced in field_patterns + result = await evaluator.evaluate({"id": "123", "email": "test@example.com"}) + assert result.matched is False # Validation passed + assert "passed" in result.message.lower() + + @pytest.mark.asyncio + async def test_strict_mode_with_all_field_references(self): + """Test strict mode with fields from all config options. + + Ensures that required_fields, field_types, field_constraints, and + field_patterns are all included in the allow-list. 
+ """ + evaluator = JSONEvaluator( + JSONEvaluatorConfig( + required_fields=["name"], + field_types={"id": "string"}, + field_constraints={"score": {"min": 0.0, "max": 1.0}}, + field_patterns={"email": r"^.+@.+$"}, + allow_extra_fields=False, + ) + ) + # Should pass: all fields are referenced in some config option + result = await evaluator.evaluate({ + "id": "123", + "name": "Test", + "score": 0.75, + "email": "test@example.com", + }) + assert result.matched is False # Validation passed + + # Should fail: "extra" is not referenced anywhere + result = await evaluator.evaluate({ + "id": "123", + "name": "Test", + "score": 0.75, + "email": "test@example.com", + "extra": "not allowed", + }) + assert result.matched is True # Failed + assert "Extra fields not allowed" in result.message + class TestConstraintsValidation: """Test field constraints validation mode.""" From 3c1eebb798a85b708c570ebf67c6a3764ac429e1 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Wed, 4 Feb 2026 18:12:55 +0530 Subject: [PATCH 19/21] feat(evaluators): add test-extras target for extra evaluator tests Wire galileo tests into the build system without affecting the default `make test` target. Developers can now run: - `make test-extras` for extra evaluators only - `make test-all` for core + extras - `make galileo-test/lint/typecheck` for galileo specifically Also adds pytest-cov to galileo dev dependencies for coverage support. 
--- Makefile | 32 ++++++++++++++++++++++-- evaluators/extra/galileo/Makefile | 33 +++++++++++++++++++++++++ evaluators/extra/galileo/pyproject.toml | 1 + 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 evaluators/extra/galileo/Makefile diff --git a/Makefile b/Makefile index 68b2bb86..b0c386d6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help sync test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build +.PHONY: help sync test test-extras test-all test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build galileo-test galileo-lint galileo-lint-fix galileo-typecheck galileo-build # Workspace package names PACK_MODELS := agent-control-models @@ -13,6 +13,7 @@ SERVER_DIR := server SDK_DIR := sdks/python ENGINE_DIR := engine EVALUATORS_DIR := evaluators/builtin +GALILEO_DIR := evaluators/extra/galileo help: @echo "Agent Control - Makefile commands" @@ -24,7 +25,9 @@ help: @echo " make server- - forward to server targets (e.g., server-help, server-alembic-upgrade)" @echo "" @echo "Test:" - @echo " make test - run tests for all members" + @echo " make test - run tests for core packages (server, engine, sdk, evaluators)" + @echo " make test-extras - run tests for extra evaluators (galileo, etc.)" + @echo " make test-all - run all tests (core + extras)" @echo "" @echo "Quality:" @echo " make lint - ruff check for all members" @@ -60,6 +63,12 @@ sync: test: server-test engine-test sdk-test evaluators-test +# Run tests for extra evaluators (not included in default test target) +test-extras: galileo-test + +# Run all 
tests (core + extras) +test-all: test test-extras + # Run tests, lint, and typecheck check: test lint typecheck @@ -150,3 +159,22 @@ evaluators-build: .PHONY: server-% server-%: $(MAKE) -C $(SERVER_DIR) $(patsubst server-%,%,$@) + +# --------------------------- +# Extra Evaluators (Galileo) +# --------------------------- + +galileo-test: + $(MAKE) -C $(GALILEO_DIR) test + +galileo-lint: + $(MAKE) -C $(GALILEO_DIR) lint + +galileo-lint-fix: + $(MAKE) -C $(GALILEO_DIR) lint-fix + +galileo-typecheck: + $(MAKE) -C $(GALILEO_DIR) typecheck + +galileo-build: + $(MAKE) -C $(GALILEO_DIR) build diff --git a/evaluators/extra/galileo/Makefile b/evaluators/extra/galileo/Makefile new file mode 100644 index 00000000..0deea340 --- /dev/null +++ b/evaluators/extra/galileo/Makefile @@ -0,0 +1,33 @@ +.PHONY: help sync test lint lint-fix typecheck build publish + +PACKAGE := agent-control-evaluator-galileo + +help: + @echo "Agent Control Evaluator - Galileo - Makefile commands" + @echo "" + @echo " make test - run pytest" + @echo " make lint - run ruff check" + @echo " make lint-fix - run ruff check --fix" + @echo " make typecheck - run mypy" + @echo " make build - build package" + +sync: + uv sync + +test: + uv run pytest --cov=src --cov-report=xml:../../../coverage-evaluators-galileo.xml -q + +lint: + uv run ruff check --config ../../../pyproject.toml src/ + +lint-fix: + uv run ruff check --config ../../../pyproject.toml --fix src/ + +typecheck: + uv run mypy --config-file ../../../pyproject.toml src/ + +build: + uv build + +publish: + uv publish diff --git a/evaluators/extra/galileo/pyproject.toml b/evaluators/extra/galileo/pyproject.toml index f14f7d14..21785e53 100644 --- a/evaluators/extra/galileo/pyproject.toml +++ b/evaluators/extra/galileo/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ dev = [ "pytest>=8.0.0", "pytest-asyncio>=0.23.0", + "pytest-cov>=4.0.0", "ruff>=0.1.0", "mypy>=1.8.0", ] From 0b68789ba83dbde0e150c7b691873dccaa1936f8 Mon Sep 17 00:00:00 2001 From: 
Abhinav Gupta Date: Thu, 5 Feb 2026 13:10:24 +0530 Subject: [PATCH 20/21] feat: add dynamic __version__ to all packages using importlib.metadata Reads version from package metadata at runtime instead of hardcoding. This keeps pyproject.toml as the single source of truth for versions. --- engine/src/agent_control_engine/__init__.py | 7 +++++++ .../builtin/src/agent_control_evaluators/__init__.py | 7 +++++++ .../src/agent_control_evaluator_galileo/__init__.py | 7 +++++++ models/src/agent_control_models/__init__.py | 7 +++++++ sdks/python/src/agent_control/__init__.py | 7 +++++++ server/src/agent_control_server/__init__.py | 7 +++++++ 6 files changed, 42 insertions(+) diff --git a/engine/src/agent_control_engine/__init__.py b/engine/src/agent_control_engine/__init__.py index 8fb341f5..8c8966c7 100644 --- a/engine/src/agent_control_engine/__init__.py +++ b/engine/src/agent_control_engine/__init__.py @@ -1,5 +1,12 @@ """Agent Control Engine - Rule execution logic and evaluator system.""" +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("agent-control-engine") +except PackageNotFoundError: + __version__ = "0.0.0.dev" + from agent_control_evaluators import ( clear_evaluator_cache, discover_evaluators, diff --git a/evaluators/builtin/src/agent_control_evaluators/__init__.py b/evaluators/builtin/src/agent_control_evaluators/__init__.py index 5ccf1c86..b1dabd9e 100644 --- a/evaluators/builtin/src/agent_control_evaluators/__init__.py +++ b/evaluators/builtin/src/agent_control_evaluators/__init__.py @@ -20,6 +20,13 @@ Their schemas are registered via initAgent for validation purposes. 
""" +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("agent-control-evaluators") +except PackageNotFoundError: + __version__ = "0.0.0.dev" + # Core infrastructure - export from _base and _registry from agent_control_evaluators._base import Evaluator, EvaluatorConfig, EvaluatorMetadata from agent_control_evaluators._discovery import ( diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py index f4b05e5f..6389087f 100644 --- a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py +++ b/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py @@ -12,6 +12,13 @@ pip install agent-control-evaluators[galileo] """ +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("agent-control-evaluator-galileo") +except PackageNotFoundError: + __version__ = "0.0.0.dev" + from agent_control_evaluator_galileo.luna2 import ( LUNA2_AVAILABLE, Luna2Evaluator, diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py index 2a07d389..262a0e8d 100644 --- a/models/src/agent_control_models/__init__.py +++ b/models/src/agent_control_models/__init__.py @@ -1,5 +1,12 @@ """Agent Control Models - Shared data models for server and SDK.""" +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("agent-control-models") +except PackageNotFoundError: + __version__ = "0.0.0.dev" + from .agent import ( BUILTIN_STEP_TYPES, STEP_TYPE_LLM, diff --git a/sdks/python/src/agent_control/__init__.py b/sdks/python/src/agent_control/__init__.py index 7bd8c785..4ec2a597 100644 --- a/sdks/python/src/agent_control/__init__.py +++ b/sdks/python/src/agent_control/__init__.py @@ -34,6 +34,13 @@ async def process(input: str) -> str: ) """ +from importlib.metadata import PackageNotFoundError, version + +try: + 
__version__ = version("agent-control-sdk") +except PackageNotFoundError: + __version__ = "0.0.0.dev" + import os from collections.abc import Callable from datetime import UTC, datetime diff --git a/server/src/agent_control_server/__init__.py b/server/src/agent_control_server/__init__.py index 6a120d48..782d1312 100644 --- a/server/src/agent_control_server/__init__.py +++ b/server/src/agent_control_server/__init__.py @@ -1 +1,8 @@ """Agent Control Server - Server component for agent protection system.""" + +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("agent-control-server") +except PackageNotFoundError: + __version__ = "0.0.0.dev" From 19c71a0c6be1f4c2baa85053ff7105fe513bcd33 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Thu, 5 Feb 2026 14:45:47 +0530 Subject: [PATCH 21/21] feat(release): publish evaluators and galileo to PyPI - Add build_evaluators() and build_evaluator_galileo() to build script - Add PyPI publish steps in dependency order: models -> evaluators -> sdk -> evaluator-galileo - Enable [galileo] convenience extra on evaluators, SDK, and server - Rename [luna2] extra to [galileo] for consistency with package naming - Add uv source overrides for local galileo development - Update documentation and examples to use [galileo] extra --- .github/workflows/release.yaml | 19 +++++++++++ docs/OVERVIEW.md | 4 +-- evaluators/builtin/README.md | 8 ++--- evaluators/builtin/pyproject.toml | 6 ++-- examples/galileo/README.md | 2 +- examples/galileo/luna2_demo.py | 4 +-- examples/galileo/pyproject.toml | 2 +- scripts/build.py | 56 +++++++++++++++++++++++++++---- sdks/python/pyproject.toml | 8 ++--- server/pyproject.toml | 8 ++--- 10 files changed, 88 insertions(+), 29 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5deb1858..f7f09dfb 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -42,6 +42,7 @@ jobs: uv sync uv run python 
scripts/build.py all + # Publish in dependency order: models -> evaluators -> sdk -> evaluator-galileo - name: Publish agent-control-models to PyPI if: steps.release.outputs.released == 'true' uses: pypa/gh-action-pypi-publish@release/v1 @@ -50,6 +51,14 @@ jobs: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} + - name: Publish agent-control-evaluators to PyPI + if: steps.release.outputs.released == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: evaluators/builtin/dist/ + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + - name: Publish agent-control-sdk to PyPI if: steps.release.outputs.released == 'true' uses: pypa/gh-action-pypi-publish@release/v1 @@ -58,6 +67,14 @@ jobs: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} + - name: Publish agent-control-evaluator-galileo to PyPI + if: steps.release.outputs.released == 'true' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: evaluators/extra/galileo/dist/ + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + - name: Upload to GitHub Release if: steps.release.outputs.released == 'true' uses: python-semantic-release/upload-to-gh-release@main @@ -67,5 +84,7 @@ jobs: root_options: "-vv" dist_glob: | models/dist/* + evaluators/builtin/dist/* sdks/python/dist/* server/dist/* + evaluators/extra/galileo/dist/* diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index 27be6e2d..87b4a2ed 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -597,10 +597,10 @@ pip install agent-control # Server pip install agent-control-server -# With Luna-2 evaluator support (Galileo) +# With Galileo Luna-2 evaluator support pip install agent-control-evaluator-galileo # Or via convenience extra: -pip install agent-control-server[luna2] +pip install agent-control-server[galileo] ``` ### Quick Start diff --git a/evaluators/builtin/README.md b/evaluators/builtin/README.md index c384688c..cd87fea3 100644 --- a/evaluators/builtin/README.md +++ 
b/evaluators/builtin/README.md @@ -39,12 +39,12 @@ Additional evaluators are available via separate packages: - `agent-control-evaluator-galileo` - Galileo Luna-2 evaluator ```bash +# Direct install pip install agent-control-evaluator-galileo -``` - +# Or via convenience extra +pip install agent-control-evaluators[galileo] +``` ## Creating Custom Evaluators diff --git a/evaluators/builtin/pyproject.toml b/evaluators/builtin/pyproject.toml index 6569b079..fb3a75c3 100644 --- a/evaluators/builtin/pyproject.toml +++ b/evaluators/builtin/pyproject.toml @@ -15,9 +15,7 @@ dependencies = [ ] [project.optional-dependencies] -# TODO: Uncomment galileo extra once agent-control-evaluator-galileo is published to PyPI -# Tracking: This enables `pip install agent-control-evaluators[galileo]` convenience install -# galileo = ["agent-control-evaluator-galileo>=3.0.0"] +galileo = ["agent-control-evaluator-galileo>=3.0.0"] dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] [project.entry-points."agent_control.evaluators"] @@ -35,3 +33,5 @@ packages = ["src/agent_control_evaluators"] [tool.uv.sources] agent-control-models = { workspace = true } +# For local dev: use local galileo package instead of PyPI +agent-control-evaluator-galileo = { path = "../extra/galileo", editable = true } diff --git a/examples/galileo/README.md b/examples/galileo/README.md index 2f801c1e..a52c43e4 100644 --- a/examples/galileo/README.md +++ b/examples/galileo/README.md @@ -78,7 +78,7 @@ Testing toxicity detection with Central Stage...
- **"GALILEO_API_KEY environment variable is required"**: Export your API key - **"Project not found"**: Set `GALILEO_PROJECT_NAME` to match your Galileo project - **"Stage not found"**: Set `GALILEO_STAGE_NAME` to match a stage in your project -- **Import errors**: Ensure you installed with `[luna2]` extra: `pip install agent-control-evaluators[luna2]` +- **Import errors**: Ensure you installed with `[galileo]` extra: `pip install agent-control-evaluators[galileo]` ### Documentation diff --git a/examples/galileo/luna2_demo.py b/examples/galileo/luna2_demo.py index 82989558..beda6484 100644 --- a/examples/galileo/luna2_demo.py +++ b/examples/galileo/luna2_demo.py @@ -14,7 +14,7 @@ python luna2_demo.py Requirements: - pip install agent-control-evaluators[luna2] + pip install agent-control-evaluators[galileo] """ import asyncio @@ -45,7 +45,7 @@ GALILEO_AVAILABLE = True except ImportError as e: print(f"❌ agent-control-evaluators not available: {e}") - print(" Install with: pip install agent-control-evaluators[luna2]") + print(" Install with: pip install agent-control-evaluators[galileo]") sys.exit(1) diff --git a/examples/galileo/pyproject.toml b/examples/galileo/pyproject.toml index a5e6510e..c2744ce8 100644 --- a/examples/galileo/pyproject.toml +++ b/examples/galileo/pyproject.toml @@ -5,7 +5,7 @@ description = "Agent Control Luna-2 Galileo Protect Integration Example" readme = "README.md" requires-python = ">=3.12" dependencies = [ - "agent-control-evaluators[luna2]", + "agent-control-evaluators[galileo]", "httpx>=0.24.0", ] diff --git a/scripts/build.py b/scripts/build.py index 1a5340c8..b354127d 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -"""Build SDK and server packages with vendored dependencies. +"""Build packages for PyPI distribution. -This script copies internal packages (models, engine, evaluators) into the SDK and server -source directories before building, then cleans up afterward. 
This allows the published -wheels to be self-contained without requiring separate PyPI dependencies. +This script builds all publishable packages. For SDK and server, it copies internal +packages (models, engine) into the source directories before building, then cleans up +afterward. This allows the published wheels to be self-contained. Usage: - python scripts/build.py [models|sdk|server|all] + python scripts/build.py [models|evaluators|sdk|server|galileo|all] """ import shutil @@ -184,12 +184,52 @@ def build_server() -> None: shutil.rmtree(target) +def build_evaluators() -> None: + """Build agent-control-evaluators (standalone, no vendoring needed).""" + version = get_global_version() + evaluators_dir = ROOT / "evaluators" / "builtin" + + print(f"Building agent-control-evaluators v{version}") + + # Clean previous builds + dist_dir = evaluators_dir / "dist" + if dist_dir.exists(): + shutil.rmtree(dist_dir) + + # Set version + set_package_version(evaluators_dir / "pyproject.toml", version) + + subprocess.run(["uv", "build", "-o", str(dist_dir)], cwd=evaluators_dir, check=True) + print(f" Built agent-control-evaluators v{version}") + + +def build_evaluator_galileo() -> None: + """Build agent-control-evaluator-galileo (standalone, no vendoring needed).""" + version = get_global_version() + galileo_dir = ROOT / "evaluators" / "extra" / "galileo" + + print(f"Building agent-control-evaluator-galileo v{version}") + + # Clean previous builds + dist_dir = galileo_dir / "dist" + if dist_dir.exists(): + shutil.rmtree(dist_dir) + + # Set version + set_package_version(galileo_dir / "pyproject.toml", version) + + subprocess.run(["uv", "build", "-o", str(dist_dir)], cwd=galileo_dir, check=True) + print(f" Built agent-control-evaluator-galileo v{version}") + + def build_all() -> None: """Build all packages.""" print(f"Building all packages (version {get_global_version()})\n") build_models() + build_evaluators() build_sdk() build_server() + build_evaluator_galileo() print("\nAll 
packages built successfully!") @@ -200,12 +240,16 @@ def build_all() -> None: if target == "models": build_models() + elif target == "evaluators": + build_evaluators() elif target == "sdk": build_sdk() elif target == "server": build_server() + elif target == "galileo": + build_evaluator_galileo() elif target == "all": build_all() else: - print("Usage: python scripts/build.py [models|sdk|server|all]") + print("Usage: python scripts/build.py [models|evaluators|sdk|server|galileo|all]") sys.exit(1) diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 3343aebd..e747487c 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -35,11 +35,7 @@ Documentation = "https://github.com/yourusername/agent-control#readme" Repository = "https://github.com/yourusername/agent-control" [project.optional-dependencies] -# TODO: Uncomment luna2 extra once agent-control-evaluator-galileo is published to PyPI -# Tracking: This enables `pip install agent-control[luna2]` convenience install -# luna2 = [ -# "agent-control-evaluator-galileo>=3.0.0", -# ] +galileo = ["agent-control-evaluator-galileo>=3.0.0"] [dependency-groups] dev = [ @@ -82,3 +78,5 @@ known-first-party = ["agent_control"] agent-control-models = { workspace = true } agent-control-engine = { workspace = true } agent-control-evaluators = { workspace = true } +# For local dev: use local galileo package instead of PyPI +agent-control-evaluator-galileo = { path = "../../evaluators/extra/galileo", editable = true } diff --git a/server/pyproject.toml b/server/pyproject.toml index 627d57c7..b6409a57 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -29,11 +29,7 @@ readme = "README.md" license = {text = "Apache-2.0"} [project.optional-dependencies] -# TODO: Uncomment luna2 extra once agent-control-evaluator-galileo is published to PyPI -# Tracking: This enables `pip install agent-control-server[luna2]` convenience install -# luna2 = [ -# "agent-control-evaluator-galileo>=3.0.0", 
-# ] +galileo = ["agent-control-evaluator-galileo>=3.0.0"] [dependency-groups] dev = [ @@ -91,3 +87,5 @@ known-first-party = ["agent_control_server"] agent-control-models = { workspace = true } agent-control-engine = { workspace = true } agent-control-evaluators = { workspace = true } +# For local dev: use local galileo package instead of PyPI +agent-control-evaluator-galileo = { path = "../evaluators/extra/galileo", editable = true }