From 50150ca5714be18b591b03bb548e03fdcb563250 Mon Sep 17 00:00:00 2001 From: Yash1hi Date: Thu, 5 Mar 2026 14:56:58 -0800 Subject: [PATCH 1/4] feat: add otel_link_id support for SDK/trace record deduplication Generate a unique otel_link_id per testcase execution in run_and_evaluate and async_run_and_evaluate, pass it to the user's system function via SystemOptions, and include it in the createRecord API call via extra_body. BREAKING CHANGE: system function signature now includes a SystemOptions parameter as the third argument containing { otel_link_id: str }. --- src/scorecard_ai/lib/__init__.py | 2 ++ src/scorecard_ai/lib/_helpers.py | 30 ++++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/scorecard_ai/lib/__init__.py b/src/scorecard_ai/lib/__init__.py index b0b981e..f70584b 100644 --- a/src/scorecard_ai/lib/__init__.py +++ b/src/scorecard_ai/lib/__init__.py @@ -1,6 +1,7 @@ from ._helpers import ( run_and_evaluate, async_run_and_evaluate, + SystemOptions, ) from .wrap_llms import ( wrap, @@ -18,6 +19,7 @@ __all__ = [ "run_and_evaluate", "async_run_and_evaluate", + "SystemOptions", "StopCheck", "StopChecks", "ChatMessage", diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py index bca11ff..ed09348 100644 --- a/src/scorecard_ai/lib/_helpers.py +++ b/src/scorecard_ai/lib/_helpers.py @@ -5,6 +5,7 @@ from __future__ import annotations import asyncio +import uuid from typing import Any, Dict, List, TypeVar, Callable, Coroutine from collections.abc import Generator, AsyncGenerator from typing_extensions import TypedDict @@ -20,6 +21,15 @@ _T = TypeVar("_T") +class SystemOptions(TypedDict): + """Options passed to the system function for each testcase execution.""" + + otel_link_id: str + """A unique ID for linking this execution with its OpenTelemetry trace. + Set this as an attribute on your OTel span (e.g. ``scorecard.otel_link_id``) + to deduplicate SDK records with trace-created records.""" + + def _omit_if_not_given(value: _T | NotGiven) -> _T | Omit: """ Converts NotGiven sentinel to Omit sentinel for API calls. @@ -82,7 +92,7 @@ def run_and_evaluate( testset_id: str | NotGiven = NOT_GIVEN, testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN, system_version_id: str | NotGiven = NOT_GIVEN, - system: Callable[[SystemInput, SystemVersion | None], SystemOutput], + system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput], trials: int = 1, ) -> RunResponse: """ @@ -103,7 +113,8 @@ def run_and_evaluate( system_version_id: The ID of the SystemVersion to use for the run. - system: The system to run on the Testset. + system: The system to run on the Testset. Receives the testcase input, system version (or None), + and a SystemOptions dict containing ``otel_link_id`` for trace deduplication. trials: The number of times to run the system on each Testcase. """ @@ -134,13 +145,16 @@ def run_and_evaluate( # Run each Testcase sequentially for testcase in testcase_iter: for _ in range(trials): - model_response = system(testcase["inputs"], system_version) + otel_link_id = str(uuid.uuid4()) + options = SystemOptions(otel_link_id=otel_link_id) + model_response = system(testcase["inputs"], system_version, options) client.records.create( run_id=run.id, testcase_id=_omit_if_not_given(testcase["id"]), inputs=testcase["inputs"], expected=testcase["expected"], outputs=model_response, + extra_body={"otelLinkId": otel_link_id}, ) return RunResponse(id=run.id, url=_get_run_url(client, project_id, run.id)) @@ -154,7 +168,7 @@ async def async_run_and_evaluate( testset_id: str | NotGiven = NOT_GIVEN, testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN, system_version_id: str | NotGiven = NOT_GIVEN, - system: Callable[[SystemInput, SystemVersion | None], SystemOutput], + system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput], trials: int = 1, ) -> RunResponse: """ @@ -175,7 +189,8 @@ async def async_run_and_evaluate( system_version_id: The ID of the SystemVersion to use for the run. - system: The system to run on the Testset. + system: The system to run on the Testset. Receives the testcase input, system version (or None), + and a SystemOptions dict containing ``otel_link_id`` for trace deduplication. trials: The number of times to run the system on each Testcase. """ @@ -206,13 +221,16 @@ async def async_run_and_evaluate( def run_testcase( testcase: _SimpleTestcaseWithId, ) -> Coroutine[Any, Any, Record]: - model_response = system(testcase["inputs"], system_version) + otel_link_id = str(uuid.uuid4()) + options = SystemOptions(otel_link_id=otel_link_id) + model_response = system(testcase["inputs"], system_version, options) return client.records.create( run_id=run.id, testcase_id=_omit_if_not_given(testcase["id"]), inputs=testcase["inputs"], expected=testcase["expected"], outputs=model_response, + extra_body={"otelLinkId": otel_link_id}, ) # Create a Record for each Testcase From 7ce95ac229fa7bc096a464f6ec3890bb0896a00a Mon Sep 17 00:00:00 2001 From: Yash1hi Date: Fri, 6 Mar 2026 14:52:37 -0800 Subject: [PATCH 2/4] fix: make SystemOptions a non-breaking optional parameter Use inspect.signature to detect whether the user's system function accepts 3 positional args. If it does, pass SystemOptions; if not, call with just (inputs, system_version) as before. Existing 2-arg system functions continue to work unchanged. --- src/scorecard_ai/lib/_helpers.py | 43 ++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py index ed09348..b0aae0f 100644 --- a/src/scorecard_ai/lib/_helpers.py +++ b/src/scorecard_ai/lib/_helpers.py @@ -5,6 +5,7 @@ from __future__ import annotations import asyncio +import inspect import uuid from typing import Any, Dict, List, TypeVar, Callable, Coroutine from collections.abc import Generator, AsyncGenerator @@ -30,6 +31,22 @@ class SystemOptions(TypedDict): to deduplicate SDK records with trace-created records.""" +def _system_accepts_options(system: Callable[..., Any]) -> bool: + """Check if the system function accepts a third 'options' parameter.""" + try: + sig = inspect.signature(system) + params = list(sig.parameters.values()) + # Accepts options if it has 3+ positional params + positional_kinds = ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) + positional = [p for p in params if p.kind in positional_kinds] + return len(positional) >= 3 + except (ValueError, TypeError): + return False + + def _omit_if_not_given(value: _T | NotGiven) -> _T | Omit: """ Converts NotGiven sentinel to Omit sentinel for API calls. @@ -92,7 +109,7 @@ def run_and_evaluate( testset_id: str | NotGiven = NOT_GIVEN, testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN, system_version_id: str | NotGiven = NOT_GIVEN, - system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput], + system: Callable[..., SystemOutput], trials: int = 1, ) -> RunResponse: """ @@ -113,8 +130,8 @@ def run_and_evaluate( system_version_id: The ID of the SystemVersion to use for the run. - system: The system to run on the Testset. Receives the testcase input, system version (or None), - and a SystemOptions dict containing ``otel_link_id`` for trace deduplication. + system: The system to run on the Testset. Receives the testcase input and system version (or None). + Optionally accepts a third ``SystemOptions`` argument containing ``otel_link_id`` for trace deduplication. trials: The number of times to run the system on each Testcase. """ @@ -142,12 +159,17 @@ def run_and_evaluate( client.systems.versions.get(system_version_id) if not isinstance(system_version_id, NotGiven) else None ) + accepts_options = _system_accepts_options(system) + # Run each Testcase sequentially for testcase in testcase_iter: for _ in range(trials): otel_link_id = str(uuid.uuid4()) options = SystemOptions(otel_link_id=otel_link_id) - model_response = system(testcase["inputs"], system_version, options) + if accepts_options: + model_response = system(testcase["inputs"], system_version, options) + else: + model_response = system(testcase["inputs"], system_version) client.records.create( run_id=run.id, testcase_id=_omit_if_not_given(testcase["id"]), @@ -168,7 +190,7 @@ async def async_run_and_evaluate( testset_id: str | NotGiven = NOT_GIVEN, testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN, system_version_id: str | NotGiven = NOT_GIVEN, - system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput], + system: Callable[..., SystemOutput], trials: int = 1, ) -> RunResponse: """ @@ -189,8 +211,8 @@ async def async_run_and_evaluate( system_version_id: The ID of the SystemVersion to use for the run. - system: The system to run on the Testset. Receives the testcase input, system version (or None), - and a SystemOptions dict containing ``otel_link_id`` for trace deduplication. + system: The system to run on the Testset. Receives the testcase input and system version (or None). + Optionally accepts a third ``SystemOptions`` argument containing ``otel_link_id`` for trace deduplication. trials: The number of times to run the system on each Testcase. """ @@ -218,12 +240,17 @@ async def async_run_and_evaluate( await client.systems.versions.get(system_version_id) if not isinstance(system_version_id, NotGiven) else None ) + accepts_options = _system_accepts_options(system) + def run_testcase( testcase: _SimpleTestcaseWithId, ) -> Coroutine[Any, Any, Record]: otel_link_id = str(uuid.uuid4()) options = SystemOptions(otel_link_id=otel_link_id) - model_response = system(testcase["inputs"], system_version, options) + if accepts_options: + model_response = system(testcase["inputs"], system_version, options) + else: + model_response = system(testcase["inputs"], system_version) return client.records.create( run_id=run.id, testcase_id=_omit_if_not_given(testcase["id"]), From 75cebe4e1bd449956ed9673f9360e91a99e1404f Mon Sep 17 00:00:00 2001 From: Yash1hi Date: Fri, 6 Mar 2026 15:05:36 -0800 Subject: [PATCH 3/4] fix: resolve import sorting lint errors --- src/scorecard_ai/lib/__init__.py | 4 ++-- src/scorecard_ai/lib/_helpers.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/scorecard_ai/lib/__init__.py b/src/scorecard_ai/lib/__init__.py index f70584b..562b9c2 100644 --- a/src/scorecard_ai/lib/__init__.py +++ b/src/scorecard_ai/lib/__init__.py @@ -1,7 +1,7 @@ from ._helpers import ( - run_and_evaluate, - async_run_and_evaluate, SystemOptions, + async_run_and_evaluate, + run_and_evaluate, ) from .wrap_llms import ( wrap, diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py index b0aae0f..c3d264f 100644 --- a/src/scorecard_ai/lib/_helpers.py +++ b/src/scorecard_ai/lib/_helpers.py @@ -7,8 +7,9 @@ import asyncio import inspect import uuid -from typing import Any, Dict, List, TypeVar, Callable, Coroutine -from collections.abc import Generator, AsyncGenerator +from collections.abc import AsyncGenerator, Generator +from typing import Any, Callable, Coroutine, Dict, List, TypeVar + from typing_extensions import TypedDict from scorecard_ai import Scorecard, AsyncScorecard From 85214df2eb37d75727b4c8aacc2cd5c493d8858f Mon Sep 17 00:00:00 2001 From: Yash1hi Date: Fri, 6 Mar 2026 15:17:44 -0800 Subject: [PATCH 4/4] fix: use length-sorted imports per ruff config --- src/scorecard_ai/lib/__init__.py | 2 +- src/scorecard_ai/lib/_helpers.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/scorecard_ai/lib/__init__.py b/src/scorecard_ai/lib/__init__.py index 562b9c2..2e9aabf 100644 --- a/src/scorecard_ai/lib/__init__.py +++ b/src/scorecard_ai/lib/__init__.py @@ -1,7 +1,7 @@ from ._helpers import ( SystemOptions, - async_run_and_evaluate, run_and_evaluate, + async_run_and_evaluate, ) from .wrap_llms import ( wrap, diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py index c3d264f..6341e3a 100644 --- a/src/scorecard_ai/lib/_helpers.py +++ b/src/scorecard_ai/lib/_helpers.py @@ -4,12 +4,11 @@ from __future__ import annotations +import uuid import asyncio import inspect -import uuid -from collections.abc import AsyncGenerator, Generator -from typing import Any, Callable, Coroutine, Dict, List, TypeVar - +from typing import Any, Dict, List, TypeVar, Callable, Coroutine +from collections.abc import Generator, AsyncGenerator from typing_extensions import TypedDict from scorecard_ai import Scorecard, AsyncScorecard