From 50150ca5714be18b591b03bb548e03fdcb563250 Mon Sep 17 00:00:00 2001
From: Yash1hi <yash.thapliyal.007@gmail.com>
Date: Thu, 5 Mar 2026 14:56:58 -0800
Subject: [PATCH 1/4] feat: add otel_link_id support for SDK/trace record
 deduplication

Generate a unique otel_link_id (a UUID4 string) for each testcase
execution in run_and_evaluate and async_run_and_evaluate, pass it to
the user's system function via SystemOptions, and send it with the
createRecord API call as `otelLinkId` via extra_body.

BREAKING CHANGE: system function signature now includes a SystemOptions
parameter as the third argument containing { otel_link_id: str }.
---
 src/scorecard_ai/lib/__init__.py |  2 ++
 src/scorecard_ai/lib/_helpers.py | 30 ++++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/scorecard_ai/lib/__init__.py b/src/scorecard_ai/lib/__init__.py
index b0b981e..f70584b 100644
--- a/src/scorecard_ai/lib/__init__.py
+++ b/src/scorecard_ai/lib/__init__.py
@@ -1,6 +1,7 @@
 from ._helpers import (
     run_and_evaluate,
     async_run_and_evaluate,
+    SystemOptions,
 )
 from .wrap_llms import (
     wrap,
@@ -18,6 +19,7 @@
 __all__ = [
     "run_and_evaluate",
     "async_run_and_evaluate",
+    "SystemOptions",
     "StopCheck",
     "StopChecks",
     "ChatMessage",
diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py
index bca11ff..ed09348 100644
--- a/src/scorecard_ai/lib/_helpers.py
+++ b/src/scorecard_ai/lib/_helpers.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import asyncio
+import uuid
 from typing import Any, Dict, List, TypeVar, Callable, Coroutine
 from collections.abc import Generator, AsyncGenerator
 from typing_extensions import TypedDict
@@ -20,6 +21,15 @@
 _T = TypeVar("_T")
 
 
+class SystemOptions(TypedDict):
+    """Options passed to the system function for each testcase execution."""
+
+    otel_link_id: str
+    """A unique ID for linking this execution with its OpenTelemetry trace.
+    Set this as an attribute on your OTel span (e.g. ``scorecard.otel_link_id``)
+    to deduplicate SDK records with trace-created records."""
+
+
 def _omit_if_not_given(value: _T | NotGiven) -> _T | Omit:
     """
     Converts NotGiven sentinel to Omit sentinel for API calls.
@@ -82,7 +92,7 @@ def run_and_evaluate(
     testset_id: str | NotGiven = NOT_GIVEN,
     testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN,
     system_version_id: str | NotGiven = NOT_GIVEN,
-    system: Callable[[SystemInput, SystemVersion | None], SystemOutput],
+    system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput],
     trials: int = 1,
 ) -> RunResponse:
     """
@@ -103,7 +113,8 @@ def run_and_evaluate(
 
         system_version_id: The ID of the SystemVersion to use for the run.
 
-        system: The system to run on the Testset.
+        system: The system to run on the Testset. Receives the testcase input, system version (or None),
+            and a SystemOptions dict containing ``otel_link_id`` for trace deduplication.
 
         trials: The number of times to run the system on each Testcase.
     """
@@ -134,13 +145,16 @@ def run_and_evaluate(
     # Run each Testcase sequentially
     for testcase in testcase_iter:
         for _ in range(trials):
-            model_response = system(testcase["inputs"], system_version)
+            otel_link_id = str(uuid.uuid4())
+            options = SystemOptions(otel_link_id=otel_link_id)
+            model_response = system(testcase["inputs"], system_version, options)
             client.records.create(
                 run_id=run.id,
                 testcase_id=_omit_if_not_given(testcase["id"]),
                 inputs=testcase["inputs"],
                 expected=testcase["expected"],
                 outputs=model_response,
+                extra_body={"otelLinkId": otel_link_id},
             )
 
     return RunResponse(id=run.id, url=_get_run_url(client, project_id, run.id))
@@ -154,7 +168,7 @@ async def async_run_and_evaluate(
     testset_id: str | NotGiven = NOT_GIVEN,
     testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN,
     system_version_id: str | NotGiven = NOT_GIVEN,
-    system: Callable[[SystemInput, SystemVersion | None], SystemOutput],
+    system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput],
     trials: int = 1,
 ) -> RunResponse:
     """
@@ -175,7 +189,8 @@ async def async_run_and_evaluate(
 
         system_version_id: The ID of the SystemVersion to use for the run.
 
-        system: The system to run on the Testset.
+        system: The system to run on the Testset. Receives the testcase input, system version (or None),
+            and a SystemOptions dict containing ``otel_link_id`` for trace deduplication.
 
         trials: The number of times to run the system on each Testcase.
     """
@@ -206,13 +221,16 @@ async def async_run_and_evaluate(
     def run_testcase(
         testcase: _SimpleTestcaseWithId,
     ) -> Coroutine[Any, Any, Record]:
-        model_response = system(testcase["inputs"], system_version)
+        otel_link_id = str(uuid.uuid4())
+        options = SystemOptions(otel_link_id=otel_link_id)
+        model_response = system(testcase["inputs"], system_version, options)
         return client.records.create(
             run_id=run.id,
             testcase_id=_omit_if_not_given(testcase["id"]),
             inputs=testcase["inputs"],
             expected=testcase["expected"],
             outputs=model_response,
+            extra_body={"otelLinkId": otel_link_id},
         )
 
     # Create a Record for each Testcase

From 7ce95ac229fa7bc096a464f6ec3890bb0896a00a Mon Sep 17 00:00:00 2001
From: Yash1hi <yash.thapliyal.007@gmail.com>
Date: Fri, 6 Mar 2026 14:52:37 -0800
Subject: [PATCH 2/4] fix: make SystemOptions a non-breaking optional parameter

Use inspect.signature to detect whether the user's system function
declares at least three positional parameters. If it does, pass
SystemOptions as the third argument; if it does not (or its signature
cannot be inspected), call it with just (inputs, system_version) as
before. Existing 2-arg system functions continue to work unchanged.
---
 src/scorecard_ai/lib/_helpers.py | 43 ++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py
index ed09348..b0aae0f 100644
--- a/src/scorecard_ai/lib/_helpers.py
+++ b/src/scorecard_ai/lib/_helpers.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 import uuid
 from typing import Any, Dict, List, TypeVar, Callable, Coroutine
 from collections.abc import Generator, AsyncGenerator
@@ -30,6 +31,22 @@ class SystemOptions(TypedDict):
     to deduplicate SDK records with trace-created records."""
 
 
+def _system_accepts_options(system: Callable[..., Any]) -> bool:
+    """Check if the system function accepts a third 'options' parameter."""
+    try:
+        sig = inspect.signature(system)
+        params = list(sig.parameters.values())
+        # Accepts options if it has 3+ positional params
+        positional_kinds = (
+            inspect.Parameter.POSITIONAL_ONLY,
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        )
+        positional = [p for p in params if p.kind in positional_kinds]
+        return len(positional) >= 3
+    except (ValueError, TypeError):
+        return False
+
+
 def _omit_if_not_given(value: _T | NotGiven) -> _T | Omit:
     """
     Converts NotGiven sentinel to Omit sentinel for API calls.
@@ -92,7 +109,7 @@ def run_and_evaluate(
     testset_id: str | NotGiven = NOT_GIVEN,
     testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN,
     system_version_id: str | NotGiven = NOT_GIVEN,
-    system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput],
+    system: Callable[..., SystemOutput],
     trials: int = 1,
 ) -> RunResponse:
     """
@@ -113,8 +130,8 @@ def run_and_evaluate(
 
         system_version_id: The ID of the SystemVersion to use for the run.
 
-        system: The system to run on the Testset. Receives the testcase input, system version (or None),
-            and a SystemOptions dict containing ``otel_link_id`` for trace deduplication.
+        system: The system to run on the Testset. Receives the testcase input and system version (or None).
+            Optionally accepts a third ``SystemOptions`` argument containing ``otel_link_id`` for trace deduplication.
 
         trials: The number of times to run the system on each Testcase.
     """
@@ -142,12 +159,17 @@ def run_and_evaluate(
         client.systems.versions.get(system_version_id) if not isinstance(system_version_id, NotGiven) else None
     )
 
+    accepts_options = _system_accepts_options(system)
+
     # Run each Testcase sequentially
     for testcase in testcase_iter:
         for _ in range(trials):
             otel_link_id = str(uuid.uuid4())
             options = SystemOptions(otel_link_id=otel_link_id)
-            model_response = system(testcase["inputs"], system_version, options)
+            if accepts_options:
+                model_response = system(testcase["inputs"], system_version, options)
+            else:
+                model_response = system(testcase["inputs"], system_version)
             client.records.create(
                 run_id=run.id,
                 testcase_id=_omit_if_not_given(testcase["id"]),
@@ -168,7 +190,7 @@ async def async_run_and_evaluate(
     testset_id: str | NotGiven = NOT_GIVEN,
     testcases: List[Testcase] | List[SimpleTestcase] | NotGiven = NOT_GIVEN,
     system_version_id: str | NotGiven = NOT_GIVEN,
-    system: Callable[[SystemInput, SystemVersion | None, SystemOptions], SystemOutput],
+    system: Callable[..., SystemOutput],
     trials: int = 1,
 ) -> RunResponse:
     """
@@ -189,8 +211,8 @@ async def async_run_and_evaluate(
 
         system_version_id: The ID of the SystemVersion to use for the run.
 
-        system: The system to run on the Testset. Receives the testcase input, system version (or None),
-            and a SystemOptions dict containing ``otel_link_id`` for trace deduplication.
+        system: The system to run on the Testset. Receives the testcase input and system version (or None).
+            Optionally accepts a third ``SystemOptions`` argument containing ``otel_link_id`` for trace deduplication.
 
         trials: The number of times to run the system on each Testcase.
     """
@@ -218,12 +240,17 @@ async def async_run_and_evaluate(
         await client.systems.versions.get(system_version_id) if not isinstance(system_version_id, NotGiven) else None
     )
 
+    accepts_options = _system_accepts_options(system)
+
     def run_testcase(
         testcase: _SimpleTestcaseWithId,
     ) -> Coroutine[Any, Any, Record]:
         otel_link_id = str(uuid.uuid4())
         options = SystemOptions(otel_link_id=otel_link_id)
-        model_response = system(testcase["inputs"], system_version, options)
+        if accepts_options:
+            model_response = system(testcase["inputs"], system_version, options)
+        else:
+            model_response = system(testcase["inputs"], system_version)
         return client.records.create(
             run_id=run.id,
             testcase_id=_omit_if_not_given(testcase["id"]),

From 75cebe4e1bd449956ed9673f9360e91a99e1404f Mon Sep 17 00:00:00 2001
From: Yash1hi <yash.thapliyal.007@gmail.com>
Date: Fri, 6 Mar 2026 15:05:36 -0800
Subject: [PATCH 3/4] fix: resolve import sorting lint errors

---
 src/scorecard_ai/lib/__init__.py | 4 ++--
 src/scorecard_ai/lib/_helpers.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/scorecard_ai/lib/__init__.py b/src/scorecard_ai/lib/__init__.py
index f70584b..562b9c2 100644
--- a/src/scorecard_ai/lib/__init__.py
+++ b/src/scorecard_ai/lib/__init__.py
@@ -1,7 +1,7 @@
 from ._helpers import (
-    run_and_evaluate,
-    async_run_and_evaluate,
     SystemOptions,
+    async_run_and_evaluate,
+    run_and_evaluate,
 )
 from .wrap_llms import (
     wrap,
diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py
index b0aae0f..c3d264f 100644
--- a/src/scorecard_ai/lib/_helpers.py
+++ b/src/scorecard_ai/lib/_helpers.py
@@ -7,8 +7,9 @@
 import asyncio
 import inspect
 import uuid
-from typing import Any, Dict, List, TypeVar, Callable, Coroutine
-from collections.abc import Generator, AsyncGenerator
+from collections.abc import AsyncGenerator, Generator
+from typing import Any, Callable, Coroutine, Dict, List, TypeVar
+
 from typing_extensions import TypedDict
 
 from scorecard_ai import Scorecard, AsyncScorecard

From 85214df2eb37d75727b4c8aacc2cd5c493d8858f Mon Sep 17 00:00:00 2001
From: Yash1hi <yash.thapliyal.007@gmail.com>
Date: Fri, 6 Mar 2026 15:17:44 -0800
Subject: [PATCH 4/4] fix: use length-sorted imports per ruff config

---
 src/scorecard_ai/lib/__init__.py | 2 +-
 src/scorecard_ai/lib/_helpers.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/scorecard_ai/lib/__init__.py b/src/scorecard_ai/lib/__init__.py
index 562b9c2..2e9aabf 100644
--- a/src/scorecard_ai/lib/__init__.py
+++ b/src/scorecard_ai/lib/__init__.py
@@ -1,7 +1,7 @@
 from ._helpers import (
     SystemOptions,
-    async_run_and_evaluate,
     run_and_evaluate,
+    async_run_and_evaluate,
 )
 from .wrap_llms import (
     wrap,
diff --git a/src/scorecard_ai/lib/_helpers.py b/src/scorecard_ai/lib/_helpers.py
index c3d264f..6341e3a 100644
--- a/src/scorecard_ai/lib/_helpers.py
+++ b/src/scorecard_ai/lib/_helpers.py
@@ -4,12 +4,11 @@
 
 from __future__ import annotations
 
+import uuid
 import asyncio
 import inspect
-import uuid
-from collections.abc import AsyncGenerator, Generator
-from typing import Any, Callable, Coroutine, Dict, List, TypeVar
-
+from typing import Any, Dict, List, TypeVar, Callable, Coroutine
+from collections.abc import Generator, AsyncGenerator
 from typing_extensions import TypedDict
 
 from scorecard_ai import Scorecard, AsyncScorecard