|
21 | 21 | from .llm_interaction_handler import LLMInteractionHandler |
22 | 22 | from .llm_provider import LLMProvider |
23 | 23 | from .models import BBox, Snapshot, StepHookContext |
24 | | -from .verification import AssertContext, AssertOutcome, Predicate |
| 24 | +from .verification import Predicate |
25 | 25 |
|
26 | 26 |
|
27 | 27 | @dataclass(frozen=True) |
@@ -55,6 +55,13 @@ class RuntimeStep: |
55 | 55 | max_vision_executor_attempts: int = 1 |
56 | 56 |
|
57 | 57 |
|
@dataclass(frozen=True)
class ActOnceResult:
    """Immutable outcome of a single `act_once_result` call.

    Bundles the action that was proposed and executed with the snapshot the
    proposal was based on and a flag telling which proposal path produced it.
    """

    # Action string that was extracted from the model response and executed.
    action: str
    # Snapshot captured before the action ran (the one the proposal used).
    snap: Snapshot
    # True when the action came from the vision executor path, False when it
    # came from the structured snapshot-first path.
    used_vision: bool
| 63 | + |
| 64 | + |
58 | 65 | class RuntimeAgent: |
59 | 66 | """ |
60 | 67 | A thin orchestration layer over AgentRuntime: |
@@ -164,6 +171,128 @@ async def run_step( |
164 | 171 | ), |
165 | 172 | ) |
166 | 173 |
|
async def act_once(
    self,
    *,
    task_goal: str,
    step: RuntimeStep,
    allow_vision_fallback: bool = True,
    history_summary: str = "",
    compact_prompt_builder: Callable[
        [str, str, str, Snapshot, str], tuple[str, str]
    ]
    | None = None,
    dom_context_postprocessor: Callable[[str], str] | None = None,
) -> str:
    """
    Execute exactly one action for a step without owning step lifecycle.

    This helper is designed for orchestration layers (e.g. WebBench) that already
    call `runtime.begin_step(...)` / `runtime.emit_step_end(...)` and want to
    reuse RuntimeAgent's snapshot-first action proposal + execution logic without:
    - double-counting step budgets
    - emitting duplicate step_start/step_end events

    All arguments are forwarded unchanged to `act_once_result`; only the
    action string of its result is returned.

    Returns:
        Action string (e.g. 'CLICK(123)', 'TYPE(5, "foo")', 'PRESS("Enter")', 'FINISH()')
    """
    result = await self.act_once_result(
        task_goal=task_goal,
        step=step,
        allow_vision_fallback=allow_vision_fallback,
        history_summary=history_summary,
        compact_prompt_builder=compact_prompt_builder,
        dom_context_postprocessor=dom_context_postprocessor,
    )
    return result.action
| 208 | + |
async def act_once_with_snapshot(
    self,
    *,
    task_goal: str,
    step: RuntimeStep,
    allow_vision_fallback: bool = True,
    history_summary: str = "",
    compact_prompt_builder: Callable[
        [str, str, str, Snapshot, str], tuple[str, str]
    ]
    | None = None,
    dom_context_postprocessor: Callable[[str], str] | None = None,
) -> tuple[str, Snapshot]:
    """
    Like `act_once`, but also returns the pre-action snapshot used for proposal.

    All arguments are forwarded unchanged to `act_once_result`.

    Returns:
        `(action, snap)` where `action` is the executed action string and
        `snap` is the snapshot that was captured before the action ran.
    """
    result = await self.act_once_result(
        task_goal=task_goal,
        step=step,
        allow_vision_fallback=allow_vision_fallback,
        history_summary=history_summary,
        compact_prompt_builder=compact_prompt_builder,
        dom_context_postprocessor=dom_context_postprocessor,
    )
    return result.action, result.snap
| 234 | + |
async def act_once_result(
    self,
    *,
    task_goal: str,
    step: RuntimeStep,
    allow_vision_fallback: bool = True,
    history_summary: str = "",
    compact_prompt_builder: Callable[
        [str, str, str, Snapshot, str], tuple[str, str]
    ]
    | None = None,
    dom_context_postprocessor: Callable[[str], str] | None = None,
) -> ActOnceResult:
    """
    Like `act_once`, but returns action + proposal snapshot + whether vision was used.

    Flow:
      1. Capture a snapshot for the step.
      2. If vision fallback is allowed, the short-circuit check passes, and a
         vision-capable executor is configured, propose the action from a
         screenshot, execute it, and return with `used_vision=True`.
      3. Otherwise build the DOM context from the snapshot, propose the action
         with the text executor, execute it, and return with `used_vision=False`.

    Args:
        task_goal: Overall task description included in the prompt.
        step: Step being executed; `step.goal` is used for context building
            and prompting.
        allow_vision_fallback: When False, the vision short-circuit check is
            skipped entirely.
        history_summary: Optional summary of recent steps; added to the
            prompt when non-empty.
        compact_prompt_builder: Optional callable receiving
            `(task_goal, step.goal, dom_context, snap, history_summary)` and
            returning `(system_prompt, user_prompt)`. When given, the prompts
            are sent through `self.executor.generate`; otherwise
            `self._structured_llm.query_llm` is used.
        dom_context_postprocessor: Optional callable applied to the DOM
            context string before it is used in any prompt.

    Returns:
        ActOnceResult holding the executed action, the pre-action snapshot,
        and the `used_vision` flag.
    """
    snap = await self._snapshot_with_ramp(step=step)

    # NOTE(review): the three LLM calls below (`generate_with_image`,
    # `generate`, `query_llm`) are invoked without `await`. If they perform
    # blocking network I/O they will stall the event loop for the duration
    # of the request -- confirm, and consider `asyncio.to_thread` if so.

    # Optional short-circuit to vision (bounded by caller).
    if allow_vision_fallback and await self._should_short_circuit_to_vision(
        step=step, snap=snap
    ):
        vision = self.vision_executor
        if vision and vision.supports_vision():
            url = await self._get_url_for_prompt()
            image_b64 = await self._screenshot_base64_png()
            system_prompt, user_prompt = self._vision_executor_prompts(
                task_goal=task_goal,
                step=step,
                url=url,
                snap=snap,
            )
            resp = vision.generate_with_image(
                system_prompt,
                user_prompt,
                image_b64,
                temperature=0.0,
            )
            action = self._extract_action_from_text(resp.content)
            await self._execute_action(action=action, snap=snap)
            return ActOnceResult(action=action, snap=snap, used_vision=True)
        # No usable vision executor: fall through to the structured path.

    # Structured snapshot-first proposal.
    dom_context = self._structured_llm.build_context(snap, step.goal)
    if dom_context_postprocessor is not None:
        dom_context = dom_context_postprocessor(dom_context)

    if compact_prompt_builder is not None:
        system_prompt, user_prompt = compact_prompt_builder(
            task_goal, step.goal, dom_context, snap, history_summary or ""
        )
        resp = self.executor.generate(system_prompt, user_prompt, temperature=0.0)
    else:
        combined_goal = task_goal
        if history_summary:
            combined_goal = f"{task_goal}\n\nRECENT STEPS:\n{history_summary}"
        combined_goal = f"{combined_goal}\n\nSTEP: {step.goal}"
        resp = self._structured_llm.query_llm(dom_context, combined_goal)

    # Both prompt styles share the same action extraction and execution.
    action = self._structured_llm.extract_action(resp.content)
    await self._execute_action(action=action, snap=snap)
    return ActOnceResult(action=action, snap=snap, used_vision=False)
| 295 | + |
167 | 296 | async def _run_hook( |
168 | 297 | self, |
169 | 298 | hook: Callable[[StepHookContext], Any] | None, |
|
0 commit comments