From 815803dfe902091843ab1b0cec04054acfa9d20b Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Sun, 15 Mar 2026 22:02:37 -0700 Subject: [PATCH] new planner executor agent with abstracted webbench task --- planner_executor_local2/README.md | 533 ++++++++++++ planner_executor_local2/log.txt | 434 ++++++++++ planner_executor_local2/main.py | 1006 ++++++++++++++++++++++ planner_executor_local2/requirements.txt | 20 + 4 files changed, 1993 insertions(+) create mode 100644 planner_executor_local2/README.md create mode 100644 planner_executor_local2/log.txt create mode 100644 planner_executor_local2/main.py create mode 100644 planner_executor_local2/requirements.txt diff --git a/planner_executor_local2/README.md b/planner_executor_local2/README.md new file mode 100644 index 0000000..e001ad6 --- /dev/null +++ b/planner_executor_local2/README.md @@ -0,0 +1,533 @@ +# Planner + Executor Demo (SDK AutomationTask) + +This demo showcases the SDK's `PlannerExecutorAgent` with the abstracted `AutomationTask` model. Unlike `planner_executor_local` (which implements everything from scratch), this demo uses the SDK's built-in components. + +## Features + +- **AutomationTask**: Flexible task definition with categories, success criteria, and recovery +- **CAPTCHA Handling**: Multiple solver strategies (abort, human handoff, external solver) +- **Modal Dismissal**: Heuristic-based detection of modals and dialogs +- **Recovery/Rollback**: Automatic checkpoint creation and rollback on failure +- **Custom Heuristics**: Domain-specific element selection for e-commerce sites +- **Auth Boundary Detection**: Automatic detection and graceful handling of login/signin pages +- **Scroll-After-Escalation**: Adaptive viewport-based scrolling to find off-screen elements + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ AutomationTask │ +│ - task_id, starting_url, task (natural language) │ +│ - category (TRANSACTION, SEARCH, etc.) 
│ +│ - success_criteria, recovery config │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ PlannerExecutorAgent │ +├─────────────────────────────────────────────────────────────┤ +│ Planner (gpt-4o) │ Executor (gpt-4o-mini) │ +│ ───────────── │ ──────────────── │ +│ • JSON plan generation │ • Step execution │ +│ • Replanning on failure │ • Heuristics + LLM fallback │ +│ • Predicate verification │ • Vision fallback │ +├─────────────────────────────────────────────────────────────┤ +│ ComposableHeuristics │ RecoveryState │ +│ • Custom EcommerceHeuristics │ • Checkpoint tracking │ +│ • Common hints (add_to_cart) │ • URL-based rollback │ +├─────────────────────────────────────────────────────────────┤ +│ CaptchaConfig │ +│ • abort: fail fast │ • human: manual solve │ +│ • external: 2Captcha/etc │ • custom handler │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Quick Start + +### Prerequisites + +```bash +# Install SDK from local path +pip install -e ../../sdk-python + +# Install other dependencies +pip install -r requirements.txt + +# For OpenAI models (default) +export OPENAI_API_KEY=sk-... 
+ +# For local MLX models (Apple Silicon) +pip install mlx-lm + +# For local HuggingFace models +pip install torch transformers +``` + +### Usage Commands + +#### Default Search Task (OpenAI) + +Searches for "laptop" on Amazon, clicks first product, adds to cart, proceeds to checkout: + +```bash +python main.py +``` + +**Starting URL**: `https://www.amazon.com` + +#### Local LLM Models + +Use `--local` flag to switch from OpenAI to local models: + +```bash +# Default local models (MLX on Apple Silicon) +# Planner: mlx-community/Qwen3-8B-4bit +# Executor: mlx-community/Qwen3-4B-4bit +python main.py --local + +# Custom local models +python main.py --local --planner-model mlx-community/Qwen3-8B-4bit --executor-model mlx-community/Qwen3-4B-4bit + +# Use HuggingFace transformers instead of MLX +python main.py --local --provider hf --planner-model Qwen/Qwen2.5-7B-Instruct --executor-model Qwen/Qwen2.5-3B-Instruct +``` + +#### Custom Search Query + +```bash +# Search for a specific product +python main.py --query "wireless mouse" + +# Or via environment variable +AMAZON_QUERY="thinkpad laptop" python main.py + +# With local models +python main.py --local --query "thinkpad laptop" +``` + +**Starting URL**: `https://www.amazon.com` + +#### High-Level Goal (Less Defined Task) + +When you provide a `--goal`, the planner generates steps to achieve it. The browser still starts at Amazon: + +```bash +# With OpenAI +python main.py --goal "Find a good deal on headphones and add to cart" + +# With local models +python main.py --local --goal "Find a good deal on headphones and add to cart" +``` + +**Starting URL**: `https://www.amazon.com` (default, can be changed in code) + +The planner will: +1. Parse the high-level goal +2. Generate appropriate steps (navigate, search, click, etc.) +3. Include verification predicates for each step +4. 
Execute and verify each step + +#### CAPTCHA Handling + +```bash +# Abort on CAPTCHA (default) +python main.py + +# Human handoff - waits for you to solve CAPTCHA manually +CAPTCHA_MODE=human python main.py + +# External solver integration (requires API key in code) +CAPTCHA_MODE=external python main.py +``` + +#### Headless Mode + +```bash +# Run without visible browser window +python main.py --headless + +# Or via environment variable +HEADLESS=true python main.py +``` + +#### Debug Mode + +```bash +# Enable verbose logging +DEBUG=true python main.py +``` + +#### Combined Examples + +```bash +# Local models with human CAPTCHA solving +CAPTCHA_MODE=human python main.py --local --query "keyboard" + +# Local models with high-level goal and debug logging +DEBUG=true python main.py --local --goal "Find the cheapest laptop under $500" + +# OpenAI with custom models +PLANNER_MODEL=gpt-4-turbo EXECUTOR_MODEL=gpt-3.5-turbo python main.py + +# Local HuggingFace models with headless mode +python main.py --local --provider hf --headless +``` + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENAI_API_KEY` | (required for OpenAI) | OpenAI API key | +| `PLANNER_MODEL` | `gpt-4o` / `mlx-community/Qwen3-8B-4bit` | Model for planning | +| `EXECUTOR_MODEL` | `gpt-4o-mini` / `mlx-community/Qwen3-4B-4bit` | Model for execution | +| `CAPTCHA_MODE` | `abort` | `abort`, `human`, or `external` | +| `AMAZON_QUERY` | `laptop` | Default search query | +| `HEADLESS` | `false` | Run browser headless | +| `DEBUG` | `false` | Enable debug logging | + +### Command-Line Arguments + +| Argument | Description | +|----------|-------------| +| `--goal "..."` | High-level goal for the planner (less defined task) | +| `--query "..."` | Specific search query (overrides `AMAZON_QUERY`) | +| `--headless` | Run browser in headless mode | +| `--local` | Use local LLM models instead of OpenAI | +| `--provider {mlx,hf}` | Local model provider: 
`mlx` (Apple Silicon) or `hf` (HuggingFace) | +| `--planner-model "..."` | Override planner model name | +| `--executor-model "..."` | Override executor model name | + +### Website Navigation + +**All modes start at `https://www.amazon.com`** by default: + +| Mode | Starting URL | Task Generation | +|------|--------------|-----------------| +| Default | `https://www.amazon.com` | Search for `AMAZON_QUERY`, add to cart, checkout | +| `--query` | `https://www.amazon.com` | Search for specified query, add to cart, checkout | +| `--goal` | `https://www.amazon.com` | Planner generates steps based on goal | + +To use a different website, modify `starting_url` in `create_automation_task()` in `main.py`. + +## AutomationTask vs WebBenchTask + +The SDK's `AutomationTask` abstracts `WebBenchTask` for general-purpose automation: + +```python +# Old WebBenchTask approach (specific to webbench) +task = WebBenchTask( + id="task-001", + starting_url="https://amazon.com", + task="Search for laptop", + category="CREATE", # WebBench-specific category +) + +# New AutomationTask approach (SDK abstraction) +task = AutomationTask( + task_id="purchase-laptop", + starting_url="https://amazon.com", + task="Find a laptop under $1000 and add to cart", + category=TaskCategory.TRANSACTION, # Generic category + enable_recovery=True, + max_recovery_attempts=2, +) + +# Add success criteria +task = task.with_success_criteria( + {"predicate": "url_contains", "args": ["/cart"]}, + {"predicate": "exists", "args": [".cart-item"]}, +) +``` + +## CAPTCHA Handling + +### Abort Mode (Default) + +Fails immediately when CAPTCHA is detected: + +```bash +CAPTCHA_MODE=abort python main.py +``` + +### Human Handoff + +Waits for manual CAPTCHA solve: + +```bash +CAPTCHA_MODE=human python main.py +``` + +When a CAPTCHA appears, solve it in the browser window within 3 minutes. 
+ +### External Solver + +Integrate with services like 2Captcha or CapSolver: + +```bash +CAPTCHA_MODE=external python main.py +``` + +For production use, modify the `external_solver` function in `main.py`: + +```python +def external_solver(ctx: CaptchaContext) -> bool: + from twocaptcha import TwoCaptcha + solver = TwoCaptcha('YOUR_API_KEY') + + if ctx.captcha.type == "recaptcha": + result = solver.recaptcha( + sitekey=ctx.captcha.sitekey, + url=ctx.url, + ) + # Inject solution... + return True +``` + +## Custom Heuristics + +The demo includes `EcommerceHeuristics` for Amazon-specific element selection: + +```python +class EcommerceHeuristics: + def find_element_for_intent(self, intent, elements, url, goal): + if "add" in intent and "cart" in intent: + return self._find_add_to_cart(elements) + if "checkout" in intent: + return self._find_checkout_button(elements) + # ... + return None # Fall back to LLM +``` + +This allows element selection without LLM calls for common patterns: +- Search box detection +- Add to Cart button +- Checkout/Proceed button +- First product link (matches "Click on product title", "first product link", etc.) +- Modal dismiss buttons +- Cookie consent + +## High-Level Goals + +The planner handles less-defined tasks by generating appropriate steps. When you use `--goal`, the task description is passed directly to the planner without a pre-defined step template: + +```bash +# The planner will figure out what steps are needed +python main.py --goal "Find a good laptop deal and add to cart" + +# More specific goal +python main.py --goal "Search for wireless earbuds under $50 with good reviews" + +# Complex multi-step goal +python main.py --goal "Find a ThinkPad laptop, check the reviews, and add to cart if rating is above 4 stars" +``` + +**How it works:** + +1. The `AutomationTask` is created with only the high-level goal (no predefined steps) +2. The planner LLM (gpt-4o) analyzes the goal and current page +3. 
It generates a JSON plan with steps like: + - NAVIGATE to search + - CLICK search box + - TYPE_AND_SUBMIT query + - CLICK product link + - CLICK add to cart +4. Each step includes verification predicates +5. The executor runs each step with heuristics or LLM fallback +6. If steps fail, the planner replans + +**Example generated plan for "Find a good laptop deal":** +```json +{ + "task": "Find a good laptop deal and add to cart", + "steps": [ + {"id": 1, "goal": "Click search box", "action": "CLICK", "intent": "search_box"}, + {"id": 2, "goal": "Search for laptop", "action": "TYPE_AND_SUBMIT", "input": "laptop deals"}, + {"id": 3, "goal": "Click first product", "action": "CLICK", "intent": "first_product_link"}, + {"id": 4, "goal": "Add to cart", "action": "CLICK", "intent": "add_to_cart"} + ] +} +``` + +## Recovery and Rollback + +When enabled, the agent: +1. Creates checkpoints after each successful step +2. On failure, attempts recovery to the last checkpoint +3. Re-verifies page state and resumes + +```python +task = AutomationTask( + task_id="...", + starting_url="...", + task="...", + enable_recovery=True, + max_recovery_attempts=2, +) +``` + +## Auth Boundary Detection + +The agent automatically detects authentication boundaries (login/signin pages) and stops gracefully instead of attempting to log in or getting stuck: + +```python +from predicate.agents.planner_executor_agent import AuthBoundaryConfig + +# Configure auth boundary detection +auth_config = AuthBoundaryConfig( + enabled=True, + url_patterns=["/signin", "/login", "/auth", "/account/login"], + element_patterns=["sign in", "log in", "username", "password"], +) + +result = await agent.run( + task=task, + auth_boundary_config=auth_config, +) + +# Check if auth boundary was hit +if result.auth_boundary_hit: + print(f"Stopped at auth page: {result.auth_boundary_url}") +``` + +**Default behavior:** +- Detects URLs containing `/signin`, `/login`, `/auth`, etc. 
+- Detects form elements with signin/login labels +- Marks task as successful up to the auth boundary +- Reports which step hit the boundary + +## Scroll-After-Escalation + +When an element isn't found even after limit escalation, the agent can scroll the page to find off-screen elements: + +```python +from predicate.agents.planner_executor_agent import SnapshotEscalationConfig + +# Configure snapshot escalation with scrolling +escalation_config = SnapshotEscalationConfig( + enabled=True, + initial_limit=50, + max_limit=200, + + # Scroll configuration (viewport-adaptive) + scroll_after_escalation=True, + scroll_viewport_fraction=0.4, # Scroll by 40% of viewport height + scroll_max_attempts=3, + scroll_directions=["down", "up"], +) + +result = await agent.run( + task=task, + snapshot_escalation_config=escalation_config, +) +``` + +**Key parameters:** +- `scroll_viewport_fraction`: Fraction of viewport height to scroll (default: 0.4 = 40%) +- `scroll_max_attempts`: Maximum scroll attempts per direction (default: 3) +- `scroll_directions`: Which directions to try (default: `["down", "up"]`) + +**How it works:** +1. Initial element search with `initial_limit` elements +2. If not found, escalate to `max_limit` elements +3. If still not found and `scroll_after_escalation=True`: + - Scroll down by `viewport_height * scroll_viewport_fraction` + - Re-capture snapshot and search again + - Repeat up to `scroll_max_attempts` times + - Try opposite direction if needed +4. 
Adaptive scrolling prevents overshooting (uses viewport-relative distance) + +## Comparison with planner_executor_local + +| Feature | planner_executor_local | planner_executor_local2 (this demo) | +|---------|------------------------|-------------------------------------| +| Implementation | From scratch | SDK's PlannerExecutorAgent | +| Task model | Custom | AutomationTask | +| CAPTCHA | Manual integration | CaptchaConfig | +| Recovery | Custom | RecoveryState | +| Heuristics | Inline functions | EcommerceHeuristics + SDK | +| Modals | Manual detection | ComposableHeuristics | +| OpenAI models | Yes | Yes (default) | +| Local models | HuggingFace/MLX | Yes (`--local` flag) | + +## Local LLM Models + +### MLX (Apple Silicon - Recommended) + +MLX provides efficient inference on M1/M2/M3/M4 Macs: + +```bash +# Install mlx-lm +pip install mlx-lm + +# Run with default local models +python main.py --local + +# Default models: +# - Planner: mlx-community/Qwen3-8B-4bit (9B params, 4-bit quantized) +# - Executor: mlx-community/Qwen3-4B-4bit (4B params, 4-bit quantized) +``` + +### HuggingFace Transformers + +For CUDA GPUs or CPU inference: + +```bash +# Install dependencies +pip install torch transformers + +# Run with HuggingFace provider +python main.py --local --provider hf + +# Custom models +python main.py --local --provider hf \ + --planner-model Qwen/Qwen2.5-7B-Instruct \ + --executor-model Qwen/Qwen2.5-3B-Instruct +``` + +### Model Requirements + +| Role | Recommended Size | Purpose | +|------|------------------|---------| +| Planner | 7B-9B params | JSON plan generation, verification predicates | +| Executor | 3B-4B params | Element selection, action execution | + +Smaller executor models reduce latency per step while maintaining accuracy. 
+ +## Files + +| File | Description | +|------|-------------| +| `main.py` | Main demo script | +| `requirements.txt` | Python dependencies | +| `traces/` | Trace files for visualization | + +## Troubleshooting + +### CAPTCHA Detection + +If CAPTCHA is detected unexpectedly: +- Use `CAPTCHA_MODE=human` for manual solving +- Check if IP is flagged (use VPN/proxy) +- Reduce automation speed + +### Element Not Found + +If elements are not found: +- Increase snapshot limits in `SnapshotEscalationConfig` +- Enable `scroll_after_escalation=True` for off-screen elements +- Adjust `scroll_viewport_fraction` (smaller = finer scrolling) +- Add custom heuristics for the site +- Check if page requires vision fallback + +### Recovery Failures + +If recovery fails repeatedly: +- Check if URLs are bookmarkable +- Verify checkpoint predicates are stable +- Increase `max_recovery_attempts` + +## Documentation + +See the full SDK documentation: +- [PlannerExecutorAgent User Manual](../../sdk-python/docs/PLANNER_EXECUTOR_AGENT.md) +- [AutomationTask Design](../../docs/sdk-python-doc/automation-task-design.md) diff --git a/planner_executor_local2/log.txt b/planner_executor_local2/log.txt new file mode 100644 index 0000000..a671159 --- /dev/null +++ b/planner_executor_local2/log.txt @@ -0,0 +1,434 @@ +2026-03-15 21:54:23,888 [INFO] __main__: Mode: local +2026-03-15 21:54:23,888 [INFO] __main__: Planner model: mlx-community/Qwen3-8B-4bit +2026-03-15 21:54:23,888 [INFO] __main__: Executor model: mlx-community/Qwen3-4B-4bit +2026-03-15 21:54:23,888 [INFO] __main__: Predicate API: enabled +2026-03-15 21:54:25,858 [INFO] __main__: Loading MLX model: mlx-community/Qwen3-8B-4bit +2026-03-15 21:54:26,078 [INFO] httpx: HTTP Request: GET https://huggingface.co/api/models/mlx-community/Qwen3-8B-4bit/revision/main "HTTP/1.1 200 OK" + Fetching 9 files: 0%| | 0/9 [00:00 https://www.amazon.com/s?k=Logitech+mouse&ref=nb_sb_noss + [VERIFY] Accepting TYPE_AND_SUBMIT as successful (URL change fallback) + 
+[STEP 3] Click on Logitech mouse product + Action: CLICK + Intent: Click on product title +2026-03-15 21:56:13,114 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" +2026-03-15 21:56:25,965 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [SNAPSHOT] Elements: 60, URL: https://www.amazon.com/s?k=Logitech+mouse&ref=nb_sb_noss + [EXECUTOR] Using heuristic: CLICK(774) [heuristic] + [ACTION] CLICK(774) + [VERIFY] Running 1 verification predicates... + [VERIFY] Checking predicate: predicate='url_contains' args=['/dp/']... +2026-03-15 21:56:32,928 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [VERIFY] Predicate PASSED + [VERIFY] Predicate result: PASS + +[STEP 4] Click Add to Cart + Action: CLICK + Intent: Click Add to Cart button +2026-03-15 21:56:41,083 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" +2026-03-15 21:56:53,755 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [SNAPSHOT-ESCALATION] Target element not found, trying scroll-after-escalation... + [SNAPSHOT-ESCALATION] Scrolling down (1/3)... 
+2026-03-15 21:57:02,074 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [SNAPSHOT-ESCALATION] Found target element 1524 after scrolling down + [SNAPSHOT] Elements: 200, URL: https://www.amazon.com/Logitech-Ergonomic-Performance-Ultra-Fast-Scrolling/dp/B0FC5SJNQX/ref=sr_1_1?dib=eyJ2IjoiMSJ9.O8pHNKgBvHrqevJBdtVLA4B9Lg11DIbb7OY2drK--S_UgngNr2_--cYxaspYWoxn6lbYiqriayqGTYdh-dhGmVDODeAZs0VIPdzfmJsNLfFTDeFG5x2sbhBIZQuULIkS7pnZ8cABBkcxIM90zIUiZLmKrjZMFpT0mwybdu3d4olbipdItHchlhUeBQzzbjtFc5wzuiH6l40JD_SEtmeRi4nKqPGZ3qj9Cl6mMlaqN7w.AxdIsV9N-5J4cR08Wu_uLC4j5RMEuo5YEUnZT3rG4YA&dib_tag=se&keywords=Logitech%2Bmouse&qid=1773636945&sr=8-1&th=1 + [EXECUTOR] Using heuristic: CLICK(1524) [heuristic] + [ACTION] CLICK(1524) + [VERIFY] Running 1 verification predicates... + [VERIFY] Checking predicate: predicate='url_contains' args=['add-to-cart']... +2026-03-15 21:57:09,883 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" +2026-03-15 21:57:17,995 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [VERIFY] Predicate FAILED + [VERIFY] Predicate result: FAIL +2026-03-15 21:57:25,512 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [VERIFY] Predicate failed but DOM changed (57 new elements) + [VERIFY] Accepting CLICK as successful (DOM change fallback) + [MODAL] Attempting dismissal: clicking element 8505 (pattern: 'no thanks') +2026-03-15 21:57:34,401 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [MODAL] Dismissal successful (60 elements removed) + +[STEP 5] Proceed to Checkout + Action: CLICK + Intent: Click Proceed to Checkout +2026-03-15 21:57:41,050 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" +2026-03-15 21:57:47,994 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + 
[SNAPSHOT] Elements: 60, URL: https://www.amazon.com/cart/smart-wagon?newItems=1b8defd7-fab3-4626-a50b-f657d6c03a9c,1&ref_=sw_refresh + [EXECUTOR] Using heuristic: CLICK(737) [heuristic] + [ACTION] CLICK(737) + [VERIFY] Running 1 verification predicates... + [VERIFY] Checking predicate: predicate='url_contains' args=['checkout']... +2026-03-15 21:57:59,546 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [VERIFY] Predicate PASSED + [VERIFY] Predicate result: PASS + +[STEP 6] Handle login + Action: CLICK + Intent: Click sign in button +2026-03-15 21:58:10,069 [INFO] httpx: HTTP Request: POST https://api.sentienceapi.com/v1/snapshot "HTTP/1.1 200 OK" + [AUTH] Detected authentication boundary (URL: /signin) + [AUTH] Auth boundary detected at step start - stopping gracefully + [AUTH] Run completed at authentication boundary +2026-03-15 21:58:10,073 [INFO] __main__: ============================================================ +2026-03-15 21:58:10,073 [INFO] __main__: Run Complete +2026-03-15 21:58:10,073 [INFO] __main__: ============================================================ +2026-03-15 21:58:10,073 [INFO] __main__: Success: True +2026-03-15 21:58:10,073 [INFO] __main__: Steps completed: 6/8 +2026-03-15 21:58:10,073 [INFO] __main__: Replans used: 0 +2026-03-15 21:58:10,073 [INFO] __main__: Duration: 206791ms +2026-03-15 21:58:10,073 [INFO] __main__: Step 1: Navigate to Amazon... - OK +2026-03-15 21:58:10,073 [INFO] __main__: Step 2: Search for Logitech mouse... - OK +2026-03-15 21:58:10,073 [INFO] __main__: Step 3: Click on Logitech mouse product... - OK +2026-03-15 21:58:10,073 [INFO] __main__: Step 4: Click Add to Cart... - OK +2026-03-15 21:58:10,073 [INFO] __main__: Step 5: Proceed to Checkout... - OK +2026-03-15 21:58:10,073 [INFO] __main__: Step 6: Handle login... 
- OK diff --git a/planner_executor_local2/main.py b/planner_executor_local2/main.py new file mode 100644 index 0000000..271d735 --- /dev/null +++ b/planner_executor_local2/main.py @@ -0,0 +1,1006 @@ +#!/usr/bin/env python3 +""" +Planner + Executor Demo using SDK's Abstracted AutomationTask. + +This demo showcases the SDK's PlannerExecutorAgent with: +- AutomationTask for flexible task definition (abstracted WebBenchTask) +- CAPTCHA handling with multiple solver strategies +- Modal/dialog dismissal via heuristic hints +- Recovery and rollback mechanisms +- Custom IntentHeuristics for domain-specific element selection +- Support for both OpenAI and local LLM models + +Unlike planner_executor_local (which implements everything from scratch), +this demo uses the SDK's built-in PlannerExecutorAgent and AutomationTask. + +Environment variables: +- OPENAI_API_KEY: Required for OpenAI models (when not using --local) +- PLANNER_MODEL: Model for planning (default: gpt-4o or mlx-community/Qwen3-8B-4bit) +- EXECUTOR_MODEL: Model for execution (default: gpt-4o-mini or mlx-community/Qwen3-4B-4bit) +- CAPTCHA_MODE: "abort" | "human" | "external" (default: abort) +- AMAZON_QUERY: Search query (default: laptop) +- HEADLESS: Run browser headless (default: false) +- DEBUG: Enable debug logging (default: false) + +Usage: + # OpenAI models (default) + python main.py + + # Local MLX models (Apple Silicon) + python main.py --local + + # Local models with custom model names + python main.py --local --planner-model mlx-community/Qwen3-8B-4bit --executor-model mlx-community/Qwen3-4B-4bit + + # With human CAPTCHA solving + CAPTCHA_MODE=human python main.py + + # High-level goal (less defined task) + python main.py --goal "Find a good laptop deal and add to cart" +""" + +from __future__ import annotations + +import argparse +import asyncio +import importlib +import logging +import os +import sys +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from 
typing import Any + +from dotenv import load_dotenv + +# Add SDK to path +sdk_path = Path(__file__).parent.parent.parent / "sdk-python" +sys.path.insert(0, str(sdk_path)) + +from predicate import AsyncPredicateBrowser, CaptchaOptions +from predicate.agent_runtime import AgentRuntime +from predicate.agents import ( + AutomationTask, + ComposableHeuristics, + ExtractionSpec, + HeuristicHint, + PlannerExecutorAgent, + PlannerExecutorConfig, + RecoveryState, + SnapshotEscalationConfig, + SuccessCriteria, + TaskCategory, + COMMON_HINTS, + get_common_hint, +) +from predicate.agents.planner_executor_agent import RetryConfig +from predicate.agents.browser_agent import ( + CaptchaConfig, + PermissionRecoveryConfig, + VisionFallbackConfig, +) +from predicate.captcha import CaptchaContext, CaptchaResolution +from predicate.captcha_strategies import ExternalSolver, HumanHandoffSolver +from predicate.llm_provider import LLMProvider, LLMResponse, OpenAIProvider +from predicate.tracer_factory import create_tracer +from predicate.tracing import Tracer +from predicate.backends.playwright_backend import PlaywrightBackend +from predicate.backends import PredicateContext +from predicate.models import SnapshotOptions + +# Load environment from current working directory (.env file) +# Run from sentience-sdk-playground/ to load the .env file there +load_dotenv() + +# Configure logging +logging.basicConfig( + level=logging.DEBUG if os.getenv("DEBUG") else logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Default Local Model Names +# --------------------------------------------------------------------------- +DEFAULT_LOCAL_PLANNER_MODEL = "mlx-community/Qwen3-8B-4bit" +DEFAULT_LOCAL_EXECUTOR_MODEL = "mlx-community/Qwen3-4B-4bit" + + +# --------------------------------------------------------------------------- +# Local LLM Provider (MLX for 
Apple Silicon) +# --------------------------------------------------------------------------- + + +class LocalMLXProvider(LLMProvider): + """ + Local MLX LLM provider for Apple Silicon. + + Uses mlx-lm for efficient inference on M1/M2/M3/M4 Macs. + Wraps the SDK's LLMProvider interface for compatibility with PlannerExecutorAgent. + """ + + def __init__(self, model: str): + """ + Initialize MLX model. + + Args: + model: Model name (e.g., "mlx-community/Qwen3-8B-4bit") + """ + super().__init__(model) + self._model_name_str = model + + try: + self._mlx_lm = importlib.import_module("mlx_lm") + except ImportError as exc: + raise RuntimeError( + "mlx-lm is required for local MLX models. Install with: pip install mlx-lm" + ) from exc + + load_fn = getattr(self._mlx_lm, "load", None) + if not load_fn: + raise RuntimeError("mlx_lm.load not available in your mlx-lm install.") + + logger.info(f"Loading MLX model: {model}") + self.model, self.tokenizer = load_fn(model) + logger.info(f"MLX model loaded: {model}") + + def _build_prompt(self, system: str, user: str) -> str: + """Build chat prompt using tokenizer's template.""" + messages = [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ] + apply_chat_template = getattr(self.tokenizer, "apply_chat_template", None) + if callable(apply_chat_template): + # Disable thinking mode for Qwen 3 models to get direct JSON output + kwargs: dict[str, Any] = {"tokenize": False, "add_generation_prompt": True} + if "qwen3" in self._model_name_str.lower(): + kwargs["enable_thinking"] = False + return apply_chat_template(messages, **kwargs) + return f"{system}\n\n{user}" + + def generate( + self, + system_prompt: str, + user_prompt: str, + **kwargs, + ) -> LLMResponse: + """Generate response using MLX model.""" + prompt = self._build_prompt(system_prompt, user_prompt) + + generate_fn = getattr(self._mlx_lm, "generate", None) + if not generate_fn: + raise RuntimeError("mlx_lm.generate not available in your mlx-lm 
install.")

        # Planner output can be a long JSON plan, so the default token
        # budget is higher than a typical short chat completion.
        max_tokens = kwargs.get("max_tokens", 2048)
        temperature = kwargs.get("temperature", 0.0)

        gen_kwargs: dict[str, Any] = {"max_tokens": max_tokens}

        # Set up sampler for temperature. Imported lazily and best-effort:
        # mlx_lm versions without make_sampler silently fall back to
        # greedy decoding.
        if temperature and temperature > 0:
            try:
                sample_utils = importlib.import_module("mlx_lm.sample_utils")
                make_sampler = getattr(sample_utils, "make_sampler", None)
                if callable(make_sampler):
                    gen_kwargs["sampler"] = make_sampler(temp=temperature)
            except Exception:
                pass

        text = generate_fn(
            self.model,
            self.tokenizer,
            prompt,
            **gen_kwargs,
        )

        # Calculate token usage. Tokenizer failures degrade to zero counts
        # rather than failing the whole generation.
        try:
            prompt_tokens = len(self.tokenizer.encode(prompt))
            completion_tokens = len(self.tokenizer.encode(text.strip()))
        except Exception:
            prompt_tokens = 0
            completion_tokens = 0

        return LLMResponse(
            content=text.strip(),
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            model_name=self._model_name_str,
        )

    def supports_json_mode(self) -> bool:
        """MLX models don't have native JSON mode."""
        return False

    @property
    def model_name(self) -> str:
        """Return model name."""
        return self._model_name_str

    def supports_vision(self) -> bool:
        """Local text models don't support vision."""
        return False


class LocalHFProvider(LLMProvider):
    """
    Local HuggingFace LLM provider using transformers.

    Uses HuggingFace transformers with MPS/CUDA acceleration.
    Wraps the SDK's LLMProvider interface for compatibility with PlannerExecutorAgent.
    """

    def __init__(self, model: str):
        """
        Initialize HuggingFace model.

        Args:
            model: Model name (e.g., "Qwen/Qwen2.5-7B-Instruct")

        Raises:
            RuntimeError: If torch/transformers are not installed.
        """
        super().__init__(model)
        self._model_name_str = model

        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError as exc:
            raise RuntimeError(
                "torch and transformers are required for HuggingFace models. "
                "Install with: pip install torch transformers"
            ) from exc

        # Keep a handle to torch so generate() can build tensors without
        # re-importing.
        self._torch = torch

        # Determine device and dtype. Preference order: Apple MPS, CUDA, CPU.
        # NOTE(review): flash_attention_2 requires the flash-attn package on
        # CUDA hosts — from_pretrained raises if it is absent; confirm it is
        # installed in the target environment.
        if torch.backends.mps.is_available():
            device_map = "mps"
            torch_dtype = torch.bfloat16
            attn_impl = "sdpa"
        elif torch.cuda.is_available():
            device_map = "auto"
            torch_dtype = torch.float16
            attn_impl = "flash_attention_2"
        else:
            device_map = "cpu"
            torch_dtype = torch.float32
            attn_impl = "eager"

        logger.info(f"Loading HuggingFace model: {model} (device={device_map})")
        self.tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)

        load_kwargs = {
            "device_map": device_map,
            "torch_dtype": torch_dtype,
            "low_cpu_mem_usage": True,
        }
        # "eager" is the transformers default; only pass the kwarg when we
        # ask for an accelerated attention implementation.
        if attn_impl != "eager":
            load_kwargs["attn_implementation"] = attn_impl

        self.model = AutoModelForCausalLM.from_pretrained(model, **load_kwargs)
        logger.info(f"HuggingFace model loaded: {model}")

    def generate(
        self,
        system_prompt: str,
        user_prompt: str,
        **kwargs,
    ) -> LLMResponse:
        """Generate response using HuggingFace model.

        Returns:
            LLMResponse with the decoded completion and token counts.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        encoding = self.tokenizer.apply_chat_template(
            messages, return_tensors="pt", add_generation_prompt=True
        )

        device = getattr(self.model, "device", "cpu")
        if hasattr(encoding, "to"):
            encoding = encoding.to(device)

        # Handle different return types from apply_chat_template: newer
        # tokenizers return a BatchEncoding mapping, older ones a bare
        # input_ids tensor.
        try:
            input_ids = encoding["input_ids"]
            attention_mask = encoding.get("attention_mask")
        except (TypeError, KeyError):
            input_ids = encoding
            attention_mask = None

        input_ids = input_ids.to(device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
        else:
            # No mask provided: treat every position as a real token.
            attention_mask = self._torch.ones_like(input_ids)

        # Use higher default for planner output (JSON plans can be long)
        max_tokens = kwargs.get("max_tokens", 2048)
        temperature = kwargs.get("temperature", 0.0)
        do_sample = temperature > 0

        output_ids = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            do_sample=do_sample,
            # temperature is only meaningful when sampling; None avoids a
            # transformers warning under greedy decoding.
            temperature=temperature if do_sample else None,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
        )

        # Strip the prompt prefix and decode only the newly generated suffix.
        generated = output_ids[0][input_ids.shape[-1]:]
        text = self.tokenizer.decode(generated, skip_special_tokens=True)

        prompt_tokens = int(input_ids.shape[-1])
        completion_tokens = int(generated.shape[-1])

        return LLMResponse(
            content=text.strip(),
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            model_name=self._model_name_str,
        )

    def supports_json_mode(self) -> bool:
        """HuggingFace models don't have native JSON mode."""
        return False

    @property
    def model_name(self) -> str:
        """Return model name."""
        return self._model_name_str

    def supports_vision(self) -> bool:
        """Local text models don't support vision."""
        return False


def create_llm_provider(
    model: str,
    use_local: bool = False,
    provider_type: str = "mlx",
) -> LLMProvider:
    """
    Create an LLM provider based on configuration.

    Args:
        model: Model name
        use_local: Whether to use local models
        provider_type: "mlx" or "hf" for local models

    Returns:
        LLMProvider instance

    Raises:
        ValueError: If provider_type is neither "mlx" nor "hf".
    """
    if not use_local:
        return OpenAIProvider(model=model)

    if provider_type == "mlx":
        return LocalMLXProvider(model=model)
    elif provider_type == "hf":
        return LocalHFProvider(model=model)
    else:
        raise ValueError(f"Unknown provider type: {provider_type}")


# ---------------------------------------------------------------------------
# Custom Heuristics for E-commerce Sites
# ---------------------------------------------------------------------------


class EcommerceHeuristics:
    """
    Domain-specific heuristics for e-commerce sites like Amazon.

    These heuristics help the executor find elements without LLM calls
    for common e-commerce patterns. Each finder ranks candidates with a
    sort key of the form (off-screen?, ..., doc_y, -importance, id) so
    visible, high-importance, top-of-page elements win ties.
    """

    def find_element_for_intent(
        self,
        intent: str,
        elements: list[Any],
        url: str,
        goal: str,
    ) -> int | None:
        """Find element ID using domain-specific heuristics.

        Returns the matched element id, or None when no heuristic applies
        (the caller then falls back to its normal selection path).
        """
        # Normalize so "add-to-cart", "add to cart" and "add_to_cart" match.
        intent_lower = intent.lower().replace("-", "_").replace(" ", "_")

        # Search box detection
        if "search" in intent_lower and "box" in intent_lower:
            return self._find_search_box(elements)

        # Add to cart button
        if "add" in intent_lower and "cart" in intent_lower:
            return self._find_add_to_cart(elements)

        # Checkout/proceed button
        if "checkout" in intent_lower or "proceed" in intent_lower:
            return self._find_checkout_button(elements)

        # First product link in search results
        # Match intents like: "first_product_link", "Click on product title", "product link"
        if "product" in intent_lower and ("first" in intent_lower or "link" in intent_lower or "title" in intent_lower):
            return self._find_first_product_link(elements, url)

        # Close/dismiss modal
        if "close" in intent_lower or "dismiss" in intent_lower or "no_thanks" in intent_lower:
            return self._find_dismiss_button(elements)

        # Cookie consent
        if "cookie" in intent_lower or "accept" in intent_lower:
            return self._find_cookie_consent(elements)

        return None

    def priority_order(self) -> list[str]:
        """Return intent patterns in priority order."""
        return [
            "add_to_cart",
            "checkout",
            "proceed_to_checkout",
            "search_box",
            "first_product_link",
            "close",
            "dismiss",
            "no_thanks",
            "accept_cookies",
        ]

    def _find_search_box(self, elements: list[Any]) -> int | None:
        """Find search box element.

        Ranking: in-viewport first, then text containing "search", then
        document y-position, then importance (descending), then id.
        """
        candidates = []
        for el in elements:
            role = (getattr(el, "role", "") or "").lower()
            if role not in {"searchbox", "textbox", "combobox"}:
                continue
            text = (getattr(el, "text", "") or "").lower()
            in_viewport = bool(getattr(el, "in_viewport", True))
            # Missing doc_y sorts last via the 1e9 sentinel.
            doc_y = getattr(el, "doc_y", None) or 1e9
            importance = getattr(el, "importance", 0) or 0

            # Prefer elements with "search" in text
            prefers_search = 0 if "search" in text else 1
            candidates.append((not in_viewport, prefers_search, doc_y, -importance, el.id))

        if not candidates:
            return None
        candidates.sort()
        # Last tuple slot holds the element id.
        return candidates[0][4]

    def _find_add_to_cart(self, elements: list[Any]) -> int | None:
        """Find 'Add to Cart' button."""
        candidates = []
        for el in elements:
            role = (getattr(el, "role", "") or "").lower()
            if role != "button":
                continue
            text = (getattr(el, "text", "") or "").lower()
            if "add to cart" not in text and "add to bag" not in text:
                continue
            # "Buy now" skips the cart flow entirely; never pick it here.
            if "buy now" in text:
                continue

            in_viewport = bool(getattr(el, "in_viewport", True))
            doc_y = getattr(el, "doc_y", None) or 1e9
            importance = getattr(el, "importance", 0) or 0
            candidates.append((not in_viewport, doc_y, -importance, el.id))

        if not candidates:
            return None
        candidates.sort()
        return candidates[0][3]

    def _find_checkout_button(self, elements: list[Any]) -> int | None:
        """Find checkout/proceed button."""
        candidates = []
        for el in elements:
            role = (getattr(el, "role", "") or "").lower()
            if role not in {"button", "link"}:
                continue
            text = (getattr(el, "text", "") or "").lower()
            if "checkout" not in text and "proceed" not in text:
                continue
            # Exclude add to cart and buy now
            if "add to cart" in text or "buy now" in text:
                continue

            in_viewport = bool(getattr(el, "in_viewport", True))
            doc_y = getattr(el, "doc_y", None) or 1e9
            importance = getattr(el, "importance", 0) or 0
            # Rank explicit "checkout" wording above generic "proceed".
            is_checkout = 0 if "checkout" in text else 1
            candidates.append((not in_viewport, is_checkout, doc_y, -importance, el.id))

        if not candidates:
            return None
        candidates.sort()
        return candidates[0][4]

    def _find_first_product_link(self, elements: list[Any], url: str) -> int | None:
        """Find first product link in search results."""
        candidates = []
        for el in elements:
            role = (getattr(el, "role", "") or "").lower()
            if role != "link":
                continue
            href = (getattr(el, "href", "") or "").lower()

            # Must be a product page link (Amazon-style /dp/ or /gp/product/
            # paths; other hrefs are navigation/filters).
            if "/dp/" not in href and "/gp/product/" not in href:
                continue
            # Exclude filter links
            if "refinements=" in href or "rh=" in href:
                continue

            text = (getattr(el, "text", "") or "").strip()
            # Skip empty or very short text
            if not text or len(text) < 3:
                continue
            # Skip non-product items
            text_lower = text.lower()
            skip_patterns = [
                "sponsored", "free shipping", "prime", "filter", "sort by",
                "see all", "show more"
            ]
            if any(p in text_lower for p in skip_patterns):
                continue

            in_viewport = bool(getattr(el, "in_viewport", True))
            doc_y = getattr(el, "doc_y", None) or 1e9
            importance = getattr(el, "importance", 0) or 0
            candidates.append((not in_viewport, doc_y, -importance, el.id))

        if not candidates:
            return None
        candidates.sort()
        return candidates[0][3]

    def _find_dismiss_button(self, elements: list[Any]) -> int | None:
        """Find dismiss/close/no thanks button."""
        candidates = []
        dismiss_patterns = ["no thanks", "close", "dismiss", "cancel", "not now", "skip"]

        for el in elements:
            role = (getattr(el, "role", "") or "").lower()
            if role != "button":
                continue
            text = (getattr(el, "text", "") or "").lower()
            if not any(p in text for p in dismiss_patterns):
                continue

            in_viewport = bool(getattr(el, "in_viewport", True))
            doc_y = getattr(el, "doc_y", None) or 1e9
            importance = getattr(el, "importance", 0) or 0
            candidates.append((not in_viewport, doc_y, -importance, el.id))

        if not candidates:
            return None
        candidates.sort()
        return candidates[0][3]

    def _find_cookie_consent(self, elements: list[Any]) -> int | None:
        """Find cookie consent accept button.

        NOTE(review): "ok"/"accept" are substring matches, so unrelated
        buttons like "Book now" ("ok") can match — confirm acceptable.
        """
        candidates = []
        accept_patterns = ["accept", "accept all", "allow", "agree", "ok", "got it"]

        for el in elements:
            role = (getattr(el, "role", "") or "").lower()
            if role != "button":
                continue
            text = (getattr(el, "text", "") or "").lower()
            if not any(p in text for p in accept_patterns):
                continue

            in_viewport = bool(getattr(el, "in_viewport", True))
            doc_y = getattr(el, "doc_y", None) or 1e9
            importance = getattr(el, "importance", 0) or 0
            candidates.append((not in_viewport, doc_y, -importance, el.id))

        if not candidates:
            return None
        candidates.sort()
        return candidates[0][3]


# ---------------------------------------------------------------------------
# CAPTCHA Handlers
# ---------------------------------------------------------------------------


def create_captcha_config(mode: str) -> CaptchaConfig:
    """Create CAPTCHA config based on mode.

    Supported modes: "abort", "human", "external"; anything else falls
    back to "abort" with a warning.
    """
    mode = mode.lower()

    if mode == "abort":
        logger.info("CAPTCHA mode: abort (fail on CAPTCHA)")
        return CaptchaConfig(policy="abort", min_confidence=0.7)

    if mode == "human":
        logger.info("CAPTCHA mode: human handoff")
        return CaptchaConfig(
            policy="callback",
            handler=HumanHandoffSolver(
                message="Please solve the CAPTCHA in the browser window",
                timeout_ms=180_000,
                poll_ms=3_000,
            ),
        )

    if mode == "external":
        logger.info("CAPTCHA mode: external solver")

        def external_solver(ctx: CaptchaContext) -> bool:
            """Placeholder for external CAPTCHA solver integration.

            Always returns True (pretends the CAPTCHA was solved) so the
            demo can proceed; wire up a real service before production use.
            """
            logger.info(f"CAPTCHA detected at {ctx.url}")
            logger.info(f"Type: {getattr(ctx.captcha, 'type', 'unknown')}")
            logger.info(f"Screenshot: {ctx.screenshot_path}")
            # In production, integrate with 2Captcha, CapSolver, etc.
            # solver = TwoCaptcha('API_KEY')
            # result = solver.recaptcha(sitekey=ctx.captcha.sitekey, url=ctx.url)
            return True

        return CaptchaConfig(
            policy="callback",
            handler=ExternalSolver(
                resolver=external_solver,
                message="Solving CAPTCHA via external service",
                timeout_ms=180_000,
            ),
        )

    # Default to abort
    logger.warning(f"Unknown CAPTCHA mode '{mode}', defaulting to abort")
    return CaptchaConfig(policy="abort")


# ---------------------------------------------------------------------------
# Task Factory
# ---------------------------------------------------------------------------


def create_automation_task(
    goal: str | None = None,
    query: str | None = None,
    starting_url: str = "https://www.amazon.com",
) -> AutomationTask:
    """
    Create an AutomationTask for the demo.

    Args:
        goal: High-level goal (if provided, creates less-defined task)
        query: Search query for Amazon (if provided, creates specific search task)
        starting_url: Starting URL

    Returns:
        AutomationTask instance
    """
    # High-level goal (less defined task)
    if goal:
        logger.info(f"Creating high-level task: {goal}")
        task = AutomationTask(
            task_id=f"goal-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
            starting_url=starting_url,
            task=goal,
            category=TaskCategory.TRANSACTION,
            enable_recovery=True,
            max_recovery_attempts=2,
            max_steps=50,
        )
        return task

    # Search task with specific query; AMAZON_QUERY env var is the fallback,
    # then the hard-coded "laptop" default.
    search_query = query or os.getenv("AMAZON_QUERY", "laptop")
    logger.info(f"Creating search task for: {search_query}")

    task = AutomationTask(
        task_id=f"search-{search_query.replace(' ', '-')}-{datetime.now().strftime('%H%M%S')}",
        starting_url=starting_url,
        task=f"Search for '{search_query}' on Amazon, click the first product result, and add it to cart. Then proceed to checkout.",
        category=TaskCategory.TRANSACTION,
        enable_recovery=True,
        max_recovery_attempts=2,
        max_steps=50,
        domain_hints=("ecommerce", "amazon"),
    )

    # Add success criteria: any cart/checkout URL counts, and so do Amazon's
    # sign-in pages (signin, /ap/) — presumably because the demo cannot
    # authenticate past them; confirm this is the intended stopping point.
    task = task.with_success_criteria(
        {"predicate": "any_of", "args": [
            {"predicate": "url_contains", "args": ["/cart"]},
            {"predicate": "url_contains", "args": ["checkout"]},
            {"predicate": "url_contains", "args": ["signin"]},
            {"predicate": "url_contains", "args": ["/ap/"]},
        ]},
    )

    return task


# ---------------------------------------------------------------------------
# Main Demo
# ---------------------------------------------------------------------------


async def run_demo(
    goal: str | None = None,
    query: str | None = None,
    headless: bool = False,
    use_local: bool = False,
    planner_model: str | None = None,
    executor_model: str | None = None,
    provider_type: str = "mlx",
) -> dict[str, Any]:
    """
    Run the PlannerExecutorAgent demo.

    Args:
        goal: High-level goal for less-defined task
        query: Search query for specific task
        headless: Run browser in headless mode
        use_local: Use local LLM models instead of OpenAI
        planner_model: Override planner model name
        executor_model: Override executor model name
        provider_type: "mlx" or "hf" for local models

    Returns:
        Result dictionary with run outcome
    """
    # Determine model names based on mode; explicit args win over env vars,
    # which win over the mode-specific defaults.
    if use_local:
        default_planner = DEFAULT_LOCAL_PLANNER_MODEL
        default_executor = DEFAULT_LOCAL_EXECUTOR_MODEL
    else:
        default_planner = "gpt-4o"
        default_executor = "gpt-4o-mini"

    planner_model = planner_model or os.getenv("PLANNER_MODEL", default_planner)
    executor_model = executor_model or os.getenv("EXECUTOR_MODEL", default_executor)

    # Get Predicate API key for snapshot overlay and cloud features
    predicate_api_key = os.getenv("PREDICATE_API_KEY")
    use_api = bool((predicate_api_key or "").strip())

    logger.info(f"Mode: {'local' if use_local else 'openai'}")
    logger.info(f"Planner model: {planner_model}")
    logger.info(f"Executor model: {executor_model}")
    logger.info(f"Predicate API: {'enabled' if use_api else 'disabled (no PREDICATE_API_KEY)'}")

    # Create LLM providers (planner and executor may be different models)
    planner = create_llm_provider(planner_model, use_local, provider_type)
    executor = create_llm_provider(executor_model, use_local, provider_type)

    # Create CAPTCHA config
    captcha_mode = os.getenv("CAPTCHA_MODE", "abort")
    captcha_config = create_captcha_config(captcha_mode)

    # Create agent config
    config = PlannerExecutorConfig(
        # Snapshot escalation for reliable element capture
        snapshot=SnapshotEscalationConfig(
            enabled=True,
            limit_base=60,
            limit_step=30,
            limit_max=200,
        ),
        # Vision fallback for canvas pages
        vision=VisionFallbackConfig(
            enabled=True,
            max_vision_calls=3,
            trigger_requires_vision=True,
            trigger_canvas_or_low_actionables=True,
        ),
        # CAPTCHA handling
        captcha=captcha_config,
        # Retry/verification settings - more lenient for local LLMs
        retry=RetryConfig(
            verify_timeout_s=15.0,  # Increased from 10s
            verify_poll_s=0.5,
            verify_max_attempts=6,  # Increased from 5
            executor_repair_attempts=3,  # Increased from 2
            max_replans=2,  # Allow more replans
        ),
        # LLM settings
        planner_max_tokens=2048,
        planner_temperature=0.0,
        executor_max_tokens=96,
        executor_temperature=0.0,
        # Stabilization
        stabilize_enabled=True,
        stabilize_poll_s=0.35,
        stabilize_max_attempts=6,
        # Pre-step verification
        pre_step_verification=True,
        # Tracing
        trace_screenshots=True,
        # Verbose mode - print plan and executor prompts to stdout
        verbose=True,
    )

    # Create tracer
    tracer = create_tracer(
        goal=goal or f"Amazon search: {query or os.getenv('AMAZON_QUERY', 'laptop')}",
        agent_type="PlannerExecutorAgent",
    )

    # Create context formatter (same as planner_executor_local)
    ctx_formatter = PredicateContext(max_elements=120)

    # Wrap the context formatter to match expected signature (snap, goal) -> str.
    # NOTE(review): goal is intentionally unused here, and
    # _format_snapshot_for_llm is a private SDK method — confirm it remains
    # available across SDK versions.
    def format_context(snap, goal):
        return ctx_formatter._format_snapshot_for_llm(snap)

    # Create agent with custom heuristics and context formatter
    agent = PlannerExecutorAgent(
        planner=planner,
        executor=executor,
        config=config,
        tracer=tracer,
        intent_heuristics=EcommerceHeuristics(),
        context_formatter=format_context,
    )

    # Create automation task
    task = create_automation_task(goal=goal, query=query)

    logger.info("=" * 60)
    logger.info("Starting PlannerExecutorAgent Demo")
    logger.info("=" * 60)
    logger.info(f"Task ID: {task.task_id}")
    logger.info(f"Task: {task.task}")
    logger.info(f"Starting URL: {task.starting_url}")
    logger.info(f"Category: {task.category}")
    logger.info(f"Recovery enabled: {task.enable_recovery}")
    logger.info("=" * 60)

    # Run automation
    async with AsyncPredicateBrowser(
        api_key=predicate_api_key,
        headless=headless,
    ) as browser:
        # AsyncPredicateBrowser creates a page in start() and stores it in browser.page
        page = browser.page
        await page.goto(task.starting_url)
        await page.wait_for_load_state("domcontentloaded", timeout=15_000)
        # Wait for network to settle
        try:
            await page.wait_for_load_state("networkidle", timeout=10_000)
        except Exception:
            pass  # Best effort
        # Extra wait for extension to initialize
        await page.wait_for_timeout(1000)

        # Create runtime using PlaywrightBackend (same as planner_executor_local)
        backend = PlaywrightBackend(page)
        runtime = AgentRuntime(
            backend=backend,
            tracer=tracer,
            predicate_api_key=predicate_api_key,
            snapshot_options=SnapshotOptions(
                limit=60,
                screenshot=True,
                show_overlay=True,
                goal=task.task,
                # None (rather than False) when no API key — presumably lets
                # the SDK apply its own default; confirm.
                use_api=True if use_api else None,
                predicate_api_key=predicate_api_key if use_api else None,
            ),
        )

        try:
            result = await agent.run(runtime, task)

            logger.info("=" * 60)
            logger.info("Run Complete")
            logger.info("=" * 60)
            logger.info(f"Success: {result.success}")
            logger.info(f"Steps completed: {result.steps_completed}/{result.steps_total}")
            logger.info(f"Replans used: {result.replans_used}")
            logger.info(f"Duration: {result.total_duration_ms}ms")

            if result.error:
                logger.error(f"Error: {result.error}")

            # Log step outcomes
            for outcome in result.step_outcomes:
                status = "OK" if outcome.verification_passed else "FAIL"
                vision = " [vision]" if outcome.used_vision else ""
                logger.info(
                    f" Step {outcome.step_id}: {outcome.goal[:50]}... "
                    f"- {status}{vision}"
                )

            return {
                "success": result.success,
                "steps_completed": result.steps_completed,
                "steps_total": result.steps_total,
                "replans_used": result.replans_used,
                "duration_ms": result.total_duration_ms,
                "error": result.error,
            }

        except Exception as e:
            logger.exception(f"Demo failed: {e}")
            return {
                "success": False,
                "error": str(e),
            }
        finally:
            # Always flush the trace, even on failure.
            tracer.close()


def main():
    """Main entry point.

    Parses CLI flags, validates credentials, runs the async demo, and
    exits 0 on success / 1 on failure.
    """
    parser = argparse.ArgumentParser(
        description="PlannerExecutorAgent Demo with SDK's AutomationTask"
    )
    parser.add_argument(
        "--goal",
        type=str,
        help="High-level goal for less-defined task (e.g., 'Find a good laptop deal')",
    )
    parser.add_argument(
        "--query",
        type=str,
        help="Search query for Amazon (e.g., 'wireless mouse')",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        # HEADLESS env var provides the default; the flag can still force it on.
        default=os.getenv("HEADLESS", "").lower() in {"1", "true", "yes"},
        help="Run browser in headless mode",
    )

    # LLM provider options
    parser.add_argument(
        "--local",
        action="store_true",
        help="Use local LLM models instead of OpenAI (default: MLX on Apple Silicon)",
    )
    parser.add_argument(
        "--provider",
        type=str,
        choices=["mlx", "hf"],
        default="mlx",
        help="Local model provider: 'mlx' (Apple Silicon) or 'hf' (HuggingFace transformers)",
    )
    parser.add_argument(
        "--planner-model",
        type=str,
        help=f"Planner model name (default: gpt-4o or {DEFAULT_LOCAL_PLANNER_MODEL} for local)",
    )
    parser.add_argument(
        "--executor-model",
        type=str,
        help=f"Executor model name (default: gpt-4o-mini or {DEFAULT_LOCAL_EXECUTOR_MODEL} for local)",
    )

    args = parser.parse_args()

    # Check for API key if using OpenAI
    if not args.local and not os.getenv("OPENAI_API_KEY"):
        logger.error(
            "OPENAI_API_KEY environment variable is required when not using --local.\n"
            "Either set OPENAI_API_KEY or use --local for local models."
        )
        sys.exit(1)

    # Run the demo
    result = asyncio.run(run_demo(
        goal=args.goal,
        query=args.query,
        headless=args.headless,
        use_local=args.local,
        planner_model=args.planner_model,
        executor_model=args.executor_model,
        provider_type=args.provider,
    ))

    # Exit with appropriate code
    sys.exit(0 if result.get("success") else 1)


if __name__ == "__main__":
    main()
diff --git a/planner_executor_local2/requirements.txt b/planner_executor_local2/requirements.txt
new file mode 100644
index 0000000..6025ba1
--- /dev/null
+++ b/planner_executor_local2/requirements.txt
@@ -0,0 +1,20 @@
+# Core dependencies
+openai>=1.0.0
+python-dotenv>=1.0.0
+
+# SDK (install from local path)
+# pip install -e ../../sdk-python
+
+# Optional: External CAPTCHA solvers
+# 2captcha-python>=1.2.0
+# capsolver>=1.0.0
+
+# ============================================
+# Local LLM Models (use with --local flag)
+# ============================================
+
+# For MLX models on Apple Silicon (recommended for M1/M2/M3/M4)
+# pip install mlx-lm
+
+# For HuggingFace models (use with --local --provider hf)
+# pip install torch transformers