From 3a64e75b5a91b4d32d47920a9f3eb209c81c9274 Mon Sep 17 00:00:00 2001 From: mprytyka Date: Mon, 7 Apr 2025 18:42:57 +0300 Subject: [PATCH] Update from original --- .env.example | 7 + .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/ISSUE_TEMPLATE/docs_issue.yml | 4 +- .github/ISSUE_TEMPLATE/feature_request.yml | 2 +- .github/workflows/lint.yml | 24 + .github/workflows/publish.yml | 80 +- .gitignore | 2 +- .pre-commit-config.yaml | 34 +- .vscode/launch.json | 6 +- README.md | 19 +- SECURITY.md | 1 - browser_use/agent/gif.py | 22 +- browser_use/agent/memory/__init__.py | 3 + browser_use/agent/memory/service.py | 120 ++ browser_use/agent/message_manager/service.py | 28 +- browser_use/agent/message_manager/utils.py | 3 +- browser_use/agent/message_manager/views.py | 6 + browser_use/agent/prompts.py | 16 +- browser_use/agent/service.py | 454 ++++- browser_use/agent/system_prompt.md | 11 +- browser_use/agent/tests.py | 4 +- browser_use/agent/views.py | 17 +- browser_use/browser/browser.py | 319 ++-- browser_use/browser/chrome.py | 184 ++ browser_use/browser/context.py | 440 ++++- browser_use/browser/dolphin_service.py | 349 ++++ browser_use/browser/tests/screenshot_test.py | 51 +- .../browser/utils/screen_resolution.py | 41 + browser_use/browser/views.py | 11 + browser_use/controller/registry/service.py | 48 +- browser_use/controller/registry/views.py | 87 +- browser_use/controller/service.py | 391 ++++- browser_use/controller/views.py | 70 +- browser_use/dom/buildDomTree.js | 304 ++-- .../dom/history_tree_processor/view.py | 2 +- browser_use/dom/service.py | 43 +- browser_use/dom/tests/extraction_test.py | 20 +- browser_use/exceptions.py | 5 + browser_use/logging_config.py | 2 +- browser_use/telemetry/service.py | 2 +- browser_use/utils.py | 268 ++- docs/customize/agent-settings.mdx | 73 +- docs/customize/browser-settings.mdx | 18 +- docs/customize/custom-functions.mdx | 9 +- docs/customize/hooks.mdx | 346 ++++ docs/customize/output-format.mdx | 2 +- docs/customize/real-browser.mdx | 2 +- docs/customize/sensitive-data.mdx | 4 +- docs/customize/supported-models.mdx | 20 +- docs/development.mdx | 24 +- docs/development/contribution-guide.mdx | 7 +- docs/development/local-setup.mdx | 33 +- docs/development/n8n-integration.mdx | 122 ++ docs/development/observability.mdx | 2 +- docs/mint.json | 3 +- docs/quickstart.mdx | 2 +- eval/claude.py | 7 +- eval/deepseek-r1.py | 7 +- eval/deepseek.py | 7 +- eval/gemini-1.5-flash.py | 7 +- eval/gemini-2.0-flash.py | 7 +- eval/gpt-4o-no-boundingbox.py | 15 +- eval/gpt-4o-no-vision.py | 7 +- eval/gpt-4o-viewport-0.py | 15 +- eval/gpt-4o.py | 15 +- eval/grok.py | 25 + examples/browser/real_browser.py | 12 +- examples/browser/stealth.py | 79 + examples/custom-functions/action_filters.py | 86 + examples/custom-functions/clipboard.py | 3 +- .../custom_hooks_before_after_step.py | 232 +++ examples/custom-functions/file_upload.py | 4 +- examples/custom-functions/group_ungroup.py | 108 ++ examples/custom-functions/hover_element.py | 96 ++ examples/custom-functions/notification.py | 3 +- examples/custom-functions/onepassword_2fa.py | 56 + .../save_to_file_hugging_face.py | 4 +- examples/features/click_fallback_options.py | 211 +++ examples/features/cross_origin_iframes.py | 51 + examples/features/custom_output.py | 2 +- examples/features/custom_user_agent.py | 2 +- examples/features/drag_drop.py | 46 + examples/features/follow_up_tasks.py | 16 +- .../features/multiple_agents_same_browser.py | 2 +- 
examples/features/planner.py | 1 - examples/features/restrict_urls.py | 7 +- examples/features/result_processing.py | 11 +- examples/features/save_trace.py | 4 +- .../features/small_model_for_extraction.py | 1 - examples/features/task_with_memory.py | 98 ++ examples/features/validate_output.py | 2 +- examples/integrations/discord/discord_api.py | 12 +- .../integrations/discord/discord_example.py | 2 +- examples/integrations/slack/README.md | 6 +- examples/integrations/slack/slack_api.py | 203 +-- examples/models/azure_openai.py | 32 +- examples/models/bedrock_claude.py | 46 +- examples/models/claude-3.7-sonnet.py | 1 + examples/models/ollama.py | 30 +- examples/models/qwen.py | 6 +- examples/notebook/agent_browsing.ipynb | 1493 ++++++++--------- examples/simple.py | 5 + examples/ui/README.md | 2 +- examples/ui/command_line.py | 101 +- examples/ui/gradio_demo.py | 8 +- examples/ui/streamlit_demo.py | 99 +- examples/use-cases/README.md | 3 - examples/use-cases/captcha.py | 20 +- examples/use-cases/check_appointment.py | 5 +- examples/use-cases/find_and_apply_to_jobs.py | 26 +- examples/use-cases/google_sheets.py | 193 +++ examples/use-cases/online_coding_agent.py | 62 +- examples/use-cases/post-twitter.py | 96 +- examples/use-cases/scrolling_page.py | 8 +- examples/use-cases/shopping.py | 34 +- .../use-cases/twitter_post_using_cookies.py | 6 +- examples/use-cases/web_voyager_agent.py | 12 +- pyproject.toml | 86 +- tests/test_action_filters.py | 305 ++++ tests/test_agent_actions.py | 8 +- tests/test_browser.py | 762 +++++---- tests/test_browser_config_models.py | 209 +++ tests/test_context.py | 642 +++---- tests/test_dropdown.py | 59 +- tests/test_dropdown_complex.py | 67 +- tests/test_dropdown_error.py | 4 +- tests/test_gif_path.py | 13 +- tests/test_mind2web.py | 2 +- tests/test_models.py | 2 +- tests/test_service.py | 2 +- tests/test_vision.py | 4 +- tests/test_wait_for_element.py | 68 + 133 files changed, 7707 insertions(+), 2457 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 browser_use/agent/memory/__init__.py create mode 100644 browser_use/agent/memory/service.py create mode 100644 browser_use/browser/chrome.py create mode 100644 browser_use/browser/dolphin_service.py create mode 100644 browser_use/browser/utils/screen_resolution.py create mode 100644 browser_use/exceptions.py create mode 100644 docs/customize/hooks.mdx create mode 100644 docs/development/n8n-integration.mdx create mode 100644 eval/grok.py create mode 100644 examples/browser/stealth.py create mode 100644 examples/custom-functions/action_filters.py create mode 100644 examples/custom-functions/custom_hooks_before_after_step.py create mode 100644 examples/custom-functions/group_ungroup.py create mode 100644 examples/custom-functions/hover_element.py create mode 100644 examples/custom-functions/onepassword_2fa.py create mode 100644 examples/features/click_fallback_options.py create mode 100644 examples/features/cross_origin_iframes.py create mode 100644 examples/features/drag_drop.py create mode 100644 examples/features/task_with_memory.py create mode 100644 examples/use-cases/google_sheets.py create mode 100644 tests/test_action_filters.py create mode 100644 tests/test_browser_config_models.py create mode 100644 tests/test_wait_for_element.py diff --git a/.env.example b/.env.example index 85438cdadb..0d006cbb2d 100644 --- a/.env.example +++ b/.env.example @@ -1,8 +1,15 @@ OPENAI_API_KEY= ANTHROPIC_API_KEY= +AZURE_ENDPOINT= +AZURE_OPENAI_API_KEY= +GEMINI_API_KEY= +DEEPSEEK_API_KEY= # Set to 
false to disable anonymized telemetry ANONYMIZED_TELEMETRY=true # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info BROWSER_USE_LOGGING_LEVEL=info + +# set this to true to optimize browser-use's chrome for running inside docker +IN_DOCKER=false diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 08a567b049..ae2caa0e18 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -81,4 +81,4 @@ body: attributes: label: Relevant Log Output description: Please copy and paste any relevant log output. This will be automatically formatted into code. - render: shell \ No newline at end of file + render: shell diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index a8607c3ab0..16019e944a 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -8,4 +8,4 @@ contact_links: about: Please ask questions in our Discord community - name: 📖 Documentation url: https://docs.browser-use.com - about: Check our documentation for answers first \ No newline at end of file + about: Check our documentation for answers first diff --git a/.github/ISSUE_TEMPLATE/docs_issue.yml b/.github/ISSUE_TEMPLATE/docs_issue.yml index b0504a4497..aa88e8071d 100644 --- a/.github/ISSUE_TEMPLATE/docs_issue.yml +++ b/.github/ISSUE_TEMPLATE/docs_issue.yml @@ -46,10 +46,10 @@ body: description: If you have specific suggestions for how to improve the documentation, please share them placeholder: | The documentation could be improved by... - + Example: ```python # Your suggested code example or text here ``` validations: - required: true \ No newline at end of file + required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 4b5d90f933..b0e9910ffe 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -40,4 +40,4 @@ body: placeholder: | - Example use cases - Screenshots or mockups - - Related issues or discussions \ No newline at end of file + - Related issues or discussions diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000..9271d97cfb --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,24 @@ +name: Lint +on: + push: + pull_request: + workflow_dispatch: +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv run ruff format + - run: uv run pre-commit run --all-files + # TODO: Fix the ignored pytests. + # openai.OpenAIError: The api_key client option must be set either by passing + # api_key to the client or by setting the OPENAI_API_KEY environment variable + - run: uv run --with=dotenv pytest + --ignore=tests/test_dropdown_error.py + --ignore=tests/test_gif_path.py + --ignore=tests/test_models.py + --ignore=tests/test_react_dropdown.py + --ignore=tests/test_save_conversation.py + --ignore=tests/test_vision.py + --ignore=tests/test_wait_for_element.py || true diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8ee4acc7e9..3adfcde174 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -6,33 +6,87 @@ # separate terms of service, privacy policy, and support # documentation. 
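The provider keys added to .env.example above are picked up via python-dotenv (browser_use.agent.service calls load_dotenv() on import). A minimal sketch of loading and sanity-checking them before starting an agent; the variable names come from .env.example, and the warn-only behaviour is an assumption:

```python
import os

from dotenv import load_dotenv  # python-dotenv, already used by browser-use

load_dotenv()  # reads .env from the current working directory

# Keys listed in .env.example; only the provider you actually use needs a value.
PROVIDER_KEYS = [
    'OPENAI_API_KEY',
    'ANTHROPIC_API_KEY',
    'AZURE_ENDPOINT',
    'AZURE_OPENAI_API_KEY',
    'GEMINI_API_KEY',
    'DEEPSEEK_API_KEY',
]

missing = [key for key in PROVIDER_KEYS if not os.getenv(key)]
if missing:
    print(f'No value set for: {", ".join(missing)}')
```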
-name: Upload Python Package +name: Python Package Workflow on: + push: + branches: + - main release: types: [published] + schedule: + - cron: "0 17 * * FRI" # Every Friday at 5 PM UTC permissions: - contents: read + contents: write jobs: - deploy: + pre_commit_and_tests: + if: github.event_name == 'push' && github.ref_name == 'main' runs-on: ubuntu-latest - steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.x" - - name: Install dependencies + - uses: astral-sh/setup-uv@v5 + - run: uv run ruff check --no-fix --select PLE # check only for syntax errors + - run: uv build + - run: uv run --isolated --no-project --with pytest --with dist/*.whl tests/conftest.py + - run: uv run --isolated --no-project --with pytest --with dist/*.tar.gz tests/conftest.py + - run: uv run --with=dotenv pytest \ + --ignore=tests/test_dropdown_error.py \ + --ignore=tests/test_gif_path.py \ + --ignore=tests/test_models.py \ + --ignore=tests/test_react_dropdown.py \ + --ignore=tests/test_save_conversation.py \ + --ignore=tests/test_vision.py \ + --ignore=tests/test_wait_for_element.py || true + - run: uv publish --trusted-publishing always + + tag_pre_release: + if: github.event_name == 'schedule' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Create pre-release tag run: | - python -m pip install --upgrade pip - pip install build hatch - - name: Build package - run: python -m build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 + git fetch --tags + latest_tag=$(git tag --list --sort=-v:refname | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+rc[0-9]+$' | head -n 1) + if [ -z "$latest_tag" ]; then + new_tag="v0.1.0rc1" + else + new_tag=$(echo $latest_tag | awk -F'rc' '{print $1 "rc" $2+1}') + fi + git tag $new_tag + git push origin $new_tag + + deploy: + if: github.event_name == 'release' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + python-version: "3.x" + - uses: astral-sh/setup-uv@v5 + - run: uv run ruff check --no-fix --select PLE # check only for syntax errors + - run: uv build + - run: uv run --isolated --no-project --with pytest --with dist/*.whl tests/conftest.py + - run: uv run --isolated --no-project --with pytest --with dist/*.tar.gz tests/conftest.py + - run: uv run --with=dotenv pytest \ + --ignore=tests/test_dropdown_error.py \ + --ignore=tests/test_gif_path.py \ + --ignore=tests/test_models.py \ + --ignore=tests/test_react_dropdown.py \ + --ignore=tests/test_save_conversation.py \ + --ignore=tests/test_vision.py \ + --ignore=tests/test_wait_for_element.py || true + - run: uv publish --trusted-publishing always + - name: Push to stable branch (if stable release) + if: startsWith(github.ref_name, 'v') && !contains(github.ref_name, 'rc') + run: | + git checkout -b stable + git push origin stable diff --git a/.gitignore b/.gitignore index 0affd45896..f89e499508 100644 --- a/.gitignore +++ b/.gitignore @@ -187,4 +187,4 @@ gcp-login.json *.json *.jsonl -uv.lock \ No newline at end of file +uv.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34d34cda0f..92cc8ca595 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,18 +1,32 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.0 + rev: v0.11.2 hooks: - id: ruff - args: [ - --line-length=130, - --select=E,F,I, - --fix, - ] + - id: ruff-format + # see 
pyproject.toml for more details on ruff config - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - id: check-toml + - id: check-yaml + - id: check-json + - id: end-of-file-fixer + - id: check-merge-conflict + - id: check-illegal-windows-names + - id: check-case-conflict + - id: check-added-large-files + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: destroyed-symlinks + - id: detect-private-key + - id: mixed-line-ending + - id: fix-byte-order-marker + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell # See pyproject.toml for args + additional_dependencies: + - tomli diff --git a/.vscode/launch.json b/.vscode/launch.json index 9fcfe11b73..9ab0d802be 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -39,7 +39,7 @@ "-v", "-k", "test_captcha_solver", - "--capture=no", + "--capture=no" ], "console": "integratedTerminal", "justMyCode": false @@ -54,7 +54,7 @@ "-v", "-k", "test_ecommerce_interaction", - "--capture=no", + "--capture=no" ], "console": "integratedTerminal", "justMyCode": false @@ -85,4 +85,4 @@ "justMyCode": false } ] -} \ No newline at end of file +} diff --git a/README.md b/README.md index 3a6f60fe1b..0860f50f84 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,9 @@ With pip (Python>=3.11): pip install browser-use ``` -install playwright: - +Install Playwright: ```bash -playwright install +playwright install chromium ``` Spin up your agent: @@ -57,6 +56,11 @@ Add your API keys for the provider you want to use to your `.env` file. ```bash OPENAI_API_KEY= +ANTHROPIC_API_KEY= +AZURE_ENDPOINT= +AZURE_OPENAI_API_KEY= +GEMINI_API_KEY= +DEEPSEEK_API_KEY= ``` For other settings, models, and more, check out the [documentation 📕](https://docs.browser-use.com). @@ -133,7 +137,7 @@ Tell your computer what to do, and it gets it done. ### Rerunning tasks - [ ] LLM as fallback -- [ ] Make it easy to define workfows templates where LLM fills in the details +- [ ] Make it easy to define workflow templates where LLM fills in the details - [ ] Return playwright script from the agent ### Datasets @@ -156,6 +160,11 @@ We love contributions! Feel free to open issues for bugs or feature requests. To To learn more about the library, check out the [local setup 📕](https://docs.browser-use.com/development/local-setup). + +`main` is the primary development branch with frequent changes. For production use, install a stable [versioned release](https://github.com/browser-use/browser-use/releases) instead. + +--- + ## Cooperations We are forming a commission to define best practices for UI/UX design for browser agents. @@ -181,7 +190,7 @@ If you use Browser Use in your research or project, please cite: } ``` -
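The README changes above cover installation (pip install browser-use, playwright install chromium) and adding API keys to .env, then point to a quickstart that is not part of this hunk. A minimal sketch of that documented flow, with an illustrative task string and OPENAI_API_KEY assumed to be set in .env:

```python
import asyncio

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from browser_use import Agent

load_dotenv()


async def main():
    agent = Agent(
        task='Compare the price of gpt-4o and DeepSeek-V3',
        llm=ChatOpenAI(model='gpt-4o'),
    )
    await agent.run()


asyncio.run(main())
```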
+
[![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07) [![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00) diff --git a/SECURITY.md b/SECURITY.md index e0969a3a05..67a6533784 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -17,4 +17,3 @@ Please include as much of the information listed below as you can to help me bet * Impact of the issue, including how an attacker might exploit the issue This information will help me triage your report more quickly. - diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py index 1cb7cbc9ce..d93b587ccd 100644 --- a/browser_use/agent/gif.py +++ b/browser_use/agent/gif.py @@ -155,10 +155,28 @@ def _create_task_frame( # Calculate vertical center of image center_y = image.height // 2 - # Draw task text with increased font size + # Draw task text with dynamic font size based on task length margin = 140 # Increased margin max_width = image.width - (2 * margin) - larger_font = ImageFont.truetype(regular_font.path, regular_font.size + 16) # Increase font size more + + # Dynamic font size calculation based on task length + # Start with base font size (regular + 16) + base_font_size = regular_font.size + 16 + min_font_size = max(regular_font.size - 10, 16) # Don't go below 16pt + max_font_size = base_font_size # Cap at the base font size + + # Calculate dynamic font size based on text length and complexity + # Longer texts get progressively smaller fonts + text_length = len(task) + if text_length > 200: + # For very long text, reduce font size logarithmically + font_size = max(base_font_size - int(10 * (text_length / 200)), min_font_size) + else: + font_size = base_font_size + + larger_font = ImageFont.truetype(regular_font.path, font_size) + + # Generate wrapped text with the calculated font size wrapped_text = _wrap_text(task, larger_font, max_width) # Calculate line height with spacing diff --git a/browser_use/agent/memory/__init__.py b/browser_use/agent/memory/__init__.py new file mode 100644 index 0000000000..50f06c1f09 --- /dev/null +++ b/browser_use/agent/memory/__init__.py @@ -0,0 +1,3 @@ +from browser_use.agent.memory.service import Memory, MemorySettings + +__all__ = ['Memory', 'MemorySettings'] diff --git a/browser_use/agent/memory/service.py b/browser_use/agent/memory/service.py new file mode 100644 index 0000000000..f1965b6c06 --- /dev/null +++ b/browser_use/agent/memory/service.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import logging +from typing import List, Optional + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import ( + BaseMessage, + HumanMessage, +) +from langchain_core.messages.utils import convert_to_openai_messages +from mem0 import Memory as Mem0Memory +from pydantic import BaseModel + +from browser_use.agent.message_manager.service import MessageManager +from browser_use.agent.message_manager.views import ManagedMessage, MessageMetadata +from browser_use.utils import time_execution_sync + +logger = logging.getLogger(__name__) + + +class MemorySettings(BaseModel): + """Settings for procedural memory.""" + + agent_id: str + interval: int = 10 + config: Optional[dict] | None = None + + +class Memory: + """ + Manages procedural memory for agents. + + This class implements a procedural memory management system using Mem0 that transforms agent interaction history + into concise, structured representations at specified intervals. 
It serves to optimize context window + utilization during extended task execution by converting verbose historical information into compact, + yet comprehensive memory constructs that preserve essential operational knowledge. + """ + + def __init__( + self, + message_manager: MessageManager, + llm: BaseChatModel, + settings: MemorySettings, + ): + self.message_manager = message_manager + self.llm = llm + self.settings = settings + self._memory_config = self.settings.config or {'vector_store': {'provider': 'faiss'}} + self.mem0 = Mem0Memory.from_config(config_dict=self._memory_config) + + @time_execution_sync('--create_procedural_memory') + def create_procedural_memory(self, current_step: int) -> None: + """ + Create a procedural memory if needed based on the current step. + + Args: + current_step: The current step number of the agent + """ + logger.info(f'Creating procedural memory at step {current_step}') + + # Get all messages + all_messages = self.message_manager.state.history.messages + + # Filter out messages that are marked as memory in metadata + messages_to_process = [] + new_messages = [] + for msg in all_messages: + # Exclude system message and initial messages + if isinstance(msg, ManagedMessage) and msg.metadata.message_type in set(['init', 'memory']): + new_messages.append(msg) + else: + messages_to_process.append(msg) + + if len(messages_to_process) <= 1: + logger.info('Not enough non-memory messages to summarize') + return + + # Create a summary + summary = self._create([m.message for m in messages_to_process], current_step) + + if not summary: + logger.warning('Failed to create summary') + return + + # Replace the summarized messages with the summary + summary_message = HumanMessage(content=summary) + summary_tokens = self.message_manager._count_tokens(summary_message) + summary_metadata = MessageMetadata(tokens=summary_tokens, message_type='memory') + + # Calculate the total tokens being removed + removed_tokens = sum(m.metadata.tokens for m in messages_to_process) + + # Add the summary message + new_messages.append(ManagedMessage(message=summary_message, metadata=summary_metadata)) + + # Update the history + self.message_manager.state.history.messages = new_messages + self.message_manager.state.history.current_tokens -= removed_tokens + self.message_manager.state.history.current_tokens += summary_tokens + + logger.info(f'Memories summarized: {len(messages_to_process)} messages converted to procedural memory') + logger.info(f'Token reduction: {removed_tokens - summary_tokens} tokens') + + def _create(self, messages: List[BaseMessage], current_step: int) -> Optional[str]: + parsed_messages = convert_to_openai_messages(messages) + try: + results = self.mem0.add( + messages=parsed_messages, + agent_id=self.settings.agent_id, + llm=self.llm, + memory_type='procedural_memory', + metadata={'step': current_step}, + ) + if len(results.get('results', [])): + return results.get('results', [])[0].get('memory') + return None + except Exception as e: + logger.error(f'Error creating procedural memory: {e}') + return None diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py index 73b3cf7870..76aab49553 100644 --- a/browser_use/agent/message_manager/service.py +++ b/browser_use/agent/message_manager/service.py @@ -50,25 +50,25 @@ def __init__( def _init_messages(self) -> None: """Initialize the message history with system message, context, task, and other initial messages""" - self._add_message_with_tokens(self.system_prompt) + 
self._add_message_with_tokens(self.system_prompt, message_type='init') if self.settings.message_context: context_message = HumanMessage(content='Context for the task' + self.settings.message_context) - self._add_message_with_tokens(context_message) + self._add_message_with_tokens(context_message, message_type='init') task_message = HumanMessage( content=f'Your ultimate task is: """{self.task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.' ) - self._add_message_with_tokens(task_message) + self._add_message_with_tokens(task_message, message_type='init') if self.settings.sensitive_data: - info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}' + info = f'Here are placeholders for sensitive data: {list(self.settings.sensitive_data.keys())}' info += 'To use them, write the placeholder name' info_message = HumanMessage(content=info) - self._add_message_with_tokens(info_message) + self._add_message_with_tokens(info_message, message_type='init') placeholder_message = HumanMessage(content='Example output:') - self._add_message_with_tokens(placeholder_message) + self._add_message_with_tokens(placeholder_message, message_type='init') tool_calls = [ { @@ -90,15 +90,15 @@ def _init_messages(self) -> None: content='', tool_calls=tool_calls, ) - self._add_message_with_tokens(example_tool_call) - self.add_tool_message(content='Browser started') + self._add_message_with_tokens(example_tool_call, message_type='init') + self.add_tool_message(content='Browser started', message_type='init') placeholder_message = HumanMessage(content='[Your task history memory starts here]') self._add_message_with_tokens(placeholder_message) if self.settings.available_file_paths: filepaths_msg = HumanMessage(content=f'Here are file paths you can use: {self.settings.available_file_paths}') - self._add_message_with_tokens(filepaths_msg) + self._add_message_with_tokens(filepaths_msg, message_type='init') def add_new_task(self, new_task: str) -> None: content = f'Your new ultimate task is: """{new_task}""". Take the previous context into account and finish your new ultimate task. ' @@ -182,7 +182,9 @@ def get_messages(self) -> List[BaseMessage]: return msg - def _add_message_with_tokens(self, message: BaseMessage, position: int | None = None) -> None: + def _add_message_with_tokens( + self, message: BaseMessage, position: int | None = None, message_type: str | None = None + ) -> None: """Add message with token count metadata position: None for last, -1 for second last, etc. 
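The message_type metadata introduced above ('init', 'memory') is what the new browser_use.agent.memory.Memory service uses to decide which messages to fold into a procedural summary. Normally this happens automatically via Agent(enable_memory=True, memory_interval=10); a sketch of driving it directly, assuming an existing Agent instance named agent:

```python
from browser_use.agent.memory import Memory, MemorySettings

# Assumes `agent` is an already-constructed Agent with a message manager and LLM.
settings = MemorySettings(
    agent_id='example-agent',
    interval=10,
    config={'vector_store': {'provider': 'faiss'}},  # mirrors the Memory default
)
memory = Memory(
    message_manager=agent._message_manager,
    llm=agent.llm,
    settings=settings,
)

# Fold everything not tagged 'init' or 'memory' into one procedural memory message.
memory.create_procedural_memory(current_step=agent.state.n_steps)
```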
""" @@ -192,7 +194,7 @@ def _add_message_with_tokens(self, message: BaseMessage, position: int | None = message = self._filter_sensitive_data(message) token_count = self._count_tokens(message) - metadata = MessageMetadata(tokens=token_count) + metadata = MessageMetadata(tokens=token_count, message_type=message_type) self.state.history.add_message(message, metadata, position) @time_execution_sync('--filter_sensitive_data') @@ -299,8 +301,8 @@ def _remove_last_state_message(self) -> None: """Remove last state message from history""" self.state.history.remove_last_state_message() - def add_tool_message(self, content: str) -> None: + def add_tool_message(self, content: str, message_type: str | None = None) -> None: """Add tool message to history""" msg = ToolMessage(content=content, tool_call_id=str(self.state.tool_id)) self.state.tool_id += 1 - self._add_message_with_tokens(msg) + self._add_message_with_tokens(msg, message_type=message_type) diff --git a/browser_use/agent/message_manager/utils.py b/browser_use/agent/message_manager/utils.py index ce9490124c..a7113650e4 100644 --- a/browser_use/agent/message_manager/utils.py +++ b/browser_use/agent/message_manager/utils.py @@ -91,7 +91,8 @@ def save_conversation(input_messages: list[BaseMessage], response: Any, target: """Save conversation history to file.""" # create folders if not exists - os.makedirs(os.path.dirname(target), exist_ok=True) + if dirname := os.path.dirname(target): + os.makedirs(dirname, exist_ok=True) with open( target, diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py index ad8c9c6705..5e5842497d 100644 --- a/browser_use/agent/message_manager/views.py +++ b/browser_use/agent/message_manager/views.py @@ -1,11 +1,15 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any +from warnings import filterwarnings +from langchain_core._api import LangChainBetaWarning from langchain_core.load import dumpd, load from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator +filterwarnings('ignore', category=LangChainBetaWarning) + if TYPE_CHECKING: from browser_use.agent.views import AgentOutput @@ -14,6 +18,7 @@ class MessageMetadata(BaseModel): """Metadata for a message""" tokens: int = 0 + message_type: str | None = None class ManagedMessage(BaseModel): @@ -56,6 +61,7 @@ def validate( """ if isinstance(value, dict) and 'message' in value: # NOTE: We use langchain's load to convert the JSON string back into a BaseMessage object. 
+ filterwarnings('ignore', category=LangChainBetaWarning) value['message'] = load(value['message']) return value diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py index b78cfe5bc6..a970bbf6aa 100644 --- a/browser_use/agent/prompts.py +++ b/browser_use/agent/prompts.py @@ -1,7 +1,6 @@ -import datetime import importlib.resources from datetime import datetime -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List, Optional, Union from langchain_core.messages import HumanMessage, SystemMessage @@ -123,7 +122,7 @@ def get_user_message(self, use_vision: bool = True) -> HumanMessage: error = result.error.split('\n')[-1] state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}' - if self.state.screenshot and use_vision == True: + if self.state.screenshot and use_vision is True: # Format message for vision model return HumanMessage( content=[ @@ -139,9 +138,8 @@ def get_user_message(self, use_vision: bool = True) -> HumanMessage: class PlannerPrompt(SystemPrompt): - def get_system_message(self) -> SystemMessage: - return SystemMessage( - content="""You are a planning agent that helps break down tasks into smaller steps and reason about the current state. + def get_system_message(self, is_planner_reasoning) -> Union[SystemMessage, HumanMessage]: + planner_prompt_text = """You are a planning agent that helps break down tasks into smaller steps and reason about the current state. Your role is to: 1. Analyze the current state and history 2. Evaluate progress towards the ultimate goal @@ -162,4 +160,8 @@ def get_system_message(self) -> SystemMessage: Ignore the other AI messages output structures. Keep your responses concise and focused on actionable insights.""" - ) + + if is_planner_reasoning: + return HumanMessage(content=planner_prompt_text) + else: + return SystemMessage(content=planner_prompt_text) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 511ea16488..8f6148dbb0 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -1,12 +1,15 @@ from __future__ import annotations import asyncio +import gc +import inspect import json import logging +import os import re import time from pathlib import Path -from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar +from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar, Union from dotenv import load_dotenv from langchain_core.language_models.chat_models import BaseChatModel @@ -20,10 +23,12 @@ from pydantic import BaseModel, ValidationError from browser_use.agent.gif import create_history_gif +from browser_use.agent.memory.service import Memory, MemorySettings from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt from browser_use.agent.views import ( + REQUIRED_LLM_API_ENV_VARS, ActionResult, AgentError, AgentHistory, @@ -45,17 +50,20 @@ DOMHistoryElement, HistoryTreeProcessor, ) +from browser_use.exceptions import LLMException from browser_use.telemetry.service import ProductTelemetry from browser_use.telemetry.views import ( AgentEndTelemetryEvent, AgentRunTelemetryEvent, AgentStepTelemetryEvent, ) -from browser_use.utils import time_execution_async, time_execution_sync +from browser_use.utils import check_env_variables, 
time_execution_async, time_execution_sync load_dotenv() logger = logging.getLogger(__name__) +SKIP_LLM_API_KEY_VERIFICATION = os.environ.get('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[0] in 'ty1' + def log_response(response: AgentOutput) -> None: """Utility function to log the model's response.""" @@ -76,6 +84,8 @@ def log_response(response: AgentOutput) -> None: Context = TypeVar('Context') +AgentHookFunc = Callable[['Agent'], None] + class Agent(Generic[Context]): @time_execution_sync('--init (agent)') @@ -91,8 +101,16 @@ def __init__( sensitive_data: Optional[Dict[str, str]] = None, initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None, # Cloud Callbacks - register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None, - register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None, + register_new_step_callback: Union[ + Callable[['BrowserState', 'AgentOutput', int], None], # Sync callback + Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]], # Async callback + None, + ] = None, + register_done_callback: Union[ + Callable[['AgentHistoryList'], Awaitable[None]], # Async Callback + Callable[['AgentHistoryList'], None], # Sync Callback + None, + ] = None, register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None, # Agent settings use_vision: bool = True, @@ -125,11 +143,16 @@ def __init__( page_extraction_llm: Optional[BaseChatModel] = None, planner_llm: Optional[BaseChatModel] = None, planner_interval: int = 1, # Run planner every N steps + is_planner_reasoning: bool = False, # Inject state injected_agent_state: Optional[AgentState] = None, # context: Context | None = None, captcha_solver: Optional[CaptchaSolverProtocol] = None, + # Memory settings + enable_memory: bool = True, + memory_interval: int = 10, + memory_config: Optional[dict] = None, ): if page_extraction_llm is None: page_extraction_llm = llm @@ -160,6 +183,10 @@ def __init__( page_extraction_llm=page_extraction_llm, planner_llm=planner_llm, planner_interval=planner_interval, + is_planner_reasoning=is_planner_reasoning, + enable_memory=enable_memory, + memory_interval=memory_interval, + memory_config=memory_config, ) # Initialize state @@ -172,18 +199,33 @@ def __init__( # Model setup self._set_model_names() + logger.info( + f'🧠 Starting an agent with main_model={self.model_name}, planner_model={self.planner_model_name}, ' + f'extraction_model={self.settings.page_extraction_llm.model_name if hasattr(self.settings.page_extraction_llm, "model_name") else None}' + ) + + # LLM API connection setup + llm_api_env_vars = REQUIRED_LLM_API_ENV_VARS.get(self.llm.__class__.__name__, []) + if llm_api_env_vars and not check_env_variables(llm_api_env_vars): + logger.error(f'Environment variables not set for {self.llm.__class__.__name__}') + raise ValueError('Environment variables not set') - # for models without tool calling, add available actions to context - self.available_actions = self.controller.registry.get_prompt_description() + # Start non-blocking LLM connection verification + self.llm._verified_api_keys = self._verify_llm_connection(self.llm) + + # Initialize available actions for system prompt (only non-filtered actions) + # These will be used for the system prompt to maintain caching + self.unfiltered_actions = self.controller.registry.get_prompt_description() self.tool_calling_method = self._set_tool_calling_method() self.settings.message_context = self._set_message_context() # Initialize 
message manager with state + # Initial system prompt with all actions - will be updated during each step self._message_manager = MessageManager( task=task, system_message=SystemPrompt( - action_description=self.available_actions, + action_description=self.unfiltered_actions, max_actions_per_step=self.settings.max_actions_per_step, override_system_message=override_system_message, extend_system_message=extend_system_message, @@ -198,17 +240,29 @@ def __init__( state=self.state.message_manager_state, ) + if self.settings.enable_memory: + memory_settings = MemorySettings( + agent_id=self.state.agent_id, + interval=self.settings.memory_interval, + config=self.settings.memory_config, + ) + + # Initialize memory + self.memory = Memory( + message_manager=self._message_manager, + llm=self.llm, + settings=memory_settings, + ) + else: + self.memory = None + # Browser setup self.injected_browser = browser is not None self.injected_browser_context = browser_context is not None - self.browser = browser if browser is not None else (None if browser_context else Browser()) - if browser_context: - self.browser_context = browser_context - elif self.browser: - self.browser_context = BrowserContext(browser=self.browser, config=self.browser.config.new_context_config) - else: - self.browser = Browser() - self.browser_context = BrowserContext(browser=self.browser) + self.browser = browser or Browser() + self.browser_context = browser_context or BrowserContext( + browser=self.browser, config=self.browser.config.new_context_config + ) # Callbacks self.register_new_step_callback = register_new_step_callback @@ -229,10 +283,11 @@ def __init__( def _set_message_context(self) -> str | None: if self.tool_calling_method == 'raw': + # For raw tool calling, only include actions with no filters initially if self.settings.message_context: - self.settings.message_context += f'\n\nAvailable actions: {self.available_actions}' + self.settings.message_context += f'\n\nAvailable actions: {self.unfiltered_actions}' else: - self.settings.message_context = f'Available actions: {self.available_actions}' + self.settings.message_context = f'Available actions: {self.unfiltered_actions}' return self.settings.message_context def _set_browser_use_version_and_source(self) -> None: @@ -287,6 +342,7 @@ def _set_model_names(self) -> None: def _setup_action_models(self) -> None: """Setup dynamic action models from controller's registry""" + # Initially only include actions with no filters self.ActionModel = self.controller.registry.create_action_model() # Create output model with the dynamic actions self.AgentOutput = AgentOutput.type_with_custom_actions(self.ActionModel) @@ -322,7 +378,7 @@ async def _raise_if_stopped_or_paused(self) -> None: raise InterruptedError if self.state.stopped or self.state.paused: - logger.debug('Agent paused after getting state') + # logger.debug('Agent paused after getting state') raise InterruptedError # @observe(name='agent.step', ignore_output=True, ignore_input=True) @@ -338,9 +394,42 @@ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None: try: state = await self.browser_context.get_state() + active_page = await self.browser_context.get_current_page() + + # generate procedural memory if needed + if self.settings.enable_memory and self.memory and self.state.n_steps % self.settings.memory_interval == 0: + self.memory.create_procedural_memory(self.state.n_steps) await self._raise_if_stopped_or_paused() + # Update action models with page-specific actions + await 
self._update_action_models_for_page(active_page) + + # Get page-specific filtered actions + page_filtered_actions = self.controller.registry.get_prompt_description(active_page) + + # If there are page-specific actions, add them as a special message for this step only + if page_filtered_actions: + page_action_message = f'For this page, these additional actions are available:\n{page_filtered_actions}' + self._message_manager._add_message_with_tokens(HumanMessage(content=page_action_message)) + + # If using raw tool calling method, we need to update the message context with new actions + if self.tool_calling_method == 'raw': + # For raw tool calling, get all non-filtered actions plus the page-filtered ones + all_unfiltered_actions = self.controller.registry.get_prompt_description() + all_actions = all_unfiltered_actions + if page_filtered_actions: + all_actions += '\n' + page_filtered_actions + + context_lines = self._message_manager.settings.message_context.split('\n') + non_action_lines = [line for line in context_lines if not line.startswith('Available actions:')] + updated_context = '\n'.join(non_action_lines) + if updated_context: + updated_context += f'\n\nAvailable actions: {all_actions}' + else: + updated_context = f'Available actions: {all_actions}' + self._message_manager.settings.message_context = updated_context + self._message_manager.add_state_message(state, self.state.last_result, step_info, self.settings.use_vision) # Run planner at specified intervals if planner is configured @@ -371,20 +460,35 @@ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None: try: model_output = await self.get_next_action(input_messages) + # Check again for paused/stopped state after getting model output + # This is needed in case Ctrl+C was pressed during the get_next_action call + await self._raise_if_stopped_or_paused() + self.state.n_steps += 1 if self.register_new_step_callback: - await self.register_new_step_callback(state, model_output, self.state.n_steps) - + if inspect.iscoroutinefunction(self.register_new_step_callback): + await self.register_new_step_callback(state, model_output, self.state.n_steps) + else: + self.register_new_step_callback(state, model_output, self.state.n_steps) if self.settings.save_conversation_path: target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt' save_conversation(input_messages, model_output, target, self.settings.save_conversation_path_encoding) self._message_manager._remove_last_state_message() # we dont want the whole state in the chat history + # check again if Ctrl+C was pressed before we commit the output to history await self._raise_if_stopped_or_paused() self._message_manager.add_model_output(model_output) + except asyncio.CancelledError: + # Task was cancelled due to Ctrl+C + self._message_manager._remove_last_state_message() + raise InterruptedError('Model query cancelled by user') + except InterruptedError: + # Agent was paused during get_next_action + self._message_manager._remove_last_state_message() + raise # Re-raise to be caught by the outer try/except except Exception as e: # model call failed, remove last state message from history self._message_manager._remove_last_state_message() @@ -400,13 +504,18 @@ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None: self.state.consecutive_failures = 0 except InterruptedError: - logger.debug('Agent paused') + # logger.debug('Agent paused') self.state.last_result = [ ActionResult( - error='The agent was paused - now continuing actions might need 
to be repeated', include_in_memory=True + error='The agent was paused mid-step - the last action might need to be repeated', include_in_memory=False ) ] return + except asyncio.CancelledError: + # Directly handle the case where the step is cancelled at a higher level + # logger.debug('Task cancelled - agent was paused with Ctrl+C') + self.state.last_result = [ActionResult(error='The agent was paused with Ctrl+C', include_in_memory=False)] + raise InterruptedError('Step cancelled by user') except Exception as e: result = await self._handle_step_error(e) self.state.last_result = result @@ -441,6 +550,11 @@ async def _handle_step_error(self, error: Exception) -> list[ActionResult]: include_trace = logger.isEnabledFor(logging.DEBUG) error_msg = AgentError.format_error(error, include_trace=include_trace) prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n ' + self.state.consecutive_failures += 1 + + if 'Browser closed' in error_msg: + logger.error('❌ Browser is closed or disconnected, unable to proceed') + return [ActionResult(error='Browser closed or disconnected, unable to proceed', include_in_memory=False)] if isinstance(error, (ValidationError, ValueError)): logger.error(f'{prefix}{error_msg}') @@ -455,18 +569,23 @@ async def _handle_step_error(self, error: Exception) -> list[ActionResult]: # give model a hint how output should look like error_msg += '\n\nReturn a valid JSON object with the required fields.' - self.state.consecutive_failures += 1 else: + from anthropic import RateLimitError as AnthropicRateLimitError from google.api_core.exceptions import ResourceExhausted from openai import RateLimitError - if isinstance(error, RateLimitError) or isinstance(error, ResourceExhausted): + # Define a tuple of rate limit error types for easier maintenance + RATE_LIMIT_ERRORS = ( + RateLimitError, # OpenAI + ResourceExhausted, # Google + AnthropicRateLimitError, # Anthropic + ) + + if isinstance(error, RATE_LIMIT_ERRORS): logger.warning(f'{prefix}{error_msg}') await asyncio.sleep(self.settings.retry_delay) - self.state.consecutive_failures += 1 else: logger.error(f'{prefix}{error_msg}') - self.state.consecutive_failures += 1 return [ActionResult(error=error_msg, include_in_memory=True)] @@ -520,33 +639,80 @@ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutpu input_messages = self._convert_input_messages(input_messages) if self.tool_calling_method == 'raw': - output = self.llm.invoke(input_messages) + logger.debug(f'Using {self.tool_calling_method} for {self.chat_model_library}') + try: + output = self.llm.invoke(input_messages) + response = {'raw': output, 'parsed': None} + except Exception as e: + logger.error(f'Failed to invoke model: {str(e)}') + raise LLMException(401, 'LLM API call failed') from e # TODO: currently invoke does not return reasoning_content, we should override invoke output.content = self._remove_think_tags(str(output.content)) try: parsed_json = extract_json_from_model_output(output.content) parsed = self.AgentOutput(**parsed_json) + response['parsed'] = parsed except (ValueError, ValidationError) as e: logger.warning(f'Failed to parse model output: {output} {str(e)}') raise ValueError('Could not parse response.') elif self.tool_calling_method is None: structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True) - response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore - parsed: AgentOutput | None = response['parsed'] + try: + response: 
dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore + parsed: AgentOutput | None = response['parsed'] + + except Exception as e: + logger.error(f'Failed to invoke model: {str(e)}') + raise LLMException(401, 'LLM API call failed') from e + else: + logger.debug(f'Using {self.tool_calling_method} for {self.chat_model_library}') structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True, method=self.tool_calling_method) response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore - parsed: AgentOutput | None = response['parsed'] - if parsed is None: - raise ValueError('Could not parse response.') + # Handle tool call responses + if response.get('parsing_error') and 'raw' in response: + raw_msg = response['raw'] + if hasattr(raw_msg, 'tool_calls') and raw_msg.tool_calls: + # Convert tool calls to AgentOutput format + + tool_call = raw_msg.tool_calls[0] # Take first tool call + + # Create current state + tool_call_name = tool_call['name'] + tool_call_args = tool_call['args'] + + current_state = { + 'page_summary': 'Processing tool call', + 'evaluation_previous_goal': 'Executing action', + 'memory': 'Using tool call', + 'next_goal': f'Execute {tool_call_name}', + } + + # Create action from tool call + action = {tool_call_name: tool_call_args} + + parsed = self.AgentOutput(current_state=current_state, action=[self.ActionModel(**action)]) + else: + parsed = None + else: + parsed = response['parsed'] + + if not parsed: + try: + parsed_json = extract_json_from_model_output(response['raw'].content) + parsed = self.AgentOutput(**parsed_json) + except Exception as e: + logger.warning(f'Failed to parse model output: {response["raw"].content} {str(e)}') + raise ValueError('Could not parse response.') # cut the number of actions to max_actions_per_step if needed if len(parsed.action) > self.settings.max_actions_per_step: parsed.action = parsed.action[: self.settings.max_actions_per_step] - log_response(parsed) + if not (hasattr(self.state, 'paused') and (self.state.paused or self.state.stopped)): + log_response(parsed) return parsed @@ -582,16 +748,38 @@ async def take_step(self) -> tuple[bool, bool]: await self.log_completion() if self.register_done_callback: - await self.register_done_callback(self.state.history) - + if inspect.iscoroutinefunction(self.register_done_callback): + await self.register_done_callback(self.state.history) + else: + self.register_done_callback(self.state.history) return True, True return False, False # @observe(name='agent.run', ignore_output=True) @time_execution_async('--run (agent)') - async def run(self, max_steps: int = 100) -> AgentHistoryList: + async def run( + self, max_steps: int = 100, on_step_start: AgentHookFunc | None = None, on_step_end: AgentHookFunc | None = None + ) -> AgentHistoryList: """Execute the task with maximum number of steps""" + + loop = asyncio.get_event_loop() + + # Set up the Ctrl+C signal handler with callbacks specific to this agent + from browser_use.utils import SignalHandler + + signal_handler = SignalHandler( + loop=loop, + pause_callback=self.pause, + resume_callback=self.resume, + custom_exit_callback=None, # No special cleanup needed on forced exit + exit_on_second_int=True, + ) + signal_handler.register() + + # Start non-blocking LLM connection verification + assert self.llm._verified_api_keys, 'Failed to verify LLM API keys' + try: self._log_agent_run() @@ -601,6 +789,11 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList: self.state.last_result = 
result for step in range(max_steps): + # Check if waiting for user input after Ctrl+C + if self.state.paused: + signal_handler.wait_for_resume() + signal_handler.reset() + # Check if we should stop due to too many failures if self.state.consecutive_failures >= self.settings.max_failures: logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures') @@ -616,9 +809,15 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList: if self.state.stopped: # Allow stopping while paused break + if on_step_start is not None: + await on_step_start(self) + step_info = AgentStepInfo(step_number=step, max_steps=max_steps) await self.step(step_info) + if on_step_end is not None: + await on_step_end(self) + if self.state.history.is_done(): if self.settings.validate_output and step < max_steps - 1: if not await self._validate_output(): @@ -630,7 +829,16 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList: logger.info('❌ Failed to complete task in maximum steps') return self.state.history + + except KeyboardInterrupt: + # Already handled by our signal handler, but catch any direct KeyboardInterrupt as well + logger.info('Got KeyboardInterrupt during execution, returning current history') + return self.state.history + finally: + # Unregister signal handlers before cleanup + signal_handler.unregister() + self.telemetry.capture( AgentEndTelemetryEvent( agent_id=self.state.agent_id, @@ -644,11 +852,7 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList: ) ) - if not self.injected_browser_context: - await self.browser_context.close() - - if not self.injected_browser and self.browser: - await self.browser.close() + await self.close() if self.settings.generate_gif: output_path: str = 'agent_history.gif' @@ -675,7 +879,20 @@ async def multi_act( for i, action in enumerate(actions): if action.get_index() is not None and i != 0: new_state = await self.browser_context.get_state() - new_path_hashes = set(e.hash.branch_path_hash for e in new_state.selector_map.values()) + new_selector_map = new_state.selector_map + + # Detect index change after previous action + orig_target = cached_selector_map.get(action.get_index()) # type: ignore + orig_target_hash = orig_target.hash.branch_path_hash if orig_target else None + new_target = new_selector_map.get(action.get_index()) # type: ignore + new_target_hash = new_target.hash.branch_path_hash if new_target else None + if orig_target_hash != new_target_hash: + msg = f'Element index changed after action {i} / {len(actions)}, because page changed.' 
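Agent.run() gains optional on_step_start / on_step_end hooks earlier in this diff (see AgentHookFunc); each hook is awaited with the agent itself. A minimal sketch of wiring them up, where the task and the hook body are illustrative:

```python
import asyncio

from langchain_openai import ChatOpenAI

from browser_use import Agent


async def record_activity(agent: Agent) -> None:
    # Called around every step; browser state and history are available on the agent.
    page = await agent.browser_context.get_current_page()
    print(f'step {agent.state.n_steps}: {page.url}')


async def main():
    agent = Agent(task='Check the top story on Hacker News', llm=ChatOpenAI(model='gpt-4o'))
    await agent.run(max_steps=10, on_step_start=record_activity, on_step_end=record_activity)


asyncio.run(main())
```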
+ logger.info(msg) + results.append(ActionResult(extracted_content=msg, include_in_memory=True)) + break + + new_path_hashes = set(e.hash.branch_path_hash for e in new_selector_map.values()) if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes): # next action requires index but there are new elements on the page msg = f'Something new appeared after action {i} / {len(actions)}' @@ -683,25 +900,34 @@ async def multi_act( results.append(ActionResult(extracted_content=msg, include_in_memory=True)) break - await self._raise_if_stopped_or_paused() + try: + await self._raise_if_stopped_or_paused() - result = await self.controller.act( - action, - self.browser_context, - self.settings.page_extraction_llm, - self.sensitive_data, - self.settings.available_file_paths, - context=self.context, - ) + result = await self.controller.act( + action, + self.browser_context, + self.settings.page_extraction_llm, + self.sensitive_data, + self.settings.available_file_paths, + context=self.context, + ) - results.append(result) + results.append(result) - logger.debug(f'Executed action {i + 1} / {len(actions)}') - if results[-1].is_done or results[-1].error or i == len(actions) - 1: - break + logger.debug(f'Executed action {i + 1} / {len(actions)}') + if results[-1].is_done or results[-1].error or i == len(actions) - 1: + break - await asyncio.sleep(self.browser_context.config.wait_between_actions) - # hash all elements. if it is a subset of cached_state its fine - else break (new elements on page) + await asyncio.sleep(self.browser_context.config.wait_between_actions) + # hash all elements. if it is a subset of cached_state its fine - else break (new elements on page) + + except asyncio.CancelledError: + # Gracefully handle task cancellation + logger.info(f'Action {i + 1} was cancelled due to Ctrl+C') + if not results: + # Add a result for the cancelled action + results.append(ActionResult(error='The action was cancelled due to Ctrl+C', include_in_memory=True)) + raise InterruptedError('Action cancelled by user') return results @@ -759,7 +985,10 @@ async def log_completion(self) -> None: logger.info('❌ Unfinished') if self.register_done_callback: - await self.register_done_callback(self.state.history) + if inspect.iscoroutinefunction(self.register_done_callback): + await self.register_done_callback(self.state.history) + else: + self.register_done_callback(self.state.history) async def rerun_history( self, @@ -889,14 +1118,29 @@ def save_history(self, file_path: Optional[str | Path] = None) -> None: def pause(self) -> None: """Pause the agent before the next step""" - logger.info('🔄 pausing Agent ') + print('\n\n⏸️ Got Ctrl+C, paused the agent and left the browser open.') self.state.paused = True + # The signal handler will handle the asyncio pause logic for us + # No need to duplicate the code here + def resume(self) -> None: """Resume the agent""" - logger.info('▶️ Agent resuming') + print('----------------------------------------------------------------------') + print('▶️ Got Enter, resuming agent execution where it left off...\n') self.state.paused = False + # The signal handler should have already reset the flags + # through its reset() method when called from run() + + # playwright browser is always immediately killed by the first Ctrl+C (no way to stop that) + # so we need to restart the browser if user wants to continue + if self.browser: + logger.info('🌎 Restarting/reconnecting to browser...') + loop = asyncio.get_event_loop() + loop.create_task(self.browser._init()) + 
loop.create_task(asyncio.sleep(5)) + def stop(self) -> None: """Stop the agent""" logger.info('⏹️ Agent stopping') @@ -924,15 +1168,64 @@ def _convert_initial_actions(self, actions: List[Dict[str, Dict[str, Any]]]) -> return converted_actions + async def _verify_llm_connection(self, llm: BaseChatModel) -> bool: + """ + Verify that the LLM API keys are working properly by sending a simple test prompt + and checking that the response contains the expected answer. + """ + if getattr(llm, '_verified_api_keys', None) is True or SKIP_LLM_API_KEY_VERIFICATION: + # If the LLM API keys have already been verified during a previous run, skip the test + return True + + test_prompt = 'What is the capital of France? Respond with a single word.' + test_answer = 'paris' + required_keys = REQUIRED_LLM_API_ENV_VARS.get(llm.__class__.__name__, ['OPENAI_API_KEY']) + try: + response = await llm.ainvoke([HumanMessage(content=test_prompt)]) + response_text = str(response.content).lower() + + if test_answer in response_text: + logger.debug( + f'🧠 LLM API keys {", ".join(required_keys)} verified, {llm.__class__.__name__} model is connected and responding correctly.' + ) + llm._verified_api_keys = True + return True + else: + logger.debug( + '❌ Got bad LLM response to basic sanity check question: %s EXPECTING: %s GOT: %s', + test_prompt, + test_answer, + response, + ) + raise Exception('LLM responded to a simple test question incorrectly') + except Exception as e: + logger.error( + f'\n\n❌ LLM {llm.__class__.__name__} connection test failed. Check that {", ".join(required_keys)} is set correctly in .env and that the LLM API account has sufficient funding.\n' + ) + raise Exception(f'LLM API connection test failed: {e}') from e + return False + async def _run_planner(self) -> Optional[str]: """Run the planner to analyze state and suggest next steps""" # Skip planning if no planner_llm is set if not self.settings.planner_llm: return None - # Create planner message history using full message history + # Get current state to filter actions by page + page = await self.browser_context.get_current_page() + + # Get all standard actions (no filter) and page-specific actions + standard_actions = self.controller.registry.get_prompt_description() # No page = system prompt actions + page_actions = self.controller.registry.get_prompt_description(page) # Page-specific actions + + # Combine both for the planner + all_actions = standard_actions + if page_actions: + all_actions += '\n' + page_actions + + # Create planner message history using full message history with all available actions planner_messages = [ - PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(), + PlannerPrompt(all_actions).get_system_message(self.settings.is_planner_reasoning), *self._message_manager.get_messages()[1:], # Use full message history except the first ] @@ -954,10 +1247,17 @@ async def _run_planner(self) -> Optional[str]: planner_messages = convert_input_messages(planner_messages, self.planner_model_name) # Get planner output - response = await self.settings.planner_llm.ainvoke(planner_messages) + try: + response = await self.settings.planner_llm.ainvoke(planner_messages) + except Exception as e: + logger.error(f'Failed to invoke planner: {str(e)}') + raise LLMException(401, 'LLM API call failed') from e + plan = str(response.content) # if deepseek-reasoner, remove think tags - if self.planner_model_name and ('deepseek-r1' in self.planner_model_name or 'deepseek-reasoner' in self.planner_model_name): + if 
self.planner_model_name and ( + 'deepseek-r1' in self.planner_model_name or 'deepseek-reasoner' in self.planner_model_name + ): plan = self._remove_think_tags(plan) try: plan_json = json.loads(plan) @@ -973,3 +1273,29 @@ async def _run_planner(self) -> Optional[str]: @property def message_manager(self) -> MessageManager: return self._message_manager + + async def close(self): + """Close all resources""" + try: + # First close browser resources + if self.browser_context and not self.injected_browser_context: + await self.browser_context.close() + if self.browser and not self.injected_browser: + await self.browser.close() + + # Force garbage collection + gc.collect() + + except Exception as e: + logger.error(f'Error during cleanup: {e}') + + async def _update_action_models_for_page(self, page) -> None: + """Update action models with page-specific actions""" + # Create new action model with current page's filtered actions + self.ActionModel = self.controller.registry.create_action_model(page=page) + # Update output model with the new actions + self.AgentOutput = AgentOutput.type_with_custom_actions(self.ActionModel) + + # Update done action model too + self.DoneActionModel = self.controller.registry.create_action_model(include_actions=['done'], page=page) + self.DoneAgentOutput = AgentOutput.type_with_custom_actions(self.DoneActionModel) diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md index e70ae4952f..5f7eff0a1f 100644 --- a/browser_use/agent/system_prompt.md +++ b/browser_use/agent/system_prompt.md @@ -48,11 +48,11 @@ Common action sequences: 5. TASK COMPLETION: - Use the done action as the last action as soon as the ultimate task is complete -- Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps. -- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false! +- Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps. +- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false! - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step. - Don't hallucinate actions -- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. +- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. 6. VISUAL CONTEXT: - When an image is provided, use it to understand the page layout @@ -62,8 +62,9 @@ Common action sequences: - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. 8. Long tasks: -- Keep track of the status and subresults in the memory. 
+- Keep track of the status and subresults in the memory. +- You are provided with procedural memory summaries that condense previous task history (every N steps). Use these summaries to maintain context about completed actions, current progress, and next steps. The summaries appear in chronological order and contain key information about navigation history, findings, errors encountered, and current state. Refer to these summaries to avoid repeating actions and to ensure consistent progress toward the task goal. 9. Extraction: - If your task is to find information - call extract_content on the specific pages to get and store the information. -Your responses must be always JSON with the specified format. \ No newline at end of file +Your responses must be always JSON with the specified format. diff --git a/browser_use/agent/tests.py b/browser_use/agent/tests.py index 15c47357da..a0d084cc7c 100644 --- a/browser_use/agent/tests.py +++ b/browser_use/agent/tests.py @@ -147,7 +147,7 @@ def test_final_result(sample_history: AgentHistoryList): def test_is_done(sample_history: AgentHistoryList): - assert sample_history.is_done() == True + assert sample_history.is_done() is True def test_urls(sample_history: AgentHistoryList): @@ -182,7 +182,7 @@ def test_empty_history(): empty_history = AgentHistoryList(history=[]) assert empty_history.last_action() is None assert empty_history.final_result() is None - assert empty_history.is_done() == False + assert empty_history.is_done() is False assert len(empty_history.urls()) == 0 diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index 6d8249727b..d46e42df8f 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -22,6 +22,15 @@ from browser_use.dom.views import SelectorMap ToolCallingMethod = Literal['function_calling', 'json_mode', 'raw', 'auto'] +REQUIRED_LLM_API_ENV_VARS = { + 'ChatOpenAI': ['OPENAI_API_KEY'], + 'AzureOpenAI': ['AZURE_ENDPOINT', 'AZURE_OPENAI_API_KEY'], + 'ChatBedrockConverse': ['ANTHROPIC_API_KEY'], + 'ChatAnthropic': ['ANTHROPIC_API_KEY'], + 'ChatGoogleGenerativeAI': ['GEMINI_API_KEY'], + 'ChatDeepSeek': ['DEEPSEEK_API_KEY'], + 'ChatOllama': [], +} class AgentSettings(BaseModel): @@ -58,6 +67,12 @@ class AgentSettings(BaseModel): page_extraction_llm: Optional[BaseChatModel] = None planner_llm: Optional[BaseChatModel] = None planner_interval: int = 1 # Run planner every N steps + is_planner_reasoning: bool = False # type: ignore + + # Procedural memory settings + enable_memory: bool = True + memory_interval: int = 10 + memory_config: Optional[dict] = None class AgentState(BaseModel): @@ -166,7 +181,7 @@ def get_interacted_element(model_output: AgentOutput, selector_map: SelectorMap) elements = [] for action in model_output.action: index = action.get_index() - if index and index in selector_map: + if index is not None and index in selector_map: el: DOMElementNode = selector_map[index] elements.append(HistoryTreeProcessor.convert_dom_element_to_history_element(el)) else: diff --git a/browser_use/browser/browser.py b/browser_use/browser/browser.py index 9278ac34c9..80702e2b2c 100644 --- a/browser_use/browser/browser.py +++ b/browser_use/browser/browser.py @@ -5,34 +5,69 @@ import asyncio import gc import logging -from dataclasses import dataclass, field - -from playwright._impl._api_structures import ProxySettings +import os +import socket +import subprocess +from typing import Literal + +import psutil +import requests +from dotenv import load_dotenv from playwright.async_api import Browser as 
PlaywrightBrowser from playwright.async_api import ( Playwright, async_playwright, ) +from pydantic import AliasChoices, BaseModel, ConfigDict, Field + +load_dotenv() +from browser_use.browser.chrome import ( + CHROME_ARGS, + CHROME_DETERMINISTIC_RENDERING_ARGS, + CHROME_DISABLE_SECURITY_ARGS, + CHROME_DOCKER_ARGS, + CHROME_HEADLESS_ARGS, +) from browser_use.browser.context import BrowserContext, BrowserContextConfig +from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments from browser_use.utils import time_execution_async logger = logging.getLogger(__name__) +IN_DOCKER = os.environ.get('IN_DOCKER', 'false').lower()[0] in 'ty1' + + +class ProxySettings(BaseModel): + """the same as playwright.sync_api.ProxySettings, but now as a Pydantic BaseModel so pydantic can validate it""" + + server: str + bypass: str | None = None + username: str | None = None + password: str | None = None + + model_config = ConfigDict(populate_by_name=True, from_attributes=True) + + # Support dict-like behavior for compatibility with Playwright's ProxySettings + def __getitem__(self, key): + return getattr(self, key) -@dataclass -class BrowserConfig: + def get(self, key, default=None): + return getattr(self, key, default) + + +class BrowserConfig(BaseModel): r""" Configuration for the Browser. Default values: - headless: True - Whether to run browser in headless mode + headless: False + Whether to run browser in headless mode (not recommended) - disable_security: True - Disable browser security features + disable_security: False + Disable browser security features (required for cross-origin iframe support) - extra_chromium_args: [] + extra_browser_args: [] Extra arguments to pass to the browser wss_url: None @@ -41,22 +76,40 @@ class BrowserConfig: cdp_url: None Connect to a browser instance via CDP - chrome_instance_path: None - Path to a Chrome instance to use to connect to your normal browser + browser_binary_path: None + Path to a Browser instance to use to connect to your normal browser e.g. 
'/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome' + + keep_alive: False + Keep the browser alive after the agent has finished running + + deterministic_rendering: False + Enable deterministic rendering (makes GPU/font rendering consistent across different OS's and docker) """ - headless: bool = False - disable_security: bool = True - extra_chromium_args: list[str] = field(default_factory=list) - chrome_instance_path: str | None = None + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='ignore', + populate_by_name=True, + from_attributes=True, + validate_assignment=True, + revalidate_instances='subclass-instances', + ) + wss_url: str | None = None cdp_url: str | None = None - proxy: ProxySettings | None = field(default=None) - new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig) + browser_class: Literal['chromium', 'firefox', 'webkit'] = 'chromium' + browser_binary_path: str | None = Field(default=None, alias=AliasChoices('browser_instance_path', 'chrome_instance_path')) + extra_browser_args: list[str] = Field(default_factory=list) + + headless: bool = False + disable_security: bool = False # disable_security=True is dangerous as any malicious URL visited could embed an iframe for the user's bank, and use their cookies to steal money + deterministic_rendering: bool = False + keep_alive: bool = Field(default=False, alias='_force_keep_browser_alive') # used to be called _force_keep_browser_alive - _force_keep_browser_alive: bool = False + proxy: ProxySettings | None = None + new_context_config: BrowserContextConfig = Field(default_factory=BrowserContextConfig) # @singleton: TODO - think about id singleton makes sense here @@ -65,30 +118,22 @@ class Browser: """ Playwright browser on steroids. - This is persistant browser factory that can spawn multiple browser contexts. + This is persistent browser factory that can spawn multiple browser contexts. It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise). """ def __init__( self, - config: BrowserConfig = BrowserConfig(), + config: BrowserConfig | None = None, ): - logger.debug('Initializing new browser') - self.config = config + logger.debug('🌎 Initializing new browser') + self.config = config or BrowserConfig() self.playwright: Playwright | None = None self.playwright_browser: PlaywrightBrowser | None = None - self.disable_security_args = [] - if self.config.disable_security: - self.disable_security_args = [ - '--disable-web-security', - '--disable-site-isolation-trials', - '--disable-features=IsolateOrigins,site-per-process', - ] - - async def new_context(self, config: BrowserContextConfig = BrowserContextConfig()) -> BrowserContext: + async def new_context(self, config: BrowserContextConfig | None = None) -> BrowserContext: """Create a browser context""" - return BrowserContext(config=config, browser=self) + return BrowserContext(config=config or self.config, browser=self) async def get_playwright_browser(self) -> PlaywrightBrowser: """Get a browser context""" @@ -108,52 +153,70 @@ async def _init(self): return self.playwright_browser - async def _setup_cdp(self, playwright: Playwright) -> PlaywrightBrowser: - """Sets up and returns a Playwright Browser instance with anti-detection measures.""" + async def _setup_remote_cdp_browser(self, playwright: Playwright) -> PlaywrightBrowser: + """Sets up and returns a Playwright Browser instance with anti-detection measures. 
Firefox has no longer CDP support.""" + if 'firefox' in (self.config.browser_binary_path or '').lower(): + raise ValueError( + 'CDP has been deprecated for firefox, check: https://fxdx.dev/deprecating-cdp-support-in-firefox-embracing-the-future-with-webdriver-bidi/' + ) if not self.config.cdp_url: raise ValueError('CDP URL is required') - logger.info(f'Connecting to remote browser via CDP {self.config.cdp_url}') - browser = await playwright.chromium.connect_over_cdp(self.config.cdp_url) + logger.info(f'🔌 Connecting to remote browser via CDP {self.config.cdp_url}') + browser_class = getattr(playwright, self.config.browser_class) + browser = await browser_class.connect_over_cdp(self.config.cdp_url) return browser - async def _setup_wss(self, playwright: Playwright) -> PlaywrightBrowser: + async def _setup_remote_wss_browser(self, playwright: Playwright) -> PlaywrightBrowser: """Sets up and returns a Playwright Browser instance with anti-detection measures.""" if not self.config.wss_url: raise ValueError('WSS URL is required') - logger.info(f'Connecting to remote browser via WSS {self.config.wss_url}') - browser = await playwright.chromium.connect(self.config.wss_url) + logger.info(f'🔌 Connecting to remote browser via WSS {self.config.wss_url}') + browser_class = getattr(playwright, self.config.browser_class) + browser = await browser_class.connect(self.config.wss_url) return browser - async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser: + async def _setup_user_provided_browser(self, playwright: Playwright) -> PlaywrightBrowser: """Sets up and returns a Playwright Browser instance with anti-detection measures.""" - if not self.config.chrome_instance_path: - raise ValueError('Chrome instance path is required') - import subprocess + if not self.config.browser_binary_path: + raise ValueError('A browser_binary_path is required') - import requests + assert self.config.browser_class == 'chromium', ( + 'browser_binary_path only supports chromium browsers (make sure browser_class=chromium)' + ) try: # Check if browser is already running response = requests.get('http://localhost:9222/json/version', timeout=2) if response.status_code == 200: - logger.info('Reusing existing Chrome instance') - browser = await playwright.chromium.connect_over_cdp( + logger.info('🔌 Reusing existing browser found running on http://localhost:9222') + browser_class = getattr(playwright, self.config.browser_class) + browser = await browser_class.connect_over_cdp( endpoint_url='http://localhost:9222', timeout=20000, # 20 second timeout for connection ) return browser except requests.ConnectionError: - logger.debug('No existing Chrome instance found, starting a new one') + logger.debug('🌎 No existing Chrome instance found, starting a new one') # Start a new Chrome instance - subprocess.Popen( - [ - self.config.chrome_instance_path, - '--remote-debugging-port=9222', - ] - + self.config.extra_chromium_args, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, + chrome_launch_cmd = [ + self.config.browser_binary_path, + *{ # remove duplicates (usually preserves the order, but not guaranteed) + *CHROME_ARGS, + *(CHROME_DOCKER_ARGS if IN_DOCKER else []), + *(CHROME_HEADLESS_ARGS if self.config.headless else []), + *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []), + *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []), + *self.config.extra_browser_args, + }, + ] + self._chrome_subprocess = psutil.Process( + subprocess.Popen( + 
chrome_launch_cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + shell=False, + ).pid ) # Attempt to connect again after starting a new instance @@ -168,76 +231,120 @@ async def _setup_browser_with_instance(self, playwright: Playwright) -> Playwrig # Attempt to connect again after starting a new instance try: - browser = await playwright.chromium.connect_over_cdp( + browser_class = getattr(playwright, self.config.browser_class) + browser = await browser_class.connect_over_cdp( endpoint_url='http://localhost:9222', timeout=20000, # 20 second timeout for connection ) return browser except Exception as e: - logger.error(f'Failed to start a new Chrome instance.: {str(e)}') + logger.error(f'❌ Failed to start a new Chrome instance: {str(e)}') raise RuntimeError( - ' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.' + 'To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.' ) - async def _setup_standard_browser(self, playwright: Playwright) -> PlaywrightBrowser: + async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser: """Sets up and returns a Playwright Browser instance with anti-detection measures.""" - browser = await playwright.chromium.launch( + assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers' + + if self.config.headless: + screen_size = {'width': 1920, 'height': 1080} + offset_x, offset_y = 0, 0 + else: + screen_size = get_screen_resolution() + offset_x, offset_y = get_window_adjustments() + + chrome_args = { + *CHROME_ARGS, + *(CHROME_DOCKER_ARGS if IN_DOCKER else []), + *(CHROME_HEADLESS_ARGS if self.config.headless else []), + *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []), + *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []), + f'--window-position={offset_x},{offset_y}', + f'--window-size={screen_size["width"]},{screen_size["height"]}', + *self.config.extra_browser_args, + } + + # check if port 9222 is already taken, if so remove the remote-debugging-port arg to prevent conflicts + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', 9222)) == 0: + chrome_args.remove('--remote-debugging-port=9222') + + browser_class = getattr(playwright, self.config.browser_class) + args = { + 'chromium': list(chrome_args), + 'firefox': [ + *{ + '-no-remote', + *self.config.extra_browser_args, + } + ], + 'webkit': [ + *{ + '--no-startup-window', + *self.config.extra_browser_args, + } + ], + } + + browser = await browser_class.launch( headless=self.config.headless, - args=[ - '--no-sandbox', - '--disable-blink-features=AutomationControlled', - '--disable-infobars', - '--disable-background-timer-throttling', - '--disable-popup-blocking', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-window-activation', - '--disable-focus-on-load', - '--no-first-run', - '--no-default-browser-check', - '--no-startup-window', - '--window-position=0,0', - # '--window-size=1280,1000', - ] - + self.disable_security_args - + self.config.extra_chromium_args, - proxy=self.config.proxy, + args=args[self.config.browser_class], + proxy=self.config.proxy.model_dump() if self.config.proxy else None, + handle_sigterm=False, + handle_sigint=False, ) - # convert to Browser return browser async def 
_setup_browser(self, playwright: Playwright) -> PlaywrightBrowser: """Sets up and returns a Playwright Browser instance with anti-detection measures.""" try: if self.config.cdp_url: - return await self._setup_cdp(playwright) + return await self._setup_remote_cdp_browser(playwright) if self.config.wss_url: - return await self._setup_wss(playwright) - elif self.config.chrome_instance_path: - return await self._setup_browser_with_instance(playwright) + return await self._setup_remote_wss_browser(playwright) + + if self.config.headless: + logger.warning('⚠️ Headless mode is not recommended. Many sites will detect and block all headless browsers.') + + if self.config.browser_binary_path: + return await self._setup_user_provided_browser(playwright) else: - return await self._setup_standard_browser(playwright) + return await self._setup_builtin_browser(playwright) except Exception as e: - logger.error(f'Failed to initialize Playwright browser: {str(e)}') + logger.error(f'Failed to initialize Playwright browser: {e}') raise async def close(self): """Close the browser instance""" - try: - if not self.config._force_keep_browser_alive: - if self.playwright_browser: - await self.playwright_browser.close() - del self.playwright_browser - if self.playwright: - await self.playwright.stop() - del self.playwright + if self.config.keep_alive: + return + try: + if self.playwright_browser: + await self.playwright_browser.close() + del self.playwright_browser + if self.playwright: + await self.playwright.stop() + del self.playwright + if chrome_proc := getattr(self, '_chrome_subprocess', None): + try: + # always kill all children processes, otherwise chrome leaves a bunch of zombie processes + for proc in chrome_proc.children(recursive=True): + proc.kill() + chrome_proc.kill() + except Exception as e: + logger.debug(f'Failed to terminate chrome subprocess: {e}') + + # Then cleanup httpx clients + await self.cleanup_httpx_clients() except Exception as e: logger.debug(f'Failed to close browser properly: {e}') + finally: self.playwright_browser = None self.playwright = None - + self._chrome_subprocess = None gc.collect() def __del__(self): @@ -251,3 +358,23 @@ def __del__(self): asyncio.run(self.close()) except Exception as e: logger.debug(f'Failed to cleanup browser in destructor: {e}') + + async def cleanup_httpx_clients(self): + """Cleanup all httpx clients""" + import gc + + import httpx + + # Force garbage collection to make sure all clients are in memory + gc.collect() + + # Get all httpx clients + clients = [obj for obj in gc.get_objects() if isinstance(obj, httpx.AsyncClient)] + + # Close all clients + for client in clients: + if not client.is_closed: + try: + await client.aclose() + except Exception as e: + logger.debug(f'Error closing httpx client: {e}') diff --git a/browser_use/browser/chrome.py b/browser_use/browser/chrome.py new file mode 100644 index 0000000000..a829fc4ed0 --- /dev/null +++ b/browser_use/browser/chrome.py @@ -0,0 +1,184 @@ +CHROME_DEFAULT_USER_AGENT = ( + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' +) +CHROME_EXTENSIONS = {} # coming in a separate PR +CHROME_EXTENSIONS_PATH = 'chrome_extensions' +CHROME_PROFILE_PATH = 'chrome_profile' +CHROME_PROFILE_USER = 'Default' +CHROME_DEBUG_PORT = 9222 +CHROME_DISABLED_COMPONENTS = [ + 'Translate', + 'AcceptCHFrame', + 'OptimizationHints', + 'ProcessPerSiteUpToMainFrameThreshold', + 'InterestFeedContentSuggestions', + 'CalculateNativeWinOcclusion', + 
'BackForwardCache', + 'HeavyAdPrivacyMitigations', + 'LazyFrameLoading', + 'ImprovedCookieControls', + 'PrivacySandboxSettings4', + 'AutofillServerCommunication', + 'CertificateTransparencyComponentUpdater', + 'DestroyProfileOnBrowserClose', + 'CrashReporting', + 'OverscrollHistoryNavigation', + 'InfiniteSessionRestore', + #'LockProfileCookieDatabase', # disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624 +] # it's always best to give each chrome instance its own exclusive copy of the user profile + + +CHROME_HEADLESS_ARGS = [ + '--headless=new', + '--test-type', + '--test-type=gpu', # https://github.com/puppeteer/puppeteer/issues/10516 + # '--enable-automation', # <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare +] + +CHROME_DOCKER_ARGS = [ + # Docker-specific options + # https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained + '--no-sandbox', # rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing + '--disable-gpu-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', # docker 75mb default shm size is not big enough, disabling just uses /tmp instead + '--no-xshm', + # dont try to disable (or install) dbus in docker, its not needed, chrome can work without dbus despite the errors +] + +CHROME_DISABLE_SECURITY_ARGS = [ + # DANGER: JS isolation security features (to allow easier tampering with pages during automation) + # chrome://net-internals + '--disable-web-security', # <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com) + '--disable-site-isolation-trials', + '--disable-features=IsolateOrigins,site-per-process', + # '--allow-file-access-from-files', # <- WARNING, dangerous, allows JS to read filesystem using file:// URLs + # DANGER: Disable HTTPS verification + '--allow-running-insecure-content', # Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect + '--ignore-certificate-errors', + '--ignore-ssl-errors', + '--ignore-certificate-errors-spki-list', + '--allow-insecure-localhost', +] + +# flags to make chrome behave more deterministically across different OS's +CHROME_DETERMINISTIC_RENDERING_ARGS = [ + '--deterministic-mode', + '--js-flags=--random-seed=1157259159', # make all JS random numbers deterministic by providing a seed + '--force-device-scale-factor=1', + '--hide-scrollbars', # hide scrollbars because otherwise they show up in screenshots + # GPU, canvas, text, and pdf rendering config + # chrome://gpu + '--enable-webgl', # enable web-gl graphics support + '--font-render-hinting=none', # make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;} + '--force-color-profile=srgb', # make rendering more deterministic by using consistent color profile, if browser looks weird, try: generic-rgb + '--disable-partial-raster', # make rendering more deterministic (TODO: verify if still needed) + '--disable-skia-runtime-opts', # make rendering more deterministic by avoiding Skia hot path runtime optimizations + '--disable-2d-canvas-clip-aa', # make rendering more deterministic by disabling antialiasing on 2d canvas clips + # '--disable-gpu', # falls back to more consistent software renderer across all OS's, especially helps linux text 
rendering look less weird + # // '--use-gl=swiftshader', <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw + # // '--disable-software-rasterizer', <- DO NOT USE, harmless, used in tandem with --disable-gpu + # // '--run-all-compositor-stages-before-draw', <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS) + # // '--disable-gl-drawing-for-tests', <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas) + # // '--blink-settings=imagesEnabled=false', <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading) + # Process management & performance tuning + # chrome://process-internals + '--disable-lazy-loading', # make rendering more deterministic by loading all content up-front instead of on-focus + '--disable-renderer-backgrounding', # dont throttle tab rendering based on focus/visibility + '--disable-background-networking', # dont throttle tab networking based on focus/visibility + '--disable-background-timer-throttling', # dont throttle tab timers based on focus/visibility + '--disable-backgrounding-occluded-windows', # dont throttle tab window based on focus/visibility + '--disable-ipc-flooding-protection', # dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail + '--disable-extensions-http-throttling', # dont throttle http traffic based on runtime heuristics + '--disable-field-trial-config', # disable shared field trial state between browser processes + '--disable-back-forward-cache', # disable browsing navigation cache +] + +CHROME_ARGS = [ + # Profile data dir setup + # chrome://profile-internals + # f'--user-data-dir={CHROME_PROFILE_PATH}', # managed by playwright arg instead + # f'--profile-directory={CHROME_PROFILE_USER}', + # '--password-store=basic', # use mock keychain instead of OS-provided keychain (we manage auth.json instead) + # '--use-mock-keychain', + '--disable-cookie-encryption', # we need to be able to write unencrypted cookies to save/load auth.json + '--disable-sync', # don't try to use Google account sync features while automation is active + # Extensions + # chrome://inspect/#extensions + # f'--load-extension={CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}', # not needed when using existing profile that already has extensions installed + # f'--allowlisted-extension-id={",".join(CHROME_EXTENSIONS.keys())}', + '--allow-legacy-extension-manifests', + '--allow-pre-commit-input', # allow JS mutations before page rendering is complete + '--disable-blink-features=AutomationControlled', # hide the signatures that announce browser is being remote-controlled + # f'--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4', # send all network traffic through a proxy https://2captcha.com/proxy + # f'--proxy-bypass-list=127.0.0.1', + # Browser window and viewport setup + # chrome://version + # f'--user-agent="{DEFAULT_USER_AGENT}"', + # f'--window-size={DEFAULT_VIEWPORT.width},{DEFAULT_VIEWPORT.height}', + # '--window-position=0,0', + # '--start-maximized', + '--install-autogenerated-theme=0,0,0', # black border makes it easier to see which chrome window is browser-use's + #'--virtual-time-budget=60000', # fast-forward all animations & timers by 60s, dont use this it's unfortunately buggy and breaks 
screenshot and PDF capture sometimes + #'--autoplay-policy=no-user-gesture-required', # auto-start videos so they trigger network requests + show up in outputs + #'--disable-gesture-requirement-for-media-playback', + #'--lang=en-US,en;q=0.9', + # IO: stdin/stdout, debug port config + # chrome://inspect + '--log-level=2', # 1=DEBUG 2=WARNING 3=ERROR + '--enable-logging=stderr', + # '--remote-debugging-address=127.0.0.1', <- never expose to non-localhost, would allow attacker to drive your browser from any machine + f'--remote-debugging-port={CHROME_DEBUG_PORT}', + '--enable-experimental-extension-apis', # add support for tab groups + '--disable-focus-on-load', # prevent browser from hijacking focus + '--disable-window-activation', + # '--in-process-gpu', <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS) + # '--disable-component-extensions-with-background-pages', # TODO: check this, disables chrome components that only run in background with no visible UI (could lower startup time) + # uncomment to disable hardware camera/mic/speaker access + present fake devices to websites + # (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings) + # '--use-fake-device-for-media-stream', + # '--use-fake-ui-for-media-stream', + # '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider', + # Output format options (PDF, screenshot, etc.) + '--export-tagged-pdf', # include table on contents and tags in printed PDFs + '--generate-pdf-document-outline', + # Suppress first-run features, popups, hints, updates, etc. + # chrome://system + '--no-pings', + '--no-first-run', + '--no-default-browser-check', + '--no-startup-window', + '--disable-default-apps', + '--ash-no-nudges', + '--disable-infobars', + '--disable-search-engine-choice-screen', + '--disable-session-crashed-bubble', + '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"', # disable browser self-update while automation is active + '--hide-crash-restore-bubble', + '--suppress-message-center-popups', + '--disable-client-side-phishing-detection', + '--disable-domain-reliability', + '--disable-component-update', + '--disable-datasaver-prompt', + '--disable-hang-monitor', + '--disable-session-crashed-bubble', + '--disable-speech-synthesis-api', + '--disable-speech-api', + '--disable-print-preview', + '--safebrowsing-disable-auto-update', + '--deny-permission-prompts', + '--disable-external-intent-requests', + '--disable-notifications', + '--disable-desktop-notifications', + '--noerrdialogs', + '--disable-popup-blocking', + '--disable-prompt-on-repost', + '--silent-debugger-extension-api', + '--block-new-web-contents', + '--metrics-recording-only', + '--disable-breakpad', + # other feature flags + # chrome://flags chrome://components + f'--disable-features={",".join(CHROME_DISABLED_COMPONENTS)}', + '--enable-features=NetworkService', +] diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py index d005be4ea6..16522d9c99 100644 --- a/browser_use/browser/context.py +++ b/browser_use/browser/context.py @@ -11,8 +11,8 @@ import re import time import uuid -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Optional, TypedDict +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional from playwright._impl._errors import TimeoutError from playwright.async_api import Browser as PlaywrightBrowser @@ -24,6 +24,7 @@ FrameLocator, Page, ) +from pydantic import BaseModel, ConfigDict, 
Field from browser_use.browser.views import ( BrowserError, @@ -41,13 +42,27 @@ logger = logging.getLogger(__name__) -class BrowserContextWindowSize(TypedDict): +class BrowserContextWindowSize(BaseModel): + """Window size configuration for browser context""" + width: int height: int + model_config = ConfigDict( + extra='allow', # Allow extra fields to ensure compatibility with dictionary + populate_by_name=True, + from_attributes=True, + ) + + # Support dict-like behavior for compatibility + def __getitem__(self, key): + return getattr(self, key) -@dataclass -class BrowserContextConfig: + def get(self, key, default=None): + return getattr(self, key, default) + + +class BrowserContextConfig(BaseModel): """ Configuration for the BrowserContext. @@ -55,15 +70,15 @@ class BrowserContextConfig: cookies_file: None Path to cookies file for persistence - disable_security: True - Disable browser security features + disable_security: False + Disable browser security features (dangerous, but cross-origin iframe support requires it) minimum_wait_page_load_time: 0.5 Minimum time to wait before getting page state for LLM input - wait_for_network_idle_page_load_time: 1.0 - Time to wait for network requests to finish before getting page state. - Lower values may result in incomplete page loads. + wait_for_network_idle_page_load_time: 1.0 + Time to wait for network requests to finish before getting page state. + Lower values may result in incomplete page loads. maximum_wait_page_load_time: 5.0 Maximum time to wait for page load before proceeding anyway @@ -71,10 +86,7 @@ class BrowserContextConfig: wait_between_actions: 1.0 Time to wait between multiple per step actions - browser_window_size: { - 'width': 1280, - 'height': 1100, - } + browser_window_size: {'width': 1280, 'height': 1100} Default browser window size no_viewport: False @@ -98,7 +110,7 @@ class BrowserContextConfig: highlight_elements: True Highlight elements in the DOM on the screen - viewport_expansion: 500 + viewport_expansion: 0 Viewport expansion in pixels. This amount will increase the number of elements which are included in the state what the LLM will see. If set to -1, all elements will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. allowed_domains: None @@ -107,21 +119,52 @@ class BrowserContextConfig: include_dynamic_attributes: bool = True Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False. + + http_credentials: None + Dictionary with HTTP basic authentication credentials for corporate intranets (only supports one set of credentials for all URLs at the moment), e.g. + {"username": "bill", "password": "pa55w0rd"} + + is_mobile: None + Whether the meta viewport tag is taken into account and touch events are enabled. + + has_touch: None + Whether to enable touch events in the browser. + + geolocation: None + Geolocation to be used in the browser context. Example: {'latitude': 59.95, 'longitude': 30.31667} + + permissions: None + Browser permissions to grant. Values might include: ['geolocation', 'notifications'] + + timezone_id: None + Changes the timezone of the browser. 
Example: 'Europe/Berlin' """ + model_config = ConfigDict( + arbitrary_types_allowed=True, + extra='ignore', + populate_by_name=True, + from_attributes=True, + validate_assignment=True, + revalidate_instances='subclass-instances', + ) + cookies_file: str | None = None minimum_wait_page_load_time: float = 0.25 wait_for_network_idle_page_load_time: float = 0.5 maximum_wait_page_load_time: float = 5 wait_between_actions: float = 0.5 - disable_security: bool = True + disable_security: bool = False # disable_security=True is dangerous as any malicious URL visited could embed an iframe for the user's bank, and use their cookies to steal money - browser_window_size: BrowserContextWindowSize = field(default_factory=lambda: {'width': 1280, 'height': 1100}) + browser_window_size: BrowserContextWindowSize = Field( + default_factory=lambda: BrowserContextWindowSize(width=1280, height=1100) + ) no_viewport: Optional[bool] = None save_recording_path: str | None = None save_downloads_path: str | None = None + save_har_path: str | None = None trace_path: str | None = None locale: str | None = None user_agent: str = ( @@ -129,17 +172,69 @@ class BrowserContextConfig: ) highlight_elements: bool = True - viewport_expansion: int = 500 + viewport_expansion: int = 0 allowed_domains: list[str] | None = None include_dynamic_attributes: bool = True + http_credentials: dict[str, str] | None = None - _force_keep_context_alive: bool = False + keep_alive: bool = Field(default=False, alias='_force_keep_context_alive') # used to be called _force_keep_context_alive + is_mobile: bool | None = None + has_touch: bool | None = None + geolocation: dict | None = None + permissions: list[str] | None = None + timezone_id: str | None = None -@dataclass class BrowserSession: - context: PlaywrightBrowserContext - cached_state: BrowserState | None + def __init__(self, context: PlaywrightBrowserContext, cached_state: BrowserState | None = None): + init_script = """ + (() => { + if (!window.getEventListeners) { + window.getEventListeners = function (node) { + return node.__listeners || {}; + }; + + // Save the original addEventListener + const originalAddEventListener = Element.prototype.addEventListener; + + const eventProxy = { + addEventListener: function (type, listener, options = {}) { + // Initialize __listeners if not exists + const defaultOptions = { once: false, passive: false, capture: false }; + if(typeof options === 'boolean') { + options = { capture: options }; + } + options = { ...defaultOptions, ...options }; + if (!this.__listeners) { + this.__listeners = {}; + } + + // Initialize array for this event type if not exists + if (!this.__listeners[type]) { + this.__listeners[type] = []; + } + + + // Add the listener to __listeners + this.__listeners[type].push({ + listener: listener, + type: type, + ...options + }); + + // Call original addEventListener using the saved reference + return originalAddEventListener.call(this, type, listener, options); + } + }; + + Element.prototype.addEventListener = eventProxy.addEventListener; + } + })() + """ + self.active_tab = None + self.context = context + self.cached_state = cached_state + self.context.on('page', lambda page: page.add_init_script(init_script)) @dataclass @@ -155,19 +250,19 @@ class BrowserContext: def __init__( self, browser: 'Browser', - config: BrowserContextConfig = BrowserContextConfig(), + config: BrowserContextConfig | None = None, state: Optional[BrowserContextState] = None, ): self.context_id = str(uuid.uuid4()) - logger.debug(f'Initializing new browser context 
with id: {self.context_id}') - self.config = config + self.config = config or BrowserContextConfig(**(browser.config.model_dump() if browser.config else {})) self.browser = browser self.state = state or BrowserContextState() # Initialize these as None - they'll be set up when needed self.session: BrowserSession | None = None + self.active_tab: Page | None = None async def __aenter__(self): """Async context manager entry""" @@ -181,7 +276,6 @@ async def __aexit__(self, exc_type, exc_val, exc_tb): @time_execution_async('--close') async def close(self): """Close the browser instance""" - logger.debug('Closing browser context') try: if self.session is None: @@ -205,7 +299,8 @@ async def close(self): logger.debug(f'Failed to stop tracing: {e}') # This is crucial - it closes the CDP connection - if not self.config._force_keep_context_alive: + if not self.config.keep_alive: + logger.debug('Closing browser context') try: await self.session.context.close() except Exception as e: @@ -213,12 +308,13 @@ async def close(self): finally: # Dereference everything + self.active_tab = None self.session = None self._page_event_handler = None def __del__(self): """Cleanup when object is destroyed""" - if not self.config._force_keep_context_alive and self.session is not None: + if not self.config.keep_alive and self.session is not None: logger.debug('BrowserContext was not properly closed before destruction') try: # Use sync Playwright method for force cleanup @@ -233,7 +329,7 @@ def __del__(self): @time_execution_async('--initialize_session') async def _initialize_session(self): """Initialize the browser session""" - logger.debug('Initializing browser context') + logger.debug(f'🌎 Initializing new browser context with id: {self.context_id}') playwright_browser = await self.browser.get_playwright_browser() context = await self._create_context(playwright_browser) @@ -263,12 +359,18 @@ async def _initialize_session(self): # If no target ID or couldn't find it, use existing page or create new if not active_page: - if pages: + if ( + pages + and pages[0].url + and not pages[0].url.startswith('chrome://') # skip chrome internal pages e.g. 
settings, history, etc + and not pages[0].url.startswith('chrome-extension://') # skip hidden extension background pages + ): active_page = pages[0] - logger.debug('Using existing page') + logger.debug('🔍 Using existing page: %s', active_page.url) else: active_page = await context.new_page() - logger.debug('Created new page') + await active_page.goto('about:blank') + logger.debug('🆕 Created new page: %s', active_page.url) # Get target ID for the active page if self.browser.config.cdp_url: @@ -279,9 +381,12 @@ async def _initialize_session(self): break # Bring page to front + logger.debug('🫨 Bringing tab to front: %s', active_page) await active_page.bring_to_front() await active_page.wait_for_load_state('load') + self.active_tab = active_page + return self.session def _add_new_page_listener(self, context: PlaywrightBrowserContext): @@ -289,7 +394,11 @@ async def on_page(page: Page): if self.browser.config.cdp_url: await page.reload() # Reload the page to avoid timeout errors await page.wait_for_load_state() - logger.debug(f'New page opened: {page.url}') + logger.debug(f'📑 New page opened: {page.url}') + + if not page.url.startswith('chrome-extension://') and not page.url.startswith('chrome://'): + self.active_tab = page + if self.session is not None: self.state.target_id = None @@ -299,7 +408,11 @@ async def on_page(page: Page): async def get_session(self) -> BrowserSession: """Lazy initialization of the browser and related components""" if self.session is None: - return await self._initialize_session() + try: + return await self._initialize_session() + except Exception as e: + logger.error(f'❌ Failed to create new browser session: {e} (did the browser process quit?)') + raise e return self.session async def get_current_page(self) -> Page: @@ -311,21 +424,27 @@ async def _create_context(self, browser: PlaywrightBrowser): """Creates a new browser context with anti-detection measures and loads cookies if available.""" if self.browser.config.cdp_url and len(browser.contexts) > 0: context = browser.contexts[0] - elif self.browser.config.chrome_instance_path and len(browser.contexts) > 0: + elif self.browser.config.browser_binary_path and len(browser.contexts) > 0: # Connect to existing Chrome instance instead of creating new one context = browser.contexts[0] else: # Original code for creating new context context = await browser.new_context( - viewport=self.config.browser_window_size, - no_viewport=False, + no_viewport=True, user_agent=self.config.user_agent, java_script_enabled=True, bypass_csp=self.config.disable_security, ignore_https_errors=self.config.disable_security, record_video_dir=self.config.save_recording_path, - record_video_size=self.config.browser_window_size, + record_video_size=self.config.browser_window_size.model_dump(), + record_har_path=self.config.save_har_path, locale=self.config.locale, + http_credentials=self.config.http_credentials, + is_mobile=self.config.is_mobile, + has_touch=self.config.has_touch, + geolocation=self.config.geolocation, + permissions=self.config.permissions, + timezone_id=self.config.timezone_id, ) if self.config.trace_path: @@ -334,9 +453,22 @@ async def _create_context(self, browser: PlaywrightBrowser): # Load cookies if they exist if self.config.cookies_file and os.path.exists(self.config.cookies_file): with open(self.config.cookies_file, 'r') as f: - cookies = json.load(f) - logger.info(f'Loaded {len(cookies)} cookies from {self.config.cookies_file}') - await context.add_cookies(cookies) + try: + cookies = json.load(f) + + valid_same_site_values 
= ['Strict', 'Lax', 'None'] + for cookie in cookies: + if 'sameSite' in cookie: + if cookie['sameSite'] not in valid_same_site_values: + logger.warning( + f"Fixed invalid sameSite value '{cookie['sameSite']}' to 'None' for cookie {cookie.get('name')}" + ) + cookie['sameSite'] = 'None' + logger.info(f'🍪 Loaded {len(cookies)} cookies from {self.config.cookies_file}') + await context.add_cookies(cookies) + + except json.JSONDecodeError as e: + logger.error(f'Failed to parse cookies file: {str(e)}') # Expose anti-detection scripts await context.add_init_script( @@ -544,7 +676,7 @@ async def on_response(response): page.remove_listener('request', on_request) page.remove_listener('response', on_response) - logger.debug(f'Network stabilized for {self.config.wait_for_network_idle_page_load_time} seconds') + logger.debug(f'⚖️ Network stabilized for {self.config.wait_for_network_idle_page_load_time} seconds') async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None): """ @@ -565,7 +697,7 @@ async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = except URLNotAllowedError as e: raise e except Exception: - logger.warning('Page load failed, continuing...') + logger.warning('⚠️ Page load failed, continuing...') pass # Calculate remaining time to meet minimum WAIT_TIME @@ -589,6 +721,10 @@ def _is_url_allowed(self, url: str) -> bool: parsed_url = urlparse(url) domain = parsed_url.netloc.lower() + # Special case: Allow 'about:blank' explicitly + if url == 'about:blank': + return True + # Remove port number if present if ':' in domain: domain = domain.split(':')[0] @@ -599,17 +735,17 @@ def _is_url_allowed(self, url: str) -> bool: for allowed_domain in self.config.allowed_domains ) except Exception as e: - logger.error(f'Error checking URL allowlist: {str(e)}') + logger.error(f'⛔️ Error checking URL allowlist: {str(e)}') return False async def _check_and_handle_navigation(self, page: Page) -> None: """Check if current page URL is allowed and handle if not.""" if not self._is_url_allowed(page.url): - logger.warning(f'Navigation to non-allowed URL detected: {page.url}') + logger.warning(f'⛔️ Navigation to non-allowed URL detected: {page.url}') try: await self.go_back() except Exception as e: - logger.error(f'Failed to go back after detecting non-allowed URL: {str(e)}') + logger.error(f'⛔️ Failed to go back after detecting non-allowed URL: {str(e)}') raise URLNotAllowedError(f'Navigation to non-allowed URL: {page.url}') async def navigate_to(self, url: str): @@ -636,7 +772,7 @@ async def go_back(self): # await self._wait_for_page_and_frames_load(timeout_overwrite=1.0) except Exception as e: # Continue even if its not fully loaded, because we wait later for the page to load - logger.debug(f'During go_back: {e}') + logger.debug(f'⏮️ Error during go_back: {e}') async def go_forward(self): """Navigate forward in history""" @@ -645,17 +781,18 @@ async def go_forward(self): await page.go_forward(timeout=10, wait_until='domcontentloaded') except Exception as e: # Continue even if its not fully loaded, because we wait later for the page to load - logger.debug(f'During go_forward: {e}') + logger.debug(f'⏭️ Error during go_forward: {e}') async def close_current_tab(self): """Close the current tab""" session = await self.get_session() page = await self._get_current_page(session) await page.close() - + self.active_tab = None # Switch to the first available tab if any exist if session.context.pages: await self.switch_to_tab(0) + self.active_tab = 
session.context.pages[0] # otherwise the browser will be closed @@ -674,24 +811,24 @@ async def get_page_structure(self) -> str: debug_script = """(() => { function getPageStructure(element = document, depth = 0, maxDepth = 10) { if (depth >= maxDepth) return ''; - + const indent = ' '.repeat(depth); let structure = ''; - + // Skip certain elements that clutter the output const skipTags = new Set(['script', 'style', 'link', 'meta', 'noscript']); - + // Add current element info if it's not the document if (element !== document) { const tagName = element.tagName.toLowerCase(); - + // Skip uninteresting elements if (skipTags.has(tagName)) return ''; - + const id = element.id ? `#${element.id}` : ''; - const classes = element.className && typeof element.className === 'string' ? + const classes = element.className && typeof element.className === 'string' ? `.${element.className.split(' ').filter(c => c).join('.')}` : ''; - + // Get additional useful attributes const attrs = []; if (element.getAttribute('role')) attrs.push(`role="${element.getAttribute('role')}"`); @@ -702,10 +839,10 @@ async def get_page_structure(self) -> str: const src = element.getAttribute('src'); attrs.push(`src="${src.substring(0, 50)}${src.length > 50 ? '...' : ''}"`); } - + // Add element info structure += `${indent}${tagName}${id}${classes}${attrs.length ? ' [' + attrs.join(', ') + ']' : ''}\\n`; - + // Handle iframes specially if (tagName === 'iframe') { try { @@ -721,7 +858,7 @@ async def get_page_structure(self) -> str: } } } - + // Get all child elements const children = element.children || element.childNodes; for (const child of children) { @@ -729,10 +866,10 @@ async def get_page_structure(self) -> str: structure += getPageStructure(child, depth + 1, maxDepth); } } - + return structure; } - + return getPageStructure(); })()""" @@ -763,13 +900,13 @@ async def _update_state(self, focus_element: int = -1) -> BrowserState: # Test if page is still accessible await page.evaluate('1') except Exception as e: - logger.debug(f'Current page is no longer accessible: {str(e)}') + logger.debug(f'👋 Current page is no longer accessible: {str(e)}') # Get all available pages pages = session.context.pages if pages: self.state.target_id = None page = await self._get_current_page(session) - logger.debug(f'Switched to page: {await page.title()}') + logger.debug(f'🔄 Switched to page: {await page.title()}') else: raise BrowserError('Browser closed: no valid pages available') @@ -782,6 +919,28 @@ async def _update_state(self, focus_element: int = -1) -> BrowserState: highlight_elements=self.config.highlight_elements, ) + tabs_info = await self.get_tabs_info() + + # Get all cross-origin iframes within the page and open them in new tabs + # mark the titles of the new tabs so the LLM knows to check them for additional content + # unfortunately too buggy for now, too many sites use invisible cross-origin iframes for ads, tracking, youtube videos, social media, etc. 
+ # and it distracts the bot by opening a lot of new tabs + # iframe_urls = await dom_service.get_cross_origin_iframes() + # for url in iframe_urls: + # if url in [tab.url for tab in tabs_info]: + # continue # skip if the iframe if we already have it open in a tab + # new_page_id = tabs_info[-1].page_id + 1 + # logger.debug(f'Opening cross-origin iframe in new tab #{new_page_id}: {url}') + # await self.create_new_tab(url) + # tabs_info.append( + # TabInfo( + # page_id=new_page_id, + # url=url, + # title=f'iFrame opened as new tab, treat as if embedded inside page #{self.state.target_id}: {page.url}', + # parent_page_id=self.state.target_id, + # ) + # ) + screenshot_b64 = await self.take_screenshot() pixels_above, pixels_below = await self.get_scroll_info(page) @@ -790,7 +949,7 @@ async def _update_state(self, focus_element: int = -1) -> BrowserState: selector_map=content.selector_map, url=page.url, title=await page.title(), - tabs=await self.get_tabs_info(), + tabs=tabs_info, screenshot=screenshot_b64, pixels_above=pixels_above, pixels_below=pixels_below, @@ -798,7 +957,7 @@ async def _update_state(self, focus_element: int = -1) -> BrowserState: return self.current_state except Exception as e: - logger.error(f'Failed to update state: {str(e)}') + logger.error(f'❌ Failed to update state: {str(e)}') # Return last known good state if available if hasattr(self, 'current_state'): return self.current_state @@ -854,7 +1013,7 @@ async def remove_highlights(self): """ ) except Exception as e: - logger.debug(f'Failed to remove highlights (this is usually ok): {str(e)}') + logger.debug(f'⚠ Failed to remove highlights (this is usually ok): {str(e)}') # Don't raise the error since this is not critical functionality pass @@ -879,9 +1038,18 @@ def _convert_simple_xpath_to_css_selector(cls, xpath: str) -> str: if not part: continue + # Handle custom elements with colons by escaping them + if ':' in part and '[' not in part: + base_part = part.replace(':', r'\:') + css_parts.append(base_part) + continue + # Handle index notation [n] if '[' in part: base_part = part[: part.find('[')] + # Handle custom elements with colons in the base part + if ':' in base_part: + base_part = base_part.replace(':', r'\:') index_part = part[part.find('[') :] # Handle multiple indices @@ -1057,7 +1225,79 @@ async def get_locate_element(self, element: DOMElementNode) -> Optional[ElementH return element_handle return None except Exception as e: - logger.error(f'Failed to locate element: {str(e)}') + logger.error(f'❌ Failed to locate element: {str(e)}') + return None + + @time_execution_async('--get_locate_element_by_xpath') + async def get_locate_element_by_xpath(self, xpath: str) -> Optional[ElementHandle]: + """ + Locates an element on the page using the provided XPath. + """ + current_frame = await self.get_current_page() + + try: + # Use XPath to locate the element + element_handle = await current_frame.query_selector(f'xpath={xpath}') + if element_handle: + await element_handle.scroll_into_view_if_needed() + return element_handle + return None + except Exception as e: + logger.error(f'❌ Failed to locate element by XPath {xpath}: {str(e)}') + return None + + @time_execution_async('--get_locate_element_by_css_selector') + async def get_locate_element_by_css_selector(self, css_selector: str) -> Optional[ElementHandle]: + """ + Locates an element on the page using the provided CSS selector. 
+ """ + current_frame = await self.get_current_page() + + try: + # Use CSS selector to locate the element + element_handle = await current_frame.query_selector(css_selector) + if element_handle: + await element_handle.scroll_into_view_if_needed() + return element_handle + return None + except Exception as e: + logger.error(f'❌ Failed to locate element by CSS selector {css_selector}: {str(e)}') + return None + + @time_execution_async('--get_locate_element_by_text') + async def get_locate_element_by_text( + self, text: str, nth: Optional[int] = 0, element_type: Optional[str] = None + ) -> Optional[ElementHandle]: + """ + Locates an element on the page using the provided text. + If `nth` is provided, it returns the nth matching element (0-based). + If `element_type` is provided, filters by tag name (e.g., 'button', 'span'). + """ + current_frame = await self.get_current_page() + try: + # handle also specific element type or use any type. + selector = f'{element_type or "*"}:text("{text}")' + elements = await current_frame.query_selector_all(selector) + # considering only visible elements + elements = [el for el in elements if await el.is_visible()] + + if not elements: + logger.error(f"No visible element with text '{text}' found.") + return None + + if nth is not None: + if 0 <= nth < len(elements): + element_handle = elements[nth] + else: + logger.error(f"Visible element with text '{text}' not found at index {nth}.") + return None + else: + element_handle = elements[0] + + await element_handle.scroll_into_view_if_needed() + return element_handle + except Exception as e: + logger.error(f"❌ Failed to locate element by text '{text}': {str(e)}") return None @time_execution_async('--input_text_element_node') @@ -1084,23 +1324,23 @@ async def _input_text_element_node(self, element_node: DOMElementNode, text: str pass # Get element properties to determine input method - tag_handle = await element_handle.get_property("tagName") + tag_handle = await element_handle.get_property('tagName') tag_name = (await tag_handle.json_value()).lower() is_contenteditable = await element_handle.get_property('isContentEditable') - readonly_handle = await element_handle.get_property("readOnly") - disabled_handle = await element_handle.get_property("disabled") + readonly_handle = await element_handle.get_property('readOnly') + disabled_handle = await element_handle.get_property('disabled') readonly = await readonly_handle.json_value() if readonly_handle else False disabled = await disabled_handle.json_value() if disabled_handle else False if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled): - await element_handle.evaluate('el => el.textContent = ""') + await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}') await element_handle.type(text, delay=5) else: await element_handle.fill(text) except Exception as e: - logger.debug(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}') + logger.debug(f'❌ Failed to input text into element: {repr(element_node)}. Error: {str(e)}') raise BrowserError(f'Failed to input text into index {element_node.highlight_index}') @time_execution_async('--click_element_node') @@ -1134,7 +1374,7 @@ async def perform_click(click_func): unique_filename = await self._get_unique_filename(self.config.save_downloads_path, suggested_filename) download_path = os.path.join(self.config.save_downloads_path, unique_filename) await download.save_as(download_path) - logger.debug(f'Download triggered. 
Saved file to: {download_path}') + logger.debug(f'⬇️ Download triggered. Saved file to: {download_path}') return download_path except TimeoutError: # If no download is triggered, treat as normal click @@ -1171,7 +1411,13 @@ async def get_tabs_info(self) -> list[TabInfo]: tabs_info = [] for page_id, page in enumerate(session.context.pages): - tab_info = TabInfo(page_id=page_id, url=page.url, title=await page.title()) + try: + tab_info = TabInfo(page_id=page_id, url=page.url, title=await asyncio.wait_for(page.title(), timeout=1)) + except asyncio.TimeoutError: + # page.title() can hang forever on tabs that are crashed/disappeared/about:blank + # we dont want to try automating those tabs because they will hang the whole script + logger.debug('⚠ Failed to get tab info for tab #%s: %s (ignoring)', page_id, page.url) + tab_info = TabInfo(page_id=page_id, url='about:blank', title='ignore this tab and do not use it') tabs_info.append(tab_info) return tabs_info @@ -1199,6 +1445,7 @@ async def switch_to_tab(self, page_id: int) -> None: self.state.target_id = target['targetId'] break + self.active_tab = page await page.bring_to_front() await page.wait_for_load_state() @@ -1210,6 +1457,9 @@ async def create_new_tab(self, url: str | None = None) -> None: session = await self.get_session() new_page = await session.context.new_page() + + self.active_tab = new_page + await new_page.wait_for_load_state() if url: @@ -1239,8 +1489,27 @@ async def _get_current_page(self, session: BrowserSession) -> Page: if page.url == target['url']: return page - # Fallback to last page - return pages[-1] if pages else await session.context.new_page() + if self.active_tab and self.active_tab in session.context.pages and not self.active_tab.is_closed(): + return self.active_tab + + # fall back to most recently opened non-extension page (extensions are almost always invisible background targets) + non_extension_pages = [ + page for page in pages if not page.url.startswith('chrome-extension://') and not page.url.startswith('chrome://') + ] + if non_extension_pages: + return non_extension_pages[-1] + + # Fallback to opening a new tab in the active window + try: + return await session.context.new_page() + except Exception: + # there is no browser window available (perhaps the user closed it?) 
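As an aside, the timeout guard that get_tabs_info() wraps around page.title() above is a reusable pattern; a minimal standalone sketch, assuming a Playwright async page object (the helper name and placeholder title are illustrative):

import asyncio

async def safe_title(page, timeout_s: float = 1.0) -> str:
    # page.title() can hang on crashed or detached tabs, so give up after a short timeout
    try:
        return await asyncio.wait_for(page.title(), timeout=timeout_s)
    except asyncio.TimeoutError:
        # mirror the placeholder used above so the agent skips the unresponsive tab
        return 'ignore this tab and do not use it'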
+ # reopen a new window in the browser and try again + logger.warning('⚠️ No browser window available, opening a new window') + await self._initialize_session() + page = await session.context.new_page() + self.active_tab = page + return page async def get_selector_map(self) -> SelectorMap: session = await self.get_session() @@ -1262,7 +1531,7 @@ async def save_cookies(self): if self.session and self.session.context and self.config.cookies_file: try: cookies = await self.session.context.cookies() - logger.debug(f'Saving {len(cookies)} cookies to {self.config.cookies_file}') + logger.debug(f'🍪 Saving {len(cookies)} cookies to {self.config.cookies_file}') # Check if the path is a directory and create it if necessary dirname = os.path.dirname(self.config.cookies_file) @@ -1272,7 +1541,7 @@ async def save_cookies(self): with open(self.config.cookies_file, 'w') as f: json.dump(cookies, f) except Exception as e: - logger.warning(f'Failed to save cookies: {str(e)}') + logger.warning(f'❌ Failed to save cookies: {str(e)}') async def is_file_uploader(self, element_node: DOMElementNode, max_depth: int = 3, current_depth: int = 0) -> bool: """Check if element or its children are file uploaders""" @@ -1321,6 +1590,7 @@ async def reset_context(self): for page in pages: await page.close() + self.active_tab = None session.cached_state = None self.state.target_id = None @@ -1351,3 +1621,17 @@ async def _get_cdp_targets(self) -> list[dict]: except Exception as e: logger.debug(f'Failed to get CDP targets: {e}') return [] + + async def wait_for_element(self, selector: str, timeout: float) -> None: + """ + Waits for an element matching the given CSS selector to become visible. + + Args: + selector (str): The CSS selector of the element. + timeout (float): The maximum time to wait for the element to be visible (in milliseconds). + + Raises: + TimeoutError: If the element does not become visible within the specified timeout. + """ + page = await self.get_current_page() + await page.wait_for_selector(selector, state='visible', timeout=timeout) diff --git a/browser_use/browser/dolphin_service.py b/browser_use/browser/dolphin_service.py new file mode 100644 index 0000000000..0e033a46d2 --- /dev/null +++ b/browser_use/browser/dolphin_service.py @@ -0,0 +1,349 @@ +import logging +import os +from typing import List, Optional + +import aiohttp +from playwright.async_api import Page, async_playwright + +from browser_use.browser.service import Browser +from browser_use.browser.views import BrowserState, TabInfo + +logger = logging.getLogger(__name__) + + +class DolphinBrowser(Browser): + """A class for managing Dolphin Anty browser sessions using Playwright""" + + def __init__(self, headless: bool = False, keep_open: bool = False): + """ + Initialize the DolphinBrowser instance. + + Args: + headless (bool): Run browser in headless mode (default: False). + keep_open (bool): Keep browser open after finishing tasks (default: False). 
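The new BrowserContext.wait_for_element() helper added above is a thin wrapper over Playwright's wait_for_selector with state='visible'; a hedged usage sketch (the context object and selector are hypothetical, timeout is in milliseconds):

async def wait_for_results(context) -> None:
    # raises a TimeoutError if the element is not visible within 5 seconds
    await context.wait_for_element('div.search-results', timeout=5000)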
+ """ + # Retrieve environment variables for API connection + self.api_token = os.getenv('DOLPHIN_API_TOKEN') + self.api_url = os.getenv('DOLPHIN_API_URL', 'http://localhost:3001/v1.0') + self.profile_id = os.getenv('DOLPHIN_PROFILE_ID') + + # Initialize internal attributes + self.playwright = None + self.browser = None + self.context = None + self.page = None + self.headless = headless + self.keep_open = keep_open + self._pages: List[Page] = [] # List to store open pages + self.session = None + self.cached_state = None + + async def get_current_page(self) -> Page: + """ + Get the currently active page. + + Raises: + Exception: If no active page is available. + """ + if not self.page: + raise Exception('No active page. Browser might not be connected.') + return self.page + + async def create_new_tab(self, url: str | None = None) -> None: + """ + Create a new tab and optionally navigate to a given URL. + + Args: + url (str, optional): URL to navigate to after creating the tab. Defaults to None. + + Raises: + Exception: If browser context is not initialized or navigation fails. + """ + if not self.context: + raise Exception('Browser context not initialized') + + # Create new page (tab) in the current browser context + new_page = await self.context.new_page() + self._pages.append(new_page) + self.page = new_page # Set as current page + + if url: + try: + # Navigate to the URL and wait for the page to load + await new_page.goto(url, wait_until='networkidle') + await self.wait_for_page_load() + except Exception as e: + logger.error(f'Failed to navigate to URL {url}: {str(e)}') + raise + + async def switch_to_tab(self, page_id: int) -> None: + """ + Switch to a specific tab by its page ID. + + Args: + page_id (int): The index of the tab to switch to. + + Raises: + Exception: If the tab index is out of range or no tabs are available. + """ + if not self._pages: + raise Exception('No tabs available') + + # Handle negative indices (e.g., -1 for last tab) + if page_id < 0: + page_id = len(self._pages) + page_id + + if page_id >= len(self._pages) or page_id < 0: + raise Exception(f'Tab index {page_id} out of range') + + # Set the current page to the selected tab + self.page = self._pages[page_id] + await self.page.bring_to_front() # Bring tab to the front + await self.wait_for_page_load() + + async def get_tabs_info(self) -> list[TabInfo]: + """ + Get information about all open tabs. + + Returns: + list: A list of TabInfo objects containing details about each tab. + """ + tabs_info = [] + for idx, page in enumerate(self._pages): + tab_info = TabInfo( + page_id=idx, + url=page.url, + title=await page.title(), # Fetch the title of the page + ) + tabs_info.append(tab_info) + return tabs_info + + async def wait_for_page_load(self, timeout: int = 30000): + """ + Wait for the page to load completely. + + Args: + timeout (int): Maximum time to wait for page load in milliseconds (default: 30000ms). + + Raises: + Exception: If the page fails to load within the specified timeout. + """ + if self.page: + try: + await self.page.wait_for_load_state('networkidle', timeout=timeout) + except Exception as e: + logger.warning(f'Wait for page load timeout: {str(e)}') + + async def get_session(self): + """ + Get the current session. + + Returns: + DolphinBrowser: The current DolphinBrowser instance. + + Raises: + Exception: If the browser is not connected. + """ + if not self.browser: + raise Exception('Browser not connected. 
Call connect() first.') + self.session = self + return self + + async def authenticate(self): + """ + Authenticate with Dolphin Anty API using the API token. + + Raises: + Exception: If authentication fails. + """ + async with aiohttp.ClientSession() as session: + auth_url = f'{self.api_url}/auth/login-with-token' + auth_data = {'token': self.api_token} + async with session.post(auth_url, json=auth_data) as response: + if not response.ok: + raise Exception(f'Failed to authenticate with Dolphin Anty: {await response.text()}') + return await response.json() + + async def get_browser_profiles(self): + """ + Get a list of available browser profiles from Dolphin Anty. + + Returns: + list: A list of browser profiles. + + Raises: + Exception: If fetching the browser profiles fails. + """ + # Authenticate before fetching profiles + await self.authenticate() + + async with aiohttp.ClientSession() as session: + headers = {'Authorization': f'Bearer {self.api_token}'} + async with session.get(f'{self.api_url}/browser_profiles', headers=headers) as response: + if not response.ok: + raise Exception(f'Failed to get browser profiles: {await response.text()}') + data = await response.json() + return data.get('data', []) # Return the profiles array from the response + + async def start_profile(self, profile_id: Optional[str] = None, headless: bool = False) -> dict: + """ + Start a browser profile on Dolphin Anty. + + Args: + profile_id (str, optional): Profile ID to start (defaults to the one set in the environment). + headless (bool): Run browser in headless mode (default: False). + + Returns: + dict: Information about the started profile. + + Raises: + ValueError: If no profile ID is provided and no default is set. + Exception: If starting the profile fails. + """ + # Authenticate before starting the profile + await self.authenticate() + + profile_id = profile_id or self.profile_id + if not profile_id: + raise ValueError('No profile ID provided') + + url = f'{self.api_url}/browser_profiles/{profile_id}/start' + params = {'automation': 1} + if headless: + params['headless'] = 1 + + async with aiohttp.ClientSession() as session: + async with session.get(url, params=params) as response: + if not response.ok: + raise Exception(f'Failed to start profile: {await response.text()}') + return await response.json() + + async def stop_profile(self, profile_id: Optional[str] = None): + """ + Stop a browser profile on Dolphin Anty. + + Args: + profile_id (str, optional): Profile ID to stop (defaults to the one set in the environment). + + Returns: + dict: Information about the stopped profile. + + Raises: + ValueError: If no profile ID is provided and no default is set. + """ + # Authenticate before stopping the profile + await self.authenticate() + + profile_id = profile_id or self.profile_id + if not profile_id: + raise ValueError('No profile ID provided') + + url = f'{self.api_url}/browser_profiles/{profile_id}/stop' + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + return await response.json() + + async def connect(self, profile_id: Optional[str] = None): + """ + Connect to a running browser profile using Playwright. + + Args: + profile_id (str, optional): Profile ID to connect to (defaults to the one set in the environment). + + Returns: + PlaywrightBrowser: The connected browser instance. + + Raises: + Exception: If authentication or profile connection fails. 
+ """ + # Authenticate before connecting to the profile + await self.authenticate() + + # Start the browser profile + profile_data = await self.start_profile(profile_id) + + if not profile_data.get('success'): + raise Exception(f'Failed to start profile: {profile_data}') + + automation = profile_data['automation'] + port = automation['port'] + ws_endpoint = automation['wsEndpoint'] + ws_url = f'ws://127.0.0.1:{port}{ws_endpoint}' + + # Use Playwright to connect to the browser's WebSocket endpoint + self.playwright = await async_playwright().start() + self.browser = await self.playwright.chromium.connect_over_cdp(ws_url) + + # Get or create a browser context and page + contexts = self.browser.contexts + self.context = contexts[0] if contexts else await self.browser.new_context() + pages = self.context.pages + self.page = pages[0] if pages else await self.context.new_page() + + self._pages = [self.page] # Initialize pages list with the first page + + return self.browser + + async def close(self, force: bool = False): + """ + Close the browser connection and clean up resources. + + Args: + force (bool): If True, forcefully stop the associated profile (default: False). + """ + try: + # Close all open pages + if self._pages: + for page in self._pages: + try: + await page.close() + except BaseException: + pass + self._pages = [] + + # Close the browser and Playwright instance + if self.browser: + await self.browser.close() + + if self.playwright: + await self.playwright.stop() + + if force: + await self.stop_profile() # Force stop the profile + except Exception as e: + logger.error(f'Error during browser cleanup: {str(e)}') + + async def get_current_state(self) -> BrowserState: + """ + Get the current state of the browser (URL, content, viewport size, tabs). + + Returns: + BrowserState: The current state of the browser. + + Raises: + Exception: If no active page is available. 
+ """ + if not self.page: + raise Exception('No active page') + + # Get page content and viewport size + content = await self.page.content() + viewport_size = await self.page.viewport_size() + + # Create and return the current browser state + state = BrowserState( + url=self.page.url, + content=content, + viewport_height=viewport_size['height'] if viewport_size else 0, + viewport_width=viewport_size['width'] if viewport_size else 0, + tabs=await self.get_tabs_info(), + ) + + # Cache and return the state + self.cached_state = state + return state + + def __del__(self): + """Clean up resources when the DolphinBrowser instance is deleted.""" + # No need to handle session cleanup as we're using self as session + pass diff --git a/browser_use/browser/tests/screenshot_test.py b/browser_use/browser/tests/screenshot_test.py index 7255ccb615..b55fdf8792 100644 --- a/browser_use/browser/tests/screenshot_test.py +++ b/browser_use/browser/tests/screenshot_test.py @@ -1,3 +1,4 @@ +import asyncio import base64 import pytest @@ -5,33 +6,31 @@ from browser_use.browser.browser import Browser, BrowserConfig -@pytest.fixture -async def browser(): - browser_service = Browser(config=BrowserConfig(headless=True)) - yield browser_service - - await browser_service.close() - - -# @pytest.mark.skip(reason='takes too long') -def test_take_full_page_screenshot(browser): - # Go to a test page - browser.go_to_url('https://example.com') - - # Take full page screenshot - screenshot_b64 = browser.take_screenshot(full_page=True) - - # Verify screenshot is not empty and is valid base64 - assert screenshot_b64 is not None - assert isinstance(screenshot_b64, str) - assert len(screenshot_b64) > 0 - - # Test we can decode the base64 string +async def test_take_full_page_screenshot(): + browser = Browser(config=BrowserConfig(headless=False, disable_security=True)) try: - base64.b64decode(screenshot_b64) - except Exception as e: - pytest.fail(f'Failed to decode base64 screenshot: {str(e)}') + async with await browser.new_context() as context: + page = await context.get_current_page() + # Go to a test page + await page.goto('https://example.com') + + await asyncio.sleep(3) + # Take full page screenshot + screenshot_b64 = await context.take_screenshot(full_page=True) + await asyncio.sleep(3) + # Verify screenshot is not empty and is valid base64 + assert screenshot_b64 is not None + assert isinstance(screenshot_b64, str) + assert len(screenshot_b64) > 0 + + # Test we can decode the base64 string + try: + base64.b64decode(screenshot_b64) + except Exception as e: + pytest.fail(f'Failed to decode base64 screenshot: {str(e)}') + finally: + await browser.close() if __name__ == '__main__': - test_take_full_page_screenshot(Browser(config=BrowserConfig(headless=False))) + asyncio.run(test_take_full_page_screenshot()) diff --git a/browser_use/browser/utils/screen_resolution.py b/browser_use/browser/utils/screen_resolution.py new file mode 100644 index 0000000000..260797084f --- /dev/null +++ b/browser_use/browser/utils/screen_resolution.py @@ -0,0 +1,41 @@ +import sys + + +def get_screen_resolution(): + if sys.platform == 'darwin': # macOS + try: + from AppKit import NSScreen + + screen = NSScreen.mainScreen().frame() + return {'width': int(screen.size.width), 'height': int(screen.size.height)} + except ImportError: + print('AppKit is not available. 
Make sure you are running this on macOS with pyobjc installed.') + except Exception as e: + print(f'Error retrieving macOS screen resolution: {e}') + return {'width': 2560, 'height': 1664} + + else: # Windows & Linux + try: + from screeninfo import get_monitors + + monitors = get_monitors() + if not monitors: + raise Exception('No monitors detected.') + monitor = monitors[0] + return {'width': monitor.width, 'height': monitor.height} + except ImportError: + print("screeninfo package not found. Install it using 'pip install screeninfo'.") + except Exception as e: + print(f'Error retrieving screen resolution: {e}') + + return {'width': 1920, 'height': 1080} + + +def get_window_adjustments(): + """Returns recommended x, y offsets for window positioning""" + if sys.platform == 'darwin': # macOS + return -4, 24 # macOS has a small title bar, no border + elif sys.platform == 'win32': # Windows + return -8, 0 # Windows has a border on the left + else: # Linux + return 0, 0 diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py index 3434d86e26..73304bc424 100644 --- a/browser_use/browser/views.py +++ b/browser_use/browser/views.py @@ -14,6 +14,17 @@ class TabInfo(BaseModel): page_id: int url: str title: str + parent_page_id: Optional[int] = None # parent page that contains this popup or cross-origin iframe + + +class GroupTabsAction(BaseModel): + tab_ids: list[int] + title: str + color: Optional[str] = 'blue' + + +class UngroupTabsAction(BaseModel): + tab_ids: list[int] @dataclass diff --git a/browser_use/controller/registry/service.py b/browser_use/controller/registry/service.py index be52a4b680..e849d9f6c7 100644 --- a/browser_use/controller/registry/service.py +++ b/browser_use/controller/registry/service.py @@ -49,6 +49,8 @@ def action( self, description: str, param_model: Optional[Type[BaseModel]] = None, + domains: Optional[list[str]] = None, + page_filter: Optional[Callable[[Any], bool]] = None, ): """Decorator for registering actions""" @@ -79,6 +81,8 @@ async def async_wrapper(*args, **kwargs): description=description, function=wrapped_func, param_model=actual_param_model, + domains=domains, + page_filter=page_filter, ) self.registry.actions[func.__name__] = action return func @@ -171,29 +175,55 @@ def replace_secrets(value): return params @time_execution_sync('--create_action_model') - def create_action_model(self, include_actions: Optional[list[str]] = None) -> Type[ActionModel]: - """Creates a Pydantic model from registered actions""" + def create_action_model(self, include_actions: Optional[list[str]] = None, page=None) -> Type[ActionModel]: + """Creates a Pydantic model from registered actions, used by LLM APIs that support tool calling & enforce a schema""" + + # Filter actions based on page if provided: + # if page is None, only include actions with no filters + # if page is provided, only include actions that match the page + + available_actions = {} + for name, action in self.registry.actions.items(): + if include_actions is not None and name not in include_actions: + continue + + # If no page provided, only include actions with no filters + if page is None: + if action.page_filter is None and action.domains is None: + available_actions[name] = action + continue + + # Check page_filter if present + domain_is_allowed = self.registry._match_domains(action.domains, page.url) + page_is_allowed = self.registry._match_page_filter(action.page_filter, page) + + # Include action if both filters match (or if either is not present) + if domain_is_allowed and 
page_is_allowed: + available_actions[name] = action + fields = { name: ( Optional[action.param_model], Field(default=None, description=action.description), ) - for name, action in self.registry.actions.items() - if include_actions is None or name in include_actions + for name, action in available_actions.items() } self.telemetry.capture( ControllerRegisteredFunctionsTelemetryEvent( registered_functions=[ RegisteredFunction(name=name, params=action.param_model.model_json_schema()) - for name, action in self.registry.actions.items() - if include_actions is None or name in include_actions + for name, action in available_actions.items() ] ) ) return create_model('ActionModel', __base__=ActionModel, **fields) # type:ignore - def get_prompt_description(self) -> str: - """Get a description of all actions for the prompt""" - return self.registry.get_prompt_description() + def get_prompt_description(self, page=None) -> str: + """Get a description of all actions for the prompt + + If page is provided, only include actions that are available for that page + based on their filter_func + """ + return self.registry.get_prompt_description(page=page) diff --git a/browser_use/controller/registry/views.py b/browser_use/controller/registry/views.py index 211c767a31..47956a048c 100644 --- a/browser_use/controller/registry/views.py +++ b/browser_use/controller/registry/views.py @@ -1,5 +1,6 @@ from typing import Callable, Dict, Type +from playwright.async_api import Page from pydantic import BaseModel, ConfigDict @@ -11,6 +12,10 @@ class RegisteredAction(BaseModel): function: Callable param_model: Type[BaseModel] + # filters: provide specific domains or a function to determine whether the action should be available on the given page or not + domains: list[str] | None = None # e.g. ['*.google.com', 'www.bing.com', 'yahoo.*] + page_filter: Callable[[Page], bool] | None = None + model_config = ConfigDict(arbitrary_types_allowed=True) def prompt_description(self) -> str: @@ -21,7 +26,7 @@ def prompt_description(self) -> str: s += str( { k: {sub_k: sub_v for sub_k, sub_v in v.items() if sub_k not in skip_keys} - for k, v in self.param_model.schema()['properties'].items() + for k, v in self.param_model.model_json_schema()['properties'].items() } ) s += '}' @@ -65,6 +70,80 @@ class ActionRegistry(BaseModel): actions: Dict[str, RegisteredAction] = {} - def get_prompt_description(self) -> str: - """Get a description of all actions for the prompt""" - return '\n'.join([action.prompt_description() for action in self.actions.values()]) + @staticmethod + def _match_domains(domains: list[str] | None, url: str) -> bool: + """ + Match a list of domain glob patterns against a URL. 
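To illustrate the new registry filters, a hedged sketch of registering a page-scoped action through the domains and page_filter parameters added above (the controller instance, action name and handler are hypothetical):

from browser_use import Controller
from browser_use.agent.views import ActionResult

controller = Controller()

@controller.registry.action(
    'Rename the currently open Google Doc',
    domains=['docs.google.com'],  # glob patterns matched against the page domain
    page_filter=lambda page: '/document/' in page.url,  # extra predicate on the live page
)
async def rename_google_doc(new_title: str) -> ActionResult:
    return ActionResult(extracted_content=f'Would rename the document to {new_title}', include_in_memory=True)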
+ + Args: + domain_patterns: A list of domain patterns that can include glob patterns (* wildcard) + url: The URL to match against + + Returns: + True if the URL's domain matches the pattern, False otherwise + """ + + if domains is None or not url: + return True + + import fnmatch + from urllib.parse import urlparse + + # Parse the URL to get the domain + try: + parsed_url = urlparse(url) + if not parsed_url.netloc: + return False + + domain = parsed_url.netloc + # Remove port if present + if ':' in domain: + domain = domain.split(':')[0] + + for domain_pattern in domains: + if fnmatch.fnmatch(domain, domain_pattern): # Perform glob *.matching.* + return True + return False + except Exception: + return False + + @staticmethod + def _match_page_filter(page_filter: Callable[[Page], bool] | None, page: Page) -> bool: + """Match a page filter against a page""" + if page_filter is None: + return True + return page_filter(page) + + def get_prompt_description(self, page: Page | None = None) -> str: + """Get a description of all actions for the prompt + + Args: + page: If provided, filter actions by page using page_filter and domains. + + Returns: + A string description of available actions. + - If page is None: return only actions with no page_filter and no domains (for system prompt) + - If page is provided: return only filtered actions that match the current page (excluding unfiltered actions) + """ + if page is None: + # For system prompt (no page provided), include only actions with no filters + return '\n'.join( + action.prompt_description() + for action in self.actions.values() + if action.page_filter is None and action.domains is None + ) + + # only include filtered actions for the current page + filtered_actions = [] + for action in self.actions.values(): + if not (action.domains or action.page_filter): + # skip actions with no filters, they are already included in the system prompt + continue + + domain_is_allowed = self._match_domains(action.domains, page.url) + page_is_allowed = self._match_page_filter(action.page_filter, page) + + if domain_is_allowed and page_is_allowed: + filtered_actions.append(action) + + return '\n'.join(action.prompt_description() for action in filtered_actions) diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index cf468f2873..2ccabe496d 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -1,11 +1,14 @@ import asyncio -import json +import datetime import enum +import json import logging -from typing import Dict, Generic, Optional, Type, TypeVar +import re +from typing import Dict, Generic, Optional, Tuple, Type, TypeVar, cast from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import PromptTemplate +from playwright.async_api import ElementHandle, Page # from lmnr.sdk.laminar import Laminar from pydantic import BaseModel @@ -15,15 +18,22 @@ from browser_use.controller.registry.service import Registry from browser_use.controller.views import ( ClickElementAction, + ClickElementBySelectorAction, + ClickElementByTextAction, + ClickElementByXpathAction, + CloseTabAction, DoneAction, + DragDropAction, GoToUrlAction, InputTextAction, NoParamsAction, OpenTabAction, + Position, ScrollAction, SearchGoogleAction, SendKeysAction, SwitchTabAction, + WaitForElementAction, ) from browser_use.utils import time_execution_sync @@ -50,7 +60,7 @@ class ExtendedOutputModel(BaseModel): # type: ignore data: output_model @self.registry.action( - 'Complete task - with 
return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached', + 'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached', param_model=ExtendedOutputModel, ) async def done(params: ExtendedOutputModel): @@ -66,7 +76,7 @@ async def done(params: ExtendedOutputModel): else: @self.registry.action( - 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached', + 'Complete task - with return text and if the task is finished (success=True) or not yet completely finished (success=False), because last step is reached', param_model=DoneAction, ) async def done(params: DoneAction): @@ -109,9 +119,22 @@ async def wait(seconds: int = 3): await asyncio.sleep(seconds) return ActionResult(extracted_content=msg, include_in_memory=True) + @self.registry.action('Wait for element to be visible', param_model=WaitForElementAction) + async def wait_for_element(params: WaitForElementAction, browser: BrowserContext): + """Waits for the element specified by the CSS selector to become visible within the given timeout.""" + try: + await browser.wait_for_element(params.selector, params.timeout) + msg = f'👀 Element with selector "{params.selector}" became visible within {params.timeout}ms.' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + err_msg = f'❌ Failed to wait for element "{params.selector}" within {params.timeout}ms: {str(e)}' + logger.error(err_msg) + raise Exception(err_msg) + # Element Interaction Actions - @self.registry.action('Click element', param_model=ClickElementAction) - async def click_element(params: ClickElementAction, browser: BrowserContext): + @self.registry.action('Click element by index', param_model=ClickElementAction) + async def click_element_by_index(params: ClickElementAction, browser: BrowserContext): session = await browser.get_session() if params.index not in await browser.get_selector_map(): @@ -147,6 +170,74 @@ async def click_element(params: ClickElementAction, browser: BrowserContext): logger.warning(f'Element not clickable with index {params.index} - most likely the page changed') return ActionResult(error=str(e)) + @self.registry.action('Click element by selector', param_model=ClickElementBySelectorAction) + async def click_element_by_selector(params: ClickElementBySelectorAction, browser: BrowserContext): + try: + element_node = await browser.get_locate_element_by_css_selector(params.css_selector) + if element_node: + try: + await element_node.scroll_into_view_if_needed() + await element_node.click(timeout=1500, force=True) + except Exception: + try: + # Handle with js evaluate if fails to click using playwright + await element_node.evaluate('el => el.click()') + except Exception as e: + logger.warning(f"Element not clickable with css selector '{params.css_selector}' - {e}") + return ActionResult(error=str(e)) + msg = f'🖱️ Clicked on element with text "{params.css_selector}"' + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + logger.warning(f'Element not clickable with selector {params.css_selector} - most likely the page changed') + return ActionResult(error=str(e)) + + @self.registry.action('Click on element by xpath', param_model=ClickElementByXpathAction) + async def click_element_by_xpath(params: 
ClickElementByXpathAction, browser: BrowserContext): + try: + element_node = await browser.get_locate_element_by_xpath(params.xpath) + if element_node: + try: + await element_node.scroll_into_view_if_needed() + await element_node.click(timeout=1500, force=True) + except Exception: + try: + # Handle with js evaluate if fails to click using playwright + await element_node.evaluate('el => el.click()') + except Exception as e: + logger.warning(f"Element not clickable with xpath '{params.xpath}' - {e}") + return ActionResult(error=str(e)) + msg = f'🖱️ Clicked on element with xpath "{params.xpath}"' + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + logger.warning(f'Element not clickable with xpath {params.xpath} - most likely the page changed') + return ActionResult(error=str(e)) + + @self.registry.action('Click element with text', param_model=ClickElementByTextAction) + async def click_element_by_text(params: ClickElementByTextAction, browser: BrowserContext): + try: + element_node = await browser.get_locate_element_by_text( + text=params.text, nth=params.nth, element_type=params.element_type + ) + + if element_node: + try: + await element_node.scroll_into_view_if_needed() + await element_node.click(timeout=1500, force=True) + except Exception: + try: + # Handle with js evaluate if fails to click using playwright + await element_node.evaluate('el => el.click()') + except Exception as e: + logger.warning(f"Element not clickable with text '{params.text}' - {e}") + return ActionResult(error=str(e)) + msg = f'🖱️ Clicked on element with text "{params.text}"' + return ActionResult(extracted_content=msg, include_in_memory=True) + else: + return ActionResult(error=f"No element found for text '{params.text}'") + except Exception as e: + logger.warning(f"Element not clickable with text '{params.text}' - {e}") + return ActionResult(error=str(e)) + @self.registry.action( 'Input text into a input interactive element', param_model=InputTextAction, ) @@ -165,6 +256,22 @@ async def input_text(params: InputTextAction, browser: BrowserContext, has_sensi logger.debug(f'Element xpath: {element_node.xpath}') return ActionResult(extracted_content=msg, include_in_memory=True) + # Save PDF + @self.registry.action( + 'Save the current page as a PDF file', + ) + async def save_pdf(browser: BrowserContext): + page = await browser.get_current_page() + short_url = re.sub(r'^https?://(?:www\.)?|/$', '', page.url) + slug = re.sub(r'[^a-zA-Z0-9]+', '-', short_url).strip('-').lower() + sanitized_filename = f'{slug}.pdf' + + await page.emulate_media(media='screen') + await page.pdf(path=sanitized_filename, format='A4', print_background=False) + msg = f'Saving page with URL {page.url} as PDF to ./{sanitized_filename}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + # Tab Management Actions @self.registry.action('Switch tab', param_model=SwitchTabAction) async def switch_tab(params: SwitchTabAction, browser: BrowserContext): @@ -183,15 +290,37 @@ async def open_tab(params: OpenTabAction, browser: BrowserContext): logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) + @self.registry.action('Close an existing tab', param_model=CloseTabAction) + async def close_tab(params: CloseTabAction, browser: BrowserContext): + await browser.switch_to_tab(params.page_id) + page = await browser.get_current_page() + url = page.url + await page.close() + msg = f'❌ Closed tab #{params.page_id} with url {url}' + logger.info(msg) + return
ActionResult(extracted_content=msg, include_in_memory=True) + # Content Actions @self.registry.action( - 'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links', + 'Extract page content to retrieve specific information from the page, e.g. all company names, a specific description, all information about, links with companies in structured format or simply links', ) - async def extract_content(goal: str, browser: BrowserContext, page_extraction_llm: BaseChatModel): + async def extract_content( + goal: str, should_strip_link_urls: bool, browser: BrowserContext, page_extraction_llm: BaseChatModel + ): page = await browser.get_current_page() import markdownify - content = markdownify.markdownify(await page.content()) + strip = [] + if should_strip_link_urls: + strip = ['a', 'img'] + + content = markdownify.markdownify(await page.content(), strip=strip) + + # manually append iframe text into the content so it's readable by the LLM (includes cross-origin iframes) + for iframe in page.frames: + if iframe.url != page.url and not iframe.url.startswith('data:'): + content += f'\n\nIFRAME {iframe.url}:\n' + content += markdownify.markdownify(await iframe.content()) prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}' template = PromptTemplate(input_variables=['goal', 'page'], template=prompt) @@ -206,6 +335,36 @@ async def extract_content(goal: str, browser: BrowserContext, page_extraction_ll logger.info(msg) return ActionResult(extracted_content=msg) + # HTML Download + @self.registry.action( + 'Save the raw HTML content of the current page to a local file', + param_model=NoParamsAction, + ) + async def save_html_to_file(_: NoParamsAction, browser: BrowserContext) -> ActionResult: + """Saves the full HTML content of the current page to a local file""" + try: + page = await browser.get_current_page() + html_content = await page.content() + + # Create a filename based on the page URL + short_url = re.sub(r'^https?://(?:www\.)?|/$', '', page.url) + slug = re.sub(r'[^a-zA-Z0-9]+', '-', short_url).strip('-').lower()[:64] + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + sanitized_filename = f'{slug}_{timestamp}.html' + + # Save HTML to file + with open(sanitized_filename, 'w', encoding='utf-8') as f: + f.write(html_content) + + msg = f'Saved HTML content of page with URL {page.url} to ./{sanitized_filename}' + + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + error_msg = f'Failed to save HTML content: {str(e)}' + logger.error(error_msg) + return ActionResult(error=error_msg, extracted_content='') + @self.registry.action( 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page', param_model=ScrollAction, @@ -471,6 +630,220 @@ async def select_dropdown_option( logger.error(msg) return ActionResult(error=msg, include_in_memory=True) + @self.registry.action( + 'Drag and drop elements or between coordinates on the page - useful for canvas drawing, sortable lists, sliders, file uploads, and UI rearrangement', + param_model=DragDropAction, + ) + async def drag_drop(params: DragDropAction, browser: BrowserContext) -> ActionResult: + """
Performs a precise drag and drop operation between elements or coordinates. + """ + + async def get_drag_elements( + page: Page, + source_selector: str, + target_selector: str, + ) -> Tuple[Optional[ElementHandle], Optional[ElementHandle]]: + """Get source and target elements with appropriate error handling.""" + source_element = None + target_element = None + + try: + # page.locator() auto-detects CSS and XPath + source_locator = page.locator(source_selector) + target_locator = page.locator(target_selector) + + # Check if elements exist + source_count = await source_locator.count() + target_count = await target_locator.count() + + if source_count > 0: + source_element = await source_locator.first.element_handle() + logger.debug(f'Found source element with selector: {source_selector}') + else: + logger.warning(f'Source element not found: {source_selector}') + + if target_count > 0: + target_element = await target_locator.first.element_handle() + logger.debug(f'Found target element with selector: {target_selector}') + else: + logger.warning(f'Target element not found: {target_selector}') + + except Exception as e: + logger.error(f'Error finding elements: {str(e)}') + + return source_element, target_element + + async def get_element_coordinates( + source_element: ElementHandle, + target_element: ElementHandle, + source_position: Optional[Position], + target_position: Optional[Position], + ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]: + """Get coordinates from elements with appropriate error handling.""" + source_coords = None + target_coords = None + + try: + # Get source coordinates + if source_position: + source_coords = (source_position.x, source_position.y) + else: + source_box = await source_element.bounding_box() + if source_box: + source_coords = ( + int(source_box['x'] + source_box['width'] / 2), + int(source_box['y'] + source_box['height'] / 2), + ) + + # Get target coordinates + if target_position: + target_coords = (target_position.x, target_position.y) + else: + target_box = await target_element.bounding_box() + if target_box: + target_coords = ( + int(target_box['x'] + target_box['width'] / 2), + int(target_box['y'] + target_box['height'] / 2), + ) + except Exception as e: + logger.error(f'Error getting element coordinates: {str(e)}') + + return source_coords, target_coords + + async def execute_drag_operation( + page: Page, + source_x: int, + source_y: int, + target_x: int, + target_y: int, + steps: int, + delay_ms: int, + ) -> Tuple[bool, str]: + """Execute the drag operation with comprehensive error handling.""" + try: + # Try to move to source position + try: + await page.mouse.move(source_x, source_y) + logger.debug(f'Moved to source position ({source_x}, {source_y})') + except Exception as e: + logger.error(f'Failed to move to source position: {str(e)}') + return False, f'Failed to move to source position: {str(e)}' + + # Press mouse button down + await page.mouse.down() + + # Move to target position with intermediate steps + for i in range(1, steps + 1): + ratio = i / steps + intermediate_x = int(source_x + (target_x - source_x) * ratio) + intermediate_y = int(source_y + (target_y - source_y) * ratio) + + await page.mouse.move(intermediate_x, intermediate_y) + + if delay_ms > 0: + await asyncio.sleep(delay_ms / 1000) + + # Move to final target position + await page.mouse.move(target_x, target_y) + + # Move again to ensure dragover events are properly triggered + await page.mouse.move(target_x, target_y) + + # Release mouse button + await 
page.mouse.up() + + return True, 'Drag operation completed successfully' + + except Exception as e: + return False, f'Error during drag operation: {str(e)}' + + page = await browser.get_current_page() + + try: + # Initialize variables + source_x: Optional[int] = None + source_y: Optional[int] = None + target_x: Optional[int] = None + target_y: Optional[int] = None + + # Normalize parameters + steps = max(1, params.steps or 10) + delay_ms = max(0, params.delay_ms or 5) + + # Case 1: Element selectors provided + if params.element_source and params.element_target: + logger.debug('Using element-based approach with selectors') + + source_element, target_element = await get_drag_elements( + page, + params.element_source, + params.element_target, + ) + + if not source_element or not target_element: + error_msg = f'Failed to find {"source" if not source_element else "target"} element' + return ActionResult(error=error_msg, include_in_memory=True) + + source_coords, target_coords = await get_element_coordinates( + source_element, target_element, params.element_source_offset, params.element_target_offset + ) + + if not source_coords or not target_coords: + error_msg = f'Failed to determine {"source" if not source_coords else "target"} coordinates' + return ActionResult(error=error_msg, include_in_memory=True) + + source_x, source_y = source_coords + target_x, target_y = target_coords + + # Case 2: Coordinates provided directly + elif all( + coord is not None + for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y] + ): + logger.debug('Using coordinate-based approach') + source_x = params.coord_source_x + source_y = params.coord_source_y + target_x = params.coord_target_x + target_y = params.coord_target_y + else: + error_msg = 'Must provide either source/target selectors or source/target coordinates' + return ActionResult(error=error_msg, include_in_memory=True) + + # Validate coordinates + if any(coord is None for coord in [source_x, source_y, target_x, target_y]): + error_msg = 'Failed to determine source or target coordinates' + return ActionResult(error=error_msg, include_in_memory=True) + + # Perform the drag operation + success, message = await execute_drag_operation( + page, + cast(int, source_x), + cast(int, source_y), + cast(int, target_x), + cast(int, target_y), + steps, + delay_ms, + ) + + if not success: + logger.error(f'Drag operation failed: {message}') + return ActionResult(error=message, include_in_memory=True) + + # Create descriptive message + if params.element_source and params.element_target: + msg = f"🖱️ Dragged element '{params.element_source}' to '{params.element_target}'" + else: + msg = f'🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})' + + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + except Exception as e: + error_msg = f'Failed to perform drag and drop: {str(e)}' + logger.error(error_msg) + return ActionResult(error=error_msg, include_in_memory=True) + # Register --------------------------------------------------------------- def action(self, description: str, **kwargs): diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py index 82995c9e31..e1a4fa49cf 100644 --- a/browser_use/controller/views.py +++ b/browser_use/controller/views.py @@ -1,6 +1,6 @@ from typing import Optional -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator # Action Input Models @@ -12,11 +12,30 @@ 
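For reference, the two ways of invoking the new drag_drop action map directly onto the parameter fields used above; an illustrative sketch of both shapes (selectors and coordinates are made up):

# Element-based: the action resolves the selectors and drags center to center.
element_based = {
    'element_source': '#todo-card-3',
    'element_target': '#done-column',
    'steps': 15,     # more intermediate mouse moves for a smoother drag
    'delay_ms': 10,  # small pause between moves
}

# Coordinate-based: drag between absolute page coordinates, e.g. on a canvas.
coordinate_based = {
    'coord_source_x': 200,
    'coord_source_y': 300,
    'coord_target_x': 500,
    'coord_target_y': 300,
}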
class GoToUrlAction(BaseModel): url: str +class WaitForElementAction(BaseModel): + selector: str + timeout: Optional[int] = 10000 # Timeout in milliseconds + + class ClickElementAction(BaseModel): index: int xpath: Optional[str] = None +class ClickElementByXpathAction(BaseModel): + xpath: str + + +class ClickElementBySelectorAction(BaseModel): + css_selector: str + + +class ClickElementByTextAction(BaseModel): + text: str + element_type: Optional[str] + nth: int = 0 + + class InputTextAction(BaseModel): index: int text: str @@ -36,6 +55,10 @@ class OpenTabAction(BaseModel): url: str +class CloseTabAction(BaseModel): + page_id: int + + class ScrollAction(BaseModel): amount: Optional[int] = None # The number of pixels to scroll. If None, scroll down/up one page @@ -44,6 +67,19 @@ class SendKeysAction(BaseModel): keys: str +class GroupTabsAction(BaseModel): + tab_ids: list[int] = Field(..., description='List of tab IDs to group') + title: str = Field(..., description='Name for the tab group') + color: Optional[str] = Field( + 'blue', + description='Color for the group (grey/blue/red/yellow/green/pink/purple/cyan)', + ) + + +class UngroupTabsAction(BaseModel): + tab_ids: list[int] = Field(..., description='List of tab IDs to ungroup') + + class ExtractPageContentAction(BaseModel): value: str @@ -54,12 +90,36 @@ class NoParamsAction(BaseModel): and discards it, so the final parsed model is empty. """ + model_config = ConfigDict(extra='allow') + @model_validator(mode='before') def ignore_all_inputs(cls, values): # No matter what the user sends, discard it and return empty. return {} - class Config: - # If you want to silently allow unknown fields at top-level, - # set extra = 'allow' as well: - extra = 'allow' + +class Position(BaseModel): + x: int + y: int + + +class DragDropAction(BaseModel): + # Element-based approach + element_source: Optional[str] = Field(None, description='CSS selector or XPath of the element to drag from') + element_target: Optional[str] = Field(None, description='CSS selector or XPath of the element to drop onto') + element_source_offset: Optional[Position] = Field( + None, description='Precise position within the source element to start drag (in pixels from top-left corner)' + ) + element_target_offset: Optional[Position] = Field( + None, description='Precise position within the target element to drop (in pixels from top-left corner)' + ) + + # Coordinate-based approach (used if selectors not provided) + coord_source_x: Optional[int] = Field(None, description='Absolute X coordinate on page to start drag from (in pixels)') + coord_source_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to start drag from (in pixels)') + coord_target_x: Optional[int] = Field(None, description='Absolute X coordinate on page to drop at (in pixels)') + coord_target_y: Optional[int] = Field(None, description='Absolute Y coordinate on page to drop at (in pixels)') + + # Common options + steps: Optional[int] = Field(10, description='Number of intermediate points for smoother movement (5-20 recommended)') + delay_ms: Optional[int] = Field(5, description='Delay in milliseconds between steps (0 for fastest, 10-20 for more natural)') diff --git a/browser_use/dom/buildDomTree.js b/browser_use/dom/buildDomTree.js index 539c762259..d69b4601c1 100644 --- a/browser_use/dom/buildDomTree.js +++ b/browser_use/dom/buildDomTree.js @@ -391,7 +391,7 @@ rect.top > window.innerHeight + viewportExpansion || rect.right < -viewportExpansion || rect.left > window.innerWidth + 
viewportExpansion - ); + ) || viewportExpansion === -1; // Check parent visibility const parentElement = textNode.parentElement; @@ -462,42 +462,137 @@ return false; } - // Special handling for cookie banner elements - const isCookieBannerElement = - (typeof element.closest === 'function') && ( - element.closest('[id*="onetrust"]') || - element.closest('[class*="onetrust"]') || - element.closest('[data-nosnippet="true"]') || - element.closest('[aria-label*="cookie"]') - ); + // Define interactive cursors + const interactiveCursors = new Set([ + 'pointer', // Link/clickable elements + 'move', // Movable elements + 'text', // Text selection + 'grab', // Grabbable elements + 'grabbing', // Currently grabbing + 'cell', // Table cell selection + 'copy', // Copy operation + 'alias', // Alias creation + 'all-scroll', // Scrollable content + 'col-resize', // Column resize + 'context-menu', // Context menu available + 'crosshair', // Precise selection + 'e-resize', // East resize + 'ew-resize', // East-west resize + 'help', // Help available + 'n-resize', // North resize + 'ne-resize', // Northeast resize + 'nesw-resize', // Northeast-southwest resize + 'ns-resize', // North-south resize + 'nw-resize', // Northwest resize + 'nwse-resize', // Northwest-southeast resize + 'row-resize', // Row resize + 's-resize', // South resize + 'se-resize', // Southeast resize + 'sw-resize', // Southwest resize + 'vertical-text', // Vertical text selection + 'w-resize', // West resize + 'zoom-in', // Zoom in + 'zoom-out' // Zoom out + ]); - if (isCookieBannerElement) { - // Check if it's a button or interactive element within the banner - if ( - element.tagName.toLowerCase() === 'button' || - element.getAttribute('role') === 'button' || - element.onclick || - element.getAttribute('onclick') || - (element.classList && ( - element.classList.contains('ot-sdk-button') || - element.classList.contains('accept-button') || - element.classList.contains('reject-button') - )) || - element.getAttribute('aria-label')?.toLowerCase().includes('accept') || - element.getAttribute('aria-label')?.toLowerCase().includes('reject') - ) { - return true; - } + // Define non-interactive cursors + const nonInteractiveCursors = new Set([ + 'not-allowed', // Action not allowed + 'no-drop', // Drop not allowed + 'wait', // Processing + 'progress', // In progress + 'initial', // Initial value + 'inherit' // Inherited value + //? Let's just include all potentially clickable elements that are not specifically blocked + // 'none', // No cursor + // 'default', // Default cursor + // 'auto', // Browser default + ]); + + function doesElementHaveInteractivePointer(element) { + if (element.tagName.toLowerCase() === "html") return false; + const style = getCachedComputedStyle(element); + + if (interactiveCursors.has(style.cursor)) return true; + + return false; + } + + let isInteractiveCursor = doesElementHaveInteractivePointer(element); + + // Genius fix for almost all interactive elements + if (isInteractiveCursor) { + return true; } - // Base interactive elements and roles const interactiveElements = new Set([ - "a", "button", "details", "embed", "input", "menu", "menuitem", - "object", "select", "textarea", "canvas", "summary", "dialog", - "banner" + "a", // Links + "button", // Buttons + "input", // All input types (text, checkbox, radio, etc.) 
+ "select", // Dropdown menus + "textarea", // Text areas + "details", // Expandable details + "summary", // Summary element (clickable part of details) + "label", // Form labels (often clickable) + "option", // Select options + "optgroup", // Option groups + "fieldset", // Form fieldsets (can be interactive with legend) + "legend", // Fieldset legends ]); - const interactiveRoles = new Set(['button-icon', 'dialog', 'button-text-icon-only', 'treeitem', 'alert', 'grid', 'progressbar', 'radio', 'checkbox', 'menuitem', 'option', 'switch', 'dropdown', 'scrollbar', 'combobox', 'a-button-text', 'button', 'region', 'textbox', 'tabpanel', 'tab', 'click', 'button-text', 'spinbutton', 'a-button-inner', 'link', 'menu', 'slider', 'listbox', 'a-dropdown-button', 'button-icon-only', 'searchbox', 'menuitemradio', 'tooltip', 'tree', 'menuitemcheckbox']); + // Define explicit disable attributes and properties + const explicitDisableTags = new Set([ + 'disabled', // Standard disabled attribute + // 'aria-disabled', // ARIA disabled state + 'readonly', // Read-only state + // 'aria-readonly', // ARIA read-only state + // 'aria-hidden', // Hidden from accessibility + // 'hidden', // Hidden attribute + // 'inert', // Inert attribute + // 'aria-inert', // ARIA inert state + // 'tabindex="-1"', // Removed from tab order + // 'aria-hidden="true"' // Hidden from screen readers + ]); + + // handle inputs, select, checkbox, radio, textarea, button and make sure they are not cursor style disabled/not-allowed + if (interactiveElements.has(element.tagName.toLowerCase())) { + const style = getCachedComputedStyle(element); + + // Check for non-interactive cursor + if (nonInteractiveCursors.has(style.cursor)) { + return false; + } + + // Check for explicit disable attributes + for (const disableTag of explicitDisableTags) { + if (element.hasAttribute(disableTag) || + element.getAttribute(disableTag) === 'true' || + element.getAttribute(disableTag) === '') { + return false; + } + } + + // Check for disabled property on form elements + if (element.disabled) { + return false; + } + + // Check for readonly property on form elements + if (element.readOnly) { + return false; + } + + // Check for inert property + if (element.inert) { + return false; + } + + return true; + } + + // return false + + const tagName = element.tagName.toLowerCase(); const role = element.getAttribute("role"); @@ -505,75 +600,56 @@ const tabIndex = element.getAttribute("tabindex"); // Add check for specific class - const hasAddressInputClass = element.classList && ( - element.classList.contains("address-input__container__input") || - element.classList.contains("nav-btn") || - element.classList.contains("pull-left") - ); + // const hasAddressInputClass = element.classList && ( + // element.classList.contains("address-input__container__input") || + // element.classList.contains("nav-btn") || + // element.classList.contains("pull-left") + // ); // Added enhancement to capture dropdown interactive elements if (element.classList && ( + element.classList.contains("button") || element.classList.contains('dropdown-toggle') || + element.getAttribute('data-index') || element.getAttribute('data-toggle') === 'dropdown' || element.getAttribute('aria-haspopup') === 'true' )) { return true; } + // return false + + const interactiveRoles = new Set([ + 'button', // Directly clickable element + // 'link', // Clickable link + 'menuitem', // Clickable menu item + 'menuitemradio', // Radio-style menu item (selectable) + 'menuitemcheckbox', // Checkbox-style menu item 
(toggleable) + 'radio', // Radio button (selectable) + 'checkbox', // Checkbox (toggleable) + 'tab', // Tab (clickable to switch content) + 'switch', // Toggle switch (clickable to change state) + 'slider', // Slider control (draggable) + 'spinbutton', // Number input with up/down controls + 'combobox', // Dropdown with text input + 'searchbox', // Search input field + 'textbox', // Text input field + 'listbox', // Selectable list + 'option', // Selectable option in a list + 'scrollbar' // Scrollable control + ]); + // Basic role/attribute checks const hasInteractiveRole = - hasAddressInputClass || interactiveElements.has(tagName) || interactiveRoles.has(role) || - interactiveRoles.has(ariaRole) || - (tabIndex !== null && - tabIndex !== "-1" && - element.parentElement?.tagName.toLowerCase() !== "body") || - element.getAttribute("data-action") === "a-dropdown-select" || - element.getAttribute("data-action") === "a-dropdown-button"; + interactiveRoles.has(ariaRole); if (hasInteractiveRole) return true; - // Additional checks for cookie banners and consent UI - const isCookieBanner = - element.id?.toLowerCase().includes('cookie') || - element.id?.toLowerCase().includes('consent') || - element.id?.toLowerCase().includes('notice') || - (element.classList && ( - element.classList.contains('otCenterRounded') || - element.classList.contains('ot-sdk-container') - )) || - element.getAttribute('data-nosnippet') === 'true' || - element.getAttribute('aria-label')?.toLowerCase().includes('cookie') || - element.getAttribute('aria-label')?.toLowerCase().includes('consent') || - (element.tagName.toLowerCase() === 'div' && ( - element.id?.includes('onetrust') || - (element.classList && ( - element.classList.contains('onetrust') || - element.classList.contains('cookie') || - element.classList.contains('consent') - )) - )); - - if (isCookieBanner) return true; - - // Additional check for buttons in cookie banners - const isInCookieBanner = typeof element.closest === 'function' && element.closest( - '[id*="cookie"],[id*="consent"],[class*="cookie"],[class*="consent"],[id*="onetrust"]' - ); + return false - if (isInCookieBanner && ( - element.tagName.toLowerCase() === 'button' || - element.getAttribute('role') === 'button' || - (element.classList && element.classList.contains('button')) || - element.onclick || - element.getAttribute('onclick') - )) { - return true; - } - // Get computed style - const style = window.getComputedStyle(element); // Check for event listeners const hasClickHandler = @@ -591,14 +667,6 @@ const listeners = {}; const eventTypes = [ "click", - "mousedown", - "mouseup", - "touchstart", - "touchend", - "keydown", - "keyup", - "focus", - "blur", ]; for (const type of eventTypes) { @@ -639,30 +707,37 @@ element.draggable || element.getAttribute("draggable") === "true"; return ( - hasAriaProps || - hasClickHandler || - hasClickListeners || - isDraggable || - isContentEditable + // hasAriaProps || + // hasClickHandler || + // hasClickListeners || + // isDraggable || + // isContentEditable || + false ); } + /** * Checks if an element is the topmost element at its position. 
*/ function isTopElement(element) { const rect = getCachedBoundingRect(element); - // If element is not in viewport, consider it top - const isInViewport = ( - rect.left < window.innerWidth && - rect.right > 0 && - rect.top < window.innerHeight && - rect.bottom > 0 - ); - - if (!isInViewport) { - return true; + if (viewportExpansion <= 0) { + if (rect.bottom < 0 || + rect.top > window.innerHeight || + rect.right < 0 || + rect.left > window.innerWidth) { + return false; + } + } else { + // For positive viewportExpansion, only expand in Y direction + if (rect.bottom < -viewportExpansion || + rect.top > window.innerHeight + viewportExpansion || + rect.right < 0 || + rect.left > window.innerWidth) { + return false; + } } // Find the correct document context and root element @@ -775,7 +850,8 @@ element.hasAttribute("role") || element.hasAttribute("tabindex") || element.hasAttribute("aria-") || - element.hasAttribute("data-action"); + element.hasAttribute("data-action") || + element.getAttribute("contenteditable") == "true"; return hasQuickInteractiveAttr; } @@ -955,16 +1031,16 @@ if (domElement) nodeData.children.push(domElement); } } - // Handle shadow DOM - else if (node.shadowRoot) { - nodeData.shadowRoot = true; - for (const child of node.shadowRoot.childNodes) { - const domElement = buildDomTree(child, parentIframe); - if (domElement) nodeData.children.push(domElement); - } - } - // Handle regular elements else { + // Handle shadow DOM + if (node.shadowRoot) { + nodeData.shadowRoot = true; + for (const child of node.shadowRoot.childNodes) { + const domElement = buildDomTree(child, parentIframe); + if (domElement) nodeData.children.push(domElement); + } + } + // Handle regular elements for (const child of node.childNodes) { const domElement = buildDomTree(child, parentIframe); if (domElement) nodeData.children.push(domElement); diff --git a/browser_use/dom/history_tree_processor/view.py b/browser_use/dom/history_tree_processor/view.py index e970ad5b53..250a64903d 100644 --- a/browser_use/dom/history_tree_processor/view.py +++ b/browser_use/dom/history_tree_processor/view.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import Optional from pydantic import BaseModel diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py index d03fbecfbf..0510d9d983 100644 --- a/browser_use/dom/service.py +++ b/browser_use/dom/service.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from importlib import resources from typing import TYPE_CHECKING, Optional +from urllib.parse import urlparse if TYPE_CHECKING: from playwright.async_api import Page @@ -31,7 +32,7 @@ def __init__(self, page: 'Page'): self.page = page self.xpath_cache = {} - self.js_code = resources.read_text('browser_use.dom', 'buildDomTree.js') + self.js_code = resources.files('browser_use.dom').joinpath('buildDomTree.js').read_text() # region - Clickable elements @time_execution_async('--get_clickable_elements') @@ -44,6 +45,24 @@ async def get_clickable_elements( element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion) return DOMState(element_tree=element_tree, selector_map=selector_map) + @time_execution_async('--get_cross_origin_iframes') + async def get_cross_origin_iframes(self) -> list[str]: + # invisible cross-origin iframes are used for ads and tracking, dont open those + hidden_frame_urls = await self.page.locator('iframe').filter(visible=False).evaluate_all('e => e.map(e => e.src)') + + is_ad_url = 
lambda url: any( + domain in urlparse(url).netloc for domain in ('doubleclick.net', 'adroll.com', 'googletagmanager.com') + ) + + return [ + frame.url + for frame in self.page.frames + if urlparse(frame.url).netloc # exclude data:urls and about:blank + and urlparse(frame.url).netloc != urlparse(self.page.url).netloc # exclude same-origin iframes + and frame.url not in hidden_frame_urls # exclude hidden frames + and not is_ad_url(frame.url) # exclude most common ad network tracker frame URLs + ] + @time_execution_async('--build_dom_tree') async def _build_dom_tree( self, @@ -54,6 +73,20 @@ async def _build_dom_tree( if await self.page.evaluate('1+1') != 2: raise ValueError('The page cannot evaluate javascript code properly') + if self.page.url == 'about:blank': + # short-circuit if the page is a new empty tab for speed, no need to inject buildDomTree.js + return ( + DOMElementNode( + tag_name='body', + xpath='', + attributes={}, + children=[], + is_visible=False, + parent=None, + ), + {}, + ) + # NOTE: We execute JS code in the browser to extract important DOM information. # The returned hash map contains information about the DOM tree and the # relationship between the DOM elements. @@ -66,14 +99,18 @@ async def _build_dom_tree( } try: - eval_page = await self.page.evaluate(self.js_code, args) + eval_page: dict = await self.page.evaluate(self.js_code, args) except Exception as e: logger.error('Error evaluating JavaScript: %s', e) raise # Only log performance metrics in debug mode if debug_mode and 'perfMetrics' in eval_page: - logger.debug('DOM Tree Building Performance Metrics:\n%s', json.dumps(eval_page['perfMetrics'], indent=2)) + logger.debug( + 'DOM Tree Building Performance Metrics for: %s\n%s', + self.page.url, + json.dumps(eval_page['perfMetrics'], indent=2), + ) return await self._construct_dom_tree(eval_page) diff --git a/browser_use/dom/tests/extraction_test.py b/browser_use/dom/tests/extraction_test.py index 0a4dce638c..1d862f95d5 100644 --- a/browser_use/dom/tests/extraction_test.py +++ b/browser_use/dom/tests/extraction_test.py @@ -1,12 +1,20 @@ import asyncio import time +from langchain_openai import ChatOpenAI + from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.dom.service import DomService from browser_use.utils import time_execution_sync +def count_string_tokens(string: str, model: str) -> int: + """Count the number of tokens in a string using a specified model.""" + llm = ChatOpenAI(model=model) + return llm.count_tokens(string) + + async def test_process_html_file(): config = BrowserContextConfig( cookies_file='cookies3.json', @@ -81,20 +89,24 @@ async def test_viewport(expansion: int, description: str): async def test_focus_vs_all_elements(): config = BrowserContextConfig( - cookies_file='cookies3.json', + # cookies_file='cookies3.json', disable_security=True, wait_for_network_idle_page_load_time=2, ) browser = Browser( config=BrowserConfig( - # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', ) ) context = BrowserContext(browser=browser, config=config) # noqa: F821 websites = [ + 'https://en.wikipedia.org/wiki/Humanist_Party_of_Ontario', + 'https://www.google.com/travel/flights?tfs=CBwQARoJagcIARIDTEpVGglyBwgBEgNMSlVAAUgBcAGCAQsI____________AZgBAQ&tfu=KgIIAw&hl=en-US&gl=US', + # 'https://www.concur.com/?&cookie_preferences=cpra', 
'https://immobilienscout24.de', + 'https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit', 'https://www.zeiss.com/career/en/job-search.html?page=1', 'https://www.mlb.com/yankees/stats/', 'https://www.amazon.com/s?k=laptop&s=review-rank&crid=1RZCEJ289EUSI&qid=1740202453&sprefix=laptop%2Caps%2C166&ref=sr_st_review-rank&ds=v1%3A4EnYKXVQA7DIE41qCvRZoNB4qN92Jlztd3BPsTFXmxU', @@ -124,13 +136,15 @@ async def test_focus_vs_all_elements(): # First get all elements print('\nGetting all elements:') all_elements_state = await time_execution_sync('get_all_elements')(dom_service.get_clickable_elements)( - highlight_elements=True, viewport_expansion=100 + highlight_elements=True, viewport_expansion=1000 ) selector_map = all_elements_state.selector_map total_elements = len(selector_map.keys()) print(f'Total number of elements: {total_elements}') + print(all_elements_state.element_tree.clickable_elements_to_string()) + answer = input('Press Enter to clear highlights and continue...') if answer == 'q': break diff --git a/browser_use/exceptions.py b/browser_use/exceptions.py new file mode 100644 index 0000000000..2e01cf0cdb --- /dev/null +++ b/browser_use/exceptions.py @@ -0,0 +1,5 @@ +class LLMException(Exception): + def __init__(self, status_code, message): + self.status_code = status_code + self.message = message + super().__init__(f'Error {status_code}: {message}') diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py index 043252bd78..887fe7e7fd 100644 --- a/browser_use/logging_config.py +++ b/browser_use/logging_config.py @@ -77,7 +77,7 @@ def setup_logging(): class BrowserUseFormatter(logging.Formatter): def format(self, record): - if type(record.name) == str and record.name.startswith('browser_use.'): + if isinstance(record.name, str) and record.name.startswith('browser_use.'): record.name = record.name.split('.')[-2] return super().format(record) diff --git a/browser_use/telemetry/service.py b/browser_use/telemetry/service.py index 6a2e82e458..e5e67e4987 100644 --- a/browser_use/telemetry/service.py +++ b/browser_use/telemetry/service.py @@ -42,7 +42,7 @@ def __init__(self) -> None: if telemetry_disabled: self._posthog_client = None else: - logging.info( + logger.info( 'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.' ) self._posthog_client = Posthog( diff --git a/browser_use/utils.py b/browser_use/utils.py index 860b35a320..bd3bad1fb7 100644 --- a/browser_use/utils.py +++ b/browser_use/utils.py @@ -1,16 +1,277 @@ +import asyncio import logging +import os +import platform +import signal import time from functools import wraps -from typing import Any, Callable, Coroutine, ParamSpec, TypeVar +from sys import stderr +from typing import Any, Callable, Coroutine, List, Optional, ParamSpec, TypeVar logger = logging.getLogger(__name__) +# Global flag to prevent duplicate exit messages +_exiting = False # Define generic type variables for return type and parameters R = TypeVar('R') P = ParamSpec('P') +class SignalHandler: + """ + A modular and reusable signal handling system for managing SIGINT (Ctrl+C), SIGTERM, + and other signals in asyncio applications. 
+ + This class provides: + - Configurable signal handling for SIGINT and SIGTERM + - Support for custom pause/resume callbacks + - Management of event loop state across signals + - Standardized handling of first and second Ctrl+C presses + - Cross-platform compatibility (with simplified behavior on Windows) + """ + + def __init__( + self, + loop: Optional[asyncio.AbstractEventLoop] = None, + pause_callback: Optional[Callable[[], None]] = None, + resume_callback: Optional[Callable[[], None]] = None, + custom_exit_callback: Optional[Callable[[], None]] = None, + exit_on_second_int: bool = True, + interruptible_task_patterns: List[str] = None, + ): + """ + Initialize the signal handler. + + Args: + loop: The asyncio event loop to use. Defaults to current event loop. + pause_callback: Function to call when system is paused (first Ctrl+C) + resume_callback: Function to call when system is resumed + custom_exit_callback: Function to call on exit (second Ctrl+C or SIGTERM) + exit_on_second_int: Whether to exit on second SIGINT (Ctrl+C) + interruptible_task_patterns: List of patterns to match task names that should be + canceled on first Ctrl+C (default: ['step', 'multi_act', 'get_next_action']) + """ + self.loop = loop or asyncio.get_event_loop() + self.pause_callback = pause_callback + self.resume_callback = resume_callback + self.custom_exit_callback = custom_exit_callback + self.exit_on_second_int = exit_on_second_int + self.interruptible_task_patterns = interruptible_task_patterns or ['step', 'multi_act', 'get_next_action'] + self.is_windows = platform.system() == 'Windows' + + # Initialize loop state attributes + self._initialize_loop_state() + + # Store original signal handlers to restore them later if needed + self.original_sigint_handler = None + self.original_sigterm_handler = None + + def _initialize_loop_state(self) -> None: + """Initialize loop state attributes used for signal handling.""" + setattr(self.loop, 'ctrl_c_pressed', False) + setattr(self.loop, 'waiting_for_input', False) + + def register(self) -> None: + """Register signal handlers for SIGINT and SIGTERM.""" + try: + if self.is_windows: + # On Windows, use simple signal handling with immediate exit on Ctrl+C + def windows_handler(sig, frame): + print('\n\n🛑 Got Ctrl+C. Exiting immediately on Windows...\n', file=stderr) + # Run the custom exit callback if provided + if self.custom_exit_callback: + self.custom_exit_callback() + os._exit(0) + + self.original_sigint_handler = signal.signal(signal.SIGINT, windows_handler) + else: + # On Unix-like systems, use asyncio's signal handling for smoother experience + self.original_sigint_handler = self.loop.add_signal_handler(signal.SIGINT, lambda: self.sigint_handler()) + self.original_sigterm_handler = self.loop.add_signal_handler(signal.SIGTERM, lambda: self.sigterm_handler()) + + except Exception: + # there are situations where signal handlers are not supported, e.g. 
+ # - when running in a thread other than the main thread + # - some operating systems + # - inside jupyter notebooks + pass + + def unregister(self) -> None: + """Unregister signal handlers and restore original handlers if possible.""" + try: + if self.is_windows: + # On Windows, just restore the original SIGINT handler + if self.original_sigint_handler: + signal.signal(signal.SIGINT, self.original_sigint_handler) + else: + # On Unix-like systems, use asyncio's signal handler removal + self.loop.remove_signal_handler(signal.SIGINT) + self.loop.remove_signal_handler(signal.SIGTERM) + + # Restore original handlers if available + if self.original_sigint_handler: + signal.signal(signal.SIGINT, self.original_sigint_handler) + if self.original_sigterm_handler: + signal.signal(signal.SIGTERM, self.original_sigterm_handler) + except Exception as e: + logger.warning(f'Error while unregistering signal handlers: {e}') + + def _handle_second_ctrl_c(self) -> None: + """ + Handle a second Ctrl+C press by performing cleanup and exiting. + This is shared logic used by both sigint_handler and wait_for_resume. + """ + global _exiting + + if not _exiting: + _exiting = True + + # Call custom exit callback if provided + if self.custom_exit_callback: + try: + self.custom_exit_callback() + except Exception as e: + logger.error(f'Error in exit callback: {e}') + + # Force immediate exit - more reliable than sys.exit() + print('\n\n🛑 Got second Ctrl+C. Exiting immediately...\n', file=stderr) + os._exit(0) + + def sigint_handler(self) -> None: + """ + SIGINT (Ctrl+C) handler. + + First Ctrl+C: Cancel current step and pause. + Second Ctrl+C: Exit immediately if exit_on_second_int is True. + """ + global _exiting + + if _exiting: + # Already exiting, force exit immediately + os._exit(0) + + if getattr(self.loop, 'ctrl_c_pressed', False): + # If we're in the waiting for input state, let the pause method handle it + if getattr(self.loop, 'waiting_for_input', False): + return + + # Second Ctrl+C - exit immediately if configured to do so + if self.exit_on_second_int: + self._handle_second_ctrl_c() + + # Mark that Ctrl+C was pressed + self.loop.ctrl_c_pressed = True + + # Cancel current tasks that should be interruptible - this is crucial for immediate pausing + self._cancel_interruptible_tasks() + + # Call pause callback if provided - this sets the paused flag + if self.pause_callback: + try: + self.pause_callback() + except Exception as e: + logger.error(f'Error in pause callback: {e}') + + # Log pause message after pause_callback is called (not before) + print('----------------------------------------------------------------------', file=stderr) + + def sigterm_handler(self) -> None: + """ + SIGTERM handler. + + Always exits the program completely. + """ + global _exiting + if not _exiting: + _exiting = True + print('\n\n🛑 SIGTERM received. 
Exiting immediately...\n\n', file=stderr) + + # Call custom exit callback if provided + if self.custom_exit_callback: + self.custom_exit_callback() + + os._exit(0) + + def _cancel_interruptible_tasks(self) -> None: + """Cancel current tasks that should be interruptible.""" + current_task = asyncio.current_task(self.loop) + for task in asyncio.all_tasks(self.loop): + if task != current_task and not task.done(): + task_name = task.get_name() if hasattr(task, 'get_name') else str(task) + # Cancel tasks that match certain patterns + if any(pattern in task_name for pattern in self.interruptible_task_patterns): + logger.debug(f'Cancelling task: {task_name}') + task.cancel() + # Add exception handler to silence "Task exception was never retrieved" warnings + task.add_done_callback(lambda t: t.exception() if t.cancelled() else None) + + # Also cancel the current task if it's interruptible + if current_task and not current_task.done(): + task_name = current_task.get_name() if hasattr(current_task, 'get_name') else str(current_task) + if any(pattern in task_name for pattern in self.interruptible_task_patterns): + logger.debug(f'Cancelling current task: {task_name}') + current_task.cancel() + + def wait_for_resume(self) -> None: + """ + Wait for user input to resume or exit. + + This method should be called after handling the first Ctrl+C. + It temporarily restores default signal handling to allow catching + a second Ctrl+C directly. + """ + # Set flag to indicate we're waiting for input + setattr(self.loop, 'waiting_for_input', True) + + # Temporarily restore default signal handling for SIGINT + # This ensures KeyboardInterrupt will be raised during input() + original_handler = signal.getsignal(signal.SIGINT) + try: + signal.signal(signal.SIGINT, signal.default_int_handler) + except ValueError: + # we are running in a thread other than the main thread + # or signal handlers are not supported for some other reason + pass + + green = '\x1b[32;1m' + red = '\x1b[31m' + blink = '\033[33;5m' + unblink = '\033[0m' + reset = '\x1b[0m' + + try: # escape code is to blink the ... 
+ print( + f'➡️ Press {green}[Enter]{reset} to resume or {red}[Ctrl+C]{reset} again to exit{blink}...{unblink} ', + end='', + flush=True, + file=stderr, + ) + input() # This will raise KeyboardInterrupt on Ctrl+C + + # Call resume callback if provided + if self.resume_callback: + self.resume_callback() + except KeyboardInterrupt: + # Use the shared method to handle second Ctrl+C + self._handle_second_ctrl_c() + finally: + try: + # Restore our signal handler + signal.signal(signal.SIGINT, original_handler) + setattr(self.loop, 'waiting_for_input', False) + except Exception: + pass + + def reset(self) -> None: + """Reset state after resuming.""" + # Clear the flags + if hasattr(self.loop, 'ctrl_c_pressed'): + self.loop.ctrl_c_pressed = False + if hasattr(self.loop, 'waiting_for_input'): + self.loop.waiting_for_input = False + + def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], Callable[P, R]]: def decorator(func: Callable[P, R]) -> Callable[P, R]: @wraps(func) @@ -52,3 +313,8 @@ def wrapper(*args, **kwargs): return instance[0] return wrapper + + +def check_env_variables(keys: list[str], any_or_all=all) -> bool: + """Check if all required environment variables are set""" + return any_or_all(os.getenv(key).strip() for key in keys) diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx index 58371705b0..501415c224 100644 --- a/docs/customize/agent-settings.mdx +++ b/docs/customize/agent-settings.mdx @@ -47,7 +47,8 @@ agent = Agent( - Disable to reduce costs or use models without vision support - For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size) - `save_conversation_path`: Path to save the complete conversation history. Useful for debugging. -- `system_prompt_class`: Custom system prompt class. See System Prompt for customization options. +- `override_system_message`: Completely replace the default system prompt with a custom one. +- `extend_system_message`: Add additional instructions to the default system prompt. Vision capabilities are recommended for better web interaction understanding, @@ -128,7 +129,7 @@ documentation](https://playwright.dev/docs/api/class-browsercontext). The agent is executed using the async `run()` method: -- `max_steps` (default: `100`) +- `max_steps` (default: `100`) Maximum number of steps the agent can take during execution. This prevents infinite loops and helps control execution time. ## Agent History @@ -179,6 +180,20 @@ agent = Agent( ) ``` +## Run with message context + +You can configure the agent and provide a separate message to help the LLM understand the task better. + +```python +from langchain_openai import ChatOpenAI + +agent = Agent( + task="your task", + message_context="Additional information about the task", + llm = ChatOpenAI(model='gpt-4o') +) +``` + ## Run with planner model You can configure the agent to use a separate planner model for high-level task planning: @@ -213,3 +228,57 @@ Using a separate planner model can help: The planner model is optional. If not specified, the agent will not use the planner model. + +### Optional Parameters + +- `message_context`: Additional information about the task to help the LLM understand the task better. +- `initial_actions`: List of initial actions to run before the main task. +- `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`. +- `max_failures`: Maximum number of failures before giving up. Defaults to `3`. 
+- `retry_delay`: Time to wait between retries in seconds when rate limited. Defaults to `10`. +- `generate_gif`: Enable/disable GIF generation. Defaults to `False`. Set to `True` or a string path to save the GIF. +## Memory Management + +Browser Use includes a procedural memory system using [Mem0](https://mem0.ai) that automatically summarizes the agent's conversation history at regular intervals to optimize context window usage during long tasks. + +```python +agent = Agent( + task="your task", + llm=llm, + enable_memory=True, + memory_interval=10, # create procedural memory every 10 steps +) +``` + +### Memory Parameters + +- `enable_memory`: Enable/disable the procedural memory system. Defaults to `True`. +- `memory_interval`: Number of steps between memory summarization. Defaults to `10`. +- `memory_config`: Optional configuration dictionary for the underlying memory system. + +### How Memory Works + +When enabled, the agent periodically compresses its conversation history into concise summaries: + +1. Every `memory_interval` steps, the agent reviews its recent interactions +2. It creates a procedural memory summary using the same LLM as the agent +3. The original messages are replaced with the summary, reducing token usage +4. This process helps maintain important context while freeing up the context window + +### Disabling Memory + +If you want to disable the memory system (for debugging or for shorter tasks), set `enable_memory` to `False`: + +```python +agent = Agent( + task="your task", + llm=llm, + enable_memory=False +) +``` + + + Disabling memory may be useful for debugging or short tasks, but for longer + tasks, it can lead to context window overflow as the conversation history + grows. The memory system helps maintain performance during extended sessions. + diff --git a/docs/customize/browser-settings.mdx b/docs/customize/browser-settings.mdx index 41995f9e1e..13bdefeb3e 100644 --- a/docs/customize/browser-settings.mdx +++ b/docs/customize/browser-settings.mdx @@ -22,7 +22,7 @@ from browser_use import BrowserConfig # Basic configuration config = BrowserConfig( headless=False, - disable_security=True + disable_security=False ) browser = Browser(config=config) @@ -38,12 +38,12 @@ agent = Agent( - **headless** (default: `False`) Runs the browser without a visible UI. Note that some websites may detect headless mode. -- **disable_security** (default: `True`) +- **disable_security** (default: `False`) Disables browser security features. While this can fix certain functionality issues (like cross-site iFrames), it should be used cautiously, especially when visiting untrusted websites. ### Additional Settings -- **extra_chromium_args** (default: `[]`) +- **extra_browser_args** (default: `[]`) Additional arguments are passed to the browser at launch. See the [full list of available arguments](https://github.com/browser-use/browser-use/blob/main/browser_use/browser/browser.py#L180). - **proxy** (default: `None`) @@ -72,7 +72,7 @@ config = BrowserConfig( ``` - **wss_url** (default: `None`) - WebSocket URL for connecting to external browser providers (e.g., anchorbrowser.com, steel.dev, browserbase.com, browserless.io). + WebSocket URL for connecting to external browser providers (e.g., [anchorbrowser.io](https://anchorbrowser.io), steel.dev, browserbase.com, browserless.io, [TestingBot](https://testingbot.com/support/ai/integrations/browser-use)). This overrides local browser settings and uses the provider's configuration. 
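+
+  A minimal sketch of passing a provider's WebSocket endpoint (assuming `wss_url` is handed straight to `BrowserConfig` as described above; the URL below is a placeholder you get from your provider):
+
+  ```python
+  from browser_use import Browser, BrowserConfig
+
+  browser = Browser(config=BrowserConfig(wss_url="wss://your-provider.example/your-session-endpoint"))
+  ```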
@@ -98,12 +98,12 @@ Connect to your existing Chrome installation to access saved states and cookies. ```python config = BrowserConfig( - chrome_instance_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + browser_binary_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" ) ``` -- **chrome_instance_path** (default: `None`) - Path to connect to an existing Chrome installation. Particularly useful for workflows requiring existing login states or browser preferences. +- **browser_binary_path** (default: `None`) + Path to connect to an existing Browser installation. Particularly useful for workflows requiring existing login states or browser preferences. This will overwrite other browser settings. @@ -161,8 +161,8 @@ async def run_search(): Highlight interactive elements on the screen with colorful bounding boxes. - **viewport_expansion** (default: `500`) - Viewport expansion in pixels. With this you can controll how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. - Default is 500 pixels, that means that we inlcude a little bit more than the visible viewport inside the context. + Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. + Default is 500 pixels, that means that we include a little bit more than the visible viewport inside the context. ### Restrict URLs diff --git a/docs/customize/custom-functions.mdx b/docs/customize/custom-functions.mdx index 5e3ceb95c2..2820204496 100644 --- a/docs/customize/custom-functions.mdx +++ b/docs/customize/custom-functions.mdx @@ -43,13 +43,19 @@ agent = Agent( For actions that need browser access, simply add the `browser` parameter inside the function parameters: + + Please note that browser-use’s `Browser` class is a wrapper class around + Playwright’s `Browser`. The `Browser.playwright_browser` attr can be used + to directly access the Playwright browser object if needed. + + ```python from browser_use import Browser, Controller, ActionResult controller = Controller() @controller.action('Open website') async def open_website(url: str, browser: Browser): - page = browser.get_current_page() + page = await browser.get_current_page() await page.goto(url) return ActionResult(extracted_content='Website opened') ``` @@ -125,4 +131,3 @@ controller = Controller(exclude_actions=['open_tab', 'search_google']) For more examples like file upload or notifications, visit [examples/custom-functions](https://github.com/browser-use/browser-use/tree/main/examples/custom-functions). - diff --git a/docs/customize/hooks.mdx b/docs/customize/hooks.mdx new file mode 100644 index 0000000000..8b9de9e17e --- /dev/null +++ b/docs/customize/hooks.mdx @@ -0,0 +1,346 @@ +--- +title: "Lifecycle Hooks" +description: "Customize agent behavior with lifecycle hooks" +icon: "Wrench" +author: "Carlos A. Planchón" +--- + +# Using Agent Lifecycle Hooks + +Browser-Use provides lifecycle hooks that allow you to execute custom code at specific points during the agent's execution. These hooks enable you to capture detailed information about the agent's actions, modify behavior, or integrate with external systems. 
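+
+As a quick preview before the details below, a hook is just an async callable that receives the running `Agent` instance. This is a minimal sketch (the hook name is illustrative; the attributes used are the ones shown in the examples in this guide):
+
+```python
+async def log_current_url(agent):
+    # the same Agent attributes used in the examples below are available here
+    page = await agent.browser_context.get_current_page()
+    print(f"Current URL: {page.url}")
+    # pass it to the agent with: await agent.run(on_step_start=log_current_url)
+```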
+
+## Available Hooks
+
+Currently, Browser-Use provides the following hooks:
+
+| Hook | Description | When it's called |
+| ---- | ----------- | ---------------- |
+| `on_step_start` | Executed at the beginning of each agent step | Before the agent processes the current state and decides on the next action |
+| `on_step_end` | Executed at the end of each agent step | After the agent has executed the action for the current step |
+
+## Using Hooks
+
+Hooks are passed as parameters to the `agent.run()` method. Each hook should be a callable function that accepts the agent instance as its parameter.
+
+### Basic Example
+
+```python
+from pathlib import Path
+
+from browser_use import Agent
+from langchain_openai import ChatOpenAI
+
+async def my_step_hook(agent):
+    # inside a hook you can access all the state and methods under the Agent object:
+    #   agent.settings, agent.state, agent.task
+    #   agent.controller, agent.llm, agent.browser, agent.browser_context
+    #   agent.pause(), agent.resume(), agent.add_new_task(...), etc.
+
+    current_page = await agent.browser_context.get_current_page()
+
+    visit_log = agent.state.history.urls()
+    current_url = current_page.url
+    previous_url = visit_log[-2] if len(visit_log) >= 2 else None
+    print(f"Agent was last on URL: {previous_url} and is now on {current_url}")
+
+    if 'completed' in current_url:
+        agent.pause()
+        Path('result.txt').write_text(await current_page.content())
+        input('Saved "completed" page content to result.txt, press [Enter] to resume...')
+        agent.resume()
+
+agent = Agent(
+    task="Search for the latest news about AI",
+    llm=ChatOpenAI(model="gpt-4o"),
+)
+
+await agent.run(
+    on_step_start=my_step_hook,
+    # on_step_end=...
+    max_steps=10
+)
+```
+
+## Complete Example: Agent Activity Recording System
+
+This comprehensive example demonstrates a complete implementation for recording and saving Browser-Use agent activity, consisting of both server and client components.
+
+### Setup Instructions
+
+To use this example, you'll need to:
+
+1. Set up the required dependencies:
+   ```bash
+   pip install fastapi uvicorn prettyprinter pyobjtojson dotenv browser-use langchain-openai
+   ```
+
+2. Create two separate Python files:
+   - `api.py` - The FastAPI server component
+   - `client.py` - The Browser-Use agent with recording hook
+
+3. Run both components:
+   - Start the API server first: `python api.py`
+   - Then run the client: `python client.py`
+
+### Server Component (api.py)
+
+The server component handles receiving and storing the agent's activity data:
+
+```python
+#!/usr/bin/env python3
+
+#
+# FastAPI API to record and save Browser-Use activity data.
+# Save this code to api.py and run with `python api.py`
+#
+
+import json
+import base64
+from pathlib import Path
+
+from fastapi import FastAPI, Request
+import prettyprinter
+import uvicorn
+
+prettyprinter.install_extras()
+
+# Utility function to save screenshots
+def b64_to_png(b64_string: str, output_file):
+    """
+    Convert a Base64-encoded string to a PNG file.
+ + :param b64_string: A string containing Base64-encoded data + :param output_file: The path to the output PNG file + """ + with open(output_file, "wb") as f: + f.write(base64.b64decode(b64_string)) + +# Initialize FastAPI app +app = FastAPI() + + +@app.post("/post_agent_history_step") +async def post_agent_history_step(request: Request): + data = await request.json() + prettyprinter.cpprint(data) + + # Ensure the "recordings" folder exists using pathlib + recordings_folder = Path("recordings") + recordings_folder.mkdir(exist_ok=True) + + # Determine the next file number by examining existing .json files + existing_numbers = [] + for item in recordings_folder.iterdir(): + if item.is_file() and item.suffix == ".json": + try: + file_num = int(item.stem) + existing_numbers.append(file_num) + except ValueError: + # In case the file name isn't just a number + pass + + if existing_numbers: + next_number = max(existing_numbers) + 1 + else: + next_number = 1 + + # Construct the file path + file_path = recordings_folder / f"{next_number}.json" + + # Save the JSON data to the file + with file_path.open("w") as f: + json.dump(data, f, indent=2) + + # Optionally save screenshot if needed + # if "website_screenshot" in data and data["website_screenshot"]: + # screenshot_folder = Path("screenshots") + # screenshot_folder.mkdir(exist_ok=True) + # b64_to_png(data["website_screenshot"], screenshot_folder / f"{next_number}.png") + + return {"status": "ok", "message": f"Saved to {file_path}"} + +if __name__ == "__main__": + print("Starting Browser-Use recording API on http://0.0.0.0:9000") + uvicorn.run(app, host="0.0.0.0", port=9000) +``` + +### Client Component (client.py) + +The client component runs the Browser-Use agent with a recording hook: + +```python +#!/usr/bin/env python3 + +# +# Client to record and save Browser-Use activity. 
+# Save this code to client.py and run with `python client.py` +# + +import asyncio +import requests +from dotenv import load_dotenv +from pyobjtojson import obj_to_json +from langchain_openai import ChatOpenAI +from browser_use import Agent + +# Load environment variables (for API keys) +load_dotenv() + + +def send_agent_history_step(data): + """Send the agent step data to the recording API""" + url = "http://127.0.0.1:9000/post_agent_history_step" + response = requests.post(url, json=data) + return response.json() + + +async def record_activity(agent_obj): + """Hook function that captures and records agent activity at each step""" + website_html = None + website_screenshot = None + urls_json_last_elem = None + model_thoughts_last_elem = None + model_outputs_json_last_elem = None + model_actions_json_last_elem = None + extracted_content_json_last_elem = None + + print('--- ON_STEP_START HOOK ---') + + # Capture current page state + website_html = await agent_obj.browser_context.get_page_html() + website_screenshot = await agent_obj.browser_context.take_screenshot() + + # Make sure we have state history + if hasattr(agent_obj, "state"): + history = agent_obj.state.history + else: + history = None + print("Warning: Agent has no state history") + return + + # Process model thoughts + model_thoughts = obj_to_json( + obj=history.model_thoughts(), + check_circular=False + ) + if len(model_thoughts) > 0: + model_thoughts_last_elem = model_thoughts[-1] + + # Process model outputs + model_outputs = agent_obj.state.history.model_outputs() + model_outputs_json = obj_to_json( + obj=model_outputs, + check_circular=False + ) + if len(model_outputs_json) > 0: + model_outputs_json_last_elem = model_outputs_json[-1] + + # Process model actions + model_actions = agent_obj.state.history.model_actions() + model_actions_json = obj_to_json( + obj=model_actions, + check_circular=False + ) + if len(model_actions_json) > 0: + model_actions_json_last_elem = model_actions_json[-1] + + # Process extracted content + extracted_content = agent_obj.state.history.extracted_content() + extracted_content_json = obj_to_json( + obj=extracted_content, + check_circular=False + ) + if len(extracted_content_json) > 0: + extracted_content_json_last_elem = extracted_content_json[-1] + + # Process URLs + urls = agent_obj.state.history.urls() + urls_json = obj_to_json( + obj=urls, + check_circular=False + ) + if len(urls_json) > 0: + urls_json_last_elem = urls_json[-1] + + # Create a summary of all data for this step + model_step_summary = { + "website_html": website_html, + "website_screenshot": website_screenshot, + "url": urls_json_last_elem, + "model_thoughts": model_thoughts_last_elem, + "model_outputs": model_outputs_json_last_elem, + "model_actions": model_actions_json_last_elem, + "extracted_content": extracted_content_json_last_elem + } + + print("--- MODEL STEP SUMMARY ---") + print(f"URL: {urls_json_last_elem}") + + # Send data to the API + result = send_agent_history_step(data=model_step_summary) + print(f"Recording API response: {result}") + + +async def run_agent(): + """Run the Browser-Use agent with the recording hook""" + agent = Agent( + task="Compare the price of gpt-4o and DeepSeek-V3", + llm=ChatOpenAI(model="gpt-4o"), + ) + + try: + print("Starting Browser-Use agent with recording hook") + await agent.run( + on_step_start=record_activity, + max_steps=30 + ) + except Exception as e: + print(f"Error running agent: {e}") + + +if __name__ == "__main__": + # Check if API is running + try: + 
requests.get("http://127.0.0.1:9000")
+        print("Recording API is available")
+    except Exception:
+        print("Warning: Recording API may not be running. Start api.py first.")
+
+    # Run the agent
+    asyncio.run(run_agent())
+```
+
+### Working with the Recorded Data
+
+After running the agent, you'll find the recorded data in the `recordings` directory. Here's how you can use this data:
+
+1. **View recorded sessions**: Each JSON file contains a snapshot of agent activity for one step
+2. **Extract screenshots**: You can modify the API to save screenshots separately
+3. **Analyze agent behavior**: Use the recorded data to study how the agent navigates websites
+
+### Extending the Example
+
+You can extend this recording system in several ways:
+
+1. **Save screenshots separately**: Uncomment the screenshot saving code in the API
+2. **Add a web dashboard**: Create a simple web interface to view recorded sessions
+3. **Add session IDs**: Modify the API to group steps by agent session
+4. **Add filtering**: Implement filters to record only specific types of actions
+
+## Data Available in Hooks
+
+When working with agent hooks, you have access to the entire agent instance. Here are some useful data points you can access:
+
+- `agent.state.history.model_thoughts()`: Reasoning from Browser Use's model.
+- `agent.state.history.model_outputs()`: Raw outputs from Browser Use's model.
+- `agent.state.history.model_actions()`: Actions taken by the agent
+- `agent.state.history.extracted_content()`: Content extracted from web pages
+- `agent.state.history.urls()`: URLs visited by the agent
+- `agent.browser_context.get_page_html()`: Current page HTML
+- `agent.browser_context.take_screenshot()`: Screenshot of the current page
+
+## Tips for Using Hooks
+
+- **Avoid blocking operations**: Since hooks run in the same execution thread as the agent, try to keep them efficient or use asynchronous patterns.
+- **Handle exceptions**: Make sure your hook functions handle exceptions gracefully to prevent interrupting the agent's main flow.
+- **Consider storage needs**: When capturing full HTML and screenshots, be mindful of storage requirements.
+
+Contribution by Carlos A. Planchón.
diff --git a/docs/customize/output-format.mdx b/docs/customize/output-format.mdx
index d893a7513b..b48a888362 100644
--- a/docs/customize/output-format.mdx
+++ b/docs/customize/output-format.mdx
@@ -47,4 +47,4 @@ async def main():
 if __name__ == '__main__':
     asyncio.run(main())
-```
\ No newline at end of file
+```
diff --git a/docs/customize/real-browser.mdx b/docs/customize/real-browser.mdx
index aafb92f912..9d7811ced9 100644
--- a/docs/customize/real-browser.mdx
+++ b/docs/customize/real-browser.mdx
@@ -24,7 +24,7 @@ import asyncio
 browser = Browser(
     config=BrowserConfig(
         # Specify the path to your Chrome executable
-        chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',  # macOS path
+        browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',  # macOS path
         # For Windows, typically: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
         # For Linux, typically: '/usr/bin/google-chrome'
     )
diff --git a/docs/customize/sensitive-data.mdx b/docs/customize/sensitive-data.mdx
index 4130eff4d1..3bb5f6c665 100644
--- a/docs/customize/sensitive-data.mdx
+++ b/docs/customize/sensitive-data.mdx
@@ -43,8 +43,8 @@ if __name__ == '__main__':
 In this example:
 1. The model only sees `x_name` and `x_password` as placeholders.
 2. 
When the model wants to use your password it outputs x_password - and we replace it with the actual value. -3. When your password is visable on the current page, we replace it in the LLM input - so that the model never has it in its state. +3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state. Warning: Vision models still see the image of the page - where the sensitive data might be visible. -This approach ensures that sensitive information remains secure while still allowing the agent to perform tasks that require authentication. \ No newline at end of file +This approach ensures that sensitive information remains secure while still allowing the agent to perform tasks that require authentication. diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx index 1798cfbfbc..8ae3012416 100644 --- a/docs/customize/supported-models.mdx +++ b/docs/customize/supported-models.mdx @@ -113,15 +113,13 @@ AZURE_OPENAI_KEY= ```python from langchain_google_genai import ChatGoogleGenerativeAI from browser_use import Agent -from pydantic import SecretStr -import os from dotenv import load_dotenv -load_dotenv() -api_key = os.getenv("GEMINI_API_KEY") +# Read GOOGLE_API_KEY into env +load_dotenv() # Initialize the model -llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(os.getenv('GEMINI_API_KEY'))) +llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp') # Create agent with the model agent = Agent( @@ -133,19 +131,22 @@ agent = Agent( Required environment variables: ```bash .env -GEMINI_API_KEY= +GOOGLE_API_KEY= ``` ### DeepSeek-V3 -The community likes DeepSeek-V3 for its low price, no rate limits, open-source nature, and good performance. +The community likes DeepSeek-V3 for its low price, no rate limits, open-source nature, and good performance. The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek.py). ```python from langchain_openai import ChatOpenAI from browser_use import Agent from pydantic import SecretStr +from dotenv import load_dotenv +load_dotenv() +api_key = os.getenv("DEEPSEEK_API_KEY") # Initialize the model llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-chat', api_key=SecretStr(api_key)) @@ -172,7 +173,10 @@ It does not support vision. The model is open-source so you could also use it wi from langchain_openai import ChatOpenAI from browser_use import Agent from pydantic import SecretStr +from dotenv import load_dotenv +load_dotenv() +api_key = os.getenv("DEEPSEEK_API_KEY") # Initialize the model llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-reasoner', api_key=SecretStr(api_key)) @@ -192,7 +196,7 @@ DEEPSEEK_API_KEY= ``` ### Ollama -Many users asked for local models. Here they are. +Many users asked for local models. Here they are. 1. Download Ollama from [here](https://ollama.ai/download) 2. Run `ollama pull model_name`. Pick a model which supports tool-calling from [here](https://ollama.com/search?c=tools) diff --git a/docs/development.mdx b/docs/development.mdx index c2f2ccdae1..bd0ed7d10c 100644 --- a/docs/development.mdx +++ b/docs/development.mdx @@ -98,9 +98,31 @@ We suggest using extensions on your IDE to recognize and format MDX. If you're a - + Solution: Go to the root of your device and delete the \~/.mintlify folder. Afterwards, run `mintlify dev` again. Curious about what changed in the CLI version? 
[Check out the CLI changelog.](https://www.npmjs.com/package/mintlify?activeTab=versions) + +# Development Workflow + +## Branches +- **`stable`**: Mirrors the latest stable release. This branch is updated only when a new stable release is published (every few weeks). +- **`main`**: The primary development branch. This branch is updated frequently (every hour or more). + +## Tags +- **`x.x.x`**: Stable release tags. These are created for stable releases and updated every few weeks. +- **`x.x.xrcXX`**: Pre-release tags. These are created for unstable pre-releases and updated every Friday at 5 PM UTC. + +## Workflow Summary +1. **Push to `main`**: + - Runs pre-commit hooks to fix formatting. + - Executes tests to ensure code quality. + +2. **Release a new version**: + - If the tag is a pre-release (`x.x.xrcXX`), the package is pushed to PyPI as a pre-release. + - If the tag is a stable release (`x.x.x`), the package is pushed to PyPI as a stable release, and the `stable` branch is updated to match the release. + +3. **Scheduled Pre-Releases**: + - Every Friday at 5 PM UTC, a new pre-release tag (`x.x.xrcXX`) is created from the `main` branch and pushed to the repository. diff --git a/docs/development/contribution-guide.mdx b/docs/development/contribution-guide.mdx index 37b5519535..c07a288a5b 100644 --- a/docs/development/contribution-guide.mdx +++ b/docs/development/contribution-guide.mdx @@ -4,4 +4,9 @@ description: "Learn how to contribute to Browser Use" icon: "code-pull-request" --- -Working on it! + +- check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on +- get inspiration / share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel and on [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)! +- no typo/style-only nit PRs, you can submit nit fixes but only if part of larger bugfix or new feature PRs +- include a demo screenshot/gif, tests, and ideally an example script demonstrating any changes in your PR +- bump your issues/PRs with comments periodically if you want them to be merged faster diff --git a/docs/development/local-setup.mdx b/docs/development/local-setup.mdx index da2bc3f711..a0c97a2c24 100644 --- a/docs/development/local-setup.mdx +++ b/docs/development/local-setup.mdx @@ -19,25 +19,20 @@ cd browser-use ## Environment Setup -1. Create a virtual environment: +1. Create and activate a virtual environment: ```bash uv venv --python 3.11 +source .venv/bin/activate ``` 2. Install dependencies: ```bash # Install the package in editable mode with all development dependencies -uv pip install -e ".[dev]" +uv sync ``` - - The `-e` flag installs the package in "editable" mode, which means your local code changes - will be reflected immediately without requiring reinstallation. The `[dev]` part installs - additional dependencies needed for development. - - ## Configuration Set up your environment variables: @@ -47,16 +42,20 @@ Set up your environment variables: cp .env.example .env ``` -Or manually create a `.env` file with your API keys: +Or manually create a `.env` file with the API key for the models you want to use set: ```bash .env -OPENAI_API_KEY= +OPENAI_API_KEY=... ANTHROPIC_API_KEY= +AZURE_ENDPOINT= +AZURE_OPENAI_API_KEY= +GEMINI_API_KEY= +DEEPSEEK_API_KEY= ``` - You can use any LLM model supported by LangChain. 
See [LangChain - Models](/customize/supported-models) for available options and their specific + You can use any LLM model supported by LangChain. See + [LangChain Models](/customize/supported-models) for available options and their specific API key requirements. @@ -64,9 +63,10 @@ ANTHROPIC_API_KEY= After setup, you can: -- Run tests with `pytest` -- Build the package with `hatch build` -- Try the examples in the `examples/` directory +- Try demos in the example library with `uv run examples/simple.py` +- Run the linter/formatter with `uv run ruff format examples/some/file.py` +- Run tests with `uv run pytest` +- Build the package with `uv build` ## Getting Help @@ -76,7 +76,6 @@ If you run into any issues: 2. Join our [Discord community](https://link.browser-use.com/discord) for support - We welcome contributions! See our [Contribution - Guide](/development/contribution-guide) for guidelines on how to help improve + We welcome contributions! See our [Contribution Guide](/development/contribution-guide) for guidelines on how to help improve Browser Use. diff --git a/docs/development/n8n-integration.mdx b/docs/development/n8n-integration.mdx new file mode 100644 index 0000000000..2a6fd29b51 --- /dev/null +++ b/docs/development/n8n-integration.mdx @@ -0,0 +1,122 @@ +--- +title: 'n8n Integration' +description: 'Learn how to integrate Browser Use with n8n workflows' +--- + +# Browser Use n8n Integration + +Browser Use can be integrated with [n8n](https://n8n.io), a workflow automation platform, using our community node. This integration allows you to trigger browser automation tasks directly from your n8n workflows. + +## Installing the n8n Community Node + +There are several ways to install the Browser Use community node in n8n: + +### Using n8n Desktop or Cloud + +1. Navigate to **Settings > Community Nodes** +2. Click on **Install** +3. Enter `n8n-nodes-browser-use` in the **Name** field +4. Click **Install** + +### Using a Self-hosted n8n Instance + +Run the following command in your n8n installation directory: + +```bash +npm install n8n-nodes-browser-use +``` + +### For Development + +If you want to develop with the n8n node: + +1. Clone the repository: + ```bash + git clone https://github.com/draphonix/n8n-nodes-browser-use.git + ``` +2. Install dependencies: + ```bash + cd n8n-nodes-browser-use + npm install + ``` +3. Build the code: + ```bash + npm run build + ``` +4. Link to your n8n installation: + ```bash + npm link + ``` +5. In your n8n installation directory: + ```bash + npm link n8n-nodes-browser-use + ``` + +## Setting Up Browser Use Cloud API Credentials + +To use the Browser Use node in n8n, you need to configure API credentials: + +1. Sign up for an account at [Browser Use Cloud](https://cloud.browser-use.com) +2. Navigate to the Settings or API section +3. Generate or copy your API key +4. In n8n, create a new credential: + - Go to **Credentials** tab + - Click **Create New** + - Select **Browser Use Cloud API** + - Enter your API key + - Save the credential + +## Using the Browser Use Node + +Once installed, you can add the Browser Use node to your workflows: + +1. In your workflow editor, search for "Browser Use" in the nodes panel +2. Add the node to your workflow +3. Set-up the credentials +4. Choose your saved credentials +5. 
Select an operation: + - **Run Task**: Execute a browser automation task with natural language instructions + - **Get Task**: Retrieve task details + - **Get Task Status**: Check task execution status + - **Pause/Resume/Stop Task**: Control running tasks + - **Get Task Media**: Retrieve screenshots, videos, or PDFs + - **List Tasks**: Get a list of tasks + +### Example: Running a Browser Task + +Here's a simple example of how to use the Browser Use node to run a browser task: + +1. Add the Browser Use node to your workflow +2. Select the "Run Task" operation +3. In the "Instructions" field, enter a natural language description of what you want the browser to do, for example: + ``` + Go to example.com, take a screenshot of the homepage, and extract all the main heading texts + ``` +4. Optionally enable "Save Browser Data" to preserve cookies and session information +5. Connect the node to subsequent nodes to process the results + +## Workflow Examples + +The Browser Use n8n node enables various automation scenarios: + +- **Web Scraping**: Extract data from websites on a schedule +- **Form Filling**: Automate data entry across web applications +- **Monitoring**: Check website status and capture visual evidence +- **Report Generation**: Generate PDFs or screenshots of web dashboards +- **Multi-step Processes**: Chain browser tasks together using session persistence + +## Troubleshooting + +If you encounter issues with the Browser Use node: + +- Verify your API key is valid and has sufficient credits +- Check that your instructions are clear and specific +- For complex tasks, consider breaking them into multiple steps +- Refer to the [Browser Use documentation](https://docs.browser-use.com) for instruction best practices + +## Resources + +- [n8n Community Nodes Documentation](https://docs.n8n.io/integrations/community-nodes/) +- [Browser Use Documentation](https://docs.browser-use.com) +- [Browser Use Cloud](https://cloud.browser-use.com) +- [n8n-nodes-browser-use GitHub Repository](https://github.com/draphonix/n8n-nodes-browser-use) diff --git a/docs/development/observability.mdx b/docs/development/observability.mdx index 955e0388eb..874a80b65f 100644 --- a/docs/development/observability.mdx +++ b/docs/development/observability.mdx @@ -63,4 +63,4 @@ In the trace view, you can also see the agent's current step, the tool it's usin ## Laminar -To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai). \ No newline at end of file +To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai). 
diff --git a/docs/mint.json b/docs/mint.json index 320045f3d1..08ecd5d1bb 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -60,7 +60,8 @@ "customize/output-format", "customize/system-prompt", "customize/sensitive-data", - "customize/custom-functions" + "customize/custom-functions", + "customize/hooks" ] }, { diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index 451bbcf63c..aea07056a4 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -35,7 +35,7 @@ uv pip install browser-use Then install playwright: ```bash -playwright install +uv run playwright install ``` ## Create an agent diff --git a/eval/claude.py b/eval/claude.py index b34c668c49..8e9c7b135c 100644 --- a/eval/claude.py +++ b/eval/claude.py @@ -1,18 +1,19 @@ from dotenv import load_dotenv from langchain_anthropic import ChatAnthropic -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatAnthropic( model_name='claude-3-5-sonnet-20240620', temperature=0.0, timeout=100, stop=None, ) - agent = Agent(task=task, llm=llm) + agent = Agent(task=task, llm=llm, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/deepseek-r1.py b/eval/deepseek-r1.py index 03da9edfc6..a13397ba9f 100644 --- a/eval/deepseek-r1.py +++ b/eval/deepseek-r1.py @@ -4,7 +4,7 @@ from langchain_openai import ChatOpenAI from pydantic import SecretStr -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() @@ -13,12 +13,13 @@ raise ValueError('DEEPSEEK_API_KEY is not set') -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatOpenAI( base_url='https://api.deepseek.com/v1', model='deepseek-reasoner', api_key=SecretStr(api_key_deepseek), ) - agent = Agent(task=task, llm=llm, use_vision=False) + agent = Agent(task=task, llm=llm, use_vision=False, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/deepseek.py b/eval/deepseek.py index 1ec8289bc2..c1a0b18bd5 100644 --- a/eval/deepseek.py +++ b/eval/deepseek.py @@ -4,7 +4,7 @@ from langchain_openai import ChatOpenAI from pydantic import SecretStr -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() @@ -13,12 +13,13 @@ raise ValueError('DEEPSEEK_API_KEY is not set') -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatOpenAI( base_url='https://api.deepseek.com/v1', model='deepseek-chat', api_key=SecretStr(api_key_deepseek), ) - agent = Agent(task=task, llm=llm, use_vision=False) + agent = Agent(task=task, llm=llm, use_vision=False, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/gemini-1.5-flash.py b/eval/gemini-1.5-flash.py index 051f85d3c7..4a23414e49 100644 --- a/eval/gemini-1.5-flash.py +++ b/eval/gemini-1.5-flash.py @@ -4,7 +4,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI from pydantic import SecretStr -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() @@ -13,8 +13,9 @@ raise ValueError('GEMINI_API_KEY is not set') -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, 
browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash-latest', api_key=SecretStr(api_key)) - agent = Agent(task=task, llm=llm) + agent = Agent(task=task, llm=llm, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/gemini-2.0-flash.py b/eval/gemini-2.0-flash.py index 803895c773..5e4fe99499 100644 --- a/eval/gemini-2.0-flash.py +++ b/eval/gemini-2.0-flash.py @@ -4,7 +4,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI from pydantic import SecretStr -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() @@ -13,8 +13,9 @@ raise ValueError('GEMINI_API_KEY is not set') -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key)) - agent = Agent(task=task, llm=llm) + agent = Agent(task=task, llm=llm, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/gpt-4o-no-boundingbox.py b/eval/gpt-4o-no-boundingbox.py index 6343824147..03b571ac8d 100644 --- a/eval/gpt-4o-no-boundingbox.py +++ b/eval/gpt-4o-no-boundingbox.py @@ -3,21 +3,14 @@ from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent, BrowserConfig -from browser_use.browser.browser import Browser -from browser_use.browser.context import BrowserContextConfig +from browser_use import Agent, Browser load_dotenv() -async def run_agent(task: str, max_steps: int = 38): - browser = Browser( - config=BrowserConfig( - new_context_config=BrowserContextConfig( - highlight_elements=False, - ), - ), - ) +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() + browser.config.new_context_config.highlight_elements = False llm = ChatOpenAI( model='gpt-4o', temperature=0.0, diff --git a/eval/gpt-4o-no-vision.py b/eval/gpt-4o-no-vision.py index 47a5c21fe0..87be9f8f98 100644 --- a/eval/gpt-4o-no-vision.py +++ b/eval/gpt-4o-no-vision.py @@ -1,16 +1,17 @@ from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatOpenAI( model='gpt-4o', temperature=0.0, ) - agent = Agent(task=task, llm=llm, use_vision=False) + agent = Agent(task=task, llm=llm, use_vision=False, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/gpt-4o-viewport-0.py b/eval/gpt-4o-viewport-0.py index eb3ada91b2..3ebd01b7b0 100644 --- a/eval/gpt-4o-viewport-0.py +++ b/eval/gpt-4o-viewport-0.py @@ -3,25 +3,18 @@ from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent, BrowserConfig -from browser_use.browser.browser import Browser -from browser_use.browser.context import BrowserContextConfig +from browser_use import Agent, Browser load_dotenv() -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatOpenAI( model='gpt-4o', temperature=0.0, ) - browser = Browser( - config=BrowserConfig( - 
new_context_config=BrowserContextConfig( - viewport_expansion=0, - ), - ), - ) + browser.config.new_context_config.viewport_expansion = 0 agent = Agent(task=task, llm=llm, browser=browser) result = await agent.run(max_steps=max_steps) return result diff --git a/eval/gpt-4o.py b/eval/gpt-4o.py index 3cdcbd64d4..71d5b90f8a 100644 --- a/eval/gpt-4o.py +++ b/eval/gpt-4o.py @@ -1,24 +1,17 @@ -import asyncio - from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent +from browser_use import Agent, Browser load_dotenv() -async def run_agent(task: str, max_steps: int = 38): +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() llm = ChatOpenAI( model='gpt-4o', temperature=0.0, ) - agent = Agent(task=task, llm=llm) + agent = Agent(task=task, llm=llm, browser=browser) result = await agent.run(max_steps=max_steps) return result - - -if __name__ == '__main__': - task = 'Go to https://www.google.com and search for "python" and click on the first result' - result = asyncio.run(run_agent(task)) - print(result) diff --git a/eval/grok.py b/eval/grok.py new file mode 100644 index 0000000000..905a776420 --- /dev/null +++ b/eval/grok.py @@ -0,0 +1,25 @@ +import os + +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from pydantic import SecretStr + +from browser_use import Agent, Browser + +load_dotenv() + +api_key = os.getenv('GROK_API_KEY', '') +if not api_key: + raise ValueError('GROK_API_KEY is not set') + + +async def run_agent(task: str, browser: Browser | None = None, max_steps: int = 38): + browser = browser or Browser() + agent = Agent( + task=task, + use_vision=False, + llm=ChatOpenAI(model='grok-2-1212', base_url='https://api.x.ai/v1', api_key=SecretStr(api_key)), + browser=browser, + ) + + await agent.run() diff --git a/examples/browser/real_browser.py b/examples/browser/real_browser.py index 1bd255ae88..0291ceefb7 100644 --- a/examples/browser/real_browser.py +++ b/examples/browser/real_browser.py @@ -1,22 +1,20 @@ import os import sys -from pathlib import Path - -from browser_use.agent.views import ActionResult sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import asyncio +import dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent, Controller -from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.browser.context import BrowserContext +from browser_use import Agent, Browser, BrowserConfig + +dotenv.load_dotenv() browser = Browser( config=BrowserConfig( # NOTE: you need to close your chrome browser - so that this can open your browser in debug mode - chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', ) ) diff --git a/examples/browser/stealth.py b/examples/browser/stealth.py new file mode 100644 index 0000000000..aed94c53ee --- /dev/null +++ b/examples/browser/stealth.py @@ -0,0 +1,79 @@ +import asyncio +import os +import sys + +from langchain_openai import ChatOpenAI + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from browser_use import Agent, Browser, BrowserConfig, BrowserContextConfig + +llm = ChatOpenAI(model='gpt-4o') +browser = Browser( + config=BrowserConfig( + headless=False, + disable_security=False, + keep_alive=True, + new_context_config=BrowserContextConfig( + keep_alive=True, + disable_security=False, + ), + ) +) + + 
+async def main(): + agent = Agent( + task=""" + Go to https://bot-detector.rebrowser.net/ and verify that all the bot checks are passed. + """, + llm=llm, + browser=browser, + ) + await agent.run() + input('Press Enter to continue to the next test...') + + agent = Agent( + task=""" + Go to https://www.webflow.com/ and verify that the page is not blocked by a bot check. + """, + llm=llm, + browser=browser, + ) + await agent.run() + input('Press Enter to continue to the next test...') + + agent = Agent( + task=""" + Go to https://www.okta.com/ and verify that the page is not blocked by a bot check. + """, + llm=llm, + browser=browser, + ) + await agent.run() + + agent = Agent( + task=""" + Go to https://abrahamjuliot.github.io/creepjs/ and verify that the detection score is >50%. + """, + llm=llm, + browser=browser, + ) + await agent.run() + + input('Press Enter to close the browser...') + + agent = Agent( + task=""" + Go to https://nowsecure.nl/ and check the "I'm not a robot" checkbox. + """, + llm=llm, + browser=browser, + ) + await agent.run() + + input('Press Enter to close the browser...') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/custom-functions/action_filters.py b/examples/custom-functions/action_filters.py new file mode 100644 index 0000000000..ef7c602801 --- /dev/null +++ b/examples/custom-functions/action_filters.py @@ -0,0 +1,86 @@ +""" +Action filters (domains and page_filter) let you limit actions available to the Agent on a step-by-step/page-by-page basis. + +@registry.action(..., domains=['*'], page_filter=lambda page: True) +async def some_action(browser: BrowserContext): + ... + +This helps prevent the LLM from deciding to use an action that is not compatible with the current page. +It helps limit decision fatigue by scoping actions only to pages where they make sense. +It also helps prevent mis-triggering stateful actions or actions that could break other programs or leak secrets. + +For example: + - only run on certain domains @registry.action(..., domains=['example.com', '*.example.com', 'example.co.*']) (supports globs, but no regex) + - only fill in a password on a specific login page url + - only run if this action has not run before on this page (e.g. by looking up the url in a file on disk) + +During each step, the agent recalculates the actions available specifically for that page, and informs the LLM.
+""" + +import asyncio + +from langchain_openai import ChatOpenAI +from playwright.async_api import Page + +from browser_use.agent.service import Agent, Browser, BrowserContext, Controller + +# Initialize controller and registry +controller = Controller() +registry = controller.registry + + +# Action will only be available to Agent on Google domains because of the domain filter +@registry.action(description='Trigger disco mode', domains=['google.com', '*.google.com']) +async def disco_mode(browser: BrowserContext): + page = await browser.get_current_page() + await page.evaluate("""() => { + // define the wiggle animation + document.styleSheets[0].insertRule('@keyframes wiggle { 0% { transform: rotate(0deg); } 50% { transform: rotate(10deg); } 100% { transform: rotate(0deg); } }'); + + document.querySelectorAll("*").forEach(element => { + element.style.animation = "wiggle 0.5s infinite"; + }); + }""") + + +# you can create a custom page filter function that determines if the action should be available for a given page +def is_login_page(page: Page) -> bool: + return 'login' in page.url.lower() or 'signin' in page.url.lower() + + +# then use it in the action decorator to limit the action to only be available on pages where the filter returns True +@registry.action(description='Use the force, luke', page_filter=is_login_page) +async def use_the_force(browser: BrowserContext): + # this will only ever run on pages that matched the filter + page = await browser.get_current_page() + assert is_login_page(page) + + await page.evaluate("""() => { document.querySelector('body').innerHTML = 'These are not the droids you are looking for';}""") + + +async def main(): + """Main function to run the example""" + browser = Browser() + llm = ChatOpenAI(model_name='gpt-4o') + + # Create the agent + agent = Agent( # disco mode will not be triggered on apple.com because the LLM won't be able to see that action available, it should work on Google.com though. + task=""" + Go to apple.com and trigger disco mode (if dont know how to do that, then just move on). + Then go to google.com and trigger disco mode. + After that, go to the Google login page and Use the force, luke. + """, + llm=llm, + browser=browser, + controller=controller, + ) + + # Run the agent + await agent.run(max_steps=10) + + # Cleanup + await browser.close() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/custom-functions/clipboard.py b/examples/custom-functions/clipboard.py index e06d3e5de1..8d8e8725ac 100644 --- a/examples/custom-functions/clipboard.py +++ b/examples/custom-functions/clipboard.py @@ -1,6 +1,5 @@ import os import sys -from pathlib import Path from browser_use.agent.views import ActionResult @@ -39,7 +38,7 @@ async def paste_from_clipboard(browser: BrowserContext): async def main(): - task = f'Copy the text "Hello, world!" to the clipboard, then go to google.com and paste the text' + task = 'Copy the text "Hello, world!" 
to the clipboard, then go to google.com and paste the text' model = ChatOpenAI(model='gpt-4o') agent = Agent( task=task, diff --git a/examples/custom-functions/custom_hooks_before_after_step.py b/examples/custom-functions/custom_hooks_before_after_step.py new file mode 100644 index 0000000000..634d11f157 --- /dev/null +++ b/examples/custom-functions/custom_hooks_before_after_step.py @@ -0,0 +1,232 @@ +""" +Description: These Python modules are designed to capture detailed +browser usage datafor analysis, with both server and client +components working together to record and store the information. + +Author: Carlos A. Planchón +https://github.com/carlosplanchon/ + +Adapt this code to your needs. + +Feedback is appreciated! +""" + +##################### +# # +# --- UTILS --- # +# # +##################### + +import base64 + + +def b64_to_png(b64_string: str, output_file): + """ + Convert a Base64-encoded string to a PNG file. + + :param b64_string: A string containing Base64-encoded data + :param output_file: The path to the output PNG file + """ + with open(output_file, 'wb') as f: + f.write(base64.b64decode(b64_string)) + + +################################################################### +# # +# --- FASTAPI API TO RECORD AND SAVE Browser-Use ACTIVITY --- # +# # +################################################################### + +# Save to api.py and run with `python api.py` + +# ! pip install uvicorn +# ! pip install fastapi +# ! pip install prettyprinter + +import json +from pathlib import Path + +import prettyprinter +from fastapi import FastAPI, Request + +prettyprinter.install_extras() + +app = FastAPI() + + +@app.post('/post_agent_history_step') +async def post_agent_history_step(request: Request): + data = await request.json() + prettyprinter.cpprint(data) + + # Ensure the "recordings" folder exists using pathlib + recordings_folder = Path('recordings') + recordings_folder.mkdir(exist_ok=True) + + # Determine the next file number by examining existing .json files + existing_numbers = [] + for item in recordings_folder.iterdir(): + if item.is_file() and item.suffix == '.json': + try: + file_num = int(item.stem) + existing_numbers.append(file_num) + except ValueError: + # In case the file name isn't just a number + ... + + if existing_numbers: + next_number = max(existing_numbers) + 1 + else: + next_number = 1 + + # Construct the file path + file_path = recordings_folder / f'{next_number}.json' + + # Save the JSON data to the file + with file_path.open('w') as f: + json.dump(data, f, indent=2) + + return {'status': 'ok', 'message': f'Saved to {file_path}'} + + +if __name__ == '__main__': + import uvicorn + + uvicorn.run(app, host='0.0.0.0', port=9000) + + +############################################################## +# # +# --- CLIENT TO RECORD AND SAVE Browser-Use ACTIVITY --- # +# # +############################################################## + +""" +pyobjtojson: + +A Python library to safely and recursively serialize any Python object +(including Pydantic models and dataclasses) into JSON-ready structures, +gracefully handling circular references. +""" + +# ! pip install -U pyobjtojson +# ! 
pip install -U prettyprinter + +import asyncio + +import requests +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from pyobjtojson import obj_to_json + +from browser_use import Agent + +# import prettyprinter + +# prettyprinter.install_extras() + +load_dotenv() + + +def send_agent_history_step(data): + url = 'http://127.0.0.1:9000/post_agent_history_step' + response = requests.post(url, json=data) + return response.json() + + +async def record_activity(agent_obj): + website_html = None + website_screenshot = None + urls_json_last_elem = None + model_thoughts_last_elem = None + model_outputs_json_last_elem = None + model_actions_json_last_elem = None + extracted_content_json_last_elem = None + + print('--- ON_STEP_START HOOK ---') + website_html: str = await agent_obj.browser_context.get_page_html() + website_screenshot: str = await agent_obj.browser_context.take_screenshot() + + print('--> History:') + if hasattr(agent_obj, 'state'): + history = agent_obj.state.history + else: + history = None + + model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False) + + # print("--- MODEL THOUGHTS ---") + if len(model_thoughts) > 0: + model_thoughts_last_elem = model_thoughts[-1] + # prettyprinter.cpprint(model_thoughts_last_elem) + + # print("--- MODEL OUTPUT ACTION ---") + model_outputs = agent_obj.state.history.model_outputs() + model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False) + + if len(model_outputs_json) > 0: + model_outputs_json_last_elem = model_outputs_json[-1] + # prettyprinter.cpprint(model_outputs_json_last_elem) + + # print("--- MODEL INTERACTED ELEM ---") + model_actions = agent_obj.state.history.model_actions() + model_actions_json = obj_to_json(obj=model_actions, check_circular=False) + + if len(model_actions_json) > 0: + model_actions_json_last_elem = model_actions_json[-1] + # prettyprinter.cpprint(model_actions_json_last_elem) + + # print("--- EXTRACTED CONTENT ---") + extracted_content = agent_obj.state.history.extracted_content() + extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False) + if len(extracted_content_json) > 0: + extracted_content_json_last_elem = extracted_content_json[-1] + # prettyprinter.cpprint(extracted_content_json_last_elem) + + # print("--- URLS ---") + urls = agent_obj.state.history.urls() + # prettyprinter.cpprint(urls) + urls_json = obj_to_json(obj=urls, check_circular=False) + + if len(urls_json) > 0: + urls_json_last_elem = urls_json[-1] + # prettyprinter.cpprint(urls_json_last_elem) + + model_step_summary = { + 'website_html': website_html, + 'website_screenshot': website_screenshot, + 'url': urls_json_last_elem, + 'model_thoughts': model_thoughts_last_elem, + 'model_outputs': model_outputs_json_last_elem, + 'model_actions': model_actions_json_last_elem, + 'extracted_content': extracted_content_json_last_elem, + } + + print('--- MODEL STEP SUMMARY ---') + # prettyprinter.cpprint(model_step_summary) + + send_agent_history_step(data=model_step_summary) + + # response = send_agent_history_step(data=history) + # print(response) + + # print("--> Website HTML:") + # print(website_html[:200]) + # print("--> Website Screenshot:") + # print(website_screenshot[:200]) + + +agent = Agent( + task='Compare the price of gpt-4o and DeepSeek-V3', + llm=ChatOpenAI(model='gpt-4o'), +) + + +async def run_agent(): + try: + await agent.run(on_step_start=record_activity, max_steps=30) + except Exception as e: + print(e) + + +asyncio.run(run_agent()) diff --git 
a/examples/custom-functions/file_upload.py b/examples/custom-functions/file_upload.py index f1efdf6c70..3948e312c1 100644 --- a/examples/custom-functions/file_upload.py +++ b/examples/custom-functions/file_upload.py @@ -20,7 +20,7 @@ browser = Browser( config=BrowserConfig( headless=False, - chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', ) ) controller = Controller() @@ -84,7 +84,7 @@ def create_file(file_type: str = 'txt'): async def main(): - task = f'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields' + task = 'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields' available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')] diff --git a/examples/custom-functions/group_ungroup.py b/examples/custom-functions/group_ungroup.py new file mode 100644 index 0000000000..2349cb1bd8 --- /dev/null +++ b/examples/custom-functions/group_ungroup.py @@ -0,0 +1,108 @@ +import os +import sys + +from browser_use.agent.views import ActionResult +from browser_use.browser.views import GroupTabsAction, UngroupTabsAction + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import asyncio + +from langchain_openai import ChatOpenAI + +from browser_use import Agent, Controller +from browser_use.browser.browser import Browser, BrowserConfig +from browser_use.browser.context import BrowserContext + +# async def group_tabs(self, tab_ids: list[int] , title: str, color: str = "blue"): +# """Reset the browser session +# Call this when you don't want to kill the context but just kill the state +# """ +# # close all tabs and clear cached state +# page = await self.get_current_page() + +# js = f""" +# chrome.tabs.group({{ tabIds: {tab_ids} }}, (groupId) => {{ +# chrome.tabGroups.update(groupId, {{ +# title: "{title}", +# color: "{color}" +# }}); +# }}); +# """ + +# await page.evaluate(js) + +# async def ungroup_tabs(self, tab_ids: list[int]): +# """Reset the browser session +# Call this when you don't want to kill the context but just kill the state +# """ +# # close all tabs and clear cached state +# page = await self.get_current_page() + +# js = f""" +# for (const tabId of {tab_ids}) {{ +# chrome.tabs.ungroup(tabId); +# }} +# """ + +# await page.evaluate(js) + + +# Initialize controller first +browser = Browser( + config=BrowserConfig( + headless=False, + chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + ) +) +controller = Controller() + + +@controller.action('Visually group browser tabs in Chrome', param_model=GroupTabsAction, requires_browser=True) +async def group_tabs(params: GroupTabsAction, browser: BrowserContext): + try: + # Get tab IDs from params + tab_ids = params.tab_ids + title = params.title + color = params.color + + # Call the low-level implementation in BrowserContext + result = await browser.group_tabs(tab_ids, title, color='red') + return ActionResult(extracted_content=result, include_in_memory=True) + except Exception as e: + return ActionResult(error=f'Failed to group tabs: {str(e)}') + + +# Register ungroup_tabs action +@controller.action('Remove visual grouping from tabs in Chrome', param_model=UngroupTabsAction, requires_browser=True) +async def ungroup_tabs(params: UngroupTabsAction, browser: BrowserContext): + try: + # Get tab IDs from params + tab_ids = params.tab_ids 
+ + # Call the low-level implementation in BrowserContext + result = await browser.ungroup_tabs(tab_ids) + return ActionResult(extracted_content=result, include_in_memory=True) + except Exception as e: + return ActionResult(error=f'Failed to ungroup tabs: {str(e)}') + + +async def main(): + task = 'Group tabs 1 and 2 into a "Research" group, then ungroup them.' + + model = ChatOpenAI(model='gpt-4o') + agent = Agent( + task=task, + llm=model, + controller=controller, + browser=browser, + ) + + await agent.run() + + await browser.close() + + input('Press Enter to close...') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/custom-functions/hover_element.py b/examples/custom-functions/hover_element.py new file mode 100644 index 0000000000..42bd4ed6d3 --- /dev/null +++ b/examples/custom-functions/hover_element.py @@ -0,0 +1,96 @@ +import os +import sys +from typing import Optional + +from pydantic import BaseModel + +from browser_use.agent.views import ActionResult + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import asyncio + +from langchain_openai import ChatOpenAI + +from browser_use import Agent, Controller +from browser_use.browser.browser import Browser, BrowserConfig +from browser_use.browser.context import BrowserContext + + +class HoverAction(BaseModel): + index: Optional[int] = None + xpath: Optional[str] = None + selector: Optional[str] = None + + +browser = Browser( + config=BrowserConfig( + headless=False, + ) +) +controller = Controller() + + +@controller.registry.action( + 'Hover over an element', + param_model=HoverAction, # Define this model with at least "index: int" field +) +async def hover_element(params: HoverAction, browser: BrowserContext): + """ + Hovers over the element specified by its index from the cached selector map or by XPath. 
+ """ + session = await browser.get_session() + state = session.cached_state + + if params.xpath: + # Use XPath to locate the element + element_handle = await browser.get_locate_element_by_xpath(params.xpath) + if element_handle is None: + raise Exception(f'Failed to locate element with XPath {params.xpath}') + elif params.selector: + # Use CSS selector to locate the element + element_handle = await browser.get_locate_element_by_css_selector(params.selector) + if element_handle is None: + raise Exception(f'Failed to locate element with CSS Selector {params.selector}') + elif params.index is not None: + # Use index to locate the element + if state is None or params.index not in state.selector_map: + raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions') + element_node = state.selector_map[params.index] + element_handle = await browser.get_locate_element(element_node) + if element_handle is None: + raise Exception(f'Failed to locate element with index {params.index}') + else: + raise Exception('Either index or xpath must be provided') + + try: + await element_handle.hover() + msg = ( + f'🖱️ Hovered over element at index {params.index}' + if params.index is not None + else f'🖱️ Hovered over element with XPath {params.xpath}' + ) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + err_msg = f'❌ Failed to hover over element: {str(e)}' + raise Exception(err_msg) + + +async def main(): + task = 'Open https://testpages.eviltester.com/styled/csspseudo/css-hover.html and hover the element with the css selector #hoverdivpara, then click on "Can you click me?"' + # task = 'Open https://testpages.eviltester.com/styled/csspseudo/css-hover.html and hover the element with the xpath //*[@id="hoverdivpara"], then click on "Can you click me?"' + model = ChatOpenAI(model='gpt-4o') + agent = Agent( + task=task, + llm=model, + controller=controller, + browser=browser, + ) + + await agent.run() + await browser.close() + + input('Press Enter to close...') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/custom-functions/notification.py b/examples/custom-functions/notification.py index 56a8bfedef..971697cc2b 100644 --- a/examples/custom-functions/notification.py +++ b/examples/custom-functions/notification.py @@ -7,7 +7,6 @@ from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from pydantic import BaseModel from browser_use import ActionResult, Agent, Controller @@ -22,7 +21,7 @@ async def done(text: str): # To send emails use # STEP 1: go to https://support.google.com/accounts/answer/185833 - # STEP 2: Create an app password (you cant use here your normal gmail password) + # STEP 2: Create an app password (you can't use here your normal gmail password) # STEP 3: Use the app password in the code below for the password yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password') yag.send( diff --git a/examples/custom-functions/onepassword_2fa.py b/examples/custom-functions/onepassword_2fa.py new file mode 100644 index 0000000000..b4a32ac3f7 --- /dev/null +++ b/examples/custom-functions/onepassword_2fa.py @@ -0,0 +1,56 @@ +import asyncio +import logging +import os +import sys + +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from onepassword.client import Client # pip install onepassword-sdk + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from browser_use import ActionResult, Agent, Controller + +# Set up logging 
+logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +load_dotenv() + +OP_SERVICE_ACCOUNT_TOKEN = os.getenv('OP_SERVICE_ACCOUNT_TOKEN') +OP_ITEM_ID = os.getenv('OP_ITEM_ID') # Go to 1Password, right click on the item, click "Copy Secret Reference" + + +controller = Controller() + + +@controller.registry.action('Get 2FA code from 1Password for Google Account', domains=['*.google.com', 'google.com']) +async def get_1password_2fa() -> ActionResult: + """ + Custom action to retrieve 2FA/MFA code from 1Password using onepassword.client SDK. + """ + client = await Client.authenticate( + # setup instructions: https://github.com/1Password/onepassword-sdk-python/#-get-started + auth=OP_SERVICE_ACCOUNT_TOKEN, + integration_name='Browser-Use', + integration_version='v1.0.0', + ) + + mfa_code = await client.secrets.resolve(f'op://Private/{OP_ITEM_ID}/One-time passcode') + + return ActionResult(extracted_content=mfa_code) + + +async def main(): + # Example task using the 1Password 2FA action + task = 'Go to account.google.com, enter username and password, then if prompted for 2FA code, get 2FA code from 1Password for and enter it' + + model = ChatOpenAI(model='gpt-4o') + agent = Agent(task=task, llm=model, controller=controller) + + result = await agent.run() + print(f'Task completed with result: {result}') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/custom-functions/save_to_file_hugging_face.py b/examples/custom-functions/save_to_file_hugging_face.py index 9c332a408d..96434b05c8 100644 --- a/examples/custom-functions/save_to_file_hugging_face.py +++ b/examples/custom-functions/save_to_file_hugging_face.py @@ -4,7 +4,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import asyncio -from typing import List, Optional +from typing import List from langchain_openai import ChatOpenAI from pydantic import BaseModel @@ -36,7 +36,7 @@ def save_models(params: Models): # video: https://preview.screen.studio/share/EtOhIk0P async def main(): - task = f'Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.' + task = 'Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.' model = ChatOpenAI(model='gpt-4o') agent = Agent(task=task, llm=model, controller=controller) diff --git a/examples/features/click_fallback_options.py b/examples/features/click_fallback_options.py new file mode 100644 index 0000000000..8d0f52b889 --- /dev/null +++ b/examples/features/click_fallback_options.py @@ -0,0 +1,211 @@ +import asyncio +import os +import sys + +from aiohttp import web # make sure to install aiohttp: pip install aiohttp +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI + +# from langchain_google_genai import ChatGoogleGenerativeAI + + +# Adjust path if necessary +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from browser_use import Agent, Controller + +# Define a simple HTML page +HTML_CONTENT = """ + + + + + + Custom Select Div + + + +
+	<!-- Test page body (markup not shown): four custom dropdowns built from <div>
+	     elements, each with a "Select a fruit" display (div.select-display) that
+	     opens the options Apples, Oranges and Pineapples, plus a native <select>
+	     of car makes (including BMW) and a plain <button>. The example tasks in
+	     main() below click, select and press these elements. -->
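+	<!-- Illustrative sketch of one such custom dropdown, with assumed class names
+	     (select-container, select-options, select-option) inferred from the
+	     div.select-display selector and the /html/body/div/div[1] xpath used in
+	     the tasks below: -->
+	<!--
+	<div class="select-container">
+		<div class="select-display">Select a fruit</div>
+		<div class="select-options">
+			<div class="select-option">Apples</div>
+			<div class="select-option">Oranges</div>
+			<div class="select-option">Pineapples</div>
+		</div>
+	</div>
+	-->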
+ + + + + + + + + + +""" + + +# aiohttp request handler to serve the HTML content +async def handle_root(request): + return web.Response(text=HTML_CONTENT, content_type='text/html') + + +# Function to run the HTTP server +async def run_http_server(): + app = web.Application() + app.router.add_get('/', handle_root) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8000) + await site.start() + print('HTTP server running on http://localhost:8000') + # Keep the server running indefinitely. + while True: + await asyncio.sleep(3600) + + +# Your agent tasks and other logic +load_dotenv() +controller = Controller() + + +async def main(): + # Start the HTTP server in the background. + server_task = asyncio.create_task(run_http_server()) + + # Example tasks for the agent. + xpath_task = 'Open http://localhost:8000/, click element with the xpath "/html/body/div/div[1]" and then click on Oranges' + css_selector_task = 'Open http://localhost:8000/, click element with the selector div.select-display and then click on apples' + text_task = 'Open http://localhost:8000/, click the third element with the text "Select a fruit" and then click on Apples, then click the second element with the text "Select a fruit" and then click on Oranges' + select_task = 'Open http://localhost:8000/, choose the car BMW' + button_task = 'Open http://localhost:8000/, click on the button' + + llm = ChatOpenAI(model='gpt-4o') + # llm = ChatGoogleGenerativeAI( + # model="gemini-2.0-flash-lite", + # ) + + # Run different agent tasks. + for task in [xpath_task, css_selector_task, text_task, select_task, button_task]: + agent = Agent( + task=task, + llm=llm, + controller=controller, + ) + await agent.run() + + # Wait for user input before shutting down. + input('Press Enter to close...') + # Cancel the server task once finished. + server_task.cancel() + try: + await server_task + except asyncio.CancelledError: + print('HTTP server stopped.') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/features/cross_origin_iframes.py b/examples/features/cross_origin_iframes.py new file mode 100644 index 0000000000..04e5551021 --- /dev/null +++ b/examples/features/cross_origin_iframes.py @@ -0,0 +1,51 @@ +""" +Example of how it supports cross-origin iframes. + +@dev You need to add OPENAI_API_KEY to your environment variables. +""" + +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import asyncio + +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI + +from browser_use import Agent, Controller +from browser_use.browser.browser import Browser, BrowserConfig + +# Load environment variables +load_dotenv() +if not os.getenv('OPENAI_API_KEY'): + raise ValueError('OPENAI_API_KEY is not set. 
Please add it to your environment variables.') + + +browser = Browser( + config=BrowserConfig( + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + ) +) +controller = Controller() + + +async def main(): + agent = Agent( + task='Click "Go cross-site (simple page)" button on https://csreis.github.io/tests/cross-site-iframe.html then tell me the text within', + llm=ChatOpenAI(model='gpt-4o', temperature=0.0), + controller=controller, + browser=browser, + ) + + await agent.run() + await browser.close() + + input('Press Enter to close...') + + +if __name__ == '__main__': + try: + asyncio.run(main()) + except Exception as e: + print(e) diff --git a/examples/features/custom_output.py b/examples/features/custom_output.py index cf76d9dcc8..a406d5a637 100644 --- a/examples/features/custom_output.py +++ b/examples/features/custom_output.py @@ -16,7 +16,7 @@ from langchain_openai import ChatOpenAI from pydantic import BaseModel -from browser_use import ActionResult, Agent, Controller +from browser_use import Agent, Controller load_dotenv() diff --git a/examples/features/custom_user_agent.py b/examples/features/custom_user_agent.py index f832d92ade..fa5b1bb703 100644 --- a/examples/features/custom_user_agent.py +++ b/examples/features/custom_user_agent.py @@ -49,7 +49,7 @@ def get_llm(provider: str): browser = Browser( config=BrowserConfig( - # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', ) ) diff --git a/examples/features/drag_drop.py b/examples/features/drag_drop.py new file mode 100644 index 0000000000..7976649433 --- /dev/null +++ b/examples/features/drag_drop.py @@ -0,0 +1,46 @@ +import asyncio +import os + +from dotenv import load_dotenv +from langchain_google_genai import ChatGoogleGenerativeAI +from pydantic import SecretStr + +from browser_use import Agent + +load_dotenv() +api_key = os.getenv('GEMINI_API_KEY') +if not api_key: + raise ValueError('GEMINI_API_KEY is not set') + +llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key)) + + +task_1 = """ +Navigate to: https://sortablejs.github.io/Sortable/. +Then scroll down to the first example with title "Simple list example". +Drag the element with name "item 1" to below the element with name "item 3". +""" + + +task_2 = """ +Navigate to: https://excalidraw.com/. +Click on the pencil icon (with index 40). +Then draw a triangle in the canvas. +Draw the triangle starting from coordinate (400,400). +You can use the drag and drop action to draw the triangle.
+""" + + +async def run_search(): + agent = Agent( + task=task_1, + llm=llm, + max_actions_per_step=1, + use_vision=True, + ) + + await agent.run(max_steps=25) + + +if __name__ == '__main__': + asyncio.run(run_search()) diff --git a/examples/features/follow_up_tasks.py b/examples/features/follow_up_tasks.py index aa326691a1..1dcf9c92c3 100644 --- a/examples/features/follow_up_tasks.py +++ b/examples/features/follow_up_tasks.py @@ -3,9 +3,7 @@ from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use import Agent -from browser_use.agent.views import ActionResult -from browser_use.controller.service import Controller +from browser_use import Agent, Browser, BrowserConfig, BrowserContextConfig, Controller load_dotenv() @@ -14,12 +12,22 @@ model='gpt-4o', temperature=0.0, ) +# Get your chrome path +browser = Browser( + config=BrowserConfig( + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + new_context_config=BrowserContextConfig( + keep_alive=True, + ), + ), +) + controller = Controller() task = 'Find the founders of browser-use and draft them a short personalized message' -agent = Agent(task=task, llm=llm, controller=controller) +agent = Agent(task=task, llm=llm, controller=controller, browser=browser) async def main(): diff --git a/examples/features/multiple_agents_same_browser.py b/examples/features/multiple_agents_same_browser.py index 013b33ff3f..dd48e99fa3 100644 --- a/examples/features/multiple_agents_same_browser.py +++ b/examples/features/multiple_agents_same_browser.py @@ -7,7 +7,7 @@ import asyncio -from browser_use import Agent, Browser, Controller +from browser_use import Agent, Browser # Video: https://preview.screen.studio/share/8Elaq9sm diff --git a/examples/features/planner.py b/examples/features/planner.py index 37f595375c..85bd23c8f8 100644 --- a/examples/features/planner.py +++ b/examples/features/planner.py @@ -4,7 +4,6 @@ from browser_use import Agent - llm = ChatOpenAI(model='gpt-4o', temperature=0.0) planner_llm = ChatOpenAI( model='o3-mini', diff --git a/examples/features/restrict_urls.py b/examples/features/restrict_urls.py index 398c785596..f481277e0c 100644 --- a/examples/features/restrict_urls.py +++ b/examples/features/restrict_urls.py @@ -1,29 +1,26 @@ import os import sys -from langchain_anthropic import ChatAnthropic from langchain_openai import ChatOpenAI from browser_use.browser.context import BrowserContextConfig sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import argparse import asyncio from browser_use import Agent from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.controller.service import Controller llm = ChatOpenAI(model='gpt-4o', temperature=0.0) task = ( - 'go to google.com and search for openai.com and click on the first link then extract content and scroll down - whats there?' + "go to google.com and search for openai.com and click on the first link then extract content and scroll down - what's there?" 
) allowed_domains = ['google.com'] browser = Browser( config=BrowserConfig( - chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', new_context_config=BrowserContextConfig( allowed_domains=allowed_domains, ), diff --git a/examples/features/result_processing.py b/examples/features/result_processing.py index 53177f4eee..07f4851615 100644 --- a/examples/features/result_processing.py +++ b/examples/features/result_processing.py @@ -2,12 +2,7 @@ import sys from pprint import pprint -from browser_use.browser.browser import Browser, BrowserConfig -from browser_use.browser.context import ( - BrowserContext, - BrowserContextConfig, - BrowserContextWindowSize, -) +from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import asyncio @@ -16,14 +11,12 @@ from browser_use import Agent from browser_use.agent.views import AgentHistoryList -from browser_use.controller.service import Controller llm = ChatOpenAI(model='gpt-4o') browser = Browser( config=BrowserConfig( headless=False, disable_security=True, - extra_chromium_args=['--window-size=2000,2000'], ) ) @@ -33,7 +26,7 @@ async def main(): config=BrowserContextConfig( trace_path='./tmp/result_processing', no_viewport=False, - browser_window_size=BrowserContextWindowSize(width=1280, height=1000), + browser_window_size={'width': 1280, 'height': 1000}, ) ) as browser_context: agent = Agent( diff --git a/examples/features/save_trace.py b/examples/features/save_trace.py index f2b4bb677f..657a6ed81b 100644 --- a/examples/features/save_trace.py +++ b/examples/features/save_trace.py @@ -16,9 +16,7 @@ async def main(): browser = Browser() - async with await browser.new_context( - config=BrowserContextConfig(trace_path='./tmp/traces/') - ) as context: + async with await browser.new_context(config=BrowserContextConfig(trace_path='./tmp/traces/')) as context: agent = Agent( task='Go to hackernews, then go to apple.com and return all titles of open tabs', llm=llm, diff --git a/examples/features/small_model_for_extraction.py b/examples/features/small_model_for_extraction.py index e86e0235bd..859050a2af 100644 --- a/examples/features/small_model_for_extraction.py +++ b/examples/features/small_model_for_extraction.py @@ -4,7 +4,6 @@ from langchain_openai import ChatOpenAI from browser_use import Agent -from browser_use.controller.service import Controller load_dotenv() diff --git a/examples/features/task_with_memory.py b/examples/features/task_with_memory.py new file mode 100644 index 0000000000..4d9aa2ef66 --- /dev/null +++ b/examples/features/task_with_memory.py @@ -0,0 +1,98 @@ +import asyncio +import json +from typing import List + +from dotenv import load_dotenv + +load_dotenv() + +from langchain_openai import ChatOpenAI +from pydantic import BaseModel + +from browser_use import Agent, Browser, BrowserConfig, Controller + +links = [ + 'https://docs.mem0.ai/components/llms/models/litellm', + 'https://docs.mem0.ai/components/llms/models/mistral_AI', + 'https://docs.mem0.ai/components/llms/models/ollama', + 'https://docs.mem0.ai/components/llms/models/openai', + 'https://docs.mem0.ai/components/llms/models/together', + 'https://docs.mem0.ai/components/llms/models/xAI', + 'https://docs.mem0.ai/components/llms/overview', + 'https://docs.mem0.ai/components/vectordbs/config', + 'https://docs.mem0.ai/components/vectordbs/dbs/azure_ai_search', + 
'https://docs.mem0.ai/components/vectordbs/dbs/chroma', + 'https://docs.mem0.ai/components/vectordbs/dbs/elasticsearch', + 'https://docs.mem0.ai/components/vectordbs/dbs/milvus', + 'https://docs.mem0.ai/components/vectordbs/dbs/opensearch', + 'https://docs.mem0.ai/components/vectordbs/dbs/pgvector', + 'https://docs.mem0.ai/components/vectordbs/dbs/pinecone', + 'https://docs.mem0.ai/components/vectordbs/dbs/qdrant', + 'https://docs.mem0.ai/components/vectordbs/dbs/redis', + 'https://docs.mem0.ai/components/vectordbs/dbs/supabase', + 'https://docs.mem0.ai/components/vectordbs/dbs/vertex_ai_vector_search', + 'https://docs.mem0.ai/components/vectordbs/dbs/weaviate', + 'https://docs.mem0.ai/components/vectordbs/overview', + 'https://docs.mem0.ai/contributing/development', + 'https://docs.mem0.ai/contributing/documentation', + 'https://docs.mem0.ai/core-concepts/memory-operations', + 'https://docs.mem0.ai/core-concepts/memory-types', +] + + +class Link(BaseModel): + url: str + title: str + summary: str + + +class Links(BaseModel): + links: List[Link] + + +initial_actions = [ + {'open_tab': {'url': 'https://docs.mem0.ai/'}}, +] +controller = Controller(output_model=Links) +task_description = f""" +Visit all the links provided in {links} and summarize the content of the page with url and title. There are {len(links)} links to visit. Make sure to visit all the links. Return a json with the following format: [{{url: <url>, title: <title>, summary: <summary>}}]. + +Guidelines: +1. Strictly stay on the domain https://docs.mem0.ai +2. Do not visit any other websites. +3. Ignore the links that are hashed (#) or javascript (:), or mailto, or tel, or other protocols +4. Don't visit any other url other than the ones provided above. +5. Capture the unique urls which are not already visited. +6. If you visit any page that doesn't have host name docs.mem0.ai, then do not visit it and come back to the page with host name docs.mem0.ai. +""" + + +async def main(max_steps=500): + config = BrowserConfig(headless=True) + browser = Browser(config=config) + + agent = Agent( + task=task_description, + llm=ChatOpenAI(model='gpt-4o-mini'), + controller=controller, + initial_actions=initial_actions, + enable_memory=True, + browser=browser, + ) + history = await agent.run(max_steps=max_steps) + result = history.final_result() + parsed_result = [] + if result: + parsed: Links = Links.model_validate_json(result) + print(f'Total parsed links: {len(parsed.links)}') + for link in parsed.links: + parsed_result.append({'title': link.title, 'url': link.url, 'summary': link.summary}) + else: + print('No result') + + with open('result.json', 'w+') as f: + f.write(json.dumps(parsed_result, indent=4)) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/features/validate_output.py b/examples/features/validate_output.py index afe96083ed..332c4fddad 100644 --- a/examples/features/validate_output.py +++ b/examples/features/validate_output.py @@ -1,5 +1,5 @@ """ -Demostrate output validator. +Demonstrate output validator. @dev You need to add OPENAI_API_KEY to your environment variables. """ diff --git a/examples/integrations/discord/discord_api.py b/examples/integrations/discord/discord_api.py index 32ee5c0b08..66a2640bf2 100644 --- a/examples/integrations/discord/discord_api.py +++ b/examples/integrations/discord/discord_api.py @@ -55,9 +55,7 @@ def __init__( intents.members = True # Enable members intent for user info # Initialize the bot with a command prefix and intents.
- super().__init__( - command_prefix='!', intents=intents - ) # You may not need prefix, just here for flexibility + super().__init__(command_prefix='!', intents=intents) # You may not need prefix, just here for flexibility # self.tree = app_commands.CommandTree(self) # Initialize command tree for slash commands. @@ -86,12 +84,8 @@ async def on_message(self, message): print(f'Error sending start message: {e}') try: - agent_message = await self.run_agent( - message.content.replace(f'{self.prefix} ', '').strip() - ) - await message.channel.send( - content=f'{agent_message}', reference=message, mention_author=True - ) + agent_message = await self.run_agent(message.content.replace(f'{self.prefix} ', '').strip()) + await message.channel.send(content=f'{agent_message}', reference=message, mention_author=True) except Exception as e: await message.channel.send( content=f'Error during task execution: {str(e)}', diff --git a/examples/integrations/discord/discord_example.py b/examples/integrations/discord/discord_example.py index 259e68cc61..c7435d8546 100644 --- a/examples/integrations/discord/discord_example.py +++ b/examples/integrations/discord/discord_example.py @@ -29,7 +29,7 @@ * Click “Authorize”. --> Note: The person adding the bot needs "Manage Server" permissions. 6. Run the code below to start the bot with your bot token. -7. Write e.g. "/bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel. +7. Write e.g. "/bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel. """ import os diff --git a/examples/integrations/slack/README.md b/examples/integrations/slack/README.md index af98593046..3184dbc143 100644 --- a/examples/integrations/slack/README.md +++ b/examples/integrations/slack/README.md @@ -38,14 +38,14 @@ Steps to create and configure a Slack bot: 6. Invite the bot to a channel: * Use the `/invite @your-bot-name` command in the Slack channel where you want the bot to be active. 7. Run the code in `examples/slack_example.py` to start the bot with your bot token and signing secret. -8. Write e.g. "$bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel. +8. Write e.g. "$bu what's the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel. ## Installing and Starting ngrok To expose your local server to the internet, you can use ngrok. Follow these steps to install and start ngrok: 1. Download ngrok from the official website: https://ngrok.com/download -2. Create a free account and follow the offical steps to install ngrok. +2. Create a free account and follow the official steps to install ngrok. 3. 
Start ngrok by running the following command in your terminal: ```sh ngrok http 3000 @@ -73,4 +73,4 @@ To run this example, you need to install the following packages: You can install these packages using pip: ```sh -pip install fastapi uvicorn slack_sdk \ No newline at end of file +pip install fastapi uvicorn slack_sdk diff --git a/examples/integrations/slack/slack_api.py b/examples/integrations/slack/slack_api.py index 3df930a890..956e2aa669 100644 --- a/examples/integrations/slack/slack_api.py +++ b/examples/integrations/slack/slack_api.py @@ -1,12 +1,14 @@ import logging -from browser_use import BrowserConfig -from fastapi import FastAPI, Request, HTTPException, Depends + from dotenv import load_dotenv -from slack_sdk.web.async_client import AsyncWebClient +from fastapi import Depends, FastAPI, HTTPException, Request +from langchain_core.language_models.chat_models import BaseChatModel from slack_sdk.errors import SlackApiError from slack_sdk.signature import SignatureVerifier +from slack_sdk.web.async_client import AsyncWebClient + +from browser_use import BrowserConfig from browser_use.agent.service import Agent, Browser -from langchain_core.language_models.chat_models import BaseChatModel from browser_use.logging_config import setup_logging load_dotenv() @@ -16,96 +18,107 @@ app = FastAPI() + class SlackBot: - def __init__(self, llm: BaseChatModel, bot_token: str, signing_secret: str, ack: bool = False, browser_config: BrowserConfig = BrowserConfig(headless=True)): - if not bot_token or not signing_secret: - raise ValueError("Bot token and signing secret must be provided") - - self.llm = llm - self.ack = ack - self.browser_config = browser_config - self.client = AsyncWebClient(token=bot_token) - self.signature_verifier = SignatureVerifier(signing_secret) - self.processed_events = set() - logger.info("SlackBot initialized") - - async def handle_event(self, event, event_id): - try: - logger.info(f"Received event id: {event_id}") - if not event_id: - logger.warning("Event ID missing in event data") - return - - if event_id in self.processed_events: - logger.info(f"Event {event_id} already processed") - return - self.processed_events.add(event_id) - - if 'subtype' in event and event['subtype'] == 'bot_message': - return - - text = event.get('text') - user_id = event.get('user') - if text and text.startswith('$bu '): - task = text[len('$bu '):].strip() - if self.ack: - try: - await self.send_message(event['channel'], f'<@{user_id}> Starting browser use task...', thread_ts=event.get('ts')) - except Exception as e: - logger.error(f"Error sending start message: {e}") - - try: - agent_message = await self.run_agent(task) - await self.send_message(event['channel'], f'<@{user_id}> {agent_message}', thread_ts=event.get('ts')) - except Exception as e: - await self.send_message(event['channel'], f'Error during task execution: {str(e)}', thread_ts=event.get('ts')) - except Exception as e: - logger.error(f"Error in handle_event: {str(e)}") - - async def run_agent(self, task: str) -> str: - try: - browser = Browser(config=self.browser_config) - agent = Agent(task=task, llm=self.llm, browser=browser) - result = await agent.run() - - agent_message = None - if result.is_done(): - agent_message = result.history[-1].result[0].extracted_content - - if agent_message is None: - agent_message = 'Oops! Something went wrong while running Browser-Use.' 
- - return agent_message - - except Exception as e: - logger.error(f"Error during task execution: {str(e)}") - return f'Error during task execution: {str(e)}' - - async def send_message(self, channel, text, thread_ts=None): - try: - await self.client.chat_postMessage(channel=channel, text=text, thread_ts=thread_ts) - except SlackApiError as e: - logger.error(f"Error sending message: {e.response['error']}") - -@app.post("/slack/events") + def __init__( + self, + llm: BaseChatModel, + bot_token: str, + signing_secret: str, + ack: bool = False, + browser_config: BrowserConfig = BrowserConfig(headless=True), + ): + if not bot_token or not signing_secret: + raise ValueError('Bot token and signing secret must be provided') + + self.llm = llm + self.ack = ack + self.browser_config = browser_config + self.client = AsyncWebClient(token=bot_token) + self.signature_verifier = SignatureVerifier(signing_secret) + self.processed_events = set() + logger.info('SlackBot initialized') + + async def handle_event(self, event, event_id): + try: + logger.info(f'Received event id: {event_id}') + if not event_id: + logger.warning('Event ID missing in event data') + return + + if event_id in self.processed_events: + logger.info(f'Event {event_id} already processed') + return + self.processed_events.add(event_id) + + if 'subtype' in event and event['subtype'] == 'bot_message': + return + + text = event.get('text') + user_id = event.get('user') + if text and text.startswith('$bu '): + task = text[len('$bu ') :].strip() + if self.ack: + try: + await self.send_message( + event['channel'], f'<@{user_id}> Starting browser use task...', thread_ts=event.get('ts') + ) + except Exception as e: + logger.error(f'Error sending start message: {e}') + + try: + agent_message = await self.run_agent(task) + await self.send_message(event['channel'], f'<@{user_id}> {agent_message}', thread_ts=event.get('ts')) + except Exception as e: + await self.send_message(event['channel'], f'Error during task execution: {str(e)}', thread_ts=event.get('ts')) + except Exception as e: + logger.error(f'Error in handle_event: {str(e)}') + + async def run_agent(self, task: str) -> str: + try: + browser = Browser(config=self.browser_config) + agent = Agent(task=task, llm=self.llm, browser=browser) + result = await agent.run() + + agent_message = None + if result.is_done(): + agent_message = result.history[-1].result[0].extracted_content + + if agent_message is None: + agent_message = 'Oops! Something went wrong while running Browser-Use.' 
+ + return agent_message + + except Exception as e: + logger.error(f'Error during task execution: {str(e)}') + return f'Error during task execution: {str(e)}' + + async def send_message(self, channel, text, thread_ts=None): + try: + await self.client.chat_postMessage(channel=channel, text=text, thread_ts=thread_ts) + except SlackApiError as e: + logger.error(f'Error sending message: {e.response["error"]}') + + +@app.post('/slack/events') async def slack_events(request: Request, slack_bot: SlackBot = Depends()): - try: - if not slack_bot.signature_verifier.is_valid_request(await request.body(), dict(request.headers)): - logger.warning("Request verification failed") - raise HTTPException(status_code=400, detail="Request verification failed") - - event_data = await request.json() - logger.info(f"Received event data: {event_data}") - if 'challenge' in event_data: - return {"challenge": event_data['challenge']} - - if 'event' in event_data: - try: - await slack_bot.handle_event(event_data.get('event'), event_data.get('event_id')) - except Exception as e: - logger.error(f"Error handling event: {str(e)}") - - return {} - except Exception as e: - logger.error(f"Error in slack_events: {str(e)}") - raise HTTPException(status_code=500, detail="Internal Server Error") \ No newline at end of file + try: + if not slack_bot.signature_verifier.is_valid_request(await request.body(), dict(request.headers)): + logger.warning('Request verification failed') + raise HTTPException(status_code=400, detail='Request verification failed') + + event_data = await request.json() + logger.info(f'Received event data: {event_data}') + if 'challenge' in event_data: + return {'challenge': event_data['challenge']} + + if 'event' in event_data: + try: + await slack_bot.handle_event(event_data.get('event'), event_data.get('event_id')) + except Exception as e: + logger.error(f'Error handling event: {str(e)}') + + return {} + except Exception as e: + logger.error(f'Error in slack_events: {str(e)}') + raise HTTPException(status_code=500, detail='Internal Server Error') diff --git a/examples/models/azure_openai.py b/examples/models/azure_openai.py index 854195b637..d7ed35e248 100644 --- a/examples/models/azure_openai.py +++ b/examples/models/azure_openai.py @@ -7,6 +7,8 @@ import os import sys +from dotenv import load_dotenv + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import asyncio @@ -15,28 +17,34 @@ from browser_use import Agent +load_dotenv() + # Retrieve Azure-specific environment variables -azure_openai_api_key = os.environ.get('AZURE_OPENAI_API_KEY') -azure_openai_endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT') +azure_openai_api_key = os.getenv('AZURE_OPENAI_API_KEY') +azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') + +if not azure_openai_api_key or not azure_openai_endpoint: + raise ValueError('AZURE_OPENAI_API_KEY or AZURE_OPENAI_ENDPOINT is not set') # Initialize the Azure OpenAI client llm = AzureChatOpenAI( - model_name='gpt-4o', - openai_api_key=azure_openai_api_key, - azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base - deployment_name='gpt-4o', # Use deployment_name for Azure models - api_version='2024-08-01-preview' # Explicitly set the API version here + model_name='gpt-4o', + openai_api_key=azure_openai_api_key, + azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base + deployment_name='gpt-4o', # Use deployment_name for Azure models + api_version='2024-08-01-preview', # 
Explicitly set the API version here ) agent = Agent( - task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', - llm=llm, + task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result', + llm=llm, + enable_memory=False, ) async def main(): - await agent.run(max_steps=10) - input('Press Enter to continue...') + await agent.run(max_steps=10) + input('Press Enter to continue...') -asyncio.run(main()) \ No newline at end of file +asyncio.run(main()) diff --git a/examples/models/bedrock_claude.py b/examples/models/bedrock_claude.py index eaf4e20477..091f6cf729 100644 --- a/examples/models/bedrock_claude.py +++ b/examples/models/bedrock_claude.py @@ -4,14 +4,16 @@ @dev Ensure AWS environment variables are set correctly for Bedrock access. """ +import argparse +import asyncio import os import sys -from langchain_aws import ChatBedrock +import boto3 +from botocore.config import Config +from langchain_aws import ChatBedrockConverse sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import argparse -import asyncio from browser_use import Agent from browser_use.browser.browser import Browser, BrowserConfig @@ -19,19 +21,23 @@ def get_llm(): - return ChatBedrock( - model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0", - temperature=0.0, - max_tokens=None, - ) + config = Config(retries={'max_attempts': 10, 'mode': 'adaptive'}) + bedrock_client = boto3.client('bedrock-runtime', region_name='us-east-1', config=config) + + return ChatBedrockConverse( + model_id='us.anthropic.claude-3-5-sonnet-20241022-v2:0', + temperature=0.0, + max_tokens=None, + client=bedrock_client, + ) # Define the task for the agent task = ( - "Visit cnn.com, navigate to the 'World News' section, and identify the latest headline. " - "Open the first article and summarize its content in 3-4 sentences. " - "Additionally, analyze the sentiment of the article (positive, neutral, or negative) " - "and provide a confidence score for the sentiment. Present the result in a tabular format." + "Visit cnn.com, navigate to the 'World News' section, and identify the latest headline. " + 'Open the first article and summarize its content in 3-4 sentences. ' + 'Additionally, analyze the sentiment of the article (positive, neutral, or negative) ' + 'and provide a confidence score for the sentiment. Present the result in a tabular format.' 
) parser = argparse.ArgumentParser() @@ -41,19 +47,23 @@ def get_llm(): llm = get_llm() browser = Browser( - config=BrowserConfig( - # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - ) + config=BrowserConfig( + # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + ) ) agent = Agent( - task=args.query, llm=llm, controller=Controller(), browser=browser, validate_output=True, + task=args.query, + llm=llm, + controller=Controller(), + browser=browser, + validate_output=True, ) async def main(): - await agent.run(max_steps=30) - await browser.close() + await agent.run(max_steps=30) + await browser.close() asyncio.run(main()) diff --git a/examples/models/claude-3.7-sonnet.py b/examples/models/claude-3.7-sonnet.py index a2b8f4f0bc..8e885435dd 100644 --- a/examples/models/claude-3.7-sonnet.py +++ b/examples/models/claude-3.7-sonnet.py @@ -5,6 +5,7 @@ import os import sys + from dotenv import load_dotenv from langchain_anthropic import ChatAnthropic diff --git a/examples/models/ollama.py b/examples/models/ollama.py index a824cb405c..dc297598d2 100644 --- a/examples/models/ollama.py +++ b/examples/models/ollama.py @@ -7,28 +7,30 @@ # os.environ["OLLAMA_HOST"] = "http://x.x.x.x:11434" import asyncio + +from langchain_ollama import ChatOllama + from browser_use import Agent from browser_use.agent.views import AgentHistoryList -from langchain_ollama import ChatOllama async def run_search() -> AgentHistoryList: - agent = Agent( - task="Search for a 'browser use' post on the r/LocalLLaMA subreddit and open it.", - llm=ChatOllama( - model="qwen2.5:32b-instruct-q4_K_M", - num_ctx=32000, - ), - ) + agent = Agent( + task="Search for a 'browser use' post on the r/LocalLLaMA subreddit and open it.", + llm=ChatOllama( + model='qwen2.5:32b-instruct-q4_K_M', + num_ctx=32000, + ), + ) - result = await agent.run() - return result + result = await agent.run() + return result async def main(): - result = await run_search() - print("\n\n", result) + result = await run_search() + print('\n\n', result) -if __name__ == "__main__": - asyncio.run(main()) +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/models/qwen.py b/examples/models/qwen.py index e31316fc86..0b621cdf05 100644 --- a/examples/models/qwen.py +++ b/examples/models/qwen.py @@ -1,5 +1,4 @@ import asyncio -import os from langchain_ollama import ChatOllama @@ -9,10 +8,7 @@ async def run_search(): agent = Agent( task=( - '1. Go to https://www.reddit.com/r/LocalLLaMA' - "2. Search for 'browser use' in the search bar" - '3. Click search' - '4. Call done' + "1. Go to https://www.reddit.com/r/LocalLLaMA2. Search for 'browser use' in the search bar3. Click search4. 
Call done" ), llm=ChatOllama( # model='qwen2.5:32b-instruct-q4_K_M', diff --git a/examples/notebook/agent_browsing.ipynb b/examples/notebook/agent_browsing.ipynb index 2b5ae837de..de5a9e97bf 100644 --- a/examples/notebook/agent_browsing.ipynb +++ b/examples/notebook/agent_browsing.ipynb @@ -1,760 +1,757 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "ZRGlUb8O4fPV" + }, + "outputs": [], + "source": [ + "%pip install -U langgraph langchain_google_genai langchain_community langgraph-checkpoint-postgres openai langchain_groq" + ] }, - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "ZRGlUb8O4fPV" - }, - "outputs": [], - "source": [ - "\n", - "%pip install -U langgraph langchain_google_genai langchain_community langgraph-checkpoint-postgres openai langchain_groq" - ] - }, - { - "cell_type": "code", - "source": [ - "%%capture --no-stderr\n", - "%pip install --upgrade --quiet playwright > /dev/null\n", - "%pip install --upgrade --quiet lxml browser-use langchain_openai" - ], - "metadata": { - "id": "cMfPUmHIxqTi" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!playwright install" - ], - "metadata": { - "id": "kkZ7jVUOUV7Q" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!pip install \"anyio<4\"" - ], - "metadata": { - "id": "-_T1MhnGUl2q" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# This import is required only for jupyter notebooks, since they have their own eventloop\n", - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()" - ], - "metadata": { - "id": "yARYirp1UhDR" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from langchain_openai import ChatOpenAI\n", - "from google.colab import userdata\n", - "\n", - "\n", - "llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0, api_key=userdata.get('Open_api_key'))\n", - "\n", - "\n", - "\n" - ], - "metadata": { - "id": "jyVP10O_5Qck" - }, - "execution_count": 4, - "outputs": [] + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "cMfPUmHIxqTi" + }, + "outputs": [], + "source": [ + "%%capture --no-stderr\n", + "%pip install --upgrade --quiet playwright > /dev/null\n", + "%pip install --upgrade --quiet lxml browser-use langchain_openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kkZ7jVUOUV7Q" + }, + "outputs": [], + "source": [ + "!playwright install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-_T1MhnGUl2q" + }, + "outputs": [], + "source": [ + "!pip install \"anyio<4\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yARYirp1UhDR" + }, + "outputs": [], + "source": [ + "# This import is required only for jupyter notebooks, since they have their own eventloop\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "jyVP10O_5Qck" + }, + "outputs": [], + "source": [ + "from google.colab import userdata\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model='gpt-4o-mini', temperature=0, 
api_key=userdata.get('Open_api_key'))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "e9duizdv5cOH", + "outputId": "a07b1702-d485-4641-c307-601e6ab34b9b" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "llm.invoke(\"hi\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "e9duizdv5cOH", - "outputId": "a07b1702-d485-4641-c307-601e6ab34b9b" - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 8, 'total_tokens': 18, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-28a9088f-7539-412a-aa80-1663be40e74f-0', usage_metadata={'input_tokens': 8, 'output_tokens': 10, 'total_tokens': 18, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})" - ] - }, - "metadata": {}, - "execution_count": 5 - } + "data": { + "text/plain": [ + "AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 8, 'total_tokens': 18, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-28a9088f-7539-412a-aa80-1663be40e74f-0', usage_metadata={'input_tokens': 8, 'output_tokens': 10, 'total_tokens': 18, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llm.invoke('hi')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "wS8ouhiVQ2dL", + "outputId": "653879a8-b3ac-4178-edee-5cd834e3404a" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "from browser_use import Agent, Browser\n", - "from browser_use import BrowserConfig\n", - "from langchain_openai import ChatOpenAI\n", - "import asyncio\n", - "\n", - "# Basic configuration for the browser\n", - "config = BrowserConfig(\n", - " headless=True, # Run in headless mode\n", - " # disable_security=True # Uncomment if you want to disable security\n", - ")\n", - "\n", - "# Initialize the browser with the specified configuration\n", - "browser = Browser(config=config)\n", - "\n", - "async def main():\n", - " # Initialize the agent with the task and language model\n", - " agent = Agent(\n", - " task=\"What is Langgraph\",\n", - " llm=llm, # Replace with your LLM configuration\n", - " browser=browser,\n", - " generate_gif=False # Disable GIF generation\n", - " )\n", - "\n", - " # Run the agent and get results asynchronously\n", - " result = await agent.run()\n", - "\n", - " # Process results token-wise\n", - " for action in 
result.action_results():\n", - " print(action.extracted_content,end=\"\\r\",flush=True)\n", - " print(\"\\n\\n\")\n", - " # if action.is_done:\n", - " # print(action.extracted_content)\n", - "\n", - " # Close the browser after completion\n", - " await browser.close()\n", - "\n", - "# Run the asynchronous main function\n", - "asyncio.run(main())\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wS8ouhiVQ2dL", - "outputId": "653879a8-b3ac-4178-edee-5cd834e3404a" - }, - "execution_count": 32, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "🔍 Searched for \"What is Langgraph?\" in Google\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "📄 Extracted page as markdown\n", - ": ![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac879f622b3cb30dd7_cohere-logos-\n", - "idbbhgStc3%201.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacfdbb3072f5258f66_hugging%20face.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdaceb29ce1602beb431_logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac5f6f2a8c34e5575b_wblogo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a8941_mosaic.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac5092327565075208_aws.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacb28fe27c7784c797_goggle%20drive.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac325d487977a3398b_milvus.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac6348e83137a80c17_openai.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac0d888384ad7d31f3_redis.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacf9d2dfca1d2a4c81_google%20cloud.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac76b6b8b79414144f_datastax%20logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac15e6989ae752a9b5_notion%20logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac485cb9900ddafda3_anthropic-\n", - "logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a894d_mongodb.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacaeab9fdc6452063c_supabase.png)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac879f622b3cb30dd7_cohere-logos-\n", - "idbbhgStc3%201.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacfdbb3072f5258f66_hugging%20face.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdaceb29ce1602beb431_logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac5f6f2a8c34e5575b_wblogo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a8941_mosaic.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac5092327565075208_aws.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacb28fe27c7784c797_goggle%20drive.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac325d487977a3398b_milvus.png)![](https://cdn.prod.website-\n", - 
"files.com/65b8cd72835ceeacd4449a53/65c8fdac6348e83137a80c17_openai.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac0d888384ad7d31f3_redis.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacf9d2dfca1d2a4c81_google%20cloud.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac76b6b8b79414144f_datastax%20logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac15e6989ae752a9b5_notion%20logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdac485cb9900ddafda3_anthropic-\n", - "logo.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a894d_mongodb.png)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c8fdacaeab9fdc6452063c_supabase.png)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667b080e4b3ca12dc5d5d439_Langgraph%20UI-2.webp)\n", - "\n", - "## Controllable cognitive architecture for any task\n", - "\n", - "LangGraph's flexible framework supports diverse control flows – single agent,\n", - "multi-agent, hierarchical, sequential – and robustly handles realistic,\n", - "complex scenarios. \n", - " \n", - "Ensure reliability with easy-to-add moderation and quality loops that prevent\n", - "agents from veering off course. \n", - " \n", - "Use LangGraph Platform to templatize your cognitive architecture so that\n", - "tools, prompts, and models are easily configurable with LangGraph Platform\n", - "Assistants.\n", - "\n", - "[See the docs ](https://langchain-ai.github.io/langgraph/)\n", - "\n", - "## Designed for human-agent collaboration\n", - "\n", - "With built-in statefulness, LangGraph agents seamlessly collaborate with\n", - "humans by writing drafts for review and awaiting approval before acting.\n", - "Easily inspect the agent’s actions and \"time-travel\" to roll back and take a\n", - "different action to correct course.\n", - "\n", - "[Read a conceptual guide ](https://langchain-\n", - "ai.github.io/langgraph/concepts/agentic_concepts/#human-in-the-loop)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c93d559216bb904fe85a8_gif7%20\\(1\\).gif)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c57f274b66a77e2a26b82_CleanShot2024-06-26at17.08.03-ezgif.com-\n", - "video-to-gif-converter.gif)\n", - "\n", - "## First class streaming support for better UX design\n", - "\n", - "Bridge user expectations and agent capabilities with native token-by-token\n", - "streaming and streaming of intermediate steps, helpful for showing agent\n", - "reasoning and actions back to the user as they happen. Use LangGraph\n", - "Platform's API to deliver dynamic and interactive user experiences.\n", - "\n", - "[Learn more ](https://langchain-ai.github.io/langgraph/how-tos/streaming-\n", - "tokens/)\n", - "\n", - "## Why choose LangGraph?\n", - "\n", - "### Control, moderate, and guide your agent’s actions.\n", - "\n", - "Prevent agents from veering off course and ensure reliability with easy-to-add\n", - "moderation and quality loops. Add human-in-the-loop to steer and approve agent\n", - "actions.\n", - "\n", - "### Expressive and customizable agent and multi-agent workflows.\n", - "\n", - "LangGraph’s low level abstractions offer the flexibility needed to create\n", - "sophisticated agents. 
Design diverse control flows – single, multi-agent,\n", - "hierarchical, sequential – all with one framework.\n", - "\n", - "### Persisted context for long-term interactions.\n", - "\n", - "With its stateful design, LangGraph stores conversation histories and session\n", - "data to maintain context over time and ensure smooth handoffs in agentic\n", - "systems.\n", - "\n", - "### First-class streaming support for better UX design.\n", - "\n", - "Bridge user expectations and agent capabilities with native token-by-token\n", - "streaming of intermediate steps, helpful for showing agent reasoning and\n", - "actions back to the user as they happen.\n", - "\n", - "## LangGraph Platform: \n", - "Deploy & develop agents at scale\n", - "\n", - "Craft agent-appropriate UXs using LangGraph Platform's APIs. Quickly deploy\n", - "and scale your agent with purpose-built infrastructure. Choose from multiple\n", - "deployment options.\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/67878de387cf10f90c7ad65f_LangGraph---\n", - "Memory-HQ.gif)\n", - "\n", - "## Dynamic APIs for designing agent UXs.\n", - "\n", - "Craft personalized experiences with the long-term memory API to recall\n", - "information across conversation sessions. Expose, update, and rewind your\n", - "app's state for better user visibility, steering, and interaction. Kick off\n", - "long-running background jobs for research-style or multi-step work.\n", - "\n", - "[See the docs ](https://langchain-ai.github.io/langgraph/how-tos/streaming-\n", - "tokens/)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/67879a0dd9100d8e643eb39e_LangGraph%20-%20Fault-\n", - "tolerant%20scalability.gif)\n", - "\n", - "## Fault-tolerant scalability.\n", - "\n", - "Handle large workloads gracefully with horizontally-scaling servers, task\n", - "queues, and built-in persistence. Enhance resilience with intelligent caching\n", - "and automated retries.\n", - "\n", - "[Learn more in the blog ](https://langchain-ai.github.io/langgraph/how-\n", - "tos/streaming-tokens/)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c93d559216bb904fe85a8_gif7%20\\(1\\).gif)\n", - "\n", - "## An end-to-end agent experience.\n", - "\n", - "Simplify prototyping, debugging, and sharing of agents in our visual LangGraph\n", - "Studio. Deploy your application with 1-click deploy with our SaaS offering or\n", - "within your own VPC. Then, monitor app performance with LangSmith.\n", - "\n", - "[Discover LangGraph Studio ](https://langchain-ai.github.io/langgraph/how-\n", - "tos/streaming-tokens/)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/66db8c2317fe5b9ad2b84ea0_lcacademylogo.png)\n", - "\n", - "## Introduction to LangGraph\n", - "\n", - "Learn the basics of LangGraph in this LangChain Academy Course. 
You'll learn\n", - "how to build agents that automate real-world tasks with LangGraph\n", - "orchestration.\n", - "\n", - "[Enroll for free](https://academy.langchain.com/courses/intro-to-\n", - "langgraph)[Book enterprise\n", - "training](https://airtable.com/appGjCAN6126Jm7K8/pagNAp7niHQzRH8zk/form)\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6787ae429071ad3575902249_card%201%201.webp)![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6787ae0bce5c99dd808545ce_card%202.webp)\n", - "\n", - "## Deploy agents at scale, monitor carefully, iterate boldly\n", - "\n", - "Design agent-driven user experiences with LangGraph Platform's APIs. Quickly\n", - "deploy and scale your application with infrastructure built for agents. Choose\n", - "from multiple deployment options.\n", - "\n", - "### Fault-tolerant scalability\n", - "\n", - "Handle large workloads gracefully with horizontally-scaling servers, task\n", - "queues, and built-in persistence. Enhance resilience with intelligent caching\n", - "and automated retries.\n", - "\n", - "### Dynamic APIs for designing agent experience\n", - "\n", - "Craft personalized user experiences with APIs featuring long-term memory to\n", - "recall information across conversation sessions. Track, update, and rewind\n", - "your app's state for easy human steering and interaction. Kick off long-\n", - "running background jobs for research-style or multi-step work.\n", - "\n", - "### Integrated developer experience\n", - "\n", - "Simplify prototyping, debugging, and sharing of agents in our visual LangGraph\n", - "Studio. Deploy your application with 1-click deploy with our SaaS offering or\n", - "within your own VPC. Then, monitor app performance with LangSmith.\n", - "\n", - "### Trusted by companies taking agency in AI innovation:\n", - "\n", - "LangGraph helps teams of all sizes, across all industries, from ambitious\n", - "startups to established enterprises.\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c5308aea1371b447cc4af9_elastic-ar21.png)\n", - "\n", - "“LangChain is streets ahead with what they've put forward with LangGraph.\n", - "LangGraph sets the foundation for how we can build and scale AI workloads —\n", - "from conversational agents, complex task automation, to custom LLM-backed\n", - "experiences that 'just work'. The next chapter in building complex production-\n", - "ready features with LLMs is agentic, and with LangGraph and LangSmith,\n", - "LangChain delivers an out-of-the-box solution to iterate quickly, debug\n", - "immediately, and scale effortlessly.”\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667b26a1b4576291d6a9335b_garrett%20spong%201.webp)\n", - "\n", - "Garrett Spong\n", - "\n", - "Principal SWE\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6679de9dc4e7bee218d4b058_Norwegian-Cruise-\n", - "Line-Logo%202-2.webp)\n", - "\n", - "“LangGraph has been instrumental for our AI development. 
Its robust framework\n", - "for building stateful, multi-actor applications with LLMs has transformed how\n", - "we evaluate and optimize the performance of our AI guest-facing solutions.\n", - "LangGraph enables granular control over the agent's thought process, which has\n", - "empowered us to make data-driven and deliberate decisions to meet the diverse\n", - "needs of our guests.”\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667b265bed5f5a9d26d6b7d6_andres%20torres%201.webp)\n", - "\n", - "Andres Torres\n", - "\n", - "Sr. Solutions Architect\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c6f809f0ebc7b1d72a99b_Replit.png)\n", - "\n", - "“It's easy to build the prototype of a coding agent, but deceptively hard to\n", - "improve its reliability. Replit wants to give a coding agent to millions of\n", - "users — reliability is our top priority, and will remain so for a long time.\n", - "LangGraph is giving us the control and ergonomics we need to build and ship\n", - "powerful coding agents.”\n", - "\n", - "“As Ally advances its exploration of Generative AI,\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c6fcaaa21bcf2fe006dbe_1690576438641%20\\(1\\)%201.webp)\n", - "\n", - "Michele Catasta\n", - "\n", - "President\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6679e1baf7ea357d0763cde1_ally-\n", - "bank%201-2.png)\n", - "\n", - "“As Ally advances its exploration of Generative AI, our tech labs is excited\n", - "by LangGraph, the new library from LangChain, which is central to our\n", - "experiments with multi-actor agentic workflows. We are committed to deepening\n", - "our partnership with LangChain.”\n", - "\n", - "“As Ally advances its exploration of Generative AI,\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6679e2d31352c6bd56c84280_ally.png)\n", - "\n", - "Sathish Muthukrishnan\n", - "\n", - "Chief Information, Data and Digital Officer\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/65c5308aea1371b447cc4af9_elastic-ar21.png)\n", - "\n", - "“LangChain is streets ahead with what they've put forward with LangGraph.\n", - "LangGraph sets the foundation for how we can build and scale AI workloads —\n", - "from conversational agents, complex task automation, to custom LLM-backed\n", - "experiences that 'just work'. The next chapter in building complex production-\n", - "ready features with LLMs is agentic, and with LangGraph and LangSmith,\n", - "LangChain delivers an out-of-the-box solution to iterate quickly, debug\n", - "immediately, and scale effortlessly.”\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667b26a1b4576291d6a9335b_garrett%20spong%201.webp)\n", - "\n", - "Garrett Spong\n", - "\n", - "Principal SWE\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6679de9dc4e7bee218d4b058_Norwegian-Cruise-\n", - "Line-Logo%202-2.webp)\n", - "\n", - "“LangGraph has been instrumental for our AI development. 
Its robust framework\n", - "for building stateful, multi-actor applications with LLMs has transformed how\n", - "we evaluate and optimize the performance of our AI guest-facing solutions.\n", - "LangGraph enables granular control over the agent's thought process, which has\n", - "empowered us to make data-driven and deliberate decisions to meet the diverse\n", - "needs of our guests.”\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667b265bed5f5a9d26d6b7d6_andres%20torres%201.webp)\n", - "\n", - "Andres Torres\n", - "\n", - "Sr. Solutions Architect\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c6f809f0ebc7b1d72a99b_Replit.png)\n", - "\n", - "“It's easy to build the prototype of a coding agent, but deceptively hard to\n", - "improve its reliability. Replit wants to give a coding agent to millions of\n", - "users — reliability is our top priority, and will remain so for a long time.\n", - "LangGraph is giving us the control and ergonomics we need to build and ship\n", - "powerful coding agents.”\n", - "\n", - "“As Ally advances its exploration of Generative AI,\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/667c6fcaaa21bcf2fe006dbe_1690576438641%20\\(1\\)%201.webp)\n", - "\n", - "Michele Catasta\n", - "\n", - "President\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6679e1baf7ea357d0763cde1_ally-\n", - "bank%201-2.png)\n", - "\n", - "“As Ally advances its exploration of Generative AI, our tech labs is excited\n", - "by LangGraph, the new library from LangChain, which is central to our\n", - "experiments with multi-actor agentic workflows. We are committed to deepening\n", - "our partnership with LangChain.”\n", - "\n", - "“As Ally advances its exploration of Generative AI,\n", - "\n", - "![](https://cdn.prod.website-\n", - "files.com/65b8cd72835ceeacd4449a53/6679e2d31352c6bd56c84280_ally.png)\n", - "\n", - "Sathish Muthukrishnan\n", - "\n", - "Chief Information, Data and Digital Officer\n", - "\n", - "## LangGraph FAQs\n", - "\n", - "Do I need to use LangChain to use LangGraph? What’s the difference?\n", - "\n", - "No. LangGraph is an orchestration framework for complex agentic systems and is\n", - "more low-level and controllable than LangChain agents. LangChain provides a\n", - "standard interface to interact with models and other components, useful for\n", - "straight-forward chains and retrieval flows.\n", - "\n", - "How is LangGraph different from other agent frameworks?\n", - "\n", - "Other agentic frameworks can work for simple, generic tasks but fall short for\n", - "complex tasks bespoke to a company’s needs. LangGraph provides a more\n", - "expressive framework to handle companies’ unique tasks without restricting\n", - "users to a single black-box cognitive architecture.\n", - "\n", - "Does LangGraph impact the performance of my app?\n", - "\n", - "LangGraph will not add any overhead to your code and is specifically designed\n", - "with streaming workflows in mind.\n", - "\n", - "Is LangGraph open source? Is it free?\n", - "\n", - "Yes. LangGraph is an MIT-licensed open-source library and is free to use.\n", - "\n", - "How are LangGraph and LangGraph Platform different?\n", - "\n", - "LangGraph is a stateful, orchestration framework that brings added control to\n", - "agent workflows. 
LangGraph Platform is a service for deploying and scaling\n", - "LangGraph applications, with an opinionated API for building agent UXs, plus\n", - "an integrated developer studio.\n", - "\n", - "LangGraph (open source)\n", - "\n", - "LangGraph Platform\n", - "\n", - "Features\n", - "\n", - "Stateful orchestration framework for agentic applications\n", - "\n", - "Scalable infrastructure for deploying LangGraph applications \n", - "\n", - "Python and JavaScript\n", - "\n", - "Python and JavaScript \n", - "\n", - "None\n", - "\n", - "Yes - useful for retrieving & updating state or long-term memory, or creating\n", - "a configurable assistant \n", - "\n", - "Basic\n", - "\n", - "Dedicated mode for token-by-token messages \n", - "\n", - "Community contributed\n", - "\n", - "Supported out-of-the-box \n", - "\n", - "Self-managed\n", - "\n", - "Managed Postgres with efficient storage \n", - "\n", - "Self-managed\n", - "\n", - "\\- Cloud SaaS \n", - "\\- Free self-hosted \n", - "\\- Enterprise \n", - "(BYOC or paid self-hosted) \n", - "\n", - "Self-managed\n", - "\n", - "Auto-scaling of task queues and servers \n", - "\n", - "Self-managed\n", - "\n", - "Automated retries \n", - "\n", - "Simple threading\n", - "\n", - "Supports double-texting \n", - "\n", - "None\n", - "\n", - "Cron scheduling \n", - "\n", - "None\n", - "\n", - "Integrated with LangSmith for observability \n", - "\n", - "LangGraph Studio for Desktop\n", - "\n", - "LangGraph Studio for Desktop & Cloud \n", - "\n", - "What are my deployment options for LangGraph Platform?\n", - "\n", - "We currently have the following deployment options for LangGraph applications: \n", - " \n", - "‍**Self-Hosted Lite** : A free (up to 1M nodes executed), limited version of\n", - "LangGraph Platform that you can run locally or in a self-hosted manner. This\n", - "version requires a LangSmith API key and logs all usage to LangSmith. Fewer\n", - "features are available than in paid plans. \n", - "‍**Cloud SaaS:** Fully managed and hosted as part of LangSmith, with automatic\n", - "updates and zero maintenance. \n", - "‍**Bring Your Own Cloud (BYOC):** Deploy LangGraph Platform within your VPC,\n", - "provisioned and run as a service. Keep data in your environment while\n", - "outsourcing the management of the service. \n", - "**Self-Hosted Enterprise:** Deploy LangGraph entirely on your own\n", - "infrastructure.\n", - "\n", - "Is LangGraph Platform open source?\n", - "\n", - "No. LangGraph Platform is proprietary software. \n", - " \n", - "There is a free, self-hosted version of LangGraph Platform with access to\n", - "basic features. The Cloud SaaS deployment option is free while in beta, but\n", - "will eventually be a paid service. We will always give ample notice before\n", - "charging for a service and reward our early adopters with preferential\n", - "pricing. The Bring Your Own Cloud (BYOC) and Self-Hosted Enterprise options\n", - "are also paid services. [Contact our sales team](/contact-sales) to learn\n", - "more. 
\n", - " \n", - "For more information, see our [LangGraph Platform pricing page](/pricing-\n", - "langgraph-platform).\n", - "\n", - "## Ready to start shipping reliable GenAI apps faster?\n", - "\n", - "Get started with LangChain, LangSmith, and LangGraph to enhance your LLM app\n", - "development, from prototype to production.\n", - "\n", - "[Contact Us](/contact-sales)[Sign Up](https://smith.langchain.com/)\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "LangGraph is a flexible framework designed for building and scaling agentic applications. It allows for complex task handling and human-agent collaboration, supporting various control flows such as single-agent, multi-agent, hierarchical, and sequential. Key features include:\n", - "\n", - "- **Statefulness**: LangGraph agents maintain context over time, enabling smooth interactions.\n", - "- **Streaming Support**: It provides native token-by-token streaming for better user experience.\n", - "- **Moderation and Quality Loops**: These features ensure agents remain reliable and on course.\n", - "- **Dynamic APIs**: LangGraph offers APIs for crafting personalized user experiences and managing long-term memory.\n", - "- **Deployment Options**: It supports various deployment methods, including self-hosted and cloud solutions.\n", - "\n", - "\n", - "\n", - "\n" - ] - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Searched for \"What is Langgraph?\" in Google\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "📄 Extracted page as markdown\n", + ": ![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac879f622b3cb30dd7_cohere-logos-\n", + "idbbhgStc3%201.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacfdbb3072f5258f66_hugging%20face.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdaceb29ce1602beb431_logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac5f6f2a8c34e5575b_wblogo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a8941_mosaic.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac5092327565075208_aws.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacb28fe27c7784c797_goggle%20drive.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac325d487977a3398b_milvus.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac6348e83137a80c17_openai.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac0d888384ad7d31f3_redis.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacf9d2dfca1d2a4c81_google%20cloud.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac76b6b8b79414144f_datastax%20logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac15e6989ae752a9b5_notion%20logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac485cb9900ddafda3_anthropic-\n", + "logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a894d_mongodb.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacaeab9fdc6452063c_supabase.png)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac879f622b3cb30dd7_cohere-logos-\n", + "idbbhgStc3%201.png)![](https://cdn.prod.website-\n", + 
"files.com/65b8cd72835ceeacd4449a53/65c8fdacfdbb3072f5258f66_hugging%20face.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdaceb29ce1602beb431_logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac5f6f2a8c34e5575b_wblogo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a8941_mosaic.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac5092327565075208_aws.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacb28fe27c7784c797_goggle%20drive.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac325d487977a3398b_milvus.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac6348e83137a80c17_openai.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac0d888384ad7d31f3_redis.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacf9d2dfca1d2a4c81_google%20cloud.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac76b6b8b79414144f_datastax%20logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac15e6989ae752a9b5_notion%20logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdac485cb9900ddafda3_anthropic-\n", + "logo.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdade49955197d2a894d_mongodb.png)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c8fdacaeab9fdc6452063c_supabase.png)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667b080e4b3ca12dc5d5d439_Langgraph%20UI-2.webp)\n", + "\n", + "## Controllable cognitive architecture for any task\n", + "\n", + "LangGraph's flexible framework supports diverse control flows – single agent,\n", + "multi-agent, hierarchical, sequential – and robustly handles realistic,\n", + "complex scenarios. \n", + " \n", + "Ensure reliability with easy-to-add moderation and quality loops that prevent\n", + "agents from veering off course. \n", + " \n", + "Use LangGraph Platform to templatize your cognitive architecture so that\n", + "tools, prompts, and models are easily configurable with LangGraph Platform\n", + "Assistants.\n", + "\n", + "[See the docs ](https://langchain-ai.github.io/langgraph/)\n", + "\n", + "## Designed for human-agent collaboration\n", + "\n", + "With built-in statefulness, LangGraph agents seamlessly collaborate with\n", + "humans by writing drafts for review and awaiting approval before acting.\n", + "Easily inspect the agent’s actions and \"time-travel\" to roll back and take a\n", + "different action to correct course.\n", + "\n", + "[Read a conceptual guide ](https://langchain-\n", + "ai.github.io/langgraph/concepts/agentic_concepts/#human-in-the-loop)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c93d559216bb904fe85a8_gif7%20\\(1\\).gif)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c57f274b66a77e2a26b82_CleanShot2024-06-26at17.08.03-ezgif.com-\n", + "video-to-gif-converter.gif)\n", + "\n", + "## First class streaming support for better UX design\n", + "\n", + "Bridge user expectations and agent capabilities with native token-by-token\n", + "streaming and streaming of intermediate steps, helpful for showing agent\n", + "reasoning and actions back to the user as they happen. 
Use LangGraph\n", + "Platform's API to deliver dynamic and interactive user experiences.\n", + "\n", + "[Learn more ](https://langchain-ai.github.io/langgraph/how-tos/streaming-\n", + "tokens/)\n", + "\n", + "## Why choose LangGraph?\n", + "\n", + "### Control, moderate, and guide your agent’s actions.\n", + "\n", + "Prevent agents from veering off course and ensure reliability with easy-to-add\n", + "moderation and quality loops. Add human-in-the-loop to steer and approve agent\n", + "actions.\n", + "\n", + "### Expressive and customizable agent and multi-agent workflows.\n", + "\n", + "LangGraph’s low level abstractions offer the flexibility needed to create\n", + "sophisticated agents. Design diverse control flows – single, multi-agent,\n", + "hierarchical, sequential – all with one framework.\n", + "\n", + "### Persisted context for long-term interactions.\n", + "\n", + "With its stateful design, LangGraph stores conversation histories and session\n", + "data to maintain context over time and ensure smooth handoffs in agentic\n", + "systems.\n", + "\n", + "### First-class streaming support for better UX design.\n", + "\n", + "Bridge user expectations and agent capabilities with native token-by-token\n", + "streaming of intermediate steps, helpful for showing agent reasoning and\n", + "actions back to the user as they happen.\n", + "\n", + "## LangGraph Platform: \n", + "Deploy & develop agents at scale\n", + "\n", + "Craft agent-appropriate UXs using LangGraph Platform's APIs. Quickly deploy\n", + "and scale your agent with purpose-built infrastructure. Choose from multiple\n", + "deployment options.\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/67878de387cf10f90c7ad65f_LangGraph---\n", + "Memory-HQ.gif)\n", + "\n", + "## Dynamic APIs for designing agent UXs.\n", + "\n", + "Craft personalized experiences with the long-term memory API to recall\n", + "information across conversation sessions. Expose, update, and rewind your\n", + "app's state for better user visibility, steering, and interaction. Kick off\n", + "long-running background jobs for research-style or multi-step work.\n", + "\n", + "[See the docs ](https://langchain-ai.github.io/langgraph/how-tos/streaming-\n", + "tokens/)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/67879a0dd9100d8e643eb39e_LangGraph%20-%20Fault-\n", + "tolerant%20scalability.gif)\n", + "\n", + "## Fault-tolerant scalability.\n", + "\n", + "Handle large workloads gracefully with horizontally-scaling servers, task\n", + "queues, and built-in persistence. Enhance resilience with intelligent caching\n", + "and automated retries.\n", + "\n", + "[Learn more in the blog ](https://langchain-ai.github.io/langgraph/how-\n", + "tos/streaming-tokens/)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c93d559216bb904fe85a8_gif7%20\\(1\\).gif)\n", + "\n", + "## An end-to-end agent experience.\n", + "\n", + "Simplify prototyping, debugging, and sharing of agents in our visual LangGraph\n", + "Studio. Deploy your application with 1-click deploy with our SaaS offering or\n", + "within your own VPC. 
Then, monitor app performance with LangSmith.\n", + "\n", + "[Discover LangGraph Studio ](https://langchain-ai.github.io/langgraph/how-\n", + "tos/streaming-tokens/)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/66db8c2317fe5b9ad2b84ea0_lcacademylogo.png)\n", + "\n", + "## Introduction to LangGraph\n", + "\n", + "Learn the basics of LangGraph in this LangChain Academy Course. You'll learn\n", + "how to build agents that automate real-world tasks with LangGraph\n", + "orchestration.\n", + "\n", + "[Enroll for free](https://academy.langchain.com/courses/intro-to-\n", + "langgraph)[Book enterprise\n", + "training](https://airtable.com/appGjCAN6126Jm7K8/pagNAp7niHQzRH8zk/form)\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6787ae429071ad3575902249_card%201%201.webp)![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6787ae0bce5c99dd808545ce_card%202.webp)\n", + "\n", + "## Deploy agents at scale, monitor carefully, iterate boldly\n", + "\n", + "Design agent-driven user experiences with LangGraph Platform's APIs. Quickly\n", + "deploy and scale your application with infrastructure built for agents. Choose\n", + "from multiple deployment options.\n", + "\n", + "### Fault-tolerant scalability\n", + "\n", + "Handle large workloads gracefully with horizontally-scaling servers, task\n", + "queues, and built-in persistence. Enhance resilience with intelligent caching\n", + "and automated retries.\n", + "\n", + "### Dynamic APIs for designing agent experience\n", + "\n", + "Craft personalized user experiences with APIs featuring long-term memory to\n", + "recall information across conversation sessions. Track, update, and rewind\n", + "your app's state for easy human steering and interaction. Kick off long-\n", + "running background jobs for research-style or multi-step work.\n", + "\n", + "### Integrated developer experience\n", + "\n", + "Simplify prototyping, debugging, and sharing of agents in our visual LangGraph\n", + "Studio. Deploy your application with 1-click deploy with our SaaS offering or\n", + "within your own VPC. Then, monitor app performance with LangSmith.\n", + "\n", + "### Trusted by companies taking agency in AI innovation:\n", + "\n", + "LangGraph helps teams of all sizes, across all industries, from ambitious\n", + "startups to established enterprises.\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c5308aea1371b447cc4af9_elastic-ar21.png)\n", + "\n", + "“LangChain is streets ahead with what they've put forward with LangGraph.\n", + "LangGraph sets the foundation for how we can build and scale AI workloads —\n", + "from conversational agents, complex task automation, to custom LLM-backed\n", + "experiences that 'just work'. The next chapter in building complex production-\n", + "ready features with LLMs is agentic, and with LangGraph and LangSmith,\n", + "LangChain delivers an out-of-the-box solution to iterate quickly, debug\n", + "immediately, and scale effortlessly.”\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667b26a1b4576291d6a9335b_garrett%20spong%201.webp)\n", + "\n", + "Garrett Spong\n", + "\n", + "Principal SWE\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6679de9dc4e7bee218d4b058_Norwegian-Cruise-\n", + "Line-Logo%202-2.webp)\n", + "\n", + "“LangGraph has been instrumental for our AI development. 
Its robust framework\n", + "for building stateful, multi-actor applications with LLMs has transformed how\n", + "we evaluate and optimize the performance of our AI guest-facing solutions.\n", + "LangGraph enables granular control over the agent's thought process, which has\n", + "empowered us to make data-driven and deliberate decisions to meet the diverse\n", + "needs of our guests.”\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667b265bed5f5a9d26d6b7d6_andres%20torres%201.webp)\n", + "\n", + "Andres Torres\n", + "\n", + "Sr. Solutions Architect\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c6f809f0ebc7b1d72a99b_Replit.png)\n", + "\n", + "“It's easy to build the prototype of a coding agent, but deceptively hard to\n", + "improve its reliability. Replit wants to give a coding agent to millions of\n", + "users — reliability is our top priority, and will remain so for a long time.\n", + "LangGraph is giving us the control and ergonomics we need to build and ship\n", + "powerful coding agents.”\n", + "\n", + "“As Ally advances its exploration of Generative AI,\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c6fcaaa21bcf2fe006dbe_1690576438641%20\\(1\\)%201.webp)\n", + "\n", + "Michele Catasta\n", + "\n", + "President\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6679e1baf7ea357d0763cde1_ally-\n", + "bank%201-2.png)\n", + "\n", + "“As Ally advances its exploration of Generative AI, our tech labs is excited\n", + "by LangGraph, the new library from LangChain, which is central to our\n", + "experiments with multi-actor agentic workflows. We are committed to deepening\n", + "our partnership with LangChain.”\n", + "\n", + "“As Ally advances its exploration of Generative AI,\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6679e2d31352c6bd56c84280_ally.png)\n", + "\n", + "Sathish Muthukrishnan\n", + "\n", + "Chief Information, Data and Digital Officer\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/65c5308aea1371b447cc4af9_elastic-ar21.png)\n", + "\n", + "“LangChain is streets ahead with what they've put forward with LangGraph.\n", + "LangGraph sets the foundation for how we can build and scale AI workloads —\n", + "from conversational agents, complex task automation, to custom LLM-backed\n", + "experiences that 'just work'. The next chapter in building complex production-\n", + "ready features with LLMs is agentic, and with LangGraph and LangSmith,\n", + "LangChain delivers an out-of-the-box solution to iterate quickly, debug\n", + "immediately, and scale effortlessly.”\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667b26a1b4576291d6a9335b_garrett%20spong%201.webp)\n", + "\n", + "Garrett Spong\n", + "\n", + "Principal SWE\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6679de9dc4e7bee218d4b058_Norwegian-Cruise-\n", + "Line-Logo%202-2.webp)\n", + "\n", + "“LangGraph has been instrumental for our AI development. 
Its robust framework\n", + "for building stateful, multi-actor applications with LLMs has transformed how\n", + "we evaluate and optimize the performance of our AI guest-facing solutions.\n", + "LangGraph enables granular control over the agent's thought process, which has\n", + "empowered us to make data-driven and deliberate decisions to meet the diverse\n", + "needs of our guests.”\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667b265bed5f5a9d26d6b7d6_andres%20torres%201.webp)\n", + "\n", + "Andres Torres\n", + "\n", + "Sr. Solutions Architect\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c6f809f0ebc7b1d72a99b_Replit.png)\n", + "\n", + "“It's easy to build the prototype of a coding agent, but deceptively hard to\n", + "improve its reliability. Replit wants to give a coding agent to millions of\n", + "users — reliability is our top priority, and will remain so for a long time.\n", + "LangGraph is giving us the control and ergonomics we need to build and ship\n", + "powerful coding agents.”\n", + "\n", + "“As Ally advances its exploration of Generative AI,\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/667c6fcaaa21bcf2fe006dbe_1690576438641%20\\(1\\)%201.webp)\n", + "\n", + "Michele Catasta\n", + "\n", + "President\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6679e1baf7ea357d0763cde1_ally-\n", + "bank%201-2.png)\n", + "\n", + "“As Ally advances its exploration of Generative AI, our tech labs is excited\n", + "by LangGraph, the new library from LangChain, which is central to our\n", + "experiments with multi-actor agentic workflows. We are committed to deepening\n", + "our partnership with LangChain.”\n", + "\n", + "“As Ally advances its exploration of Generative AI,\n", + "\n", + "![](https://cdn.prod.website-\n", + "files.com/65b8cd72835ceeacd4449a53/6679e2d31352c6bd56c84280_ally.png)\n", + "\n", + "Sathish Muthukrishnan\n", + "\n", + "Chief Information, Data and Digital Officer\n", + "\n", + "## LangGraph FAQs\n", + "\n", + "Do I need to use LangChain to use LangGraph? What’s the difference?\n", + "\n", + "No. LangGraph is an orchestration framework for complex agentic systems and is\n", + "more low-level and controllable than LangChain agents. LangChain provides a\n", + "standard interface to interact with models and other components, useful for\n", + "straight-forward chains and retrieval flows.\n", + "\n", + "How is LangGraph different from other agent frameworks?\n", + "\n", + "Other agentic frameworks can work for simple, generic tasks but fall short for\n", + "complex tasks bespoke to a company’s needs. LangGraph provides a more\n", + "expressive framework to handle companies’ unique tasks without restricting\n", + "users to a single black-box cognitive architecture.\n", + "\n", + "Does LangGraph impact the performance of my app?\n", + "\n", + "LangGraph will not add any overhead to your code and is specifically designed\n", + "with streaming workflows in mind.\n", + "\n", + "Is LangGraph open source? Is it free?\n", + "\n", + "Yes. LangGraph is an MIT-licensed open-source library and is free to use.\n", + "\n", + "How are LangGraph and LangGraph Platform different?\n", + "\n", + "LangGraph is a stateful, orchestration framework that brings added control to\n", + "agent workflows. 
LangGraph Platform is a service for deploying and scaling\n", + "LangGraph applications, with an opinionated API for building agent UXs, plus\n", + "an integrated developer studio.\n", + "\n", + "LangGraph (open source)\n", + "\n", + "LangGraph Platform\n", + "\n", + "Features\n", + "\n", + "Stateful orchestration framework for agentic applications\n", + "\n", + "Scalable infrastructure for deploying LangGraph applications \n", + "\n", + "Python and JavaScript\n", + "\n", + "Python and JavaScript \n", + "\n", + "None\n", + "\n", + "Yes - useful for retrieving & updating state or long-term memory, or creating\n", + "a configurable assistant \n", + "\n", + "Basic\n", + "\n", + "Dedicated mode for token-by-token messages \n", + "\n", + "Community contributed\n", + "\n", + "Supported out-of-the-box \n", + "\n", + "Self-managed\n", + "\n", + "Managed Postgres with efficient storage \n", + "\n", + "Self-managed\n", + "\n", + "\\- Cloud SaaS \n", + "\\- Free self-hosted \n", + "\\- Enterprise \n", + "(BYOC or paid self-hosted) \n", + "\n", + "Self-managed\n", + "\n", + "Auto-scaling of task queues and servers \n", + "\n", + "Self-managed\n", + "\n", + "Automated retries \n", + "\n", + "Simple threading\n", + "\n", + "Supports double-texting \n", + "\n", + "None\n", + "\n", + "Cron scheduling \n", + "\n", + "None\n", + "\n", + "Integrated with LangSmith for observability \n", + "\n", + "LangGraph Studio for Desktop\n", + "\n", + "LangGraph Studio for Desktop & Cloud \n", + "\n", + "What are my deployment options for LangGraph Platform?\n", + "\n", + "We currently have the following deployment options for LangGraph applications: \n", + " \n", + "‍**Self-Hosted Lite** : A free (up to 1M nodes executed), limited version of\n", + "LangGraph Platform that you can run locally or in a self-hosted manner. This\n", + "version requires a LangSmith API key and logs all usage to LangSmith. Fewer\n", + "features are available than in paid plans. \n", + "‍**Cloud SaaS:** Fully managed and hosted as part of LangSmith, with automatic\n", + "updates and zero maintenance. \n", + "‍**Bring Your Own Cloud (BYOC):** Deploy LangGraph Platform within your VPC,\n", + "provisioned and run as a service. Keep data in your environment while\n", + "outsourcing the management of the service. \n", + "**Self-Hosted Enterprise:** Deploy LangGraph entirely on your own\n", + "infrastructure.\n", + "\n", + "Is LangGraph Platform open source?\n", + "\n", + "No. LangGraph Platform is proprietary software. \n", + " \n", + "There is a free, self-hosted version of LangGraph Platform with access to\n", + "basic features. The Cloud SaaS deployment option is free while in beta, but\n", + "will eventually be a paid service. We will always give ample notice before\n", + "charging for a service and reward our early adopters with preferential\n", + "pricing. The Bring Your Own Cloud (BYOC) and Self-Hosted Enterprise options\n", + "are also paid services. [Contact our sales team](/contact-sales) to learn\n", + "more. 
\n", + " \n", + "For more information, see our [LangGraph Platform pricing page](/pricing-\n", + "langgraph-platform).\n", + "\n", + "## Ready to start shipping reliable GenAI apps faster?\n", + "\n", + "Get started with LangChain, LangSmith, and LangGraph to enhance your LLM app\n", + "development, from prototype to production.\n", + "\n", + "[Contact Us](/contact-sales)[Sign Up](https://smith.langchain.com/)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "LangGraph is a flexible framework designed for building and scaling agentic applications. It allows for complex task handling and human-agent collaboration, supporting various control flows such as single-agent, multi-agent, hierarchical, and sequential. Key features include:\n", + "\n", + "- **Statefulness**: LangGraph agents maintain context over time, enabling smooth interactions.\n", + "- **Streaming Support**: It provides native token-by-token streaming for better user experience.\n", + "- **Moderation and Quality Loops**: These features ensure agents remain reliable and on course.\n", + "- **Dynamic APIs**: LangGraph offers APIs for crafting personalized user experiences and managing long-term memory.\n", + "- **Deployment Options**: It supports various deployment methods, including self-hosted and cloud solutions.\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import asyncio\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "from browser_use import Agent, Browser, BrowserConfig\n", + "\n", + "# Basic configuration for the browser\n", + "config = BrowserConfig(\n", + "\theadless=True, # Run in headless mode\n", + "\t# disable_security=True # Uncomment if you want to disable security\n", + ")\n", + "\n", + "# Initialize the browser with the specified configuration\n", + "browser = Browser(config=config)\n", + "\n", + "\n", + "async def main():\n", + "\t# Initialize the agent with the task and language model\n", + "\tagent = Agent(\n", + "\t\ttask='What is Langgraph',\n", + "\t\tllm=llm, # Replace with your LLM configuration\n", + "\t\tbrowser=browser,\n", + "\t\tgenerate_gif=False, # Disable GIF generation\n", + "\t)\n", + "\n", + "\t# Run the agent and get results asynchronously\n", + "\tresult = await agent.run()\n", + "\n", + "\t# Process results token-wise\n", + "\tfor action in result.action_results():\n", + "\t\tprint(action.extracted_content, end='\\r', flush=True)\n", + "\t\tprint('\\n\\n')\n", + "\t\t# if action.is_done:\n", + "\t\t# print(action.extracted_content)\n", + "\n", + "\t# Close the browser after completion\n", + "\tawait browser.close()\n", + "\n", + "\n", + "# Run the asynchronous main function\n", + "asyncio.run(main())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "TFK-fNoLDFcF", + "outputId": "d78fbeae-c8f0-4c26-e0e3-7a0a683d3fc1" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "from browser_use import Agent, Browser\n", - "from playwright.async_api import BrowserContext\n", - "from browser_use import BrowserConfig\n", - "from langchain_openai import ChatOpenAI\n", - "# from browser_use import Agent\n", - "import asyncio\n", - "# Basic configuration\n", - "config = BrowserConfig(\n", - " headless=True,\n", - "\n", - " # disable_security=True\n", - ")\n", - "# Reuse existing browser\n", - "browser = Browser(config=config)\n", - "# async def main():\n", - "agent = Agent(\n", - " task=\"what is langchain\",\n", - " llm=llm,\n", - " browser=browser,\n", - " 
generate_gif = False # Browser instance will be reused\n", - " )\n", - "\n", - "result = await agent.run()\n", - "print(result)\n", - "# Manually close the browser\n", - "# asyncio.run(main())\n", - "await browser.close()\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TFK-fNoLDFcF", - "outputId": "d78fbeae-c8f0-4c26-e0e3-7a0a683d3fc1" - }, - "execution_count": 11, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "AgentHistoryList(all_results=[ActionResult(is_done=False, extracted_content='🔍 Searched for \"What is LangChain?\" in Google', error=None, include_in_memory=True), ActionResult(is_done=False, extracted_content=\"📄 Extracted page as markdown\\n: # Filters and Topics\\n\\n[All](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQIEhAB)\\n\\n[Images](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=2&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQtKgLegQIExAB)\\n\\n[Videos](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=7&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQtKgLegQIERAB)\\n\\n[Forums](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=18&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQs6gLegQIDxAB)\\n\\nWeb\\n\\n[Flights](/travel/flights?sca_esv=4c6b8dc13bab3e46&output=search&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&ved=1t:200715&ictx=111)\\n\\n[Finance](/finance?sca_esv=4c6b8dc13bab3e46&output=search&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQIDBAB)\\n\\nMore\\n\\n[Books](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=36&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQINxAB)\\n\\n[News](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&tbm=nws&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5
a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQINhAB)\\n\\n[Shopping](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=28&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&ved=1t:220175&ictx=111)\\n\\nTools\\n\\nAny time\\n\\nAny time\\n\\n[Past\\nhour](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:h&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAc)\\n\\n[Past 24\\nhours](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:d&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAg)\\n\\n[Past\\nweek](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:w&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAk)\\n\\n[Past\\nmonth](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:m&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAo)\\n\\n[Past\\nyear](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:y&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAs)\\n\\nCustom range...\\n\\nCustom date range\\n\\nFromTo\\n\\nGo\\n\\nAll results\\n\\nAll results\\n\\n[Verbatim](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=li:1&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEBM)\\n\\n[ Advanced Search\\n](https://www.google.com/advanced_search?q=What+is+LangChain%3F&udm=14)\\n\\nCtrl+Shift+X to select\\n\\n![Google](https://fonts.gstatic.com/s/i/productlogos/googleg/v6/24px.svg)\\n\\n# Search settings\\n\\n[Search CustomizationOff](/history/optout?hl=en)\\n\\n[SafeSearchBlurring\\non](/safesearch?prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain?%26udm%3D14&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8JsIegQIChAH)\\n\\n[LanguageEnglish](/preferences?lang=1&hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain%253F%26sca_esv%3D4c6b8dc13bab3e46%26udm%3D14#languages)\\n\\n[Dark themeDevice\\ndefault](/setprefs?hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain?%26udm%3D14%26pccc%3D1&sig=0_jfSkJcafppJyKAIkCWZpHFXzfrs%3D&cs=2&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQqsEHegQIChAJ&ictx=1)\\n\\n[More\\nsettings](/preferences?hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain%253F%26sca_esv%3D4c6b8dc13bab3e46%26udm%3D14)\\n\\nSend feedback\\n\\n[Help](https://support.google.com/websearch/?p=dsrp_search_hc&hl=en) •\\n[Privacy](https://policies.google.com/privacy?hl=en&fg=1) •\\n[Terms](https://policies.google.com/terms?hl=en&fg=1)\\n\\n# Search Results\\n\\n[ \\nLangChain![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAM1BMVEUcPDwRNjYAMC8AKSd2goaZoaapr7T//v/g4ej49/+/xMn8+/8AFRNAVliSm6BUZWfLztSDUJcgAAAAu0lEQVR4AdWRR2JFIQhFLcgF+/5XG54lPZn/M+Qo1b0iPnzBf1LRU/oC+fjuGD/gY4NANUvRSwEUEta/DAXVKtchxSaKbH99gwWaC4Tzrw/NFkTzLvCTDxxiXxbcJlChhYOL85FlRhcTzJEnJ9SxQkuatQpVSkkE3ytBlwy8pdUPA2gCbWxupV0NGRhuVEEnGad483sUgynlScV6Xf/WKHcJhmh5SqEsJ+Hz+iz6Y31n8f0L5ON/J3tB3gAtjgsX/sngiAAAAABJRU5ErkJggg==)LangChainhttps://www.langchain.com](https://www.langchain.com/)\\n\\nLangChain\\n\\nhttps://www.langchain.com\\n\\n _LangChain_ is a composable framework to build with LLMs. 
LangGraph is the\\norchestration framework for controllable agentic workflows. Run.\\n\\n\\u200e[Docs](https://python.langchain.com/docs/introduction/) ·\\n\\u200e[Products](https://www.langchain.com/langchain) · \\u200e[LangChain\\nAcademy](https://academy.langchain.com/) · \\u200e[Join the LangChain\\nCommunity](https://www.langchain.com/join-community)\\n\\n[ \\nWhat is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAflBMVEUjLz4dKjohLT0NHzFQV2FKUlwAFywnM0IaKDgzPUpWXGUVJDbq6+3i4+X29/jLzc99gogAABubnqP///9yd393fIPY2twAAAAAAB8AACK1t7ujpqsADicAFitiaHGGi5GUmJ1pb3cAFCqJjpQ8RlIuOUZDS1errrEGHC/DxslAWrmhAAAA1UlEQVR4Ad2OhWGFMBBAI0iIlhzuTth/wHqLjPBf5FzQ64Hx10++H8H3GPX8IMQEE8JCGnFC0ImQSps3GVuIE5lCpii6EOQFhFAaHVV1ZvPm1rWSGbSqk3UvvQ70cKlkI8QFUGtMZ3QzxRz4uRPmMBvoFrAlVEVlB4jIpW1S8W6l/SLSjfF93xw6IZPDDCFBvi52Sd2zs+1haSB+OxHhzz2Is3KycKRomtp2mthYyTFr0YlbKwCtTJZp0LWbO4YuEBd09WHMYXlDCWPoAaMuCBzF6BX5AC2JD1u/hbEIAAAAAElFTkSuQmCC)Amazon\\nWeb Serviceshttps://aws.amazon.com › ... › Generative\\nAI](https://aws.amazon.com/what-is/langchain/)\\n\\nAmazon Web Services\\n\\nhttps://aws.amazon.com › ... › Generative AI\\n\\nLangChain _provides AI developers with tools to connect language models with\\nexternal data sources_. It is open-source and supported by an active\\ncommunity.\\n\\n[ \\nWhat Is LangChain and How to Use It: A\\nGuide![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAANlBMVEVHcEwAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkT2h/2dAAAAEnRSTlMASA176IbPqP9pXzX1LR7fI79igdKzAAAA60lEQVR4Ab2SR2IDMQgAR7BoEYuK///YVHf7msxJojf+g1J4i+hm1Erd3/hsvhVEaCH7wQPh2YAeB4wM7ik+F+uEuacC7c5XMocUCWCYVyHtpjQPSoW278GYFeHGNllCn1W1zjVcaSfOHG7UYBqATSzvlOEFodXzj+V39aivbuzKDz3I4FRuyvCbspCxXG9hDx9xH7Z4nJXdjbRzQdKwxLzftaI+1qzai7FcmdtdRY06B20vsGalud7Gt+WQ6jZgmVdZucnT4DU901NZ08vryo6IA1p6vCx7Wlmr2M/WX8/Ef9hUeEMP1ej8OZ+MHAj3YNWlQgAAAABJRU5ErkJggg==)TechTargethttps://www.techtarget.com\\n› definition ›\\nLangChain](https://www.techtarget.com/searchenterpriseai/definition/LangChain)\\n\\nTechTarget\\n\\nhttps://www.techtarget.com › definition › LangChain\\n\\n _LangChain is an open source framework_ that enables software developers\\nworking with artificial intelligence (AI) and its machine learning subset to\\ncombine ...\\n\\n[ \\nIntroduction | 🦜️ LangChain![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAPFBMVEUdPT1OZGZzg4fT194cPDwUNzf///8dPT0ePj75+P/y8vrAxcw6UlQGMjGSnqMsSEnk5u2Cj5OrtbpgdHaG8/c5AAAACXRSTlPv////////b24kxPwmAAAA1klEQVQokcWS2Y7DIAxFsR3TYhaz/P+/DkvSppFSaR5Gcx+Q4HjjgnludzJPY25hx1/YX0P+0Bkya4CTgm58QFYk+yEqyguyVmfJZ3coZysp8MpM4nKIfV3ypdROZyYD9eCiwe8MPYFYAu4w4kjJLS7qoQdv4gTjgMX2M0mRlSaDFqp1tiw4q5FybCJAhFpH+ITcaPXaQiTpDXGWXz37tGMjtaWSrEesMtvsJoQ6JvKeJI9Lzjr1uCeHdHVoerB7q9DwpAZvb69v8nqW//wmv4bGPO7x4weTRBHU/VcIdwAAAABJRU5ErkJggg==)LangChainhttps://python.langchain.com › docs › introduction](https://python.langchain.com/docs/introduction/)\\n\\nLangChain\\n\\nhttps://python.langchain.com › docs › introduction\\n\\n _LangChain_ is a framework for developing applications powered by large\\nlanguage models (LLMs). 
LangChain simplifies every stage of the LLM\\napplication lifecycle.\\n\\n\\u200e[Introduction](https://python.langchain.com/v0.1/docs/get_started/introduction/)\\n·\\n\\u200e[Langchain.agents...](https://api.python.langchain.com/en/latest/agents/langchain.agents.tool_calling_agent.base.create_tool_calling_agent.html)\\n· \\u200e[LangChain v0.3](https://python.langchain.com/docs/versions/v0_3/) ·\\n\\u200e[Langchain_core.tools.](https://api.python.langchain.com/en/latest/tools/langchain_core.tools.tool.html)\\n\\n[ \\nWhat Is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAQlBMVEVHcEwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABz07T7AAAAFnRSTlMABWTNoAuOPcGA32tTRXW1FyYt7PT+Xc8YuAAAANZJREFUeAHNx8t1xSAMBcArQCD+AkP/rcYhXiTHKeDNbvC5yFjH5K0hvAWJKZcUJeCtSpFmbJGKN45JmHuKjBdV8AhhMFTxB4Xo5oj2umwc08VAeEBzl0uouqPQnZ4V34ZL0sZlQEw3Jpg1miQ3gLF6YMzNNT4KrwAOfQ1Yj5t4+P3oHC1u3mJNALoVIZsjV9I9AcyFVAB4AVgfDIgDUBKaLSGnCs7SD2mMmlootoGjSDcA+72O7RQwXSQyQGMqbjrHMZV+RviFH/hP20cj/Gd6ET/xwb4A8CUMDSJ3MyIAAAAASUVORK5CYII=)IBMhttps://www.ibm.com\\n› think › topics › langchain](https://www.ibm.com/think/topics/langchain)\\n\\nIBM\\n\\nhttps://www.ibm.com › think › topics › langchain\\n\\nLangChain is essentially _a library of abstractions for Python and Javascript_\\n, representing common steps and concepts necessary to work with language\\nmodels.\\n\\n[ \\nWhat is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAcElEQVR4AWP4//8/RZh6BgCZAkDsAMUNWDFCXgDFACCV8J/B+D8pGKwHRAKRAUyQDEMMQAYEUGBAAsiABpwKHjz4/9/BAZ8BDXgNgIMNGyg04MABkg1AeCEgAK8XKA5EiqORooSELykXEJuUBz43AgAIA1ZhBoG9vwAAAABJRU5ErkJggg==)YouTube\\n· IBM Technology287.6K+ views · 10 months\\nago](https://www.youtube.com/watch?v=1bUy-1hGZpI)\\n\\nYouTube · IBM Technology\\n\\n287.6K+ views · 10 months ago\\n\\nLang chain is _an open-source orchestration framework_ for the development of\\napplications that use large language models.\\n\\n[ \\nWhat is Langchain and why should I care as a\\ndeveloper?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAQlBMVEVHcEwAAAAAAAAAAAAAAAAAAABxcXGkpKSUlJQeHh5/f3/Q0ND////e3t6rq6taWlrHx8e0tLQsLCw+Pj7u7u62trYTUwO8AAAABnRSTlMAS8D5/5dwkjMFAAAA1klEQVR4AX3TRQLEIAwFUNoGhypz/6vOJ9SFrAIPFyFE03b0iK5tBELSR0j0o89oRPuNrei+sRNUiYJKa20slXAoqBOSDyG4klqkns6oURNLapD2F+x7VA2cjvqOkwWOZfq+oPLTjiN0zh3nibHHGnYcgJpo8cTosIQdZ4pQJIoRpf6MjncTiRFL8H1/oE3YjTEFF972gZR3k2jH/oILL2kfNl2QsBu7Yl7eeEGF8oq8vLSi56NLA+d88D/ofmW5K5vqy5Upj56VqD+T6gOrPs3qo659hz8m8RNl7wTa8QAAAABJRU5ErkJggg==)Medium\\n· Logan Kilpatrick370+ likes · 1 year ago](https://medium.com/around-the-\\nprompt/what-is-langchain-and-why-should-i-care-as-a-developer-b2d952c42b28)\\n\\nMedium · Logan Kilpatrick\\n\\n370+ likes · 1 year ago\\n\\n _Langchain_ makes creating agents using large language models simple through\\ntheir agents API. 
Developers can use OpenAI functions or other means ...\\n\\n[ \\nLangChain![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAAnklEQVR4AeTNIQiDQABG4b+u17X1aF6PK3YEO9iMJqPVau82y4FgMezS0oVLhqsHtrcqeqzDXv3CEz/6L4yTtZM3dnHmPTtjzXZAXKYVo4agkU2GI2Lloc6JDez1+flswMu1EQZ3xlE7lK8eKDkjtwE+crBMV+wesKmCiisGGepZIfQJpMj9SNb2MYWrChjVkULuCyCfRvsdmBieyQQAsoDk/9ryhFMAAAAASUVORK5CYII=)Wikipediahttps://en.wikipedia.org\\n› wiki › LangChain](https://en.wikipedia.org/wiki/LangChain)\\n\\nWikipedia\\n\\nhttps://en.wikipedia.org › wiki › LangChain\\n\\nLangChain is a software framework that helps facilitate the integration of\\nlarge language models (LLMs) into applications.\\n\\n\\u200e[History](https://en.wikipedia.org/wiki/LangChain#History) ·\\n\\u200e[Capabilities](https://en.wikipedia.org/wiki/LangChain#Capabilities) ·\\n\\u200e[LangChain tools](https://en.wikipedia.org/wiki/LangChain#LangChain_tools)\\n\\n[ \\nWhat Is LangChain? A Complete Comprehensive\\nOverview![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAMFBMVEX///////////8AAADNzc2/v7+np6eOjo7x8fGenp4mJibe3t5BQUFdXV1oaGh9fX0JTbfNAAAAAnRSTlP8WKsquk8AAAB7SURBVCiR1ZNLDoAgDAWhRSgf8f63lT8GhZULndWjk7ShAcYZTGCcTV2wCxfs76TdMhQLVA5VaiwIAFFzl4eMOCRCJzNdpiawR+mHmRcJrnS1TxKUSaTSTWYE6ia9ipggZUrKoxyvEgbVmbotQWSoZ/vCbr8ll4969R1OiO0IjOTl5agAAAAASUVORK5CYII=)DataStaxhttps://www.datastax.com\\n› guides › what-is-langchain](https://www.datastax.com/guides/what-is-\\nlangchain)\\n\\nDataStax\\n\\nhttps://www.datastax.com › guides › what-is-langchain\\n\\nNov 9, 2023 — LangChain is _a Python framework designed to streamline AI\\napplication development_ , focusing on real-time data processing and\\nintegration with ...\\n\\n[ \\nWhat Is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAYAAAByDd+UAAABp0lEQVR4AWJwL/ChKx4aFt5K9AFUW5cADYVRGIZxqxRcOu7uVnC33hPuW0+QiHgl4m6ZXnBouP7cDz1czj/X8M53nu26N7I8SICLwmSN0uFFQbKg4TW8h89YBMQwFSINnzUHBHZsKIauCmLFcUHugZGg6RjuK4YuRb729swoEL+SG0rW2TjC43+Y5lEUaG9EnvZ2ngWZf5aNL5/npr7Qe/yI295Af/Xn8RreoxgpSy+IL181xYnbseA32uumeybel4V/pMLQLg+SX4vhL6sugva86InQtVKJDCUQ6S6MBZVBEUpqQJaGB28HpSgDCmOS/MNEAFwUBDZpDMZtPAj/RAKiUQLqXmxYbzzGh+Gyf+mCrY/BJskAikZwgBFbbRYGtatBfhcwLgxnwHYORCUWAMtkYKIavF3027IAuMuAiexG87boIoBGTjXlJs1WhnNhi+TCUA5DdCvVUAz3pXMVInqmTiTN1P4rca6IHjcN7HbwB0TKPzpjMIuA9HT15zICKMEsAgLD7L8gKXGmehBDLQSOGnzGxwYDXBbWCd9Np1KZc1+XOhX4DttSLI3wbnoRAAAAAElFTkSuQmCC)Google\\nCloudhttps://cloud.google.com › use-cases ›\\nlangchain](https://cloud.google.com/use-cases/langchain)\\n\\nGoogle Cloud\\n\\nhttps://cloud.google.com › use-cases › langchain\\n\\n _LangChain_ is a programming language platform that lets developers construct\\nand connect models to access, transform, and share data seamlessly.\\n\\n\\u200e[Langchain And Ai](https://cloud.google.com/use-\\ncases/langchain#:~:text=LangChain%20and%20AI) · \\u200e[How Does Langchain\\nWork?](https://cloud.google.com/use-\\ncases/langchain#:~:text=How%20does%20LangChain%20work%3F) · \\u200e[Key Features Of\\nLangchain](https://cloud.google.com/use-\\ncases/langchain#:~:text=Key%20features%20of%20LangChain)\\n\\n# Page Navigation\\n\\n| 
1|\\n[2](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=10&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAE)|\\n[3](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=20&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAG)|\\n[4](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=30&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAI)|\\n[5](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=40&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAK)|\\n[6](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=50&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAM)|\\n[7](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=60&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAO)|\\n[8](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=70&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAQ)|\\n[9](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=80&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAS)|\\n[10](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=90&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAU)|\\n[Next](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=10&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8NMDegQICBAW) \\n---|---|---|---|---|---|---|---|---|---|---|--- \\n \\n# Footer Links\\n\\nWasco County, Oregon \\\\- From your IP address\\n\\n\\\\-\\n\\nUpdate location\\n\\nCan't update your locationLearn more\\n\\nUpdating location...\\n\\n[Help](https://support.google.com/websearch/?p=ws_results_help&hl=en&fg=1)Send\\nfeedback[Privacy](https://policies.google.com/privacy?hl=en&fg=1)[Terms](https://policies.google.com/terms?hl=en&fg=1)\\n\\n\\n\", error=None, include_in_memory=False), ActionResult(is_done=True, extracted_content='LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. 
LangChain provides tools for developers to streamline the application lifecycle of LLMs.', error=None, include_in_memory=False)], all_model_outputs=[{'search_google': {'query': 'What is LangChain?'}}, {'extract_content': {'include_links': True}}, {'done': {'text': 'LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. LangChain provides tools for developers to streamline the application lifecycle of LLMs.'}}])\n" - ] - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "AgentHistoryList(all_results=[ActionResult(is_done=False, extracted_content='🔍 Searched for \"What is LangChain?\" in Google', error=None, include_in_memory=True), ActionResult(is_done=False, extracted_content=\"📄 Extracted page as markdown\\n: # Filters and Topics\\n\\n[All](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQIEhAB)\\n\\n[Images](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=2&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQtKgLegQIExAB)\\n\\n[Videos](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=7&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQtKgLegQIERAB)\\n\\n[Forums](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=18&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQs6gLegQIDxAB)\\n\\nWeb\\n\\n[Flights](/travel/flights?sca_esv=4c6b8dc13bab3e46&output=search&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&ved=1t:200715&ictx=111)\\n\\n[Finance](/finance?sca_esv=4c6b8dc13bab3e46&output=search&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQIDBAB)\\n\\nMore\\n\\n[Books](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=36&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIH
egcJAMQ0pQJegQINxAB)\\n\\n[News](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&tbm=nws&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQINhAB)\\n\\n[Shopping](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=28&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&ved=1t:220175&ictx=111)\\n\\nTools\\n\\nAny time\\n\\nAny time\\n\\n[Past\\nhour](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:h&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAc)\\n\\n[Past 24\\nhours](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:d&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAg)\\n\\n[Past\\nweek](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:w&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAk)\\n\\n[Past\\nmonth](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:m&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAo)\\n\\n[Past\\nyear](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:y&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAs)\\n\\nCustom range...\\n\\nCustom date range\\n\\nFromTo\\n\\nGo\\n\\nAll results\\n\\nAll results\\n\\n[Verbatim](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=li:1&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEBM)\\n\\n[ Advanced Search\\n](https://www.google.com/advanced_search?q=What+is+LangChain%3F&udm=14)\\n\\nCtrl+Shift+X to select\\n\\n![Google](https://fonts.gstatic.com/s/i/productlogos/googleg/v6/24px.svg)\\n\\n# Search settings\\n\\n[Search CustomizationOff](/history/optout?hl=en)\\n\\n[SafeSearchBlurring\\non](/safesearch?prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain?%26udm%3D14&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8JsIegQIChAH)\\n\\n[LanguageEnglish](/preferences?lang=1&hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain%253F%26sca_esv%3D4c6b8dc13bab3e46%26udm%3D14#languages)\\n\\n[Dark themeDevice\\ndefault](/setprefs?hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain?%26udm%3D14%26pccc%3D1&sig=0_jfSkJcafppJyKAIkCWZpHFXzfrs%3D&cs=2&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQqsEHegQIChAJ&ictx=1)\\n\\n[More\\nsettings](/preferences?hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain%253F%26sca_esv%3D4c6b8dc13bab3e46%26udm%3D14)\\n\\nSend feedback\\n\\n[Help](https://support.google.com/websearch/?p=dsrp_search_hc&hl=en) •\\n[Privacy](https://policies.google.com/privacy?hl=en&fg=1) •\\n[Terms](https://policies.google.com/terms?hl=en&fg=1)\\n\\n# Search Results\\n\\n[ 
\\nLangChain![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAM1BMVEUcPDwRNjYAMC8AKSd2goaZoaapr7T//v/g4ej49/+/xMn8+/8AFRNAVliSm6BUZWfLztSDUJcgAAAAu0lEQVR4AdWRR2JFIQhFLcgF+/5XG54lPZn/M+Qo1b0iPnzBf1LRU/oC+fjuGD/gY4NANUvRSwEUEta/DAXVKtchxSaKbH99gwWaC4Tzrw/NFkTzLvCTDxxiXxbcJlChhYOL85FlRhcTzJEnJ9SxQkuatQpVSkkE3ytBlwy8pdUPA2gCbWxupV0NGRhuVEEnGad483sUgynlScV6Xf/WKHcJhmh5SqEsJ+Hz+iz6Y31n8f0L5ON/J3tB3gAtjgsX/sngiAAAAABJRU5ErkJggg==)LangChainhttps://www.langchain.com](https://www.langchain.com/)\\n\\nLangChain\\n\\nhttps://www.langchain.com\\n\\n _LangChain_ is a composable framework to build with LLMs. LangGraph is the\\norchestration framework for controllable agentic workflows. Run.\\n\\n\\u200e[Docs](https://python.langchain.com/docs/introduction/) ·\\n\\u200e[Products](https://www.langchain.com/langchain) · \\u200e[LangChain\\nAcademy](https://academy.langchain.com/) · \\u200e[Join the LangChain\\nCommunity](https://www.langchain.com/join-community)\\n\\n[ \\nWhat is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAflBMVEUjLz4dKjohLT0NHzFQV2FKUlwAFywnM0IaKDgzPUpWXGUVJDbq6+3i4+X29/jLzc99gogAABubnqP///9yd393fIPY2twAAAAAAB8AACK1t7ujpqsADicAFitiaHGGi5GUmJ1pb3cAFCqJjpQ8RlIuOUZDS1errrEGHC/DxslAWrmhAAAA1UlEQVR4Ad2OhWGFMBBAI0iIlhzuTth/wHqLjPBf5FzQ64Hx10++H8H3GPX8IMQEE8JCGnFC0ImQSps3GVuIE5lCpii6EOQFhFAaHVV1ZvPm1rWSGbSqk3UvvQ70cKlkI8QFUGtMZ3QzxRz4uRPmMBvoFrAlVEVlB4jIpW1S8W6l/SLSjfF93xw6IZPDDCFBvi52Sd2zs+1haSB+OxHhzz2Is3KycKRomtp2mthYyTFr0YlbKwCtTJZp0LWbO4YuEBd09WHMYXlDCWPoAaMuCBzF6BX5AC2JD1u/hbEIAAAAAElFTkSuQmCC)Amazon\\nWeb Serviceshttps://aws.amazon.com › ... › Generative\\nAI](https://aws.amazon.com/what-is/langchain/)\\n\\nAmazon Web Services\\n\\nhttps://aws.amazon.com › ... › Generative AI\\n\\nLangChain _provides AI developers with tools to connect language models with\\nexternal data sources_. 
It is open-source and supported by an active\\ncommunity.\\n\\n[ \\nWhat Is LangChain and How to Use It: A\\nGuide![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAANlBMVEVHcEwAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkQAIkT2h/2dAAAAEnRSTlMASA176IbPqP9pXzX1LR7fI79igdKzAAAA60lEQVR4Ab2SR2IDMQgAR7BoEYuK///YVHf7msxJojf+g1J4i+hm1Erd3/hsvhVEaCH7wQPh2YAeB4wM7ik+F+uEuacC7c5XMocUCWCYVyHtpjQPSoW278GYFeHGNllCn1W1zjVcaSfOHG7UYBqATSzvlOEFodXzj+V39aivbuzKDz3I4FRuyvCbspCxXG9hDx9xH7Z4nJXdjbRzQdKwxLzftaI+1qzai7FcmdtdRY06B20vsGalud7Gt+WQ6jZgmVdZucnT4DU901NZ08vryo6IA1p6vCx7Wlmr2M/WX8/Ef9hUeEMP1ej8OZ+MHAj3YNWlQgAAAABJRU5ErkJggg==)TechTargethttps://www.techtarget.com\\n› definition ›\\nLangChain](https://www.techtarget.com/searchenterpriseai/definition/LangChain)\\n\\nTechTarget\\n\\nhttps://www.techtarget.com › definition › LangChain\\n\\n _LangChain is an open source framework_ that enables software developers\\nworking with artificial intelligence (AI) and its machine learning subset to\\ncombine ...\\n\\n[ \\nIntroduction | 🦜️ LangChain![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAPFBMVEUdPT1OZGZzg4fT194cPDwUNzf///8dPT0ePj75+P/y8vrAxcw6UlQGMjGSnqMsSEnk5u2Cj5OrtbpgdHaG8/c5AAAACXRSTlPv////////b24kxPwmAAAA1klEQVQokcWS2Y7DIAxFsR3TYhaz/P+/DkvSppFSaR5Gcx+Q4HjjgnludzJPY25hx1/YX0P+0Bkya4CTgm58QFYk+yEqyguyVmfJZ3coZysp8MpM4nKIfV3ypdROZyYD9eCiwe8MPYFYAu4w4kjJLS7qoQdv4gTjgMX2M0mRlSaDFqp1tiw4q5FybCJAhFpH+ITcaPXaQiTpDXGWXz37tGMjtaWSrEesMtvsJoQ6JvKeJI9Lzjr1uCeHdHVoerB7q9DwpAZvb69v8nqW//wmv4bGPO7x4weTRBHU/VcIdwAAAABJRU5ErkJggg==)LangChainhttps://python.langchain.com › docs › introduction](https://python.langchain.com/docs/introduction/)\\n\\nLangChain\\n\\nhttps://python.langchain.com › docs › introduction\\n\\n _LangChain_ is a framework for developing applications powered by large\\nlanguage models (LLMs). 
LangChain simplifies every stage of the LLM\\napplication lifecycle.\\n\\n\\u200e[Introduction](https://python.langchain.com/v0.1/docs/get_started/introduction/)\\n·\\n\\u200e[Langchain.agents...](https://api.python.langchain.com/en/latest/agents/langchain.agents.tool_calling_agent.base.create_tool_calling_agent.html)\\n· \\u200e[LangChain v0.3](https://python.langchain.com/docs/versions/v0_3/) ·\\n\\u200e[Langchain_core.tools.](https://api.python.langchain.com/en/latest/tools/langchain_core.tools.tool.html)\\n\\n[ \\nWhat Is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAQlBMVEVHcEwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABz07T7AAAAFnRSTlMABWTNoAuOPcGA32tTRXW1FyYt7PT+Xc8YuAAAANZJREFUeAHNx8t1xSAMBcArQCD+AkP/rcYhXiTHKeDNbvC5yFjH5K0hvAWJKZcUJeCtSpFmbJGKN45JmHuKjBdV8AhhMFTxB4Xo5oj2umwc08VAeEBzl0uouqPQnZ4V34ZL0sZlQEw3Jpg1miQ3gLF6YMzNNT4KrwAOfQ1Yj5t4+P3oHC1u3mJNALoVIZsjV9I9AcyFVAB4AVgfDIgDUBKaLSGnCs7SD2mMmlootoGjSDcA+72O7RQwXSQyQGMqbjrHMZV+RviFH/hP20cj/Gd6ET/xwb4A8CUMDSJ3MyIAAAAASUVORK5CYII=)IBMhttps://www.ibm.com\\n› think › topics › langchain](https://www.ibm.com/think/topics/langchain)\\n\\nIBM\\n\\nhttps://www.ibm.com › think › topics › langchain\\n\\nLangChain is essentially _a library of abstractions for Python and Javascript_\\n, representing common steps and concepts necessary to work with language\\nmodels.\\n\\n[ \\nWhat is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAcElEQVR4AWP4//8/RZh6BgCZAkDsAMUNWDFCXgDFACCV8J/B+D8pGKwHRAKRAUyQDEMMQAYEUGBAAsiABpwKHjz4/9/BAZ8BDXgNgIMNGyg04MABkg1AeCEgAK8XKA5EiqORooSELykXEJuUBz43AgAIA1ZhBoG9vwAAAABJRU5ErkJggg==)YouTube\\n· IBM Technology287.6K+ views · 10 months\\nago](https://www.youtube.com/watch?v=1bUy-1hGZpI)\\n\\nYouTube · IBM Technology\\n\\n287.6K+ views · 10 months ago\\n\\nLang chain is _an open-source orchestration framework_ for the development of\\napplications that use large language models.\\n\\n[ \\nWhat is Langchain and why should I care as a\\ndeveloper?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAQlBMVEVHcEwAAAAAAAAAAAAAAAAAAABxcXGkpKSUlJQeHh5/f3/Q0ND////e3t6rq6taWlrHx8e0tLQsLCw+Pj7u7u62trYTUwO8AAAABnRSTlMAS8D5/5dwkjMFAAAA1klEQVR4AX3TRQLEIAwFUNoGhypz/6vOJ9SFrAIPFyFE03b0iK5tBELSR0j0o89oRPuNrei+sRNUiYJKa20slXAoqBOSDyG4klqkns6oURNLapD2F+x7VA2cjvqOkwWOZfq+oPLTjiN0zh3nibHHGnYcgJpo8cTosIQdZ4pQJIoRpf6MjncTiRFL8H1/oE3YjTEFF972gZR3k2jH/oILL2kfNl2QsBu7Yl7eeEGF8oq8vLSi56NLA+d88D/ofmW5K5vqy5Upj56VqD+T6gOrPs3qo659hz8m8RNl7wTa8QAAAABJRU5ErkJggg==)Medium\\n· Logan Kilpatrick370+ likes · 1 year ago](https://medium.com/around-the-\\nprompt/what-is-langchain-and-why-should-i-care-as-a-developer-b2d952c42b28)\\n\\nMedium · Logan Kilpatrick\\n\\n370+ likes · 1 year ago\\n\\n _Langchain_ makes creating agents using large language models simple through\\ntheir agents API. 
Developers can use OpenAI functions or other means ...\\n\\n[ \\nLangChain![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAAnklEQVR4AeTNIQiDQABG4b+u17X1aF6PK3YEO9iMJqPVau82y4FgMezS0oVLhqsHtrcqeqzDXv3CEz/6L4yTtZM3dnHmPTtjzXZAXKYVo4agkU2GI2Lloc6JDez1+flswMu1EQZ3xlE7lK8eKDkjtwE+crBMV+wesKmCiisGGepZIfQJpMj9SNb2MYWrChjVkULuCyCfRvsdmBieyQQAsoDk/9ryhFMAAAAASUVORK5CYII=)Wikipediahttps://en.wikipedia.org\\n› wiki › LangChain](https://en.wikipedia.org/wiki/LangChain)\\n\\nWikipedia\\n\\nhttps://en.wikipedia.org › wiki › LangChain\\n\\nLangChain is a software framework that helps facilitate the integration of\\nlarge language models (LLMs) into applications.\\n\\n\\u200e[History](https://en.wikipedia.org/wiki/LangChain#History) ·\\n\\u200e[Capabilities](https://en.wikipedia.org/wiki/LangChain#Capabilities) ·\\n\\u200e[LangChain tools](https://en.wikipedia.org/wiki/LangChain#LangChain_tools)\\n\\n[ \\nWhat Is LangChain? A Complete Comprehensive\\nOverview![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAMFBMVEX///////////8AAADNzc2/v7+np6eOjo7x8fGenp4mJibe3t5BQUFdXV1oaGh9fX0JTbfNAAAAAnRSTlP8WKsquk8AAAB7SURBVCiR1ZNLDoAgDAWhRSgf8f63lT8GhZULndWjk7ShAcYZTGCcTV2wCxfs76TdMhQLVA5VaiwIAFFzl4eMOCRCJzNdpiawR+mHmRcJrnS1TxKUSaTSTWYE6ia9ipggZUrKoxyvEgbVmbotQWSoZ/vCbr8ll4969R1OiO0IjOTl5agAAAAASUVORK5CYII=)DataStaxhttps://www.datastax.com\\n› guides › what-is-langchain](https://www.datastax.com/guides/what-is-\\nlangchain)\\n\\nDataStax\\n\\nhttps://www.datastax.com › guides › what-is-langchain\\n\\nNov 9, 2023 — LangChain is _a Python framework designed to streamline AI\\napplication development_ , focusing on real-time data processing and\\nintegration with ...\\n\\n[ \\nWhat Is\\nLangChain?![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAYAAAByDd+UAAABp0lEQVR4AWJwL/ChKx4aFt5K9AFUW5cADYVRGIZxqxRcOu7uVnC33hPuW0+QiHgl4m6ZXnBouP7cDz1czj/X8M53nu26N7I8SICLwmSN0uFFQbKg4TW8h89YBMQwFSINnzUHBHZsKIauCmLFcUHugZGg6RjuK4YuRb729swoEL+SG0rW2TjC43+Y5lEUaG9EnvZ2ngWZf5aNL5/npr7Qe/yI295Af/Xn8RreoxgpSy+IL181xYnbseA32uumeybel4V/pMLQLg+SX4vhL6sugva86InQtVKJDCUQ6S6MBZVBEUpqQJaGB28HpSgDCmOS/MNEAFwUBDZpDMZtPAj/RAKiUQLqXmxYbzzGh+Gyf+mCrY/BJskAikZwgBFbbRYGtatBfhcwLgxnwHYORCUWAMtkYKIavF3027IAuMuAiexG87boIoBGTjXlJs1WhnNhi+TCUA5DdCvVUAz3pXMVInqmTiTN1P4rca6IHjcN7HbwB0TKPzpjMIuA9HT15zICKMEsAgLD7L8gKXGmehBDLQSOGnzGxwYDXBbWCd9Np1KZc1+XOhX4DttSLI3wbnoRAAAAAElFTkSuQmCC)Google\\nCloudhttps://cloud.google.com › use-cases ›\\nlangchain](https://cloud.google.com/use-cases/langchain)\\n\\nGoogle Cloud\\n\\nhttps://cloud.google.com › use-cases › langchain\\n\\n _LangChain_ is a programming language platform that lets developers construct\\nand connect models to access, transform, and share data seamlessly.\\n\\n\\u200e[Langchain And Ai](https://cloud.google.com/use-\\ncases/langchain#:~:text=LangChain%20and%20AI) · \\u200e[How Does Langchain\\nWork?](https://cloud.google.com/use-\\ncases/langchain#:~:text=How%20does%20LangChain%20work%3F) · \\u200e[Key Features Of\\nLangchain](https://cloud.google.com/use-\\ncases/langchain#:~:text=Key%20features%20of%20LangChain)\\n\\n# Page Navigation\\n\\n| 
1|\\n[2](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=10&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAE)|\\n[3](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=20&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAG)|\\n[4](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=30&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAI)|\\n[5](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=40&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAK)|\\n[6](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=50&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAM)|\\n[7](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=60&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAO)|\\n[8](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=70&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAQ)|\\n[9](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=80&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAS)|\\n[10](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=90&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAU)|\\n[Next](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=10&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8NMDegQICBAW) \\n---|---|---|---|---|---|---|---|---|---|---|--- \\n \\n# Footer Links\\n\\nWasco County, Oregon \\\\- From your IP address\\n\\n\\\\-\\n\\nUpdate location\\n\\nCan't update your locationLearn more\\n\\nUpdating location...\\n\\n[Help](https://support.google.com/websearch/?p=ws_results_help&hl=en&fg=1)Send\\nfeedback[Privacy](https://policies.google.com/privacy?hl=en&fg=1)[Terms](https://policies.google.com/terms?hl=en&fg=1)\\n\\n\\n\", error=None, include_in_memory=False), ActionResult(is_done=True, extracted_content='LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. 
LangChain provides tools for developers to streamline the application lifecycle of LLMs.', error=None, include_in_memory=False)], all_model_outputs=[{'search_google': {'query': 'What is LangChain?'}}, {'extract_content': {'include_links': True}}, {'done': {'text': 'LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. LangChain provides tools for developers to streamline the application lifecycle of LLMs.'}}])\n" + ] + } + ], + "source": [ + "# from browser_use import Agent\n", + "import asyncio\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "from browser_use import Browser, BrowserConfig\n", + "\n", + "# Basic configuration\n", + "config = BrowserConfig(\n", + "\theadless=True,\n", + "\t# disable_security=True\n", + ")\n", + "# Reuse existing browser\n", + "browser = Browser(config=config)\n", + "# async def main():\n", + "agent = Agent(\n", + "\ttask='what is langchain',\n", + "\tllm=llm,\n", + "\tbrowser=browser,\n", + "\tgenerate_gif=False, # Browser instance will be reused\n", + ")\n", + "\n", + "result = await agent.run()\n", + "print(result)\n", + "# Manually close the browser\n", + "# asyncio.run(main())\n", + "await browser.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "nKGC936xODry", + "outputId": "de70d715-c30a-4d5b-9d25-40bd79d410de" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "# display(result.action_results())\n", - "for action in result.action_results():\n", - " if action.is_done:\n", - " print(action.extracted_content)\n", - "" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nKGC936xODry", - "outputId": "de70d715-c30a-4d5b-9d25-40bd79d410de" - }, - "execution_count": 27, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. LangChain provides tools for developers to streamline the application lifecycle of LLMs.\n" - ] - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. 
LangChain provides tools for developers to streamline the application lifecycle of LLMs.\n" + ] } - ] -} \ No newline at end of file + ], + "source": [ + "# display(result.action_results())\n", + "for action in result.action_results():\n", + "\tif action.is_done:\n", + "\t\tprint(action.extracted_content)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/simple.py b/examples/simple.py index 0d586b8bfe..c755e49fb2 100644 --- a/examples/simple.py +++ b/examples/simple.py @@ -1,3 +1,8 @@ +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import asyncio from dotenv import load_dotenv diff --git a/examples/ui/README.md b/examples/ui/README.md index 6e1bcb0349..5ffe4ddac0 100644 --- a/examples/ui/README.md +++ b/examples/ui/README.md @@ -4,4 +4,4 @@ |------------------------|-------------------|-------------------------------------------|-------------------------------------------| | `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` | | `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` | -| `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` | \ No newline at end of file +| `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` | diff --git a/examples/ui/command_line.py b/examples/ui/command_line.py index 715bb1f09d..ea2d1a5ad1 100644 --- a/examples/ui/command_line.py +++ b/examples/ui/command_line.py @@ -11,10 +11,11 @@ python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic """ -import os -import sys + import argparse import asyncio +import os +import sys # Ensure local repository (browser_use) is accessible sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -25,73 +26,73 @@ from browser_use.browser.browser import Browser, BrowserConfig from browser_use.controller.service import Controller - load_dotenv() + def get_llm(provider: str): if provider == 'anthropic': from langchain_anthropic import ChatAnthropic - api_key = os.getenv("ANTHROPIC_API_KEY") + + api_key = os.getenv('ANTHROPIC_API_KEY') if not api_key: - raise ValueError("Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.") - - return ChatAnthropic( - model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0 - ) + raise ValueError('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.') + + return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0) elif provider == 'openai': from langchain_openai import ChatOpenAI - api_key = os.getenv("OPENAI_API_KEY") + + api_key = os.getenv('OPENAI_API_KEY') if not api_key: - raise ValueError("Error: OPENAI_API_KEY is not set. Please provide a valid API key.") - + raise ValueError('Error: OPENAI_API_KEY is not set. 
Please provide a valid API key.') + return ChatOpenAI(model='gpt-4o', temperature=0.0) else: raise ValueError(f'Unsupported provider: {provider}') + def parse_arguments(): - """Parse command-line arguments.""" - parser = argparse.ArgumentParser(description="Automate browser tasks using an LLM agent.") - parser.add_argument( - '--query', - type=str, - help='The query to process', - default='go to reddit and search for posts about browser-use' - ) - parser.add_argument( - '--provider', - type=str, - choices=['openai', 'anthropic'], - default='openai', - help='The model provider to use (default: openai)', - ) - return parser.parse_args() + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description='Automate browser tasks using an LLM agent.') + parser.add_argument( + '--query', type=str, help='The query to process', default='go to reddit and search for posts about browser-use' + ) + parser.add_argument( + '--provider', + type=str, + choices=['openai', 'anthropic'], + default='openai', + help='The model provider to use (default: openai)', + ) + return parser.parse_args() + def initialize_agent(query: str, provider: str): - """Initialize the browser agent with the given query and provider.""" - llm = get_llm(provider) - controller = Controller() - browser = Browser(config=BrowserConfig()) - - return Agent( - task=query, - llm=llm, - controller=controller, - browser=browser, - use_vision=True, - max_actions_per_step=1, - ), browser + """Initialize the browser agent with the given query and provider.""" + llm = get_llm(provider) + controller = Controller() + browser = Browser(config=BrowserConfig()) + + return Agent( + task=query, + llm=llm, + controller=controller, + browser=browser, + use_vision=True, + max_actions_per_step=1, + ), browser + async def main(): - """Main async function to run the agent.""" - args = parse_arguments() - agent, browser = initialize_agent(args.query, args.provider) + """Main async function to run the agent.""" + args = parse_arguments() + agent, browser = initialize_agent(args.query, args.provider) + + await agent.run(max_steps=25) - await agent.run(max_steps=25) - - input('Press Enter to close the browser...') - await browser.close() + input('Press Enter to close the browser...') + await browser.close() -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/ui/gradio_demo.py b/examples/ui/gradio_demo.py index b67a88da16..aa3915eb37 100644 --- a/examples/ui/gradio_demo.py +++ b/examples/ui/gradio_demo.py @@ -1,5 +1,5 @@ -import os import asyncio +import os from dataclasses import dataclass from typing import List, Optional @@ -85,9 +85,7 @@ def create_ui(): placeholder='E.g., Find flights from New York to London for next week', lines=3, ) - model = gr.Dropdown( - choices=['gpt-4', 'gpt-3.5-turbo'], label='Model', value='gpt-4' - ) + model = gr.Dropdown(choices=['gpt-4', 'gpt-3.5-turbo'], label='Model', value='gpt-4') headless = gr.Checkbox(label='Run Headless', value=True) submit_btn = gr.Button('Run Task') @@ -105,4 +103,4 @@ def create_ui(): if __name__ == '__main__': demo = create_ui() - demo.launch() \ No newline at end of file + demo.launch() diff --git a/examples/ui/streamlit_demo.py b/examples/ui/streamlit_demo.py index 9948f2fd5e..39960b0c9e 100644 --- a/examples/ui/streamlit_demo.py +++ b/examples/ui/streamlit_demo.py @@ -5,9 +5,10 @@ """ +import asyncio import os import sys -import asyncio + import streamlit as st from dotenv import 
load_dotenv @@ -21,60 +22,66 @@ # Load environment variables load_dotenv() +if os.name == 'nt': + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + + # Function to get the LLM based on provider def get_llm(provider: str): - if provider == 'anthropic': - from langchain_anthropic import ChatAnthropic - api_key = os.getenv("ANTHROPIC_API_KEY") - if not api_key: - st.error("Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.") - st.stop() - - return ChatAnthropic( - model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0 - ) - elif provider == 'openai': - from langchain_openai import ChatOpenAI - api_key = os.getenv("OPENAI_API_KEY") - if not api_key: - st.error("Error: OPENAI_API_KEY is not set. Please provide a valid API key.") - st.stop() - - return ChatOpenAI(model='gpt-4o', temperature=0.0) - else: - st.error(f'Unsupported provider: {provider}') - st.stop() + if provider == 'anthropic': + from langchain_anthropic import ChatAnthropic + + api_key = os.getenv('ANTHROPIC_API_KEY') + if not api_key: + st.error('Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.') + st.stop() + + return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0) + elif provider == 'openai': + from langchain_openai import ChatOpenAI + + api_key = os.getenv('OPENAI_API_KEY') + if not api_key: + st.error('Error: OPENAI_API_KEY is not set. Please provide a valid API key.') + st.stop() + + return ChatOpenAI(model='gpt-4o', temperature=0.0) + else: + st.error(f'Unsupported provider: {provider}') + st.stop() + # Function to initialize the agent def initialize_agent(query: str, provider: str): - llm = get_llm(provider) - controller = Controller() - browser = Browser(config=BrowserConfig()) - - return Agent( - task=query, - llm=llm, - controller=controller, - browser=browser, - use_vision=True, - max_actions_per_step=1, - ), browser + llm = get_llm(provider) + controller = Controller() + browser = Browser(config=BrowserConfig()) + + return Agent( + task=query, + llm=llm, + controller=controller, + browser=browser, + use_vision=True, + max_actions_per_step=1, + ), browser + # Streamlit UI -st.title("Automated Browser Agent with LLMs 🤖") +st.title('Automated Browser Agent with LLMs 🤖') -query = st.text_input("Enter your query:", "go to reddit and search for posts about browser-use") -provider = st.radio("Select LLM Provider:", ["openai", "anthropic"], index=0) +query = st.text_input('Enter your query:', 'go to reddit and search for posts about browser-use') +provider = st.radio('Select LLM Provider:', ['openai', 'anthropic'], index=0) -if st.button("Run Agent"): - st.write("Initializing agent...") - agent, browser = initialize_agent(query, provider) +if st.button('Run Agent'): + st.write('Initializing agent...') + agent, browser = initialize_agent(query, provider) - async def run_agent(): - with st.spinner("Running automation..."): - await agent.run(max_steps=25) - st.success("Task completed! 🎉") + async def run_agent(): + with st.spinner('Running automation...'): + await agent.run(max_steps=25) + st.success('Task completed! 
🎉') - asyncio.run(run_agent()) + asyncio.run(run_agent()) - st.button("Close Browser", on_click=lambda: asyncio.run(browser.close())) + st.button('Close Browser', on_click=lambda: asyncio.run(browser.close())) diff --git a/examples/use-cases/README.md b/examples/use-cases/README.md index 90ec4c4f43..f45c0977a8 100644 --- a/examples/use-cases/README.md +++ b/examples/use-cases/README.md @@ -10,6 +10,3 @@ | `scrolling_page.py` | Automates webpage scrolling with various scrolling actions and text search functionality. | | `twitter_post_using_cookies.py` | Automates posting on X (Twitter) using stored authentication cookies. | | `web_voyager_agent.py` | A general-purpose web navigation agent for tasks like flight booking and course searching. | - - - diff --git a/examples/use-cases/captcha.py b/examples/use-cases/captcha.py index 784eb04aec..4091c8d746 100644 --- a/examples/use-cases/captcha.py +++ b/examples/use-cases/captcha.py @@ -14,23 +14,27 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import asyncio + +from dotenv import load_dotenv from langchain_openai import ChatOpenAI + from browser_use import Agent -from dotenv import load_dotenv # Load environment variables load_dotenv() if not os.getenv('OPENAI_API_KEY'): - raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + async def main(): - llm = ChatOpenAI(model='gpt-4o') - agent = Agent( + llm = ChatOpenAI(model='gpt-4o') + agent = Agent( task='go to https://captcha.com/demos/features/captcha-demo.aspx and solve the captcha', llm=llm, ) - await agent.run() - input('Press Enter to exit') + await agent.run() + input('Press Enter to exit') + -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/use-cases/check_appointment.py b/examples/use-cases/check_appointment.py index 8e010f0149..4696baf7f3 100644 --- a/examples/use-cases/check_appointment.py +++ b/examples/use-cases/check_appointment.py @@ -13,13 +13,14 @@ # Load environment variables load_dotenv() if not os.getenv('OPENAI_API_KEY'): - raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') controller = Controller() class WebpageInfo(BaseModel): """Model for webpage link.""" + link: str = 'https://appointment.mfa.gr/en/reservations/aero/ireland-grcon-dub/' @@ -43,5 +44,5 @@ async def main(): await agent.run() -if __name__ == "__main__": +if __name__ == '__main__': asyncio.run(main()) diff --git a/examples/use-cases/find_and_apply_to_jobs.py b/examples/use-cases/find_and_apply_to_jobs.py index daf65897bd..bec4ccc97a 100644 --- a/examples/use-cases/find_and_apply_to_jobs.py +++ b/examples/use-cases/find_and_apply_to_jobs.py @@ -1,35 +1,35 @@ """ -Goal: Searches for job listings, evaluates relevance based on a CV, and applies +Goal: Searches for job listings, evaluates relevance based on a CV, and applies @dev You need to add OPENAI_API_KEY to your environment variables. 
Also you have to install PyPDF2 to read pdf files: pip install PyPDF2 """ +import asyncio import csv +import logging import os import sys from pathlib import Path -import logging -from typing import List, Optional -import asyncio +from typing import Optional sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from dotenv import load_dotenv -from PyPDF2 import PdfReader -from langchain_openai import AzureChatOpenAI, ChatOpenAI +from langchain_openai import AzureChatOpenAI from pydantic import BaseModel, SecretStr +from PyPDF2 import PdfReader from browser_use import ActionResult, Agent, Controller -from browser_use.browser.context import BrowserContext from browser_use.browser.browser import Browser, BrowserConfig +from browser_use.browser.context import BrowserContext # Validate required environment variables load_dotenv() -required_env_vars = ["AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT"] +required_env_vars = ['AZURE_OPENAI_KEY', 'AZURE_OPENAI_ENDPOINT'] for var in required_env_vars: - if not os.getenv(var): - raise ValueError(f"{var} is not set. Please add it to your environment variables.") + if not os.getenv(var): + raise ValueError(f'{var} is not set. Please add it to your environment variables.') logger = logging.getLogger(__name__) # full screen mode @@ -110,7 +110,7 @@ async def upload_cv(index: int, browser: BrowserContext): browser = Browser( config=BrowserConfig( - chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', disable_security=True, ) ) @@ -156,5 +156,5 @@ async def main(): await asyncio.gather(*[agent.run() for agent in agents]) -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/use-cases/google_sheets.py b/examples/use-cases/google_sheets.py new file mode 100644 index 0000000000..5602c8122a --- /dev/null +++ b/examples/use-cases/google_sheets.py @@ -0,0 +1,193 @@ +import os +import sys + +from browser_use.browser.context import BrowserContext + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import asyncio + +import pyperclip +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI + +from browser_use import ActionResult, Agent, Controller +from browser_use.browser.browser import Browser, BrowserConfig + +browser = Browser( + config=BrowserConfig( + browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + ), +) + +# Load environment variables +load_dotenv() +if not os.getenv('OPENAI_API_KEY'): + raise ValueError('OPENAI_API_KEY is not set. 
Please add it to your environment variables.') + + +controller = Controller() + + +def is_google_sheet(page) -> bool: + return page.url.startswith('https://docs.google.com/spreadsheets/') + + +@controller.registry.action('Google Sheets: Open a specific Google Sheet') +async def open_google_sheet(browser: BrowserContext, google_sheet_url: str): + page = await browser.get_current_page() + if page.url != google_sheet_url: + await page.goto(google_sheet_url) + await page.wait_for_load_state() + if not is_google_sheet(page): + return ActionResult(error='Failed to open Google Sheet, are you sure you have permissions to access this sheet?') + return ActionResult(extracted_content=f'Opened Google Sheet {google_sheet_url}', include_in_memory=False) + + +@controller.registry.action('Google Sheets: Get the contents of the entire sheet', page_filter=is_google_sheet) +async def get_sheet_contents(browser: BrowserContext): + page = await browser.get_current_page() + + # select all cells + await page.keyboard.press('Enter') + await page.keyboard.press('Escape') + await page.keyboard.press('ControlOrMeta+A') + await page.keyboard.press('ControlOrMeta+C') + + extracted_tsv = pyperclip.paste() + return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) + + +@controller.registry.action('Google Sheets: Select a specific cell or range of cells', page_filter=is_google_sheet) +async def select_cell_or_range(browser: BrowserContext, cell_or_range: str): + page = await browser.get_current_page() + + await page.keyboard.press('Enter') # make sure we dont delete current cell contents if we were last editing + await page.keyboard.press('Escape') # to clear current focus (otherwise select range popup is additive) + await asyncio.sleep(0.1) + await page.keyboard.press('Home') # move cursor to the top left of the sheet first + await page.keyboard.press('ArrowUp') + await asyncio.sleep(0.1) + await page.keyboard.press('Control+G') # open the goto range popup + await asyncio.sleep(0.2) + await page.keyboard.type(cell_or_range, delay=0.05) + await asyncio.sleep(0.2) + await page.keyboard.press('Enter') + await asyncio.sleep(0.2) + await page.keyboard.press('Escape') # to make sure the popup still closes in the case where the jump failed + return ActionResult(extracted_content=f'Selected cell {cell_or_range}', include_in_memory=False) + + +@controller.registry.action('Google Sheets: Get the contents of a specific cell or range of cells', page_filter=is_google_sheet) +async def get_range_contents(browser: BrowserContext, cell_or_range: str): + page = await browser.get_current_page() + + await select_cell_or_range(browser, cell_or_range) + + await page.keyboard.press('ControlOrMeta+C') + await asyncio.sleep(0.1) + extracted_tsv = pyperclip.paste() + return ActionResult(extracted_content=extracted_tsv, include_in_memory=True) + + +@controller.registry.action('Google Sheets: Clear the currently selected cells', page_filter=is_google_sheet) +async def clear_selected_range(browser: BrowserContext): + page = await browser.get_current_page() + + await page.keyboard.press('Backspace') + return ActionResult(extracted_content='Cleared selected range', include_in_memory=False) + + +@controller.registry.action('Google Sheets: Input text into the currently selected cell', page_filter=is_google_sheet) +async def input_selected_cell_text(browser: BrowserContext, text: str): + page = await browser.get_current_page() + + await page.keyboard.type(text, delay=0.1) + await page.keyboard.press('Enter') # make sure to commit 
the input so it doesn't get overwritten by the next action + await page.keyboard.press('ArrowUp') + return ActionResult(extracted_content=f'Inputted text {text}', include_in_memory=False) + + +@controller.registry.action('Google Sheets: Batch update a range of cells', page_filter=is_google_sheet) +async def update_range_contents(browser: BrowserContext, range: str, new_contents_tsv: str): + page = await browser.get_current_page() + + await select_cell_or_range(browser, range) + + # simulate paste event from clipboard with TSV content + await page.evaluate(f""" + const clipboardData = new DataTransfer(); + clipboardData.setData('text/plain', `{new_contents_tsv}`); + document.activeElement.dispatchEvent(new ClipboardEvent('paste', {{clipboardData}})); + """) + + return ActionResult(extracted_content=f'Updated cell {range} with {new_contents_tsv}', include_in_memory=False) + + +# many more snippets for keyboard-shortcut based Google Sheets automation can be found here, see: +# - https://github.com/philc/sheetkeys/blob/master/content_scripts/sheet_actions.js +# - https://github.com/philc/sheetkeys/blob/master/content_scripts/commands.js +# - https://support.google.com/docs/answer/181110?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac-shortcuts + +# Tip: LLM is bad at spatial reasoning, don't make it navigate with arrow keys relative to current cell +# if given arrow keys, it will try to jump from G1 to A2 by pressing Down, without realizing needs to go Down+LeftLeftLeftLeft + + +async def main(): + async with await browser.new_context() as context: + model = ChatOpenAI(model='gpt-4o') + + eraser = Agent( + task=""" + Clear all the existing values in columns A through F in this Google Sheet: + https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit + """, + llm=model, + browser_context=context, + controller=controller, + ) + await eraser.run() + + researcher = Agent( + task=""" + Google to find the full name, nationality, and date of birth of the CEO of the top 10 Fortune 100 companies. + For each company, append a row to this existing Google Sheet: https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit + Make sure column headers are present and all existing values in the sheet are formatted correctly. + Columns: + A: Company Name + B: CEO Full Name + C: CEO Country of Birth + D: CEO Date of Birth (YYYY-MM-DD) + E: Source URL where the information was found + """, + llm=model, + browser_context=context, + controller=controller, + ) + await researcher.run() + + improvised_continuer = Agent( + task=""" + Read the Google Sheet https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit + Add 3 more rows to the bottom continuing the existing pattern, make sure any data you add is sourced correctly. + """, + llm=model, + browser_context=context, + controller=controller, + ) + await improvised_continuer.run() + + final_fact_checker = Agent( + task=""" + Read the Google Sheet https://docs.google.com/spreadsheets/d/1INaIcfpYXlMRWO__de61SHFCaqt1lfHlcvtXZPItlpI/edit + Fact-check every entry, add a new column F with your findings for each row. + Make sure to check the source URL for each row, and make sure the information is correct. 
+ """, + llm=model, + browser_context=context, + controller=controller, + ) + await final_fact_checker.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/use-cases/online_coding_agent.py b/examples/use-cases/online_coding_agent.py index 390adc0bc8..e0b9f11e64 100644 --- a/examples/use-cases/online_coding_agent.py +++ b/examples/use-cases/online_coding_agent.py @@ -1,46 +1,48 @@ # Goal: Implements a multi-agent system for online code editors, with separate agents for coding and execution. +import asyncio import os import sys -import asyncio sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from langchain_openai import ChatOpenAI from dotenv import load_dotenv +from langchain_openai import ChatOpenAI from browser_use import Agent, Browser # Load environment variables load_dotenv() if not os.getenv('OPENAI_API_KEY'): - raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + async def main(): - browser = Browser() - async with await browser.new_context() as context: - model = ChatOpenAI(model='gpt-4o') - - # Initialize browser agent - agent1 = Agent( - task='Open an online code editor programiz.', - llm=model, - browser_context=context, - ) - executor = Agent( - task='Executor. Execute the code written by the coder and suggest some updates if there are errors.', - llm=model, - browser_context=context, - ) - - coder = Agent( - task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.', - llm=model, - browser_context=context, - ) - await agent1.run() - await executor.run() - await coder.run() - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + browser = Browser() + async with await browser.new_context() as context: + model = ChatOpenAI(model='gpt-4o') + + # Initialize browser agent + agent1 = Agent( + task='Open an online code editor programiz.', + llm=model, + browser_context=context, + ) + executor = Agent( + task='Executor. Execute the code written by the coder and suggest some updates if there are errors.', + llm=model, + browser_context=context, + ) + + coder = Agent( + task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.', + llm=model, + browser_context=context, + ) + await agent1.run() + await executor.run() + await coder.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/use-cases/post-twitter.py b/examples/use-cases/post-twitter.py index 86caef1a0e..a90f024045 100644 --- a/examples/use-cases/post-twitter.py +++ b/examples/use-cases/post-twitter.py @@ -19,70 +19,70 @@ Any issues, contact me on X @defichemist95 """ +import asyncio import os import sys -from typing import Optional -import asyncio sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from dataclasses import dataclass + from dotenv import load_dotenv from langchain_openai import ChatOpenAI -from browser_use.browser.browser import Browser, BrowserConfig from browser_use import Agent, Controller +from browser_use.browser.browser import Browser, BrowserConfig # Load environment variables load_dotenv() if not os.getenv('OPENAI_API_KEY'): - raise ValueError('OPENAI_API_KEY is not set. 
Please add it to your environment variables.') + raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.') + # ============ Configuration Section ============ @dataclass class TwitterConfig: - """Configuration for Twitter posting""" + """Configuration for Twitter posting""" - openai_api_key: str - chrome_path: str - target_user: str # Twitter handle without @ - message: str - reply_url: str - headless: bool = False - model: str = "gpt-4o-mini" - base_url: str = "https://x.com/home" + openai_api_key: str + chrome_path: str + target_user: str # Twitter handle without @ + message: str + reply_url: str + headless: bool = False + model: str = 'gpt-4o-mini' + base_url: str = 'https://x.com/home' # Customize these settings config = TwitterConfig( - openai_api_key=os.getenv("OPENAI_API_KEY"), - chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", # This is for MacOS (Chrome) - target_user="XXXXX", - message="XXXXX", - reply_url="XXXXX", - headless=False, + openai_api_key=os.getenv('OPENAI_API_KEY'), + chrome_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # This is for MacOS (Chrome) + target_user='XXXXX', + message='XXXXX', + reply_url='XXXXX', + headless=False, ) def create_twitter_agent(config: TwitterConfig) -> Agent: + llm = ChatOpenAI(model=config.model, api_key=config.openai_api_key) - llm = ChatOpenAI(model=config.model, api_key=config.openai_api_key) - - browser = Browser( - config=BrowserConfig( - headless=config.headless, - chrome_instance_path=config.chrome_path, - ) - ) + browser = Browser( + config=BrowserConfig( + headless=config.headless, + browser_binary_path=config.chrome_path, + ) + ) - controller = Controller() + controller = Controller() - # Construct the full message with tag - full_message = f"@{config.target_user} {config.message}" + # Construct the full message with tag + full_message = f'@{config.target_user} {config.message}' - # Create the agent with detailed instructions - return Agent( - task=f"""Navigate to Twitter and create a post and reply to a tweet. + # Create the agent with detailed instructions + return Agent( + task=f"""Navigate to Twitter and create a post and reply to a tweet. Here are the specific steps: @@ -103,25 +103,25 @@ def create_twitter_agent(config: TwitterConfig) -> Agent: - Verify the post button is clickable before clicking - Do not click on the '+' button which will add another tweet """, - llm=llm, - controller=controller, - browser=browser, - ) + llm=llm, + controller=controller, + browser=browser, + ) async def post_tweet(agent: Agent): - - try: - await agent.run(max_steps=100) - agent.create_history_gif() - print("Tweet posted successfully!") - except Exception as e: - print(f"Error posting tweet: {str(e)}") + try: + await agent.run(max_steps=100) + agent.create_history_gif() + print('Tweet posted successfully!') + except Exception as e: + print(f'Error posting tweet: {str(e)}') async def main(): - agent = create_twitter_agent(config) - await agent.run() + agent = create_twitter_agent(config) + await agent.run() + -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/use-cases/scrolling_page.py b/examples/use-cases/scrolling_page.py index 9bc358e1ec..4e649b062c 100644 --- a/examples/use-cases/scrolling_page.py +++ b/examples/use-cases/scrolling_page.py @@ -1,15 +1,15 @@ # Goal: Automates webpage scrolling with various scrolling actions and text search functionality. 
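# Editorial note (illustrative sketch, not part of the patch): the most user-visible change across these
# example files is the BrowserConfig field rename. A minimal before/after, assuming the 0.1.40 names on the
# removed lines above and the new names introduced by this patch; the binary path is the macOS example path
# used throughout these files.
from browser_use.browser.browser import Browser, BrowserConfig

browser = Browser(
	config=BrowserConfig(
		browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',  # was: chrome_instance_path
		extra_browser_args=['--dummy-arg'],  # was: extra_chromium_args
	)
)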
+import asyncio import os import sys -import asyncio sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from langchain_openai import ChatOpenAI -from browser_use import Agent from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from browser_use import Agent from browser_use.browser.browser import Browser, BrowserConfig # Load environment variables @@ -38,5 +38,5 @@ async def main(): await agent.run() -if __name__ == "__main__": +if __name__ == '__main__': asyncio.run(main()) diff --git a/examples/use-cases/shopping.py b/examples/use-cases/shopping.py index 1328aabd84..cf6e80befd 100644 --- a/examples/use-cases/shopping.py +++ b/examples/use-cases/shopping.py @@ -1,14 +1,16 @@ -from langchain_openai import ChatOpenAI -from browser_use import Agent, Browser, BrowserConfig from dotenv import load_dotenv +from langchain_openai import ChatOpenAI + +from browser_use import Agent, Browser + load_dotenv() import asyncio -task=""" +task = """ ### Prompt for Shopping Agent – Migros Online Grocery Order -**Objective:** +**Objective:** Visit [Migros Online](https://www.migros.ch/en), search for the required grocery items, add them to the cart, select an appropriate delivery window, and complete the checkout process using TWINT. **Important:** @@ -75,12 +77,12 @@ - If the total order **is below CHF 99**, add **a liquid soap refill** to reach the minimum. If it;s still you can buy some bread, dark chockolate. - At this step, check if you have bought MORE items than needed. If the price is more then CHF200, you MUST remove items. - If an item is not available, choose an alternative. -- if an age verification is needed, remove alchoholic products, we haven't verified yet. +- if an age verification is needed, remove alcoholic products, we haven't verified yet. --- ### Step 5: Select Delivery Window -- Choose a **delivery window within the current week**. It's ok to pay up to CHF2 for the window selction. +- Choose a **delivery window within the current week**. It's ok to pay up to CHF2 for the window selection. - Preferably select a slot within the workweek. --- @@ -90,7 +92,7 @@ - Select **TWINT** as the payment method. - Check out. - -- if it's needed the userename is: nikoskalio.dev@gmail.com +- if it's needed the username is: nikoskalio.dev@gmail.com - and the password is : TheCircuit.Migros.dev! --- @@ -105,15 +107,17 @@ browser = Browser() agent = Agent( - task=task, - llm=ChatOpenAI(model="gpt-4o"), - browser=browser, - ) + task=task, + llm=ChatOpenAI(model='gpt-4o'), + browser=browser, +) + async def main(): - await agent.run() - input("Press Enter to close the browser...") - await browser.close() + await agent.run() + input('Press Enter to close the browser...') + await browser.close() + if __name__ == '__main__': - asyncio.run(main()) + asyncio.run(main()) diff --git a/examples/use-cases/twitter_post_using_cookies.py b/examples/use-cases/twitter_post_using_cookies.py index 72ac98cea2..73a46a8b8f 100644 --- a/examples/use-cases/twitter_post_using_cookies.py +++ b/examples/use-cases/twitter_post_using_cookies.py @@ -1,4 +1,4 @@ -# Goal: Automates posting on X (Twitter) using stored authentication cookies. +# Goal: Automates posting on X (Twitter) using stored authentication cookies. 
import asyncio import os @@ -21,7 +21,7 @@ browser = Browser( config=BrowserConfig( - # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + # browser_binary_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', ) ) file_path = os.path.join(os.path.dirname(__file__), 'twitter_cookies.txt') @@ -39,5 +39,5 @@ async def main(): input('Press Enter to close the browser...') -if __name__ == "__main__": +if __name__ == '__main__': asyncio.run(main()) diff --git a/examples/use-cases/web_voyager_agent.py b/examples/use-cases/web_voyager_agent.py index 0b0de90420..723fa8b0b8 100644 --- a/examples/use-cases/web_voyager_agent.py +++ b/examples/use-cases/web_voyager_agent.py @@ -1,15 +1,15 @@ # Goal: A general-purpose web navigation agent for tasks like flight booking and course searching. +import asyncio import os import sys -import asyncio # Adjust Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from dotenv import load_dotenv -from pydantic import SecretStr from langchain_openai import AzureChatOpenAI +from pydantic import SecretStr from browser_use.agent.service import Agent from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig @@ -18,10 +18,10 @@ load_dotenv() # Validate required environment variables -required_env_vars = ["AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT"] +required_env_vars = ['AZURE_OPENAI_KEY', 'AZURE_OPENAI_ENDPOINT'] for var in required_env_vars: - if not os.getenv(var): - raise ValueError(f"{var} is not set. Please add it to your environment variables.") + if not os.getenv(var): + raise ValueError(f'{var} is not set. Please add it to your environment variables.') browser = Browser( config=BrowserConfig( @@ -68,5 +68,5 @@ async def main(): history.save_to_file('./tmp/history.json') -if __name__ == "__main__": +if __name__ == '__main__': asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index b05a85fb7b..bc22ea4bd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "browser-use" description = "Make websites accessible for AI agents" authors = [{ name = "Gregor Zunic" }] -version = "0.1.40" +version = "0.1.41rc2" readme = "README.md" requires-python = ">=3.11,<4.0" classifiers = [ @@ -12,41 +12,54 @@ classifiers = [ ] dependencies = [ "httpx>=0.27.2", - "pydantic>=2.10.4", + "pydantic>=2.10.4,<2.11.0", "python-dotenv>=1.0.1", "requests>=2.32.3", "posthog>=3.7.0", - "playwright==1.51.0", - "setuptools>=75.8.0", - "markdownify==0.14.1", - "langchain-core>=0.3.35", - "langchain-openai==0.3.1", + "playwright>=1.51.0", + "markdownify==1.1.0", + "langchain-core==0.3.49", + "langchain-openai==0.3.11", "langchain-anthropic==0.3.3", - "langchain-ollama==0.2.2", + "langchain-ollama==0.3.0", + "langchain-google-genai==2.1.2", + "langchain>=0.3.21", + "langchain-aws>=0.2.11", + "botocore>=1.37.23", + "google-api-core>=2.24.0", + "pyperclip>=1.9.0", + "pyobjc>=11.0; platform_system == 'darwin'", + "screeninfo>=0.8.1; platform_system != 'darwin'", + "typing-extensions>=4.12.2", + "psutil>=7.0.0", + "faiss-cpu>=1.10.0", + "mem0ai==0.1.81", ] + +# botocore: only needed for Bedrock Claude boto3 examples/models/bedrock_claude.py +# pydantic: >2.11 introduces many pydantic deprecation warnings until langchain-core upgrades their pydantic support lets keep it on 2.10 +# google-api-core: only used for Google LLM APIs +# pyperclip: only used for examples that use copy/paste +# pyobjc: only used to get screen resolution on macOS +# screeninfo: only used 
to get screen resolution on Linux/Windows +# markdownify: used for page text content extraction for passing to LLM +# openai: datalib,voice-helpers are actually NOT NEEDED but openai produces noisy errors on exit without them TODO: fix urls = { "Repository" = "https://github.com/browser-use/browser-use" } -[project.optional-dependencies] -dev = [ - "tokencost>=0.1.16", - "hatch>=1.13.0", - "build>=1.2.2", - "pytest>=8.3.3", - "pytest-asyncio>=0.24.0", - "fastapi>=0.115.8", - "inngest>=0.4.19", - "uvicorn>=0.34.0", - "langchain>=0.3.18", - "langchain-aws>=0.2.11", - "langchain-fireworks>=0.2.6", - "langchain-google-genai==2.0.8", -] + +[tool.codespell] +ignore-words-list = "bu" +skip = "*.json" [tool.ruff] line-length = 130 -select = ["E", "F", "I"] fix = true +[tool.ruff.lint] +select = ["E", "F", "I", "PLE"] +ignore = ["E101", "E402", "E501", "F841", "E731"] +unfixable = ["E101", "E402", "E501", "F841", "E731"] + [tool.ruff.format] quote-style = "single" indent-style = "tab" @@ -55,3 +68,28 @@ docstring-code-format = true [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[tool.hatch.build] +include = [ + "browser_use/**/*.py", + "!browser_use/**/tests/*.py", + "!browser_use/**/tests.py", + "browser_use/agent/system_prompt.md", + "browser_use/dom/buildDomTree.js", +] + +[tool.uv] +dev-dependencies = [ + "ruff>=0.11.2", + "tokencost>=0.1.16", + "build>=1.2.2", + "pytest>=8.3.5", + "pytest-asyncio>=0.24.0", + "fastapi>=0.115.8", + "inngest>=0.4.19", + "uvicorn>=0.34.0", + "langchain-fireworks>=0.2.6", + "ipdb>=0.13.13", + "pre-commit>=4.2.0", + "codespell>=2.4.1", +] diff --git a/tests/test_action_filters.py b/tests/test_action_filters.py new file mode 100644 index 0000000000..4b8d12a90e --- /dev/null +++ b/tests/test_action_filters.py @@ -0,0 +1,305 @@ +from unittest.mock import MagicMock + +import pytest +from playwright.async_api import Page +from pydantic import BaseModel + +from browser_use.controller.registry.service import Registry +from browser_use.controller.registry.views import ActionRegistry, RegisteredAction + + +class EmptyParamModel(BaseModel): + pass + + +class TestActionFilters: + def test_get_prompt_description_no_filters(self): + """Test that system prompt only includes actions with no filters""" + registry = ActionRegistry() + + # Add actions with and without filters + no_filter_action = RegisteredAction( + name='no_filter_action', + description='Action with no filters', + function=lambda: None, + param_model=EmptyParamModel, + domains=None, + page_filter=None, + ) + + page_filter_action = RegisteredAction( + name='page_filter_action', + description='Action with page filter', + function=lambda: None, + param_model=EmptyParamModel, + domains=None, + page_filter=lambda page: True, + ) + + domain_filter_action = RegisteredAction( + name='domain_filter_action', + description='Action with domain filter', + function=lambda: None, + param_model=EmptyParamModel, + domains=['example.com'], + page_filter=None, + ) + + registry.actions = { + 'no_filter_action': no_filter_action, + 'page_filter_action': page_filter_action, + 'domain_filter_action': domain_filter_action, + } + + # System prompt (no page) should only include actions with no filters + system_description = registry.get_prompt_description() + assert 'no_filter_action' in system_description + assert 'page_filter_action' not in system_description + assert 'domain_filter_action' not in system_description + + def test_page_filter_matching(self): + """Test that page filters work correctly""" + 
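# Editorial sketch (not part of the patch): how the domain/page filters exercised by these tests are
# registered from application code, mirroring the decorator style of examples/use-cases/google_sheets.py
# above. The action name, domain pattern, and URL predicate are illustrative assumptions.
from browser_use import ActionResult, Controller
from browser_use.browser.context import BrowserContext

controller = Controller()


@controller.registry.action(
	'Example.com only: dismiss the consent dialog',  # description surfaced to the LLM
	domains=['*.example.com'],  # glob pattern matched against the current page's domain
	page_filter=lambda page: 'consent' in page.url,  # extra per-page predicate
)
async def dismiss_consent(browser: BrowserContext):
	page = await browser.get_current_page()
	await page.keyboard.press('Escape')
	return ActionResult(extracted_content='Dismissed consent dialog', include_in_memory=False)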
registry = ActionRegistry() + + # Create a mock page + mock_page = MagicMock(spec=Page) + mock_page.url = 'https://example.com/page' + + # Create actions with different page filters + matching_action = RegisteredAction( + name='matching_action', + description='Action with matching page filter', + function=lambda: None, + param_model=EmptyParamModel, + domains=None, + page_filter=lambda page: 'example.com' in page.url, + ) + + non_matching_action = RegisteredAction( + name='non_matching_action', + description='Action with non-matching page filter', + function=lambda: None, + param_model=EmptyParamModel, + domains=None, + page_filter=lambda page: 'other.com' in page.url, + ) + + registry.actions = {'matching_action': matching_action, 'non_matching_action': non_matching_action} + + # Page-specific description should only include matching actions + page_description = registry.get_prompt_description(mock_page) + assert 'matching_action' in page_description + assert 'non_matching_action' not in page_description + + def test_domain_filter_matching(self): + """Test that domain filters work correctly with glob patterns""" + registry = ActionRegistry() + + # Create actions with different domain patterns + actions = { + 'exact_match': RegisteredAction( + name='exact_match', + description='Exact domain match', + function=lambda: None, + param_model=EmptyParamModel, + domains=['example.com'], + page_filter=None, + ), + 'subdomain_match': RegisteredAction( + name='subdomain_match', + description='Subdomain wildcard match', + function=lambda: None, + param_model=EmptyParamModel, + domains=['*.example.com'], + page_filter=None, + ), + 'prefix_match': RegisteredAction( + name='prefix_match', + description='Prefix wildcard match', + function=lambda: None, + param_model=EmptyParamModel, + domains=['example*'], + page_filter=None, + ), + 'non_matching': RegisteredAction( + name='non_matching', + description='Non-matching domain', + function=lambda: None, + param_model=EmptyParamModel, + domains=['other.com'], + page_filter=None, + ), + } + + registry.actions = actions + + # Test exact domain match + mock_page = MagicMock(spec=Page) + mock_page.url = 'https://example.com/page' + + exact_match_description = registry.get_prompt_description(mock_page) + assert 'exact_match' in exact_match_description + assert 'non_matching' not in exact_match_description + + # Test subdomain match + mock_page.url = 'https://sub.example.com/page' + subdomain_match_description = registry.get_prompt_description(mock_page) + assert 'subdomain_match' in subdomain_match_description + assert 'exact_match' not in subdomain_match_description + + # Test prefix match + mock_page.url = 'https://example123.org/page' + prefix_match_description = registry.get_prompt_description(mock_page) + assert 'prefix_match' in prefix_match_description + + def test_domain_and_page_filter_together(self): + """Test that actions can be filtered by both domain and page filter""" + registry = ActionRegistry() + + # Create a mock page + mock_page = MagicMock(spec=Page) + mock_page.url = 'https://example.com/admin' + + # Actions with different combinations of filters + actions = { + 'domain_only': RegisteredAction( + name='domain_only', + description='Domain filter only', + function=lambda: None, + param_model=EmptyParamModel, + domains=['example.com'], + page_filter=None, + ), + 'page_only': RegisteredAction( + name='page_only', + description='Page filter only', + function=lambda: None, + param_model=EmptyParamModel, + domains=None, + page_filter=lambda page: 
'admin' in page.url, + ), + 'both_matching': RegisteredAction( + name='both_matching', + description='Both filters matching', + function=lambda: None, + param_model=EmptyParamModel, + domains=['example.com'], + page_filter=lambda page: 'admin' in page.url, + ), + 'both_one_fail': RegisteredAction( + name='both_one_fail', + description='One filter fails', + function=lambda: None, + param_model=EmptyParamModel, + domains=['other.com'], + page_filter=lambda page: 'admin' in page.url, + ), + } + + registry.actions = actions + + # Check that only actions with matching filters are included + description = registry.get_prompt_description(mock_page) + assert 'domain_only' in description # Domain matches + assert 'page_only' in description # Page filter matches + assert 'both_matching' in description # Both filters match + assert 'both_one_fail' not in description # Domain filter fails + + # Test with different URL where page filter fails + mock_page.url = 'https://example.com/dashboard' + description = registry.get_prompt_description(mock_page) + assert 'domain_only' in description # Domain matches + assert 'page_only' not in description # Page filter fails + assert 'both_matching' not in description # Page filter fails + assert 'both_one_fail' not in description # Domain filter fails + + @pytest.mark.asyncio + async def test_registry_action_decorator(self): + """Test the action decorator with filters""" + registry = Registry() + + # Define actions with different filters + @registry.action( + description='No filter action', + ) + def no_filter_action(): + pass + + @registry.action(description='Domain filter action', domains=['example.com']) + def domain_filter_action(): + pass + + @registry.action(description='Page filter action', page_filter=lambda page: 'admin' in page.url) + def page_filter_action(): + pass + + # Check that system prompt only includes the no_filter_action + system_description = registry.get_prompt_description() + assert 'No filter action' in system_description + assert 'Domain filter action' not in system_description + assert 'Page filter action' not in system_description + + # Check that page-specific prompt includes the right actions + mock_page = MagicMock(spec=Page) + mock_page.url = 'https://example.com/admin' + + page_description = registry.get_prompt_description(mock_page) + assert 'Domain filter action' in page_description + assert 'Page filter action' in page_description + + @pytest.mark.asyncio + async def test_action_model_creation(self): + """Test that action models are created correctly with filters""" + registry = Registry() + + # Define actions with different filters + @registry.action( + description='No filter action', + ) + def no_filter_action(): + pass + + @registry.action(description='Domain filter action', domains=['example.com']) + def domain_filter_action(): + pass + + @registry.action(description='Page filter action', page_filter=lambda page: 'admin' in page.url) + def page_filter_action(): + pass + + @registry.action(description='Both filters action', domains=['example.com'], page_filter=lambda page: 'admin' in page.url) + def both_filters_action(): + pass + + # Initial action model should only include no_filter_action + initial_model = registry.create_action_model() + assert 'no_filter_action' in initial_model.model_fields + assert 'domain_filter_action' not in initial_model.model_fields + assert 'page_filter_action' not in initial_model.model_fields + assert 'both_filters_action' not in initial_model.model_fields + + # Action model with matching page 
should include all matching actions + mock_page = MagicMock(spec=Page) + mock_page.url = 'https://example.com/admin' + + page_model = registry.create_action_model(page=mock_page) + assert 'no_filter_action' in page_model.model_fields + assert 'domain_filter_action' in page_model.model_fields + assert 'page_filter_action' in page_model.model_fields + assert 'both_filters_action' in page_model.model_fields + + # Action model with non-matching domain should exclude domain-filtered actions + mock_page.url = 'https://other.com/admin' + non_matching_domain_model = registry.create_action_model(page=mock_page) + assert 'no_filter_action' in non_matching_domain_model.model_fields + assert 'domain_filter_action' not in non_matching_domain_model.model_fields + assert 'page_filter_action' in non_matching_domain_model.model_fields + assert 'both_filters_action' not in non_matching_domain_model.model_fields + + # Action model with non-matching page filter should exclude page-filtered actions + mock_page.url = 'https://example.com/dashboard' + non_matching_page_model = registry.create_action_model(page=mock_page) + assert 'no_filter_action' in non_matching_page_model.model_fields + assert 'domain_filter_action' in non_matching_page_model.model_fields + assert 'page_filter_action' not in non_matching_page_model.model_fields + assert 'both_filters_action' not in non_matching_page_model.model_fields diff --git a/tests/test_agent_actions.py b/tests/test_agent_actions.py index 14d8dbacc7..6a2c049d60 100644 --- a/tests/test_agent_actions.py +++ b/tests/test_agent_actions.py @@ -104,15 +104,11 @@ async def test_error_recovery(llm, context): actions_names = history.action_names() actions = history.model_actions() - assert ( - 'go_to_url' in actions_names or 'open_tab' in actions_names - ), f'{actions_names} does not contain go_to_url or open_tab' + assert 'go_to_url' in actions_names or 'open_tab' in actions_names, f'{actions_names} does not contain go_to_url or open_tab' for action in actions: if 'go_to_url' in action: assert 'url' in action['go_to_url'], 'url is not in go_to_url' - assert action['go_to_url']['url'].endswith( - 'google.com' - ), 'url does not end with google.com' + assert action['go_to_url']['url'].endswith('google.com'), 'url does not end with google.com' break diff --git a/tests/test_browser.py b/tests/test_browser.py index b3acf344f6..8153b2c84b 100644 --- a/tests/test_browser.py +++ b/tests/test_browser.py @@ -1,306 +1,498 @@ import asyncio +import subprocess + import pytest import requests -import subprocess -from browser_use.browser.browser import Browser, BrowserConfig + +from browser_use.browser.browser import Browser, BrowserConfig, ProxySettings from browser_use.browser.context import BrowserContext, BrowserContextConfig -from playwright._impl._api_structures import ProxySettings + @pytest.mark.asyncio -async def test_standard_browser_launch(monkeypatch): - """ - Test that the standard browser is launched correctly: - When no remote (cdp or wss) or chrome instance is provided, the Browser class uses _setup_standard_browser. - This test monkeypatches async_playwright to return dummy objects, and asserts that get_playwright_browser returns the expected DummyBrowser. 
- """ - class DummyBrowser: - pass - class DummyChromium: - async def launch(self, headless, args, proxy=None): - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(headless=True, disable_security=False, extra_chromium_args=["--test"]) - browser_obj = Browser(config=config) - result_browser = await browser_obj.get_playwright_browser() - assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_standard_browser" - await browser_obj.close() +async def test_builtin_browser_launch(monkeypatch): + """ + Test that the standard browser is launched correctly: + When no remote (cdp or wss) or chrome instance is provided, the Browser class uses _setup_builtin_browser. + This test monkeypatches async_playwright to return dummy objects, and asserts that get_playwright_browser returns the expected DummyBrowser. + """ + + class DummyBrowser: + pass + + class DummyChromium: + async def launch(self, headless, args, proxy=None): + return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(headless=True, disable_security=False, extra_browser_args=['--test']) + browser_obj = Browser(config=config) + result_browser = await browser_obj.get_playwright_browser() + assert isinstance(result_browser, DummyBrowser), 'Expected DummyBrowser from _setup_builtin_browser' + await browser_obj.close() + + @pytest.mark.asyncio async def test_cdp_browser_launch(monkeypatch): - """ - Test that when a CDP URL is provided in the configuration, the Browser uses _setup_cdp - and returns the expected DummyBrowser. - """ - class DummyBrowser: - pass - class DummyChromium: - async def connect_over_cdp(self, endpoint_url, timeout=20000): - assert endpoint_url == "ws://dummy-cdp-url", "The endpoint URL should match the configuration." - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(cdp_url="ws://dummy-cdp-url") - browser_obj = Browser(config=config) - result_browser = await browser_obj.get_playwright_browser() - assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_cdp" - await browser_obj.close() + """ + Test that when a CDP URL is provided in the configuration, the Browser uses _setup_cdp + and returns the expected DummyBrowser. + """ + + class DummyBrowser: + pass + + class DummyChromium: + async def connect_over_cdp(self, endpoint_url, timeout=20000): + assert endpoint_url == 'ws://dummy-cdp-url', 'The endpoint URL should match the configuration.' 
+ return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(cdp_url='ws://dummy-cdp-url') + browser_obj = Browser(config=config) + result_browser = await browser_obj.get_playwright_browser() + assert isinstance(result_browser, DummyBrowser), 'Expected DummyBrowser from _setup_cdp' + await browser_obj.close() + + @pytest.mark.asyncio async def test_wss_browser_launch(monkeypatch): - """ - Test that when a WSS URL is provided in the configuration, - the Browser uses _setup_wss and returns the expected DummyBrowser. - """ - class DummyBrowser: - pass - class DummyChromium: - async def connect(self, wss_url): - assert wss_url == "ws://dummy-wss-url", "WSS URL should match the configuration." - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(wss_url="ws://dummy-wss-url") - browser_obj = Browser(config=config) - result_browser = await browser_obj.get_playwright_browser() - assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_wss" - await browser_obj.close() + """ + Test that when a WSS URL is provided in the configuration, + the Browser uses setup_wss and returns the expected DummyBrowser. + """ + + class DummyBrowser: + pass + + class DummyChromium: + async def connect(self, wss_url): + assert wss_url == 'ws://dummy-wss-url', 'WSS URL should match the configuration.' + return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(wss_url='ws://dummy-wss-url') + browser_obj = Browser(config=config) + result_browser = await browser_obj.get_playwright_browser() + assert isinstance(result_browser, DummyBrowser), 'Expected DummyBrowser from _setup_wss' + await browser_obj.close() + + @pytest.mark.asyncio -async def test_chrome_instance_browser_launch(monkeypatch): - """ - Test that when a chrome instance path is provided the Browser class uses - _setup_browser_with_instance branch and returns the expected DummyBrowser object - by reusing an existing Chrome instance. - """ - # Dummy response for requests.get when checking chrome debugging endpoint. 
- class DummyResponse: - status_code = 200 - def dummy_get(url, timeout): - if url == "http://localhost:9222/json/version": - return DummyResponse() - raise requests.ConnectionError("Connection failed") - monkeypatch.setattr(requests, "get", dummy_get) - class DummyBrowser: - pass - class DummyChromium: - async def connect_over_cdp(self, endpoint_url, timeout=20000): - assert endpoint_url == "http://localhost:9222", "Endpoint URL must be 'http://localhost:9222'" - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(chrome_instance_path="dummy/chrome", extra_chromium_args=["--dummy-arg"]) - browser_obj = Browser(config=config) - result_browser = await browser_obj.get_playwright_browser() - assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_browser_with_instance" - await browser_obj.close() +async def test_user_provided_browser_launch(monkeypatch): + """ + Test that when a browser_binary_path is provided the Browser class uses + _setup_user_provided_browser branch and returns the expected DummyBrowser object + by reusing an existing Chrome instance. + """ + + # Dummy response for requests.get when checking chrome debugging endpoint. + class DummyResponse: + status_code = 200 + + def dummy_get(url, timeout): + if url == 'http://localhost:9222/json/version': + return DummyResponse() + raise requests.ConnectionError('Connection failed') + + monkeypatch.setattr(requests, 'get', dummy_get) + + class DummyBrowser: + pass + + class DummyChromium: + async def connect_over_cdp(self, endpoint_url, timeout=20000): + assert endpoint_url == 'http://localhost:9222', "Endpoint URL must be 'http://localhost:9222'" + return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(browser_binary_path='dummy/chrome', extra_browser_args=['--dummy-arg']) + browser_obj = Browser(config=config) + result_browser = await browser_obj.get_playwright_browser() + assert isinstance(result_browser, DummyBrowser), 'Expected DummyBrowser from _setup_user_provided_browser' + await browser_obj.close() + + @pytest.mark.asyncio -async def test_standard_browser_disable_security_args(monkeypatch): - """ - Test that the standard browser launch includes disable-security arguments when disable_security is True. - This verifies that _setup_standard_browser correctly appends the security disabling arguments along with - the base arguments and any extra arguments provided. - """ - # These are the base arguments defined in _setup_standard_browser. 
- base_args = [ - '--no-sandbox', - '--disable-blink-features=AutomationControlled', - '--disable-infobars', - '--disable-background-timer-throttling', - '--disable-popup-blocking', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-window-activation', - '--disable-focus-on-load', - '--no-first-run', - '--no-default-browser-check', - '--no-startup-window', - '--window-position=0,0', - ] - # When disable_security is True, these arguments should be added. - disable_security_args = [ - '--disable-web-security', - '--disable-site-isolation-trials', - '--disable-features=IsolateOrigins,site-per-process' - ] - # Additional arbitrary argument for testing extra args - extra_args = ["--dummy-extra"] - class DummyBrowser: - pass - class DummyChromium: - async def launch(self, headless, args, proxy=None): - # Expected args is the base args plus disable security args and the extra args. - expected_args = base_args + disable_security_args + extra_args - assert headless is True, "Expected headless to be True" - assert args == expected_args, f"Expected args {expected_args}, but got {args}" - assert proxy is None, "Expected proxy to be None" - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(headless=True, disable_security=True, extra_chromium_args=extra_args) - browser_obj = Browser(config=config) - result_browser = await browser_obj.get_playwright_browser() - assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_standard_browser with disable_security active" - await browser_obj.close() +async def test_builtin_browser_disable_security_args(monkeypatch): + """ + Test that the standard browser launch includes disable-security arguments when disable_security is True. + This verifies that _setup_builtin_browser correctly appends the security disabling arguments along with + the base arguments and any extra arguments provided. + """ + # These are the base arguments defined in _setup_builtin_browser. + base_args = [ + '--no-sandbox', + '--disable-blink-features=AutomationControlled', + '--disable-infobars', + '--disable-background-timer-throttling', + '--disable-popup-blocking', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + '--disable-window-activation', + '--disable-focus-on-load', + '--no-first-run', + '--no-default-browser-check', + '--no-startup-window', + '--window-position=0,0', + ] + # When disable_security is True, these arguments should be added. + disable_security_args = [ + '--disable-web-security', + '--disable-site-isolation-trials', + '--disable-features=IsolateOrigins,site-per-process', + ] + # Additional arbitrary argument for testing extra args + extra_args = ['--dummy-extra'] + + class DummyBrowser: + pass + + class DummyChromium: + async def launch(self, headless, args, proxy=None): + # Expected args is the base args plus disable security args and the extra args. 
+ expected_args = base_args + disable_security_args + extra_args + assert headless is True, 'Expected headless to be True' + assert args == expected_args, f'Expected args {expected_args}, but got {args}' + assert proxy is None, 'Expected proxy to be None' + return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(headless=True, disable_security=True, extra_browser_args=extra_args) + browser_obj = Browser(config=config) + result_browser = await browser_obj.get_playwright_browser() + assert isinstance(result_browser, DummyBrowser), ( + 'Expected DummyBrowser from _setup_builtin_browser with disable_security active' + ) + await browser_obj.close() + + @pytest.mark.asyncio async def test_new_context_creation(): - """ - Test that the new_context method returns a BrowserContext with the correct attributes. - This verifies that the BrowserContext is initialized with the provided Browser instance and configuration. - """ - config = BrowserConfig() - browser_obj = Browser(config=config) - custom_context_config = BrowserContextConfig() - context = await browser_obj.new_context(custom_context_config) - assert isinstance(context, BrowserContext), "Expected new_context to return an instance of BrowserContext" - assert context.browser is browser_obj, "Expected the context's browser attribute to be the Browser instance" - assert context.config == custom_context_config, "Expected the context's config attribute to be the provided config" - await browser_obj.close() + """ + Test that the new_context method returns a BrowserContext with the correct attributes. + This verifies that the BrowserContext is initialized with the provided Browser instance and configuration. + """ + config = BrowserConfig() + browser_obj = Browser(config=config) + custom_context_config = BrowserContextConfig() + context = await browser_obj.new_context(custom_context_config) + assert isinstance(context, BrowserContext), 'Expected new_context to return an instance of BrowserContext' + assert context.browser is browser_obj, "Expected the context's browser attribute to be the Browser instance" + assert context.config == custom_context_config, "Expected the context's config attribute to be the provided config" + await browser_obj.close() + + @pytest.mark.asyncio -async def test_chrome_instance_browser_launch_failure(monkeypatch): - """ - Test that when a Chrome instance cannot be started or connected to, - the Browser._setup_browser_with_instance branch eventually raises a RuntimeError. - We simulate failure by: - - Forcing requests.get to always raise a ConnectionError (so no existing instance is found). - - Monkeypatching subprocess.Popen to do nothing. - - Replacing asyncio.sleep to avoid delays. - - Having the dummy playwright's connect_over_cdp method always raise an Exception. 
- """ - def dummy_get(url, timeout): - raise requests.ConnectionError("Simulated connection failure") - monkeypatch.setattr(requests, "get", dummy_get) - monkeypatch.setattr(subprocess, "Popen", lambda args, stdout, stderr: None) - async def fake_sleep(seconds): - return - monkeypatch.setattr(asyncio, "sleep", fake_sleep) - class DummyChromium: - async def connect_over_cdp(self, endpoint_url, timeout=20000): - raise Exception("Connection failed simulation") - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(chrome_instance_path="dummy/chrome", extra_chromium_args=["--dummy-arg"]) - browser_obj = Browser(config=config) - with pytest.raises(RuntimeError, match="To start chrome in Debug mode"): - await browser_obj.get_playwright_browser() - await browser_obj.close() +async def test_user_provided_browser_launch_failure(monkeypatch): + """ + Test that when a Chrome instance cannot be started or connected to, + the Browser._setup_user_provided_browser branch eventually raises a RuntimeError. + We simulate failure by: + - Forcing requests.get to always raise a ConnectionError (so no existing instance is found). + - Monkeypatching subprocess.Popen to do nothing. + - Replacing asyncio.sleep to avoid delays. + - Having the dummy playwright's connect_over_cdp method always raise an Exception. + """ + + def dummy_get(url, timeout): + raise requests.ConnectionError('Simulated connection failure') + + monkeypatch.setattr(requests, 'get', dummy_get) + monkeypatch.setattr(subprocess, 'Popen', lambda args, stdout, stderr: None) + + async def fake_sleep(seconds): + return + + monkeypatch.setattr(asyncio, 'sleep', fake_sleep) + + class DummyChromium: + async def connect_over_cdp(self, endpoint_url, timeout=20000): + raise Exception('Connection failed simulation') + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(browser_binary_path='dummy/chrome', extra_browser_args=['--dummy-arg']) + browser_obj = Browser(config=config) + with pytest.raises(RuntimeError, match='To start chrome in Debug mode'): + await browser_obj.get_playwright_browser() + await browser_obj.close() + + @pytest.mark.asyncio async def test_get_playwright_browser_caching(monkeypatch): - """ - Test that get_playwright_browser returns a cached browser instance. - On the first call, the browser is initialized; on subsequent calls, - the same instance is returned. 
- """ - class DummyBrowser: - pass - class DummyChromium: - async def launch(self, headless, args, proxy=None): - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - config = BrowserConfig(headless=True, disable_security=False, extra_chromium_args=["--test"]) - browser_obj = Browser(config=config) - first_browser = await browser_obj.get_playwright_browser() - second_browser = await browser_obj.get_playwright_browser() - assert first_browser is second_browser, "Expected the browser to be cached and reused across calls." - await browser_obj.close() + """ + Test that get_playwright_browser returns a cached browser instance. + On the first call, the browser is initialized; on subsequent calls, + the same instance is returned. + """ + + class DummyBrowser: + pass + + class DummyChromium: + async def launch(self, headless, args, proxy=None): + return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + config = BrowserConfig(headless=True, disable_security=False, extra_browser_args=['--test']) + browser_obj = Browser(config=config) + first_browser = await browser_obj.get_playwright_browser() + second_browser = await browser_obj.get_playwright_browser() + assert first_browser is second_browser, 'Expected the browser to be cached and reused across calls.' + await browser_obj.close() + + @pytest.mark.asyncio async def test_close_error_handling(monkeypatch): - """ - Test that the close method properly handles exceptions thrown by - playwright_browser.close() and playwright.stop(), ensuring that the - browser's attributes are set to None even if errors occur. - """ - class DummyBrowserWithError: - async def close(self): - raise Exception("Close error simulation") - class DummyPlaywrightWithError: - async def stop(self): - raise Exception("Stop error simulation") - config = BrowserConfig() - browser_obj = Browser(config=config) - browser_obj.playwright_browser = DummyBrowserWithError() - browser_obj.playwright = DummyPlaywrightWithError() - await browser_obj.close() - assert browser_obj.playwright_browser is None, "Expected playwright_browser to be None after close" - assert browser_obj.playwright is None, "Expected playwright to be None after close" + """ + Test that the close method properly handles exceptions thrown by + playwright_browser.close() and playwright.stop(), ensuring that the + browser's attributes are set to None even if errors occur. 
+ """ + + class DummyBrowserWithError: + async def close(self): + raise Exception('Close error simulation') + + class DummyPlaywrightWithError: + async def stop(self): + raise Exception('Stop error simulation') + + config = BrowserConfig() + browser_obj = Browser(config=config) + browser_obj.playwright_browser = DummyBrowserWithError() + browser_obj.playwright = DummyPlaywrightWithError() + await browser_obj.close() + assert browser_obj.playwright_browser is None, 'Expected playwright_browser to be None after close' + assert browser_obj.playwright is None, 'Expected playwright to be None after close' + + @pytest.mark.asyncio async def test_standard_browser_launch_with_proxy(monkeypatch): - """ - Test that when a proxy is provided in the BrowserConfig, the _setup_standard_browser method - correctly passes the proxy parameter to the playwright.chromium.launch method. - This test sets up a dummy async_playwright context and verifies that the dummy proxy is received. - """ - class DummyBrowser: - pass - # Create a dummy proxy settings instance. - dummy_proxy = ProxySettings(server="http://dummy.proxy") - class DummyChromium: - async def launch(self, headless, args, proxy=None): - # Assert that the proxy passed equals the dummy proxy provided in the configuration. - assert proxy == dummy_proxy, f"Expected proxy {dummy_proxy} but got {proxy}" - # We can also verify some base parameters if needed (headless, args) but our focus is proxy. - return DummyBrowser() - class DummyPlaywright: - def __init__(self): - self.chromium = DummyChromium() - async def stop(self): - pass - class DummyAsyncPlaywrightContext: - async def start(self): - return DummyPlaywright() - # Monkeypatch async_playwright to return our dummy async playwright context. - monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext()) - # Create a BrowserConfig with the dummy proxy. - config = BrowserConfig(headless=False, disable_security=False, proxy=dummy_proxy) - browser_obj = Browser(config=config) - # Call get_playwright_browser and verify that the returned browser is as expected. - result_browser = await browser_obj.get_playwright_browser() - assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_standard_browser with proxy provided" - await browser_obj.close() \ No newline at end of file + """ + Test that when a proxy is provided in the BrowserConfig, the _setup_builtin_browser method + correctly passes the proxy parameter to the playwright.chromium.launch method. + This test sets up a dummy async_playwright context and verifies that the dummy proxy is received. + """ + + class DummyBrowser: + pass + + # Create a dummy proxy settings instance. + dummy_proxy = ProxySettings(server='http://dummy.proxy') + + class DummyChromium: + async def launch(self, headless, args, proxy=None): + # Assert that the proxy passed equals the dummy proxy provided in the configuration. + assert isinstance(proxy, dict) and proxy['server'] == 'http://dummy.proxy', ( + f'Expected proxy {dummy_proxy} but got {proxy}' + ) + # We can also verify some base parameters if needed (headless, args) but our focus is proxy. + return DummyBrowser() + + class DummyPlaywright: + def __init__(self): + self.chromium = DummyChromium() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + # Monkeypatch async_playwright to return our dummy async playwright context. 
+ monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + # Create a BrowserConfig with the dummy proxy. + config = BrowserConfig(headless=False, disable_security=False, proxy=dummy_proxy) + browser_obj = Browser(config=config) + # Call get_playwright_browser and verify that the returned browser is as expected. + result_browser = await browser_obj.get_playwright_browser() + assert isinstance(result_browser, DummyBrowser), 'Expected DummyBrowser from _setup_builtin_browser with proxy provided' + await browser_obj.close() + + +@pytest.mark.asyncio +async def test_browser_window_size(monkeypatch): + """ + Test that when a browser_window_size is provided in BrowserContextConfig, + it's properly converted to a dictionary when passed to Playwright. + """ + + class DummyPage: + def __init__(self): + self.url = 'about:blank' + + async def goto(self, url): + pass + + async def wait_for_load_state(self): + pass + + async def title(self): + return 'Test Page' + + async def bring_to_front(self): + pass + + async def evaluate(self, script): + return True + + def is_closed(self): + return False + + class DummyContext: + def __init__(self): + self.pages = [DummyPage()] + self.tracing = self + + async def new_page(self): + return DummyPage() + + async def add_init_script(self, script): + pass + + async def start(self): + pass + + async def stop(self, path=None): + pass + + def on(self, event, handler): + pass + + async def close(self): + pass + + class DummyBrowser: + def __init__(self): + self.contexts = [] + + async def new_context(self, **kwargs): + # Assert that record_video_size is a dictionary with expected values + assert isinstance(kwargs['record_video_size'], dict), ( + f'Expected record_video_size to be a dictionary, got {type(kwargs["record_video_size"])}' + ) + assert kwargs['record_video_size']['width'] == 1280, ( + f'Expected width to be 1280, got {kwargs["record_video_size"].get("width")}' + ) + assert kwargs['record_video_size']['height'] == 1100, ( + f'Expected height to be 1100, got {kwargs["record_video_size"].get("height")}' + ) + + context = DummyContext() + self.contexts.append(context) + return context + + async def close(self): + pass + + class DummyPlaywright: + def __init__(self): + self.chromium = self + + async def launch(self, **kwargs): + return DummyBrowser() + + async def stop(self): + pass + + class DummyAsyncPlaywrightContext: + async def start(self): + return DummyPlaywright() + + # Monkeypatch async_playwright to return our dummy async playwright context + monkeypatch.setattr('browser_use.browser.browser.async_playwright', lambda: DummyAsyncPlaywrightContext()) + + # Create browser with default config + browser_obj = Browser() + + # Get browser instance + playwright_browser = await browser_obj.get_playwright_browser() + + # Create context config with specific window size + context_config = BrowserContextConfig(browser_window_size={'width': 1280, 'height': 1100}) + + # Create browser context - this will test if browser_window_size is properly converted + browser_context = BrowserContext(browser=browser_obj, config=context_config) + await browser_context._initialize_session() + + # Clean up + await browser_context.close() + await browser_obj.close() diff --git a/tests/test_browser_config_models.py b/tests/test_browser_config_models.py new file mode 100644 index 0000000000..9d3ea91719 --- /dev/null +++ b/tests/test_browser_config_models.py @@ -0,0 +1,209 @@ +import os + +import pytest + +from browser_use.browser.browser 
import Browser, BrowserConfig, ProxySettings +from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize + + +@pytest.mark.asyncio +async def test_proxy_settings_pydantic_model(): + """ + Test that ProxySettings as a Pydantic model is correctly converted to a dictionary when used. + """ + # Create ProxySettings with Pydantic model + proxy_settings = ProxySettings( + server='http://example.proxy:8080', bypass='localhost', username='testuser', password='testpass' + ) + + # Verify the model has correct dict-like access + assert proxy_settings['server'] == 'http://example.proxy:8080' + assert proxy_settings.get('bypass') == 'localhost' + assert proxy_settings.get('nonexistent', 'default') == 'default' + + # Verify model_dump works correctly + proxy_dict = proxy_settings.model_dump() + assert isinstance(proxy_dict, dict) + assert proxy_dict['server'] == 'http://example.proxy:8080' + assert proxy_dict['bypass'] == 'localhost' + assert proxy_dict['username'] == 'testuser' + assert proxy_dict['password'] == 'testpass' + + # We don't launch the actual browser - we just verify the model itself works as expected + + +@pytest.mark.asyncio +async def test_window_size_pydantic_model(): + """ + Test that BrowserContextWindowSize as a Pydantic model is correctly converted to a dictionary when used. + """ + # Create BrowserContextWindowSize with Pydantic model + window_size = BrowserContextWindowSize(width=1280, height=1100) + + # Verify the model has correct dict-like access + assert window_size['width'] == 1280 + assert window_size.get('height') == 1100 + assert window_size.get('nonexistent', 'default') == 'default' + + # Verify model_dump works correctly + window_dict = window_size.model_dump() + assert isinstance(window_dict, dict) + assert window_dict['width'] == 1280 + assert window_dict['height'] == 1100 + + # Create a context config with the window size and test initialization + config = BrowserContextConfig(browser_window_size=window_size) + assert config.browser_window_size == window_size + + # You can also create from a dictionary + config2 = BrowserContextConfig(browser_window_size={'width': 1920, 'height': 1080}) + assert isinstance(config2.browser_window_size, BrowserContextWindowSize) + assert config2.browser_window_size.width == 1920 + assert config2.browser_window_size.height == 1080 + + +@pytest.mark.asyncio +@pytest.mark.skipif(os.environ.get('CI') == 'true', reason='Skip browser test in CI') +async def test_window_size_with_real_browser(): + """ + Integration test that verifies our window size Pydantic model is correctly + passed to Playwright and the actual browser window is configured with these settings. + This test is skipped in CI environments. 
+ """ + # Create window size with specific dimensions we can check + window_size = BrowserContextWindowSize(width=1024, height=768) + + # Create browser config with headless mode + browser_config = BrowserConfig( + headless=True, # Use headless for faster test + ) + + # Create context config with our window size + context_config = BrowserContextConfig( + browser_window_size=window_size, + maximum_wait_page_load_time=2.0, # Faster timeouts for test + minimum_wait_page_load_time=0.2, + no_viewport=True, # Use actual window size instead of viewport + ) + + # Create browser and context + browser = Browser(config=browser_config) + try: + # Initialize browser + playwright_browser = await browser.get_playwright_browser() + assert playwright_browser is not None, 'Browser initialization failed' + + # Create context + browser_context = BrowserContext(browser=browser, config=context_config) + try: + # Initialize session + await browser_context._initialize_session() + + # Get the current page + page = await browser_context.get_current_page() + assert page is not None, 'Failed to get current page' + + # Get the context configuration used for browser window size + video_size = await page.evaluate(""" + () => { + // This returns information about the context recording settings + // which should match our configured video size (browser_window_size) + try { + const settings = window.getPlaywrightContextSettings ? + window.getPlaywrightContextSettings() : null; + if (settings && settings.recordVideo) { + return settings.recordVideo.size; + } + } catch (e) {} + + // Fallback to window dimensions + return { + width: window.innerWidth, + height: window.innerHeight + }; + } + """) + + # Let's also check the viewport size + viewport_size = await page.evaluate(""" + () => { + return { + width: window.innerWidth, + height: window.innerHeight + } + } + """) + + print(f'Window size config: {window_size.model_dump()}') + print(f'Browser viewport size: {viewport_size}') + + # This is a lightweight test to verify that the page has a size (details may vary by browser) + assert viewport_size['width'] > 0, 'Expected viewport width to be positive' + assert viewport_size['height'] > 0, 'Expected viewport height to be positive' + + # For browser context creation in record_video_size, this is what truly matters + # Verify that our window size was properly serialized to a dictionary + print(f'Content of context session: {browser_context.session.context}') + print('✅ Browser window size used in the test') + finally: + # Clean up context + await browser_context.close() + finally: + # Clean up browser + await browser.close() + + +@pytest.mark.asyncio +async def test_proxy_with_real_browser(): + """ + Integration test that verifies our proxy Pydantic model is correctly + passed to Playwright without requiring a working proxy server. + + This test: + 1. Creates a ProxySettings Pydantic model + 2. Passes it to BrowserConfig + 3. Verifies browser initialization works (proving the model was correctly serialized) + 4. 
We don't actually verify proxy functionality (would require a working proxy) + """ + # Create proxy settings with a fake proxy server + proxy_settings = ProxySettings( + server='http://non.existent.proxy:9999', bypass='localhost', username='testuser', password='testpass' + ) + + # Test model serialization + proxy_dict = proxy_settings.model_dump() + assert isinstance(proxy_dict, dict) + assert proxy_dict['server'] == 'http://non.existent.proxy:9999' + + # Create browser config with proxy + browser_config = BrowserConfig( + headless=True, + proxy=proxy_settings, + ) + + # Create browser + browser = Browser(config=browser_config) + try: + # Initialize browser - this should succeed even with invalid proxy + # because we're just checking configuration, not actual proxy functionality + try: + playwright_browser = await browser.get_playwright_browser() + assert playwright_browser is not None, 'Browser initialization failed' + + # Success - the browser was initialized with our proxy settings + # We won't try to make requests (which would fail with non-existent proxy) + print('✅ Browser initialized with proxy settings successfully') + + # We can inspect browser settings here to verify proxy was passed + # but the specific API to access these settings depends on the browser + + except Exception as e: + # Make sure any exception isn't related to the proxy configuration format + # (Network errors due to non-existent proxy are acceptable, invalid type conversion isn't) + error_text = str(e).lower() + assert 'proxy' not in error_text or any( + term in error_text for term in ['connect', 'connection', 'network', 'timeout', 'unreachable'] + ), f'Proxy configuration error (not network error): {e}' + finally: + # Clean up browser + await browser.close() diff --git a/tests/test_context.py b/tests/test_context.py index 385f70f48b..9d59951e46 100644 --- a/tests/test_context.py +++ b/tests/test_context.py @@ -1,326 +1,362 @@ -import asyncio import base64 -import os +from unittest.mock import Mock + import pytest + from browser_use.browser.context import BrowserContext, BrowserContextConfig from browser_use.browser.views import BrowserState from browser_use.dom.views import DOMElementNode -from unittest.mock import Mock + def test_is_url_allowed(): - """ - Test the _is_url_allowed method to verify that it correctly checks URLs against - the allowed domains configuration. - Scenario 1: When allowed_domains is None, all URLs should be allowed. - Scenario 2: When allowed_domains is a list, only URLs matching the allowed domain(s) are allowed. - Scenario 3: When the URL is malformed, it should return False. - """ - # Create a dummy Browser mock. Only the 'config' attribute is needed for _is_url_allowed. - dummy_browser = Mock() - # Set an empty config for dummy_browser; it won't be used in _is_url_allowed. - dummy_browser.config = Mock() - # Scenario 1: allowed_domains is None, any URL should be allowed. - config1 = BrowserContextConfig(allowed_domains=None) - context1 = BrowserContext(browser=dummy_browser, config=config1) - assert context1._is_url_allowed("http://anydomain.com") is True - assert context1._is_url_allowed("https://anotherdomain.org/path") is True - # Scenario 2: allowed_domains is provided. 
- allowed = ["example.com", "mysite.org"] - config2 = BrowserContextConfig(allowed_domains=allowed) - context2 = BrowserContext(browser=dummy_browser, config=config2) - # URL exactly matching - assert context2._is_url_allowed("http://example.com") is True - # URL with subdomain (should be allowed) - assert context2._is_url_allowed("http://sub.example.com/path") is True - # URL with different domain (should not be allowed) - assert context2._is_url_allowed("http://notexample.com") is False - # URL that matches second allowed domain - assert context2._is_url_allowed("https://mysite.org/page") is True - # URL with port number, still allowed (port is stripped) - assert context2._is_url_allowed("http://example.com:8080") is True - # Scenario 3: Malformed URL or empty domain - # urlparse will return an empty netloc for some malformed URLs. - assert context2._is_url_allowed("notaurl") is False + """ + Test the _is_url_allowed method to verify that it correctly checks URLs against + the allowed domains configuration. + Scenario 1: When allowed_domains is None, all URLs should be allowed. + Scenario 2: When allowed_domains is a list, only URLs matching the allowed domain(s) are allowed. + Scenario 3: When the URL is malformed, it should return False. + """ + # Create a dummy Browser mock. Only the 'config' attribute is needed for _is_url_allowed. + dummy_browser = Mock() + # Set an empty config for dummy_browser; it won't be used in _is_url_allowed. + dummy_browser.config = Mock() + # Scenario 1: allowed_domains is None, any URL should be allowed. + config1 = BrowserContextConfig(allowed_domains=None) + context1 = BrowserContext(browser=dummy_browser, config=config1) + assert context1._is_url_allowed('http://anydomain.com') is True + assert context1._is_url_allowed('https://anotherdomain.org/path') is True + # Scenario 2: allowed_domains is provided. + allowed = ['example.com', 'mysite.org'] + config2 = BrowserContextConfig(allowed_domains=allowed) + context2 = BrowserContext(browser=dummy_browser, config=config2) + # URL exactly matching + assert context2._is_url_allowed('http://example.com') is True + # URL with subdomain (should be allowed) + assert context2._is_url_allowed('http://sub.example.com/path') is True + # URL with different domain (should not be allowed) + assert context2._is_url_allowed('http://notexample.com') is False + # URL that matches second allowed domain + assert context2._is_url_allowed('https://mysite.org/page') is True + # URL with port number, still allowed (port is stripped) + assert context2._is_url_allowed('http://example.com:8080') is True + # Scenario 3: Malformed URL or empty domain + # urlparse will return an empty netloc for some malformed URLs. + assert context2._is_url_allowed('notaurl') is False + + def test_convert_simple_xpath_to_css_selector(): - """ - Test the _convert_simple_xpath_to_css_selector method of BrowserContext. - This verifies that simple XPath expressions (with and without indices) are correctly converted to CSS selectors. 
- """ - # Test empty xpath returns empty string - assert BrowserContext._convert_simple_xpath_to_css_selector('') == '' - # Test a simple xpath without indices - xpath = "/html/body/div/span" - expected = "html > body > div > span" - result = BrowserContext._convert_simple_xpath_to_css_selector(xpath) - assert result == expected - # Test xpath with an index on one element: [2] should translate to :nth-of-type(2) - xpath = "/html/body/div[2]/span" - expected = "html > body > div:nth-of-type(2) > span" - result = BrowserContext._convert_simple_xpath_to_css_selector(xpath) - assert result == expected - # Test xpath with indices on multiple elements: - # For "li[3]" -> li:nth-of-type(3) and for "a[1]" -> a:nth-of-type(1) - xpath = "/ul/li[3]/a[1]" - expected = "ul > li:nth-of-type(3) > a:nth-of-type(1)" - result = BrowserContext._convert_simple_xpath_to_css_selector(xpath) - assert result == expected + """ + Test the _convert_simple_xpath_to_css_selector method of BrowserContext. + This verifies that simple XPath expressions (with and without indices) are correctly converted to CSS selectors. + """ + # Test empty xpath returns empty string + assert BrowserContext._convert_simple_xpath_to_css_selector('') == '' + # Test a simple xpath without indices + xpath = '/html/body/div/span' + expected = 'html > body > div > span' + result = BrowserContext._convert_simple_xpath_to_css_selector(xpath) + assert result == expected + # Test xpath with an index on one element: [2] should translate to :nth-of-type(2) + xpath = '/html/body/div[2]/span' + expected = 'html > body > div:nth-of-type(2) > span' + result = BrowserContext._convert_simple_xpath_to_css_selector(xpath) + assert result == expected + # Test xpath with indices on multiple elements: + # For "li[3]" -> li:nth-of-type(3) and for "a[1]" -> a:nth-of-type(1) + xpath = '/ul/li[3]/a[1]' + expected = 'ul > li:nth-of-type(3) > a:nth-of-type(1)' + result = BrowserContext._convert_simple_xpath_to_css_selector(xpath) + assert result == expected + + def test_get_initial_state(): - """ - Test the _get_initial_state method to verify it returns the correct initial BrowserState. - The test checks that when a dummy page with a URL is provided, - the returned state contains that URL and other default values. - """ - # Create a dummy browser since only its existence is needed. - dummy_browser = Mock() - dummy_browser.config = Mock() - context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - # Define a dummy page with a 'url' attribute. - class DummyPage: - url = "http://dummy.com" - dummy_page = DummyPage() - # Call _get_initial_state with a page: URL should be set from page.url. - state_with_page = context._get_initial_state(page=dummy_page) - assert state_with_page.url == dummy_page.url - # Verify that the element_tree is initialized with tag 'root' - assert state_with_page.element_tree.tag_name == 'root' - # Call _get_initial_state without a page: URL should be empty. - state_without_page = context._get_initial_state() - assert state_without_page.url == "" + """ + Test the _get_initial_state method to verify it returns the correct initial BrowserState. + The test checks that when a dummy page with a URL is provided, + the returned state contains that URL and other default values. + """ + # Create a dummy browser since only its existence is needed. + dummy_browser = Mock() + dummy_browser.config = Mock() + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + + # Define a dummy page with a 'url' attribute. 
+ class DummyPage: + url = 'http://dummy.com' + + dummy_page = DummyPage() + # Call _get_initial_state with a page: URL should be set from page.url. + state_with_page = context._get_initial_state(page=dummy_page) + assert state_with_page.url == dummy_page.url + # Verify that the element_tree is initialized with tag 'root' + assert state_with_page.element_tree.tag_name == 'root' + # Call _get_initial_state without a page: URL should be empty. + state_without_page = context._get_initial_state() + assert state_without_page.url == '' + + @pytest.mark.asyncio async def test_execute_javascript(): - """ - Test the execute_javascript method by mocking the current page's evaluate function. - This ensures that when execute_javascript is called, it correctly returns the value - from the page's evaluate method. - """ - # Define a dummy page with an async evaluate method. - class DummyPage: - async def evaluate(self, script): - return "dummy_result" - # Create a dummy session object with a dummy current_page. - dummy_session = type("DummySession", (), {})() - dummy_session.current_page = DummyPage() - # Create a dummy browser mock with a minimal config. - dummy_browser = Mock() - dummy_browser.config = Mock() - # Initialize the BrowserContext with the dummy browser and config. - context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - # Manually set the session to our dummy session. - context.session = dummy_session - # Call execute_javascript and verify it returns the expected result. - result = await context.execute_javascript("return 1+1") - assert result == "dummy_result" + """ + Test the execute_javascript method by mocking the current page's evaluate function. + This ensures that when execute_javascript is called, it correctly returns the value + from the page's evaluate method. + """ + + # Define a dummy page with an async evaluate method. + class DummyPage: + async def evaluate(self, script): + return 'dummy_result' + + # Create a dummy session object with a dummy current_page. + dummy_session = type('DummySession', (), {})() + dummy_session.current_page = DummyPage() + # Create a dummy browser mock with a minimal config. + dummy_browser = Mock() + dummy_browser.config = Mock() + # Initialize the BrowserContext with the dummy browser and config. + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + # Manually set the session to our dummy session. + context.session = dummy_session + # Call execute_javascript and verify it returns the expected result. + result = await context.execute_javascript('return 1+1') + assert result == 'dummy_result' + + @pytest.mark.asyncio async def test_enhanced_css_selector_for_element(): - """ - Test the _enhanced_css_selector_for_element method to verify that - it returns the correct CSS selector string for a dummy DOMElementNode. - The test checks that: - - The provided xpath is correctly converted (handling indices), - - Class attributes are appended as CSS classes, - - Standard and dynamic attributes (including ones with special characters) - are correctly added to the selector. - """ - # Create a dummy DOMElementNode instance with a complex set of attributes. - dummy_element = DOMElementNode( - tag_name="div", - is_visible=True, - parent=None, - xpath="/html/body/div[2]", - attributes={ - "class": "foo bar", - "id": "my-id", - "placeholder": 'some "quoted" text', - "data-testid": "123" - }, - children=[] - ) - # Call the method with include_dynamic_attributes=True. 
- actual_selector = BrowserContext._enhanced_css_selector_for_element(dummy_element, include_dynamic_attributes=True) - # Expected conversion: - # 1. The xpath "/html/body/div[2]" converts to "html > body > div:nth-of-type(2)". - # 2. The class attribute "foo bar" appends ".foo.bar". - # 3. The "id" attribute is added as [id="my-id"]. - # 4. The "placeholder" attribute contains quotes; it is added as - # [placeholder*="some \"quoted\" text"]. - # 5. The dynamic attribute "data-testid" is added as [data-testid="123"]. - expected_selector = 'html > body > div:nth-of-type(2).foo.bar[id="my-id"][placeholder*="some \\"quoted\\" text"][data-testid="123"]' - assert actual_selector == expected_selector, f"Expected {expected_selector}, but got {actual_selector}" + """ + Test the _enhanced_css_selector_for_element method to verify that + it returns the correct CSS selector string for a dummy DOMElementNode. + The test checks that: + - The provided xpath is correctly converted (handling indices), + - Class attributes are appended as CSS classes, + - Standard and dynamic attributes (including ones with special characters) + are correctly added to the selector. + """ + # Create a dummy DOMElementNode instance with a complex set of attributes. + dummy_element = DOMElementNode( + tag_name='div', + is_visible=True, + parent=None, + xpath='/html/body/div[2]', + attributes={'class': 'foo bar', 'id': 'my-id', 'placeholder': 'some "quoted" text', 'data-testid': '123'}, + children=[], + ) + # Call the method with include_dynamic_attributes=True. + actual_selector = BrowserContext._enhanced_css_selector_for_element(dummy_element, include_dynamic_attributes=True) + # Expected conversion: + # 1. The xpath "/html/body/div[2]" converts to "html > body > div:nth-of-type(2)". + # 2. The class attribute "foo bar" appends ".foo.bar". + # 3. The "id" attribute is added as [id="my-id"]. + # 4. The "placeholder" attribute contains quotes; it is added as + # [placeholder*="some \"quoted\" text"]. + # 5. The dynamic attribute "data-testid" is added as [data-testid="123"]. + expected_selector = ( + 'html > body > div:nth-of-type(2).foo.bar[id="my-id"][placeholder*="some \\"quoted\\" text"][data-testid="123"]' + ) + assert actual_selector == expected_selector, f'Expected {expected_selector}, but got {actual_selector}' + + @pytest.mark.asyncio async def test_get_scroll_info(): - """ - Test the get_scroll_info method by mocking the page's evaluate method. - This dummy page returns preset values for window.scrollY, window.innerHeight, - and document.documentElement.scrollHeight. The test then verifies that the - computed scroll information (pixels_above and pixels_below) match the expected values. - """ - # Define a dummy page with an async evaluate method returning preset values. - class DummyPage: - async def evaluate(self, script): - if "window.scrollY" in script: - return 100 # scrollY - elif "window.innerHeight" in script: - return 500 # innerHeight - elif "document.documentElement.scrollHeight" in script: - return 1200 # total scrollable height - return None - # Create a dummy session with a dummy current_page. - dummy_session = type("DummySession", (), {})() - dummy_session.current_page = DummyPage() - # We also need a dummy context attribute but it won't be used in this test. - dummy_session.context = type("DummyContext", (), {})() - # Create a dummy browser mock. - dummy_browser = Mock() - dummy_browser.config = Mock() - # Initialize BrowserContext with the dummy browser and config. 
- context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - # Manually set the session to our dummy session. - context.session = dummy_session - # Call get_scroll_info on the dummy page. - pixels_above, pixels_below = await context.get_scroll_info(dummy_session.current_page) - # Expected calculations: - # pixels_above = scrollY = 100 - # pixels_below = total_height - (scrollY + innerHeight) = 1200 - (100 + 500) = 600 - assert pixels_above == 100, f"Expected 100 pixels above, got {pixels_above}" - assert pixels_below == 600, f"Expected 600 pixels below, got {pixels_below}" + """ + Test the get_scroll_info method by mocking the page's evaluate method. + This dummy page returns preset values for window.scrollY, window.innerHeight, + and document.documentElement.scrollHeight. The test then verifies that the + computed scroll information (pixels_above and pixels_below) match the expected values. + """ + + # Define a dummy page with an async evaluate method returning preset values. + class DummyPage: + async def evaluate(self, script): + if 'window.scrollY' in script: + return 100 # scrollY + elif 'window.innerHeight' in script: + return 500 # innerHeight + elif 'document.documentElement.scrollHeight' in script: + return 1200 # total scrollable height + return None + + # Create a dummy session with a dummy current_page. + dummy_session = type('DummySession', (), {})() + dummy_session.current_page = DummyPage() + # We also need a dummy context attribute but it won't be used in this test. + dummy_session.context = type('DummyContext', (), {})() + # Create a dummy browser mock. + dummy_browser = Mock() + dummy_browser.config = Mock() + # Initialize BrowserContext with the dummy browser and config. + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + # Manually set the session to our dummy session. + context.session = dummy_session + # Call get_scroll_info on the dummy page. + pixels_above, pixels_below = await context.get_scroll_info(dummy_session.current_page) + # Expected calculations: + # pixels_above = scrollY = 100 + # pixels_below = total_height - (scrollY + innerHeight) = 1200 - (100 + 500) = 600 + assert pixels_above == 100, f'Expected 100 pixels above, got {pixels_above}' + assert pixels_below == 600, f'Expected 600 pixels below, got {pixels_below}' + + @pytest.mark.asyncio async def test_reset_context(): - """ - Test the reset_context method to ensure it correctly closes all existing tabs, - resets the cached state, and creates a new page. - """ - # Dummy Page with close and wait_for_load_state methods. - class DummyPage: - def __init__(self, url="http://dummy.com"): - self.url = url - self.closed = False - async def close(self): - self.closed = True - async def wait_for_load_state(self): - pass - # Dummy Context that holds pages and can create a new page. - class DummyContext: - def __init__(self): - self.pages = [] - async def new_page(self): - new_page = DummyPage(url="") - self.pages.append(new_page) - return new_page - # Create a dummy session with a context containing two pages. - dummy_session = type("DummySession", (), {})() - dummy_context = DummyContext() - page1 = DummyPage(url="http://page1.com") - page2 = DummyPage(url="http://page2.com") - dummy_context.pages.extend([page1, page2]) - dummy_session.context = dummy_context - dummy_session.current_page = page1 - dummy_session.cached_state = None - # Create a dummy browser mock. 
- dummy_browser = Mock() - dummy_browser.config = Mock() - # Initialize BrowserContext using our dummy_browser and config, - # and manually set its session to our dummy session. - context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - context.session = dummy_session - # Confirm session has 2 pages before reset. - assert len(dummy_session.context.pages) == 2 - # Call reset_context which should close existing pages, - # reset the cached state, and create a new page as current_page. - await context.reset_context() - # Verify that initial pages were closed. - assert page1.closed is True - assert page2.closed is True - # Check that a new page is created and set as current_page. - assert dummy_session.current_page is not None - new_page = dummy_session.current_page - # New page URL should be empty as per _get_initial_state. - assert new_page.url == "" - # Verify that cached_state is reset to an initial BrowserState. - state = dummy_session.cached_state - assert isinstance(state, BrowserState) - assert state.url == "" - assert state.element_tree.tag_name == 'root' + """ + Test the reset_context method to ensure it correctly closes all existing tabs, + resets the cached state, and creates a new page. + """ + + # Dummy Page with close and wait_for_load_state methods. + class DummyPage: + def __init__(self, url='http://dummy.com'): + self.url = url + self.closed = False + + async def close(self): + self.closed = True + + async def wait_for_load_state(self): + pass + + # Dummy Context that holds pages and can create a new page. + class DummyContext: + def __init__(self): + self.pages = [] + + async def new_page(self): + new_page = DummyPage(url='') + self.pages.append(new_page) + return new_page + + # Create a dummy session with a context containing two pages. + dummy_session = type('DummySession', (), {})() + dummy_context = DummyContext() + page1 = DummyPage(url='http://page1.com') + page2 = DummyPage(url='http://page2.com') + dummy_context.pages.extend([page1, page2]) + dummy_session.context = dummy_context + dummy_session.current_page = page1 + dummy_session.cached_state = None + # Create a dummy browser mock. + dummy_browser = Mock() + dummy_browser.config = Mock() + # Initialize BrowserContext using our dummy_browser and config, + # and manually set its session to our dummy session. + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + context.session = dummy_session + # Confirm session has 2 pages before reset. + assert len(dummy_session.context.pages) == 2 + # Call reset_context which should close existing pages, + # reset the cached state, and create a new page as current_page. + await context.reset_context() + # Verify that initial pages were closed. + assert page1.closed is True + assert page2.closed is True + # Check that a new page is created and set as current_page. + assert dummy_session.current_page is not None + new_page = dummy_session.current_page + # New page URL should be empty as per _get_initial_state. + assert new_page.url == '' + # Verify that cached_state is reset to an initial BrowserState. + state = dummy_session.cached_state + assert isinstance(state, BrowserState) + assert state.url == '' + assert state.element_tree.tag_name == 'root' + + @pytest.mark.asyncio async def test_take_screenshot(): - """ - Test the take_screenshot method to verify that it returns a base64 encoded screenshot string. - A dummy page with a mocked screenshot method is used, returning a predefined byte string. 
- """ - class DummyPage: - async def screenshot(self, full_page, animations): - # Verify that parameters are forwarded correctly. - assert full_page is True, "full_page parameter was not correctly passed" - assert animations == 'disabled', "animations parameter was not correctly passed" - # Return a test byte string. - return b'test' - # Create a dummy session with the DummyPage as the current_page. - dummy_session = type("DummySession", (), {})() - dummy_session.current_page = DummyPage() - dummy_session.context = None # Not used in this test - # Create a dummy browser mock. - dummy_browser = Mock() - dummy_browser.config = Mock() - # Initialize the BrowserContext with the dummy browser and config. - context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - # Manually set the session to our dummy session. - context.session = dummy_session - # Call take_screenshot and check that it returns the expected base64 encoded string. - result = await context.take_screenshot(full_page=True) - expected = base64.b64encode(b'test').decode('utf-8') - assert result == expected, f"Expected {expected}, but got {result}" + """ + Test the take_screenshot method to verify that it returns a base64 encoded screenshot string. + A dummy page with a mocked screenshot method is used, returning a predefined byte string. + """ + + class DummyPage: + async def screenshot(self, full_page, animations): + # Verify that parameters are forwarded correctly. + assert full_page is True, 'full_page parameter was not correctly passed' + assert animations == 'disabled', 'animations parameter was not correctly passed' + # Return a test byte string. + return b'test' + + # Create a dummy session with the DummyPage as the current_page. + dummy_session = type('DummySession', (), {})() + dummy_session.current_page = DummyPage() + dummy_session.context = None # Not used in this test + # Create a dummy browser mock. + dummy_browser = Mock() + dummy_browser.config = Mock() + # Initialize the BrowserContext with the dummy browser and config. + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + # Manually set the session to our dummy session. + context.session = dummy_session + # Call take_screenshot and check that it returns the expected base64 encoded string. + result = await context.take_screenshot(full_page=True) + expected = base64.b64encode(b'test').decode('utf-8') + assert result == expected, f'Expected {expected}, but got {result}' + + @pytest.mark.asyncio async def test_refresh_page_behavior(): - """ - Test the refresh_page method of BrowserContext to verify that it correctly reloads the current page - and waits for the page's load state. This is done by creating a dummy page that flags when its - reload and wait_for_load_state methods are called. - """ - class DummyPage: - def __init__(self): - self.reload_called = False - self.wait_for_load_state_called = False - async def reload(self): - self.reload_called = True - async def wait_for_load_state(self): - self.wait_for_load_state_called = True - # Create a dummy session with the dummy page as the current_page. - dummy_page = DummyPage() - dummy_session = type("DummySession", (), {})() - dummy_session.current_page = dummy_page - dummy_session.context = None # Not required for this test - # Create a dummy browser mock - dummy_browser = Mock() - dummy_browser.config = Mock() - # Initialize BrowserContext with the dummy browser and config, - # and manually set its session to our dummy session. 
- context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - context.session = dummy_session - # Call refresh_page and verify that reload and wait_for_load_state were called. - await context.refresh_page() - assert dummy_page.reload_called is True, "Expected the page to call reload()" - assert dummy_page.wait_for_load_state_called is True, "Expected the page to call wait_for_load_state()" + """ + Test the refresh_page method of BrowserContext to verify that it correctly reloads the current page + and waits for the page's load state. This is done by creating a dummy page that flags when its + reload and wait_for_load_state methods are called. + """ + + class DummyPage: + def __init__(self): + self.reload_called = False + self.wait_for_load_state_called = False + + async def reload(self): + self.reload_called = True + + async def wait_for_load_state(self): + self.wait_for_load_state_called = True + + # Create a dummy session with the dummy page as the current_page. + dummy_page = DummyPage() + dummy_session = type('DummySession', (), {})() + dummy_session.current_page = dummy_page + dummy_session.context = None # Not required for this test + # Create a dummy browser mock + dummy_browser = Mock() + dummy_browser.config = Mock() + # Initialize BrowserContext with the dummy browser and config, + # and manually set its session to our dummy session. + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + context.session = dummy_session + # Call refresh_page and verify that reload and wait_for_load_state were called. + await context.refresh_page() + assert dummy_page.reload_called is True, 'Expected the page to call reload()' + assert dummy_page.wait_for_load_state_called is True, 'Expected the page to call wait_for_load_state()' + + @pytest.mark.asyncio async def test_remove_highlights_failure(): - """ - Test the remove_highlights method to ensure that if the page.evaluate call fails, - the exception is caught and does not propagate (i.e. the method handles errors gracefully). - """ - # Dummy page that always raises an exception when evaluate is called. - class DummyPage: - async def evaluate(self, script): - raise Exception("dummy error") - # Create a dummy session with the DummyPage as current_page. - dummy_session = type("DummySession", (), {})() - dummy_session.current_page = DummyPage() - dummy_session.context = None # Not used in this test - # Create a dummy browser mock. - dummy_browser = Mock() - dummy_browser.config = Mock() - # Initialize BrowserContext with the dummy browser and configuration. - context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) - context.session = dummy_session - # Call remove_highlights and verify that no exception is raised. - try: - await context.remove_highlights() - except Exception as e: - pytest.fail(f"remove_highlights raised an exception: {e}") \ No newline at end of file + """ + Test the remove_highlights method to ensure that if the page.evaluate call fails, + the exception is caught and does not propagate (i.e. the method handles errors gracefully). + """ + + # Dummy page that always raises an exception when evaluate is called. + class DummyPage: + async def evaluate(self, script): + raise Exception('dummy error') + + # Create a dummy session with the DummyPage as current_page. + dummy_session = type('DummySession', (), {})() + dummy_session.current_page = DummyPage() + dummy_session.context = None # Not used in this test + # Create a dummy browser mock. 
+ dummy_browser = Mock() + dummy_browser.config = Mock() + # Initialize BrowserContext with the dummy browser and configuration. + context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig()) + context.session = dummy_session + # Call remove_highlights and verify that no exception is raised. + try: + await context.remove_highlights() + except Exception as e: + pytest.fail(f'remove_highlights raised an exception: {e}') diff --git a/tests/test_dropdown.py b/tests/test_dropdown.py index 374d432279..a6af88a5e7 100644 --- a/tests/test_dropdown.py +++ b/tests/test_dropdown.py @@ -1,37 +1,40 @@ """ Test dropdown interaction functionality. """ + import pytest + from browser_use.agent.service import Agent from browser_use.agent.views import AgentHistoryList + @pytest.mark.asyncio async def test_dropdown(llm, browser_context): - """Test selecting an option from a dropdown menu.""" - agent = Agent( - task=( - 'go to https://codepen.io/geheimschriftstift/pen/mPLvQz and first get all options for the dropdown and then select the 5th option' - ), - llm=llm, - browser_context=browser_context, - ) - - try: - history: AgentHistoryList = await agent.run(20) - result = history.final_result() - - # Verify dropdown interaction - assert result is not None - assert 'Duck' in result, "Expected 5th option 'Duck' to be selected" - - # Verify dropdown state - element = await browser_context.get_element_by_selector('select') - assert element is not None, "Dropdown element should exist" - - value = await element.evaluate('el => el.value') - assert value == '5', "Dropdown should have 5th option selected" - - except Exception as e: - pytest.fail(f"Dropdown test failed: {str(e)}") - finally: - await browser_context.close() + """Test selecting an option from a dropdown menu.""" + agent = Agent( + task=( + 'go to https://codepen.io/geheimschriftstift/pen/mPLvQz and first get all options for the dropdown and then select the 5th option' + ), + llm=llm, + browser_context=browser_context, + ) + + try: + history: AgentHistoryList = await agent.run(20) + result = history.final_result() + + # Verify dropdown interaction + assert result is not None + assert 'Duck' in result, "Expected 5th option 'Duck' to be selected" + + # Verify dropdown state + element = await browser_context.get_element_by_selector('select') + assert element is not None, 'Dropdown element should exist' + + value = await element.evaluate('el => el.value') + assert value == '5', 'Dropdown should have 5th option selected' + + except Exception as e: + pytest.fail(f'Dropdown test failed: {str(e)}') + finally: + await browser_context.close() diff --git a/tests/test_dropdown_complex.py b/tests/test_dropdown_complex.py index 774e34203d..ffa2efdabf 100644 --- a/tests/test_dropdown_complex.py +++ b/tests/test_dropdown_complex.py @@ -1,41 +1,44 @@ """ Test complex dropdown interaction functionality. 
""" + import pytest + from browser_use.agent.service import Agent from browser_use.agent.views import AgentHistoryList + @pytest.mark.asyncio async def test_dropdown_complex(llm, browser_context): - """Test selecting an option from a complex dropdown menu.""" - agent = Agent( - task=( - 'go to https://codepen.io/shyam-king/pen/pvzpByJ and first get all options for the dropdown and then select the json option' - ), - llm=llm, - browser_context=browser_context, - ) - - try: - history: AgentHistoryList = await agent.run(20) - result = history.final_result() - - # Verify dropdown interaction - assert result is not None - assert 'json' in result.lower(), "Expected 'json' option to be selected" - - # Verify dropdown state - element = await browser_context.get_element_by_selector('.select-selected') - assert element is not None, "Custom dropdown element should exist" - - text = await element.text_content() - assert 'json' in text.lower(), "Dropdown should display json option" - - # Verify the selected option's effect - code_element = await browser_context.get_element_by_selector('pre code') - assert code_element is not None, "Code element should be visible when JSON is selected" - - except Exception as e: - pytest.fail(f"Complex dropdown test failed: {str(e)}") - finally: - await browser_context.close() + """Test selecting an option from a complex dropdown menu.""" + agent = Agent( + task=( + 'go to https://codepen.io/shyam-king/pen/pvzpByJ and first get all options for the dropdown and then select the json option' + ), + llm=llm, + browser_context=browser_context, + ) + + try: + history: AgentHistoryList = await agent.run(20) + result = history.final_result() + + # Verify dropdown interaction + assert result is not None + assert 'json' in result.lower(), "Expected 'json' option to be selected" + + # Verify dropdown state + element = await browser_context.get_element_by_selector('.select-selected') + assert element is not None, 'Custom dropdown element should exist' + + text = await element.text_content() + assert 'json' in text.lower(), 'Dropdown should display json option' + + # Verify the selected option's effect + code_element = await browser_context.get_element_by_selector('pre code') + assert code_element is not None, 'Code element should be visible when JSON is selected' + + except Exception as e: + pytest.fail(f'Complex dropdown test failed: {str(e)}') + finally: + await browser_context.close() diff --git a/tests/test_dropdown_error.py b/tests/test_dropdown_error.py index 4f23bc9324..fe1a28d6d5 100644 --- a/tests/test_dropdown_error.py +++ b/tests/test_dropdown_error.py @@ -20,9 +20,7 @@ # browser = Browser(config=BrowserConfig(headless=False)) agent = Agent( - task=( - 'go to https://codepen.io/shyam-king/pen/emOyjKm and select number "4" and return the output of "selected value"' - ), + task=('go to https://codepen.io/shyam-king/pen/emOyjKm and select number "4" and return the output of "selected value"'), llm=llm, browser_context=BrowserContext( browser=Browser(config=BrowserConfig(headless=False, disable_security=True)), diff --git a/tests/test_gif_path.py b/tests/test_gif_path.py index 3839a1145b..d9e327695d 100644 --- a/tests/test_gif_path.py +++ b/tests/test_gif_path.py @@ -19,25 +19,22 @@ llm = ChatOpenAI(model='gpt-4o') agent = Agent( - task=( - 'go to google.com and search for text "hi there"' - ), + task=('go to google.com and search for text "hi there"'), llm=llm, browser_context=BrowserContext( browser=Browser(config=BrowserConfig(headless=False, disable_security=True)), ), - 
generate_gif="./google.gif" + generate_gif='./google.gif', ) async def test_gif_path(): - if os.path.exists("./google.gif"): - os.unlink("./google.gif") + if os.path.exists('./google.gif'): + os.unlink('./google.gif') history: AgentHistoryList = await agent.run(20) result = history.final_result() assert result is not None - assert os.path.exists("./google.gif"), "google.gif was not created" - + assert os.path.exists('./google.gif'), 'google.gif was not created' diff --git a/tests/test_mind2web.py b/tests/test_mind2web.py index 1bbd3c9079..756f18568c 100644 --- a/tests/test_mind2web.py +++ b/tests/test_mind2web.py @@ -83,7 +83,7 @@ async def test_random_samples(test_cases: List[Dict[str, Any]], llm, context, va samples = random.sample(test_cases, 1) for i, case in enumerate(samples, 1): - task = f"Go to {case['website']}.com and {case['confirmed_task']}" + task = f'Go to {case["website"]}.com and {case["confirmed_task"]}' logger.info(f'--- Random Sample {i}/{len(samples)} ---') logger.info(f'Task: {task}\n') diff --git a/tests/test_models.py b/tests/test_models.py index 0d5f30b7ec..a3865ce9fe 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -126,7 +126,7 @@ async def test_model_search(llm, context): try: response = requests.get('http://127.0.0.1:11434/') if response.status_code != 200: - raise + raise Exception('Ollama is not running - start with `ollama start`') except Exception: raise Exception('Ollama is not running - start with `ollama start`') diff --git a/tests/test_service.py b/tests/test_service.py index 340085bff7..f50f022ca4 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -122,7 +122,7 @@ async def test_step_error_handling(self): assert len(agent._last_result) == 1 assert isinstance(agent._last_result[0], ActionResult) assert 'Test error' in agent._last_result[0].error - assert agent._last_result[0].include_in_memory == True + assert agent._last_result[0].include_in_memory is True class TestRegistry: diff --git a/tests/test_vision.py b/tests/test_vision.py index 91c01b6672..9851b1a6fe 100644 --- a/tests/test_vision.py +++ b/tests/test_vision.py @@ -23,7 +23,7 @@ # use this test to ask the model questions about the page like # which color do you see for bbox labels, list all with their label -# whats the smallest bboxes with labels and +# what's the smallest bboxes with labels and @controller.registry.action(description='explain what you see on the screen and ask user for input') @@ -40,7 +40,7 @@ async def done(text: str) -> str: agent = Agent( - task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to expalin it and get the next question', + task='call explain_screen all the time the user asks you questions e.g. 
about the page like bbox which you see are labels - your task is to explain it and get the next question',
 	llm=llm,
 	controller=controller,
 	browser=Browser(config=BrowserConfig(disable_security=True, headless=False)),
diff --git a/tests/test_wait_for_element.py b/tests/test_wait_for_element.py
new file mode 100644
index 0000000000..4ffe51cff7
--- /dev/null
+++ b/tests/test_wait_for_element.py
@@ -0,0 +1,68 @@
+import asyncio
+import os
+import sys
+
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+if project_root not in sys.path:
+	sys.path.insert(0, project_root)
+
+import pytest
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+# Local package imports
+from browser_use import Agent, Controller
+
+# Browser and context classes
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+# Load environment variables.
+load_dotenv()
+
+# Initialize language model and controller.
+llm = ChatOpenAI(model='gpt-4o')
+controller = Controller()
+
+
+@pytest.mark.skip(reason='this is for local testing only')
+async def test_wait_for_element():
+	"""Test 'Wait for element' action."""
+
+	initial_actions = [
+		{'open_tab': {'url': 'https://pypi.org/'}},
+		# Uncomment the line below to include the wait action in initial actions.
+		# {'wait_for_element': {'selector': '#search', 'timeout': 30}},
+	]
+
+	# Set up the browser context.
+	context = BrowserContext(
+		browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
+	)
+
+	# Create the agent with the task.
+	agent = Agent(
+		task="Wait for element '#search' to be visible with a timeout of 30 seconds.",
+		llm=llm,
+		browser_context=context,
+		initial_actions=initial_actions,
+		controller=controller,
+	)
+
+	# Run the agent for a few steps to trigger navigation and then the wait action.
+	history = await agent.run(max_steps=3)
+	action_names = history.action_names()
+
+	# Ensure that the wait_for_element action was executed.
+	assert 'wait_for_element' in action_names, 'Expected wait_for_element action to be executed.'
+
+	# Verify that the #search element is visible by querying the page.
+	page = await context.get_current_page()
+	header_handle = await page.query_selector('#search')
+	assert header_handle is not None, 'Expected to find a #search element on the page.'
+	is_visible = await header_handle.is_visible()
+	assert is_visible, 'Expected the #search element to be visible.'
+
+
+if __name__ == '__main__':
+	asyncio.run(test_wait_for_element())
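
Note on the dict-like config models exercised in the new tests above: tests/test_browser_config_models.py assumes that ProxySettings and BrowserContextWindowSize support subscripting (model['server']), .get() with a default, and model_dump(). The sketch below is only an illustration of that interface on top of pydantic v2; the class names, fields, and base class here are assumptions for illustration, not the actual browser_use implementation.

# Illustrative sketch only (assumed names; not part of this patch or of browser_use itself).
from typing import Any, Optional

from pydantic import BaseModel


class DictLikeModel(BaseModel):
	"""Base model that also supports the dict-style access the tests assert."""

	def __getitem__(self, key: str) -> Any:
		# Enables model['server']-style access.
		return getattr(self, key)

	def get(self, key: str, default: Any = None) -> Any:
		# Enables model.get('nonexistent', 'default')-style access.
		return getattr(self, key, default)


class ProxySettingsSketch(DictLikeModel):
	server: str
	bypass: Optional[str] = None
	username: Optional[str] = None
	password: Optional[str] = None


if __name__ == '__main__':
	proxy = ProxySettingsSketch(server='http://example.proxy:8080', bypass='localhost')
	assert proxy['server'] == 'http://example.proxy:8080'
	assert proxy.get('nonexistent', 'default') == 'default'
	assert isinstance(proxy.model_dump(), dict)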