# AutoBrowser/utils.py at main · exponentialXP/AutoBrowser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from PIL import Image, ImageDraw, ImageFont
import io
import base64

def resize_image(image_bytes, max_width=1024):
    """Downscale an encoded image to at most ``max_width`` pixels wide.

    The image is decoded with Pillow, shrunk with Lanczos resampling when it
    is wider than ``max_width`` (aspect ratio preserved), and always
    re-encoded as JPEG at quality 80 -- even when no resize was needed.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.
        max_width: Maximum allowed width in pixels. Images that are already
            this narrow or narrower keep their dimensions.

    Returns:
        The JPEG-encoded image as ``bytes``.
    """
    img = Image.open(io.BytesIO(image_bytes))

    # JPEG cannot store alpha or palette data (e.g. RGBA/P PNG screenshots);
    # saving such modes raises OSError. Modes JPEG handles natively are left
    # untouched so existing output for them does not change.
    if img.mode not in ("L", "RGB", "CMYK"):
        img = img.convert("RGB")

    w, h = img.size
    if w > max_width:
        # Pin the width to max_width directly: int(w * (max_width / w)) can
        # land one pixel short through float rounding. The height is clamped
        # to at least 1 so very wide, short images stay a valid size.
        new_size = (int(max_width), max(1, int(h * max_width / w)))
        img = img.resize(new_size, Image.LANCZOS)

    output = io.BytesIO()
    img.save(output, format="JPEG", quality=80)
    return output.getvalue()

def add_grid(image_bytes):
    """Overlay a red 10x10 grid with numeric labels on an image.

    Eleven vertical and eleven horizontal lines split the image into ten
    equal bands per axis. The first ten lines on each axis are labelled
    0, 100, ..., 900 along the top and left edges, which corresponds to the
    0-1000 coordinate scale the LLM is asked to answer in.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.

    Returns:
        The annotated image, JPEG-encoded (quality 80), as ``bytes``.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    draw = ImageDraw.Draw(img)
    w, h = img.size

    # Prefer a larger TrueType font; fall back to Pillow's built-in font.
    try:
        # On Windows, arial.ttf is usually present
        font = ImageFont.truetype("arial.ttf", 20)
    except (OSError, ImportError):
        # OSError: font file missing/unreadable. ImportError: Pillow built
        # without FreeType. Anything else is a real bug and should surface.
        font = ImageFont.load_default()

    # Four diagonal one-pixel offsets give each label a black outline.
    shadow_offsets = ((1, 1), (-1, -1), (1, -1), (-1, 1))

    for i in range(11):
        # Clamp to the last pixel: for i == 10 the raw position equals the
        # image width/height, which lies just outside the canvas and would
        # leave the right and bottom borders undrawn.
        x = min((i * w) // 10, w - 1)
        y = min((i * h) // 10, h - 1)

        # Vertical lines
        draw.line([(x, 0), (x, h)], fill="red", width=1)
        # Horizontal lines
        draw.line([(0, y), (w, y)], fill="red", width=1)

        # The closing line (i == 10) gets no label.
        if i < 10:
            text = str(i * 100)
            for dx, dy in shadow_offsets:
                draw.text((x + 2 + dx, 2 + dy), text, fill="black", font=font)
                draw.text((2 + dx, y + 2 + dy), text, fill="black", font=font)
            draw.text((x + 2, 2), text, fill="red", font=font)
            draw.text((2, y + 2), text, fill="red", font=font)

    output = io.BytesIO()
    img.save(output, format="JPEG", quality=80)
    return output.getvalue()

def mark_click(image_bytes, x_pct, y_pct):
    """Paint a lime debugging dot where a click was requested.

    Despite the ``_pct`` suffix, both coordinates are on the 0-1000 scale
    and are mapped to pixels with floor division.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.
        x_pct: Horizontal position on the 0-1000 scale.
        y_pct: Vertical position on the 0-1000 scale.

    Returns:
        The marked image, JPEG-encoded (quality 80), as ``bytes``.
    """
    canvas = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    width, height = canvas.size

    # Translate the 0-1000 coordinates into pixel positions.
    cx = (x_pct * width) // 1000
    cy = (y_pct * height) // 1000

    # Bounding box of a circle with a 10-pixel radius centred on the click.
    dot_radius = 10
    top_left = (cx - dot_radius, cy - dot_radius)
    bottom_right = (cx + dot_radius, cy + dot_radius)
    ImageDraw.Draw(canvas).ellipse(
        [top_left, bottom_right], fill="lime", outline="black"
    )

    buffer = io.BytesIO()
    canvas.save(buffer, format="JPEG", quality=80)
    return buffer.getvalue()

# Instruction text for the screenshot-driven web agent. It tells the model
# about the red grid overlay, lists the available actions, and fixes the JSON
# reply shape ("thought" plus a list of {"action", "params"} objects).
# Coordinates in the prompt use the same 0-1000 scale that add_grid labels
# (0..900 in steps of 100) and that mark_click maps back to pixels.
# NOTE(review): the code that sends this prompt and parses the reply is not
# visible here -- presumably "params" is positional in the order shown under
# ACTIONS; confirm against the parser before editing the wording.
SYSTEM_PROMPT = """
You are a web agent that navigates the web using screenshots.
You will receive a screenshot with a RED GRID overlay (numbered 0-1000).
Your goal is to complete the user's task.

ACTIONS:
1. click(x, y): Click at coordinates (0-1000).
2. type(text, x, y): Click at (x, y) and type text.
3. paste(text, x, y): Click at (x, y) and PASTE text (faster for long strings).
4. scroll(direction): 'up' or 'down'.
5. wait(): Wait 2s.
6. finish(): Task is complete.
7. ask_user(reason): Pause for user input.

PRECISION RULES:
- Use the RED GRID to estimate exact coordinates.
- Before clicking, carefully look at the numbers on the grid.
- Return multiple actions in a list if you are confident.

JSON FORMAT:
{
  "thought": "Describe what you see and the coordinates you calculated",
  "actions": [{"action": "click", "params": [450, 210]}]
}
"""