-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
94 lines (78 loc) · 3.17 KB
/
utils.py
File metadata and controls
94 lines (78 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from PIL import Image, ImageDraw, ImageFont
import io
import base64
def resize_image(image_bytes, max_width=1024):
    """Downscale an encoded image to at most ``max_width`` pixels wide.

    The aspect ratio is preserved, and images that are already narrow enough
    are not resized. The result is always re-encoded as a quality-80 JPEG,
    even when no resizing takes place.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.
        max_width: Maximum width of the returned image, in pixels.

    Returns:
        The JPEG-encoded image as ``bytes``.
    """
    img = Image.open(io.BytesIO(image_bytes))
    # JPEG cannot store alpha or palette modes (RGBA/LA/P are common for PNG
    # screenshots) and saving them raises OSError, so normalise to RGB first.
    # Converting before resizing also lets LANCZOS filter real colour values.
    if img.mode not in ("RGB", "L"):
        img = img.convert("RGB")
    w, h = img.size
    if w > max_width:
        ratio = max_width / w
        # Clamp each side to 1 px: a very wide, short image would otherwise
        # round to a zero height, which Pillow rejects.
        new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
        img = img.resize(new_size, Image.LANCZOS)
    output = io.BytesIO()
    img.save(output, format="JPEG", quality=80)
    return output.getvalue()
def add_grid(image_bytes):
    """Overlay a red 10x10 grid with numeric labels on an image.

    Grid lines are drawn at every tenth of the width and height. Each of the
    first ten lines on both axes is labelled ``0``, ``100``, ... ``900`` along
    the top and left edges, in red text with a black outline, so the model can
    read positions on a 0-1000 scale.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.

    Returns:
        The annotated image as quality-80 JPEG ``bytes``.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    draw = ImageDraw.Draw(img)
    w, h = img.size

    # Prefer a larger TrueType font; fall back to Pillow's built-in bitmap
    # font when arial.ttf is missing (usually present on Windows) or when
    # FreeType support is unavailable.
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except (OSError, ImportError):
        font = ImageFont.load_default()

    # Offsets for the black "shadow" copies that outline each label.
    shadow_offsets = ((1, 1), (-1, -1), (1, -1), (-1, 1))

    for i in range(11):
        # Clamp so the closing lines (i == 10) land on the last pixel
        # column/row instead of just outside the canvas.
        x = min((i * w) // 10, w - 1)
        y = min((i * h) // 10, h - 1)

        # Vertical line, then horizontal line.
        draw.line([(x, 0), (x, h)], fill="red", width=1)
        draw.line([(0, y), (w, y)], fill="red", width=1)

        # Label every line except the closing one.
        if i < 10:
            text = str(i * 100)
            for dx, dy in shadow_offsets:
                draw.text((x + 2 + dx, 2 + dy), text, fill="black", font=font)
                draw.text((2 + dx, y + 2 + dy), text, fill="black", font=font)
            draw.text((x + 2, 2), text, fill="red", font=font)
            draw.text((2, y + 2), text, fill="red", font=font)

    output = io.BytesIO()
    img.save(output, format="JPEG", quality=80)
    return output.getvalue()
def mark_click(image_bytes, x_pct, y_pct):
    """Paint a lime debugging marker at a 0-1000 grid coordinate.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.
        x_pct: Horizontal position on the 0-1000 scale.
        y_pct: Vertical position on the 0-1000 scale.

    Returns:
        The marked image as quality-80 JPEG ``bytes``.
    """
    canvas = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    width, height = canvas.size

    # Map the 0-1000 grid coordinate onto pixel positions.
    center_x = (x_pct * width) // 1000
    center_y = (y_pct * height) // 1000

    # Bounding box of a dot with a 10 px radius around the click point.
    radius = 10
    top_left = (center_x - radius, center_y - radius)
    bottom_right = (center_x + radius, center_y + radius)

    painter = ImageDraw.Draw(canvas)
    painter.ellipse([top_left, bottom_right], fill="lime", outline="black")

    buffer = io.BytesIO()
    canvas.save(buffer, format="JPEG", quality=80)
    return buffer.getvalue()
# System prompt text for the screenshot-driven web agent.
# It lists the actions the model may return (click, type, paste, scroll, wait,
# finish, ask_user) and the JSON reply shape ("thought" plus an "actions"
# list). Coordinates are on the 0-1000 scale: add_grid labels its lines in
# steps of 100 and mark_click divides by 1000 to get pixel positions.
# NOTE(review): how this prompt is sent to the model and how the JSON reply is
# parsed is not visible in this file - confirm against the caller.
SYSTEM_PROMPT = """
You are a web agent that navigates the web using screenshots.
You will receive a screenshot with a RED GRID overlay (numbered 0-1000).
Your goal is to complete the user's task.
ACTIONS:
1. click(x, y): Click at coordinates (0-1000).
2. type(text, x, y): Click at (x, y) and type text.
3. paste(text, x, y): Click at (x, y) and PASTE text (faster for long strings).
4. scroll(direction): 'up' or 'down'.
5. wait(): Wait 2s.
6. finish(): Task is complete.
7. ask_user(reason): Pause for user input.
PRECISION RULES:
- Use the RED GRID to estimate exact coordinates.
- Before clicking, carefully look at the numbers on the grid.
- Return multiple actions in a list if you are confident.
JSON FORMAT:
{
"thought": "Describe what you see and the coordinates you calculated",
"actions": [{"action": "click", "params": [450, 210]}]
}
"""