-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeploy_tiny_vllm_endpoint.py
More file actions
144 lines (125 loc) · 4.51 KB
/
deploy_tiny_vllm_endpoint.py
File metadata and controls
144 lines (125 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
"""
Deploy a tiny, known-good RunPod vLLM endpoint (facebook/opt-125m) using the stable
worker image, then test it via the OpenAI-compatible API.
Requirements: RUNPOD_API_KEY in environment (loads .env if present).
"""
import os
import time
import json
import requests

# Best-effort .env loading: a missing python-dotenv package (or a bad .env)
# must not stop the script, since the key may already be in the environment.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

# Fail fast before touching the RunPod SDK — nothing below works without a key.
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY")
if not RUNPOD_API_KEY:
    raise SystemExit("RUNPOD_API_KEY not set")

# Imported after the key check so a missing key gives the clear message above.
import runpod
runpod.api_key = RUNPOD_API_KEY

# Tiny HF model for a fast, cheap smoke test of the serving stack.
TINY_MODEL = "facebook/opt-125m"
# Pinned stable vLLM worker image (CUDA 12.1).
IMAGE = "runpod/worker-v1-vllm:v2.7.0stable-cuda12.1.0"
GPU = "NVIDIA RTX A6000"  # matches earlier working GPU pool in this account
def create_template() -> str:
    """Create a RunPod template for the tiny vLLM worker.

    Returns:
        The template id assigned by RunPod.

    Raises:
        KeyError: if the RunPod API response has no "id" field.
    """
    print("📦 Creating template for tiny vLLM endpoint...")
    # Worker configuration travels as environment variables; values are
    # conservative defaults sized for a single small model on one GPU.
    env = [
        {"key": "MODEL_NAME", "value": TINY_MODEL},
        {"key": "DTYPE", "value": "auto"},
        {"key": "ENABLE_LORA", "value": "0"},
        {"key": "GPU_MEMORY_UTILIZATION", "value": "0.90"},
        {"key": "BLOCK_SIZE", "value": "16"},
        {"key": "SWAP_SPACE", "value": "4"},
        {"key": "MAX_NUM_SEQS", "value": "64"},
        {"key": "MAX_NUM_BATCHED_TOKENS", "value": "1024"},
        {"key": "RAW_OPENAI_OUTPUT", "value": "1"},
        {"key": "OPENAI_RESPONSE_ROLE", "value": "assistant"},
        {"key": "SKIP_TOKENIZER_INIT", "value": "1"},
        {"key": "TRUST_REMOTE_CODE", "value": "0"},
        {"key": "HF_HOME", "value": "/tmp"}
    ]
    tpl = runpod.create_template(
        name="vllm-tiny-opt125m",
        imageName=IMAGE,
        env=env,
    )
    # Index once and consistently: a malformed response fails loudly here
    # instead of printing "None" and raising a confusing KeyError later
    # (original mixed tpl.get("id") with tpl["id"]).
    template_id = tpl["id"]
    print("✅ Template created:", template_id)
    return template_id
def create_endpoint(template_id: str) -> str:
    """Create a serverless endpoint from *template_id* and return its id.

    One always-on worker (min=max=standby=1) so the smoke test does not wait
    on a cold scale-from-zero.

    Raises:
        KeyError: if the RunPod API response has no "id" field.
    """
    print("🚀 Creating serverless endpoint...")
    ep = runpod.create_endpoint(
        name="vllm-tiny-opt125m",
        template_id=template_id,
        gpu_ids=GPU,
        workers_min=1,
        workers_max=1,
        idle_timeout=10,
        scaler_type="QUEUE_DELAY",
        scaler_value=4,
        gpu_count=1,
        workers_standby=1,
    )
    # Index once and consistently so a malformed response fails loudly here
    # (original mixed ep.get("id") with ep["id"]).
    endpoint_id = ep["id"]
    print("✅ Endpoint created:", endpoint_id)
    return endpoint_id
def wait_ready(endpoint_id: str, timeout_s: int = 300):
    """Poll the endpoint's /health route until a worker is ready or running.

    Returns True as soon as the health report shows at least one ready or
    running worker, False once *timeout_s* seconds elapse without that.
    """
    print("⏳ Waiting for worker to become ready...")
    health_url = f"https://api.runpod.ai/v2/{endpoint_id}/health"
    auth_headers = {"Authorization": f"Bearer {RUNPOD_API_KEY}"}
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            resp = requests.get(health_url, headers=auth_headers, timeout=10)
            if resp.status_code == 200:
                workers = resp.json().get("workers", {})
                ready = workers.get("ready", 0)
                running = workers.get("running", 0)
                initializing = workers.get("initializing", 0)
                print(f" Workers -> ready:{ready} running:{running} initializing:{initializing}")
                if ready > 0 or running > 0:
                    print("✅ Worker is ready")
                    return True
        except Exception:
            # Transient network/JSON errors during spin-up are expected;
            # keep polling until the deadline.
            pass
        time.sleep(5)
    print("❌ Timed out waiting for readiness")
    return False
def test_chat(endpoint_id: str):
    """Send one chat completion through the OpenAI-compatible route.

    Prints the HTTP status and the first 1000 bytes of the body; returns
    True iff the endpoint answered with HTTP 200.
    """
    print("🧪 Testing OpenAI-compatible chat endpoint...")
    url = f"https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {RUNPOD_API_KEY}",
        "Content-Type": "application/json",
    }
    body = {
        "model": TINY_MODEL,
        "messages": [{"role": "user", "content": "Say hi in 5 words."}],
        "temperature": 0.7,
        "max_tokens": 16,
    }
    response = requests.post(url, headers=request_headers, json=body, timeout=60)
    print("Status:", response.status_code)
    print("Body:", response.text[:1000])
    return response.status_code == 200
def _main():
    """Create template + endpoint, prime the autoscaler, then smoke-test chat."""
    tpl_id = create_template()
    ep_id = create_endpoint(tpl_id)
    print(f"\nIDs -> template: {tpl_id} | endpoint: {ep_id}")

    # Hitting the models list nudges the autoscaler to start a worker early;
    # failures here are harmless, so they are deliberately swallowed.
    print("\n⚡ Priming autoscaler with models list...")
    try:
        requests.get(
            f"https://api.runpod.ai/v2/{ep_id}/openai/v1/models",
            headers={"Authorization": f"Bearer {RUNPOD_API_KEY}"},
            timeout=10,
        )
    except Exception:
        pass

    if not wait_ready(ep_id, 360):
        print("\n⚠️ Worker did not become ready in time.")
    elif test_chat(ep_id):
        print("\n✅ Tiny vLLM endpoint is working.")
    else:
        print("\n⚠️ Chat test failed; endpoint may still be initializing.")
    print("\nDone.")


if __name__ == "__main__":
    _main()