From c4452e5ccb1dd01b0f83c7207ab498b34fee23e5 Mon Sep 17 00:00:00 2001
From: Alexandre Strube
Date: Tue, 8 Apr 2025 19:21:04 +0200
Subject: [PATCH 1/2] Fix the "AttributeError: 'AsyncLLM' object has no
 attribute 'engine'" from #3704

---
 fastchat/serve/vllm_worker.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 0af680bb5..a9022a6c3 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -24,6 +24,9 @@
 )
 from fastchat.utils import get_context_length, is_partial_stop
 
+# This makes vllm > 0.8.0 work again.
+import os
+os.environ['VLLM_USE_V1'] = os.environ.get('VLLM_USE_V1', '0')
 
 app = FastAPI()
 

From 143789ddff0eaf9de1f795824317d80531399bb1 Mon Sep 17 00:00:00 2001
From: Alexandre Strube
Date: Tue, 8 Apr 2025 20:35:00 +0200
Subject: [PATCH 2/2] black formatting

---
 fastchat/serve/vllm_worker.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index a9022a6c3..828b33420 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -24,9 +24,9 @@
 )
 from fastchat.utils import get_context_length, is_partial_stop
 
-# This makes vllm > 0.8.0 work again.
 import os
-os.environ['VLLM_USE_V1'] = os.environ.get('VLLM_USE_V1', '0')
+
+os.environ["VLLM_USE_V1"] = os.environ.get("VLLM_USE_V1", "0")
 
 app = FastAPI()
 
@@ -74,7 +74,7 @@ async def generate_stream(self, params):
         request_id = params.pop("request_id")
         temperature = float(params.get("temperature", 1.0))
         top_p = float(params.get("top_p", 1.0))
-        top_k = params.get("top_k", -1.0)
+        top_k = params.get("top_k", -1)
         presence_penalty = float(params.get("presence_penalty", 0.0))
         frequency_penalty = float(params.get("frequency_penalty", 0.0))
         max_new_tokens = params.get("max_new_tokens", 256)
@@ -110,7 +110,7 @@ async def generate_stream(self, params):
             n=1,
             temperature=temperature,
             top_p=top_p,
-            use_beam_search=use_beam_search,
+            # use_beam_search=use_beam_search,
             stop=list(stop),
             stop_token_ids=stop_token_ids,
             max_tokens=max_new_tokens,
@@ -159,9 +159,11 @@ async def generate_stream(self, params):
                 "cumulative_logprob": [
                     output.cumulative_logprob for output in request_output.outputs
                 ],
-                "finish_reason": request_output.outputs[0].finish_reason
-                if len(request_output.outputs) == 1
-                else [output.finish_reason for output in request_output.outputs],
+                "finish_reason": (
+                    request_output.outputs[0].finish_reason
+                    if len(request_output.outputs) == 1
+                    else [output.finish_reason for output in request_output.outputs]
+                ),
             }
             # Emit twice here to ensure a 'finish_reason' with empty content in the OpenAI API response.
             # This aligns with the behavior of model_worker.
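
Note on the mechanism (not part of the patch itself): vLLM 0.8.0 made the V1 engine the default, and its AsyncLLM class no longer exposes the `.engine` attribute that fastchat/serve/vllm_worker.py reads, which is what raises the AttributeError reported in #3704. Setting VLLM_USE_V1=0 before the engine is constructed keeps vLLM on the legacy AsyncLLMEngine. A minimal standalone sketch of that pattern follows, assuming a vLLM >= 0.8 install; the model name is a placeholder for illustration and is not taken from the diff:

    # Force the legacy V0 engine unless the caller has already chosen,
    # mirroring the line this patch adds to vllm_worker.py.
    import os

    os.environ["VLLM_USE_V1"] = os.environ.get("VLLM_USE_V1", "0")

    # Import vLLM only after the flag is set; AsyncLLMEngine is the V0
    # engine that still has the attributes the FastChat worker relies on.
    from vllm import AsyncEngineArgs, AsyncLLMEngine

    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m")  # placeholder model
    )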