From 7d1edc37f5cdc1233e16972e547316b375925561 Mon Sep 17 00:00:00 2001
From: nicole-lihui
Date: Tue, 21 Oct 2025 18:04:06 +0800
Subject: [PATCH] cap runtime context at 32k for glm-4-9b-chat to fit 50GB GPU

Reason: the default 128k context (maxTokens) causes OOM on 50GB GPUs during
long-context inference, so cap the effective context at 32k via custom
runtime args.
---
 models/zhipuai/glm-4-9b-chat/metadata.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/models/zhipuai/glm-4-9b-chat/metadata.yaml b/models/zhipuai/glm-4-9b-chat/metadata.yaml
index 22cf329..44edd55 100644
--- a/models/zhipuai/glm-4-9b-chat/metadata.yaml
+++ b/models/zhipuai/glm-4-9b-chat/metadata.yaml
@@ -6,7 +6,9 @@ spec:
   config:
     maxTokens: 128000
   deployments:
-  - customRuntimeArgs: []
+  - customRuntimeArgs:
+    - --max-num-batched-tokens=32768 # cap batched tokens at 32k (down from the 128k default) to fit a 50GB GPU and avoid OOM
+    - --max-model-len=32768 # align max-model-len with max-num-batched-tokens for stable inference
     resourceRequirements:
       cpu: 4
       gpuCount: 1
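
Note for reviewers: after applying this patch, the deployments entry would read
as the sketch below. This is a minimal reconstruction from the hunk context,
assuming the runtime consuming customRuntimeArgs is vLLM-style, where
--max-model-len bounds per-request context length and --max-num-batched-tokens
bounds the scheduler's per-step token budget; spec.config.maxTokens itself
stays at 128000.

    # Resulting metadata.yaml section (reconstructed from the hunk context;
    # fields outside the hunk are not shown).
    spec:
      config:
        maxTokens: 128000   # unchanged: the declared model context stays 128k
      deployments:
      - customRuntimeArgs:
        - --max-num-batched-tokens=32768  # per-step token budget, assuming a vLLM-style backend
        - --max-model-len=32768           # per-request context cap
        resourceRequirements:
          cpu: 4
          gpuCount: 1

Keeping the two values equal is deliberate: if max-num-batched-tokens were
smaller than max-model-len, a single full-length prompt could never be
scheduled in one step (vLLM's default, non-chunked-prefill scheduler rejects
that configuration).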