3 changes: 1 addition & 2 deletions docker/compose/docker-compose.deepseek-14b-gpu.yml
@@ -22,8 +22,7 @@ services:
       --tensor-parallel-size 1
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=deepseek_14b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://deepseek_14b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=false
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.gemma-27b-gpu.yml
@@ -27,8 +27,7 @@ services:
       --kv-cache-dtype fp8
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=gemma_27b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gemma_27b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=false
       - MULTIMODAL_SUPPORT=true
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.gemma-4b-gpu.ci.yml
@@ -26,8 +26,7 @@ services:

       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=gemma_4b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gemma_4b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=false
       - MULTIMODAL_SUPPORT=true
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.gpt-120b-gpu.yml
@@ -26,8 +26,7 @@ services:
       --tensor-parallel-size 1
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=gpt_120b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gpt_120b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.gpt-20b-gpu.ci.yml
@@ -21,8 +21,7 @@ services:
     command: >
       --model openai/gpt-oss-20b --gpu-memory-utilization 0.95 --max-model-len 10000 --max-num-batched-tokens 10000 --max-num-seqs 2 --tensor-parallel-size 1 --uvicorn-log-level warning --async-scheduling
     environment:
-      - SVC_HOST=gpt_20b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gpt_20b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.gpt-20b-gpu.yml
@@ -26,8 +26,7 @@ services:
       --tensor-parallel-size 1
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=gpt_20b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gpt_20b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.llama-1b-cpu.yml
@@ -17,8 +17,7 @@ services:
       --tool-call-parser llama3_json
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=llama_1b_cpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_1b_cpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.llama-1b-gpu.ci.yml
@@ -30,8 +30,7 @@ services:
       --uvicorn-log-level warning
       --dtype half
     environment:
-      - SVC_HOST=llama_1b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_1b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
       - CUDA_LAUNCH_BLOCKING=1
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.llama-1b-gpu.yml
@@ -28,8 +28,7 @@ services:
       --tool-call-parser llama3_json
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=llama_1b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_1b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.llama-3b-gpu.yml
@@ -28,8 +28,7 @@ services:
       --tool-call-parser llama3_json
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=llama_3b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_3b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.llama-70b-gpu.yml
@@ -28,8 +28,7 @@ services:
       --tool-call-parser llama3_json
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=llama_70b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_70b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.llama-8b-gpu.yml
@@ -29,8 +29,7 @@ services:
       --enable-auto-tool-choice
       --chat-template /daemon/nilai-models/templates/llama3.1_tool_json.jinja
     environment:
-      - SVC_HOST=llama_8b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_8b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.lmstudio.yml
@@ -7,8 +7,7 @@ services:
       redis:
         condition: service_healthy
     environment:
-      - SVC_HOST=host.docker.internal
-      - SVC_PORT=1234
+      - SVC_URL=http://host.docker.internal:1234
       - DISCOVERY_URL=redis://redis:6379
       - LMSTUDIO_SUPPORTED_FEATURES=chat_completion
     extra_hosts:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.nilai-prod-1.yml
@@ -27,8 +27,7 @@ services:
       --kv-cache-dtype fp8
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=gemma_27b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gemma_27b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=false
       - MULTIMODAL_SUPPORT=true
6 changes: 2 additions & 4 deletions docker/compose/docker-compose.nilai-prod-2.yml
@@ -35,8 +35,7 @@ services:
       --enable-auto-tool-choice
       --chat-template /daemon/nilai-models/templates/llama3.1_tool_json.jinja
     environment:
-      - SVC_HOST=llama_8b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://llama_8b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
@@ -74,8 +73,7 @@ services:
       --tensor-parallel-size 1
       --uvicorn-log-level warning
     environment:
-      - SVC_HOST=gpt_20b_gpu
-      - SVC_PORT=8000
+      - SVC_URL=http://gpt_20b_gpu:8000
       - DISCOVERY_URL=redis://redis:6379
       - TOOL_SUPPORT=true
     volumes:
3 changes: 1 addition & 2 deletions docker/compose/docker-compose.qwen-2b-gpu.ci.yml
@@ -42,8 +42,7 @@ services:
       ]

     environment:
-      SVC_HOST: qwen2vl_2b_gpu
-      SVC_PORT: "8000"
+      SVC_URL: http://qwen2vl_2b_gpu:8000
       DISCOVERY_URL: redis://redis:6379
       TOOL_SUPPORT: "true"
       MULTIMODAL_SUPPORT: "true"
10 changes: 4 additions & 6 deletions nilai-api/src/nilai_api/app.py
@@ -1,6 +1,3 @@
-# Fast API and serving
-
-
 from prometheus_fastapi_instrumentator import Instrumentator
 from fastapi import Depends, FastAPI
 from nilai_api.auth import get_auth_info
@@ -25,7 +22,8 @@ async def lifespan(app: FastAPI):
     yield {"redis": client, "redis_rate_limit_command": rate_limit_command}


-host = SETTINGS.host
+api_base = SETTINGS.url.rstrip("/")
+openapi_url = f"{api_base}/openapi.json"
 description = f"""
 An AI model serving platform powered by secure, confidential computing.

@@ -39,7 +37,7 @@ async def lifespan(app: FastAPI):
 pip install openapi-generator-cli

 # Generate your Python client
-openapi-generator-cli generate -i https://{host}/openapi.json -g python -o ./python-client
+openapi-generator-cli generate -i {openapi_url} -g python -o ./python-client
 ```

 ### For JavaScript/TypeScript Developers
@@ -48,7 +46,7 @@ async def lifespan(app: FastAPI):
 npm install @openapitools/openapi-generator-cli -g

 # Generate your TypeScript client
-openapi-generator-cli generate -i https://{host}/openapi.json -o ./typescript-client
+openapi-generator-cli generate -i {openapi_url} -o ./typescript-client
 ```

 After generating, you'll have a fully functional client library that makes it easy to interact with our AI services. No more manual API request handling!
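The `rstrip("/")` here matters: if an operator configures `SVC_URL` with a trailing slash, a naive join would put a double slash in the OpenAPI URL. A minimal illustration (values are hypothetical):

```python
# Without normalization this would be "http://localhost:8000//openapi.json".
api_base = "http://localhost:8000/".rstrip("/")
openapi_url = f"{api_base}/openapi.json"
assert openapi_url == "http://localhost:8000/openapi.json"
```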
6 changes: 2 additions & 4 deletions nilai-models/src/nilai_models/daemon.py
@@ -23,7 +23,7 @@ async def get_metadata():
     while True:
         url = None
         try:
-            url = f"http://{SETTINGS.host}:{SETTINGS.port}/v1/models"
+            url = f"{SETTINGS.url}/v1/models"
             async with httpx.AsyncClient() as client:
                 response = await client.get(url)
                 response.raise_for_status()
@@ -94,9 +94,7 @@ async def main():

     # Fetch metadata and create endpoint
     metadata = await get_metadata()
-    model_endpoint = ModelEndpoint(
-        url=f"http://{SETTINGS.host}:{SETTINGS.port}", metadata=metadata
-    )
+    model_endpoint = ModelEndpoint(url=SETTINGS.url.rstrip("/"), metadata=metadata)

     # Create service task
     service_task = asyncio.create_task(run_service(discovery_service, model_endpoint))
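For context, `get_metadata` keeps polling the model server until it answers. A self-contained sketch of that pattern against the new single-URL setting (the retry interval and exception handling are assumptions, not the PR's exact code):

```python
import asyncio

import httpx


async def wait_for_models(base_url: str, interval: float = 5.0) -> dict:
    """Poll {base_url}/v1/models until the model server responds."""
    url = f"{base_url.rstrip('/')}/v1/models"
    while True:
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url)
                response.raise_for_status()
                return response.json()
        except httpx.HTTPError:
            # Server not up yet (connection refused or non-2xx); retry.
            await asyncio.sleep(interval)
```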
6 changes: 2 additions & 4 deletions nilai-models/src/nilai_models/lmstudio_announcer.py
@@ -144,9 +144,7 @@ async def main():
     logging.basicConfig(level=logging.INFO)

     # Load configuration from environment
-    api_base = os.getenv(
-        "LMSTUDIO_API_BASE", f"http://{SETTINGS.host}:{SETTINGS.port}"
-    ).rstrip("/")
+    api_base = os.getenv("LMSTUDIO_API_BASE", SETTINGS.url).rstrip("/")
     models_endpoint = os.getenv("LMSTUDIO_MODELS_ENDPOINT", "/v1/models")
     registration_url = os.getenv("LMSTUDIO_REGISTRATION_URL", api_base).rstrip("/")
     lease_ttl = int(os.getenv("LMSTUDIO_LEASE_TTL", "60"))
@@ -192,7 +190,7 @@ async def main():
     )

     logger.info(
-        "Announcing LMStudio models %s via %s with Redis at %s:%s",
+        "Announcing LMStudio models %s via %s with Redis at %s",
         ", ".join(model_ids),
         registration_url,
         SETTINGS.discovery_url,
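The logging change also fixes a latent format-string bug: after the consolidation, the argument list supplies a single value (`SETTINGS.discovery_url`) for the Redis location, so the old `%s:%s` had one placeholder too many. With lazy %-formatting, that mismatch surfaces as a logging error at render time rather than an exception at the call site:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# One placeholder, one argument: renders as expected.
logger.info("Redis at %s", "redis://redis:6379")

# Two placeholders, one argument: the message is dropped and
# "--- Logging error ---" is printed to stderr instead.
logger.info("Redis at %s:%s", "redis://redis:6379")
```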
8 changes: 4 additions & 4 deletions packages/nilai-common/src/nilai_common/config/host.py
@@ -12,8 +12,9 @@ def to_bool(value: str) -> bool:
 class HostSettings(BaseModel):
     """Infrastructure and service host configuration."""

-    host: str = Field(default="localhost", description="Host of the service")
-    port: int = Field(default=8000, description="Port of the service")
+    url: str = Field(
+        default="http://localhost:8000", description="Base URL of the service"
+    )
     discovery_url: str = Field(
         default="redis://redis:6379",
         description="Redis URL of the discovery service (preferred)",
@@ -23,8 +24,7 @@ class HostSettings(BaseModel):

 # Global host settings instance
 SETTINGS: HostSettings = HostSettings(
-    host=str(os.getenv("SVC_HOST", "localhost")),
-    port=int(os.getenv("SVC_PORT", 8000)),
+    url=str(os.getenv("SVC_URL", "http://localhost:8000")),
     discovery_url=str(os.getenv("DISCOVERY_URL", "redis://redis:6379")),
     gunicorn_workers=int(os.getenv("NILAI_GUNICORN_WORKERS", 10)),
 )
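Any caller that still needs the pieces of the old `host`/`port` pair can recover them from `SETTINGS.url`. A stdlib sketch (the import path follows the package layout above; the fallback defaults are assumptions):

```python
from urllib.parse import urlsplit

from nilai_common.config.host import SETTINGS

parts = urlsplit(SETTINGS.url)
host = parts.hostname or "localhost"
port = parts.port or (443 if parts.scheme == "https" else 80)
print(host, port)  # e.g. "localhost 8000" with the default SVC_URL
```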
2 changes: 0 additions & 2 deletions packages/nilai-common/src/nilai_common/discovery.py
@@ -23,8 +23,6 @@ def __init__(
         Initialize Redis client for model service discovery.

         :param url: Redis URL (e.g., redis:// or rediss://). Preferred default.
-        :param host: Redis server host
-        :param port: Redis server port
         :param lease_ttl: TTL time for endpoint registration (in seconds)
         """
         self.lease_ttl = lease_ttl
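With `host`/`port` gone from the constructor docstring, the discovery client presumably connects from the URL alone. The standard redis-py idiom for that is shown below (a sketch of the pattern, not necessarily this class's actual body):

```python
import redis.asyncio as redis


async def connect(url: str = "redis://redis:6379") -> redis.Redis:
    """Build a Redis client from a single URL (redis:// or rediss://)."""
    client = redis.from_url(url, decode_responses=True)
    await client.ping()  # fail fast if the discovery backend is unreachable
    return client
```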