309 changes: 282 additions & 27 deletions clams/app/__init__.py

Large diffs are not rendered by default.

28 changes: 27 additions & 1 deletion clams/appmetadata/__init__.py
@@ -352,9 +352,20 @@ class AppMetadata(pydantic.BaseModel):
"a package name and its version in the string value at the minimum (e.g., ``clams-python==1.2.3``)."
)
more: Optional[Dict[str, str]] = pydantic.Field(
None,
None,
description="(optional) A string-to-string map that can be used to store any additional metadata of the app."
)
gpu_mem_min: int = pydantic.Field(
0,
description="(optional) Minimum GPU memory required to run the app, in megabytes (MB). "
"Set to 0 (default) if the app does not use GPU."
)
gpu_mem_typ: int = pydantic.Field(
0,
description="(optional) Typical GPU memory usage for default parameters, in megabytes (MB). "
"Must be equal or larger than gpu_mem_min. "
"Set to 0 (default) if the app does not use GPU."
)

    model_config = {
        'title': 'CLAMS AppMetadata',

@@ -372,6 +383,21 @@ def assign_versions(cls, data):
        data.mmif_version = get_mmif_specver()
        return data

    @pydantic.model_validator(mode='after')
    @classmethod
    def validate_gpu_memory(cls, data):
        import warnings
        if data.gpu_mem_typ > 0 and data.gpu_mem_min > 0:
            if data.gpu_mem_typ < data.gpu_mem_min:
                warnings.warn(
                    f"gpu_mem_typ ({data.gpu_mem_typ} MB) is less than "
                    f"gpu_mem_min ({data.gpu_mem_min} MB). "
                    f"Setting gpu_mem_typ to {data.gpu_mem_min} MB.",
                    UserWarning
                )
                data.gpu_mem_typ = data.gpu_mem_min
        return data

    @pydantic.field_validator('identifier', mode='before')
    @classmethod
    def append_version(cls, val):
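To see the new validator in action, here is a minimal sketch (all field values below are made up; the remaining required fields follow the usual `AppMetadata` schema, and `app_version`/`mmif_version` are auto-assigned by the existing `assign_versions` validator):

```python
import warnings

from clams.appmetadata import AppMetadata

# Hypothetical app metadata; name, identifier, and url are placeholders.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    md = AppMetadata(
        name="Example GPU App",
        description="Illustrates the gpu_mem_* fields.",
        app_license="Apache 2.0",
        identifier="example-gpu-app",
        url="https://example.com/example-gpu-app",
        gpu_mem_min=4096,  # needs at least 4 GB of VRAM
        gpu_mem_typ=2048,  # deliberately smaller than gpu_mem_min
    )

# validate_gpu_memory warned and bumped gpu_mem_typ up to gpu_mem_min
assert md.gpu_mem_typ == 4096
assert any("gpu_mem_typ" in str(w.message) for w in caught)
```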
3 changes: 3 additions & 0 deletions clams/develop/templates/app/metadata.py.template
@@ -39,6 +39,9 @@ def appmetadata() -> AppMetadata:
        # this trick can also be useful (replace ANALYZER_NAME with the pypi dist name)
        analyzer_version=[l.strip().rsplit('==')[-1] for l in open(pathlib.Path(__file__).parent / 'requirements.txt').readlines() if re.match(r'^ANALYZER_NAME==', l)][0],
        analyzer_license="",  # short name for a software license
        # GPU memory requirements (in MB). Set to 0 if the app does not use GPU.
        # gpu_mem_min=0,  # minimum GPU memory required to run the app
        # gpu_mem_typ=0,  # typical GPU memory usage with default parameters; must be equal to or larger than gpu_mem_min
    )
    # and then add I/O specifications: an app must have at least one input and one output
    metadata.add_input(DocumentTypes.Document)
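Choosing values for these two fields is easiest empirically: run the app on representative input and watch the device's memory. A rough measurement sketch (assumes an NVIDIA GPU with `nvidia-smi` on PATH; it uses the same query style as the worker calculation in `clams/restify` below):

```python
import shutil
import subprocess

# Sample current usage on GPU 0 while the app is annotating representative input;
# the peak value observed over a run is a reasonable starting point for gpu_mem_typ.
if shutil.which('nvidia-smi'):
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits', '-i', '0'],
        capture_output=True, text=True, timeout=5,
    )
    print(f"GPU 0 memory in use: {result.stdout.strip()} MB")
```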
64 changes: 61 additions & 3 deletions clams/restify/__init__.py
@@ -3,7 +3,7 @@
from flask_restful import Resource, Api
from mmif import Mmif

from clams.app import ClamsApp
from clams.app import ClamsApp, InsufficientVRAMError


class Restifier(object):
@@ -42,14 +42,55 @@ def run(self, **options):
    def serve_production(self, **options):
        """
        Runs the CLAMS app as a flask webapp, using a production-ready web server (gunicorn, https://docs.gunicorn.org/en/stable/#).

        :param options: any additional options to pass to the web server.
        """
        import gunicorn.app.base
        import multiprocessing
        import os

        def number_of_workers():
            # Allow override via environment variable
            if 'CLAMS_WORKERS' in os.environ:
                return int(os.environ['CLAMS_WORKERS'])

            cpu_workers = (multiprocessing.cpu_count() * 2) + 1  # +1 to make sure at least two workers are running

            # Get GPU memory requirement from app metadata
            # Use gpu_mem_typ (typical usage) for worker calculation
            try:
                metadata = self.cla.metadata
                gpu_mem_mb = metadata.gpu_mem_typ  # typical usage determines how many workers fit
            except Exception:
                gpu_mem_mb = 0

            if gpu_mem_mb <= 0:
                return cpu_workers

            # Calculate workers based on total VRAM of the first CUDA device (no other GPUs are considered for now)
            # Use nvidia-smi instead of torch to avoid initializing CUDA in the parent process before fork
            try:
                import subprocess
                import shutil
                if shutil.which('nvidia-smi'):
                    result = subprocess.run(
                        ['nvidia-smi', '--query-gpu=memory.total', '--format=csv,noheader,nounits', '-i', '0'],
                        capture_output=True, text=True, timeout=5
                    )
                    if result.returncode == 0 and result.stdout.strip():
                        total_vram_mb = float(result.stdout.strip())
                        vram_workers = max(1, int(total_vram_mb // gpu_mem_mb))
                        workers = min(vram_workers, cpu_workers)
                        self.cla.logger.info(
                            f"GPU detected: {total_vram_mb:.0f} MB VRAM, "
                            f"app requires {gpu_mem_mb} MB, "
                            f"using {workers} workers (max {vram_workers} by VRAM, {cpu_workers} by CPU)"
                        )
                        return workers
            except Exception:
                pass

            return cpu_workers
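To make the arithmetic above concrete, a toy run with hypothetical numbers:

```python
import multiprocessing

total_vram_mb = 24576.0  # hypothetical: a 24 GB card as reported by nvidia-smi
gpu_mem_mb = 6000        # hypothetical: the app's gpu_mem_typ

cpu_workers = (multiprocessing.cpu_count() * 2) + 1      # e.g., 9 on a 4-core machine
vram_workers = max(1, int(total_vram_mb // gpu_mem_mb))  # 24576 // 6000 -> 4
workers = min(vram_workers, cpu_workers)                 # VRAM is the binding limit here: 4
print(workers)
```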

        class ProductionApplication(gunicorn.app.base.BaseApplication):

@@ -58,9 +99,16 @@ def __init__(self, app, host, port, **options):
                    'bind': f'{host}:{port}',
                    'workers': number_of_workers(),
                    'threads': 2,
                    # disable timeout for long-running GPU workloads (default 30s is too short)
                    'timeout': 0,
                    # because the default is 'None'
                    'accesslog': '-',
                    # errorlog, however, is redirected to stderr by default since 19.2, so no need to set it
                    # log level is 'warning' by default
                    'loglevel': os.environ.get('CLAMS_LOGLEVEL', 'warning').lower(),
                    # default to 1 to free GPU memory after each request;
                    # developers can override via serve_production(max_requests=N) for single-model apps
                    'max_requests': 1,
                }
                self.options.update(options)
                self.application = app
@@ -75,6 +123,13 @@ def load_config(self):
            def load(self):
                return self.application

        # Log max_requests setting
        max_req = options.get('max_requests', 1)  # default is 1, meaning workers are killed after each request
        if max_req == 0:
            self.cla.logger.info("Worker recycling: disabled (workers persist)")
        else:
            self.cla.logger.info(f"Worker recycling: after {max_req} request(s)")

        ProductionApplication(self.flask_app, self.host, self.port, **options).run()
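Given the `max_requests` default of 1 above, a single-model app that wants to keep its model loaded across requests can disable recycling when starting the server. A sketch (`ExampleApp` is a hypothetical name):

```python
from clams.restify import Restifier

# ExampleApp stands for your ClamsApp subclass
webapp = Restifier(ExampleApp())
# 0 disables worker recycling, so each worker (and its loaded model) persists
webapp.serve_production(max_requests=0)
```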

    def serve_development(self, **options):

@@ -144,6 +199,9 @@ def post(self) -> Response:
            return Response(response="Invalid input data. See below for validation error.\n\n" + str(e), status=500, mimetype='text/plain')
        try:
            return self.json_to_response(self.cla.annotate(raw_data, **raw_params))
        except InsufficientVRAMError as e:
            self.cla.logger.warning(f"Request rejected due to insufficient VRAM: {e}")
            return self.json_to_response(self.cla.record_error(raw_data, **raw_params).serialize(pretty=True), status=503)
        except Exception:
            self.cla.logger.exception("Error in annotation")
            return self.json_to_response(self.cla.record_error(raw_data, **raw_params).serialize(pretty=True), status=500)
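Because an `InsufficientVRAMError` now surfaces as a 503 rather than a generic 500, clients can treat the response as retryable. A minimal client-side sketch (the URL, payload variable, and wait time are made up; uses the third-party `requests` library):

```python
import time

import requests

def annotate_with_retry(url, mmif_str, attempts=5, wait=30):
    # POST a MMIF payload, retrying only on 503 (insufficient VRAM)
    for _ in range(attempts):
        resp = requests.post(url, data=mmif_str)
        if resp.status_code != 503:
            break
        time.sleep(wait)  # give a worker time to free GPU memory
    return resp

# resp = annotate_with_retry('http://localhost:5000', mmif_json)
```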
11 changes: 11 additions & 0 deletions documentation/clamsapp.md
@@ -209,6 +209,17 @@ $ python app.py
* By default, the app runs in *debugging* mode; you can switch to *production* mode by passing the `--production` option to support larger traffic volumes.
* As you might have noticed, the default `CMD` in the prebuilt containers is `python app.py --production --port 5000`.

##### Environment variables for production mode

When running in production mode, the following environment variables can be used to configure the app server:

| Variable | Description | Default |
|----------|-------------|---------|
| `CLAMS_WORKERS` | Number of gunicorn worker processes | Auto-calculated based on CPU cores and GPU memory |
| `CLAMS_LOGLEVEL` | Logging verbosity level (`debug`, `info`, `warning`, `error`) | `warning` |

By default, the number of workers is calculated as `(CPU cores × 2) + 1`. For GPU-based apps, see [GPU Memory Management](gpu-apps.md) for details on automatic worker scaling and VRAM management.
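For example (hypothetical values), running `CLAMS_WORKERS=2 CLAMS_LOGLEVEL=debug python app.py --production` starts the server with exactly two workers and verbose logging, bypassing the automatic calculation.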

#### `metadata.py`: Getting app metadata

Running `metadata.py` will print out the app metadata in JSON format.
Expand Down