309 changes: 282 additions & 27 deletions clams/app/__init__.py

Large diffs are not rendered by default.

28 changes: 27 additions & 1 deletion clams/appmetadata/__init__.py
@@ -352,9 +352,20 @@ class AppMetadata(pydantic.BaseModel):
"a package name and its version in the string value at the minimum (e.g., ``clams-python==1.2.3``)."
)
more: Optional[Dict[str, str]] = pydantic.Field(
None,
None,
description="(optional) A string-to-string map that can be used to store any additional metadata of the app."
)
gpu_mem_min: int = pydantic.Field(
0,
description="(optional) Minimum GPU memory required to run the app, in megabytes (MB). "
"Set to 0 (default) if the app does not use GPU."
)
gpu_mem_typ: int = pydantic.Field(
0,
description="(optional) Typical GPU memory usage for default parameters, in megabytes (MB). "
"Must be equal or larger than gpu_mem_min. "
"Set to 0 (default) if the app does not use GPU."
)

    model_config = {
        'title': 'CLAMS AppMetadata',

@@ -372,6 +383,21 @@ def assign_versions(cls, data):
        data.mmif_version = get_mmif_specver()
        return data

    @pydantic.model_validator(mode='after')
    @classmethod
    def validate_gpu_memory(cls, data):
        import warnings
        if data.gpu_mem_typ > 0 and data.gpu_mem_min > 0:
            if data.gpu_mem_typ < data.gpu_mem_min:
                warnings.warn(
                    f"gpu_mem_typ ({data.gpu_mem_typ} MB) is less than "
                    f"gpu_mem_min ({data.gpu_mem_min} MB). "
                    f"Setting gpu_mem_typ to {data.gpu_mem_min} MB.",
                    UserWarning
                )
                data.gpu_mem_typ = data.gpu_mem_min
        return data

    @pydantic.field_validator('identifier', mode='before')
    @classmethod
    def append_version(cls, val):
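To see the new validator in action, here is a minimal sketch (all field values below are made up; the remaining required fields follow the usual `AppMetadata` schema, and `app_version`/`mmif_version` are auto-assigned by the existing `assign_versions` validator):

```python
import warnings

from clams.appmetadata import AppMetadata

# Hypothetical app metadata; name, identifier, and url are placeholders.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    md = AppMetadata(
        name="Example GPU App",
        description="Illustrates the gpu_mem_* fields.",
        app_license="Apache 2.0",
        identifier="example-gpu-app",
        url="https://example.com/example-gpu-app",
        gpu_mem_min=4096,  # needs at least 4 GB of VRAM
        gpu_mem_typ=2048,  # deliberately smaller than gpu_mem_min
    )

# validate_gpu_memory warned and bumped gpu_mem_typ up to gpu_mem_min
assert md.gpu_mem_typ == 4096
assert any("gpu_mem_typ" in str(w.message) for w in caught)
```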
3 changes: 3 additions & 0 deletions clams/develop/templates/app/metadata.py.template
@@ -39,6 +39,9 @@ def appmetadata() -> AppMetadata:
        # this trick can also be useful (replace ANALYZER_NAME with the pypi dist name)
        analyzer_version=[l.strip().rsplit('==')[-1] for l in open(pathlib.Path(__file__).parent / 'requirements.txt').readlines() if re.match(r'^ANALYZER_NAME==', l)][0],
        analyzer_license="",  # short name for a software license
        # GPU memory requirements (in MB). Set to 0 if the app does not use GPU.
        # gpu_mem_min=0,  # minimum GPU memory required to run the app
        # gpu_mem_typ=0,  # typical GPU memory usage with default parameters; must be equal to or larger than gpu_mem_min
    )
    # and then add I/O specifications: an app must have at least one input and one output
    metadata.add_input(DocumentTypes.Document)
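Choosing values for these two fields is easiest empirically: run the app on representative input and watch the device's memory. A rough measurement sketch (assumes an NVIDIA GPU with `nvidia-smi` on PATH; it uses the same query style as the worker calculation in `clams/restify` below):

```python
import shutil
import subprocess

# Sample current usage on GPU 0 while the app is annotating representative input;
# the peak value observed over a run is a reasonable starting point for gpu_mem_typ.
if shutil.which('nvidia-smi'):
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits', '-i', '0'],
        capture_output=True, text=True, timeout=5,
    )
    print(f"GPU 0 memory in use: {result.stdout.strip()} MB")
```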
64 changes: 61 additions & 3 deletions clams/restify/__init__.py
@@ -3,7 +3,7 @@
from flask_restful import Resource, Api
from mmif import Mmif

from clams.app import ClamsApp
from clams.app import ClamsApp, InsufficientVRAMError


class Restifier(object):
@@ -42,14 +42,55 @@ def run(self, **options):
    def serve_production(self, **options):
        """
        Runs the CLAMS app as a flask webapp, using a production-ready web server (gunicorn, https://docs.gunicorn.org/en/stable/#).

        :param options: any additional options to pass to the web server.
        """
        import gunicorn.app.base
        import multiprocessing
        import os

        def number_of_workers():
            # Allow override via environment variable
            if 'CLAMS_WORKERS' in os.environ:
                return int(os.environ['CLAMS_WORKERS'])

            cpu_workers = (multiprocessing.cpu_count() * 2) + 1  # +1 to make sure at least two workers are running

            # Get GPU memory requirement from app metadata
            # Use gpu_mem_typ (typical usage) for worker calculation
            try:
                metadata = self.cla.metadata
                gpu_mem_mb = metadata.gpu_mem_typ  # typical usage determines how many workers fit
            except Exception:
                gpu_mem_mb = 0

            if gpu_mem_mb <= 0:
                return cpu_workers

            # Calculate workers based on total VRAM of the first CUDA device (no other GPUs are considered for now)
            # Use nvidia-smi instead of torch to avoid initializing CUDA in the parent process before fork
            try:
                import subprocess
                import shutil
                if shutil.which('nvidia-smi'):
                    result = subprocess.run(
                        ['nvidia-smi', '--query-gpu=memory.total', '--format=csv,noheader,nounits', '-i', '0'],
                        capture_output=True, text=True, timeout=5
                    )
                    if result.returncode == 0 and result.stdout.strip():
                        total_vram_mb = float(result.stdout.strip())
                        vram_workers = max(1, int(total_vram_mb // gpu_mem_mb))
                        workers = min(vram_workers, cpu_workers)
                        self.cla.logger.info(
                            f"GPU detected: {total_vram_mb:.0f} MB VRAM, "
                            f"app requires {gpu_mem_mb} MB, "
                            f"using {workers} workers (max {vram_workers} by VRAM, {cpu_workers} by CPU)"
                        )
                        return workers
            except Exception:
                pass

            return cpu_workers
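To make the arithmetic above concrete, a toy run with hypothetical numbers:

```python
import multiprocessing

total_vram_mb = 24576.0  # hypothetical: a 24 GB card as reported by nvidia-smi
gpu_mem_mb = 6000        # hypothetical: the app's gpu_mem_typ

cpu_workers = (multiprocessing.cpu_count() * 2) + 1      # e.g., 9 on a 4-core machine
vram_workers = max(1, int(total_vram_mb // gpu_mem_mb))  # 24576 // 6000 -> 4
workers = min(vram_workers, cpu_workers)                 # VRAM is the binding limit here: 4
print(workers)
```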

        class ProductionApplication(gunicorn.app.base.BaseApplication):

@@ -58,9 +99,16 @@ def __init__(self, app, host, port, **options):
                    'bind': f'{host}:{port}',
                    'workers': number_of_workers(),
                    'threads': 2,
                    # disable timeout for long-running GPU workloads (default 30s is too short)
                    'timeout': 0,
                    # because the default is 'None'
                    'accesslog': '-',
                    # errorlog, however, is redirected to stderr by default since 19.2, so no need to set it
                    # log level is 'warning' by default
                    'loglevel': os.environ.get('CLAMS_LOGLEVEL', 'warning').lower(),
                    # default to 1 to free GPU memory after each request;
                    # developers can override via serve_production(max_requests=N) for single-model apps
                    'max_requests': 1,
                }
                self.options.update(options)
                self.application = app
@@ -75,6 +123,13 @@ def load_config(self):
            def load(self):
                return self.application

        # Log max_requests setting
        max_req = options.get('max_requests', 1)  # default is 1, meaning workers are killed after each request
        if max_req == 0:
            self.cla.logger.info("Worker recycling: disabled (workers persist)")
        else:
            self.cla.logger.info(f"Worker recycling: after {max_req} request(s)")

        ProductionApplication(self.flask_app, self.host, self.port, **options).run()
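Given the `max_requests` default of 1 above, a single-model app that wants to keep its model loaded across requests can disable recycling when starting the server. A sketch (`ExampleApp` is a hypothetical name):

```python
from clams.restify import Restifier

# ExampleApp stands for your ClamsApp subclass
webapp = Restifier(ExampleApp())
# 0 disables worker recycling, so each worker (and its loaded model) persists
webapp.serve_production(max_requests=0)
```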

    def serve_development(self, **options):

@@ -144,6 +199,9 @@ def post(self) -> Response:
            return Response(response="Invalid input data. See below for validation error.\n\n" + str(e), status=500, mimetype='text/plain')
        try:
            return self.json_to_response(self.cla.annotate(raw_data, **raw_params))
        except InsufficientVRAMError as e:
            self.cla.logger.warning(f"Request rejected due to insufficient VRAM: {e}")
            return self.json_to_response(self.cla.record_error(raw_data, **raw_params).serialize(pretty=True), status=503)
        except Exception:
            self.cla.logger.exception("Error in annotation")
            return self.json_to_response(self.cla.record_error(raw_data, **raw_params).serialize(pretty=True), status=500)
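Because an `InsufficientVRAMError` now surfaces as a 503 rather than a generic 500, clients can treat the response as retryable. A minimal client-side sketch (the URL, payload variable, and wait time are made up; uses the third-party `requests` library):

```python
import time

import requests

def annotate_with_retry(url, mmif_str, attempts=5, wait=30):
    # POST a MMIF payload, retrying only on 503 (insufficient VRAM)
    for _ in range(attempts):
        resp = requests.post(url, data=mmif_str)
        if resp.status_code != 503:
            break
        time.sleep(wait)  # give a worker time to free GPU memory
    return resp

# resp = annotate_with_retry('http://localhost:5000', mmif_json)
```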
11 changes: 11 additions & 0 deletions documentation/clamsapp.md
@@ -209,6 +209,17 @@ $ python app.py
* By default, the app runs in *debugging* mode; you can switch to *production* mode by passing the `--production` option to support larger traffic volumes.
* As you might have noticed, the default `CMD` in the prebuilt containers is `python app.py --production --port 5000`.

##### Environment variables for production mode

When running in production mode, the following environment variables can be used to configure the app server:

| Variable | Description | Default |
|----------|-------------|---------|
| `CLAMS_WORKERS` | Number of gunicorn worker processes | Auto-calculated based on CPU cores and GPU memory |
| `CLAMS_LOGLEVEL` | Logging verbosity level (`debug`, `info`, `warning`, `error`) | `warning` |

By default, the number of workers is calculated as `(CPU cores × 2) + 1`. For GPU-based apps, see [GPU Memory Management](gpu-apps.md) for details on automatic worker scaling and VRAM management.
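For example (hypothetical values), running `CLAMS_WORKERS=2 CLAMS_LOGLEVEL=debug python app.py --production` starts the server with exactly two workers and verbose logging, bypassing the automatic calculation.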

#### `metadata.py`: Getting app metadata

Running `metadata.py` will print out the app metadata in JSON format.
Expand Down