diff --git a/clams/app/__init__.py b/clams/app/__init__.py index 6550327..b66cc55 100644 --- a/clams/app/__init__.py +++ b/clams/app/__init__.py @@ -13,10 +13,11 @@ from typing import Union, Any, Optional, Dict, List, Tuple from mmif import Mmif, Document, DocumentTypes, View +from mmif.utils.cli.describe import generate_param_hash # pytype: disable=import-error from clams.appmetadata import AppMetadata, real_valued_primitives, python_type, map_param_kv_delimiter logging.basicConfig( - level=logging.WARNING, + level=getattr(logging, os.environ.get('CLAMS_LOGLEVEL', 'WARNING').upper(), logging.WARNING), format="%(asctime)s %(name)s %(levelname)-8s %(thread)d %(message)s", datefmt="%Y-%m-%d %H:%M:%S") @@ -47,7 +48,7 @@ class ClamsApp(ABC): 'description': 'The JSON body of the HTTP response will be re-formatted with 2-space indentation', }, { - 'name': 'runningTime', 'type': 'boolean', 'choices': None, 'default': False, 'multivalued': False, + 'name': 'runningTime', 'type': 'boolean', 'choices': None, 'default': True, 'multivalued': False, 'description': 'The running time of the app will be recorded in the view metadata', }, { @@ -160,20 +161,19 @@ def annotate(self, mmif: Union[str, dict, Mmif], **runtime_params: List[str]) -> hwFetch = refined.get('hwFetch', False) runtime_recs = {} if hwFetch: + import multiprocessing import platform, shutil, subprocess - runtime_recs['architecture'] = platform.machine() - # runtime_recs['processor'] = platform.processor() # this only works on Windows + runtime_recs['cpu'] = f"{platform.machine()}, {multiprocessing.cpu_count()} cores" runtime_recs['cuda'] = [] # Use cuda_profiler data if available, otherwise fallback to nvidia-smi if cuda_profiler: - for gpu_info, peak_memory_bytes in cuda_profiler.items(): - # Convert peak memory to human-readable format - peak_memory_mb = peak_memory_bytes / (1000 * 1000) - if peak_memory_mb >= 1000: - peak_memory_str = f"{peak_memory_mb / 1000:.2f} GiB" - else: - peak_memory_str = f"{peak_memory_mb:.1f} MiB" - runtime_recs['cuda'].append(f"{gpu_info}, Used {self._cuda_memory_to_str(peak_memory_bytes)}") + for gpu_name, mem_info in cuda_profiler.items(): + total_str = self._cuda_memory_to_str(mem_info['total']) + available_str = self._cuda_memory_to_str(mem_info['available_before']) + peak_str = self._cuda_memory_to_str(mem_info['peak']) + runtime_recs['cuda'].append( + f"{gpu_name}, {total_str} total, {available_str} available, {peak_str} peak used" + ) elif shutil.which('nvidia-smi'): for gpu in subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'], stdout=subprocess.PIPE).stdout.decode('utf-8').strip().split('\n'): @@ -345,50 +345,224 @@ def _cuda_device_name_concat(name, mem): mem = ClamsApp._cuda_memory_to_str(mem) return f"{name}, With {mem}" + def _get_profile_path(self, param_hash: str) -> pathlib.Path: + """ + Get filesystem path for memory profile file. + + Profile files are stored in a per-app directory under user's cache. 
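+        For example, with a hypothetical app identifier
+        ``http://apps.clams.ai/my-app/v1.0`` and the default ``~/.cache`` location,
+        a profile would be stored at::
+
+            ~/.cache/clams/memory_profiles/http---apps.clams.ai-my-app-v1.0/memory_<param_hash>.json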
+ + :param param_hash: Hash of parameters from :func:`mmif.utils.cli.describe.generate_param_hash` + :return: Path to the profile file + """ + # Sanitize app identifier for filesystem use + app_id = self.metadata.identifier.replace('/', '-').replace(':', '-') + cache_base = pathlib.Path(os.environ.get('XDG_CACHE_HOME', pathlib.Path.home() / '.cache')) + cache_dir = cache_base / 'clams' / 'memory_profiles' / app_id + return cache_dir / f"memory_{param_hash}.json" + + @staticmethod + def _get_available_vram() -> int: + """ + Get currently available VRAM in bytes (GPU-wide, across all processes). + + Uses nvidia-smi to get actual available memory, not just current process. + + :return: Available VRAM in bytes, or 0 if unavailable + """ + try: + import subprocess + import shutil + if shutil.which('nvidia-smi'): + # Get free memory from nvidia-smi (reports GPU-wide, not per-process) + result = subprocess.run( + ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', '0'], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0 and result.stdout.strip(): + free_mb = float(result.stdout.strip()) + return int(free_mb * 1024 * 1024) # Convert MB to bytes + except Exception: + pass + + # Fallback to torch (only sees current process memory) + try: + import torch # pytype: disable=import-error + if not torch.cuda.is_available(): + return 0 + + device = torch.cuda.current_device() + total = torch.cuda.get_device_properties(device).total_memory + used = max(torch.cuda.memory_allocated(device), + torch.cuda.memory_reserved(device)) + return total - used + except Exception: + return 0 + + def _record_vram_usage(self, parameters: dict, peak_bytes: int) -> None: + """ + Record peak memory usage to profile file. + + Uses atomic write (temp + rename) to avoid corruption from + concurrent writes. Only updates if new value is higher. 
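+
+        For example, a stored profile might look like (illustrative values)::
+
+            {"peak_bytes": 3221225472, "parameters": {"pretty": true}}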
+ + Profile files are JSON containing: + - peak_bytes: Peak VRAM usage by the torch process + - parameters: Original parameters for human readability + + :param parameters: Request parameters (for hash and recording) + :param peak_bytes: Measured peak VRAM usage + """ + import json + + if peak_bytes <= 0: + return + + param_hash = generate_param_hash(parameters) + profile_path = self._get_profile_path(param_hash) + + try: + profile_path.parent.mkdir(parents=True, exist_ok=True) + + # Check if we should update + should_write = True + if profile_path.exists(): + try: + existing_data = json.loads(profile_path.read_text()) + existing = existing_data.get('peak_bytes', 0) + if peak_bytes <= existing: + should_write = False # Existing value is sufficient + else: + self.logger.debug( + f"Updating peak memory for {param_hash}: " + f"{existing/1024**3:.2f}GB -> {peak_bytes/1024**3:.2f}GB" + ) + except (ValueError, IOError, json.JSONDecodeError): + pass # Corrupted file, overwrite + + if should_write: + # Prepare profile data with original parameters for readability + # Filter out internal keys and non-serializable values + clean_params = { + k: v for k, v in parameters.items() + if k != self._RAW_PARAMS_KEY and not k.startswith('#') + } + profile_data = { + 'peak_bytes': peak_bytes, + 'parameters': clean_params + } + + # Atomic write: write to temp, then rename + temp_path = profile_path.with_suffix('.tmp') + temp_path.write_text(json.dumps(profile_data, indent=2)) + temp_path.rename(profile_path) # Atomic on POSIX + + self.logger.info( + f"Recorded peak memory for {param_hash}: " + f"{peak_bytes/1024**3:.2f}GB" + ) + except Exception as e: + self.logger.warning(f"Failed to record memory profile: {e}") + @staticmethod def _profile_cuda_memory(func): """ - Decorator for profiling CUDA memory usage during _annotate execution. - + Decorator for profiling CUDA memory usage and managing VRAM availability. + + This decorator: + 1. Checks VRAM requirements before execution (if conditions met) + 2. Rejects requests if insufficient VRAM + 3. Records peak memory usage after execution + 4. 
Calls empty_cache() for cleanup + :param func: The function to wrap (typically _annotate) :return: Decorated function that returns (result, cuda_profiler) where cuda_profiler is dict with ", " keys - and peak memory usage values + and dict values containing 'available_before' and 'peak' memory in bytes """ def wrapper(*args, **kwargs): + # Get the ClamsApp instance from the bound method + app_instance = getattr(func, '__self__', None) + cuda_profiler = {} torch_available = False cuda_available = False device_count = 0 - + available_before = {} + try: import torch # pytype: disable=import-error torch_available = True cuda_available = torch.cuda.is_available() device_count = torch.cuda.device_count() - if cuda_available: - # Reset peak memory stats for all devices - torch.cuda.reset_peak_memory_stats('cuda') except ImportError: pass - + + # Capture available VRAM before execution and reset stats + if torch_available and cuda_available: + for device_id in range(device_count): + device_id_str = f'cuda:{device_id}' + # Get GPU-wide available memory via nvidia-smi + try: + import subprocess + import shutil + if shutil.which('nvidia-smi'): + result = subprocess.run( + ['nvidia-smi', '--query-gpu=memory.free', + '--format=csv,noheader,nounits', '-i', str(device_id)], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0 and result.stdout.strip(): + free_mb = float(result.stdout.strip()) + available_before[device_id] = int(free_mb * 1024 * 1024) + else: + # Fallback to torch (process-specific) + total = torch.cuda.get_device_properties(device_id_str).total_memory + allocated = torch.cuda.memory_allocated(device_id_str) + available_before[device_id] = total - allocated + else: + # Fallback to torch (process-specific) + total = torch.cuda.get_device_properties(device_id_str).total_memory + allocated = torch.cuda.memory_allocated(device_id_str) + available_before[device_id] = total - allocated + except Exception: + # Fallback to torch (process-specific) + total = torch.cuda.get_device_properties(device_id_str).total_memory + allocated = torch.cuda.memory_allocated(device_id_str) + available_before[device_id] = total - allocated + # Reset peak memory stats for all devices + torch.cuda.reset_peak_memory_stats('cuda') + try: result = func(*args, **kwargs) - + + # Record peak memory usage + total_peak = 0 if torch_available and cuda_available and device_count > 0: for device_id in range(device_count): - device_id = f'cuda:{device_id}' - peak_memory = torch.cuda.max_memory_allocated(device_id) - gpu_name = torch.cuda.get_device_name(device_id) - gpu_total_memory = torch.cuda.get_device_properties(device_id).total_memory - key = ClamsApp._cuda_device_name_concat(gpu_name, gpu_total_memory) - cuda_profiler[key] = peak_memory - + device_id_str = f'cuda:{device_id}' + peak_memory = torch.cuda.max_memory_allocated(device_id_str) + total_peak = max(total_peak, peak_memory) + gpu_name = torch.cuda.get_device_name(device_id_str) + gpu_total_memory = torch.cuda.get_device_properties(device_id_str).total_memory + cuda_profiler[gpu_name] = { + 'total': gpu_total_memory, + 'available_before': available_before.get(device_id, 0), + 'peak': peak_memory + } + + # Record peak memory for future requests (if GPU app) + gpu_app = ( + hasattr(app_instance, 'metadata') and + getattr(app_instance.metadata, 'est_gpu_mem_min', 0) > 0 + ) + if gpu_app and total_peak > 0: + app_instance._record_vram_usage(kwargs, total_peak) + return result, cuda_profiler finally: if torch_available and cuda_available: 
torch.cuda.empty_cache() - + return wrapper @staticmethod diff --git a/clams/appmetadata/__init__.py b/clams/appmetadata/__init__.py index 4a7bd3c..9ef5544 100644 --- a/clams/appmetadata/__init__.py +++ b/clams/appmetadata/__init__.py @@ -352,9 +352,20 @@ class AppMetadata(pydantic.BaseModel): "a package name and its version in the string value at the minimum (e.g., ``clams-python==1.2.3``)." ) more: Optional[Dict[str, str]] = pydantic.Field( - None, + None, description="(optional) A string-to-string map that can be used to store any additional metadata of the app." ) + est_gpu_mem_min: int = pydantic.Field( + 0, + description="(optional) Minimum GPU memory required to run the app, in megabytes (MB). " + "Set to 0 (default) if the app does not use GPU." + ) + est_gpu_mem_typ: int = pydantic.Field( + 0, + description="(optional) Typical GPU memory usage for default parameters, in megabytes (MB). " + "Must be equal or larger than est_gpu_mem_min. " + "Set to 0 (default) if the app does not use GPU." + ) model_config = { 'title': 'CLAMS AppMetadata', @@ -372,6 +383,21 @@ def assign_versions(cls, data): data.mmif_version = get_mmif_specver() return data + @pydantic.model_validator(mode='after') + @classmethod + def validate_gpu_memory(cls, data): + import warnings + if data.est_gpu_mem_typ > 0 and data.est_gpu_mem_min > 0: + if data.est_gpu_mem_typ < data.est_gpu_mem_min: + warnings.warn( + f"est_gpu_mem_typ ({data.est_gpu_mem_typ} MB) is less than " + f"est_gpu_mem_min ({data.est_gpu_mem_min} MB). " + f"Setting est_gpu_mem_typ to {data.est_gpu_mem_min} MB.", + UserWarning + ) + data.est_gpu_mem_typ = data.est_gpu_mem_min + return data + @pydantic.field_validator('identifier', mode='before') @classmethod def append_version(cls, val): diff --git a/clams/develop/templates/app/metadata.py.template b/clams/develop/templates/app/metadata.py.template index 8b1f8c7..93aec79 100644 --- a/clams/develop/templates/app/metadata.py.template +++ b/clams/develop/templates/app/metadata.py.template @@ -39,6 +39,9 @@ def appmetadata() -> AppMetadata: # this trick can also be useful (replace ANALYZER_NAME with the pypi dist name) analyzer_version=[l.strip().rsplit('==')[-1] for l in open(pathlib.Path(__file__).parent / 'requirements.txt').readlines() if re.match(r'^ANALYZER_NAME==', l)][0], analyzer_license="", # short name for a software license + # GPU memory estimates (in MB). Set to 0 if the app does not use GPU. + est_gpu_mem_min=0, # estimated memory usage with minimal computation parameters + est_gpu_mem_typ=0, # estimated memory usage with default parameters, must be >= est_gpu_mem_min ) # and then add I/O specifications: an app must have at least one input and one output metadata.add_input(DocumentTypes.Document) diff --git a/clams/restify/__init__.py b/clams/restify/__init__.py index ad522b8..811ee4a 100644 --- a/clams/restify/__init__.py +++ b/clams/restify/__init__.py @@ -42,14 +42,55 @@ def run(self, **options): def serve_production(self, **options): """ Runs the CLAMS app as a flask webapp, using a production-ready web server (gunicorn, https://docs.gunicorn.org/en/stable/#). - + :param options: any additional options to pass to the web server. 
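+
+        For example, a single-model GPU app can keep its gunicorn workers alive
+        between requests (``max_requests`` is forwarded to gunicorn unchanged)::
+
+            restifier.serve_production(max_requests=0)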
""" import gunicorn.app.base import multiprocessing + import os def number_of_workers(): - return (multiprocessing.cpu_count() * 2) + 1 # +1 to make sure at least two workers are running + # Allow override via environment variable + if 'CLAMS_GUNICORN_WORKERS' in os.environ: + return int(os.environ['CLAMS_GUNICORN_WORKERS']) + + cpu_workers = (multiprocessing.cpu_count() * 2) + 1 + + # Get GPU memory requirement from app metadata + # Use est_gpu_mem_typ (typical usage) for worker calculation + try: + metadata = self.cla.metadata + gpu_mem_mb = metadata.est_gpu_mem_typ # typical usage determines how many workers fit + except Exception: + gpu_mem_mb = 0 + + if gpu_mem_mb <= 0: + return cpu_workers + + # Calculate workers based on total VRAM of the first CUDA device (no other GPUs are considered for now) + # Use nvidia-smi instead of torch to avoid initializing CUDA in parent process before fork + try: + import subprocess + import shutil + if shutil.which('nvidia-smi'): + result = subprocess.run( + ['nvidia-smi', '--query-gpu=memory.total', '--format=csv,noheader,nounits', '-i', '0'], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0 and result.stdout.strip(): + total_vram_mb = float(result.stdout.strip()) + vram_workers = max(1, int(total_vram_mb // gpu_mem_mb)) + workers = min(vram_workers, cpu_workers) + self.cla.logger.info( + f"GPU detected: {total_vram_mb:.0f} MB VRAM, " + f"app requires {gpu_mem_mb} MB, " + f"using {workers} workers (max {vram_workers} by VRAM, {cpu_workers} by CPU)" + ) + return workers + except Exception: + pass + + return cpu_workers class ProductionApplication(gunicorn.app.base.BaseApplication): @@ -58,9 +99,16 @@ def __init__(self, app, host, port, **options): 'bind': f'{host}:{port}', 'workers': number_of_workers(), 'threads': 2, + # disable timeout for long-running GPU workloads (default 30s is too short) + 'timeout': 0, # because the default is 'None' 'accesslog': '-', # errorlog, however, is redirected to stderr by default since 19.2, so no need to set + # log level is warning by default + 'loglevel': os.environ.get('CLAMS_LOGLEVEL', 'warning').lower(), + # default to 1 to free GPU memory after each request + # developers can override via serve_production(max_requests=N) for single-model apps + 'max_requests': 1, } self.options.update(options) self.application = app @@ -75,6 +123,13 @@ def load_config(self): def load(self): return self.application + # Log max_requests setting + max_req = options.get('max_requests', 1) # default is 1, meaning workers are killed after each request + if max_req == 0: + self.cla.logger.info("Worker recycling: disabled (workers persist)") + else: + self.cla.logger.info(f"Worker recycling: after {max_req} request(s)") + ProductionApplication(self.flask_app, self.host, self.port, **options).run() def serve_development(self, **options): diff --git a/documentation/clamsapp.md b/documentation/clamsapp.md index 27d1a6e..5ec697f 100644 --- a/documentation/clamsapp.md +++ b/documentation/clamsapp.md @@ -183,8 +183,8 @@ For example, appending `?pretty=True` to the URL will result in a JSON output wi > **Note** > When you're using `curl` from a shell session, you need to escape the `?` or `&` characters with `\` to prevent the shell from interpreting it as a special character. - -Different apps have different configurability. For configuration parameters of an app, please refer to `parameter` section of the app metadata. + +Different apps have different configurability. 
For configuration parameters of an app, please refer to the `parameter` section of the app metadata. In addition to app-specific parameters, all apps support universal parameters (e.g., `pretty` for formatted output). Check the app metadata for the complete and up-to-date list.
 
 ### Using CLAMS App as a CLI program
 
@@ -209,6 +209,17 @@ $ python app.py
 * By default, the app will be running in *debugging* mode, but you can change it to *production* mode by passing `--production` option to support larger traffic volume.
 * As you might have noticed, the default `CMD` in the prebuilt containers is `python app.py --production --port 5000`.
 
+##### Environment variables for production mode
+
+When running in production mode, the following environment variables can be used to configure the app server:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `CLAMS_GUNICORN_WORKERS` | Number of gunicorn worker processes | Auto-calculated based on CPU cores and GPU memory |
+| `CLAMS_LOGLEVEL` | Logging verbosity level (`debug`, `info`, `warning`, `error`) | `warning` |
+
+By default, the number of workers is calculated as `(CPU cores × 2) + 1`. For GPU-based apps, see [GPU Memory Management](gpu-apps.rst) for details on automatic worker scaling and VRAM management.
+
 #### `metadata.py`: Getting app metadata
 
 Running `metadata.py` will print out the app metadata in JSON format.
diff --git a/documentation/gpu-apps.rst b/documentation/gpu-apps.rst
new file mode 100644
index 0000000..2f2025e
--- /dev/null
+++ b/documentation/gpu-apps.rst
@@ -0,0 +1,127 @@
+GPU Memory Management for CLAMS Apps
+=====================================
+
+This document covers GPU memory management features in the CLAMS SDK for developers building CUDA-based applications.
+
+Overview
+--------
+
+CLAMS apps that use GPU acceleration face memory management challenges when running as HTTP servers with multiple workers. Each gunicorn worker loads models independently into GPU VRAM, which can cause out-of-memory (OOM) errors.
+
+The CLAMS SDK provides:
+
+1. **Metadata fields** for declaring GPU memory requirements
+2. **Automatic worker scaling** based on available VRAM
+3. **Worker recycling** to release GPU memory between requests
+4. **Memory monitoring** via the ``hwFetch`` parameter
+
+.. note::
+   Memory profiling features require **PyTorch** (``torch.cuda`` APIs). Worker calculation uses ``nvidia-smi`` and works with any framework.
+
+Declaring GPU Memory Requirements
+---------------------------------
+
+Declare GPU memory requirements in app metadata:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 15 10 10 65
+
+   * - Field
+     - Type
+     - Default
+     - Description
+   * - ``est_gpu_mem_min``
+     - int
+     - 0
+     - Memory usage in MB with parameters set for least computation (e.g., smallest model). 0 means no GPU.
+   * - ``est_gpu_mem_typ``
+     - int
+     - 0
+     - Memory usage in MB with default parameters. Used for worker calculation.
+
+These values don't need to be precise. A reasonable estimate from development experience (e.g., observing ``nvidia-smi`` during runs) is sufficient.
+
+Example:
+
+.. code-block:: python
+
+   metadata = AppMetadata(
+       name="My GPU App",
+       # ... other fields
+       est_gpu_mem_min=4000,  # 4GB minimum
+       est_gpu_mem_typ=6000,  # 6GB typical
+   )
+
+Gunicorn Integration
+--------------------
+
+Running ``python app.py --production`` starts a gunicorn server with automatic GPU-aware configuration.
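+
+For example, both the computed worker count and the default log level can be
+overridden through environment variables (the values below are illustrative):
+
+.. code-block:: bash
+
+   CLAMS_GUNICORN_WORKERS=2 CLAMS_LOGLEVEL=info python app.py --production --port 5000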
+ +Worker Calculation +~~~~~~~~~~~~~~~~~~ + +Worker count is the minimum of: + +- CPU-based: ``(cores × 2) + 1`` +- VRAM-based: ``total_vram / est_gpu_mem_typ`` + +Override with ``CLAMS_GUNICORN_WORKERS`` environment variable if needed. + +Worker Recycling +~~~~~~~~~~~~~~~~ + +By default, workers are recycled after each request (``max_requests=1``) to fully release GPU memory. For single-model apps, disable recycling for better performance: + +.. code-block:: python + + restifier.serve_production(max_requests=0) # Workers persist + +NVIDIA Memory Oversubscription +------------------------------ + +.. warning:: + **NVIDIA drivers R535+ include "System Memory Fallback"** - when VRAM is exhausted, the GPU swaps to system RAM via PCIe. This prevents OOM errors but causes **severe performance degradation (5-10x slower)**. + + This feature is convenient for development but can mask memory issues in production. Monitor actual VRAM usage with ``hwFetch`` to ensure your app fits in GPU memory. + +Disabling Oversubscription +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To force OOM errors instead of silent performance degradation: + +**PyTorch:** + +.. code-block:: python + + import torch + # Limit to 90% of VRAM - will raise OOM if exceeded + torch.cuda.set_per_process_memory_fraction(0.9) + +**TensorFlow:** + +.. code-block:: python + + import tensorflow as tf + gpus = tf.config.list_physical_devices('GPU') + if gpus: + # Set hard memory limit (in MB) + tf.config.set_logical_device_configuration( + gpus[0], + [tf.config.LogicalDeviceConfiguration(memory_limit=8000)] + ) + +Monitoring with hwFetch +----------------------- + +Enable ``hwFetch`` parameter to include GPU info in responses: + +.. code-block:: bash + + curl -X POST "http://localhost:5000/?hwFetch=true" -d@input.mmif + +Response includes:: + + NVIDIA RTX 4090, 23.65 GiB total, 20.00 GiB available, 3.50 GiB peak used + +Use this to verify your app's actual VRAM usage and tune ``est_gpu_mem_typ`` accordingly. diff --git a/documentation/index.rst b/documentation/index.rst index 135da15..3f9fdd1 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -10,6 +10,7 @@ Welcome to CLAMS Python SDK documentation! 
introduction input-output runtime-params + gpu-apps appmetadata appdirectory cli diff --git a/requirements.txt b/requirements.txt index 8a44892..12d786c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -mmif-python==1.1.2 +mmif-python==1.2.0 Flask>=2 Flask-RESTful>=0.3.9 diff --git a/tests/test_clamsapp.py b/tests/test_clamsapp.py index 8318f1b..1c0af8f 100644 --- a/tests/test_clamsapp.py +++ b/tests/test_clamsapp.py @@ -85,7 +85,7 @@ def _annotate(self, mmif, **kwargs): class TestClamsApp(unittest.TestCase): def setUp(self): - self.appmetadataschema = json.loads(AppMetadata.schema_json()) + self.appmetadataschema = AppMetadata.model_json_schema() self.app = ExampleClamsApp() self.in_mmif = ExampleInputMMIF.get_mmif() @@ -297,13 +297,13 @@ def test_annotate_returns_invalid_mmif(self): def test_open_document_location(self): mmif = ExampleInputMMIF.get_rawmmif() - with self.app.open_document_location(mmif.documents['t1']) as f: + with self.app.open_document_location(mmif['t1']) as f: self.assertEqual(f.read(), ExampleInputMMIF.EXAMPLE_TEXT) def test_open_document_location_custom_opener(self): from PIL import Image mmif = ExampleInputMMIF.get_rawmmif() - with self.app.open_document_location(mmif.documents['i1'], Image.open) as f: + with self.app.open_document_location(mmif['i1'], Image.open) as f: self.assertEqual(f.size, (200, 71)) def test_refine_parameters(self): @@ -355,20 +355,60 @@ def test_refine_parameters(self): def test_error_handling(self): params = {'raise_error': ['true'], 'pretty': ['true']} in_mmif = Mmif(self.in_mmif) - try: + try: out_mmif = self.app.annotate(in_mmif, **params) except Exception as e: out_mmif_from_str = self.app.set_error_view(self.in_mmif, **params) out_mmif_from_mmif = self.app.set_error_view(in_mmif, **params) self.assertEqual( - out_mmif_from_mmif.views.get_last(), - out_mmif_from_str.views.get_last()) + out_mmif_from_mmif.views.get_last_contentful_view(), + out_mmif_from_str.views.get_last_contentful_view()) out_mmif = out_mmif_from_str self.assertIsNotNone(out_mmif) last_view: View = next(reversed(out_mmif.views)) self.assertEqual(len(last_view.metadata.contains), 0) self.assertEqual(len(last_view.metadata.error), 2) + def test_gpu_mem_fields_default_zero(self): + """GPU memory fields default to 0.""" + metadata = AppMetadata( + name="Test App", + description="Test", + app_license="MIT", + identifier="test-app", + url="https://example.com", + ) + metadata.add_input(DocumentTypes.TextDocument) + metadata.add_output(AnnotationTypes.TimeFrame) + + self.assertEqual(metadata.est_gpu_mem_min, 0) + self.assertEqual(metadata.est_gpu_mem_typ, 0) + + def test_est_gpu_mem_typ_validation(self): + """Warning issued when est_gpu_mem_typ < est_gpu_mem_min, autocorrected.""" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + + metadata = AppMetadata( + name="Test App", + description="Test", + app_license="MIT", + identifier="test-app", + url="https://example.com", + est_gpu_mem_min=4000, # 4GB min + est_gpu_mem_typ=2000, # 2GB typical (less than min!) + ) + metadata.add_input(DocumentTypes.TextDocument) + metadata.add_output(AnnotationTypes.TimeFrame) + + # Should have issued a warning + self.assertEqual(len(w), 1) + self.assertIn('est_gpu_mem_typ', str(w[0].message)) + self.assertIn('est_gpu_mem_min', str(w[0].message)) + + # Should have auto-corrected + self.assertEqual(metadata.est_gpu_mem_typ, metadata.est_gpu_mem_min) + class TestRestifier(unittest.TestCase):