Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ RUN set -ux; \
supervisor procps \
gcc g++ \
git vim redis-tools strace iputils-ping \
$(if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then echo libnvinfer10; fi) \
$(if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then echo libnvinfer-plugin10; fi) \
"$(if [[ "$BASE_IMAGE" =~ ^nvidia/cuda:([0-9]+)\.([0-9]+).+$ ]]; then echo "cuda-compiler-${BASH_REMATCH[1]}-${BASH_REMATCH[2]}"; fi)"; then \
break; \
fi; \
Expand Down Expand Up @@ -125,10 +127,10 @@ COPY requirements/ /app/requirements/
# Note: --index-strategy unsafe-best-match resolves conflicts between pypi.nvidia.com and pypi.org
RUN if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then \
echo "NVIDIA base image detected: installing GPU packages (cupy, cuml, onnxruntime-gpu, voyager, torch+cuda)"; \
uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/gpu.txt -r /app/requirements/common.txt || exit 1; \
UV_HTTP_TIMEOUT=600 uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/gpu.txt -r /app/requirements/common.txt || exit 1; \
else \
echo "CPU base image: installing all packages together for dependency resolution"; \
uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/cpu.txt -r /app/requirements/common.txt || exit 1; \
UV_HTTP_TIMEOUT=600 uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/cpu.txt -r /app/requirements/common.txt || exit 1; \
fi \
&& echo "Verifying psycopg2 installation..." \
&& python3 -c "import psycopg2; print('psycopg2 OK')" \
Expand Down
5 changes: 5 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@
# --- GPU Acceleration for Clustering (Optional, requires NVIDIA GPU and RAPIDS cuML) ---
# Enabled only when the env var is literally "true" (case-insensitive); any
# other value, including unset, leaves GPU clustering off.
USE_GPU_CLUSTERING = os.getenv("USE_GPU_CLUSTERING", "False").lower() == "true"

# --- TensorRT Execution Provider for ONNX Runtime (Optional, NVIDIA builds only) ---
# When true and TensorRT libraries are available, ONNX Runtime will prefer
# TensorrtExecutionProvider before CUDAExecutionProvider.
USE_TENSORRT = os.getenv("USE_TENSORRT", "False").lower() == "true"

# --- DBSCAN Only Constants (Ranges for Evolutionary Approach) ---
# Default ranges for DBSCAN parameters
DBSCAN_EPS_MIN = float(os.getenv("DBSCAN_EPS_MIN", "0.1"))
Expand Down
7 changes: 7 additions & 0 deletions deployment/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ MISTRAL_API_KEY=
# Default: false (CPU only)
USE_GPU_CLUSTERING=false

# --- TensorRT Execution Provider (NVIDIA images only) ---
# Enable TensorRT for ONNX Runtime inference (MusiCNN, CLAP, MuLan) when available.
# Requires NVIDIA images with TensorRT runtime libs included.
# If disabled or unavailable, ONNX Runtime falls back to CUDA, then CPU.
# Default: false
USE_TENSORRT=false

# --- CLAP Text Search Configuration ---
# Enable CLAP (Contrastive Language-Audio Pretraining) for natural language music search
# CLAP allows searching your music collection using text queries like "upbeat summer songs" or "relaxing piano music"
Expand Down
2 changes: 2 additions & 0 deletions deployment/docker-compose-navidrome_local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
TEMP_DIR: "/app/temp_audio"
# Authentication (optional) – leave blank to disable
API_TOKEN: "${API_TOKEN:-}"
Expand Down Expand Up @@ -99,6 +100,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
TEMP_DIR: "/app/temp_audio"
# Authentication (optional) – leave blank to disable
API_TOKEN: "${API_TOKEN:-}"
Expand Down
2 changes: 2 additions & 0 deletions deployment/docker-compose-nvidia-local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ services:
OLLAMA_SERVER_URL: "${OLLAMA_SERVER_URL:-http://192.168.1.71:11434/api/generate}"
OLLAMA_MODEL_NAME: "${OLLAMA_MODEL_NAME:-qwen3:1.7b}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}"
USE_TENSORRT: "${USE_TENSORRT:-false}"
TEMP_DIR: "/app/temp_audio"
# Authentication (optional) – leave blank to disable
API_TOKEN: "${API_TOKEN:-}"
Expand Down Expand Up @@ -106,6 +107,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}"
USE_TENSORRT: "${USE_TENSORRT:-false}"
NVIDIA_VISIBLE_DEVICES: "0"
NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
USE_GPU_CLUSTERING: "${USE_GPU_CLUSTERING:-true}"
Expand Down
2 changes: 2 additions & 0 deletions deployment/docker-compose-nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
TEMP_DIR: "/app/temp_audio"
volumes:
- temp-audio-flask:/app/temp_audio # Volume for temporary audio files
Expand Down Expand Up @@ -91,6 +92,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
NVIDIA_VISIBLE_DEVICES: "0"
NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
USE_GPU_CLUSTERING: "${USE_GPU_CLUSTERING:-true}"
Expand Down
1 change: 1 addition & 0 deletions deployment/docker-compose-server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
TEMP_DIR: "/app/temp_audio"
API_TOKEN: "${API_TOKEN:-}"
AUDIOMUSE_USER: "${AUDIOMUSE_USER:-}"
Expand Down
1 change: 1 addition & 0 deletions deployment/docker-compose-worker-nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
TEMP_DIR: "/app/temp_audio"
NVIDIA_VISIBLE_DEVICES: "0"
NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
Expand Down
1 change: 1 addition & 0 deletions deployment/docker-compose-worker_local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ services:
GEMINI_API_KEY: "${GEMINI_API_KEY}"
MISTRAL_API_KEY: "${MISTRAL_API_KEY}"
CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems)
USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available
TEMP_DIR: "/app/temp_audio"
# Authentication (optional) – leave blank to disable
API_TOKEN: "${API_TOKEN:-}"
Expand Down
15 changes: 15 additions & 0 deletions docs/GPU.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@ We suggest **8GB VRAM** on GPU, with less you can experience the NON BLOCKING Ou
3. Ensure NVIDIA Container Toolkit is installed on your host
4. Use docker-compose files with GPU support (e.g., `docker-compose-nvidia.yaml` or `docker-compose-worker-nvidia.yaml`)

**TensorRT (optional ONNX acceleration):**

NVIDIA images now include TensorRT runtime libraries required by ONNX Runtime.
TensorRT remains **opt-in** to avoid changing existing behavior.

Set in your `.env` file:

```
USE_TENSORRT=true
```

When enabled and available, ONNX Runtime provider order becomes:
`TensorrtExecutionProvider -> CUDAExecutionProvider -> CPUExecutionProvider`.
If TensorRT cannot be used, inference falls back automatically.

**Performance Impact:**
- **KMeans**: 10-50x faster than CPU
- **DBSCAN**: 5-100x faster than CPU
Expand Down
1 change: 1 addition & 0 deletions docs/PARAMETERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ These are the default parameters used when launching analysis or clustering task
| `CLUSTERING_RUNS` | Iterations for Monte Carlo evolutionary search. | `1000` |
| `TOP_N_PLAYLISTS` | Post clustering, it keeps only the top N diverse playlists. | `8` |
| `USE_GPU_CLUSTERING` | When true, enables the use of GPU for K-Means, DBSCAN and PCA | `false` |
| `USE_TENSORRT` | When true and TensorRT is available (NVIDIA images), ONNX Runtime prefers TensorRT EP before CUDA for MusiCNN/CLAP/MuLan | `false` |
| **Similarity General** | | |
| `INDEX_NAME` | Name of the index, no need to change. | `music_library` |
| `VOYAGER_EF_CONSTRUCTION` | Number of element analyzed to create the neighbor list in the index. | `1024` |
Expand Down
93 changes: 34 additions & 59 deletions tasks/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@
SessionRecycler,
comprehensive_memory_cleanup
)
from .onnx_providers import (
build_ort_provider_options,
split_provider_options,
log_provider_selection,
)


from psycopg2 import OperationalError
Expand Down Expand Up @@ -380,25 +385,13 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None):
should_cleanup_sessions = False

# Configure provider options for GPU memory management (used for main and secondary models)
available_providers = ort.get_available_providers()
if 'CUDAExecutionProvider' in available_providers:
# Get GPU device ID from environment or default to 0
gpu_device_id = 0
cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cuda_visible and cuda_visible != '-1':
gpu_device_id = 0

cuda_options = {
'device_id': gpu_device_id,
'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation
'cudnn_conv_algo_search': 'EXHAUSTIVE',
'do_copy_in_default_stream': True,
}
provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})]
logger.info(f"CUDA provider available - attempting to use GPU for analysis (device_id={gpu_device_id})")
else:
provider_options = [('CPUExecutionProvider', {})]
logger.info("CUDA provider not available - using CPU only")
provider_options, available_providers = build_ort_provider_options(
ort,
cuda_algo_search='EXHAUSTIVE',
include_copy_stream=True,
)
providers, provider_opts = split_provider_options(provider_options)
log_provider_selection(logger, "MusiCNN", provider_options, available_providers)

try:
# Use pre-loaded sessions if provided, otherwise load per-song
Expand All @@ -411,8 +404,8 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None):
try:
embedding_sess = ort.InferenceSession(
model_paths['embedding'],
providers=[p[0] for p in provider_options],
provider_options=[p[1] for p in provider_options]
providers=providers,
provider_options=provider_opts
)
except Exception:
# Fallback to CPU if preferred providers fail
Expand All @@ -425,8 +418,8 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None):
try:
prediction_sess = ort.InferenceSession(
model_paths['prediction'],
providers=[p[0] for p in provider_options],
provider_options=[p[1] for p in provider_options]
providers=providers,
provider_options=provider_opts
)
except Exception:
# Fallback to CPU if preferred providers fail
Expand Down Expand Up @@ -715,30 +708,21 @@ def get_missing_mulan_track_ids(track_ids):
if onnx_sessions is None:
logger.info(f"Lazy-loading MusiCNN models for album: {album_name}")
onnx_sessions = {}
available_providers = ort.get_available_providers()

if 'CUDAExecutionProvider' in available_providers:
gpu_device_id = 0
cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cuda_visible and cuda_visible != '-1':
gpu_device_id = 0
cuda_options = {
'device_id': gpu_device_id,
'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation
'cudnn_conv_algo_search': 'EXHAUSTIVE', # Find memory-efficient algorithms
'do_copy_in_default_stream': True, # Better memory sync
}
provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})]
else:
provider_options = [('CPUExecutionProvider', {})]
provider_options, available_providers = build_ort_provider_options(
ort,
cuda_algo_search='EXHAUSTIVE',
include_copy_stream=True,
)
providers, provider_opts = split_provider_options(provider_options)
log_provider_selection(logger, "MusiCNN lazy-load", provider_options, available_providers)

try:
for model_name, model_path in model_paths.items():
try:
onnx_sessions[model_name] = ort.InferenceSession(
model_path,
providers=[p[0] for p in provider_options],
provider_options=[p[1] for p in provider_options]
providers=providers,
provider_options=provider_opts
)
except Exception:
onnx_sessions[model_name] = ort.InferenceSession(
Expand All @@ -763,30 +747,21 @@ def get_missing_mulan_track_ids(track_ids):

# Recreate sessions
onnx_sessions = {}
available_providers = ort.get_available_providers()

if 'CUDAExecutionProvider' in available_providers:
gpu_device_id = 0
cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cuda_visible and cuda_visible != '-1':
gpu_device_id = 0
cuda_options = {
'device_id': gpu_device_id,
'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation
'cudnn_conv_algo_search': 'EXHAUSTIVE',
'do_copy_in_default_stream': True,
}
provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})]
else:
provider_options = [('CPUExecutionProvider', {})]
provider_options, available_providers = build_ort_provider_options(
ort,
cuda_algo_search='EXHAUSTIVE',
include_copy_stream=True,
)
providers, provider_opts = split_provider_options(provider_options)
log_provider_selection(logger, "MusiCNN recycle", provider_options, available_providers)

try:
for model_name, model_path in model_paths.items():
try:
onnx_sessions[model_name] = ort.InferenceSession(
model_path,
providers=[p[0] for p in provider_options],
provider_options=[p[1] for p in provider_options]
providers=providers,
provider_options=provider_opts
)
except Exception:
onnx_sessions[model_name] = ort.InferenceSession(
Expand Down
Loading
Loading