diff --git a/Dockerfile b/Dockerfile index 372d5245..bf0b631d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,6 +95,8 @@ RUN set -ux; \ supervisor procps \ gcc g++ \ git vim redis-tools strace iputils-ping \ + $(if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then echo libnvinfer10; fi) \ + $(if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then echo libnvinfer-plugin10; fi) \ "$(if [[ "$BASE_IMAGE" =~ ^nvidia/cuda:([0-9]+)\.([0-9]+).+$ ]]; then echo "cuda-compiler-${BASH_REMATCH[1]}-${BASH_REMATCH[2]}"; fi)"; then \ break; \ fi; \ @@ -125,10 +127,10 @@ COPY requirements/ /app/requirements/ # Note: --index-strategy unsafe-best-match resolves conflicts between pypi.nvidia.com and pypi.org RUN if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then \ echo "NVIDIA base image detected: installing GPU packages (cupy, cuml, onnxruntime-gpu, voyager, torch+cuda)"; \ - uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/gpu.txt -r /app/requirements/common.txt || exit 1; \ + UV_HTTP_TIMEOUT=600 uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/gpu.txt -r /app/requirements/common.txt || exit 1; \ else \ echo "CPU base image: installing all packages together for dependency resolution"; \ - uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/cpu.txt -r /app/requirements/common.txt || exit 1; \ + UV_HTTP_TIMEOUT=600 uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/cpu.txt -r /app/requirements/common.txt || exit 1; \ fi \ && echo "Verifying psycopg2 installation..." 
\ && python3 -c "import psycopg2; print('psycopg2 OK')" \ diff --git a/config.py b/config.py index b940b97f..05184f65 100644 --- a/config.py +++ b/config.py @@ -71,6 +71,11 @@ # --- GPU Acceleration for Clustering (Optional, requires NVIDIA GPU and RAPIDS cuML) --- USE_GPU_CLUSTERING = os.environ.get("USE_GPU_CLUSTERING", "False").lower() == "true" +# --- TensorRT Execution Provider for ONNX Runtime (Optional, NVIDIA builds only) --- +# When true and TensorRT libraries are available, ONNX Runtime will prefer +# TensorrtExecutionProvider before CUDAExecutionProvider. +USE_TENSORRT = os.environ.get("USE_TENSORRT", "False").lower() == "true" + # --- DBSCAN Only Constants (Ranges for Evolutionary Approach) --- # Default ranges for DBSCAN parameters DBSCAN_EPS_MIN = float(os.getenv("DBSCAN_EPS_MIN", "0.1")) diff --git a/deployment/.env.example b/deployment/.env.example index ed0a59d6..0a71a9ad 100644 --- a/deployment/.env.example +++ b/deployment/.env.example @@ -85,6 +85,13 @@ MISTRAL_API_KEY= # Default: false (CPU only) USE_GPU_CLUSTERING=false +# --- TensorRT Execution Provider (NVIDIA images only) --- +# Enable TensorRT for ONNX Runtime inference (MusiCNN, CLAP, MuLan) when available. +# Requires NVIDIA images with TensorRT runtime libs included. +# If disabled or unavailable, ONNX Runtime falls back to CUDA, then CPU. 
+# Default: false +USE_TENSORRT=false + # --- CLAP Text Search Configuration --- # Enable CLAP (Contrastive Language-Audio Pretraining) for natural language music search # CLAP allows searching your music collection using text queries like "upbeat summer songs" or "relaxing piano music" diff --git a/deployment/docker-compose-navidrome_local.yaml b/deployment/docker-compose-navidrome_local.yaml index decab0a4..97354367 100644 --- a/deployment/docker-compose-navidrome_local.yaml +++ b/deployment/docker-compose-navidrome_local.yaml @@ -54,6 +54,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available TEMP_DIR: "/app/temp_audio" # Authentication (optional) – leave blank to disable API_TOKEN: "${API_TOKEN:-}" @@ -99,6 +100,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available TEMP_DIR: "/app/temp_audio" # Authentication (optional) – leave blank to disable API_TOKEN: "${API_TOKEN:-}" diff --git a/deployment/docker-compose-nvidia-local.yaml b/deployment/docker-compose-nvidia-local.yaml index 23f5623f..3e248362 100644 --- a/deployment/docker-compose-nvidia-local.yaml +++ b/deployment/docker-compose-nvidia-local.yaml @@ -57,6 +57,7 @@ services: OLLAMA_SERVER_URL: "${OLLAMA_SERVER_URL:-http://192.168.1.71:11434/api/generate}" OLLAMA_MODEL_NAME: "${OLLAMA_MODEL_NAME:-qwen3:1.7b}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" + USE_TENSORRT: "${USE_TENSORRT:-false}" TEMP_DIR: "/app/temp_audio" # Authentication (optional) – leave blank to disable API_TOKEN: "${API_TOKEN:-}" @@ -106,6 +107,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" 
MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" + USE_TENSORRT: "${USE_TENSORRT:-false}" NVIDIA_VISIBLE_DEVICES: "0" NVIDIA_DRIVER_CAPABILITIES: "compute,utility" USE_GPU_CLUSTERING: "${USE_GPU_CLUSTERING:-true}" diff --git a/deployment/docker-compose-nvidia.yaml b/deployment/docker-compose-nvidia.yaml index 951167de..2cfd9c21 100644 --- a/deployment/docker-compose-nvidia.yaml +++ b/deployment/docker-compose-nvidia.yaml @@ -51,6 +51,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available TEMP_DIR: "/app/temp_audio" volumes: - temp-audio-flask:/app/temp_audio # Volume for temporary audio files @@ -91,6 +92,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available NVIDIA_VISIBLE_DEVICES: "0" NVIDIA_DRIVER_CAPABILITIES: "compute,utility" USE_GPU_CLUSTERING: "${USE_GPU_CLUSTERING:-true}" diff --git a/deployment/docker-compose-server.yaml b/deployment/docker-compose-server.yaml index 10910de0..d90f8565 100644 --- a/deployment/docker-compose-server.yaml +++ b/deployment/docker-compose-server.yaml @@ -57,6 +57,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available TEMP_DIR: "/app/temp_audio" API_TOKEN: "${API_TOKEN:-}" AUDIOMUSE_USER: "${AUDIOMUSE_USER:-}" diff --git a/deployment/docker-compose-worker-nvidia.yaml b/deployment/docker-compose-worker-nvidia.yaml index 7a10db7c..bbcbd93c 
100644 --- a/deployment/docker-compose-worker-nvidia.yaml +++ b/deployment/docker-compose-worker-nvidia.yaml @@ -32,6 +32,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available TEMP_DIR: "/app/temp_audio" NVIDIA_VISIBLE_DEVICES: "0" NVIDIA_DRIVER_CAPABILITIES: "compute,utility" diff --git a/deployment/docker-compose-worker_local.yaml b/deployment/docker-compose-worker_local.yaml index a5e88c3f..2f22fd15 100644 --- a/deployment/docker-compose-worker_local.yaml +++ b/deployment/docker-compose-worker_local.yaml @@ -43,6 +43,7 @@ services: GEMINI_API_KEY: "${GEMINI_API_KEY}" MISTRAL_API_KEY: "${MISTRAL_API_KEY}" CLAP_ENABLED: "${CLAP_ENABLED:-true}" # Enable CLAP text search (set to false for slower systems) + USE_TENSORRT: "${USE_TENSORRT:-false}" # Prefer TensorRT EP for ONNX Runtime when available TEMP_DIR: "/app/temp_audio" # Authentication (optional) – leave blank to disable API_TOKEN: "${API_TOKEN:-}" diff --git a/docs/GPU.md b/docs/GPU.md index 42a0a6a2..f6be770d 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -25,6 +25,21 @@ We suggest **8GB VRAM** on GPU, with less you can experience the NON BLOCKING Ou 3. Ensure NVIDIA Container Toolkit is installed on your host 4. Use docker-compose files with GPU support (e.g., `docker-compose-nvidia.yaml` or `docker-compose-worker-nvidia.yaml`) +**TensorRT (optional ONNX acceleration):** + +NVIDIA images now include TensorRT runtime libraries required by ONNX Runtime. +TensorRT remains **opt-in** to avoid changing existing behavior. + +Set in your `.env` file: + +``` +USE_TENSORRT=true +``` + +When enabled and available, ONNX Runtime provider order becomes: +`TensorrtExecutionProvider -> CUDAExecutionProvider -> CPUExecutionProvider`. +If TensorRT cannot be used, inference falls back automatically. 
+ **Performance Impact:** - **KMeans**: 10-50x faster than CPU - **DBSCAN**: 5-100x faster than CPU diff --git a/docs/PARAMETERS.md b/docs/PARAMETERS.md index 532ca10d..b71fd574 100644 --- a/docs/PARAMETERS.md +++ b/docs/PARAMETERS.md @@ -84,6 +84,7 @@ These are the default parameters used when launching analysis or clustering task | `CLUSTERING_RUNS` | Iterations for Monte Carlo evolutionary search. | `1000` | | `TOP_N_PLAYLISTS` | POST Clustering it keep only the top N diverse playlist. | `8` | | `USE_GPU_CLUSTERING` | When true enalbe the use of GPU on K-Means, DBSCAN and PCA | `false` | +| `USE_TENSORRT` | When true and TensorRT is available (NVIDIA images), ONNX Runtime prefers TensorRT EP before CUDA for MusiCNN/CLAP/MuLan | `false` | | **Similarity General** | | | | `INDEX_NAME` | Name of the index, no need to change. | `music_library` | | `VOYAGER_EF_CONSTRUCTION` | Number of element analyzed to create the neighbor list in the index. | `1024` | diff --git a/tasks/analysis.py b/tasks/analysis.py index ed4246a5..c12c0c17 100644 --- a/tasks/analysis.py +++ b/tasks/analysis.py @@ -63,6 +63,11 @@ SessionRecycler, comprehensive_memory_cleanup ) +from .onnx_providers import ( + build_ort_provider_options, + split_provider_options, + log_provider_selection, +) from psycopg2 import OperationalError @@ -380,25 +385,13 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None): should_cleanup_sessions = False # Configure provider options for GPU memory management (used for main and secondary models) - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - # Get GPU device ID from environment or default to 0 - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 
'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - attempting to use GPU for analysis (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='EXHAUSTIVE', + include_copy_stream=True, + ) + providers, provider_opts = split_provider_options(provider_options) + log_provider_selection(logger, "MusiCNN", provider_options, available_providers) try: # Use pre-loaded sessions if provided, otherwise load per-song @@ -411,8 +404,8 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None): try: embedding_sess = ort.InferenceSession( model_paths['embedding'], - providers=[p[0] for p in provider_options], - provider_options=[p[1] for p in provider_options] + providers=providers, + provider_options=provider_opts ) except Exception: # Fallback to CPU if preferred providers fail @@ -425,8 +418,8 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None): try: prediction_sess = ort.InferenceSession( model_paths['prediction'], - providers=[p[0] for p in provider_options], - provider_options=[p[1] for p in provider_options] + providers=providers, + provider_options=provider_opts ) except Exception: # Fallback to CPU if preferred providers fail @@ -715,30 +708,21 @@ def get_missing_mulan_track_ids(track_ids): if onnx_sessions is None: logger.info(f"Lazy-loading MusiCNN models for album: {album_name}") onnx_sessions = {} - available_providers = ort.get_available_providers() - - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - cuda_options = 
{ - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 'cudnn_conv_algo_search': 'EXHAUSTIVE', # Find memory-efficient algorithms - 'do_copy_in_default_stream': True, # Better memory sync - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - else: - provider_options = [('CPUExecutionProvider', {})] + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='EXHAUSTIVE', + include_copy_stream=True, + ) + providers, provider_opts = split_provider_options(provider_options) + log_provider_selection(logger, "MusiCNN lazy-load", provider_options, available_providers) try: for model_name, model_path in model_paths.items(): try: onnx_sessions[model_name] = ort.InferenceSession( model_path, - providers=[p[0] for p in provider_options], - provider_options=[p[1] for p in provider_options] + providers=providers, + provider_options=provider_opts ) except Exception: onnx_sessions[model_name] = ort.InferenceSession( @@ -763,30 +747,21 @@ def get_missing_mulan_track_ids(track_ids): # Recreate sessions onnx_sessions = {} - available_providers = ort.get_available_providers() - - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - else: - provider_options = [('CPUExecutionProvider', {})] + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='EXHAUSTIVE', + include_copy_stream=True, + ) + providers, provider_opts = split_provider_options(provider_options) + 
log_provider_selection(logger, "MusiCNN recycle", provider_options, available_providers) try: for model_name, model_path in model_paths.items(): try: onnx_sessions[model_name] = ort.InferenceSession( model_path, - providers=[p[0] for p in provider_options], - provider_options=[p[1] for p in provider_options] + providers=providers, + provider_options=provider_opts ) except Exception: onnx_sessions[model_name] = ort.InferenceSession( diff --git a/tasks/clap_analyzer.py b/tasks/clap_analyzer.py index 8e2cb6eb..9af1feb2 100644 --- a/tasks/clap_analyzer.py +++ b/tasks/clap_analyzer.py @@ -26,6 +26,7 @@ os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' import config +from tasks.onnx_providers import build_ort_provider_options, split_provider_options, log_provider_selection try: from config import AUDIO_LOAD_TIMEOUT except Exception: @@ -88,23 +89,12 @@ def _load_audio_model(): session = None # Configure provider options with GPU memory management - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', - 'cudnn_conv_algo_search': 'DEFAULT', - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - will attempt to use GPU (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='DEFAULT', + include_copy_stream=False, + ) + log_provider_selection(logger, "CLAP audio", provider_options, available_providers) # Create session — pass file path so ORT resolves external data natively def _create_session(model_input, 
providers, provider_opts): @@ -115,8 +105,7 @@ def _create_session(model_input, providers, provider_opts): provider_options=provider_opts, ) - preferred_providers = [p[0] for p in provider_options] - preferred_opts = [p[1] for p in provider_options] + preferred_providers, preferred_opts = split_provider_options(provider_options) cpu_providers = ['CPUExecutionProvider'] cpu_opts = [{}] @@ -187,32 +176,21 @@ def _load_text_model(): # Text model typically runs on CPU in Flask containers session = None - available_providers = ort.get_available_providers() - - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', - 'cudnn_conv_algo_search': 'DEFAULT', - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - will attempt to use GPU (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='DEFAULT', + include_copy_stream=False, + ) + providers, provider_opts = split_provider_options(provider_options) + log_provider_selection(logger, "CLAP text", provider_options, available_providers) # Create session try: session = ort.InferenceSession( model_path, sess_options=sess_options, - providers=[p[0] for p in provider_options], - provider_options=[p[1] for p in provider_options] + providers=providers, + provider_options=provider_opts ) active_provider = session.get_providers()[0] diff --git a/tasks/mulan_analyzer.py b/tasks/mulan_analyzer.py index f6171607..901ac23e 100644 --- a/tasks/mulan_analyzer.py +++ b/tasks/mulan_analyzer.py @@ -21,6 +21,7 @@ from typing import Tuple, 
Optional from transformers import AutoTokenizer from tasks.memory_utils import cleanup_cuda_memory, cleanup_onnx_session, handle_onnx_memory_error +from tasks.onnx_providers import build_ort_provider_options, split_provider_options, log_provider_selection logger = logging.getLogger(__name__) @@ -65,20 +66,22 @@ def _load_mulan_models(load_text_models=False): # logger.info(f"MuLan: Using {num_threads} threads ({logical_cores} logical cores - 2)") logger.info("MuLan: Using ONNX Runtime automatic thread management") - # Select execution provider (CPU or CUDA) - providers = ['CPUExecutionProvider'] - if ort.get_available_providers() and 'CUDAExecutionProvider' in ort.get_available_providers(): - providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - logger.info("CUDA available - using GPU acceleration") - else: - logger.info("Using CPU execution") + # Select execution providers (TensorRT/CUDA/CPU) + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='DEFAULT', + include_copy_stream=False, + ) + providers, provider_opts = split_provider_options(provider_options) + log_provider_selection(logger, "MuLan", provider_options, available_providers) # Load audio encoder (with external data file) logger.info(f"Loading audio encoder: {config.AUDIO_MODEL_PATH}") _audio_session = ort.InferenceSession( config.AUDIO_MODEL_PATH, sess_options=sess_options, - providers=providers + providers=providers, + provider_options=provider_opts, ) # Load text encoder and tokenizer only if requested (Flask search mode) @@ -92,7 +95,8 @@ def _load_mulan_models(load_text_models=False): _text_session = ort.InferenceSession( config.TEXT_MODEL_PATH, sess_options=sess_options, - providers=providers + providers=providers, + provider_options=provider_opts, ) # Load tokenizer from extracted directory (uses transformers for compatibility) @@ -172,15 +176,20 @@ def initialize_mulan_text_models(): # sess_options.intra_op_num_threads = num_threads # 
sess_options.inter_op_num_threads = num_threads - providers = ['CPUExecutionProvider'] - if ort.get_available_providers() and 'CUDAExecutionProvider' in ort.get_available_providers(): - providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] + provider_options, available_providers = build_ort_provider_options( + ort, + cuda_algo_search='DEFAULT', + include_copy_stream=False, + ) + providers, provider_opts = split_provider_options(provider_options) + log_provider_selection(logger, "MuLan text", provider_options, available_providers) # Load text encoder _text_session = ort.InferenceSession( config.TEXT_MODEL_PATH, sess_options=sess_options, - providers=providers + providers=providers, + provider_options=provider_opts, ) # Load tokenizer diff --git a/tasks/onnx_providers.py b/tasks/onnx_providers.py new file mode 100644 index 00000000..d07bf17b --- /dev/null +++ b/tasks/onnx_providers.py @@ -0,0 +1,56 @@ +import os + +from config import USE_TENSORRT + + +def build_ort_provider_options( + ort_module, + cuda_algo_search='EXHAUSTIVE', + include_copy_stream=True, +): + """Build ordered ONNX Runtime providers with options. + + Provider preference order: + 1. TensorRT (optional, via USE_TENSORRT=true) + 2. CUDA + 3. 
CPU + """ + available_providers = ort_module.get_available_providers() or [] + + # ONNX Runtime indexes devices among those visible to the process, so even + # when CUDA_VISIBLE_DEVICES restricts the set, index 0 is the first + # visible GPU; always target device 0 here. + gpu_device_id = 0 + + provider_options = [] + + if USE_TENSORRT and 'TensorrtExecutionProvider' in available_providers: + provider_options.append(('TensorrtExecutionProvider', {'device_id': gpu_device_id})) + + if 'CUDAExecutionProvider' in available_providers: + cuda_options = { + 'device_id': gpu_device_id, + 'arena_extend_strategy': 'kSameAsRequested', + } + if cuda_algo_search: + cuda_options['cudnn_conv_algo_search'] = cuda_algo_search + if include_copy_stream: + cuda_options['do_copy_in_default_stream'] = True + provider_options.append(('CUDAExecutionProvider', cuda_options)) + + provider_options.append(('CPUExecutionProvider', {})) + return provider_options, available_providers + + +def split_provider_options(provider_options): + """Split provider tuples into `providers` and `provider_options` lists.""" + providers = [provider_name for provider_name, _ in provider_options] + provider_opts = [options for _, options in provider_options] + return providers, provider_opts + + +def log_provider_selection(logger, context, provider_options, available_providers): + """Log available and preferred ONNX Runtime execution providers.""" + preferred = [provider_name for provider_name, _ in provider_options] + logger.info(f"{context}: available providers: {available_providers}") + logger.info(f"{context}: preferred providers: {preferred}") diff --git a/test/test_gpu_status.py b/test/test_gpu_status.py index 824eb102..a27bdde0 100644 --- a/test/test_gpu_status.py +++ b/test/test_gpu_status.py @@ -79,7 +79,9 @@ def test_onnx_runtime(): import onnxruntime as ort providers = ort.get_available_providers() has_cuda = 'CUDAExecutionProvider' in providers + has_tensorrt = 'TensorrtExecutionProvider' in providers print_result("ONNX Runtime GPU", has_cuda, f"Providers: 
{providers}") + print_result("TensorRT EP available", has_tensorrt, "Set USE_TENSORRT=true to prefer TensorRT") # Test actual session creation with CUDA if has_cuda: diff --git a/tests/unit/test_onnx_providers.py b/tests/unit/test_onnx_providers.py new file mode 100644 index 00000000..71e1dbd3 --- /dev/null +++ b/tests/unit/test_onnx_providers.py @@ -0,0 +1,42 @@ +from tasks.onnx_providers import build_ort_provider_options, split_provider_options + + +class _FakeOrt: + def __init__(self, providers): + self._providers = providers + + def get_available_providers(self): + return self._providers + + +def test_cuda_cpu_when_tensorrt_disabled(monkeypatch): + monkeypatch.setattr('tasks.onnx_providers.USE_TENSORRT', False) + fake_ort = _FakeOrt(['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']) + + provider_options, _ = build_ort_provider_options(fake_ort, cuda_algo_search='DEFAULT') + providers, provider_opts = split_provider_options(provider_options) + + assert providers == ['CUDAExecutionProvider', 'CPUExecutionProvider'] + assert provider_opts[0]['cudnn_conv_algo_search'] == 'DEFAULT' + + +def test_tensorrt_cuda_cpu_when_tensorrt_enabled(monkeypatch): + monkeypatch.setattr('tasks.onnx_providers.USE_TENSORRT', True) + fake_ort = _FakeOrt(['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']) + + provider_options, _ = build_ort_provider_options(fake_ort, cuda_algo_search='EXHAUSTIVE') + providers, provider_opts = split_provider_options(provider_options) + + assert providers == ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'] + assert provider_opts[0]['device_id'] == 0 + assert provider_opts[1]['cudnn_conv_algo_search'] == 'EXHAUSTIVE' + + +def test_cpu_only_fallback(monkeypatch): + monkeypatch.setattr('tasks.onnx_providers.USE_TENSORRT', True) + fake_ort = _FakeOrt(['CPUExecutionProvider']) + + provider_options, _ = build_ort_provider_options(fake_ort) + providers, _ = 
split_provider_options(provider_options) + + assert providers == ['CPUExecutionProvider']