From f5af619b45ba157f865f4163d88765119d0c3513 Mon Sep 17 00:00:00 2001 From: Zhol Internet Date: Fri, 6 Feb 2026 12:12:31 -0800 Subject: [PATCH 1/2] add OpenVINO support Allows for accelerated inferencing on Intel hardware. To allow for this, a few small changes are made: * addition of requirements/openvino.txt, which just lets pip install the OpenVINO runtime * util.provider, to prevent code-reuse with * get_available_providers, which filters out supported ONNXRuntime providers * tasks.analysis.get_provider_options, which prevents code-reuse * modifications to Dockerfile, to allow for that optional package installation described earlier --- Dockerfile | 28 +++++++- Dockerfile-noavx2 | 3 + deployment/.env.example | 4 ++ requirements/openvino.txt | 2 + student_clap/data/clap_embedder.py | 14 ++-- student_clap/data/clap_text_embedder.py | 14 ++-- tasks/analysis.py | 90 +++++++++++-------------- tasks/clap_analyzer.py | 61 ++--------------- tasks/memory_utils.py | 14 ++-- tasks/mulan_analyzer.py | 14 ++-- tests/unit/test_analysis.py | 10 +-- util/__init__.py | 0 util/provider.py | 22 ++++++ 13 files changed, 127 insertions(+), 149 deletions(-) create mode 100644 requirements/openvino.txt create mode 100644 util/__init__.py create mode 100644 util/provider.py diff --git a/Dockerfile b/Dockerfile index 217fa941..0d3358dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,8 +3,9 @@ # Supports both CPU (ubuntu:24.04) and GPU (nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04) builds # # Build examples: -# CPU: docker build -t audiomuse-ai . -# GPU: docker build --build-arg BASE_IMAGE=nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04 -t audiomuse-ai-gpu . +# CPU: docker build -t audiomuse-ai . +# OPENVINO: docker build --build-arg OPENVINO=true -t audiomuse-ai-openvino +# GPU: docker build --build-arg BASE_IMAGE=nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04 -t audiomuse-ai-gpu . 
ARG BASE_IMAGE=ubuntu:24.04 @@ -73,6 +74,7 @@ RUN set -eux; \ FROM ${BASE_IMAGE} AS base ARG BASE_IMAGE +ARG OPENVINO=false SHELL ["/bin/bash", "-c"] @@ -111,12 +113,31 @@ RUN set -ux; \ apt-get autoremove -y || true && \ rm -f /usr/lib/python3.*/EXTERNALLY-MANAGED +# Install Intel GPU drivers for OpenVINO GPU support (when OPENVINO=true) +RUN if [ "$OPENVINO" = "true" ]; then \ + echo "Installing Intel GPU drivers for OpenVINO..." && \ + apt-get update && \ + apt-get install -y --no-install-recommends wget gpg && \ + wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ + gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu noble client" | \ + tee /etc/apt/sources.list.d/intel-gpu-noble.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + ocl-icd-libopencl1 \ + intel-opencl-icd \ + intel-level-zero-gpu \ + level-zero && \ + rm -rf /var/lib/apt/lists/*; \ + fi + # ============================================================================ # Stage 3: Libraries - Python packages installation # ============================================================================ FROM base AS libraries ARG BASE_IMAGE +ARG OPENVINO=false WORKDIR /app @@ -130,6 +151,9 @@ COPY requirements/ /app/requirements/ RUN if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then \ echo "NVIDIA base image detected: installing GPU packages (cupy, cuml, onnxruntime-gpu, voyager, torch+cuda)"; \ uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/gpu.txt -r /app/requirements/common.txt || exit 1; \ + elif [[ "$OPENVINO" == true ]]; then\ + echo "OpenVINO base image detected: installing OpenVINO runtime packages (onnxruntime-openvino)"; \ + uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/openvino.txt -r /app/requirements/common.txt || exit 1; \ else \ 
echo "CPU base image: installing all packages together for dependency resolution"; \ uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/cpu.txt -r /app/requirements/common.txt || exit 1; \ diff --git a/Dockerfile-noavx2 b/Dockerfile-noavx2 index bf2faf73..42c7fc40 100644 --- a/Dockerfile-noavx2 +++ b/Dockerfile-noavx2 @@ -128,6 +128,9 @@ COPY requirements/ /app/requirements/ RUN if [[ "$BASE_IMAGE" =~ ^nvidia/cuda: ]]; then \ echo "NVIDIA base image detected: installing GPU packages (cupy, cuml, onnxruntime-gpu, voyager, torch+cuda)"; \ uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/gpu.txt -r /app/requirements/common-noavx2.txt || exit 1; \ + elif [[ "$OPENVINO" == true ]]; then \ + echo "OpenVINO base image detected: installing OpenVINO runtime packages (onnxruntime-openvino)"; \ + uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/openvino.txt -r /app/requirements/common-noavx2.txt || exit 1; \ else \ echo "CPU base image: installing all packages together for dependency resolution"; \ uv pip install --system --no-cache --index-strategy unsafe-best-match -r /app/requirements/cpu-noavx2.txt -r /app/requirements/common-noavx2.txt || exit 1; \ diff --git a/deployment/.env.example b/deployment/.env.example index 0ec06cb6..15fcccdc 100644 --- a/deployment/.env.example +++ b/deployment/.env.example @@ -93,3 +93,7 @@ USE_GPU_CLUSTERING=false # Models: Audio model (~268MB) for analysis, Text model (~478MB) for search # Default: true CLAP_ENABLED=true + +# --- OpenVINO Acceleration --- +RENDER_GID= # render group ID (use `stat -c "%g" /dev/dri/renderD128` on host to verify) +OPENVINO_CONFIG_JSON_PATH= # path to have openvino load config https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#load_config \ No newline at end of file diff --git a/requirements/openvino.txt b/requirements/openvino.txt new file mode 100644 index 
00000000..b330c2f1 --- /dev/null +++ b/requirements/openvino.txt @@ -0,0 +1,2 @@ +onnxruntime==1.19.2 +onnxruntime-openvino \ No newline at end of file diff --git a/student_clap/data/clap_embedder.py b/student_clap/data/clap_embedder.py index 697e17d2..8882452d 100644 --- a/student_clap/data/clap_embedder.py +++ b/student_clap/data/clap_embedder.py @@ -10,6 +10,7 @@ import librosa import onnxruntime as ort from typing import Tuple, Optional +from util import provider logger = logging.getLogger(__name__) @@ -53,17 +54,15 @@ def __init__(self, model_path: str): #sess_options.inter_op_num_threads = 2 # Parallel layers # Use CUDA if available, otherwise CPU - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - logger.info(f"CLAP model loaded: {model_path}") - logger.info(f"✅ Using CUDA for ONNX teacher model") - else: - providers = ['CPUExecutionProvider'] - logger.info(f"CLAP model loaded: {model_path}") + available_providers = provider.get_available_providers() + if len(available_providers) > 1: + logger.info("✅ Using %s for ONNX teacher model", + [p.split('ExecutionProvider')[0] for p in available_providers]) + elif len(available_providers) == 1: # only CPUExecutionProvider logger.info(f"✅ Using optimized CPU inference (8 threads)") logger.info(f" Performance: ~325ms/segment vs 713ms with CoreML") logger.info(f" Reason: Only 24% of ops supported by CoreML GPU, context switching overhead too high") + logger.info(f"CLAP model loaded: {model_path}") self.session = ort.InferenceSession( model_path, diff --git a/student_clap/data/clap_text_embedder.py b/student_clap/data/clap_text_embedder.py index 07b11dac..12520fa1 100644 --- a/student_clap/data/clap_text_embedder.py +++ b/student_clap/data/clap_text_embedder.py @@ -3,6 +3,7 @@ import numpy as np import onnxruntime as ort from typing import List +from util import provider logger = logging.getLogger(__name__) @@ 
-13,15 +14,10 @@ def __init__(self, model_path: str): sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL sess_options.log_severity_level = 3 - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - logger.info(f"CLAP text model loaded: {model_path}") - logger.info(f"✅ Using CUDA for ONNX teacher text model") - else: - providers = ['CPUExecutionProvider'] - logger.info(f"CLAP text model loaded: {model_path}") - logger.info(f"✅ Using CPU for ONNX teacher text model") + available_providers = provider.get_available_providers() + logger.info(f"CLAP text model loaded: {model_path}") + logger.info(f"✅ Using %s for ONNX teacher text model", + [provider.split('ExecutionProvider')[0] for provider in available_providers]) self.session = ort.InferenceSession( model_path, sess_options=sess_options, diff --git a/tasks/analysis.py b/tasks/analysis.py index 9c3901d7..9809feb1 100644 --- a/tasks/analysis.py +++ b/tasks/analysis.py @@ -11,6 +11,7 @@ import uuid import traceback import gc +from typing import Any from pydub import AudioSegment from tempfile import NamedTemporaryFile @@ -64,6 +65,7 @@ SessionRecycler, comprehensive_memory_cleanup ) +from util import provider from psycopg2 import OperationalError @@ -414,25 +416,7 @@ def analyze_track(file_path, mood_labels_list, model_paths, onnx_sessions=None): should_cleanup_sessions = False # Configure provider options for GPU memory management (used for main and secondary models) - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - # Get GPU device ID from environment or default to 0 - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, - 
'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - attempting to use GPU for analysis (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") + provider_options = get_provider_options(cuda_do_copy_in_default_stream=True) try: # Use pre-loaded sessions if provided, otherwise load per-song @@ -811,22 +795,8 @@ def get_missing_mulan_track_ids(track_ids): if onnx_sessions is None: logger.info(f"Lazy-loading Essentia models for album: {album_name}") onnx_sessions = {} - available_providers = ort.get_available_providers() - - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 'cudnn_conv_algo_search': 'EXHAUSTIVE', # Find memory-efficient algorithms - 'do_copy_in_default_stream': True, # Better memory sync - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - else: - provider_options = [('CPUExecutionProvider', {})] + provider_options = get_provider_options( + cuda_do_copy_in_default_stream=True) try: for model_name, model_path in model_paths.items(): @@ -859,22 +829,9 @@ def get_missing_mulan_track_ids(track_ids): # Recreate sessions onnx_sessions = {} - available_providers = ort.get_available_providers() - - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - cuda_options = { - 
'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - else: - provider_options = [('CPUExecutionProvider', {})] + provider_options = get_provider_options( + cuda_do_copy_in_default_stream=True + ) try: for model_name, model_path in model_paths.items(): @@ -1367,3 +1324,34 @@ def monitor_and_clear_jobs(): logger.critical(f"FATAL ERROR: Analysis failed: {e}", exc_info=True) log_and_update_main(f"❌ Main analysis failed: {e}", current_progress, task_state=TASK_STATUS_FAILURE, error_message=str(e), traceback=traceback.format_exc()) raise + + +def get_provider_options(cuda_do_copy_in_default_stream: bool = False, + cuda_conv_algo_search_mode: str = 'EXHAUSTIVE') -> list[tuple[str, dict[str, Any]]]: + provider_options = [('CPUExecutionProvider', {})] + available_providers = provider.get_available_providers() + if 'OpenVINOExecutionProvider' in available_providers: + vino_options = { + 'device_type': 'AUTO', + } + if os.path.exists(os.environ.get('OPENVINO_CONFIG_JSON_PATH', '')): + vino_options['load_config'] = os.environ.get('OPENVINO_CONFIG_JSON_PATH') + provider_options.insert(0, ('OpenVINOExecutionProvider', vino_options)) + logger.info("OpenVINO provider available - Attempting to use OpenVINO for analysis...") + if 'CUDAExecutionProvider' in available_providers: + # Get GPU device ID from environment or default to 0 + gpu_device_id = 0 + cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') + if cuda_visible and cuda_visible != '-1': + gpu_device_id = 0 + + cuda_options = { + 'device_id': gpu_device_id, + 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation + 'cudnn_conv_algo_search': cuda_conv_algo_search_mode, + } + if cuda_do_copy_in_default_stream: + cuda_options['do_copy_in_default_stream'] = True + 
provider_options.insert(0,('CUDAExecutionProvider', cuda_options)) + logger.info(f"CUDA provider available - attempting to use GPU for analysis (device_id={gpu_device_id})") + return provider_options \ No newline at end of file diff --git a/tasks/clap_analyzer.py b/tasks/clap_analyzer.py index d64501ff..32d183f6 100644 --- a/tasks/clap_analyzer.py +++ b/tasks/clap_analyzer.py @@ -30,6 +30,7 @@ from config import AUDIO_LOAD_TIMEOUT except Exception: AUDIO_LOAD_TIMEOUT = None +from tasks import analysis from tasks.memory_utils import cleanup_cuda_memory, handle_onnx_memory_error, comprehensive_memory_cleanup logger = logging.getLogger(__name__) @@ -70,23 +71,7 @@ def _load_audio_model(): session = None # Configure provider options with GPU memory management - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', - 'cudnn_conv_algo_search': 'DEFAULT', - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - will attempt to use GPU (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") + provider_options = analysis.get_provider_options(cuda_conv_algo_search_mode='DEFAULT') # Create session try: @@ -143,25 +128,8 @@ def _load_text_model(): # Text model typically runs on CPU in Flask containers session = None - available_providers = ort.get_available_providers() - - if 'CUDAExecutionProvider' in available_providers: - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, 
- 'arena_extend_strategy': 'kSameAsRequested', - 'cudnn_conv_algo_search': 'DEFAULT', - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - will attempt to use GPU (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") - + provider_options = analysis.get_provider_options(cuda_conv_algo_search_mode='DEFAULT') + # Create session try: session = ort.InferenceSession( @@ -231,26 +199,7 @@ def _load_onnx_model(): session = None # Configure provider options with GPU memory management - available_providers = ort.get_available_providers() - if 'CUDAExecutionProvider' in available_providers: - # Get GPU device ID from environment or default to 0 - # Docker sets NVIDIA_VISIBLE_DEVICES, CUDA runtime uses CUDA_VISIBLE_DEVICES - gpu_device_id = 0 - cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES', '') - if cuda_visible and cuda_visible != '-1': - # If CUDA_VISIBLE_DEVICES is set, use first device (already mapped to 0) - gpu_device_id = 0 - - cuda_options = { - 'device_id': gpu_device_id, - 'arena_extend_strategy': 'kSameAsRequested', # Prevent memory fragmentation - 'cudnn_conv_algo_search': 'DEFAULT', - } - provider_options = [('CUDAExecutionProvider', cuda_options), ('CPUExecutionProvider', {})] - logger.info(f"CUDA provider available - will attempt to use GPU (device_id={gpu_device_id})") - else: - provider_options = [('CPUExecutionProvider', {})] - logger.info("CUDA provider not available - using CPU only") + provider_options = analysis.get_provider_options(cuda_conv_algo_search_mode='DEFAULT') # Create session with determined providers try: diff --git a/tasks/memory_utils.py b/tasks/memory_utils.py index 1754a407..9148ad0a 100644 --- a/tasks/memory_utils.py +++ b/tasks/memory_utils.py @@ -208,20 +208,16 @@ def reset_onnx_memory_pool() -> bool: """ try: import onnxruntime as ort + from util 
import provider # Force garbage collection first gc.collect() # Determine available providers - providers = ort.get_available_providers() - preferred_provider = None - - if 'CUDAExecutionProvider' in providers: - preferred_provider = 'CUDAExecutionProvider' - logger.debug("Using CUDA provider for ONNX memory pool reset") - elif 'CPUExecutionProvider' in providers: - preferred_provider = 'CPUExecutionProvider' - logger.debug("Using CPU provider for ONNX memory pool reset") + providers = provider.get_available_providers() + preferred_provider = providers[0] + if preferred_provider: + logger.debug("Using %s for ONNX memory pool reset", preferred_provider.split('ExecutionProvider')[0]) else: logger.debug("No suitable ONNX provider found for memory pool reset") return False diff --git a/tasks/mulan_analyzer.py b/tasks/mulan_analyzer.py index f6171607..9778004f 100644 --- a/tasks/mulan_analyzer.py +++ b/tasks/mulan_analyzer.py @@ -21,6 +21,7 @@ from typing import Tuple, Optional from transformers import AutoTokenizer from tasks.memory_utils import cleanup_cuda_memory, cleanup_onnx_session, handle_onnx_memory_error +from util import provider logger = logging.getLogger(__name__) @@ -66,12 +67,9 @@ def _load_mulan_models(load_text_models=False): logger.info("MuLan: Using ONNX Runtime automatic thread management") # Select execution provider (CPU or CUDA) - providers = ['CPUExecutionProvider'] - if ort.get_available_providers() and 'CUDAExecutionProvider' in ort.get_available_providers(): - providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - logger.info("CUDA available - using GPU acceleration") - else: - logger.info("Using CPU execution") + providers = provider.get_available_providers() + logger.info("Using %s acceleration", + [provider.split('ExecutionProvider')[0] for provider in providers]) # Load audio encoder (with external data file) logger.info(f"Loading audio encoder: {config.AUDIO_MODEL_PATH}") @@ -172,9 +170,7 @@ def initialize_mulan_text_models(): # 
sess_options.intra_op_num_threads = num_threads # sess_options.inter_op_num_threads = num_threads - providers = ['CPUExecutionProvider'] - if ort.get_available_providers() and 'CUDAExecutionProvider' in ort.get_available_providers(): - providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] + providers = provider.get_available_providers() # Load text encoder _text_session = ort.InferenceSession( diff --git a/tests/unit/test_analysis.py b/tests/unit/test_analysis.py index 262b1587..1e596bd1 100644 --- a/tests/unit/test_analysis.py +++ b/tests/unit/test_analysis.py @@ -1035,7 +1035,7 @@ class TestOOMFallback: @patch('tasks.analysis.librosa.beat.beat_track') @patch('tasks.analysis.librosa.feature.melspectrogram') @patch('tasks.analysis.robust_load_audio_with_fallback') - @patch('tasks.analysis.ort.get_available_providers') + @patch('tasks.analysis.provider.get_available_providers') def test_embedding_oom_fallback_to_cpu(self, mock_providers, mock_audio_load, mock_mel, mock_beat, mock_rms, mock_chroma, mock_onnx_session): """Test GPU OOM during embedding inference triggers CPU fallback @@ -1123,7 +1123,7 @@ def create_session(model_path, providers=None, provider_options=None): @patch('tasks.analysis.librosa.beat.beat_track') @patch('tasks.analysis.librosa.feature.melspectrogram') @patch('tasks.analysis.robust_load_audio_with_fallback') - @patch('tasks.analysis.ort.get_available_providers') + @patch('tasks.analysis.provider.get_available_providers') def test_prediction_oom_fallback_to_cpu(self, mock_providers, mock_audio_load, mock_mel, mock_beat, mock_rms, mock_chroma, mock_onnx_session): """Test GPU OOM during prediction inference triggers CPU fallback @@ -1204,7 +1204,7 @@ def create_session(model_path, providers=None, provider_options=None): @patch('tasks.analysis.librosa.beat.beat_track') @patch('tasks.analysis.librosa.feature.melspectrogram') @patch('tasks.analysis.robust_load_audio_with_fallback') - @patch('tasks.analysis.ort.get_available_providers') + 
@patch('tasks.analysis.provider.get_available_providers') def test_secondary_model_oom_fallback_to_cpu(self, mock_providers, mock_audio_load, mock_mel, mock_beat, mock_rms, mock_chroma, mock_onnx_session): """Test GPU OOM during secondary model inference triggers CPU fallback @@ -1286,7 +1286,7 @@ def create_session(model_path, providers=None, provider_options=None): @patch('tasks.analysis.librosa.beat.beat_track') @patch('tasks.analysis.librosa.feature.melspectrogram') @patch('tasks.analysis.robust_load_audio_with_fallback') - @patch('tasks.analysis.ort.get_available_providers') + @patch('tasks.analysis.provider.get_available_providers') def test_non_oom_exception_is_reraised(self, mock_providers, mock_audio_load, mock_mel, mock_beat, mock_rms, mock_chroma, mock_onnx_session): """Test non-OOM exceptions are re-raised (not caught by OOM handler) @@ -1345,7 +1345,7 @@ def gpu_run(output_names, feed_dict): @patch('tasks.analysis.librosa.beat.beat_track') @patch('tasks.analysis.librosa.feature.melspectrogram') @patch('tasks.analysis.robust_load_audio_with_fallback') - @patch('tasks.analysis.ort.get_available_providers') + @patch('tasks.analysis.provider.get_available_providers') def test_successful_gpu_inference_no_fallback(self, mock_providers, mock_audio_load, mock_mel, mock_beat, mock_rms, mock_chroma, mock_onnx_session): """Test successful GPU inference doesn't trigger CPU fallback diff --git a/util/__init__.py b/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/util/provider.py b/util/provider.py new file mode 100644 index 00000000..244a14c9 --- /dev/null +++ b/util/provider.py @@ -0,0 +1,22 @@ +from functools import cache +import logging + +import onnxruntime as ort + +logger = logging.getLogger(__name__) + + +@cache +def get_available_providers() -> list[str]: + """ + Filters out ONNXRuntime providers to ones supported by Audiomuse-AI + """ + available_providers = ort.get_available_providers() + providers = ['CPUExecutionProvider'] + 
if 'OpenVINOExecutionProvider' in available_providers: + providers.insert(0, 'OpenVINOExecutionProvider') + if 'CUDAExecutionProvider' in available_providers: + providers.insert(0, 'CUDAExecutionProvider') + logger.info("Providers made available: %s", + [provider.split('ExecutionProvider')[0] for provider in providers]) + return providers \ No newline at end of file From fec4eed3fa08ecf3d9dfabe985ef3accd3770a8c Mon Sep 17 00:00:00 2001 From: Zhol Internet Date: Wed, 11 Mar 2026 19:59:18 -0700 Subject: [PATCH 2/2] have mode be set as env variable --- tasks/analysis.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tasks/analysis.py b/tasks/analysis.py index 9809feb1..9858fb9f 100644 --- a/tasks/analysis.py +++ b/tasks/analysis.py @@ -1331,8 +1331,11 @@ def get_provider_options(cuda_do_copy_in_default_stream: bool = False, provider_options = [('CPUExecutionProvider', {})] available_providers = provider.get_available_providers() if 'OpenVINOExecutionProvider' in available_providers: + device_type = os.environ.get('OPENVINO_DEVICE_TYPE', 'GPU') vino_options = { - 'device_type': 'AUTO', + 'device_type': device_type, + 'num_of_threads': int(os.environ.get('OPENVINO_NUM_OF_THREADS', '2')), + 'num_streams': int(os.environ.get('OPENVINO_NUM_STREAMS', '1')) } if os.path.exists(os.environ.get('OPENVINO_CONFIG_JSON_PATH', '')): vino_options['load_config'] = os.environ.get('OPENVINO_CONFIG_JSON_PATH')