From 6cb177906951fdc947de6b9d5021fbeb81a67c7e Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Tue, 16 Jun 2037 22:55:00 +0300
Subject: [PATCH 01/39] more suggestion fixing

---
 main/chroma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main/chroma.py b/main/chroma.py
index ffaef96..1a18fe7 100644
--- a/main/chroma.py
+++ b/main/chroma.py
@@ -40,7 +40,7 @@ def add_text_search_suggestion(self, suggestion_query: str) -> None:
         subsearches = suggestion_query.split()
         self.desc_collection.add(
             documents=[suggestion_query] + subsearches,
-            ids=[str(hash(suggestion_query))]
+            ids=[str(hash(query)) for query in [suggestion_query] + subsearches]
         )
 
     def get_text_search_suggestions(self, search_query: str, top_k: int = 20) -> list[str]:

From 079da0d56f0eaf6551b4b0526401000b9f11ecbd Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Sat, 22 Jun 2024 17:25:18 +0300
Subject: [PATCH 02/39] add whisper support to inference

---
 docker-compose.yml            |   2 +-
 inference/.env.dist           |   1 +
 inference/Dockerfile          |   8 +-
 inference/clip.py             |  83 +++---
 inference/deps.py             |   7 +
 inference/download_whisper.sh |   9 +
 inference/main.py             |  45 ++++
 inference/models.py           |  10 +
 inference/settings.py         |   1 +
 inference/whisper.py          |  18 ++
 main/.env.dist                |   2 +-
 main/chroma.py                |   7 +-
 main/clip.py                  |  22 +-
 main/main.py                  |  16 ++++++++----
 main/models.py                |  10 +-
 poetry.lock                   | 493 +++++++++++++++++++++++++++++++++-
 pyproject.toml                |   1 +
 17 files changed, 664 insertions(+), 71 deletions(-)
 create mode 100755 inference/download_whisper.sh
 create mode 100644 inference/main.py
 create mode 100644 inference/models.py
 create mode 100644 inference/whisper.py

diff --git a/docker-compose.yml b/docker-compose.yml
index 234190f..de94422 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,7 +22,7 @@ services:
       context: ./inference
       dockerfile: Dockerfile
     container_name: inference
-    command: uvicorn clip:app --host "0.0.0.0" --port 8040
+    command: uvicorn main:app --host "0.0.0.0" --port 8040
    restart: unless-stopped
     volumes:
       - inference-model-data:/app/model_data
diff --git a/inference/.env.dist b/inference/.env.dist
index 8b47efb..71f7798 100644
--- a/inference/.env.dist
+++ b/inference/.env.dist
@@ -1 +1,2 @@
 CLIP_MODEL=laion/CLIP-ViT-g-14-laion2B-s12B-b42K
+WHISPER_PATH=/app/model_data/ggml-large-v3.bin
diff --git a/inference/Dockerfile b/inference/Dockerfile
index cd80f92..1386abc 100644
--- a/inference/Dockerfile
+++ b/inference/Dockerfile
@@ -5,11 +5,15 @@ ENV PYTHONUNBUFFERED 1
 
 WORKDIR /app
 
-COPY requirements.txt /app/
+COPY download_whisper.sh /app/
+RUN ./download_whisper.sh
 
 RUN apt-get update && apt-get install ffmpeg -y
+
+COPY requirements.txt /app/
 RUN python -m pip install --upgrade pip && pip install -r requirements.txt
+
 COPY ./ /app/
 
 EXPOSE 8040
-CMD uvicorn clip:app --port 8040
+CMD uvicorn main:app --port 8040
diff --git a/inference/clip.py b/inference/clip.py
index f361947..cb2ed82 100644
--- a/inference/clip.py
+++ b/inference/clip.py
@@ -1,62 +1,49 @@
-import torch
-from typing import Optional
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
+from dataclasses import dataclass
+from typing import Callable, Literal
+
 from PIL import Image
-from pydantic import BaseModel
-from deps import Model, Processor, lifespan
-from frame_video import create_key_frames_for_video
-
-app = FastAPI(lifespan=lifespan)
-
-class EncodeRequest(BaseModel):
-    link: Optional[str] = None
-    description: Optional[str] = None
-
-@app.get("/")
-async def root():
-    return JSONResponse(content={"ok": True})
-
-@app.post("/encode") -async def encode(request: EncodeRequest, processor: Processor, model: Model): - if not any((request.description, request.link)): - raise HTTPException( - status_code=400, detail="Please provide either 'description' as string or 'link' as video URL, or both." - ) - - text_features, image_features = None, None - - if request.description: - text_inputs = processor(text=[request.description], return_tensors="pt", padding=True) +import torch +from transformers import CLIPModel, CLIPProcessor + +from inference.frame_video import VideoFrame, create_key_frames_for_video + + +@dataclass +class CLIP: + processor: CLIPProcessor + model: CLIPModel + + _create_key_frames_for_video: Callable[[str], list[VideoFrame]] = create_key_frames_for_video + + def __call__(self, encode_source: str, encode_type: Literal["text"] | Literal["video"]) -> list[float]: + if encode_type == "text": + return self._encode_text(encode_source) + + if encode_type == "video": + return self._encode_video(encode_source) + + def _encode_text(self, description: str) -> list[float]: + text_inputs = self.processor(text=[description], return_tensors="pt", padding=True) with torch.no_grad(): - text_features = model.get_text_features(**text_inputs) + text_features = self.model.get_text_features(**text_inputs) text_features /= text_features.norm(dim=-1, keepdim=True) - - if request.link: - images = create_key_frames_for_video(request.link) + + return text_features.tolist()[0] + + def _encode_video(self, link: str) -> list[float]: + images = self._create_key_frames_for_video(link) image_inputs = [] for image in images: image = Image.open(image.file) - image_input = processor(images=image, return_tensors="pt") + image_input = self.processor(images=image, return_tensors="pt") image_inputs.append(image_input) with torch.no_grad(): - image_features = model.get_image_features(**image_inputs[0]) + image_features = self.model.get_image_features(**image_inputs[0]) for image_input in image_inputs[1:]: - image_feature = model.get_image_features(**image_input) + image_feature = self.model.get_image_features(**image_input) image_features = torch.cat((image_features, image_feature), dim=0) features = torch.mean(image_features, dim=0) features /= features.norm(dim=-1, keepdim=True) - if request.description and request.link: - text_weight = 1.0 - video_weight = 2.0 # Giving more importance to video - # Merged weighted vectors of text and video didn't work so well, leave off for now - unified_features = (text_features * text_weight + image_features * video_weight) / (text_weight + video_weight) - return {"features": image_features.tolist()[0]} - - elif request.description: - return {"features": text_features.tolist()[0]} - - elif request.link: - return {"features": image_features.tolist()[0]} + return features.tolist()[0] diff --git a/inference/deps.py b/inference/deps.py index 3f77a14..ad509c6 100644 --- a/inference/deps.py +++ b/inference/deps.py @@ -4,6 +4,7 @@ from fastapi import Depends, FastAPI, Request from transformers import CLIPModel, CLIPProcessor +from inference.whisper import WhisperService from settings import Settings @@ -17,6 +18,7 @@ async def lifespan(app: FastAPI): Settings.clip_model, cache_dir="./model_cache" ) + app.state.whisper_model = WhisperService() yield @@ -28,5 +30,10 @@ def _get_clip_processor(request: Request) -> CLIPProcessor: return request.app.state.processor +def _get_whisper(request: Request) -> WhisperService: + return request.app.state.whisper_model + + Processor = Annotated[CLIPProcessor, 
Depends(_get_clip_processor)] Model = Annotated[CLIPModel, Depends(_get_clip_model)] +Whisper = Annotated[WhisperService, Depends(_get_whisper)] diff --git a/inference/download_whisper.sh b/inference/download_whisper.sh new file mode 100755 index 0000000..b39d2e7 --- /dev/null +++ b/inference/download_whisper.sh @@ -0,0 +1,9 @@ +#!usr/bin/bash + +if ! test /app/model_data/ggml-large-v3.bin; then + mkdir /app/model_data + git clone https://github.com/ggerganov/whisper.cpp.git + cd whisper.cpp + bash ./models/download-ggml-model.sh large-v3 + mv ./models/ggml-large-v3.bin /app/model_data +fi diff --git a/inference/main.py b/inference/main.py new file mode 100644 index 0000000..f8edabc --- /dev/null +++ b/inference/main.py @@ -0,0 +1,45 @@ +from fastapi import FastAPI +from fastapi.responses import JSONResponse + +from inference.deps import Model, Processor, Whisper, lifespan +from inference.clip import CLIP +from inference.models import EncodeRequest, EncodeSearchRequest + +app = FastAPI(lifespan=lifespan) + +@app.get("/") +async def root(): + return JSONResponse(content={"ok": True}) + +@app.post("/encode") +async def encode( + request: EncodeRequest, + processor: Processor, + model: Model, + whisper: Whisper +): + clip = CLIP(processor=processor, model=model) + + video_features = clip(request.link, encode_type="video") + if request.description is not None: + description_features = clip(request.description, encode_type="text") + else: + description_features = None + + audio_transcription = whisper(request.link) + audio_features = clip(audio_transcription, encode_type="text") + return { + "video": video_features, + "audio": audio_features, + "description": description_features + } + +@app.post("/encode-search") +async def encode_search( + request: EncodeSearchRequest, processor: Processor, model: Model +): + clip = CLIP(processor=processor, model=model) + + features = clip(request.query, encode_type="text") + + return {"features": features} diff --git a/inference/models.py b/inference/models.py new file mode 100644 index 0000000..61888e4 --- /dev/null +++ b/inference/models.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + + +class EncodeRequest(BaseModel): + link: str + description: str | None = None + + +class EncodeSearchRequest(BaseModel): + query: str diff --git a/inference/settings.py b/inference/settings.py index 82c3549..9170c72 100644 --- a/inference/settings.py +++ b/inference/settings.py @@ -6,3 +6,4 @@ class Settings: clip_model: str = env.str("CLIP_MODEL") + whisper_path: str = env.str("WHISPER_PATH") diff --git a/inference/whisper.py b/inference/whisper.py new file mode 100644 index 0000000..8cde250 --- /dev/null +++ b/inference/whisper.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass, field +from io import BytesIO + +import requests +from whisper_cpp_python import Whisper + +from inference.settings import Settings + + +@dataclass +class WhisperService: + _service: Whisper = field(default_factory=lambda: Whisper(model_path=Settings.whisper_path)) + + def __call__(self, link: str) -> str: + + video_data = BytesIO(requests.get(link).content) + data = self._service.transcribe(video_data) + return data["text"] diff --git a/main/.env.dist b/main/.env.dist index 19f980c..353c69c 100644 --- a/main/.env.dist +++ b/main/.env.dist @@ -1,3 +1,3 @@ -CLIP_URL=http://inference:8040/encode +CLIP_URL=http://inference:8040/ DB_HOST=chroma_db DB_PORT=8000 diff --git a/main/chroma.py b/main/chroma.py index 1a18fe7..97bea98 100644 --- a/main/chroma.py +++ b/main/chroma.py @@ -1,3 +1,4 @@ 
+from uuid import uuid4 import chromadb from chromadb.server import Settings as ChromaSettings from models import Feature @@ -25,8 +26,10 @@ def __init__( def add_feature(self, feature: Feature) -> None: self.collection.add( - ids=[feature.link], + ids=[str(uuid4())], embeddings=[feature.features], + uris=[feature.link], + metadatas=[{"feature_type": feature.feature_type}] ) def search_relevant_videos(self, search_feature: Feature, top_k: int = 100) -> list[str]: @@ -34,7 +37,7 @@ def search_relevant_videos(self, search_feature: Feature, top_k: int = 100) -> l query_embeddings=search_feature.features, n_results=top_k ) - return results['ids'][0] + return results['uris'][0] def add_text_search_suggestion(self, suggestion_query: str) -> None: subsearches = suggestion_query.split() diff --git a/main/clip.py b/main/clip.py index b94c8dd..9ae4e42 100644 --- a/main/clip.py +++ b/main/clip.py @@ -1,26 +1,34 @@ import aiohttp -from models import Video, Feature +from models import SearchFeature, Video, Feature class CLIPService: def __init__(self, url: str) -> None: self.clip_url = url - async def get_video_embedding(self, request: Video) -> Feature: + async def get_video_embeddings(self, request: Video) -> list[Feature]: async with aiohttp.ClientSession().post( - url=self.clip_url, + url=f"{self.clip_url}/encode", json=request.model_dump(mode="json") ) as resp: features = await resp.json() - return Feature(features=features['features'], link=request.link, description=request.description) + return [ + Feature( + features=v, + link=request.link, + description=request.description, + feature_type=k + ) + for k, v in features.items() + ] async def get_text_embedding( self, - request: Video, + request: SearchFeature, ) -> Feature: async with aiohttp.ClientSession().post( - self.clip_url, + f"{self.clip_url}/encode-search", json=request.model_dump(mode="json") ) as resp: features = await resp.json() - return Feature(features=features['features']) + return Feature(features=features['features'], feature_type="description") diff --git a/main/main.py b/main/main.py index 86903d5..962cc43 100644 --- a/main/main.py +++ b/main/main.py @@ -3,18 +3,30 @@ from deps import Opus, Clip, Chroma, Speller, lifespan from settings import Settings -from models import Video, Text, SuggestRequest +from models import SearchFeature, Video, Text, SuggestRequest app = FastAPI(lifespan=lifespan) + @app.post("/index") async def add_video_to_index(request: Video, clip: Clip, chroma: Chroma) -> Video: """Добавляет новое видео в хранилище - индекс""" +<<<<<<< Updated upstream feature = await clip.get_video_embedding(request) if request.description is not None: chroma.add_text_search_suggestion(suggestion_query=request.description) chroma.add_feature(feature=feature) return request.model_dump(mode="dict") +======= + features = await clip.get_video_embeddings(request) + if request.description is not None: + chroma.add_text_search_suggestion(suggestion_query=request.description) + + for feature in features: + chroma.add_feature(feature=feature) + return request + +>>>>>>> Stashed changes @app.get("/search") @cache(expire=Settings.cache_lifetime) @@ -29,8 +41,13 @@ async def search_for_related_videos( spelled_search = speller(params.text) translated_search = translator(spelled_search) search_vector = await clip.get_text_embedding( +<<<<<<< Updated upstream Video( description=translated_search +======= + SearchFeature( + query=translated_search +>>>>>>> Stashed changes ) ) return {"results": 
chroma.search_relevant_videos(search_feature=search_vector, top_k=params.return_amount)} diff --git a/main/models.py b/main/models.py index 633e847..e25975a 100644 --- a/main/models.py +++ b/main/models.py @@ -1,17 +1,20 @@ -from typing import Optional +from typing import Literal, Optional from pydantic import BaseModel class Video(BaseModel): """Represents a Link to Video with text description to be vectorized and added to index""" - description: Optional[str] = None - link: Optional[str] = None + link: str + description: str | None = None class Text(BaseModel): """Represents a text query to search related videos""" text: str return_amount: int = 50 +class SearchFeature(BaseModel): + query: str + class SuggestRequest(BaseModel): """Represents a text query to suggest related completions""" text: str @@ -21,3 +24,4 @@ class Feature(BaseModel): link: Optional[str] = None description: Optional[str] = None features: list[float] + feature_type: Literal["description"] | Literal["video"] | Literal["audio"] diff --git a/poetry.lock b/poetry.lock index 825512c..8e449ae 100644 --- a/poetry.lock +++ b/poetry.lock @@ -226,6 +226,20 @@ tests = ["attrs[tests-no-zope]", "zope-interface"] tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] +[[package]] +name = "audioread" +version = "3.0.1" +description = "Multi-library, cross-platform audio decoding." +optional = false +python-versions = ">=3.6" +files = [ + {file = "audioread-3.0.1-py3-none-any.whl", hash = "sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33"}, + {file = "audioread-3.0.1.tar.gz", hash = "sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d"}, +] + +[package.extras] +test = ["tox"] + [[package]] name = "autocorrect" version = "2.6.1" @@ -332,6 +346,70 @@ files = [ {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, ] +[[package]] +name = "cffi" +version = "1.16.0" +description = "Foreign Function Interface for Python calling C code." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, + {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, + {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, + {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, + {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, + {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, + {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, + {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, + {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, + {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, + {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, + {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, + {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, + {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, +] + +[package.dependencies] +pycparser = "*" + [[package]] name = "charset-normalizer" version = "3.3.2" @@ -550,6 +628,17 @@ humanfriendly = ">=9.1" [package.extras] cron = ["capturer (>=2.4)"] +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + [[package]] name = "deprecated" version = "1.2.14" @@ -1360,6 +1449,17 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "joblib" +version = "1.4.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + [[package]] name = "jsonpatch" version = "1.33" @@ -1503,6 +1603,86 @@ orjson = ">=3.9.14,<4.0.0" pydantic = ">=1,<3" requests = ">=2,<3" +[[package]] +name = "lazy-loader" +version = "0.4" +description = "Makes it easy to load subpackages and functions on demand." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, + {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, +] + +[package.dependencies] +packaging = "*" + +[package.extras] +dev = ["changelist (==0.5)"] +lint = ["pre-commit (==3.7.0)"] +test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] + +[[package]] +name = "librosa" +version = "0.10.2.post1" +description = "Python module for audio and music processing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "librosa-0.10.2.post1-py3-none-any.whl", hash = "sha256:dc882750e8b577a63039f25661b7e39ec4cfbacc99c1cffba666cd664fb0a7a0"}, + {file = "librosa-0.10.2.post1.tar.gz", hash = "sha256:cd99f16717cbcd1e0983e37308d1db46a6f7dfc2e396e5a9e61e6821e44bd2e7"}, +] + +[package.dependencies] +audioread = ">=2.1.9" +decorator = ">=4.3.0" +joblib = ">=0.14" +lazy-loader = ">=0.1" +msgpack = ">=1.0" +numba = ">=0.51.0" +numpy = ">=1.20.3,<1.22.0 || >1.22.0,<1.22.1 || >1.22.1,<1.22.2 || >1.22.2" +pooch = ">=1.1" +scikit-learn = ">=0.20.0" +scipy = ">=1.2.0" +soundfile = ">=0.12.1" +soxr = ">=0.3.2" +typing-extensions = ">=4.1.1" + +[package.extras] +display = ["matplotlib (>=3.5.0)"] +docs = ["ipython (>=7.0)", "matplotlib (>=3.5.0)", "mir-eval (>=0.5)", "numba (>=0.51)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=1.2.0)", "sphinxcontrib-svg2pdfconverter"] +tests = ["matplotlib (>=3.5.0)", "packaging (>=20.0)", "pytest", "pytest-cov", "pytest-mpl", "resampy (>=0.2.2)", "samplerate", "types-decorator"] + +[[package]] +name = "llvmlite" +version = "0.43.0" +description = "lightweight wrapper around basic LLVM functionality" +optional = false +python-versions = ">=3.9" +files = [ + {file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"}, + {file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"}, + {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"}, + {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"}, + {file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"}, + {file = "llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"}, + {file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"}, + {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"}, + {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"}, + {file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"}, + {file = 
"llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"}, + {file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"}, + {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"}, + {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"}, + {file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"}, + {file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"}, + {file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"}, + {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"}, + {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"}, + {file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"}, + {file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"}, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1764,6 +1944,71 @@ docs = ["sphinx"] gmpy = ["gmpy2 (>=2.1.0a4)"] tests = ["pytest (>=4.6)"] +[[package]] +name = "msgpack" +version = "1.0.8" +description = "MessagePack serializer" +optional = false +python-versions = ">=3.8" +files = [ + {file = "msgpack-1.0.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:505fe3d03856ac7d215dbe005414bc28505d26f0c128906037e66d98c4e95868"}, + {file = "msgpack-1.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6b7842518a63a9f17107eb176320960ec095a8ee3b4420b5f688e24bf50c53c"}, + {file = "msgpack-1.0.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:376081f471a2ef24828b83a641a02c575d6103a3ad7fd7dade5486cad10ea659"}, + {file = "msgpack-1.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e390971d082dba073c05dbd56322427d3280b7cc8b53484c9377adfbae67dc2"}, + {file = "msgpack-1.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00e073efcba9ea99db5acef3959efa45b52bc67b61b00823d2a1a6944bf45982"}, + {file = "msgpack-1.0.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82d92c773fbc6942a7a8b520d22c11cfc8fd83bba86116bfcf962c2f5c2ecdaa"}, + {file = "msgpack-1.0.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9ee32dcb8e531adae1f1ca568822e9b3a738369b3b686d1477cbc643c4a9c128"}, + {file = "msgpack-1.0.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e3aa7e51d738e0ec0afbed661261513b38b3014754c9459508399baf14ae0c9d"}, + {file = "msgpack-1.0.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:69284049d07fce531c17404fcba2bb1df472bc2dcdac642ae71a2d079d950653"}, + {file = "msgpack-1.0.8-cp310-cp310-win32.whl", hash = "sha256:13577ec9e247f8741c84d06b9ece5f654920d8365a4b636ce0e44f15e07ec693"}, + {file = "msgpack-1.0.8-cp310-cp310-win_amd64.whl", 
hash = "sha256:e532dbd6ddfe13946de050d7474e3f5fb6ec774fbb1a188aaf469b08cf04189a"}, + {file = "msgpack-1.0.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9517004e21664f2b5a5fd6333b0731b9cf0817403a941b393d89a2f1dc2bd836"}, + {file = "msgpack-1.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d16a786905034e7e34098634b184a7d81f91d4c3d246edc6bd7aefb2fd8ea6ad"}, + {file = "msgpack-1.0.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2872993e209f7ed04d963e4b4fbae72d034844ec66bc4ca403329db2074377b"}, + {file = "msgpack-1.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c330eace3dd100bdb54b5653b966de7f51c26ec4a7d4e87132d9b4f738220ba"}, + {file = "msgpack-1.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83b5c044f3eff2a6534768ccfd50425939e7a8b5cf9a7261c385de1e20dcfc85"}, + {file = "msgpack-1.0.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1876b0b653a808fcd50123b953af170c535027bf1d053b59790eebb0aeb38950"}, + {file = "msgpack-1.0.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dfe1f0f0ed5785c187144c46a292b8c34c1295c01da12e10ccddfc16def4448a"}, + {file = "msgpack-1.0.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3528807cbbb7f315bb81959d5961855e7ba52aa60a3097151cb21956fbc7502b"}, + {file = "msgpack-1.0.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e2f879ab92ce502a1e65fce390eab619774dda6a6ff719718069ac94084098ce"}, + {file = "msgpack-1.0.8-cp311-cp311-win32.whl", hash = "sha256:26ee97a8261e6e35885c2ecd2fd4a6d38252246f94a2aec23665a4e66d066305"}, + {file = "msgpack-1.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:eadb9f826c138e6cf3c49d6f8de88225a3c0ab181a9b4ba792e006e5292d150e"}, + {file = "msgpack-1.0.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:114be227f5213ef8b215c22dde19532f5da9652e56e8ce969bf0a26d7c419fee"}, + {file = "msgpack-1.0.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d661dc4785affa9d0edfdd1e59ec056a58b3dbb9f196fa43587f3ddac654ac7b"}, + {file = "msgpack-1.0.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d56fd9f1f1cdc8227d7b7918f55091349741904d9520c65f0139a9755952c9e8"}, + {file = "msgpack-1.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0726c282d188e204281ebd8de31724b7d749adebc086873a59efb8cf7ae27df3"}, + {file = "msgpack-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8db8e423192303ed77cff4dce3a4b88dbfaf43979d280181558af5e2c3c71afc"}, + {file = "msgpack-1.0.8-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99881222f4a8c2f641f25703963a5cefb076adffd959e0558dc9f803a52d6a58"}, + {file = "msgpack-1.0.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b5505774ea2a73a86ea176e8a9a4a7c8bf5d521050f0f6f8426afe798689243f"}, + {file = "msgpack-1.0.8-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:ef254a06bcea461e65ff0373d8a0dd1ed3aa004af48839f002a0c994a6f72d04"}, + {file = "msgpack-1.0.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e1dd7839443592d00e96db831eddb4111a2a81a46b028f0facd60a09ebbdd543"}, + {file = "msgpack-1.0.8-cp312-cp312-win32.whl", hash = "sha256:64d0fcd436c5683fdd7c907eeae5e2cbb5eb872fafbc03a43609d7941840995c"}, + {file = "msgpack-1.0.8-cp312-cp312-win_amd64.whl", hash = "sha256:74398a4cf19de42e1498368c36eed45d9528f5fd0155241e82c4082b7e16cffd"}, + {file = "msgpack-1.0.8-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:0ceea77719d45c839fd73abcb190b8390412a890df2f83fb8cf49b2a4b5c2f40"}, + {file = "msgpack-1.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1ab0bbcd4d1f7b6991ee7c753655b481c50084294218de69365f8f1970d4c151"}, + {file = "msgpack-1.0.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1cce488457370ffd1f953846f82323cb6b2ad2190987cd4d70b2713e17268d24"}, + {file = "msgpack-1.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3923a1778f7e5ef31865893fdca12a8d7dc03a44b33e2a5f3295416314c09f5d"}, + {file = "msgpack-1.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22e47578b30a3e199ab067a4d43d790249b3c0587d9a771921f86250c8435db"}, + {file = "msgpack-1.0.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd739c9251d01e0279ce729e37b39d49a08c0420d3fee7f2a4968c0576678f77"}, + {file = "msgpack-1.0.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d3420522057ebab1728b21ad473aa950026d07cb09da41103f8e597dfbfaeb13"}, + {file = "msgpack-1.0.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5845fdf5e5d5b78a49b826fcdc0eb2e2aa7191980e3d2cfd2a30303a74f212e2"}, + {file = "msgpack-1.0.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a0e76621f6e1f908ae52860bdcb58e1ca85231a9b0545e64509c931dd34275a"}, + {file = "msgpack-1.0.8-cp38-cp38-win32.whl", hash = "sha256:374a8e88ddab84b9ada695d255679fb99c53513c0a51778796fcf0944d6c789c"}, + {file = "msgpack-1.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:f3709997b228685fe53e8c433e2df9f0cdb5f4542bd5114ed17ac3c0129b0480"}, + {file = "msgpack-1.0.8-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f51bab98d52739c50c56658cc303f190785f9a2cd97b823357e7aeae54c8f68a"}, + {file = "msgpack-1.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:73ee792784d48aa338bba28063e19a27e8d989344f34aad14ea6e1b9bd83f596"}, + {file = "msgpack-1.0.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f9904e24646570539a8950400602d66d2b2c492b9010ea7e965025cb71d0c86d"}, + {file = "msgpack-1.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e75753aeda0ddc4c28dce4c32ba2f6ec30b1b02f6c0b14e547841ba5b24f753f"}, + {file = "msgpack-1.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5dbf059fb4b7c240c873c1245ee112505be27497e90f7c6591261c7d3c3a8228"}, + {file = "msgpack-1.0.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4916727e31c28be8beaf11cf117d6f6f188dcc36daae4e851fee88646f5b6b18"}, + {file = "msgpack-1.0.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7938111ed1358f536daf311be244f34df7bf3cdedb3ed883787aca97778b28d8"}, + {file = "msgpack-1.0.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:493c5c5e44b06d6c9268ce21b302c9ca055c1fd3484c25ba41d34476c76ee746"}, + {file = "msgpack-1.0.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fbb160554e319f7b22ecf530a80a3ff496d38e8e07ae763b9e82fadfe96f273"}, + {file = "msgpack-1.0.8-cp39-cp39-win32.whl", hash = "sha256:f9af38a89b6a5c04b7d18c492c8ccf2aee7048aff1ce8437c4683bb5a1df893d"}, + {file = "msgpack-1.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:ed59dd52075f8fc91da6053b12e8c89e37aa043f8986efd89e61fae69dc1b011"}, + {file = "msgpack-1.0.8.tar.gz", hash = "sha256:95c02b0e27e706e48d0e5426d1710ca78e0f0628d6e89d5b5a5b91a5f12274f3"}, +] + [[package]] name = "multidict" version = "6.0.5" @@ -1881,6 +2126,40 @@ doc = ["myst-nb (>=1.0)", "numpydoc (>=1.7)", "pillow (>=9.4)", "pydata-sphinx-t extra = ["lxml 
(>=4.6)", "pydot (>=2.0)", "pygraphviz (>=1.12)", "sympy (>=1.10)"] test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] +[[package]] +name = "numba" +version = "0.60.0" +description = "compiling Python code using LLVM" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"}, + {file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"}, + {file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"}, + {file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"}, + {file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"}, + {file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"}, + {file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"}, + {file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"}, + {file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"}, + {file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"}, + {file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"}, + {file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"}, + {file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"}, + {file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"}, + {file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"}, + {file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"}, + {file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"}, + {file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"}, + {file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"}, + {file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"}, + {file = "numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"}, +] + +[package.dependencies] +llvmlite = "==0.43.*" +numpy = ">=1.22,<2.1" + [[package]] name = "numpy" version = "1.26.4" @@ -2617,6 
+2896,27 @@ docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx- test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] type = ["mypy (>=1.8)"] +[[package]] +name = "pooch" +version = "1.8.2" +description = "A friend to fetch your data files" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pooch-1.8.2-py3-none-any.whl", hash = "sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47"}, + {file = "pooch-1.8.2.tar.gz", hash = "sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10"}, +] + +[package.dependencies] +packaging = ">=20.0" +platformdirs = ">=2.5.0" +requests = ">=2.19.0" + +[package.extras] +progress = ["tqdm (>=4.41.0,<5.0.0)"] +sftp = ["paramiko (>=2.7.0)"] +xxhash = ["xxhash (>=1.4.3)"] + [[package]] name = "posthog" version = "3.5.0" @@ -2713,6 +3013,17 @@ files = [ [package.dependencies] pyasn1 = ">=0.4.6,<0.7.0" +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + [[package]] name = "pydantic" version = "2.7.3" @@ -3374,6 +3685,93 @@ opencv = ["opencv-python"] opencv-headless = ["opencv-python-headless"] pyav = ["av"] +[[package]] +name = "scikit-learn" +version = "1.5.0" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scikit_learn-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:12e40ac48555e6b551f0a0a5743cc94cc5a765c9513fe708e01f0aa001da2801"}, + {file = "scikit_learn-1.5.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:f405c4dae288f5f6553b10c4ac9ea7754d5180ec11e296464adb5d6ac68b6ef5"}, + {file = "scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df8ccabbf583315f13160a4bb06037bde99ea7d8211a69787a6b7c5d4ebb6fc3"}, + {file = "scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c75ea812cd83b1385bbfa94ae971f0d80adb338a9523f6bbcb5e0b0381151d4"}, + {file = "scikit_learn-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:a90c5da84829a0b9b4bf00daf62754b2be741e66b5946911f5bdfaa869fcedd6"}, + {file = "scikit_learn-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a65af2d8a6cce4e163a7951a4cfbfa7fceb2d5c013a4b593686c7f16445cf9d"}, + {file = "scikit_learn-1.5.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:4c0c56c3005f2ec1db3787aeaabefa96256580678cec783986836fc64f8ff622"}, + {file = "scikit_learn-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f77547165c00625551e5c250cefa3f03f2fc92c5e18668abd90bfc4be2e0bff"}, + {file = "scikit_learn-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:118a8d229a41158c9f90093e46b3737120a165181a1b58c03461447aa4657415"}, + {file = "scikit_learn-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:a03b09f9f7f09ffe8c5efffe2e9de1196c696d811be6798ad5eddf323c6f4d40"}, + {file = "scikit_learn-1.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:460806030c666addee1f074788b3978329a5bfdc9b7d63e7aad3f6d45c67a210"}, + {file = "scikit_learn-1.5.0-cp312-cp312-macosx_12_0_arm64.whl", hash = 
"sha256:1b94d6440603752b27842eda97f6395f570941857456c606eb1d638efdb38184"}, + {file = "scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d82c2e573f0f2f2f0be897e7a31fcf4e73869247738ab8c3ce7245549af58ab8"}, + {file = "scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3a10e1d9e834e84d05e468ec501a356226338778769317ee0b84043c0d8fb06"}, + {file = "scikit_learn-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:855fc5fa8ed9e4f08291203af3d3e5fbdc4737bd617a371559aaa2088166046e"}, + {file = "scikit_learn-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:40fb7d4a9a2db07e6e0cae4dc7bdbb8fada17043bac24104d8165e10e4cff1a2"}, + {file = "scikit_learn-1.5.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:47132440050b1c5beb95f8ba0b2402bbd9057ce96ec0ba86f2f445dd4f34df67"}, + {file = "scikit_learn-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:174beb56e3e881c90424e21f576fa69c4ffcf5174632a79ab4461c4c960315ac"}, + {file = "scikit_learn-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261fe334ca48f09ed64b8fae13f9b46cc43ac5f580c4a605cbb0a517456c8f71"}, + {file = "scikit_learn-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c"}, + {file = "scikit_learn-1.5.0.tar.gz", hash = "sha256:789e3db01c750ed6d496fa2db7d50637857b451e57bcae863bff707c1247bef7"}, +] + +[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" + +[package.extras] +benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.15.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.23)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.15.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.23)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] + +[[package]] +name = "scipy" +version = "1.13.1" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, + {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"}, + {file = 
"scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"}, + {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"}, + {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"}, + {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"}, + {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"}, + {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"}, + {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"}, + {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"}, + {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"}, +] + +[package.dependencies] +numpy = ">=1.22.4,<2.3" + +[package.extras] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] +test = 
["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + [[package]] name = "setuptools" version = "70.0.0" @@ -3422,6 +3820,75 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "soundfile" +version = "0.12.1" +description = "An audio library based on libsndfile, CFFI and NumPy" +optional = false +python-versions = "*" +files = [ + {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"}, + {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"}, + {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"}, + {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"}, + {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"}, + {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"}, + {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"}, + {file = "soundfile-0.12.1.tar.gz", hash = "sha256:e8e1017b2cf1dda767aef19d2fd9ee5ebe07e050d430f77a0a7c66ba08b8cdae"}, +] + +[package.dependencies] +cffi = ">=1.0" + +[package.extras] +numpy = ["numpy"] + +[[package]] +name = "soxr" +version = "0.3.7" +description = "High quality, one-dimensional sample-rate conversion library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "soxr-0.3.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac81c4af6a993d5b7c0b466bbac4835bad2b14ec32f342b2c1f83e4cf825e301"}, + {file = "soxr-0.3.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8d8a2b3e7f8d0255e2484fb82cb66c86da6fb25b342ef793cceca9ce9a61aa16"}, + {file = "soxr-0.3.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd6eb6f6bbda2e8de36672cf2f0529ced6e638773150744ef075be0cc4f52c"}, + {file = "soxr-0.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e47d86af35b942c92606fc2d5dfccf3f01309329475571ae2312bbf9edc3a790"}, + {file = "soxr-0.3.7-cp310-cp310-win_amd64.whl", hash = "sha256:0e291adfaf9f2a7c4dd180a1b8c280f9beb1c84cb381853e4f4b3434d002ed7f"}, + {file = "soxr-0.3.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e811450f0e91972932bd37ac58e32e44002c2c99db2aa926a9e7ba164545034"}, + {file = "soxr-0.3.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9cea63014ce91035074e1228c9340e2b8609faf964e268705fcac5135d05060c"}, + {file = "soxr-0.3.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bfab27830f6217a15b83445988225c3aeea3bbccfa9399ced291e53e1b05925d"}, + {file = "soxr-0.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:286858e3078d76c11b6d490b66fed3c9bb2a4229759f6be03ceef5c02189bf2c"}, + {file = "soxr-0.3.7-cp311-cp311-win_amd64.whl", hash = "sha256:54985ff33292192d2937be80df3e5f3a44d6d53e6835f727d6b99b7cdd3f1611"}, + {file = "soxr-0.3.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:83c74ef6d61d7dcd81be26f91bee0a420f792f5c1982266f2a80e655f0650a98"}, + {file = "soxr-0.3.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cb1e14663a43fe88b8fbc287822a159028366a820abe1a0a9670fb53618cb47b"}, + {file = "soxr-0.3.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48acdfbcf870ab54f645b1cfd641bce92c1e3a67346c3bf0f6c0ad2873c1dd35"}, + {file = "soxr-0.3.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea663b76f2b0ec1576b8a43aef317aec080abc0a67a4015fcd9f3407039f260a"}, + {file = "soxr-0.3.7-cp312-cp312-win_amd64.whl", hash = "sha256:42da0d9eb79c70e5a41917f1b48a032e241a48eb4a1bcea7c80577302ff26974"}, + {file = "soxr-0.3.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:511c6b2279c8ddd83459d129d69f628f7aae4616ae0a1912963985bd89e35df7"}, + {file = "soxr-0.3.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a37c518c0b5d70162956d808d6c2e249bae0672e414e0dcfc101e200d8c31f3c"}, + {file = "soxr-0.3.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27f2890528d2b2e358938ab660a6b8346802863f5b6b646204d7ff8ab0ca2c66"}, + {file = "soxr-0.3.7-cp37-cp37m-win_amd64.whl", hash = "sha256:52467c8c012495544a6dcfcce6b5bcbbc653d24fe9bb33c0b6191acecdb5e297"}, + {file = "soxr-0.3.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ce12b93747958f2769d6b297e6e27c73d9ad635fe8104ef052bece9c8a322824"}, + {file = "soxr-0.3.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1cd65dc7b96ea3cb6c8c48e6020e859680556cc42dd3d4de44779530cce21037"}, + {file = "soxr-0.3.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d994f1a7690b1b13ab639ea33e0c1d78415b64d88d6df4af705a9443f97b9687"}, + {file = "soxr-0.3.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e87b58bc9e8c2caa16f07726f666bd043f0a49ca937baa803ce7708003b27833"}, + {file = "soxr-0.3.7-cp38-cp38-win_amd64.whl", hash = "sha256:07f4c0c6125ea1482fa187ad5f007216712ee0a93586a9b2f80e79c0bf944cf7"}, + {file = "soxr-0.3.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e5267c3ba34d4b873d9bbe3a9e58418b01ae4fd04349a4f944d9943b9ddac0f7"}, + {file = "soxr-0.3.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6e39668c250e221db888cf3b290a16fbe10a702d9a4eb604a127f720040de583"}, + {file = "soxr-0.3.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8ceeb74e5a55d903cc286d3bd12c2d8f8c85d02894071e9ec92ab405430907c"}, + {file = "soxr-0.3.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0eed6bf58192dd1bb93becd2444de4d712689713d727b32fd55623ae9aae7df7"}, + {file = "soxr-0.3.7-cp39-cp39-win_amd64.whl", hash = "sha256:7221302b4547d02a3f38dd3cd15317ab2b78873c75921db5f4a070848f0c71be"}, + {file = "soxr-0.3.7.tar.gz", hash = "sha256:436ddff00c6eb2c75b79c19cfdca7527b1e31b5fad738652f044045ba6258593"}, +] + +[package.dependencies] +numpy = "*" + +[package.extras] +docs = ["linkify-it-py", "myst-parser", "sphinx", "sphinx-book-theme"] +test = ["pytest"] + [[package]] name = "sqlalchemy" version = "2.0.30" @@ -3568,6 +4035,17 @@ files = [ doc = ["reno", "sphinx"] test = ["pytest", "tornado (>=4.5)", "typeguard"] +[[package]] +name = "threadpoolctl" +version = "3.5.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, + {file = "threadpoolctl-3.5.0.tar.gz", hash = 
"sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, +] + [[package]] name = "tokenizers" version = "0.19.1" @@ -4244,6 +4722,19 @@ files = [ {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, ] +[[package]] +name = "whisper-cpp-python" +version = "0.2.0" +description = "A Python wrapper for whisper.cpp" +optional = false +python-versions = ">=3.9" +files = [ + {file = "whisper_cpp_python-0.2.0.tar.gz", hash = "sha256:4e727040711d9f8adc3767f786525ba1543b5dd34e3a851880a953933b402f55"}, +] + +[package.dependencies] +librosa = ">=0.10.0.post2,<0.11.0" + [[package]] name = "wrapt" version = "1.16.0" @@ -4444,4 +4935,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "854913e46a4742275f760698c85f5c0003a5df398805ce81d5a40bb074697baa" +content-hash = "f8851f3ea1cdfc285ecd3281914ea167219f602416c140724e37ae3a63d2675c" diff --git a/pyproject.toml b/pyproject.toml index 4bee240..a0f3e54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ environs = "^11.0.0" fastapi-cache2 = {extras = ["redis"], version = "^0.2.1"} aiomcache = "^0.8.2" autocorrect = "^2.6.1" +whisper-cpp-python = "^0.2.0" [build-system] From 38c90781336b828ca8955265ce0173bbcb8fa38f Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sat, 22 Jun 2024 17:31:34 +0300 Subject: [PATCH 03/39] add routing between search and encode instances --- docker-compose.yml | 8 ++++++-- main/.env.dist | 3 ++- main/clip.py | 10 ++++++---- main/deps.py | 4 +--- main/settings.py | 3 ++- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index de94422..1142ad0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,7 +17,7 @@ services: - "11211:11211" restart: always - inference: + encode-inference: &inference build: context: ./inference dockerfile: Dockerfile @@ -31,6 +31,9 @@ services: ports: - "8040:8040" + search-inference: + <<: *inference + main: build: context: ./main @@ -42,7 +45,8 @@ services: - main-model-data:/app/model_data depends_on: - db - - inference + - encode-inference + - search-inference - cache env_file: - main/.env.dist diff --git a/main/.env.dist b/main/.env.dist index 353c69c..ccaf278 100644 --- a/main/.env.dist +++ b/main/.env.dist @@ -1,3 +1,4 @@ -CLIP_URL=http://inference:8040/ +ENCODE_CLIP_URL=http://encode-inference:8040/ +SEARCH_CLIP_URL=http://search-inference:8040/ DB_HOST=chroma_db DB_PORT=8000 diff --git a/main/clip.py b/main/clip.py index 9ae4e42..57c3a6d 100644 --- a/main/clip.py +++ b/main/clip.py @@ -1,13 +1,15 @@ import aiohttp +from main.settings import Settings from models import SearchFeature, Video, Feature class CLIPService: - def __init__(self, url: str) -> None: - self.clip_url = url + def __init__(self) -> None: + self.encode_clip_url = Settings.encode_clip_url + self.search_clip_url = Settings.search_clip_url async def get_video_embeddings(self, request: Video) -> list[Feature]: async with aiohttp.ClientSession().post( - url=f"{self.clip_url}/encode", + url=f"{self.encode_clip_url}/encode", json=request.model_dump(mode="json") ) as resp: features = await resp.json() @@ -27,7 +29,7 @@ async def get_text_embedding( request: SearchFeature, ) -> Feature: async with aiohttp.ClientSession().post( - f"{self.clip_url}/encode-search", + f"{self.search_clip_url}/encode-search", json=request.model_dump(mode="json") ) as resp: features = await 
resp.json() diff --git a/main/deps.py b/main/deps.py index 18e8277..6afa00e 100644 --- a/main/deps.py +++ b/main/deps.py @@ -14,9 +14,7 @@ def get_clip_service() -> CLIPService: - return CLIPService( - url=Settings.clip_url - ) + return CLIPService() def get_chroma_storage() -> ChromaStorage: return ChromaStorage() diff --git a/main/settings.py b/main/settings.py index 68e1c3a..88ad0b6 100644 --- a/main/settings.py +++ b/main/settings.py @@ -7,6 +7,7 @@ class Settings: db_host: str = env.str("DB_HOST", default="chroma_db") db_port: int = env.int("DB_PORT", default=8080) - clip_url: str = env.str("CLIP_URL", default="http://inference:8040/encode") + encode_clip_url: str = env.str("ENCODE_CLIP_URL", default="http://encode-inference:8040/") + search_clip_url: str = env.str("SEARCH_CLIP_URL", default="http://search-inference:8040/") memcached_host: str = env.str("MEMCACHED_HOST", default="request_cache") cache_lifetime: int = env.int("CACHE_LIFETIME", default=3600) From e45ccde35076ea057971b645fc68def064fe04a5 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sat, 22 Jun 2024 17:51:38 +0300 Subject: [PATCH 04/39] fix docker compose --- docker-compose.yml | 8 ++++---- main/.env.dist | 4 ++-- main/settings.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1142ad0..f0ebe8c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,7 +17,7 @@ services: - "11211:11211" restart: always - encode-inference: &inference + encode: &inference build: context: ./inference dockerfile: Dockerfile @@ -31,7 +31,7 @@ services: ports: - "8040:8040" - search-inference: + search: <<: *inference main: @@ -45,8 +45,8 @@ services: - main-model-data:/app/model_data depends_on: - db - - encode-inference - - search-inference + - encode + - search - cache env_file: - main/.env.dist diff --git a/main/.env.dist b/main/.env.dist index ccaf278..1c25429 100644 --- a/main/.env.dist +++ b/main/.env.dist @@ -1,4 +1,4 @@ -ENCODE_CLIP_URL=http://encode-inference:8040/ -SEARCH_CLIP_URL=http://search-inference:8040/ +ENCODE_CLIP_URL=http://encode:8040/ +SEARCH_CLIP_URL=http://search:8040/ DB_HOST=chroma_db DB_PORT=8000 diff --git a/main/settings.py b/main/settings.py index 88ad0b6..a31cc00 100644 --- a/main/settings.py +++ b/main/settings.py @@ -7,7 +7,7 @@ class Settings: db_host: str = env.str("DB_HOST", default="chroma_db") db_port: int = env.int("DB_PORT", default=8080) - encode_clip_url: str = env.str("ENCODE_CLIP_URL", default="http://encode-inference:8040/") - search_clip_url: str = env.str("SEARCH_CLIP_URL", default="http://search-inference:8040/") + encode_clip_url: str = env.str("ENCODE_CLIP_URL", default="http://encode:8040/") + search_clip_url: str = env.str("SEARCH_CLIP_URL", default="http://search:8040/") memcached_host: str = env.str("MEMCACHED_HOST", default="request_cache") cache_lifetime: int = env.int("CACHE_LIFETIME", default=3600) From b291d455c3e794e44d11d9add243ee4c762d4a6b Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sat, 22 Jun 2024 17:53:40 +0300 Subject: [PATCH 05/39] fix docker compose --- docker-compose.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index f0ebe8c..dd6735c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,6 @@ version: "3.10" services: db: - container_name: chroma_db image: chromadb/chroma:latest volumes: - chroma-data:/chroma/chroma @@ -11,7 +10,6 @@ services: - "8000:8000" cache: - container_name: request_cache image: memcached:latest 
     ports:
       - "11211:11211"
@@ -21,7 +19,6 @@
     build:
       context: ./inference
       dockerfile: Dockerfile
-    container_name: inference
     command: uvicorn main:app --host "0.0.0.0" --port 8040
     restart: unless-stopped
     volumes:
@@ -38,7 +35,6 @@
     build:
       context: ./main
       dockerfile: Dockerfile
-    container_name: main_gateway
     command: uvicorn main:app --host "0.0.0.0" --port 80
     restart: unless-stopped
     volumes:

From afffbc8a6edd32ea1826cb166e3a7ccd4152a8f6 Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Sat, 22 Jun 2024 23:12:51 +0300
Subject: [PATCH 06/39] add whisper.cpp preload

---
 inference/download_whisper.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/download_whisper.sh b/inference/download_whisper.sh
index b39d2e7..4f523a7 100755
--- a/inference/download_whisper.sh
+++ b/inference/download_whisper.sh
@@ -1,6 +1,6 @@
-#!usr/bin/bash
+#!/bin/bash
 
-if ! test /app/model_data/ggml-large-v3.bin; then
+if [ ! -f /app/model_data/ggml-large-v3.bin ]; then
     mkdir /app/model_data
     git clone https://github.com/ggerganov/whisper.cpp.git
     cd whisper.cpp

From 502aa1251ad941c4a952f1621bc1a33fbd62ed05 Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Sat, 22 Jun 2024 23:40:03 +0300
Subject: [PATCH 07/39] change port schematics

---
 docker-compose.yml | 2 ++
 main/.env.dist     | 2 +-
 main/settings.py   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index dd6735c..937d2dd 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,6 +30,8 @@ services:
 
   search:
     <<: *inference
+    ports:
+      - "8050:8040"
 
   main:
     build:
diff --git a/main/.env.dist b/main/.env.dist
index 1c25429..0c4bcfe 100644
--- a/main/.env.dist
+++ b/main/.env.dist
@@ -1,4 +1,4 @@
 ENCODE_CLIP_URL=http://encode:8040/
-SEARCH_CLIP_URL=http://search:8040/
+SEARCH_CLIP_URL=http://search:8050/
 DB_HOST=chroma_db
 DB_PORT=8000
diff --git a/main/settings.py b/main/settings.py
index a31cc00..09db4bd 100644
--- a/main/settings.py
+++ b/main/settings.py
@@ -8,6 +8,6 @@ class Settings:
     db_host: str = env.str("DB_HOST", default="chroma_db")
     db_port: int = env.int("DB_PORT", default=8080)
     encode_clip_url: str = env.str("ENCODE_CLIP_URL", default="http://encode:8040/")
-    search_clip_url: str = env.str("SEARCH_CLIP_URL", default="http://search:8040/")
+    search_clip_url: str = env.str("SEARCH_CLIP_URL", default="http://search:8050/")
     memcached_host: str = env.str("MEMCACHED_HOST", default="request_cache")
     cache_lifetime: int = env.int("CACHE_LIFETIME", default=3600)

From bc45b6edf8f8608793c6138a815137b00e5705cf Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Sat, 22 Jun 2024 23:42:25 +0300
Subject: [PATCH 08/39] fix stash errors

---
 main/main.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/main/main.py b/main/main.py
index 962cc43..98ba956 100644
--- a/main/main.py
+++ b/main/main.py
@@ -11,13 +11,6 @@
 @app.post("/index")
 async def add_video_to_index(request: Video, clip: Clip, chroma: Chroma) -> Video:
     """Adds a new video to the storage index"""
-<<<<<<< Updated upstream
-    feature = await clip.get_video_embedding(request)
-    if request.description is not None:
-        chroma.add_text_search_suggestion(suggestion_query=request.description)
-    chroma.add_feature(feature=feature)
-    return request.model_dump(mode="dict")
-=======
     features = await clip.get_video_embeddings(request)
     if request.description is not None:
         chroma.add_text_search_suggestion(suggestion_query=request.description)
@@ -26,7 +19,6 @@ async def add_video_to_index(request: Video, 
clip: Clip, chroma: Chroma) -> Vide chroma.add_feature(feature=feature) return request ->>>>>>> Stashed changes @app.get("/search") @cache(expire=Settings.cache_lifetime) @@ -41,13 +33,8 @@ async def search_for_related_videos( spelled_search = speller(params.text) translated_search = translator(spelled_search) search_vector = await clip.get_text_embedding( -<<<<<<< Updated upstream - Video( - description=translated_search -======= SearchFeature( query=translated_search ->>>>>>> Stashed changes ) ) return {"results": chroma.search_relevant_videos(search_feature=search_vector, top_k=params.return_amount)} From 940261eeca1eefa92d8f0fcdfbf54ee27df6f946 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sat, 22 Jun 2024 23:44:58 +0300 Subject: [PATCH 09/39] fix import errors in containers --- inference/clip.py | 2 +- inference/deps.py | 2 +- inference/main.py | 6 +++--- inference/whisper.py | 2 +- main/clip.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/inference/clip.py b/inference/clip.py index cb2ed82..3425b6c 100644 --- a/inference/clip.py +++ b/inference/clip.py @@ -5,7 +5,7 @@ import torch from transformers import CLIPModel, CLIPProcessor -from inference.frame_video import VideoFrame, create_key_frames_for_video +from frame_video import VideoFrame, create_key_frames_for_video @dataclass diff --git a/inference/deps.py b/inference/deps.py index ad509c6..d90d0ca 100644 --- a/inference/deps.py +++ b/inference/deps.py @@ -4,7 +4,7 @@ from fastapi import Depends, FastAPI, Request from transformers import CLIPModel, CLIPProcessor -from inference.whisper import WhisperService +from whisper import WhisperService from settings import Settings diff --git a/inference/main.py b/inference/main.py index f8edabc..c145610 100644 --- a/inference/main.py +++ b/inference/main.py @@ -1,9 +1,9 @@ from fastapi import FastAPI from fastapi.responses import JSONResponse -from inference.deps import Model, Processor, Whisper, lifespan -from inference.clip import CLIP -from inference.models import EncodeRequest, EncodeSearchRequest +from deps import Model, Processor, Whisper, lifespan +from clip import CLIP +from models import EncodeRequest, EncodeSearchRequest app = FastAPI(lifespan=lifespan) diff --git a/inference/whisper.py b/inference/whisper.py index 8cde250..8f3c76b 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -4,7 +4,7 @@ import requests from whisper_cpp_python import Whisper -from inference.settings import Settings +from settings import Settings @dataclass diff --git a/main/clip.py b/main/clip.py index 57c3a6d..95d08ba 100644 --- a/main/clip.py +++ b/main/clip.py @@ -1,5 +1,5 @@ import aiohttp -from main.settings import Settings +from settings import Settings from models import SearchFeature, Video, Feature class CLIPService: From dbfc95cb3007ad8e19a1e6d2db16dc15c58a0d8b Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sat, 22 Jun 2024 23:46:33 +0300 Subject: [PATCH 10/39] add whisper cpp to dep list --- inference/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/requirements.txt b/inference/requirements.txt index 5291272..bc74758 100644 --- a/inference/requirements.txt +++ b/inference/requirements.txt @@ -9,3 +9,4 @@ pillow==10.3.0 scenedetect==0.6.3 opencv-python==4.10.0.82 environs==11.0.0 +whisper-cpp-python==0.2.0 From e92c92660a5c8c56c889d805fba9c937f44fab3f Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sat, 22 Jun 2024 23:52:00 +0300 Subject: [PATCH 11/39] change db host name --- main/.env.dist | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/main/.env.dist b/main/.env.dist index 0c4bcfe..a81ca7a 100644 --- a/main/.env.dist +++ b/main/.env.dist @@ -1,4 +1,4 @@ ENCODE_CLIP_URL=http://encode:8040/ SEARCH_CLIP_URL=http://search:8050/ -DB_HOST=chroma_db +DB_HOST=db DB_PORT=8000 From ad996e74814f5bf88a04653863d91df5d7087d9d Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 00:01:09 +0300 Subject: [PATCH 12/39] fix paths to inference --- main/clip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/clip.py b/main/clip.py index 95d08ba..e9ec7c6 100644 --- a/main/clip.py +++ b/main/clip.py @@ -9,7 +9,7 @@ def __init__(self) -> None: async def get_video_embeddings(self, request: Video) -> list[Feature]: async with aiohttp.ClientSession().post( - url=f"{self.encode_clip_url}/encode", + url=f"{self.encode_clip_url}encode", json=request.model_dump(mode="json") ) as resp: features = await resp.json() @@ -29,7 +29,7 @@ async def get_text_embedding( request: SearchFeature, ) -> Feature: async with aiohttp.ClientSession().post( - f"{self.search_clip_url}/encode-search", + f"{self.search_clip_url}encode-search", json=request.model_dump(mode="json") ) as resp: features = await resp.json() From 898b711808154f0de0b2b709c4bcff397f2d4c66 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 00:21:00 +0300 Subject: [PATCH 13/39] add tempfiles for whisper --- inference/whisper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/inference/whisper.py b/inference/whisper.py index 8f3c76b..50749ac 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -1,5 +1,6 @@ from dataclasses import dataclass, field from io import BytesIO +import tempfile import requests from whisper_cpp_python import Whisper @@ -14,5 +15,8 @@ class WhisperService: def __call__(self, link: str) -> str: video_data = BytesIO(requests.get(link).content) - data = self._service.transcribe(video_data) + with tempfile.NamedTemporaryFile(delete_on_close=False) as tp: + tp.write(video_data.read()) + tp.close() + data = self._service.transcribe(open(tp.name)) return data["text"] From f1012c1ebf903ad0f90da25515130f2ac9e823e9 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 00:26:38 +0300 Subject: [PATCH 14/39] fix tempfiles + translate --- inference/whisper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/whisper.py b/inference/whisper.py index 50749ac..dd11e73 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -15,8 +15,8 @@ class WhisperService: def __call__(self, link: str) -> str: video_data = BytesIO(requests.get(link).content) - with tempfile.NamedTemporaryFile(delete_on_close=False) as tp: + with tempfile.NamedTemporaryFile(delete=False) as tp: tp.write(video_data.read()) tp.close() - data = self._service.transcribe(open(tp.name)) + data = self._service.translate(open(tp.name)) return data["text"] From d1caad3da71d057a5911691fc27476b5d765aea4 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 00:36:40 +0300 Subject: [PATCH 15/39] redo file submission to whisper --- inference/whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/whisper.py b/inference/whisper.py index dd11e73..455e3f7 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -18,5 +18,5 @@ def __call__(self, link: str) -> str: with tempfile.NamedTemporaryFile(delete=False) as tp: tp.write(video_data.read()) tp.close() - data = self._service.translate(open(tp.name)) + data = 
self._service.translate(tp.name) return data["text"] From 85860a8f0c89a907d5555b2d5af3ceb53cd0a764 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 16:46:51 +0300 Subject: [PATCH 16/39] fix prompts + add logging --- inference/clip.py | 4 ++++ inference/deps.py | 5 +++++ inference/main.py | 10 ++++++++-- inference/whisper.py | 4 +++- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/inference/clip.py b/inference/clip.py index 3425b6c..7a80688 100644 --- a/inference/clip.py +++ b/inference/clip.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from logging import Logger from typing import Callable, Literal from PIL import Image @@ -12,14 +13,17 @@ class CLIP: processor: CLIPProcessor model: CLIPModel + logger: Logger _create_key_frames_for_video: Callable[[str], list[VideoFrame]] = create_key_frames_for_video def __call__(self, encode_source: str, encode_type: Literal["text"] | Literal["video"]) -> list[float]: if encode_type == "text": + self.logger.info("Processing text input: %s, input length: %s", encode_source, len(encode_source)) return self._encode_text(encode_source) if encode_type == "video": + self.logger.info("Processing video input: %s", encode_source) return self._encode_video(encode_source) def _encode_text(self, description: str) -> list[float]: diff --git a/inference/deps.py b/inference/deps.py index d90d0ca..d937d8a 100644 --- a/inference/deps.py +++ b/inference/deps.py @@ -1,4 +1,5 @@ from contextlib import asynccontextmanager +import logging from typing import Annotated from fastapi import Depends, FastAPI, Request @@ -10,14 +11,18 @@ @asynccontextmanager async def lifespan(app: FastAPI): + logger = logging.getLogger(__name__) + logger.info("Setting up CLIP model...") app.state.clip_model = CLIPModel.from_pretrained( Settings.clip_model, cache_dir="./model_cache" ) + logger.info("Setting up CLIP processor...") app.state.processor = CLIPProcessor.from_pretrained( Settings.clip_model, cache_dir="./model_cache" ) + logger.info("Setting up Whisper service...") app.state.whisper_model = WhisperService() yield diff --git a/inference/main.py b/inference/main.py index c145610..0e9351a 100644 --- a/inference/main.py +++ b/inference/main.py @@ -1,3 +1,4 @@ +import logging from fastapi import FastAPI from fastapi.responses import JSONResponse @@ -6,6 +7,7 @@ from models import EncodeRequest, EncodeSearchRequest app = FastAPI(lifespan=lifespan) +logger = logging.getLogger(__name__) @app.get("/") async def root(): @@ -18,7 +20,9 @@ async def encode( model: Model, whisper: Whisper ): - clip = CLIP(processor=processor, model=model) + logger.info("Initializing CLIP module...") + clip = CLIP(processor=processor, model=model, logger=logger) + logger.info("CLIP module successfully initialized") video_features = clip(request.link, encode_type="video") if request.description is not None: @@ -38,7 +42,9 @@ async def encode( async def encode_search( request: EncodeSearchRequest, processor: Processor, model: Model ): - clip = CLIP(processor=processor, model=model) + logger.info("Initializing CLIP module...") + clip = CLIP(processor=processor, model=model, logger=logger) + logger.info("CLIP module successfully initialized") features = clip(request.query, encode_type="text") diff --git a/inference/whisper.py b/inference/whisper.py index 455e3f7..37698f2 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -18,5 +18,7 @@ def __call__(self, link: str) -> str: with tempfile.NamedTemporaryFile(delete=False) as tp: tp.write(video_data.read()) tp.close() - 
data = self._service.translate(tp.name) + data = self._service.translate( + tp.name, prompt="" + ) return data["text"] From 2456c34fdd2061a9a493ac5a9d3a64fac7c8f4b2 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 17:10:43 +0300 Subject: [PATCH 17/39] adding threads to the problem --- inference/whisper.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/inference/whisper.py b/inference/whisper.py index 37698f2..b2b8ef6 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -10,7 +10,12 @@ @dataclass class WhisperService: - _service: Whisper = field(default_factory=lambda: Whisper(model_path=Settings.whisper_path)) + _service: Whisper = field( + default_factory=lambda: Whisper( + model_path=Settings.whisper_path, + n_threads=4 + ) + ) def __call__(self, link: str) -> str: From 6a7d81e9a3207d3af8f6b3da7f4f014add8b1d1d Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 17:51:14 +0300 Subject: [PATCH 18/39] piping video to wav and adding logging --- docker-compose.yml | 2 +- inference/Dockerfile | 2 +- inference/frame_video.py | 13 +++++++++++++ inference/log_conf.yaml | 34 ++++++++++++++++++++++++++++++++++ inference/requirements.txt | 1 + inference/whisper.py | 17 +++++++++++++---- main/clip.py | 3 ++- 7 files changed, 65 insertions(+), 7 deletions(-) create mode 100644 inference/log_conf.yaml diff --git a/docker-compose.yml b/docker-compose.yml index 937d2dd..cd7af13 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,7 +19,7 @@ services: build: context: ./inference dockerfile: Dockerfile - command: uvicorn main:app --host "0.0.0.0" --port 8040 + command: uvicorn main:app --host "0.0.0.0" --port 8040 --log-config=log_conf.yaml restart: unless-stopped volumes: - inference-model-data:/app/model_data diff --git a/inference/Dockerfile b/inference/Dockerfile index 1386abc..500c877 100644 --- a/inference/Dockerfile +++ b/inference/Dockerfile @@ -16,4 +16,4 @@ RUN python -m pip install --upgrade pip && pip install -r requirements.txt COPY ./ /app/ EXPOSE 8040 -CMD uvicorn main:app --port 8040 +CMD uvicorn main:app --port 8040 --log-config=log_conf.yaml diff --git a/inference/frame_video.py b/inference/frame_video.py index 374cc20..e61cd13 100644 --- a/inference/frame_video.py +++ b/inference/frame_video.py @@ -57,3 +57,16 @@ def create_frame_in_ram(video_path: str, timecode: str) -> BytesIO: process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) frame_data, _ = process.communicate() return BytesIO(frame_data) + +def get_audio_in_ram(video_path: str) -> BytesIO: + command = [ + "ffmpeg", + "-i", video_path, + "-acodec", "pcm_s16le", + "-ac", "1", + "-ar", "16000", + "-" + ] + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + audio_data, _ = process.communicate() + return BytesIO(audio_data) diff --git a/inference/log_conf.yaml b/inference/log_conf.yaml new file mode 100644 index 0000000..1377c64 --- /dev/null +++ b/inference/log_conf.yaml @@ -0,0 +1,34 @@ +version: 1 +disable_existing_loggers: False +formatters: + default: + # "()": uvicorn.logging.DefaultFormatter + format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + access: + # "()": uvicorn.logging.AccessFormatter + format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +handlers: + default: + formatter: default + class: logging.StreamHandler + stream: ext://sys.stderr + access: + formatter: access + class: logging.StreamHandler + stream: ext://sys.stdout +loggers: + uvicorn.error: 
+ level: INFO + handlers: + - default + propagate: no + uvicorn.access: + level: INFO + handlers: + - access + propagate: no +root: + level: DEBUG + handlers: + - default + propagate: no diff --git a/inference/requirements.txt b/inference/requirements.txt index bc74758..fd975b8 100644 --- a/inference/requirements.txt +++ b/inference/requirements.txt @@ -10,3 +10,4 @@ scenedetect==0.6.3 opencv-python==4.10.0.82 environs==11.0.0 whisper-cpp-python==0.2.0 +PyYAML>=6.0 diff --git a/inference/whisper.py b/inference/whisper.py index b2b8ef6..415b88e 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -1,10 +1,13 @@ from dataclasses import dataclass, field from io import BytesIO import tempfile +import os +from typing import Callable import requests from whisper_cpp_python import Whisper +from inference.frame_video import get_audio_in_ram from settings import Settings @@ -16,14 +19,20 @@ class WhisperService: n_threads=4 ) ) + _get_audio_in_ram: Callable[[str], BytesIO] = get_audio_in_ram def __call__(self, link: str) -> str: video_data = BytesIO(requests.get(link).content) - with tempfile.NamedTemporaryFile(delete=False) as tp: - tp.write(video_data.read()) - tp.close() + with tempfile.NamedTemporaryFile() as video: + video.write(video_data.read()) + audio_data = self._get_audio_in_ram(video.name) + + with tempfile.NamedTemporaryFile(delete=False) as audio: + audio.write(audio_data.read()) + audio.close() data = self._service.translate( - tp.name, prompt="" + audio.name, prompt="" ) + os.unlink(audio.name) return data["text"] diff --git a/main/clip.py b/main/clip.py index e9ec7c6..e5264c9 100644 --- a/main/clip.py +++ b/main/clip.py @@ -6,9 +6,10 @@ class CLIPService: def __init__(self) -> None: self.encode_clip_url = Settings.encode_clip_url self.search_clip_url = Settings.search_clip_url + self.session_timeout = aiohttp.ClientTimeout(60 * 5) async def get_video_embeddings(self, request: Video) -> list[Feature]: - async with aiohttp.ClientSession().post( + async with aiohttp.ClientSession(timeout=self.session_timeout).post( url=f"{self.encode_clip_url}encode", json=request.model_dump(mode="json") ) as resp: From 89d75a3a326f853a8749c0f9a016110b14e8d998 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 17:59:21 +0300 Subject: [PATCH 19/39] adding summary api --- inference/.env.dist | 1 + inference/settings.py | 1 + inference/whisper.py | 11 ++++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/inference/.env.dist b/inference/.env.dist index 71f7798..0281e6a 100644 --- a/inference/.env.dist +++ b/inference/.env.dist @@ -1,2 +1,3 @@ CLIP_MODEL=laion/CLIP-ViT-g-14-laion2B-s12B-b42K WHISPER_PATH=/app/model_data/ggml-large-v3.bin +SUMMARIZATION_MODEL=facebook/bart-large-cnn diff --git a/inference/settings.py b/inference/settings.py index 9170c72..30b2e00 100644 --- a/inference/settings.py +++ b/inference/settings.py @@ -6,4 +6,5 @@ class Settings: clip_model: str = env.str("CLIP_MODEL") + summarization_model: str = env.str("SUMMARIZATION_MODEL") whisper_path: str = env.str("WHISPER_PATH") diff --git a/inference/whisper.py b/inference/whisper.py index 415b88e..fb0fd06 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -5,6 +5,7 @@ from typing import Callable import requests +from transformers import Pipeline, pipeline from whisper_cpp_python import Whisper from inference.frame_video import get_audio_in_ram @@ -19,6 +20,12 @@ class WhisperService: n_threads=4 ) ) + _summary_pipeline: Pipeline = field( + default_factory=lambda: pipeline( + 
"summarization", + model=Settings.summarization_model + ) + ) _get_audio_in_ram: Callable[[str], BytesIO] = get_audio_in_ram def __call__(self, link: str) -> str: @@ -35,4 +42,6 @@ def __call__(self, link: str) -> str: audio.name, prompt="" ) os.unlink(audio.name) - return data["text"] + text = data["text"] + summary = self._summary_pipeline(text, max_length=77) + return summary[0]["summary_text"] From 86e8aeb740ff31eb9b58265dce6e03695abcdeb4 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 18:04:43 +0300 Subject: [PATCH 20/39] adding more logging to whisper for debug purposes --- inference/whisper.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/inference/whisper.py b/inference/whisper.py index fb0fd06..b29788b 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -1,5 +1,6 @@ from dataclasses import dataclass, field from io import BytesIO +import logging import tempfile import os from typing import Callable @@ -26,15 +27,18 @@ class WhisperService: model=Settings.summarization_model ) ) + _logger: logging.Logger = field( + default_factory=lambda: logging.getLogger(__name__) + ) _get_audio_in_ram: Callable[[str], BytesIO] = get_audio_in_ram def __call__(self, link: str) -> str: - + self._logger.info("Converting video file to WAV") video_data = BytesIO(requests.get(link).content) with tempfile.NamedTemporaryFile() as video: video.write(video_data.read()) audio_data = self._get_audio_in_ram(video.name) - + self._logger.info("Processing WAV file by whisper") with tempfile.NamedTemporaryFile(delete=False) as audio: audio.write(audio_data.read()) audio.close() @@ -42,6 +46,9 @@ def __call__(self, link: str) -> str: audio.name, prompt="" ) os.unlink(audio.name) + self._logger.info("summarizing transcript into 77 CLIP tokens") text = data["text"] summary = self._summary_pipeline(text, max_length=77) - return summary[0]["summary_text"] + result: str = summary[0]["summary_text"] # type: ignore + self._logger.info("Processed video file into text description: %s, total length: %s", result, len(result)) + return result From 09c2bd4e63d7d92d9dc04c39f0ff62bd844e7ba2 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 18:11:15 +0300 Subject: [PATCH 21/39] fix importing error --- inference/whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/whisper.py b/inference/whisper.py index b29788b..1c2fc15 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -9,7 +9,7 @@ from transformers import Pipeline, pipeline from whisper_cpp_python import Whisper -from inference.frame_video import get_audio_in_ram +from frame_video import get_audio_in_ram from settings import Settings From 5d848d31d501b661fcc49b110be28bb680cdf8bf Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 18:37:41 +0300 Subject: [PATCH 22/39] fixing suffix errors + debug --- inference/log_conf.yaml | 2 +- inference/whisper.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/inference/log_conf.yaml b/inference/log_conf.yaml index 1377c64..32e7e50 100644 --- a/inference/log_conf.yaml +++ b/inference/log_conf.yaml @@ -28,7 +28,7 @@ loggers: - access propagate: no root: - level: DEBUG + level: INFO handlers: - default propagate: no diff --git a/inference/whisper.py b/inference/whisper.py index 1c2fc15..d7fcdbf 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -35,11 +35,13 @@ class WhisperService: def __call__(self, link: str) -> str: self._logger.info("Converting video file 
to WAV") video_data = BytesIO(requests.get(link).content) - with tempfile.NamedTemporaryFile() as video: + with tempfile.NamedTemporaryFile(delete=False) as video: video.write(video_data.read()) + video.close() audio_data = self._get_audio_in_ram(video.name) + os.unlink(video.name) self._logger.info("Processing WAV file by whisper") - with tempfile.NamedTemporaryFile(delete=False) as audio: + with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio: audio.write(audio_data.read()) audio.close() data = self._service.translate( From 20c0f42238994025a4ede8fc721a57cd1a782432 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 18:44:27 +0300 Subject: [PATCH 23/39] ditching wav formatting for now --- inference/whisper.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/inference/whisper.py b/inference/whisper.py index d7fcdbf..0030346 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -33,21 +33,24 @@ class WhisperService: _get_audio_in_ram: Callable[[str], BytesIO] = get_audio_in_ram def __call__(self, link: str) -> str: - self._logger.info("Converting video file to WAV") + self._logger.info("Converting video file to transcript") video_data = BytesIO(requests.get(link).content) with tempfile.NamedTemporaryFile(delete=False) as video: video.write(video_data.read()) video.close() - audio_data = self._get_audio_in_ram(video.name) - os.unlink(video.name) - self._logger.info("Processing WAV file by whisper") - with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio: - audio.write(audio_data.read()) - audio.close() data = self._service.translate( - audio.name, prompt="" + video.name, prompt="" ) - os.unlink(audio.name) + # audio_data = self._get_audio_in_ram(video.name) + os.unlink(video.name) + #self._logger.info("Processing WAV file by whisper") + #with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio: + # audio.write(audio_data.read()) + # audio.close() + # data = self._service.translate( + # audio.name, prompt="" + # ) + #os.unlink(audio.name) self._logger.info("summarizing transcript into 77 CLIP tokens") text = data["text"] summary = self._summary_pipeline(text, max_length=77) From c9e5051f0fda767de5fa968c81cf5cb91ec422f8 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 18:54:59 +0300 Subject: [PATCH 24/39] add logging for clip encoding result --- inference/clip.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inference/clip.py b/inference/clip.py index 7a80688..89afb89 100644 --- a/inference/clip.py +++ b/inference/clip.py @@ -32,7 +32,9 @@ def _encode_text(self, description: str) -> list[float]: text_features = self.model.get_text_features(**text_inputs) text_features /= text_features.norm(dim=-1, keepdim=True) - return text_features.tolist()[0] + result = text_features.tolist()[0] + self.logger.info("Processed result vector - %s", result) + return result def _encode_video(self, link: str) -> list[float]: images = self._create_key_frames_for_video(link) @@ -50,4 +52,7 @@ def _encode_video(self, link: str) -> list[float]: features = torch.mean(image_features, dim=0) features /= features.norm(dim=-1, keepdim=True) - return features.tolist()[0] + result = features.tolist()[0] + self.logger.info("Processed result vector - %s", result) + return result + From 9cc77536cb97b4aef12250d56d43dd4b725dcdcd Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 19:00:40 +0300 Subject: [PATCH 25/39] fix video vectorization --- 
inference/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/clip.py b/inference/clip.py index 89afb89..5109475 100644 --- a/inference/clip.py +++ b/inference/clip.py @@ -52,7 +52,7 @@ def _encode_video(self, link: str) -> list[float]: features = torch.mean(image_features, dim=0) features /= features.norm(dim=-1, keepdim=True) - result = features.tolist()[0] + result = features.tolist() self.logger.info("Processed result vector - %s", result) return result From 22eeb6e460b26dd39084850baf266a1e27ee8fe0 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 19:07:07 +0300 Subject: [PATCH 26/39] fix suggestion id generation --- main/chroma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/chroma.py b/main/chroma.py index 97bea98..bfdddc8 100644 --- a/main/chroma.py +++ b/main/chroma.py @@ -43,7 +43,7 @@ def add_text_search_suggestion(self, suggestion_query: str) -> None: subsearches = suggestion_query.split() self.desc_collection.add( documents=[suggestion_query] + subsearches, - ids=[str(hash(query)) for query in [suggestion_query] + subsearches] + ids=[str(uuid4()) for _ in [suggestion_query] + subsearches] ) def get_text_search_suggestions(self, search_query: str, top_k: int = 20) -> list[str]: From 28775d3df681a1c2e7cf6f90b77d70fc53ca35a9 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 19:34:47 +0300 Subject: [PATCH 27/39] redo ports again --- docker-compose.yml | 6 +++--- inference/Dockerfile | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index cd7af13..38d9149 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,19 +19,19 @@ services: build: context: ./inference dockerfile: Dockerfile - command: uvicorn main:app --host "0.0.0.0" --port 8040 --log-config=log_conf.yaml + command: uvicorn main:app --host "0.0.0.0" --port 80 --log-config=log_conf.yaml restart: unless-stopped volumes: - inference-model-data:/app/model_data env_file: - inference/.env.dist ports: - - "8040:8040" + - "8040:80" search: <<: *inference ports: - - "8050:8040" + - "8050:80" main: build: diff --git a/inference/Dockerfile b/inference/Dockerfile index 500c877..71d6389 100644 --- a/inference/Dockerfile +++ b/inference/Dockerfile @@ -16,4 +16,4 @@ RUN python -m pip install --upgrade pip && pip install -r requirements.txt COPY ./ /app/ EXPOSE 8040 -CMD uvicorn main:app --port 8040 --log-config=log_conf.yaml +CMD uvicorn main:app --port 80 --log-config=log_conf.yaml From 2fb6192d60801b984308676ee1c65a284ea61d57 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 20:40:14 +0300 Subject: [PATCH 28/39] redo ports again --- docker-compose.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 38d9149..f41934d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,7 @@ services: - "11211:11211" restart: always - encode: &inference + encode: build: context: ./inference dockerfile: Dockerfile @@ -29,7 +29,15 @@ services: - "8040:80" search: - <<: *inference + build: + context: ./inference + dockerfile: Dockerfile + command: uvicorn main:app --host "0.0.0.0" --port 80 --log-config=log_conf.yaml + restart: unless-stopped + volumes: + - inference-model-data:/app/model_data + env_file: + - inference/.env.dist ports: - "8050:80" From 0de8abf6791570220eadb25dd34d49c67abf776c Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Sun, 23 Jun 2024 23:26:55 +0300 Subject: [PATCH 
29/39] more port meddling

---
 docker-compose.yml   | 8 ++++----
 inference/Dockerfile | 3 ---
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index f41934d..0b799ea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,27 +19,27 @@ services:
     build:
       context: ./inference
       dockerfile: Dockerfile
-    command: uvicorn main:app --host "0.0.0.0" --port 80 --log-config=log_conf.yaml
+    command: uvicorn main:app --host "0.0.0.0" --port 8040 --log-config=log_conf.yaml
     restart: unless-stopped
     volumes:
       - inference-model-data:/app/model_data
     env_file:
       - inference/.env.dist
     ports:
-      - "8040:80"
+      - "8040:8040"
 
   search:
     build:
       context: ./inference
       dockerfile: Dockerfile
-    command: uvicorn main:app --host "0.0.0.0" --port 80 --log-config=log_conf.yaml
+    command: uvicorn main:app --host "0.0.0.0" --port 8050 --log-config=log_conf.yaml
     restart: unless-stopped
     volumes:
       - inference-model-data:/app/model_data
     env_file:
       - inference/.env.dist
     ports:
-      - "8050:80"
+      - "8050:8050"
 
   main:
     build:
diff --git a/inference/Dockerfile b/inference/Dockerfile
index 71d6389..48c4375 100644
--- a/inference/Dockerfile
+++ b/inference/Dockerfile
@@ -14,6 +14,3 @@ COPY requirements.txt /app/
 RUN python -m pip install --upgrade pip && pip install -r requirements.txt
 
 COPY ./ /app/
-
-EXPOSE 8040
-CMD uvicorn main:app --port 80 --log-config=log_conf.yaml

From 784fd5469df1ea707390fbf399bf086095580049 Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Sun, 23 Jun 2024 23:44:36 +0300
Subject: [PATCH 30/39] save urls to documents instead of uris (they don't work)

---
 main/chroma.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main/chroma.py b/main/chroma.py
index bfdddc8..9a86893 100644
--- a/main/chroma.py
+++ b/main/chroma.py
@@ -28,7 +28,7 @@ def add_feature(self, feature: Feature) -> None:
         self.collection.add(
             ids=[str(uuid4())],
             embeddings=[feature.features],
-            uris=[feature.link],
+            documents=[feature.link],
             metadatas=[{"feature_type": feature.feature_type}]
         )
 
@@ -37,7 +37,7 @@ def search_relevant_videos(self, search_feature: Feature, top_k: int = 100) -> l
             query_embeddings=search_feature.features,
             n_results=top_k
         )
-        return results['uris'][0]
+        return results['documents'][0]
 
     def add_text_search_suggestion(self, suggestion_query: str) -> None:
         subsearches = suggestion_query.split()

From ec62d4f9cf1d7bd1418a81efbf809e9dbd498c7b Mon Sep 17 00:00:00 2001
From: Antony Redman
Date: Mon, 24 Jun 2024 10:38:51 +0300
Subject: [PATCH 31/39] move to faster-whisper to improve performance + optimizations

---
 inference/Dockerfile          |   3 -
 inference/download_whisper.sh |   9 ---
 inference/requirements.txt    |   2 +-
 inference/settings.py         |   3 +-
 inference/translator.py       |  32 +++++++++
 inference/whisper.py          |  56 +++++++++-------
 poetry.lock                   | 121 +++++++++++++++++++++++++++++++++-
 pyproject.toml                |   1 +
 8 files changed, 188 insertions(+), 39 deletions(-)
 delete mode 100755 inference/download_whisper.sh
 create mode 100644 inference/translator.py

diff --git a/inference/Dockerfile b/inference/Dockerfile
index 48c4375..3496c73 100644
--- a/inference/Dockerfile
+++ b/inference/Dockerfile
@@ -5,9 +5,6 @@ ENV PYTHONUNBUFFERED 1
 
 WORKDIR /app
 
-COPY download_whisper.sh /app/
-RUN ./download_whisper.sh
-
 RUN apt-get update && apt-get install ffmpeg -y
 
 COPY requirements.txt /app/
diff --git a/inference/download_whisper.sh b/inference/download_whisper.sh
deleted file mode 100755
index 4f523a7..0000000
--- a/inference/download_whisper.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-if [ ! 
-f /app/model_data/ggml-large-v3.bin ]; then - mkdir /app/model_data - git clone https://github.com/ggerganov/whisper.cpp.git - cd whisper.cpp - bash ./models/download-ggml-model.sh large-v3 - mv ./models/ggml-large-v3.bin /app/model_data -fi diff --git a/inference/requirements.txt b/inference/requirements.txt index fd975b8..0ce7ad4 100644 --- a/inference/requirements.txt +++ b/inference/requirements.txt @@ -9,5 +9,5 @@ pillow==10.3.0 scenedetect==0.6.3 opencv-python==4.10.0.82 environs==11.0.0 -whisper-cpp-python==0.2.0 +faster-whisper==1.0.2 PyYAML>=6.0 diff --git a/inference/settings.py b/inference/settings.py index 30b2e00..5c5f3e0 100644 --- a/inference/settings.py +++ b/inference/settings.py @@ -7,4 +7,5 @@ class Settings: clip_model: str = env.str("CLIP_MODEL") summarization_model: str = env.str("SUMMARIZATION_MODEL") - whisper_path: str = env.str("WHISPER_PATH") + whisper_model: str = env.str("WHISPER_MODEL") + translation_model: str = env.str("TRANSLATION_MODEL") diff --git a/inference/translator.py b/inference/translator.py new file mode 100644 index 0000000..65a1aa2 --- /dev/null +++ b/inference/translator.py @@ -0,0 +1,32 @@ +from dataclasses import dataclass + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +from settings import Settings + + +@dataclass +class OpusTranslatorModel: + _model: AutoModelForSeq2SeqLM | None = None + _tokenizer: AutoTokenizer | None = None + + _model_name: str = Settings.translation_model + _device: str = "cpu" + + def __post_init__(self): + self._tokenizer = AutoTokenizer.from_pretrained( + self._model_name, + cache_dir="./model_cache" + ) + self._model = AutoModelForSeq2SeqLM.from_pretrained( + self._model_name, + cache_dir="./model_cache" + ) + + + def __call__(self, translate_query: str) -> str: + input_ids = self._tokenizer.encode(translate_query, return_tensors="pt") + output_ids = self._model.generate(input_ids.to(self._device), max_new_tokens=100) + en_text = self._tokenizer.decode(output_ids[0], skip_special_tokens=True) + + return en_text diff --git a/inference/whisper.py b/inference/whisper.py index 0030346..4937c9f 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -1,26 +1,34 @@ from dataclasses import dataclass, field from io import BytesIO import logging -import tempfile -import os from typing import Callable import requests from transformers import Pipeline, pipeline -from whisper_cpp_python import Whisper +from faster_whisper import WhisperModel from frame_video import get_audio_in_ram +from translator import OpusTranslatorModel from settings import Settings +model = WhisperModel + + @dataclass class WhisperService: - _service: Whisper = field( - default_factory=lambda: Whisper( - model_path=Settings.whisper_path, - n_threads=4 + _whisper: WhisperModel = field( + default_factory=lambda: WhisperModel( + Settings.whisper_model, + device="cpu", + compute_type="float16", + cpu_threads=8, + num_workers=4, ) ) + _translator: OpusTranslatorModel = field( + default_factory=OpusTranslatorModel + ) _summary_pipeline: Pipeline = field( default_factory=lambda: pipeline( "summarization", @@ -35,25 +43,25 @@ class WhisperService: def __call__(self, link: str) -> str: self._logger.info("Converting video file to transcript") video_data = BytesIO(requests.get(link).content) - with tempfile.NamedTemporaryFile(delete=False) as video: - video.write(video_data.read()) - video.close() - data = self._service.translate( - video.name, prompt="" + segments, info = self._whisper.transcribe( + video_data, + language="ru", + 
beam_size=5 + ) + if info.language_probability < 0.5: + self._logger.info( + "Cannot properly identify speech, probability=%s, returning empty string", + info.language_probability ) - # audio_data = self._get_audio_in_ram(video.name) - os.unlink(video.name) - #self._logger.info("Processing WAV file by whisper") - #with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio: - # audio.write(audio_data.read()) - # audio.close() - # data = self._service.translate( - # audio.name, prompt="" - # ) - #os.unlink(audio.name) + return "" self._logger.info("summarizing transcript into 77 CLIP tokens") - text = data["text"] - summary = self._summary_pipeline(text, max_length=77) + full_translation = "" + for segment in segments: + if segment.no_speech_prob > 0.5: + continue + translated_segment = self._translator(segment.text) + full_translation += " " + translated_segment + summary = self._summary_pipeline(full_translation, max_length=77) result: str = summary[0]["summary_text"] # type: ignore self._logger.info("Processed video file into text description: %s, total length: %s", result, len(result)) return result diff --git a/poetry.lock b/poetry.lock index 8e449ae..9c7f402 100644 --- a/poetry.lock +++ b/poetry.lock @@ -250,6 +250,64 @@ files = [ {file = "autocorrect-2.6.1.tar.gz", hash = "sha256:2bc68192dc645b44bece2613caac338e93548c3dac9c563095b27224c7fd4391"}, ] +[[package]] +name = "av" +version = "12.1.0" +description = "Pythonic bindings for FFmpeg's libraries." +optional = false +python-versions = ">=3.8" +files = [ + {file = "av-12.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0df2ad330ccf63ed8192d637306f13123cdf1c06717168d1de8b9a084d62f70"}, + {file = "av-12.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e66ad48dc3f618cf4a75cc14dd7e119d1151ff3c13b9b064014c79bad20df85"}, + {file = "av-12.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0e8fbbe3cffd04dcbfaf7f9e0469c8c9d3ae962728487aae0dbbac9ebb62567"}, + {file = "av-12.1.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c24d21b116e3af45e2f4b3a7ff1c96ae9a266bcde33a689ace0c52888e74d9"}, + {file = "av-12.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1eff59d1eb0ba263e9efe8e460ca239c6ee2285f1b92c6b3c64f002c1b2ffd56"}, + {file = "av-12.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:09f8bd1fd124e389a266c770d209b5b4333f69c4b5a66b9aa2d09a561b0b54ab"}, + {file = "av-12.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e4c409639699d75e85a5b4b9fbb0538388bb009c8b426f7976b218731815e645"}, + {file = "av-12.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f624a61d8062bb7128a4b0af018ef5c7642acff2af7cea1bb6cc5aa663954b77"}, + {file = "av-12.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:73c61635e959dd50857f1ae3ad28984ce813688262672a5188376686dd293333"}, + {file = "av-12.1.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f8dcf20ecdfed62cb8b31790d3f394c76f05d5d58d5cc516f7b37c8608b78e2"}, + {file = "av-12.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebb11aba1ef2acb945713be5f4f7a359439230dc566243c354dddb2b06361367"}, + {file = "av-12.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:a309994db77f632b606fe22c5bac03302e3dbe48d53c195abc435ccc56192746"}, + {file = "av-12.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:08401e59a9e33a42511d28cf1fdc570c31d3416426a2d73f4f4aaaaca5945c54"}, + {file = 
"av-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:efd45e3aa1e478ccbaafd84baf7d95d660b9cef30d850816129fd37d76813589"}, + {file = "av-12.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ab553ce72c631477181d6c08c6e710afa44fa3452e61b82d9a75be07b1b2fef"}, + {file = "av-12.1.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:555f3240306ff02169ff209b152f97b071b57957868c3004c65e25c28130d593"}, + {file = "av-12.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07706499489f2047b54a4675dd04e2cf88322caef904b7b6eb03f480e682cf15"}, + {file = "av-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:f669f5fb2515e9a4c9ee05b24ffbe3168d33c241bda93c84c8e384ca682a5cde"}, + {file = "av-12.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:876302ee793a457a03c4faa8281012671bb52dec843062bec59d6f0ae3735ba6"}, + {file = "av-12.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6ad88e1e61e65c69d92ff1db8826686f913f147b427c99aa3202b027e766128"}, + {file = "av-12.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:49a8f88b26d3d25140633a8ec48328a9467bbe001d01c54472394484cdb60b10"}, + {file = "av-12.1.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97873f344344b9b6aef786b22b57fb42c6eaa4ea0798d2020c5ed061f29ab3d6"}, + {file = "av-12.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdf4c54354580abbea9390e23a471a346e9a4b4ca19c6929ad11a59d525e2ad3"}, + {file = "av-12.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:dc1a82e7d43495be6d34b50fd917989a72de7c3a7434d8ec72af0952c1ad4ea3"}, + {file = "av-12.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:41d13494401bd3968255f7f9af2af203c30b684efc5a7ed92ebe9ec37f9f9264"}, + {file = "av-12.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc36f7b74e88db8e73fa69dc869331da74abc4f034ecd55f85f6232fcdddca60"}, + {file = "av-12.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81ff7a43ce921f2cc3c794810b148c4fa2cfd7ff10f4404072c94cf57b39b13d"}, + {file = "av-12.1.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce02915698d605c19c372314b7894033a451e838300d0a45c2708a550044e2d1"}, + {file = "av-12.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eadd5c7c374c9ff889a9116802cdda7ef9d574b623338f4045effc0f3f3c2cbc"}, + {file = "av-12.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:f32893849fe34300f3cec51c4ae71c45b0acac448d36336d3452a5bb4f7e11bf"}, + {file = "av-12.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a0a2a8693fdaa3bbb00255cda388f110f7a0b00817470a8cd8f1aa5c8dcbc3c9"}, + {file = "av-12.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:615f440856cbc5b96b8ae52c75ba722f082b898c3ab837eae024a06a0914e8a6"}, + {file = "av-12.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:257fe519b0ffb4e900b737515137fb9ae0490edca7d70818b6c71c3cd79994ca"}, + {file = "av-12.1.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:04afe8f9005bb42f95717bcfbb22a8950b4b942a862444edb1f0bab71ea702e9"}, + {file = "av-12.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63cbeaedc0184094b7d36bd4267cd61e6c69c18cb3464cc726ce6a8a438ac87a"}, + {file = "av-12.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0a0e056baa87037f932d12de3d3f258cbc4284d18d85099ccd845b333ac1bb91"}, + {file = "av-12.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", 
hash = "sha256:7d549c2e6e9035022ea2280b781150a8c81acc4a03c69bde20b2f53262041a88"}, + {file = "av-12.1.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:3b1e02715cbb985b0efe6b6aaf134f9d1fee760822a07fd19e995a8e461909f4"}, + {file = "av-12.1.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b348264ba26152d7b06f2aaf0b2a11c90b13c628a447f6daa2a6770b9443fb0"}, + {file = "av-12.1.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6a3b3e4138cd1977f14e3d16c5f89979de8efa251d7558e2dc10a51cfcc0100"}, + {file = "av-12.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:105b017958eb5b6a128a5399200a4ec2b1040c2047e0b5f5e3714cd64fe7046e"}, + {file = "av-12.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:00596e53db3082193142e32fbdf47349724221de117645b0ed8fcaaec508adf4"}, + {file = "av-12.1.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed7c48d2d79961d70ea59f44fcff453bb2444a152793f80d2ceaa17af4331b9c"}, + {file = "av-12.1.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d2c486adf83fc5b8e444efcc32f3eef27eefd6d0966ef68607d41205adcd8ec0"}, + {file = "av-12.1.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abe9475dd2c8bea47338d5e90d6a45a28930d0fe3820ed2d3d09dfbb3316d476"}, + {file = "av-12.1.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0130a8391aa258eee60da3c09d69eb5c9480f14a9f1b1b5312336bac879edd2a"}, + {file = "av-12.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669f206cfdd5696d0edf2c81c5d220acc40b4153b71cf6662618c376e00b6d3a"}, + {file = "av-12.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e322533f585c2e8df07aa708c594fcb67f5f27a2f8b4107a7e6a6f90606190c7"}, + {file = "av-12.1.0.tar.gz", hash = "sha256:67adab9fdabcb8a86bd542787196580e38ed4132331ee9e82234b23cea9546b3"}, +] + [[package]] name = "backoff" version = "2.2.1" @@ -628,6 +686,45 @@ humanfriendly = ">=9.1" [package.extras] cron = ["capturer (>=2.4)"] +[[package]] +name = "ctranslate2" +version = "4.3.1" +description = "Fast inference engine for Transformer models" +optional = false +python-versions = ">=3.8" +files = [ + {file = "ctranslate2-4.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e962c9dc3ddfacf60f2467bea5f91f75239c3d9c17656e4b0c569d956d662b99"}, + {file = "ctranslate2-4.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:49a0d9136d577b667c1bb450267248d9cf205b5eb28b89b3f70c296ec5285da8"}, + {file = "ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:343b24fe3d8a5b6a7c8082332415767bef7ceaf15bb43d0cec7e83665108c51e"}, + {file = "ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d95ecb440e4985cad4623a1fe7bb91406bab4aa55b00aa89a0c16eb5939d640"}, + {file = "ctranslate2-4.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:febf7cf0fb641c76035cdece58e97d27f4e8950a5e32fc480f9afa1bcbbb856c"}, + {file = "ctranslate2-4.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a49dc5d339e2f4ed016553db0d0e6cbd369742697c87c6cc0cc15a47c7c72d00"}, + {file = "ctranslate2-4.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:def98f6f8900470b2cec9408e5b0402af75f40f771391ebacd2b60666b8d75b9"}, + {file = "ctranslate2-4.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30c02fcd5a7be93bf42a8adf81a9ac4f394e23bd639192907b2e11feae589971"}, + {file = 
"ctranslate2-4.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a06043910a7dee91ea03634be2cff2e1338a9f87bb51e062c03bae69e2c826b6"}, + {file = "ctranslate2-4.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:6f49834b63848f17dfdc1b2b8c632c31932ad69e130ce0f7b1e2505aa3923e6c"}, + {file = "ctranslate2-4.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fcf649d976070ddd33cdda00a7a60fde6f1fbe27d65d2c6141dd95153f965f01"}, + {file = "ctranslate2-4.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f63f779f1d4518acdc694b1938887d4f28613ac2dfe507ccc2c0d56dd8c95b40"}, + {file = "ctranslate2-4.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68301fbc5fb7daa609eb12ca6c2ed8aa29852c20f962532317762d1889e751d9"}, + {file = "ctranslate2-4.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45c5b352783bd3806f0c9f5dcbfa49d89c0dde71cb7d1b1c527c525e85af3ded"}, + {file = "ctranslate2-4.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:08626f115d5a39c56a666680735d6eebfc4d8a215288896d4d8afc14cfcdcffe"}, + {file = "ctranslate2-4.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e40d43c5f7d25f40d31cca0541cf21c2846f89509b99189d340fdee595391196"}, + {file = "ctranslate2-4.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f352bcb802ab9ff1b94a25b4915c4f9f97cdd230993cf45ea290592d8997c2e2"}, + {file = "ctranslate2-4.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c202011fa2ebb8129ba98a65df48df075f0ef53f905f2b13b8cd00f31c7ccff"}, + {file = "ctranslate2-4.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bca2ce519c497bc2f79e567093609d7bdfaff3313220e0d831797288803f3aa"}, + {file = "ctranslate2-4.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:ef812a4129e877f64f8ca2438b6247060af0f053a56b438dbfa81dae9ca12675"}, + {file = "ctranslate2-4.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d8679354547260db999c2bcc6f11a31dad828c3d896d6120045bd0333940732f"}, + {file = "ctranslate2-4.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:60bc176dd2e0ee6ddd33682401440f7626d115fed4f1e5e6816d9f7f213d1a62"}, + {file = "ctranslate2-4.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d394367fe472b6540489e3b081fc7e17cea2264075b074fb28eca30ff63463f"}, + {file = "ctranslate2-4.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f1fd426d9019198d0fd8f37a18bf9c486241f711d597686956c58cd7676d564"}, + {file = "ctranslate2-4.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:de05e33790d72492a76101a0357c3d87d97ad53af84417c78f45e85df76d39e8"}, +] + +[package.dependencies] +numpy = "*" +pyyaml = ">=5.3,<7" +setuptools = "*" + [[package]] name = "decorator" version = "5.1.1" @@ -817,6 +914,28 @@ typer = ">=0.12.3" [package.extras] standard = ["fastapi", "uvicorn[standard] (>=0.15.0)"] +[[package]] +name = "faster-whisper" +version = "1.0.2" +description = "Faster Whisper transcription with CTranslate2" +optional = false +python-versions = ">=3.8" +files = [ + {file = "faster-whisper-1.0.2.tar.gz", hash = "sha256:54d9fc698f7c665e00a0d5ed65d6e975b72a8862b8214f20a22e79b115c41511"}, + {file = "faster_whisper-1.0.2-py3-none-any.whl", hash = "sha256:d968c289222e766a49ed97eecec24e934bdef405183f57d6d434a364bb3569c1"}, +] + +[package.dependencies] +av = ">=11.0,<13" +ctranslate2 = ">=4.0,<5" +huggingface-hub = ">=0.13" +onnxruntime = ">=1.14,<2" +tokenizers = ">=0.13,<1" + +[package.extras] +conversion = ["transformers[torch] (>=4.23)"] +dev = ["black 
(==23.*)", "flake8 (==6.*)", "isort (==5.*)", "pytest (==7.*)"] + [[package]] name = "filelock" version = "3.14.0" @@ -4935,4 +5054,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "f8851f3ea1cdfc285ecd3281914ea167219f602416c140724e37ae3a63d2675c" +content-hash = "70e87c46c783b44ad19f49a08a06b8cb1e90ab33467b80cb32a6f1ee9b221616" diff --git a/pyproject.toml b/pyproject.toml index a0f3e54..d90b56f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ fastapi-cache2 = {extras = ["redis"], version = "^0.2.1"} aiomcache = "^0.8.2" autocorrect = "^2.6.1" whisper-cpp-python = "^0.2.0" +faster-whisper = "^1.0.2" [build-system] From 84affe88b1f807f61a590150bc1bc1007529a261 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Mon, 24 Jun 2024 10:50:03 +0300 Subject: [PATCH 32/39] add lost env vars --- inference/.env.dist | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/.env.dist b/inference/.env.dist index 0281e6a..70e8bb6 100644 --- a/inference/.env.dist +++ b/inference/.env.dist @@ -1,3 +1,4 @@ CLIP_MODEL=laion/CLIP-ViT-g-14-laion2B-s12B-b42K -WHISPER_PATH=/app/model_data/ggml-large-v3.bin +WHISPER_MODEL=distil-whisper/distil-large-v3 SUMMARIZATION_MODEL=facebook/bart-large-cnn +TRANSLATION_MODEL=Helsinki-NLP/opus-mt-ru-en From bc710eb8a46f99d81f9aeea4393972e346c381e4 Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Mon, 24 Jun 2024 11:31:18 +0300 Subject: [PATCH 33/39] fix snapshot loading --- inference/whisper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/whisper.py b/inference/whisper.py index 4937c9f..228322f 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -3,6 +3,7 @@ import logging from typing import Callable +from huggingface_hub import snapshot_download import requests from transformers import Pipeline, pipeline from faster_whisper import WhisperModel @@ -19,7 +20,7 @@ class WhisperService: _whisper: WhisperModel = field( default_factory=lambda: WhisperModel( - Settings.whisper_model, + snapshot_download(Settings.whisper_model), device="cpu", compute_type="float16", cpu_threads=8, From 2dc3f8779662c25177985bb2e341bcb7f801d52e Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Mon, 24 Jun 2024 12:21:44 +0300 Subject: [PATCH 34/39] change whisper model --- inference/.env.dist | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/.env.dist b/inference/.env.dist index 70e8bb6..1f358fd 100644 --- a/inference/.env.dist +++ b/inference/.env.dist @@ -1,4 +1,4 @@ CLIP_MODEL=laion/CLIP-ViT-g-14-laion2B-s12B-b42K -WHISPER_MODEL=distil-whisper/distil-large-v3 +WHISPER_MODEL=flyingleafe/faster-whisper-large-v3 SUMMARIZATION_MODEL=facebook/bart-large-cnn TRANSLATION_MODEL=Helsinki-NLP/opus-mt-ru-en From e39bc1524e035c377032bd57068107324afe624b Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Mon, 24 Jun 2024 12:27:27 +0300 Subject: [PATCH 35/39] change quantization to int8 --- inference/whisper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/whisper.py b/inference/whisper.py index 228322f..b094ad2 100644 --- a/inference/whisper.py +++ b/inference/whisper.py @@ -22,7 +22,7 @@ class WhisperService: default_factory=lambda: WhisperModel( snapshot_download(Settings.whisper_model), device="cpu", - compute_type="float16", + compute_type="int8", cpu_threads=8, num_workers=4, ) From a07935a9c5e047808d72fe87c432699bb14dfa4c Mon Sep 17 00:00:00 2001 From: 
Antony Redman Date: Mon, 24 Jun 2024 12:31:35 +0300 Subject: [PATCH 36/39] add sentencepiece to list of deps --- inference/requirements.txt | 1 + poetry.lock | 64 +++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/inference/requirements.txt b/inference/requirements.txt index 0ce7ad4..252865d 100644 --- a/inference/requirements.txt +++ b/inference/requirements.txt @@ -11,3 +11,4 @@ opencv-python==4.10.0.82 environs==11.0.0 faster-whisper==1.0.2 PyYAML>=6.0 +sentencepiece==0.2.0 diff --git a/poetry.lock b/poetry.lock index 9c7f402..1120b4a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3891,6 +3891,68 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pyde doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +[[package]] +name = "sentencepiece" +version = "0.2.0" +description = "SentencePiece python wrapper" +optional = false +python-versions = "*" +files = [ + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7b67e724bead13f18db6e1d10b6bbdc454af574d70efbb36f27d90387be1ca3"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fde4b08cfe237be4484c6c7c2e2c75fb862cfeab6bd5449ce4caeafd97b767a"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c378492056202d1c48a4979650981635fd97875a00eabb1f00c6a236b013b5e"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1380ce6540a368de2ef6d7e6ba14ba8f3258df650d39ba7d833b79ee68a52040"}, + {file = "sentencepiece-0.2.0-cp310-cp310-win32.whl", hash = "sha256:a1151d6a6dd4b43e552394aed0edfe9292820272f0194bd56c7c1660a0c06c3d"}, + {file = "sentencepiece-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:d490142b0521ef22bc1085f061d922a2a6666175bb6b42e588ff95c0db6819b2"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:17982700c4f6dbb55fa3594f3d7e5dd1c8659a274af3738e33c987d2a27c9d5c"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7c867012c0e8bcd5bdad0f791609101cb5c66acb303ab3270218d6debc68a65e"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fd6071249c74f779c5b27183295b9202f8dedb68034e716784364443879eaa6"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f90c55a65013cbb8f4d7aab0599bf925cde4adc67ae43a0d323677b5a1c6cb"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b293734059ef656dcd65be62ff771507bea8fed0a711b6733976e1ed3add4553"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e58b47f933aca74c6a60a79dcb21d5b9e47416256c795c2d58d55cec27f9551d"}, + 
{file = "sentencepiece-0.2.0-cp311-cp311-win32.whl", hash = "sha256:c581258cf346b327c62c4f1cebd32691826306f6a41d8c4bec43b010dee08e75"}, + {file = "sentencepiece-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0993dbc665f4113017892f1b87c3904a44d0640eda510abcacdfb07f74286d36"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ea5f536e32ea8ec96086ee00d7a4a131ce583a1b18d130711707c10e69601cb2"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0cb51f53b6aae3c36bafe41e86167c71af8370a039f542c43b0cce5ef24a68c"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3212121805afc58d8b00ab4e7dd1f8f76c203ddb9dc94aa4079618a31cf5da0f"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a3149e3066c2a75e0d68a43eb632d7ae728c7925b517f4c05c40f6f7280ce08"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:632f3594d3e7ac8b367bca204cb3fd05a01d5b21455acd097ea4c0e30e2f63d7"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f295105c6bdbb05bd5e1b0cafbd78ff95036f5d3641e7949455a3f4e5e7c3109"}, + {file = "sentencepiece-0.2.0-cp312-cp312-win32.whl", hash = "sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251"}, + {file = "sentencepiece-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a673a72aab81fef5ebe755c6e0cc60087d1f3a4700835d40537183c1703a45f"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4547683f330289ec4f093027bfeb87f9ef023b2eb6f879fdc4a8187c7e0ffb90"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd6175f7eaec7142d2bf6f6597ce7db4c9ac89acf93fcdb17410c3a8b781eeb"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:859ba1acde782609a0910a26a60e16c191a82bf39b5621107552c0cd79fad00f"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcbbef6cc277f8f18f36959e305f10b1c620442d75addc79c21d7073ae581b50"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-win32.whl", hash = "sha256:536b934e244829e3fe6c4f198652cd82da48adb9aa145c9f00889542726dee3d"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0a91aaa3c769b52440df56fafda683b3aa48e3f2169cf7ee5b8c8454a7f3ae9b"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:787e480ca4c1d08c9985a7eb1eae4345c107729c99e9b5a9a00f2575fc7d4b4b"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4d158189eb2ecffea3a51edf6d25e110b3678ec47f1a40f2d541eafbd8f6250"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1e5ca43013e8935f25457a4fca47e315780172c3e821b4b13a890668911c792"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7140d9e5a74a0908493bb4a13f1f16a401297bd755ada4c707e842fbf6f0f5bf"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-win32.whl", hash = "sha256:6cf333625234f247ab357b0bd9836638405ea9082e1543d5b8408f014979dcbf"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:20813a68d4c221b1849c62c30e1281ea81687894d894b8d4a0f4677d9311e0f5"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:926ef920ae2e8182db31d3f5d081ada57804e3e1d3a8c4ef8b117f9d9fb5a945"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:89f65f69636b7e9c015b79dff9c9985a9bc7d19ded6f79ef9f1ec920fdd73ecf"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f67eae0dbe6f2d7d6ba50a354623d787c99965f068b81e145d53240198021b0"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98501e075f35dd1a1d5a20f65be26839fcb1938752ec61539af008a5aa6f510b"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3d1d2cc4882e8d6a1adf9d5927d7716f80617fc693385661caff21888972269"}, + {file = "sentencepiece-0.2.0-cp38-cp38-win32.whl", hash = "sha256:b99a308a2e5e569031ab164b74e6fab0b6f37dfb493c32f7816225f4d411a6dd"}, + {file = "sentencepiece-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:cdb701eec783d3ec86b7cd4c763adad8eaf6b46db37ee1c36e5e6c44b3fe1b5f"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1e0f9c4d0a6b0af59b613175f019916e28ade076e21242fd5be24340d8a2f64a"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:298f21cc1366eb60311aedba3169d30f885c363ddbf44214b0a587d2908141ad"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f1ec95aa1e5dab11f37ac7eff190493fd87770f7a8b81ebc9dd768d1a3c8704"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b06b70af54daa4b4904cbb90b4eb6d35c9f3252fdc86c9c32d5afd4d30118d8"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22e37bac44dd6603388cb598c64ff7a76e41ca774646f21c23aadfbf5a2228ab"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5"}, + {file = "sentencepiece-0.2.0-cp39-cp39-win32.whl", hash = "sha256:38aed822fb76435fa1f12185f10465a94ab9e51d5e8a9159e9a540ce926f0ffd"}, + {file = "sentencepiece-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8cf876516548b5a1d6ac4745d8b554f5c07891d55da557925e5c13ff0b4e6ad"}, + {file = "sentencepiece-0.2.0.tar.gz", hash = "sha256:a52c19171daaf2e697dc6cbe67684e0fa341b1248966f6aebb541de654d15843"}, +] + [[package]] name = "setuptools" version = "70.0.0" @@ -5054,4 +5116,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "70e87c46c783b44ad19f49a08a06b8cb1e90ab33467b80cb32a6f1ee9b221616" +content-hash = "d9abc569af387cd74eb903c00e081e203ea028606bde4e4e848121ce19a9d2f9" diff --git a/pyproject.toml b/pyproject.toml index d90b56f..a5ea6e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ aiomcache = "^0.8.2" autocorrect = "^2.6.1" whisper-cpp-python = "^0.2.0" faster-whisper = "^1.0.2" +sentencepiece = "^0.2.0" [build-system] From 87db31fcf9a0f0fd28b175f35de6d4cb72ea0ceb Mon Sep 17 00:00:00 2001 From: Antony Redman Date: Mon, 24 Jun 2024 20:03:57 +0300 Subject: [PATCH 37/39] scrap whisper for time being --- inference/.env.dist | 3 -- inference/deps.py | 8 ----- inference/frame_video.py | 13 -------- inference/main.py | 12 +------ inference/requirements.txt | 1 - inference/settings.py 
| 1 - inference/translator.py | 32 ------------------ inference/whisper.py | 68 -------------------------------------- 8 files changed, 1 insertion(+), 137 deletions(-) delete mode 100644 inference/translator.py delete mode 100644 inference/whisper.py diff --git a/inference/.env.dist b/inference/.env.dist index 1f358fd..8b47efb 100644 --- a/inference/.env.dist +++ b/inference/.env.dist @@ -1,4 +1 @@ CLIP_MODEL=laion/CLIP-ViT-g-14-laion2B-s12B-b42K -WHISPER_MODEL=flyingleafe/faster-whisper-large-v3 -SUMMARIZATION_MODEL=facebook/bart-large-cnn -TRANSLATION_MODEL=Helsinki-NLP/opus-mt-ru-en diff --git a/inference/deps.py b/inference/deps.py index d937d8a..dc76a0b 100644 --- a/inference/deps.py +++ b/inference/deps.py @@ -5,7 +5,6 @@ from fastapi import Depends, FastAPI, Request from transformers import CLIPModel, CLIPProcessor -from whisper import WhisperService from settings import Settings @@ -22,8 +21,6 @@ async def lifespan(app: FastAPI): Settings.clip_model, cache_dir="./model_cache" ) - logger.info("Setting up Whisper service...") - app.state.whisper_model = WhisperService() yield @@ -35,10 +32,5 @@ def _get_clip_processor(request: Request) -> CLIPProcessor: return request.app.state.processor -def _get_whisper(request: Request) -> WhisperService: - return request.app.state.whisper_model - - Processor = Annotated[CLIPProcessor, Depends(_get_clip_processor)] Model = Annotated[CLIPModel, Depends(_get_clip_model)] -Whisper = Annotated[WhisperService, Depends(_get_whisper)] diff --git a/inference/frame_video.py b/inference/frame_video.py index e61cd13..374cc20 100644 --- a/inference/frame_video.py +++ b/inference/frame_video.py @@ -57,16 +57,3 @@ def create_frame_in_ram(video_path: str, timecode: str) -> BytesIO: process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) frame_data, _ = process.communicate() return BytesIO(frame_data) - -def get_audio_in_ram(video_path: str) -> BytesIO: - command = [ - "ffmpeg", - "-i", video_path, - "-acodec", "pcm_s16le", - "-ac", "1", - "-ar", "16000", - "-" - ] - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - audio_data, _ = process.communicate() - return BytesIO(audio_data) diff --git a/inference/main.py b/inference/main.py index 0e9351a..bffefea 100644 --- a/inference/main.py +++ b/inference/main.py @@ -2,7 +2,7 @@ from fastapi import FastAPI from fastapi.responses import JSONResponse -from deps import Model, Processor, Whisper, lifespan +from deps import Model, Processor, lifespan from clip import CLIP from models import EncodeRequest, EncodeSearchRequest @@ -18,24 +18,14 @@ async def encode( request: EncodeRequest, processor: Processor, model: Model, - whisper: Whisper ): logger.info("Initializing CLIP module...") clip = CLIP(processor=processor, model=model, logger=logger) logger.info("CLIP module successfully initialized") video_features = clip(request.link, encode_type="video") - if request.description is not None: - description_features = clip(request.description, encode_type="text") - else: - description_features = None - - audio_transcription = whisper(request.link) - audio_features = clip(audio_transcription, encode_type="text") return { "video": video_features, - "audio": audio_features, - "description": description_features } @app.post("/encode-search") diff --git a/inference/requirements.txt b/inference/requirements.txt index 252865d..14962b3 100644 --- a/inference/requirements.txt +++ b/inference/requirements.txt @@ -9,6 +9,5 @@ pillow==10.3.0 scenedetect==0.6.3 
opencv-python==4.10.0.82 environs==11.0.0 -faster-whisper==1.0.2 PyYAML>=6.0 sentencepiece==0.2.0 diff --git a/inference/settings.py b/inference/settings.py index 5c5f3e0..6242992 100644 --- a/inference/settings.py +++ b/inference/settings.py @@ -7,5 +7,4 @@ class Settings: clip_model: str = env.str("CLIP_MODEL") summarization_model: str = env.str("SUMMARIZATION_MODEL") - whisper_model: str = env.str("WHISPER_MODEL") translation_model: str = env.str("TRANSLATION_MODEL") diff --git a/inference/translator.py b/inference/translator.py deleted file mode 100644 index 65a1aa2..0000000 --- a/inference/translator.py +++ /dev/null @@ -1,32 +0,0 @@ -from dataclasses import dataclass - -from transformers import AutoModelForSeq2SeqLM, AutoTokenizer - -from settings import Settings - - -@dataclass -class OpusTranslatorModel: - _model: AutoModelForSeq2SeqLM | None = None - _tokenizer: AutoTokenizer | None = None - - _model_name: str = Settings.translation_model - _device: str = "cpu" - - def __post_init__(self): - self._tokenizer = AutoTokenizer.from_pretrained( - self._model_name, - cache_dir="./model_cache" - ) - self._model = AutoModelForSeq2SeqLM.from_pretrained( - self._model_name, - cache_dir="./model_cache" - ) - - - def __call__(self, translate_query: str) -> str: - input_ids = self._tokenizer.encode(translate_query, return_tensors="pt") - output_ids = self._model.generate(input_ids.to(self._device), max_new_tokens=100) - en_text = self._tokenizer.decode(output_ids[0], skip_special_tokens=True) - - return en_text diff --git a/inference/whisper.py b/inference/whisper.py deleted file mode 100644 index b094ad2..0000000 --- a/inference/whisper.py +++ /dev/null @@ -1,68 +0,0 @@ -from dataclasses import dataclass, field -from io import BytesIO -import logging -from typing import Callable - -from huggingface_hub import snapshot_download -import requests -from transformers import Pipeline, pipeline -from faster_whisper import WhisperModel - -from frame_video import get_audio_in_ram -from translator import OpusTranslatorModel -from settings import Settings - - -model = WhisperModel - - -@dataclass -class WhisperService: - _whisper: WhisperModel = field( - default_factory=lambda: WhisperModel( - snapshot_download(Settings.whisper_model), - device="cpu", - compute_type="int8", - cpu_threads=8, - num_workers=4, - ) - ) - _translator: OpusTranslatorModel = field( - default_factory=OpusTranslatorModel - ) - _summary_pipeline: Pipeline = field( - default_factory=lambda: pipeline( - "summarization", - model=Settings.summarization_model - ) - ) - _logger: logging.Logger = field( - default_factory=lambda: logging.getLogger(__name__) - ) - _get_audio_in_ram: Callable[[str], BytesIO] = get_audio_in_ram - - def __call__(self, link: str) -> str: - self._logger.info("Converting video file to transcript") - video_data = BytesIO(requests.get(link).content) - segments, info = self._whisper.transcribe( - video_data, - language="ru", - beam_size=5 - ) - if info.language_probability < 0.5: - self._logger.info( - "Cannot properly identify speech, probability=%s, returning empty string", - info.language_probability - ) - return "" - self._logger.info("summarizing transcript into 77 CLIP tokens") - full_translation = "" - for segment in segments: - if segment.no_speech_prob > 0.5: - continue - translated_segment = self._translator(segment.text) - full_translation += " " + translated_segment - summary = self._summary_pipeline(full_translation, max_length=77) - result: str = summary[0]["summary_text"] # type: ignore - 
self._logger.info("Processed video file into text description: %s, total length: %s", result, len(result)) - return result From 9aa10fdfd620a987cdccd5d799977cc54cc9031c Mon Sep 17 00:00:00 2001 From: Arseny Chebyshev Date: Tue, 25 Jun 2024 01:40:39 +0500 Subject: [PATCH 38/39] (upd) Key frame search, trunc text for features --- inference/clip.py | 1 + inference/frame_video.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/inference/clip.py b/inference/clip.py index 5109475..439ca56 100644 --- a/inference/clip.py +++ b/inference/clip.py @@ -27,6 +27,7 @@ def __call__(self, encode_source: str, encode_type: Literal["text"] | Literal["v return self._encode_video(encode_source) def _encode_text(self, description: str) -> list[float]: + description = description[:65] # meet the processor max length text_inputs = self.processor(text=[description], return_tensors="pt", padding=True) with torch.no_grad(): text_features = self.model.get_text_features(**text_inputs) diff --git a/inference/frame_video.py b/inference/frame_video.py index 374cc20..d5a089d 100644 --- a/inference/frame_video.py +++ b/inference/frame_video.py @@ -4,7 +4,7 @@ from io import BytesIO from dataclasses import dataclass import requests -from scenedetect import detect, ContentDetector +from scenedetect import detect, ContentDetector, AdaptiveDetector @dataclass class VideoFrame: @@ -14,6 +14,7 @@ class VideoFrame: def create_key_frames_for_video( video_link: str, frame_change_threshold: float = 7.5, + min_scene_len: int = 10, num_of_thumbnails: int = 10 ) -> list[VideoFrame]: frames: list[VideoFrame] = [] @@ -21,12 +22,16 @@ def create_key_frames_for_video( with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file: tmp_file.write(video_data.getvalue()) video_path = tmp_file.name - scenes = detect(video_path, ContentDetector(threshold=frame_change_threshold)) + scenes = detect( + video_path=video_path, + detector=ContentDetector(threshold=frame_change_threshold, min_scene_len=min_scene_len) + ) - # Gradually reduce number of key frames with a sliding window + # Gradually reduce number of key frames with a increasingly smaller steps while len(scenes) > num_of_thumbnails: - scenes.pop() - scenes.pop(0) + step = len(scenes) / (num_of_thumbnails - 1) + to_remove_indices = [int(round(i * step)) for i in range(num_of_thumbnails)] + scenes = [scenes[i] for i in range(len(scenes)) if i not in to_remove_indices] for i, scene in enumerate(scenes): scene_start, _ = scene frame_data = create_frame_in_ram(video_path, scene_start.get_timecode()) @@ -39,6 +44,7 @@ def create_key_frames_for_video( return create_key_frames_for_video( video_link=video_link, frame_change_threshold=frame_change_threshold - 2.5, + min_scene_len=min_scene_len - 2 if min_scene_len > 2 else min_scene_len, num_of_thumbnails=num_of_thumbnails ) return frames From 7d17375ba0db399a2def10eabe8e59bf63a676a5 Mon Sep 17 00:00:00 2001 From: Arseny Chebyshev Date: Tue, 25 Jun 2024 02:03:28 +0500 Subject: [PATCH 39/39] (upd) Key frame search, trunc text for features --- inference/.env.dist | 1 + inference/settings.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/inference/.env.dist b/inference/.env.dist index 8b47efb..76b4fdc 100644 --- a/inference/.env.dist +++ b/inference/.env.dist @@ -1 +1,2 @@ CLIP_MODEL=laion/CLIP-ViT-g-14-laion2B-s12B-b42K +TRANSLATION_MODEL=Helsinki-NLP/opus-mt-ru-en diff --git a/inference/settings.py b/inference/settings.py index 6242992..55ade0a 100644 --- 
a/inference/settings.py +++ b/inference/settings.py @@ -5,6 +5,5 @@ class Settings: - clip_model: str = env.str("CLIP_MODEL") - summarization_model: str = env.str("SUMMARIZATION_MODEL") - translation_model: str = env.str("TRANSLATION_MODEL") + clip_model: str = env.str("CLIP_MODEL", default="laion/CLIP-ViT-g-14-laion2B-s12B-b42K") + translation_model: str = env.str("TRANSLATION_MODEL", default="Helsinki-NLP/opus-mt-ru-en")
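
On the description[:65] truncation in PATCH 38: 65 characters is a rough
character-level proxy for the CLIP text encoder's 77-token context window.
If token-level truncation is preferred, the Hugging Face tokenizer can do
it directly — a sketch assuming self.processor is the same CLIPProcessor
already held by the CLIP dataclass, not what the patch actually does:

    # Let the tokenizer truncate at its own model_max_length (77 for CLIP)
    # instead of guessing a safe character count.
    text_inputs = self.processor(
        text=[description],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )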
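
The scene-thinning loop from PATCH 38 is easiest to verify in isolation.
Below is a minimal, runnable sketch of the same pass; the function name
and the fake timecode strings are illustrative only — the real input is
the scene list returned by scenedetect.detect:

    def thin_scenes(scenes: list, num_of_thumbnails: int = 10) -> list:
        # Each pass drops up to num_of_thumbnails evenly spaced entries,
        # repeating until the list is at or below the target size.
        while len(scenes) > num_of_thumbnails:
            step = len(scenes) / (num_of_thumbnails - 1)
            to_remove = {int(round(i * step)) for i in range(num_of_thumbnails)}
            scenes = [s for i, s in enumerate(scenes) if i not in to_remove]
        return scenes

    if __name__ == "__main__":
        fake_scenes = [f"00:00:{i:02d}" for i in range(37)]
        print(thin_scenes(fake_scenes))  # at most 10 evenly spread timecodes survive

Because one pass can remove up to num_of_thumbnails entries at once, an
input only slightly above the target can end up below it (e.g. 12 scenes
thin down to 3 with the default of 10); if hitting the count exactly
matters, selecting num_of_thumbnails evenly spaced scenes in a single
pass avoids the undershoot.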