From 25a76f878a73b7d96368b0d6dc80518bc187991d Mon Sep 17 00:00:00 2001 From: llk214 Date: Mon, 9 Feb 2026 02:36:04 +0800 Subject: [PATCH] Add files via upload switched from pytorch to FastEmbed for smaller size and faster response. --- gui.py | 429 ++++++++++++++++++++++++++++++++--------------- locator.py | 335 +++++++++++++++++++----------------- requirements.txt | 8 +- 3 files changed, 481 insertions(+), 291 deletions(-) diff --git a/gui.py b/gui.py index c7c84c2..491757e 100644 --- a/gui.py +++ b/gui.py @@ -4,8 +4,30 @@ """ import sys +import os import io +# Suppress HuggingFace symlink warning on Windows +os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1" + +# ---------------------------- +# FastEmbed cache configuration +# ---------------------------- +from pathlib import Path + +def _default_fastembed_cache_dir() -> str: + """Choose a persistent cache directory for FastEmbed (avoid system Temp).""" + home = str(Path.home()) + if os.name == "nt": + base = os.environ.get("LOCALAPPDATA") or home + return os.path.join(base, "Locus", "fastembed_cache") + if sys.platform == "darwin": + return os.path.join(home, "Library", "Caches", "Locus", "fastembed_cache") + return os.path.join(home, ".cache", "Locus", "fastembed_cache") + +os.environ.setdefault("FASTEMBED_CACHE_PATH", _default_fastembed_cache_dir()) +os.makedirs(os.environ["FASTEMBED_CACHE_PATH"], exist_ok=True) + # Fix for PyInstaller + sentence-transformers (isatty error) if getattr(sys, 'frozen', False): if sys.stdout is None or not hasattr(sys.stdout, 'isatty'): @@ -122,12 +144,37 @@ def close(self): ctk.set_default_color_theme("blue") # "blue", "green", "dark-blue" +def get_app_dir(): + """Get the directory where the app is running from.""" + if getattr(sys, 'frozen', False): + # Running as compiled exe + return os.path.dirname(sys.executable) + else: + # Running as script + return os.path.dirname(os.path.abspath(__file__)) + + def open_pdf_at_page(pdf_path: str, page_num: int): - """Open PDF at specific 
page using system default viewer.""" + """Open PDF at specific page using bundled or system PDF viewer.""" system = platform.system() pdf_path = os.path.abspath(pdf_path) if system == "Windows": + # Check for bundled SumatraPDF (multiple possible locations) + app_dir = get_app_dir() + bundled_sumatra_paths = [ + os.path.join(app_dir, "_internal", "SumatraPDF", "SumatraPDF.exe"), + os.path.join(app_dir, "_internal", "SumatraPDF.exe"), + os.path.join(app_dir, "SumatraPDF", "SumatraPDF.exe"), + os.path.join(app_dir, "SumatraPDF.exe"), + ] + + for sumatra in bundled_sumatra_paths: + if os.path.exists(sumatra): + subprocess.Popen([sumatra, "-page", str(page_num), pdf_path]) + return True + + # Fall back to system-installed SumatraPDF sumatra_paths = [ r"C:\Program Files\SumatraPDF\SumatraPDF.exe", r"C:\Program Files (x86)\SumatraPDF\SumatraPDF.exe", @@ -364,23 +411,24 @@ def _create_widgets(self): ctk.CTkLabel(left_options, text="Quality:", font=("Segoe UI", 11)).pack(side="left", padx=(0, 4)) + # Bundled model name (included with app, no download needed) + self.bundled_model = "BAAI/bge-small-en-v1.5" + self.quality_options = { - "⚑ Fast": "sentence-transformers/all-MiniLM-L6-v2", "βš–οΈ Balanced": "BAAI/bge-small-en-v1.5", "🎯 High Accuracy": "BAAI/bge-base-en-v1.5", "πŸš€ Best": "BAAI/bge-large-en-v1.5", - "🌍 Multilingual": "BAAI/bge-m3" + "🌍 Multilingual": "intfloat/multilingual-e5-large" } self.quality_sizes = { - "⚑ Fast": "80MB", - "βš–οΈ Balanced": "130MB", - "🎯 High Accuracy": "440MB", - "πŸš€ Best": "1.3GB", + "βš–οΈ Balanced": "Built-in", + "🎯 High Accuracy": "210MB", + "πŸš€ Best": "1.2GB", "🌍 Multilingual": "2.2GB" } - self.quality_var = tk.StringVar(value="⚑ Fast") + self.quality_var = tk.StringVar(value="βš–οΈ Balanced") self.quality_menu = ctk.CTkOptionMenu(left_options, variable=self.quality_var, values=list(self.quality_options.keys()), width=145, height=26, corner_radius=6, @@ -459,7 +507,6 @@ def _browse_dir(self): def _on_quality_change(self, 
choice): info_map = { - "⚑ Fast": "4GB RAM", "βš–οΈ Balanced": "4GB RAM", "🎯 High Accuracy": "8GB RAM", "πŸš€ Best": "16GB RAM", @@ -471,18 +518,97 @@ def _on_quality_change(self, choice): if self.locator: self.status_var.set("Quality changed - click 'Load Index' to apply") + def _get_fastembed_cache_locations(self): + """Get FastEmbed cache locations (current + legacy).""" + import tempfile + + locations = [] + + # Preferred: app-managed persistent cache + env_path = os.environ.get("FASTEMBED_CACHE_PATH") + if env_path: + locations.append(env_path) + + # Legacy / fallback locations (in case user already downloaded models there) + temp_dir = tempfile.gettempdir() + locations.append(os.path.join(temp_dir, "fastembed_cache")) + + if os.name == "nt": + localappdata = os.environ.get("LOCALAPPDATA", "") + if localappdata: + locations.append(os.path.join(localappdata, "Temp", "fastembed_cache")) + temp_env = os.environ.get("TEMP", "") + if temp_env: + locations.append(os.path.join(temp_env, "fastembed_cache")) + + locations.append(os.path.expanduser("~/.cache/fastembed_cache")) + + # Remove duplicates while preserving order + seen = set() + unique_locations = [] + for loc in locations: + normalized = os.path.normpath(loc) + if normalized and normalized not in seen: + seen.add(normalized) + unique_locations.append(normalized) + + return unique_locations + + def _is_bundled_model(self, model_name): + """Check if model is the bundled one.""" + return model_name == self.bundled_model + + def _get_bundled_model_path(self): + """Get path to bundled model if it exists.""" + if getattr(sys, 'frozen', False): + # Running as PyInstaller exe + base_path = getattr(sys, '_MEIPASS', os.path.dirname(sys.executable)) + possible_paths = [ + os.path.join(base_path, '_internal', 'models', 'bge-small-en-v1.5'), + os.path.join(base_path, 'models', 'bge-small-en-v1.5'), + os.path.join(os.path.dirname(sys.executable), '_internal', 'models', 'bge-small-en-v1.5'), + 
os.path.join(os.path.dirname(sys.executable), 'models', 'bge-small-en-v1.5'), + ] + else: + # Running as script + script_dir = os.path.dirname(os.path.abspath(__file__)) + possible_paths = [ + os.path.join(script_dir, 'models', 'bge-small-en-v1.5'), + ] + + for path in possible_paths: + if os.path.exists(path): + for root, dirs, files in os.walk(path): + if 'model.onnx' in files or 'model_optimized.onnx' in files: + return path + return None + def _is_model_downloaded(self, model_name): - """Check if model exists in HuggingFace cache.""" - import os - cache_dir = os.path.expanduser("~/.cache/huggingface/hub") - if not os.path.exists(cache_dir): - return False - - # Convert model name to cache folder format - # e.g., "sentence-transformers/all-MiniLM-L6-v2" -> "models--sentence-transformers--all-MiniLM-L6-v2" - model_folder = "models--" + model_name.replace("/", "--") - model_path = os.path.join(cache_dir, model_folder) - return os.path.exists(model_path) + """Check if model exists in FastEmbed cache or is bundled.""" + # Bundled model is always available + if self._is_bundled_model(model_name) and self._get_bundled_model_path(): + return True + + # FastEmbed uses format like "models--qdrant--bge-small-en-v1.5-onnx" + model_short = model_name.split("/")[-1] # e.g., "bge-small-en-v1.5" + + for cache_dir in self._get_fastembed_cache_locations(): + if not os.path.exists(cache_dir): + continue + + try: + for folder in os.listdir(cache_dir): + # Check various naming patterns FastEmbed might use + if model_short in folder or model_short.replace("-", "_") in folder: + folder_path = os.path.join(cache_dir, folder) + # Check if model.onnx exists (actual model file) + for root, dirs, files in os.walk(folder_path): + if 'model.onnx' in files or 'model_optimized.onnx' in files: + return True + except (PermissionError, OSError): + continue + + return False def _update_model_status(self): """Update the download status indicator and action button.""" @@ -490,14 +616,23 @@ def 
_update_model_status(self): model_name = self.quality_options.get(quality) size = self.quality_sizes.get(quality, "") + # Check if it's the bundled model + is_bundled = self._is_bundled_model(model_name) + if self._is_model_downloaded(model_name): - self.quality_status_var.set("βœ…") - self.quality_status_label.configure(text_color="green") - self.model_action_btn.configure(text="πŸ—‘οΈ", command=self._delete_current_model) + if is_bundled: + self.quality_status_var.set("πŸ“¦") # Package icon for bundled + self.quality_status_label.configure(text_color="blue") + # Don't allow deleting bundled model + self.model_action_btn.configure(text="πŸ“¦", command=lambda: None, state="disabled") + else: + self.quality_status_var.set("βœ…") + self.quality_status_label.configure(text_color="green") + self.model_action_btn.configure(text="πŸ—‘οΈ", command=self._delete_current_model, state="normal") else: self.quality_status_var.set(f"⬇️ {size}") self.quality_status_label.configure(text_color="orange") - self.model_action_btn.configure(text="⬇️", command=self._download_model) + self.model_action_btn.configure(text="⬇️", command=self._download_model, state="normal") def _delete_current_model(self): """Delete the currently selected model.""" @@ -518,6 +653,7 @@ def _download_model(self): if self._is_model_downloaded(model_name): self.status_var.set(f"βœ… {quality} model already downloaded") + self._update_model_status() return # Animation flag @@ -529,7 +665,7 @@ def animate_status(): frames = ["⬇️ Downloading", "⬇️ Downloading.", "⬇️ Downloading..", "⬇️ Downloading..."] i = 0 while self._downloading: - self.status_var.set(f"{frames[i % 4]} {quality} ({model_size})") + self.after(0, lambda f=frames[i % 4]: self.status_var.set(f"{f} {quality} ({model_size})")) i += 1 time.sleep(0.4) @@ -539,144 +675,150 @@ def download(): anim_thread = threading.Thread(target=animate_status, daemon=True) anim_thread.start() - # Download the model - from sentence_transformers import 
SentenceTransformer - SentenceTransformer(model_name) + self.after(0, lambda: self.status_var.set(f"⬇️ Initializing download for {quality}...")) + + # Download the model using FastEmbed + # FastEmbed automatically downloads from HuggingFace when model not cached + from fastembed import TextEmbedding + + self.after(0, lambda: self.status_var.set(f"⬇️ Downloading {quality} ({model_size})...")) + + # This triggers the download + cache_dir = os.environ.get("FASTEMBED_CACHE_PATH") + model = TextEmbedding(model_name=model_name, cache_dir=cache_dir) + + self.after(0, lambda: self.status_var.set(f"⬇️ Verifying {quality} model...")) + + # Run a dummy embed to ensure model is fully loaded and working + list(model.embed(["test"])) # Stop animation self._downloading = False - # Update UI - self._update_model_status() - self.status_var.set(f"βœ… Downloaded {quality} model successfully!") + # Update UI on main thread + self.after(0, self._update_model_status) + self.after(0, lambda: self.status_var.set(f"βœ… Downloaded {quality} model successfully!")) except Exception as e: self._downloading = False - self.status_var.set(f"❌ Download failed: {e}") - messagebox.showerror("Download Error", str(e)) - - thread = threading.Thread(target=download) + error_msg = str(e) + print(f"Download error: {error_msg}") # Console logging + self.after(0, lambda: self.status_var.set(f"❌ Download failed: {error_msg[:50]}...")) + self.after(0, lambda: messagebox.showerror("Download Error", + f"Failed to download {quality} model.\n\nError: {error_msg}\n\n" + "Please check your internet connection and try again.")) + + thread = threading.Thread(target=download, daemon=True) thread.start() def _get_download_progress(self, model_name): - """Get current download progress in MB by checking HuggingFace download locations.""" - import tempfile - - # Check multiple possible locations where HuggingFace downloads - cache_dir = os.path.expanduser("~/.cache/huggingface/hub") - model_folder = "models--" + 
model_name.replace("/", "--") - model_path = os.path.join(cache_dir, model_folder) - + """Get current download progress in MB by checking FastEmbed download locations.""" total_size = 0 + model_short = model_name.split("/")[-1] - # 1. Check the model cache directory - if os.path.exists(model_path): - for dirpath, dirnames, filenames in os.walk(model_path): - for f in filenames: - fp = os.path.join(dirpath, f) - try: - total_size += os.path.getsize(fp) - except: - pass - - # 2. Check HuggingFace temp download directory - hf_temp = os.path.join(cache_dir, ".tmp") - if os.path.exists(hf_temp): - for dirpath, dirnames, filenames in os.walk(hf_temp): - for f in filenames: - fp = os.path.join(dirpath, f) - try: - total_size += os.path.getsize(fp) - except: - pass - - # 3. Check system temp for any huggingface downloads - temp_dir = tempfile.gettempdir() - for item in os.listdir(temp_dir): - if 'huggingface' in item.lower() or 'hf' in item.lower(): - item_path = os.path.join(temp_dir, item) - try: - if os.path.isfile(item_path): - total_size += os.path.getsize(item_path) - elif os.path.isdir(item_path): - for dirpath, dirnames, filenames in os.walk(item_path): + for cache_dir in self._get_fastembed_cache_locations(): + if not os.path.exists(cache_dir): + continue + + for folder in os.listdir(cache_dir): + if model_short in folder: + folder_path = os.path.join(cache_dir, folder) + if os.path.isdir(folder_path): + for dirpath, dirnames, filenames in os.walk(folder_path): for f in filenames: fp = os.path.join(dirpath, f) try: total_size += os.path.getsize(fp) except: pass - except: - pass return total_size / (1024 * 1024) # Convert to MB def _get_model_cache_size(self, model_name): """Get the size of a fully cached model in MB.""" - cache_dir = os.path.expanduser("~/.cache/huggingface/hub") - model_folder = "models--" + model_name.replace("/", "--") - model_path = os.path.join(cache_dir, model_folder) - - if not os.path.exists(model_path): - return 0 - total_size = 0 - for 
dirpath, dirnames, filenames in os.walk(model_path): - for f in filenames: - fp = os.path.join(dirpath, f) - try: - total_size += os.path.getsize(fp) - except: - pass + model_short = model_name.split("/")[-1] + + for cache_dir in self._get_fastembed_cache_locations(): + if not os.path.exists(cache_dir): + continue + + for folder in os.listdir(cache_dir): + if model_short in folder: + folder_path = os.path.join(cache_dir, folder) + if os.path.isdir(folder_path): + for dirpath, dirnames, filenames in os.walk(folder_path): + for f in filenames: + fp = os.path.join(dirpath, f) + try: + total_size += os.path.getsize(fp) + except: + pass return total_size / (1024 * 1024) # Convert to MB def _delete_model(self, model_name): - """Delete a cached model.""" + """Delete a cached model from FastEmbed cache (current + legacy).""" import shutil - cache_dir = os.path.expanduser("~/.cache/huggingface/hub") - model_folder = "models--" + model_name.replace("/", "--") - model_path = os.path.join(cache_dir, model_folder) - - if os.path.exists(model_path): - shutil.rmtree(model_path) - return True - return False + deleted = False + + model_short = model_name.split("/")[-1] + + for cache_dir in self._get_fastembed_cache_locations(): + if not os.path.exists(cache_dir): + continue + + try: + for folder in os.listdir(cache_dir): + # FastEmbed/HF-style folder names include the model name in some form + if model_short in folder or model_short.replace("-", "_") in folder: + folder_path = os.path.join(cache_dir, folder) + try: + shutil.rmtree(folder_path, ignore_errors=False) + deleted = True + except PermissionError: + shutil.rmtree(folder_path, ignore_errors=True) + deleted = True + except (PermissionError, OSError): + continue + + return deleted def _manage_models(self): """Show dialog to manage downloaded models.""" dialog = ctk.CTkToplevel(self) dialog.title("Manage Models") - dialog.geometry("400x350") + dialog.geometry("400x300") dialog.transient(self) dialog.grab_set() + 
dialog.resizable(False, False) # Center the dialog dialog.update_idletasks() x = self.winfo_x() + (self.winfo_width() - 400) // 2 - y = self.winfo_y() + (self.winfo_height() - 350) // 2 + y = self.winfo_y() + (self.winfo_height() - 300) // 2 dialog.geometry(f"+{x}+{y}") ctk.CTkLabel(dialog, text="Downloaded Models", font=("Segoe UI", 14, "bold")).pack(pady=(15, 10)) - # Scrollable frame for model list - models_frame = ctk.CTkScrollableFrame(dialog, height=180) + # Fixed frame for model list (no scrolling needed for 5 models) + models_frame = ctk.CTkFrame(dialog, fg_color="transparent") models_frame.pack(fill="x", padx=15, pady=5) model_names = { - "⚑ Fast": "sentence-transformers/all-MiniLM-L6-v2", "βš–οΈ Balanced": "BAAI/bge-small-en-v1.5", "🎯 High Accuracy": "BAAI/bge-base-en-v1.5", "πŸš€ Best": "BAAI/bge-large-en-v1.5", - "🌍 Multilingual": "BAAI/bge-m3" + "🌍 Multilingual": "intfloat/multilingual-e5-large" } any_downloaded = False for display_name, model_name in model_names.items(): + is_bundled = self._is_bundled_model(model_name) + if self._is_model_downloaded(model_name): any_downloaded = True - size_mb = self._get_model_cache_size(model_name) row = ctk.CTkFrame(models_frame, fg_color="transparent") row.pack(fill="x", pady=2) @@ -684,34 +826,44 @@ def _manage_models(self): ctk.CTkLabel(row, text=f"{display_name}", font=("Segoe UI", 11), anchor="w", width=120).pack(side="left", padx=(0, 10)) - ctk.CTkLabel(row, text=f"{size_mb:.0f} MB", font=("Segoe UI", 10), - text_color="gray", width=60).pack(side="left") - - def make_delete_callback(mn=model_name, dn=display_name, r=row): - def callback(): - if messagebox.askyesno("Delete Model", - f"Delete {dn}?\nYou'll need to re-download it to use this quality level."): - self._delete_model(mn) - r.destroy() - self._update_model_status() - self.status_var.set(f"Deleted {dn} model") - return callback - - ctk.CTkButton(row, text="Delete", width=60, height=24, corner_radius=4, - fg_color="#dc3545", hover_color="#c82333", - 
command=make_delete_callback()).pack(side="right") + if is_bundled: + ctk.CTkLabel(row, text="Built-in", font=("Segoe UI", 10), + text_color="blue", width=60).pack(side="left") + # No delete button for bundled model + ctk.CTkLabel(row, text="πŸ“¦", font=("Segoe UI", 10), + width=60).pack(side="right") + else: + size_mb = self._get_model_cache_size(model_name) + ctk.CTkLabel(row, text=f"{size_mb:.0f} MB", font=("Segoe UI", 10), + text_color="gray", width=60).pack(side="left") + + def make_delete_callback(mn=model_name, dn=display_name, r=row): + def callback(): + if messagebox.askyesno("Delete Model", + f"Delete {dn}?\nYou'll need to re-download it to use this quality level."): + self._delete_model(mn) + r.destroy() + self._update_model_status() + self.status_var.set(f"Deleted {dn} model") + return callback + + ctk.CTkButton(row, text="Delete", width=60, height=24, corner_radius=4, + fg_color="#dc3545", hover_color="#c82333", + command=make_delete_callback()).pack(side="right") if not any_downloaded: ctk.CTkLabel(models_frame, text="No models downloaded yet", font=("Segoe UI", 11), text_color="gray").pack(pady=20) - # Total size - total_size = sum(self._get_model_cache_size(m) for m in model_names.values()) - ctk.CTkLabel(dialog, text=f"Total: {total_size:.0f} MB", - font=("Segoe UI", 10), text_color="gray").pack(pady=5) + # Total size (exclude bundled from count) + total_size = sum(self._get_model_cache_size(m) for m in model_names.values() + if not self._is_bundled_model(m)) + ctk.CTkLabel(dialog, text=f"Downloaded: {total_size:.0f} MB", + font=("Segoe UI", 10), text_color="gray").pack(pady=(10, 5)) + # Close button at bottom ctk.CTkButton(dialog, text="Close", command=dialog.destroy, - width=80, height=28, corner_radius=6).pack(pady=10) + width=80, height=28, corner_radius=6).pack(pady=(5, 15)) if self.locator: self.status_var.set("Quality changed - click 'Load Index' to apply") @@ -786,26 +938,37 @@ def select_deep(): def _do_load_index(self, pdf_dir, 
model_name, quality, precompute=False): """Actually load the index with chosen mode.""" + def update_progress(current, total): + """Callback to update status with progress.""" + percent = int(current / total * 100) + self.after(0, lambda: self.status_var.set( + f"πŸ”¬ Deep indexing: {current}/{total} pages ({percent}%)" + )) + def load(): try: - self.status_var.set("Loading model...") + self.after(0, lambda: self.status_var.set("Step 1/2: Loading model...")) self.locator = HybridLocator(pdf_dir, model_name=model_name) if precompute: - self.status_var.set("Indexing PDF files...") + self.after(0, lambda: self.status_var.set("Step 2/3: Indexing PDF files...")) self.locator.build_index() page_count = len(self.locator.documents) - self.status_var.set(f"Computing embeddings for {page_count} pages...") - self.locator.precompute_embeddings() + self.after(0, lambda: self.status_var.set( + f"Step 3/3: Computing embeddings (0/{page_count})..." + )) + self.locator.precompute_embeddings(progress_callback=update_progress) else: - self.status_var.set("Indexing PDF files...") + self.after(0, lambda: self.status_var.set("Step 2/2: Indexing PDF files...")) self.locator.build_index() self.pdf_dir = pdf_dir page_count = len(self.locator.documents) mode = "Deep" if precompute else "Fast" - self.status_var.set(f"βœ… Ready! Indexed {page_count} pages ({mode} mode)") + self.after(0, lambda: self.status_var.set( + f"βœ… Ready! 
Indexed {page_count} pages ({mode} mode)" + )) except Exception as e: self.status_var.set(f"❌ Error: {e}") diff --git a/locator.py b/locator.py index ede8559..d165258 100644 --- a/locator.py +++ b/locator.py @@ -1,19 +1,70 @@ """ Hybrid Semantic Page Locator for Course PDFs -BM25 keyword search + Sentence Transformer reranking +BM25 keyword search + FastEmbed reranking (lightweight ONNX-based) """ import os +import sys import json import pickle from pathlib import Path from dataclasses import dataclass, field from typing import Optional + +# ---------------------------- +# Bundled model configuration +# ---------------------------- +BUNDLED_MODEL_NAME = "BAAI/bge-small-en-v1.5" + +def _get_bundled_model_path() -> Optional[str]: + """Get path to bundled model if it exists (for PyInstaller builds).""" + # Check various locations where bundled model might be + if getattr(sys, 'frozen', False): + # Running as PyInstaller exe + base_path = sys._MEIPASS if hasattr(sys, '_MEIPASS') else os.path.dirname(sys.executable) + possible_paths = [ + os.path.join(base_path, '_internal', 'models', 'bge-small-en-v1.5'), + os.path.join(base_path, 'models', 'bge-small-en-v1.5'), + os.path.join(os.path.dirname(sys.executable), '_internal', 'models', 'bge-small-en-v1.5'), + os.path.join(os.path.dirname(sys.executable), 'models', 'bge-small-en-v1.5'), + ] + else: + # Running as script + script_dir = os.path.dirname(os.path.abspath(__file__)) + possible_paths = [ + os.path.join(script_dir, 'models', 'bge-small-en-v1.5'), + ] + + for path in possible_paths: + # Check if model.onnx exists in the path + if os.path.exists(path): + for root, dirs, files in os.walk(path): + if 'model.onnx' in files or 'model_optimized.onnx' in files: + return path + return None + + +# ---------------------------- +# FastEmbed cache configuration +# ---------------------------- +def _default_fastembed_cache_dir() -> str: + """Choose a persistent cache directory for FastEmbed (avoid system Temp).""" + home = 
str(Path.home()) + if os.name == "nt": + base = os.environ.get("LOCALAPPDATA") or home + return os.path.join(base, "Locus", "fastembed_cache") + if sys.platform == "darwin": + return os.path.join(home, "Library", "Caches", "Locus", "fastembed_cache") + return os.path.join(home, ".cache", "Locus", "fastembed_cache") + +# Set once, early, so all FastEmbed usage is consistent across the app. +os.environ.setdefault("FASTEMBED_CACHE_PATH", _default_fastembed_cache_dir()) +os.makedirs(os.environ["FASTEMBED_CACHE_PATH"], exist_ok=True) + import fitz # PyMuPDF import numpy as np from rank_bm25 import BM25Okapi -from sentence_transformers import SentenceTransformer, util import re @@ -60,7 +111,10 @@ def tokenize(text: str) -> list[str]: 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or', 'because', 'until', 'while', 'this', 'that', 'these', 'those', 'it', 'its'} - return [t for t in tokens if len(t) > 1 or '\u4e00' <= t <= '\u9fff' and t not in stopwords] + def _is_cjk_char(tok: str) -> bool: + return len(tok) == 1 and '\u4e00' <= tok <= '\u9fff' + + return [t for t in tokens if (len(t) > 1 or _is_cjk_char(t)) and t not in stopwords] class PDFIndexer: @@ -130,28 +184,98 @@ def search(self, query: str, top_k: int = 20) -> list[tuple[PageDocument, float] return results +# Model name mapping: GUI names -> FastEmbed model names +MODEL_NAME_MAP = { + "BAAI/bge-small-en-v1.5": "BAAI/bge-small-en-v1.5", + "BAAI/bge-base-en-v1.5": "BAAI/bge-base-en-v1.5", + "BAAI/bge-large-en-v1.5": "BAAI/bge-large-en-v1.5", + "intfloat/multilingual-e5-large": "intfloat/multilingual-e5-large", +} + + +def cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Compute cosine similarity between query vector and document vectors.""" + # Normalize vectors + a_norm = a / np.linalg.norm(a) + b_norm = b / np.linalg.norm(b, axis=1, keepdims=True) + return np.dot(b_norm, a_norm) + + class SemanticReranker: - """Sentence transformer-based semantic reranking.""" + 
"""FastEmbed-based semantic reranking (lightweight ONNX).""" - def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"): + def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5"): + from fastembed import TextEmbedding + print(f"Loading search model: {model_name}") - print("(First run downloads the model - please wait...)") - self.model = SentenceTransformer(model_name) + + # Map model name if needed + fastembed_name = MODEL_NAME_MAP.get(model_name, model_name) + cache_dir = os.environ.get("FASTEMBED_CACHE_PATH") + + # Check if we should use bundled model + bundled_path = _get_bundled_model_path() + if bundled_path and model_name == BUNDLED_MODEL_NAME: + print(f"Using bundled model from: {bundled_path}") + # Use local_files_only to prevent download attempts + self.model = TextEmbedding( + model_name=fastembed_name, + cache_dir=cache_dir, + local_files_only=False # Still allow fallback to download + ) + # Copy bundled model to cache if not already there + self._ensure_bundled_model_in_cache(bundled_path, cache_dir, fastembed_name) + else: + print("(First run downloads the model - please wait...)") + self.model = TextEmbedding(model_name=fastembed_name, cache_dir=cache_dir) + self.model_name = model_name - # Check if this is a BGE model (needs query/passage prefixes) - self.is_bge = "bge" in model_name.lower() + # Check if this model needs query/passage prefixes + # BGE models and E5 models both use these prefixes + self.needs_prefix = "bge" in model_name.lower() or "e5" in model_name.lower() print("Model loaded successfully!") + def _ensure_bundled_model_in_cache(self, bundled_path: str, cache_dir: str, model_name: str): + """Copy bundled model to cache directory if not already present.""" + import shutil + + # FastEmbed expects models in a specific structure + # We'll copy to cache so it can find it naturally + model_short = model_name.split("/")[-1] + + # Check if model already exists in cache + if cache_dir: + for folder in 
os.listdir(cache_dir) if os.path.exists(cache_dir) else []: + if model_short in folder: + folder_path = os.path.join(cache_dir, folder) + for root, dirs, files in os.walk(folder_path): + if 'model.onnx' in files or 'model_optimized.onnx' in files: + return # Model already in cache + + # Model not in cache, but bundled model exists - FastEmbed will handle it + def _add_prefix(self, text: str, is_query: bool = False) -> str: - """Add prefix for BGE models.""" - if not self.is_bge: + """Add prefix for BGE/E5 models.""" + if not self.needs_prefix: return text if is_query: return f"query: {text}" else: return f"passage: {text}" + def encode(self, texts: list[str], is_query: bool = False) -> np.ndarray: + """Encode texts to embeddings.""" + # Add prefixes for BGE models + prefixed_texts = [self._add_prefix(t, is_query) for t in texts] + # FastEmbed returns a generator, convert to numpy array + embeddings = list(self.model.embed(prefixed_texts)) + return np.array(embeddings) + + def encode_single(self, text: str, is_query: bool = False) -> np.ndarray: + """Encode a single text to embedding.""" + return self.encode([text], is_query)[0] + def rerank(self, query: str, candidates: list[tuple[PageDocument, float]], top_k: int = 5, bm25_weight: float = 0.3) -> list[tuple[PageDocument, float]]: """ @@ -166,16 +290,15 @@ def rerank(self, query: str, candidates: list[tuple[PageDocument, float]], if not candidates: return [] - # Encode query (with prefix for BGE) - query_text = self._add_prefix(query, is_query=True) - query_embedding = self.model.encode(query_text, convert_to_tensor=True) + # Encode query + query_embedding = self.encode_single(query, is_query=True) - # Encode all candidate texts (with prefix for BGE, truncate long pages) - texts = [self._add_prefix(doc.text[:2000], is_query=False) for doc, _ in candidates] - doc_embeddings = self.model.encode(texts, convert_to_tensor=True) + # Encode all candidate texts (truncate long pages) + texts = [doc.text[:2000] for doc, _ in 
candidates] + doc_embeddings = self.encode(texts, is_query=False) # Compute semantic similarities - semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0].cpu().numpy() + semantic_scores = cosine_similarity(query_embedding, doc_embeddings) # Normalize BM25 scores bm25_scores = np.array([score for _, score in candidates]) @@ -199,7 +322,7 @@ def rerank(self, query: str, candidates: list[tuple[PageDocument, float]], class HybridLocator: """Main interface combining BM25 + semantic reranking.""" - def __init__(self, pdf_dir: str, model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"): + def __init__(self, pdf_dir: str, model_name: Optional[str] = "BAAI/bge-small-en-v1.5"): self.pdf_dir = Path(pdf_dir) self.model_name = model_name self.indexer: Optional[PDFIndexer] = None @@ -240,24 +363,40 @@ def build_index(self, force_rebuild: bool = False): self.reranker = None print("Running in keywords-only mode (no semantic reranking)") - def precompute_embeddings(self): - """Pre-compute embeddings for all documents (Deep mode).""" + def precompute_embeddings(self, progress_callback=None): + """Pre-compute embeddings for all documents (Deep mode). 
+ + Args: + progress_callback: Optional function(current, total) to report progress + """ if not self.reranker: print("No semantic model loaded, skipping embedding computation") return - print(f"Computing embeddings for {len(self.documents)} pages...") + total = len(self.documents) + print(f"Computing embeddings for {total} pages...") + + # Prepare texts (truncate long pages) + texts = [doc.text[:2000] for doc in self.documents] - # Prepare texts with prefix for BGE models - texts = [self.reranker._add_prefix(doc.text[:2000], is_query=False) - for doc in self.documents] + # Encode in batches to show progress + batch_size = 10 + all_embeddings = [] - # Encode all documents - self.doc_embeddings = self.reranker.model.encode( - texts, - convert_to_tensor=True, - show_progress_bar=True - ) + for i in range(0, total, batch_size): + batch_texts = texts[i:i+batch_size] + batch_embeddings = self.reranker.encode(batch_texts, is_query=False) + all_embeddings.append(batch_embeddings) + + # Report progress + current = min(i + batch_size, total) + if progress_callback: + progress_callback(current, total) + print(f" Processed {current}/{total} pages...") + + # Combine all embeddings + import numpy as np + self.doc_embeddings = np.vstack(all_embeddings) self.deep_mode = True print("Embeddings computed and ready!") @@ -281,7 +420,7 @@ def search(self, query: str, top_k: int = 5, bm25_candidates: int = 20, is_cross_lingual = False # Check if using multilingual model - is_multilingual_model = self.model_name and "bge-m3" in self.model_name.lower() + is_multilingual_model = self.model_name and ("multilingual" in self.model_name.lower() or "e5" in self.model_name.lower()) # Deep mode: use pre-computed embeddings for full semantic search if self.deep_mode and self.reranker and self.doc_embeddings is not None: @@ -333,8 +472,6 @@ def search(self, query: str, top_k: int = 5, bm25_candidates: int = 20, def _search_deep(self, query: str, top_k: int, bm25_weight: float, is_multilingual: 
bool) -> tuple[list[dict], bool]: """Deep search using pre-computed embeddings.""" - from sentence_transformers import util - is_cross_lingual = False # Get BM25 scores for all documents @@ -351,9 +488,8 @@ def _search_deep(self, query: str, top_k: int, bm25_weight: float, bm25_scores = bm25_scores / bm25_scores.max() # Compute semantic scores using pre-computed embeddings - query_text = self.reranker._add_prefix(query, is_query=True) - query_embedding = self.reranker.model.encode(query_text, convert_to_tensor=True) - semantic_scores = util.cos_sim(query_embedding, self.doc_embeddings)[0].cpu().numpy() + query_embedding = self.reranker.encode_single(query, is_query=True) + semantic_scores = cosine_similarity(query_embedding, self.doc_embeddings) # Combine scores combined_scores = (1 - bm25_weight) * semantic_scores + bm25_weight * bm25_scores @@ -404,7 +540,7 @@ def _extract_snippet(self, text: str, query: str, max_len: int = 200) -> str: def search_formatted(self, query: str, **kwargs) -> str: """Search and return formatted string output.""" - results = self.search(query, **kwargs) + results, _ = self.search(query, **kwargs) if not results: return f"No results found for: {query}" @@ -419,116 +555,6 @@ def search_formatted(self, query: str, **kwargs) -> str: return '\n'.join(lines) -# ============================================================ -# Training utilities for fine-tuning the reranker -# ============================================================ - -class TrainingDataGenerator: - """Generate training pairs from annotated data.""" - - @staticmethod - def from_json(json_path: str) -> list[dict]: - """ - Load training data from JSON file. - - Expected format: - [ - {"question": "What is the Bellman equation?", - "pdf": "RL_lecture.pdf", - "pages": [49, 50]}, - ... 
- ] - """ - with open(json_path, 'r') as f: - return json.load(f) - - @staticmethod - def create_template(output_path: str, num_examples: int = 50): - """Create a template JSON file for annotation.""" - template = [ - { - "question": "Example: What is Q-learning?", - "pdf": "lecture_notes.pdf", - "pages": [42, 43], - "notes": "Optional notes about this example" - } - ] - - with open(output_path, 'w') as f: - json.dump(template, f, indent=2) - - print(f"Template created at: {output_path}") - print("Edit this file to add your question→page annotations.") - - -def fine_tune_reranker(locator: HybridLocator, training_data: list[dict], - output_dir: str, epochs: int = 3): - """ - Fine-tune the reranker on your question→page pairs. - - This creates training triplets: (query, positive_page, negative_page) - """ - from sentence_transformers import InputExample, losses - from torch.utils.data import DataLoader - - # Build document lookup - doc_lookup = {} - for doc in locator.documents: - key = (doc.pdf_name, doc.page_num) - doc_lookup[key] = doc - - # Create training examples - train_examples = [] - - for item in training_data: - question = item['question'] - pdf_name = item['pdf'] - positive_pages = item['pages'] - - for page_num in positive_pages: - key = (pdf_name, page_num) - if key not in doc_lookup: - print(f"Warning: Page not found: {pdf_name} page {page_num}") - continue - - positive_doc = doc_lookup[key] - - # Create a training pair - train_examples.append(InputExample( - texts=[question, positive_doc.text[:1000]], - label=1.0 - )) - - # Add negative examples (random pages from same PDF) - for neg_doc in locator.documents: - if neg_doc.pdf_name == pdf_name and neg_doc.page_num not in positive_pages: - train_examples.append(InputExample( - texts=[question, neg_doc.text[:1000]], - label=0.0 - )) - break # Just one negative per positive - - if not train_examples: - print("No valid training examples found!") - return - - print(f"Created {len(train_examples)} 
training examples") - - # Fine-tune - model = locator.reranker.model - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8) - train_loss = losses.CosineSimilarityLoss(model) - - model.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=epochs, - warmup_steps=10, - output_path=output_dir - ) - - print(f"Fine-tuned model saved to: {output_dir}") - - # ============================================================ # CLI Interface # ============================================================ @@ -542,18 +568,19 @@ def main(): parser.add_argument('--rebuild', action='store_true', help="Force rebuild index") parser.add_argument('--top-k', type=int, default=5, help="Number of results") parser.add_argument('--interactive', '-i', action='store_true', help="Interactive mode") - parser.add_argument('--create-training-template', help="Create training data template") + parser.add_argument('--deep', '-d', action='store_true', help="Use deep indexing mode") + parser.add_argument('--model', '-m', default="BAAI/bge-small-en-v1.5", + help="Model to use") args = parser.parse_args() - if args.create_training_template: - TrainingDataGenerator.create_template(args.create_training_template) - return - # Initialize locator - locator = HybridLocator(args.pdf_dir) + locator = HybridLocator(args.pdf_dir, model_name=args.model) locator.build_index(force_rebuild=args.rebuild) + if args.deep: + locator.precompute_embeddings() + if args.interactive: print("\nInteractive mode. 
Type 'quit' to exit.\n") while True: diff --git a/requirements.txt b/requirements.txt index 7dacca9..bf1b27b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -# Core dependencies for Semantic Page Locator +# Core dependencies for Locus PDF Search PyMuPDF>=1.23.0 # PDF text extraction (imported as fitz) -rank-bm25>=0.2.2 # BM25 retrieval -sentence-transformers>=2.2.0 # Semantic embeddings + reranking +rank-bm25>=0.2.2 # BM25 keyword retrieval +fastembed>=0.2.0 # Lightweight ONNX-based embeddings (replaces sentence-transformers) numpy>=1.24.0 -torch>=2.0.0 # Required by sentence-transformers +customtkinter>=5.2.0 # Modern GUI \ No newline at end of file