From 450e3e6905198ca5c38d78926fc84337cba11959 Mon Sep 17 00:00:00 2001 From: Tim Nunamaker Date: Thu, 14 Mar 2024 01:18:17 -0500 Subject: [PATCH 1/2] feat: improved TxtaiVectorStore for LlamaIndex interface --- poetry.lock | 51 ++++++- pyproject.toml | 6 +- selfie/api/completions.py | 6 + selfie/api/documents.py | 5 +- selfie/embeddings/TxtaiVectorStore.py | 197 ++++++++++++++++++++++++++ selfie/embeddings/__init__.py | 103 +++++++++++++- 6 files changed, 363 insertions(+), 5 deletions(-) create mode 100644 selfie/embeddings/TxtaiVectorStore.py diff --git a/poetry.lock b/poetry.lock index 76b4c91..b3f0bc8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1617,9 +1617,11 @@ files = [ ] [package.dependencies] +aiohttp = {version = "*", optional = true, markers = "extra == \"inference\""} filelock = "*" fsspec = ">=2023.5.0" packaging = ">=20.9" +pydantic = {version = ">1.1,<3.0", optional = true, markers = "python_version > \"3.8\" and extra == \"inference\""} pyyaml = ">=5.1" requests = "*" tqdm = ">=4.42.1" @@ -1952,6 +1954,23 @@ local-models = ["optimum[onnxruntime] (>=1.13.2,<2.0.0)", "sentencepiece (>=0.1. postgres = ["asyncpg (>=0.28.0,<0.29.0)", "pgvector (>=0.1.0,<0.2.0)", "psycopg2-binary (>=2.9.9,<3.0.0)"] query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "lm-format-enforcer (>=0.4.3,<0.5.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "scikit-learn", "spacy (>=3.7.1,<4.0.0)"] +[[package]] +name = "llama-index-embeddings-huggingface" +version = "0.1.4" +description = "llama-index embeddings huggingface integration" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "llama_index_embeddings_huggingface-0.1.4-py3-none-any.whl", hash = "sha256:9c80539f3cbbd7191c219e2cda154b1a7151aa912196bc537c16f40e18e4187c"}, + {file = "llama_index_embeddings_huggingface-0.1.4.tar.gz", hash = "sha256:042d249d91039bc4a531711c0c81ebf4f5c921de98629d2d342979bc4511a639"}, +] + +[package.dependencies] +huggingface-hub = {version = ">=0.19.0", extras = ["inference"]} +llama-index-core = ">=0.10.1,<0.11.0" +torch = ">=2.1.2,<3.0.0" +transformers = ">=4.37.0,<5.0.0" + [[package]] name = "llama-index-embeddings-openai" version = "0.1.6" @@ -2034,6 +2053,22 @@ files = [ [package.dependencies] llama-index-core = ">=0.10.1,<0.11.0" +[[package]] +name = "llama-index-llms-openai-like" +version = "0.1.3" +description = "llama-index llms openai like integration" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "llama_index_llms_openai_like-0.1.3-py3-none-any.whl", hash = "sha256:0cf2c56f027c5e1f17c7fc606ad2b991f61daa75a88ba35bb5ea97de9766d4d3"}, + {file = "llama_index_llms_openai_like-0.1.3.tar.gz", hash = "sha256:c3412325077c75263e37d60d501314b9bdc770f12d8a2b16e756e7d9ac8f9e3f"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" +llama-index-llms-openai = ">=0.1.1,<0.2.0" +transformers = ">=4.37.0,<5.0.0" + [[package]] name = "llama-index-multi-modal-llms-openai" version = "0.1.4" @@ -2131,6 +2166,20 @@ llama-index-core = ">=0.10.1,<0.11.0" onnxruntime = ">=1.17.0,<2.0.0" tokenizers = ">=0.15.1,<0.16.0" +[[package]] +name = "llama-index-vector-stores-txtai" +version = "0.1.2" +description = "llama-index vector stores txtai integration" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "llama_index_vector_stores_txtai-0.1.2-py3-none-any.whl", hash = "sha256:e2e260a8fa186b5c4c7ab7fa9f1a84e9822400754d801c23a0220bd3fbc13568"}, + {file = "llama_index_vector_stores_txtai-0.1.2.tar.gz", hash = 
"sha256:0f2ed454f38034ec82a73c5cc4e1c29d2715c33e090408527dca65fec369650a"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" + [[package]] name = "llama-parse" version = "0.3.8" @@ -6355,4 +6404,4 @@ gpu = ["auto-gptq", "autoawq", "optimum"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "6ed61c05e5c4298fb983654bf4abcde89167c15f25dbf0f232eaf70e149b48a4" +content-hash = "71912c379a5c7255b282187aecc65ae703bf50babb08f12c61abd359fe363a78" diff --git a/pyproject.toml b/pyproject.toml index e9b3cbe..54435fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ llama-cpp-python = "^0.2.26" litellm = "^1.23.12" txtai = {version = "^7.0.0", extras = ["pipeline-llm"]} sse-starlette = "^2.0.0" -llama-index = "^0.10.4" +llama-index = "^0.10.18" numpy = "^1.26.4" html2text = "^2020.1.16" peewee = "^3.17.0" @@ -27,6 +27,10 @@ python-dotenv = "^1.0.1" pillow = "^10.2.0" swig = "^4.2.1" +llama-index-vector-stores-txtai = "^0.1.2" +llama-index-llms-openai-like = "^0.1.3" +llama-index-embeddings-huggingface = "^0.1.4" + # Experimental GPU Features auto-gptq = { version = "^0.6.0", optional = true } optimum = { version = "^1.16.2", optional = true } diff --git a/selfie/api/completions.py b/selfie/api/completions.py index 96f5cc2..565fd96 100644 --- a/selfie/api/completions.py +++ b/selfie/api/completions.py @@ -4,6 +4,12 @@ CreateCompletionResponse as LlamaCppCompletionResponse, CreateChatCompletionResponse as LlamaCppChatCompletionResponse, ) +from llama_index.core import VectorStoreIndex, ServiceContext +from llama_index.core.vector_stores import VectorStoreQuery +from llama_index.llms.openai_like import OpenAILike + +from selfie.embeddings.TxtaiVectorStore import TxtaiVectorStore +from selfie.embeddings import DataIndex from selfie.types.completion_requests import ChatCompletionRequest, CompletionRequest from selfie.text_generation import completion diff --git a/selfie/api/documents.py b/selfie/api/documents.py index 68bb382..ada7819 100644 --- a/selfie/api/documents.py +++ b/selfie/api/documents.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List, Optional from fastapi import APIRouter, Query @@ -27,8 +28,8 @@ class FetchedDocument(BaseModel): id: int = Field(..., description="The unique identifier of the document") name: str = Field(..., description="The name of the document") size: int = Field(..., description="The size of the document") - created_at: str = Field(..., description="The timestamp of the document creation") - updated_at: str = Field(..., description="The timestamp of the document update") + created_at: datetime = Field(..., description="The timestamp of the document creation") + updated_at: datetime = Field(..., description="The timestamp of the document update") content_type: str = Field(..., description="The content type of the document") connector_name: str = Field(..., description="The name of the connector") diff --git a/selfie/embeddings/TxtaiVectorStore.py b/selfie/embeddings/TxtaiVectorStore.py new file mode 100644 index 0000000..7f2d0ba --- /dev/null +++ b/selfie/embeddings/TxtaiVectorStore.py @@ -0,0 +1,197 @@ +"""txtai Vector store index. + +An index that is built on top of an existing txtai vector store. 
+""" +import json +import logging +from typing import Any, List, cast + +import numpy as np +from llama_index.core.bridge.pydantic import PrivateAttr + +from llama_index.core.schema import BaseNode, Node +from llama_index.core.vector_stores import ( + VectorStoreQuery, + VectorStoreQueryResult, +) +from llama_index.core.vector_stores.types import ( + BasePydanticVectorStore, VectorStoreQueryMode +) + +logger = logging.getLogger() + +IMPORT_ERROR_MSG = """ + `txtai` package not found. For instructions on + how to install `txtai` please visit + https://neuml.github.io/txtai/install/ +""" + + +class TxtaiVectorStore(BasePydanticVectorStore): + """txtai Vector Store. + + Embeddings and documents are stored within a txtai index. + + During query time, the index uses txtai to query for the top + k most similar nodes. + + Args: + txtai_embeddings (txtai.embeddings.Embeddings): A txtai embeddings instance + with content storage enabled. + + """ + + stores_text: bool = True + + _embeddings = PrivateAttr() + + def __init__( + self, + txtai_index: Any, + ) -> None: + """Initialize params.""" + try: + import txtai + except ImportError: + raise ImportError(IMPORT_ERROR_MSG) + + if not txtai_index.config.get("content"): + raise ValueError("The txtai embeddings instance must have content storage enabled.") + + self._embeddings = cast(txtai.embeddings.Embeddings, txtai_index) + super().__init__() + + def add( + self, + nodes: List[BaseNode], + **add_kwargs: Any, + ) -> List[str]: + """Add nodes to index. + + Args: + nodes (List[BaseNode]): List of nodes to add to the index. + + """ + docs = [] + for node in nodes: + doc = (node.ref_doc_id, {"text": node.get_text()}, node.extra_info) + docs.append(doc) + + self._embeddings.index(docs) + return [node.ref_doc_id for node in nodes] + + @property + def client(self) -> Any: + """Return txtai client.""" + return self._embeddings + + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + """Query index for top k most similar nodes. + + Args: + query (VectorStoreQuery): Query parameters. 
+ + """ + if query.filters is not None: + # TODO: Implement metadata filters + raise ValueError("Metadata filters not implemented for txtai yet.") + + similarity_top_k = cast(int, query.similarity_top_k) + + # print("query", query) + + if query.mode == VectorStoreQueryMode.DEFAULT: + if not self._embeddings.ann: + raise ValueError("Dense index not enabled.") + + weights = 1 + elif query.mode == VectorStoreQueryMode.SPARSE: + if not self._embeddings.scoring: + raise ValueError("Sparse index not enabled.") + + weights = 0 + else: + if query.alpha is not None and query.alpha > 0 and not self._embeddings.ann: + raise ValueError("Dense index not enabled.") + if query.alpha is not None and query.alpha < 1 and not self._embeddings.scoring: + raise ValueError("Sparse index not enabled.") + + weights = query.alpha if query.alpha is not None else 0.5 # hybrid + + if query.query_str is not None: + print("query.query_str", query.query_str) + # Natural language query + natural_language_query = cast(str, query.query_str) + sql = f"SELECT * FROM txtai WHERE similar(':{natural_language_query}, {weights})') LIMIT {similarity_top_k}" + # results = self._query(where=f"similar(:topic, {hybrid_search_weight})", parameters={"topic": topic}, limit=limit) + + print(sql) + results = self._embeddings.search(sql, weights=weights, limit=similarity_top_k) # TODO: Also use sparse_top_k + print(results) + + # TODO: Figure out how to resolve the fact that query_embedding is provided, unused, and potentially different than the embedding txtai would generate + else: + print("query.query_embedding", query.query_embedding) + # Embedding-based query + query_embedding = cast(List[float], query.query_embedding) + query_embedding_np = np.array(query_embedding, dtype="float32")[np.newaxis, :] + results = self._embeddings.ann.search(query_embedding_np, similarity_top_k) + + nodes = [] + similarities = [] + ids = [] + metadatas = [] + + if query.query_str is not None: + print("STRING") + # Result processing for natural language queries + for result in results: + print({ + 'id': result["id"], + + **json.loads(result["data"]) + }) + # TODO: Where does tags fit here, vs metadata? + node = Node( + text=result["text"], + extra_info={ + 'id': result["id"], + **{k: v for k, v in json.loads(result["data"]).items() if k != "text"} + } + # TODO: populate more keys, like relationships? + ) + nodes.append(node) + similarities.append(result["score"]) + ids.append(result["id"]) + metadatas.append(result) + else: + print("EMBEDDING") + # Result processing for embedding-based queries + for result in results[0]: + doc_id = str(result[0]) + metadata = self._embeddings.search(f"SELECT * FROM txtai WHERE id = '{doc_id}' LIMIT 1")[0] + node = Node( + text=metadata["text"], + extra_info={ + 'id': doc_id, + **{k: v for k, v in json.loads(metadata["data"]).items() if k != "text"} + } + # TODO: populate more keys, like relationships? + ) + nodes.append(node) + similarities.append(result[1]) + ids.append(doc_id) + metadatas.append(metadata) + + return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids) # , metadatas=metadatas) + + def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: + """ + Delete nodes using with ref_doc_id. + + Args: + ref_doc_id (str): The doc_id of the document to delete. 
+ + """ + # self._txtai_index.delete([int(ref_doc_id)]) + self._embeddings.delete(where={"id": ref_doc_id}, **delete_kwargs) # TODO: not sure which approach is better diff --git a/selfie/embeddings/__init__.py b/selfie/embeddings/__init__.py index 7c6630b..1d2c7a8 100644 --- a/selfie/embeddings/__init__.py +++ b/selfie/embeddings/__init__.py @@ -9,8 +9,13 @@ import humanize import logging import tiktoken +from llama_index.core import ServiceContext, VectorStoreIndex from llama_index.core.node_parser import SentenceSplitter +from llama_index.core.vector_stores.types import VectorStoreQueryMode +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.llms.openai_like import OpenAILike +from selfie.embeddings.TxtaiVectorStore import TxtaiVectorStore from selfie.config import get_app_config from selfie.data_generators.chat_training_data import ( ChatTrainingDataGenerator, @@ -74,6 +79,8 @@ def __init__(self, character_name, storage_path: str = config.embeddings_storage self.completion = completion or get_default_completion() self.character_name = character_name self.embeddings = Embeddings( + path="sentence-transformers/all-MiniLM-L6-v2", + hybrid=True, sqlite={"wal": True}, # For now, sqlite w/the default driver is the only way to use WAL. content=True @@ -334,6 +341,99 @@ async def index(self, documents: List[EmbeddingDocumentModel], extract_importanc return with_importance # TODO: return document with ID, if possible + async def recall2( + self, + topic: str, + topic_context=None, + character_name: Optional[str] = None, + limit=5, + importance_weight=0, + recency_weight=1, + relevance_weight=1, + include_summary=True, + local_llm=True, + min_score=0.4, + hybrid_search_weight=0.5, # TODO: testing needed + ): + if min_score is None: + min_score = 0.4 + + if not self.has_data(): + return {"documents": [], "summary": "No documents found.", "mean_score": 0} + + service_context = ServiceContext.from_defaults( + llm=OpenAILike( + api_base="http://localhost:5000/v1", + api_key="none", + model="TheBloke_Mistral-7B-Instruct-v0.2-GPTQ_gptq-8bit-32g-actorder_True", + is_chat_model=True + ), + embed_model=HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2"), + ) + + index = VectorStoreIndex.from_vector_store(TxtaiVectorStore(txtai_index=self.embeddings), service_context=service_context) + + retriever = index.as_retriever( + similarity_top_k=limit, + vector_store_query_mode=VectorStoreQueryMode.HYBRID, + filters=None, # TODO: use this to filter by source, etc. 
+ alpha=hybrid_search_weight, + doc_ids=None, # TODO: use this to filter by as set of embedding documents + vector_store_kwargs={}, # TODO: use this to pass additional parameters to the vector store + sparse_top_k=limit, + ) + + documents_list: List[ScoredEmbeddingDocumentModel] = [] + for node, score in retriever.retrieve(topic): + node = node[1] + score = score[1] + document = EmbeddingDocumentModel( + id=node.metadata.get("id"), + text=node.get_text(), + timestamp=node.metadata.get("timestamp"), + importance=node.metadata.get("importance"), + source=node.metadata.get("source"), + updated_timestamp=node.metadata.get("updated_timestamp"), + source_document_id=node.metadata.get("source_document_id"), + ) + relevance_score = self.relevance_scorer.calculate_score(document, score) + recency_score = self.recency_scorer.calculate_score(document) + importance_score = document.importance + retrieval_score = self._calculate_retrieval_score( + recency_score, + relevance_score, + importance_score, + importance_weight, + recency_weight, + relevance_weight, + ) + documents_list.append( + ScoredEmbeddingDocumentModel( + **document.model_dump(), + score=retrieval_score, + relevance=relevance_score, + recency=recency_score, + ) + ) + + documents_list = [m for m in documents_list if m.score > min_score] + if len(documents_list) == 0: + return {"documents": [], "summary": "No documents found.", "mean_score": 0} + + documents_list.sort(key=lambda x: x.score, reverse=True) + short_documents_list = documents_list[:limit] + summary = ( + await self._summarize_documents( + character_name, + short_documents_list, + topic_context if topic_context else topic, + ("local" if local_llm else None), + ) + if include_summary + else None + ) + return {"documents": short_documents_list, "summary": summary, "mean_score": sum([m.score for m in short_documents_list]) / len(short_documents_list)} + async def recall( self, topic: str, @@ -346,6 +446,7 @@ async def recall( include_summary=True, local_llm=True, min_score=0.4, + hybrid_search_weight=0.5, # TODO: testing needed ): if min_score is None: min_score = 0.4 @@ -354,7 +455,7 @@ async def recall( return {"documents": [], "summary": "No documents found.", "mean_score": 0} self.embeddings.load(self.storage_path) - results = self._query(where="similar(:topic)", parameters={"topic": topic}, limit=limit) + results = self._query(where=f"similar(:topic, {hybrid_search_weight})", parameters={"topic": topic}, limit=limit) documents_list: List[ScoredEmbeddingDocumentModel] = [] for result in results: document = EmbeddingDocumentModel( From a148ea07dd5d5a99af7080d0f441b9f3ffc25a6c Mon Sep 17 00:00:00 2001 From: Tim Nunamaker Date: Thu, 14 Mar 2024 03:40:10 -0500 Subject: [PATCH 2/2] Working toward full-featured TxtaiVectorStore --- selfie/api/index_documents.py | 2 +- selfie/connectors/text_files/uischema.json | 5 +- selfie/embeddings/TxtaiVectorStore.py | 336 ++++++++++++++------- selfie/embeddings/__init__.py | 9 +- 4 files changed, 231 insertions(+), 121 deletions(-) diff --git a/selfie/api/index_documents.py b/selfie/api/index_documents.py index 31b9f7e..a093e47 100644 --- a/selfie/api/index_documents.py +++ b/selfie/api/index_documents.py @@ -53,7 +53,7 @@ async def update_index_document(document_id: int, document: EmbeddingDocumentMod async def delete_index_document(document_id: int): # Sometimes self.embeddings.save() errors on "database is locked", bricks it # raise HTTPException(status_code=501, detail="Not implemented") - DataIndex("n/a").delete_document(document_id) + 
await DataIndex("n/a").delete_document(document_id) return {"message": "Document deleted successfully"} diff --git a/selfie/connectors/text_files/uischema.json b/selfie/connectors/text_files/uischema.json index f4ef5a6..f886166 100644 --- a/selfie/connectors/text_files/uischema.json +++ b/selfie/connectors/text_files/uischema.json @@ -1,8 +1,5 @@ { "files": { - "ui:widget": "nativeFile", - "ui:options": { - "accept": ".json" - } + "ui:widget": "nativeFile" } } diff --git a/selfie/embeddings/TxtaiVectorStore.py b/selfie/embeddings/TxtaiVectorStore.py index 7f2d0ba..0025e3a 100644 --- a/selfie/embeddings/TxtaiVectorStore.py +++ b/selfie/embeddings/TxtaiVectorStore.py @@ -4,21 +4,34 @@ """ import json import logging -from typing import Any, List, cast +import os +import pickle +from pathlib import Path +from typing import Any, Dict, List, Optional, cast +import fsspec import numpy as np +from fsspec.implementations.local import LocalFileSystem from llama_index.core.bridge.pydantic import PrivateAttr - from llama_index.core.schema import BaseNode, Node -from llama_index.core.vector_stores import ( - VectorStoreQuery, - VectorStoreQueryResult, +from llama_index.core.vector_stores import VectorStoreQuery, VectorStoreQueryResult +from llama_index.core.vector_stores.types import ( + BasePydanticVectorStore, + VectorStoreQueryMode, + MetadataFilters, ExactMatchFilter, ) +from pydantic import BaseModel +from llama_index.core.vector_stores.simple import DEFAULT_VECTOR_STORE, NAMESPACE_SEP from llama_index.core.vector_stores.types import ( - BasePydanticVectorStore, VectorStoreQueryMode + DEFAULT_PERSIST_DIR, + DEFAULT_PERSIST_FNAME, + BasePydanticVectorStore, + VectorStoreQuery, + VectorStoreQueryResult, ) +from txtai import Embeddings -logger = logging.getLogger() +logger = logging.getLogger(__name__) IMPORT_ERROR_MSG = """ `txtai` package not found. For instructions on @@ -26,28 +39,19 @@ https://neuml.github.io/txtai/install/ """ +DEFAULT_PERSIST_PATH = os.path.join( + DEFAULT_PERSIST_DIR, f"{DEFAULT_VECTOR_STORE}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}" +) -class TxtaiVectorStore(BasePydanticVectorStore): - """txtai Vector Store. - - Embeddings and documents are stored within a txtai index. - - During query time, the index uses txtai to query for the top - k most similar nodes. - - Args: - txtai_embeddings (txtai.embeddings.Embeddings): A txtai embeddings instance - with content storage enabled. - - """ - - stores_text: bool = True +class TxtaiVectorStore(BasePydanticVectorStore): _embeddings = PrivateAttr() def __init__( self, txtai_index: Any, + stores_text: bool = False, + **kwargs: Any, ) -> None: """Initialize params.""" try: @@ -55,143 +59,247 @@ def __init__( except ImportError: raise ImportError(IMPORT_ERROR_MSG) - if not txtai_index.config.get("content"): - raise ValueError("The txtai embeddings instance must have content storage enabled.") - self._embeddings = cast(txtai.embeddings.Embeddings, txtai_index) - super().__init__() + + if self._embeddings.config.get("content"): + stores_text = True + + super().__init__(stores_text=stores_text, **kwargs) def add( self, nodes: List[BaseNode], + tags: Optional[str] = None, **add_kwargs: Any, ) -> List[str]: - """Add nodes to index. - - Args: - nodes (List[BaseNode]): List of nodes to add to the index. 
- - """ + """Add nodes to index.""" docs = [] for node in nodes: - doc = (node.ref_doc_id, {"text": node.get_text()}, node.extra_info) + print(node.metadata) + if self.stores_text: + doc = (node.node_id, {"text": node.get_text(), **node.metadata}, tags) + else: + doc = (node.node_id, node.get_embedding()) docs.append(doc) - self._embeddings.index(docs) - return [node.ref_doc_id for node in nodes] + print(docs) + self._embeddings.index(docs) # TODO: should this be upsert? + print(self._embeddings.search("DOWN")) + return [node.node_id for node in nodes] + + def delete( + self, + ref_doc_id: str, + **delete_kwargs: Any + ) -> None: + """ + Delete nodes using with ref_doc_id. + + Args: + ref_doc_id (str): The doc_id of the document to delete. + + """ + self._embeddings.delete([int(ref_doc_id)]) @property def client(self) -> Any: """Return txtai client.""" return self._embeddings - def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: - """Query index for top k most similar nodes. - - Args: - query (VectorStoreQuery): Query parameters. + @staticmethod + def _build_query(query: VectorStoreQuery) -> str: + """Build SQL query string from VectorStoreQuery.""" + query_str = "SELECT * FROM txtai" - """ if query.filters is not None: - # TODO: Implement metadata filters - raise ValueError("Metadata filters not implemented for txtai yet.") + filter_strs = [] + for filter in query.filters.filters: + if isinstance(filter, ExactMatchFilter): + filter_strs.append(f"{filter.key} = '{filter.value}'") + else: + filter_strs.append(f"{filter.key} {filter.operator.value} '{filter.value}'") + + if filter_strs: + condition = query.filters.condition.value.lower() + query_str += f" WHERE {f' {condition} '.join(filter_strs)}" - similarity_top_k = cast(int, query.similarity_top_k) + if query.query_str is not None: + if query.filters is not None: + query_str += " AND " + else: + query_str += " WHERE " + query_str += f"similar('{query.query_str}')" - # print("query", query) + query_str += f" LIMIT {query.similarity_top_k}" + return query_str + + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + """Query index for top k most similar nodes.""" + nodes = [] + similarities = [] + ids = [] if query.mode == VectorStoreQueryMode.DEFAULT: if not self._embeddings.ann: raise ValueError("Dense index not enabled.") - - weights = 1 + weights = None elif query.mode == VectorStoreQueryMode.SPARSE: if not self._embeddings.scoring: raise ValueError("Sparse index not enabled.") - weights = 0 - else: - if query.alpha is not None and query.alpha > 0 and not self._embeddings.ann: - raise ValueError("Dense index not enabled.") - if query.alpha is not None and query.alpha < 1 and not self._embeddings.scoring: - raise ValueError("Sparse index not enabled.") - - weights = query.alpha if query.alpha is not None else 0.5 # hybrid + else: # VectorStoreQueryMode.HYBRID + if query.alpha is None: + raise ValueError("Alpha must be specified for hybrid search.") + weights = query.alpha if query.query_str is not None: - print("query.query_str", query.query_str) - # Natural language query - natural_language_query = cast(str, query.query_str) - sql = f"SELECT * FROM txtai WHERE similar(':{natural_language_query}, {weights})') LIMIT {similarity_top_k}" - # results = self._query(where=f"similar(:topic, {hybrid_search_weight})", parameters={"topic": topic}, limit=limit) - - print(sql) - results = self._embeddings.search(sql, weights=weights, limit=similarity_top_k) # TODO: Also use sparse_top_k 
- print(results) + sql_query = self._build_query(query) + logger.debug(f"SQL query: {sql_query}") + results = self._embeddings.search(sql_query, weights=weights, limit=query.similarity_top_k) - # TODO: Figure out how to resolve the fact that query_embedding is provided, unused, and potentially different than the embedding txtai would generate + for result in results: + if self.stores_text: + metadata = json.loads(result["data"]) + node = Node( + id_=result["id"], + text=result["text"], + metadata=metadata, + tags=result.get("tags", []), + relationships={}, + ) + else: + node = Node(id_=result["id"]) + nodes.append(node) + similarities.append(result["score"]) + ids.append(result["id"]) else: - print("query.query_embedding", query.query_embedding) - # Embedding-based query + if not self._embeddings.ann: + raise ValueError("Dense index not enabled for embedding-based queries.") + query_embedding = cast(List[float], query.query_embedding) query_embedding_np = np.array(query_embedding, dtype="float32")[np.newaxis, :] - results = self._embeddings.ann.search(query_embedding_np, similarity_top_k) - nodes = [] - similarities = [] - ids = [] - metadatas = [] + # TODO: + # if query_embedding_np.shape[1] != self._embeddings.config["dimension"]: + # ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^ + # KeyError: 'dimension' + # + # if query_embedding_np.shape[1] != self._embeddings.config["dimension"]: + # raise ValueError( + # f"Query embedding dimension {query_embedding_np.shape[1]} does not match " + # f"the expected dimension {self._embeddings.config['dimension']}." + # ) + + # TODO: IDs are coming back as -1 for documents created via self.add, even though _embeddings.search("DOWN") returns them + results = self._embeddings.ann.search(query_embedding_np, query.similarity_top_k)[0] + print(results) - if query.query_str is not None: - print("STRING") - # Result processing for natural language queries for result in results: - print({ - 'id': result["id"], - - **json.loads(result["data"]) - }) - # TODO: Where does tags fit here, vs metadata? - node = Node( - text=result["text"], - extra_info={ - 'id': result["id"], - **{k: v for k, v in json.loads(result["data"]).items() if k != "text"} - } - # TODO: populate more keys, like relationships? - ) - nodes.append(node) - similarities.append(result["score"]) - ids.append(result["id"]) - metadatas.append(result) - else: - print("EMBEDDING") - # Result processing for embedding-based queries - for result in results[0]: doc_id = str(result[0]) - metadata = self._embeddings.search(f"SELECT * FROM txtai WHERE id = '{doc_id}' LIMIT 1")[0] - node = Node( - text=metadata["text"], - extra_info={ - 'id': doc_id, - **{k: v for k, v in json.loads(metadata["data"]).items() if k != "text"} - } - # TODO: populate more keys, like relationships? 
- ) + if self.stores_text: + print(f"SELECT * FROM txtai WHERE id = '{doc_id}' LIMIT 1") + metadata = self._embeddings.search(f"SELECT * FROM txtai WHERE id = '{doc_id}' LIMIT 1")[0] + node = Node( + id_=doc_id, + text=metadata["text"], + metadata=json.loads(metadata["data"]), + tags=metadata.get("tags", []), + relationships={}, + ) + else: + node = Node(id_=doc_id) nodes.append(node) similarities.append(result[1]) ids.append(doc_id) - metadatas.append(metadata) - return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids) # , metadatas=metadatas) + return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids) - def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: - """ - Delete nodes using with ref_doc_id. + def persist( + self, + persist_path: str = DEFAULT_PERSIST_PATH, + fs: Optional[fsspec.AbstractFileSystem] = None, + **kwargs: Any, + ) -> None: + """Save to file. + + This method saves the vector store to disk. Args: - ref_doc_id (str): The doc_id of the document to delete. + persist_path (str): The save_path of the file. + fs (fsspec.AbstractFileSystem): The filesystem to use. """ - # self._txtai_index.delete([int(ref_doc_id)]) - self._embeddings.delete(where={"id": ref_doc_id}, **delete_kwargs) # TODO: not sure which approach is better + if fs and not isinstance(fs, LocalFileSystem): + raise NotImplementedError("txtai only supports local storage for now.") + + dirpath = Path(persist_path).parent + dirpath.mkdir(exist_ok=True) + + jsonconfig = self._embeddings.config.get("format", "pickle") == "json" + # Determine if config is json or pickle + config_path = dirpath / "config.json" if jsonconfig else dirpath / "config" + + # Write configuration + with open( + config_path, + "w" if jsonconfig else "wb", + encoding="utf-8" if jsonconfig else None, + ) as f: + if jsonconfig: + # Write config as JSON + json.dump(self._embeddings.config, f, default=str) + else: + from txtai.version import __pickle__ + + # Write config as pickle format + pickle.dump(self._embeddings.config, f, protocol=__pickle__) + + self._embeddings.save(persist_path, **kwargs) + + @classmethod + def from_persist_path(cls, persist_path: str, **kwargs: Any) -> "TxtaiVectorStore": + """Load the vector store from disk.""" + try: + import txtai + except ImportError: + raise ImportError(IMPORT_ERROR_MSG) + + if not os.path.exists(persist_path): + raise ValueError(f"Persisted index not found at {persist_path}") + + embeddings = txtai.embeddings.Embeddings(path=persist_path) + return cls(txtai_index=embeddings, **kwargs) + + @classmethod + def from_persist_path( + cls, + persist_path: str, + fs: Optional[fsspec.AbstractFileSystem] = None, + ) -> "TxtaiVectorStore": + try: + import txtai + except ImportError: + raise ImportError(IMPORT_ERROR_MSG) + + if fs and not isinstance(fs, LocalFileSystem): + raise NotImplementedError("txtai only supports local storage for now.") + + if not os.path.exists(persist_path): + raise ValueError(f"No existing {__name__} found at {persist_path}.") + + logger.info(f"Loading {__name__} config from {persist_path}.") + parent_directory = Path(persist_path).parent + config_path = parent_directory / "config.json" + jsonconfig = config_path.exists() + # Determine if config is json or pickle + config_path = config_path if jsonconfig else parent_directory / "config" + # Load configuration + with open(config_path, "r" if jsonconfig else "rb") as f: + config = json.load(f) if jsonconfig else pickle.load(f) + + logger.info(f"Loading {__name__} from 
{persist_path}.") + + txtai_index = Embeddings(config=config) + txtai_index.load(persist_path) + + return cls(txtai_index=txtai_index) diff --git a/selfie/embeddings/__init__.py b/selfie/embeddings/__init__.py index 1d2c7a8..6926417 100644 --- a/selfie/embeddings/__init__.py +++ b/selfie/embeddings/__init__.py @@ -11,7 +11,7 @@ import tiktoken from llama_index.core import ServiceContext, VectorStoreIndex from llama_index.core.node_parser import SentenceSplitter -from llama_index.core.vector_stores.types import VectorStoreQueryMode +from llama_index.core.vector_stores.types import VectorStoreQueryMode, MetadataFilters, MetadataFilter, FilterOperator from llama_index.embeddings.huggingface import HuggingFaceEmbedding from llama_index.llms.openai_like import OpenAILike @@ -376,7 +376,12 @@ async def recall2( retriever = index.as_retriever( similarity_top_k=limit, vector_store_query_mode=VectorStoreQueryMode.HYBRID, - filters=None, # TODO: use this to filter by source, etc. + filters=MetadataFilters( + filters=[ + MetadataFilter(key="source", operator=FilterOperator.EQ, value="whatsapp") + # MetadataFilter(key="source", operator=FilterOperator.EQ, value="text_files") + ] + ), alpha=hybrid_search_weight, doc_ids=None, # TODO: use this to filter by as set of embedding documents vector_store_kwargs={}, # TODO: use this to pass additional parameters to the vector store
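
A minimal usage sketch of the new TxtaiVectorStore, separate from the patch itself. It exercises the hybrid, metadata-filtered query path that recall2 drives through the LlamaIndex retriever; the sample node texts are invented, and the "whatsapp" filter value and alpha of 0.5 simply mirror the values used in recall2 above, so treat this as an illustration rather than a tested snippet.

from txtai import Embeddings
from llama_index.core.schema import TextNode
from llama_index.core.vector_stores.types import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    VectorStoreQuery,
    VectorStoreQueryMode,
)

from selfie.embeddings.TxtaiVectorStore import TxtaiVectorStore

# Content storage plus hybrid scoring, matching the Embeddings config used in DataIndex.
embeddings = Embeddings(
    path="sentence-transformers/all-MiniLM-L6-v2",
    hybrid=True,
    content=True,
)
store = TxtaiVectorStore(txtai_index=embeddings)

# Index two nodes; the metadata keys follow the fields recall2 reads back out (illustrative values).
store.add([
    TextNode(text="Met Sam for coffee downtown.", metadata={"source": "whatsapp"}),
    TextNode(text="Drafted the quarterly report.", metadata={"source": "text_files"}),
])

# Hybrid query restricted to a single source, as in recall2's retriever arguments.
# _build_query translates this into roughly:
#   SELECT * FROM txtai WHERE source = 'whatsapp' AND similar('coffee with Sam') LIMIT 3
query = VectorStoreQuery(
    query_str="coffee with Sam",
    mode=VectorStoreQueryMode.HYBRID,
    alpha=0.5,
    similarity_top_k=3,
    filters=MetadataFilters(
        filters=[MetadataFilter(key="source", operator=FilterOperator.EQ, value="whatsapp")]
    ),
)
result = store.query(query)
for node, score in zip(result.nodes, result.similarities):
    print(score, node.get_text())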