diff --git a/poetry.lock b/poetry.lock
index 76b4c91..b3f0bc8 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1617,9 +1617,11 @@ files = [
 ]
 
 [package.dependencies]
+aiohttp = {version = "*", optional = true, markers = "extra == \"inference\""}
 filelock = "*"
 fsspec = ">=2023.5.0"
 packaging = ">=20.9"
+pydantic = {version = ">1.1,<3.0", optional = true, markers = "python_version > \"3.8\" and extra == \"inference\""}
 pyyaml = ">=5.1"
 requests = "*"
 tqdm = ">=4.42.1"
@@ -1952,6 +1954,23 @@ local-models = ["optimum[onnxruntime] (>=1.13.2,<2.0.0)", "sentencepiece (>=0.1.
 postgres = ["asyncpg (>=0.28.0,<0.29.0)", "pgvector (>=0.1.0,<0.2.0)", "psycopg2-binary (>=2.9.9,<3.0.0)"]
 query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "lm-format-enforcer (>=0.4.3,<0.5.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "scikit-learn", "spacy (>=3.7.1,<4.0.0)"]
 
+[[package]]
+name = "llama-index-embeddings-huggingface"
+version = "0.1.4"
+description = "llama-index embeddings huggingface integration"
+optional = false
+python-versions = ">=3.8.1,<4.0"
+files = [
+    {file = "llama_index_embeddings_huggingface-0.1.4-py3-none-any.whl", hash = "sha256:9c80539f3cbbd7191c219e2cda154b1a7151aa912196bc537c16f40e18e4187c"},
+    {file = "llama_index_embeddings_huggingface-0.1.4.tar.gz", hash = "sha256:042d249d91039bc4a531711c0c81ebf4f5c921de98629d2d342979bc4511a639"},
+]
+
+[package.dependencies]
+huggingface-hub = {version = ">=0.19.0", extras = ["inference"]}
+llama-index-core = ">=0.10.1,<0.11.0"
+torch = ">=2.1.2,<3.0.0"
+transformers = ">=4.37.0,<5.0.0"
+
 [[package]]
 name = "llama-index-embeddings-openai"
 version = "0.1.6"
@@ -2034,6 +2053,22 @@ files = [
 [package.dependencies]
 llama-index-core = ">=0.10.1,<0.11.0"
 
+[[package]]
+name = "llama-index-llms-openai-like"
+version = "0.1.3"
+description = "llama-index llms openai like integration"
+optional = false
+python-versions = ">=3.8.1,<4.0"
+files = [
+    {file = "llama_index_llms_openai_like-0.1.3-py3-none-any.whl", hash = "sha256:0cf2c56f027c5e1f17c7fc606ad2b991f61daa75a88ba35bb5ea97de9766d4d3"},
+    {file = "llama_index_llms_openai_like-0.1.3.tar.gz", hash = "sha256:c3412325077c75263e37d60d501314b9bdc770f12d8a2b16e756e7d9ac8f9e3f"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-llms-openai = ">=0.1.1,<0.2.0"
+transformers = ">=4.37.0,<5.0.0"
+
 [[package]]
 name = "llama-index-multi-modal-llms-openai"
 version = "0.1.4"
@@ -2131,6 +2166,20 @@ llama-index-core = ">=0.10.1,<0.11.0"
 onnxruntime = ">=1.17.0,<2.0.0"
 tokenizers = ">=0.15.1,<0.16.0"
 
+[[package]]
+name = "llama-index-vector-stores-txtai"
+version = "0.1.2"
+description = "llama-index vector stores txtai integration"
+optional = false
+python-versions = ">=3.8.1,<4.0"
+files = [
+    {file = "llama_index_vector_stores_txtai-0.1.2-py3-none-any.whl", hash = "sha256:e2e260a8fa186b5c4c7ab7fa9f1a84e9822400754d801c23a0220bd3fbc13568"},
+    {file = "llama_index_vector_stores_txtai-0.1.2.tar.gz", hash = "sha256:0f2ed454f38034ec82a73c5cc4e1c29d2715c33e090408527dca65fec369650a"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.10.1,<0.11.0"
+
 [[package]]
 name = "llama-parse"
 version = "0.3.8"
@@ -6355,4 +6404,4 @@ gpu = ["auto-gptq", "autoawq", "optimum"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<4.0"
-content-hash = "6ed61c05e5c4298fb983654bf4abcde89167c15f25dbf0f232eaf70e149b48a4"
+content-hash = "71912c379a5c7255b282187aecc65ae703bf50babb08f12c61abd359fe363a78"
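Note on the lockfile change above: the `inference` extra on huggingface-hub (which pulls in aiohttp and the pydantic pin) is declared by the new llama-index-embeddings-huggingface package; to my understanding it backs the Inference-API-based client in huggingface_hub. A quick illustrative sketch, not part of this diff — the model name is a placeholder:

```python
# Illustrative only (not in this diff): huggingface_hub's "inference" extra supports
# the Inference API client that the llama-index HuggingFace integration can use.
from huggingface_hub import InferenceClient

client = InferenceClient(model="sentence-transformers/all-MiniLM-L6-v2")
# client.feature_extraction("hello world")  # needs network access, so left commented out
```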
diff --git a/pyproject.toml b/pyproject.toml
index e9b3cbe..54435fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ llama-cpp-python = "^0.2.26"
 litellm = "^1.23.12"
 txtai = {version = "^7.0.0", extras = ["pipeline-llm"]}
 sse-starlette = "^2.0.0"
-llama-index = "^0.10.4"
+llama-index = "^0.10.18"
 numpy = "^1.26.4"
 html2text = "^2020.1.16"
 peewee = "^3.17.0"
@@ -27,6 +27,10 @@ python-dotenv = "^1.0.1"
 pillow = "^10.2.0"
 swig = "^4.2.1"
 
+llama-index-vector-stores-txtai = "^0.1.2"
+llama-index-llms-openai-like = "^0.1.3"
+llama-index-embeddings-huggingface = "^0.1.4"
+
 # Experimental GPU Features
 auto-gptq = { version = "^0.6.0", optional = true }
 optimum = { version = "^1.16.2", optional = true }
diff --git a/selfie/api/completions.py b/selfie/api/completions.py
index 96f5cc2..565fd96 100644
--- a/selfie/api/completions.py
+++ b/selfie/api/completions.py
@@ -4,6 +4,12 @@
     CreateCompletionResponse as LlamaCppCompletionResponse,
     CreateChatCompletionResponse as LlamaCppChatCompletionResponse,
 )
+from llama_index.core import VectorStoreIndex, ServiceContext
+from llama_index.core.vector_stores import VectorStoreQuery
+from llama_index.llms.openai_like import OpenAILike
+
+from selfie.embeddings.TxtaiVectorStore import TxtaiVectorStore
+from selfie.embeddings import DataIndex
 
 from selfie.types.completion_requests import ChatCompletionRequest, CompletionRequest
 from selfie.text_generation import completion
diff --git a/selfie/api/documents.py b/selfie/api/documents.py
index 68bb382..ada7819 100644
--- a/selfie/api/documents.py
+++ b/selfie/api/documents.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from typing import List, Optional
 
 from fastapi import APIRouter, Query
@@ -27,8 +28,8 @@ class FetchedDocument(BaseModel):
     id: int = Field(..., description="The unique identifier of the document")
     name: str = Field(..., description="The name of the document")
     size: int = Field(..., description="The size of the document")
-    created_at: str = Field(..., description="The timestamp of the document creation")
-    updated_at: str = Field(..., description="The timestamp of the document update")
+    created_at: datetime = Field(..., description="The timestamp of the document creation")
+    updated_at: datetime = Field(..., description="The timestamp of the document update")
     content_type: str = Field(..., description="The content type of the document")
     connector_name: str = Field(..., description="The name of the connector")
diff --git a/selfie/api/index_documents.py b/selfie/api/index_documents.py
index 31b9f7e..a093e47 100644
--- a/selfie/api/index_documents.py
+++ b/selfie/api/index_documents.py
@@ -53,7 +53,7 @@ async def update_index_document(document_id: int, document: EmbeddingDocumentMod
 async def delete_index_document(document_id: int):
     # Sometimes self.embeddings.save() errors on "database is locked", bricks it
     # raise HTTPException(status_code=501, detail="Not implemented")
-    DataIndex("n/a").delete_document(document_id)
+    await DataIndex("n/a").delete_document(document_id)
     return {"message": "Document deleted successfully"}
diff --git a/selfie/connectors/text_files/uischema.json b/selfie/connectors/text_files/uischema.json
index f4ef5a6..f886166 100644
--- a/selfie/connectors/text_files/uischema.json
+++ b/selfie/connectors/text_files/uischema.json
@@ -1,8 +1,5 @@
 {
   "files": {
-    "ui:widget": "nativeFile",
-    "ui:options": {
-      "accept": ".json"
-    }
+    "ui:widget": "nativeFile"
   }
 }
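Side note on the FetchedDocument change above (created_at/updated_at are now typed as datetime): a minimal standalone sketch of what this buys — pydantic validates and coerces ISO-8601 strings coming out of the database instead of passing raw strings through. The class name here is illustrative, not an import from selfie:

```python
# Standalone sketch, not part of the diff: datetime-typed fields are parsed and validated.
from datetime import datetime
from pydantic import BaseModel, Field

class FetchedDocumentSketch(BaseModel):
    created_at: datetime = Field(..., description="The timestamp of the document creation")
    updated_at: datetime = Field(..., description="The timestamp of the document update")

doc = FetchedDocumentSketch(created_at="2024-03-01T12:00:00", updated_at=datetime.now())
print(doc.created_at.year)  # already a datetime, not a str
```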
diff --git a/selfie/embeddings/TxtaiVectorStore.py b/selfie/embeddings/TxtaiVectorStore.py
new file mode 100644
index 0000000..0025e3a
--- /dev/null
+++ b/selfie/embeddings/TxtaiVectorStore.py
@@ -0,0 +1,305 @@
+"""txtai Vector store index.
+
+An index that is built on top of an existing txtai vector store.
+"""
+import json
+import logging
+import os
+import pickle
+from pathlib import Path
+from typing import Any, Dict, List, Optional, cast
+
+import fsspec
+import numpy as np
+from fsspec.implementations.local import LocalFileSystem
+from llama_index.core.bridge.pydantic import PrivateAttr
+from llama_index.core.schema import BaseNode, Node
+from llama_index.core.vector_stores.simple import DEFAULT_VECTOR_STORE, NAMESPACE_SEP
+from llama_index.core.vector_stores.types import (
+    DEFAULT_PERSIST_DIR,
+    DEFAULT_PERSIST_FNAME,
+    BasePydanticVectorStore,
+    ExactMatchFilter,
+    MetadataFilters,
+    VectorStoreQuery,
+    VectorStoreQueryMode,
+    VectorStoreQueryResult,
+)
+from txtai import Embeddings
+
+logger = logging.getLogger(__name__)
+
+IMPORT_ERROR_MSG = """
+    `txtai` package not found. For instructions on
+    how to install `txtai` please visit
+    https://neuml.github.io/txtai/install/
+"""
+
+DEFAULT_PERSIST_PATH = os.path.join(
+    DEFAULT_PERSIST_DIR, f"{DEFAULT_VECTOR_STORE}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}"
+)
+
+
+class TxtaiVectorStore(BasePydanticVectorStore):
+    _embeddings = PrivateAttr()
+
+    def __init__(
+        self,
+        txtai_index: Any,
+        stores_text: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize params."""
+        try:
+            import txtai
+        except ImportError:
+            raise ImportError(IMPORT_ERROR_MSG)
+
+        self._embeddings = cast(txtai.embeddings.Embeddings, txtai_index)
+
+        # Content-enabled txtai indexes keep the document text, so the store can return it.
+        if self._embeddings.config.get("content"):
+            stores_text = True
+
+        super().__init__(stores_text=stores_text, **kwargs)
+
+    def add(
+        self,
+        nodes: List[BaseNode],
+        tags: Optional[str] = None,
+        **add_kwargs: Any,
+    ) -> List[str]:
+        """Add nodes to index.
+
+        Documents are passed to txtai as (id, data, tags) tuples; data is either
+        the node text plus metadata (content-enabled indexes) or the raw embedding.
+        """
+        docs = []
+        for node in nodes:
+            logger.debug(f"Adding node {node.node_id} with metadata {node.metadata}")
+            if self.stores_text:
+                doc = (node.node_id, {"text": node.get_text(), **node.metadata}, tags)
+            else:
+                doc = (node.node_id, node.get_embedding())
+            docs.append(doc)
+
+        self._embeddings.index(docs)  # TODO: should this be upsert?
+        return [node.node_id for node in nodes]
+
+    def delete(
+        self,
+        ref_doc_id: str,
+        **delete_kwargs: Any
+    ) -> None:
+        """
+        Delete nodes using ref_doc_id.
+
+        Args:
+            ref_doc_id (str): The doc_id of the document to delete.
+ + """ + self._embeddings.delete([int(ref_doc_id)]) + + @property + def client(self) -> Any: + """Return txtai client.""" + return self._embeddings + + @staticmethod + def _build_query(query: VectorStoreQuery) -> str: + """Build SQL query string from VectorStoreQuery.""" + query_str = "SELECT * FROM txtai" + + if query.filters is not None: + filter_strs = [] + for filter in query.filters.filters: + if isinstance(filter, ExactMatchFilter): + filter_strs.append(f"{filter.key} = '{filter.value}'") + else: + filter_strs.append(f"{filter.key} {filter.operator.value} '{filter.value}'") + + if filter_strs: + condition = query.filters.condition.value.lower() + query_str += f" WHERE {f' {condition} '.join(filter_strs)}" + + if query.query_str is not None: + if query.filters is not None: + query_str += " AND " + else: + query_str += " WHERE " + query_str += f"similar('{query.query_str}')" + + query_str += f" LIMIT {query.similarity_top_k}" + return query_str + + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + """Query index for top k most similar nodes.""" + nodes = [] + similarities = [] + ids = [] + + if query.mode == VectorStoreQueryMode.DEFAULT: + if not self._embeddings.ann: + raise ValueError("Dense index not enabled.") + weights = None + elif query.mode == VectorStoreQueryMode.SPARSE: + if not self._embeddings.scoring: + raise ValueError("Sparse index not enabled.") + weights = 0 + else: # VectorStoreQueryMode.HYBRID + if query.alpha is None: + raise ValueError("Alpha must be specified for hybrid search.") + weights = query.alpha + + if query.query_str is not None: + sql_query = self._build_query(query) + logger.debug(f"SQL query: {sql_query}") + results = self._embeddings.search(sql_query, weights=weights, limit=query.similarity_top_k) + + for result in results: + if self.stores_text: + metadata = json.loads(result["data"]) + node = Node( + id_=result["id"], + text=result["text"], + metadata=metadata, + tags=result.get("tags", []), + relationships={}, + ) + else: + node = Node(id_=result["id"]) + nodes.append(node) + similarities.append(result["score"]) + ids.append(result["id"]) + else: + if not self._embeddings.ann: + raise ValueError("Dense index not enabled for embedding-based queries.") + + query_embedding = cast(List[float], query.query_embedding) + query_embedding_np = np.array(query_embedding, dtype="float32")[np.newaxis, :] + + # TODO: + # if query_embedding_np.shape[1] != self._embeddings.config["dimension"]: + # ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^ + # KeyError: 'dimension' + # + # if query_embedding_np.shape[1] != self._embeddings.config["dimension"]: + # raise ValueError( + # f"Query embedding dimension {query_embedding_np.shape[1]} does not match " + # f"the expected dimension {self._embeddings.config['dimension']}." 
+            #     )
+
+            # TODO: IDs are coming back as -1 for documents created via self.add, even though _embeddings.search("DOWN") returns them
+            results = self._embeddings.ann.search(query_embedding_np, query.similarity_top_k)[0]
+            logger.debug(f"ANN results: {results}")
+
+            for result in results:
+                doc_id = str(result[0])
+                if self.stores_text:
+                    logger.debug(f"SELECT * FROM txtai WHERE id = '{doc_id}' LIMIT 1")
+                    metadata = self._embeddings.search(f"SELECT * FROM txtai WHERE id = '{doc_id}' LIMIT 1")[0]
+                    node = Node(
+                        id_=doc_id,
+                        text=metadata["text"],
+                        metadata=json.loads(metadata["data"]),
+                        tags=metadata.get("tags", []),
+                        relationships={},
+                    )
+                else:
+                    node = Node(id_=doc_id)
+                nodes.append(node)
+                similarities.append(result[1])
+                ids.append(doc_id)
+
+        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
+
+    def persist(
+        self,
+        persist_path: str = DEFAULT_PERSIST_PATH,
+        fs: Optional[fsspec.AbstractFileSystem] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Save to file.
+
+        This method saves the vector store to disk.
+
+        Args:
+            persist_path (str): The save_path of the file.
+            fs (fsspec.AbstractFileSystem): The filesystem to use.
+
+        """
+        if fs and not isinstance(fs, LocalFileSystem):
+            raise NotImplementedError("txtai only supports local storage for now.")
+
+        dirpath = Path(persist_path).parent
+        dirpath.mkdir(exist_ok=True)
+
+        # Determine if config is json or pickle
+        jsonconfig = self._embeddings.config.get("format", "pickle") == "json"
+        config_path = dirpath / "config.json" if jsonconfig else dirpath / "config"
+
+        # Write configuration
+        with open(
+            config_path,
+            "w" if jsonconfig else "wb",
+            encoding="utf-8" if jsonconfig else None,
+        ) as f:
+            if jsonconfig:
+                # Write config as JSON
+                json.dump(self._embeddings.config, f, default=str)
+            else:
+                from txtai.version import __pickle__
+
+                # Write config as pickle format
+                pickle.dump(self._embeddings.config, f, protocol=__pickle__)
+
+        self._embeddings.save(persist_path, **kwargs)
+
+    @classmethod
+    def from_persist_path(
+        cls,
+        persist_path: str,
+        fs: Optional[fsspec.AbstractFileSystem] = None,
+    ) -> "TxtaiVectorStore":
+        """Load the vector store from disk."""
+        try:
+            import txtai
+        except ImportError:
+            raise ImportError(IMPORT_ERROR_MSG)
+
+        if fs and not isinstance(fs, LocalFileSystem):
+            raise NotImplementedError("txtai only supports local storage for now.")
+
+        if not os.path.exists(persist_path):
+            raise ValueError(f"No existing {__name__} found at {persist_path}.")
+
+        logger.info(f"Loading {__name__} config from {persist_path}.")
+        parent_directory = Path(persist_path).parent
+        config_path = parent_directory / "config.json"
+        jsonconfig = config_path.exists()
+        # Determine if config is json or pickle
+        config_path = config_path if jsonconfig else parent_directory / "config"
+        # Load configuration
+        with open(config_path, "r" if jsonconfig else "rb") as f:
+            config = json.load(f) if jsonconfig else pickle.load(f)
+
+        logger.info(f"Loading {__name__} from {persist_path}.")
+
+        txtai_index = Embeddings(config=config)
+        txtai_index.load(persist_path)
+
+        return cls(txtai_index=txtai_index)
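For orientation, here is a rough, untested sketch of how the new TxtaiVectorStore is meant to be wired into llama-index. It mirrors the recall2() code in selfie/embeddings/__init__.py below; the embedding model matches the diff, while the OpenAILike model name and sample document are placeholders:

```python
# Rough wiring sketch (mirrors recall2() below; untested, placeholder LLM name).
from llama_index.core import ServiceContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai_like import OpenAILike
from txtai import Embeddings

from selfie.embeddings.TxtaiVectorStore import TxtaiVectorStore

# content=True makes the wrapper report stores_text=True; hybrid=True enables sparse+dense search.
embeddings = Embeddings(path="sentence-transformers/all-MiniLM-L6-v2", content=True, hybrid=True)
embeddings.index([("doc-1", {"text": "hello vector stores", "source": "example"}, None)])

service_context = ServiceContext.from_defaults(
    llm=OpenAILike(api_base="http://localhost:5000/v1", api_key="none",
                   model="local-model", is_chat_model=True),  # placeholder model name
    embed_model=HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2"),
)
index = VectorStoreIndex.from_vector_store(
    TxtaiVectorStore(txtai_index=embeddings), service_context=service_context
)
retriever = index.as_retriever(similarity_top_k=3)
for node_with_score in retriever.retrieve("hello"):
    print(node_with_score.score, node_with_score.node.get_content())
```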
diff --git a/selfie/embeddings/__init__.py b/selfie/embeddings/__init__.py
index 7c6630b..6926417 100644
--- a/selfie/embeddings/__init__.py
+++ b/selfie/embeddings/__init__.py
@@ -9,8 +9,13 @@ import humanize
 import logging
 import tiktoken
+from llama_index.core import ServiceContext, VectorStoreIndex
 from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core.vector_stores.types import VectorStoreQueryMode, MetadataFilters, MetadataFilter, FilterOperator
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.llms.openai_like import OpenAILike
+from selfie.embeddings.TxtaiVectorStore import TxtaiVectorStore
 
 from selfie.config import get_app_config
 from selfie.data_generators.chat_training_data import (
     ChatTrainingDataGenerator,
@@ -74,6 +79,8 @@ def __init__(self, character_name, storage_path: str = config.embeddings_storage
         self.completion = completion or get_default_completion()
         self.character_name = character_name
         self.embeddings = Embeddings(
+            path="sentence-transformers/all-MiniLM-L6-v2",
+            hybrid=True,
             sqlite={"wal": True},
             # For now, sqlite w/the default driver is the only way to use WAL.
             content=True
@@ -334,6 +341,104 @@ async def index(self, documents: List[EmbeddingDocumentModel], extract_importanc
         return with_importance  # TODO: return document with ID, if possible
 
+    async def recall2(
+        self,
+        topic: str,
+        topic_context=None,
+        character_name: Optional[str] = None,
+        limit=5,
+        importance_weight=0,
+        recency_weight=1,
+        relevance_weight=1,
+        include_summary=True,
+        local_llm=True,
+        min_score=0.4,
+        hybrid_search_weight=0.5,  # TODO: testing needed
+    ):
+        if min_score is None:
+            min_score = 0.4
+
+        if not self.has_data():
+            return {"documents": [], "summary": "No documents found.", "mean_score": 0}
+
+        service_context = ServiceContext.from_defaults(
+            llm=OpenAILike(
+                api_base="http://localhost:5000/v1",
+                api_key="none",
+                model="TheBloke_Mistral-7B-Instruct-v0.2-GPTQ_gptq-8bit-32g-actorder_True",
+                is_chat_model=True
+            ),
+            embed_model=HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2"),
+        )
+
+        index = VectorStoreIndex.from_vector_store(TxtaiVectorStore(txtai_index=self.embeddings), service_context=service_context)
+
+        retriever = index.as_retriever(
+            similarity_top_k=limit,
+            vector_store_query_mode=VectorStoreQueryMode.HYBRID,
+            filters=MetadataFilters(
+                filters=[
+                    MetadataFilter(key="source", operator=FilterOperator.EQ, value="whatsapp")
+                    # MetadataFilter(key="source", operator=FilterOperator.EQ, value="text_files")
+                ]
+            ),
+            alpha=hybrid_search_weight,
+            doc_ids=None,  # TODO: use this to filter by a set of embedding documents
+            vector_store_kwargs={},  # TODO: use this to pass additional parameters to the vector store
+            sparse_top_k=limit,
+        )
+
+        documents_list: List[ScoredEmbeddingDocumentModel] = []
+        for node_with_score in retriever.retrieve(topic):
+            node = node_with_score.node
+            score = node_with_score.score
+            document = EmbeddingDocumentModel(
+                id=node.metadata.get("id"),
+                text=node.get_text(),
+                timestamp=node.metadata.get("timestamp"),
+                importance=node.metadata.get("importance"),
+                source=node.metadata.get("source"),
+                updated_timestamp=node.metadata.get("updated_timestamp"),
+                source_document_id=node.metadata.get("source_document_id"),
+            )
+            relevance_score = self.relevance_scorer.calculate_score(document, score)
+            recency_score = self.recency_scorer.calculate_score(document)
+            importance_score = document.importance
+            retrieval_score = self._calculate_retrieval_score(
+                recency_score,
+                relevance_score,
+                importance_score,
+                importance_weight,
+                recency_weight,
+                relevance_weight,
+            )
+            documents_list.append(
+                ScoredEmbeddingDocumentModel(
+                    **document.model_dump(),
+                    score=retrieval_score,
+                    relevance=relevance_score,
+                    recency=recency_score,
+                )
+            )
+
+        documents_list = [m for m in documents_list if m.score > min_score]
+        if len(documents_list) == 0:
+            return {"documents": [], "summary": "No documents found.", "mean_score": 0}
+
+        documents_list.sort(key=lambda x: x.score, reverse=True)
+        short_documents_list = documents_list[:limit]
+        summary = (
+            await self._summarize_documents(
+                character_name,
+                short_documents_list,
+                topic_context if topic_context else topic,
+                ("local" if local_llm else None),
+            )
+            if include_summary
+            else None
+        )
+        return {"documents": short_documents_list, "summary": summary, "mean_score": sum([m.score for m in short_documents_list]) / len(short_documents_list)}
+
     async def recall(
         self,
         topic: str,
@@ -346,6 +451,7 @@ async def recall(
         include_summary=True,
         local_llm=True,
         min_score=0.4,
+        hybrid_search_weight=0.5,  # TODO: testing needed
     ):
         if min_score is None:
             min_score = 0.4
@@ -354,7 +460,7 @@ async def recall(
             return {"documents": [], "summary": "No documents found.", "mean_score": 0}
 
         self.embeddings.load(self.storage_path)
-        results = self._query(where="similar(:topic)", parameters={"topic": topic}, limit=limit)
+        results = self._query(where=f"similar(:topic, {hybrid_search_weight})", parameters={"topic": topic}, limit=limit)
 
         documents_list: List[ScoredEmbeddingDocumentModel] = []
         for result in results:
             document = EmbeddingDocumentModel(
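Finally, a rough sketch of exercising the new hybrid-recall path end to end, assuming a DataIndex that already has indexed documents; the character name, topic, and weights are illustrative only:

```python
# Untested sketch: drives the new recall2()/hybrid_search_weight path added above.
import asyncio

from selfie.embeddings import DataIndex

async def main():
    index = DataIndex("Alice")  # character name, as used elsewhere in the codebase
    result = await index.recall2(
        topic="weekend plans",
        limit=5,
        hybrid_search_weight=0.5,  # passed through as alpha to blend sparse and dense scores
        include_summary=False,
    )
    for doc in result["documents"]:
        print(doc.score, doc.text)
    print("mean score:", result["mean_score"])

asyncio.run(main())
```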