diff --git a/.github/workflows/license_tests.yml b/.github/workflows/license_tests.yml index 40414385..3b7ff929 100644 --- a/.github/workflows/license_tests.yml +++ b/.github/workflows/license_tests.yml @@ -32,7 +32,7 @@ jobs: pip freeze > requirements-all.txt - name: Check python id: license_check_report - uses: pilosus/action-pip-license-checker@v0.5.0 + uses: pilosus/action-pip-license-checker@v3.1.0 with: requirements: 'requirements-all.txt' fail: 'Copyleft,Other,Error' diff --git a/.github/workflows/pipaudit.yml b/.github/workflows/pipaudit.yml index f1d74be5..0a16cc27 100644 --- a/.github/workflows/pipaudit.yml +++ b/.github/workflows/pipaudit.yml @@ -29,7 +29,7 @@ jobs: - name: Install package run: | pip install . - - uses: pypa/gh-action-pip-audit@v1.0.0 + - uses: pypa/gh-action-pip-audit@v1.1.0 with: # Ignore setuptools vulnerability we can't do much about ignore-vulns: | diff --git a/CHANGELOG.md b/CHANGELOG.md index af6d2ab3..dbd034fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,49 @@ # Changelog -## [2.2.0a1](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.0a1) (2026-01-28) +## [2.2.3a1](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.3a1) (2026-01-30) -[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.1.1...2.2.0a1) +[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.2.2a1...2.2.3a1) **Merged pull requests:** -- feat: introduce base class for "memory" plugins [\#363](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/363) ([JarbasAl](https://github.com/JarbasAl)) +- fix: explicitly model streaming tokens vs sentences [\#374](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/374) ([JarbasAl](https://github.com/JarbasAl)) + +## [2.2.2a1](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.2a1) (2026-01-29) + +[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.2.1a5...2.2.2a1) + +**Merged pull requests:** + +- fix: 
deprecated import [\#372](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/372) ([JarbasAl](https://github.com/JarbasAl)) + +## [2.2.1a5](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.1a5) (2026-01-29) + +[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.2.1a4...2.2.1a5) + +**Merged pull requests:** + +- Deprecate solver plugins [\#365](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/365) ([JarbasAl](https://github.com/JarbasAl)) + +## [2.2.1a4](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.1a4) (2026-01-29) + +[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.2.1a3...2.2.1a4) + +## [2.2.1a3](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.1a3) (2026-01-29) + +[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.2.1a1...2.2.1a3) + +**Merged pull requests:** + +- chore\(deps\): update pilosus/action-pip-license-checker action to v3 [\#355](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/355) ([renovate[bot]](https://github.com/apps/renovate)) +- chore\(deps\): update pypa/gh-action-pip-audit action to v1.1.0 - autoclosed [\#351](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/351) ([renovate[bot]](https://github.com/apps/renovate)) + +## [2.2.1a1](https://github.com/OpenVoiceOS/ovos-plugin-manager/tree/2.2.1a1) (2026-01-29) + +[Full Changelog](https://github.com/OpenVoiceOS/ovos-plugin-manager/compare/2.2.0...2.2.1a1) + +**Merged pull requests:** + +- chore: move 3rd party code to own file [\#361](https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/361) ([JarbasAl](https://github.com/JarbasAl)) diff --git a/ovos_plugin_manager/agents.py b/ovos_plugin_manager/agents.py index 1c026efa..f4097189 100644 --- a/ovos_plugin_manager/agents.py +++ b/ovos_plugin_manager/agents.py @@ -1,5 +1,9 @@ from typing import Dict, Type -from ovos_plugin_manager.templates.agents import AgentContextManager, MultimodalAdapter + +from 
ovos_plugin_manager.templates.agents import ( + AgentContextManager, MultimodalAdapter, RetrievalEngine, ChatEngine, MultimodalChatEngine, SummarizerEngine, + ChatSummarizerEngine, ExtractiveQAEngine, ReRankerEngine, YesNoEngine, NaturalLanguageInferenceEngine, + DocumentIndexerEngine, QAIndexerEngine, CoreferenceEngine) from ovos_plugin_manager.utils import PluginTypes @@ -39,3 +43,231 @@ def load_multimodal_adapter_plugin(module_name: str) -> Type[MultimodalAdapter]: """ from ovos_plugin_manager.utils import load_plugin return load_plugin(module_name, PluginTypes.AGENT_MULTIMODAL_ADAPTER) + + +def find_retrieval_plugins() -> Dict[str, Type[RetrievalEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_RETRIEVAL) + + +def load_retrieval_plugin(module_name: str) -> Type[RetrievalEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_RETRIEVAL) + + +def find_chat_plugins() -> Dict[str, Type[ChatEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_CHAT) + + +def load_chat_plugin(module_name: str) -> Type[ChatEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_CHAT) + + +def find_multimodal_chat_plugins() -> Dict[str, Type[MultimodalChatEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import 
find_plugins + return find_plugins(PluginTypes.AGENT_CHAT_MULTIMODAL) + + +def load_multimodal_chat_plugin(module_name: str) -> Type[MultimodalChatEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_CHAT_MULTIMODAL) + + +def find_summarizer_plugins() -> Dict[str, Type[SummarizerEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_SUMMARIZER) + + +def load_summarizer_plugin(module_name: str) -> Type[SummarizerEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_SUMMARIZER) + + +def find_chat_summarizer_plugins() -> Dict[str, Type[ChatSummarizerEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_CHAT_SUMMARIZER) + + +def load_chat_summarizer_plugin(module_name: str) -> Type[ChatSummarizerEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_CHAT_SUMMARIZER) + + +def find_extractive_qa_plugins() -> Dict[str, Type[ExtractiveQAEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_EXTRACTIVE_QA) + + +def 
load_extractive_qa_plugin(module_name: str) -> Type[ExtractiveQAEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_EXTRACTIVE_QA) + + +def find_reranker_plugins() -> Dict[str, Type[ReRankerEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_RERANKER) + + +def load_reranker_plugin(module_name: str) -> Type[ReRankerEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_RERANKER) + + +def find_yesno_plugins() -> Dict[str, Type[YesNoEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_YES_NO) + + +def load_yesno_plugin(module_name: str) -> Type[YesNoEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_YES_NO) + + +def find_natural_language_inference_plugins() -> Dict[str, Type[NaturalLanguageInferenceEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_NLI) + + +def load_natural_language_inference_plugin(module_name: str) -> Type[NaturalLanguageInferenceEngine]: + """ + Get an uninstantiated class for the requested module_name + 
@param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_NLI) + + +def find_document_indexer_plugins() -> Dict[str, Type[DocumentIndexerEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_DOC_RETRIEVAL) + + +def load_document_indexer_plugin(module_name: str) -> Type[DocumentIndexerEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_DOC_RETRIEVAL) + + +def find_qa_indexer_plugins() -> Dict[str, Type[QAIndexerEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_QA_RETRIEVAL) + + +def load_qa_indexer_plugin(module_name: str) -> Type[QAIndexerEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import load_plugin + return load_plugin(module_name, PluginTypes.AGENT_QA_RETRIEVAL) + + +def find_coreference_plugins() -> Dict[str, Type[CoreferenceEngine]]: + """ + Find all installed plugins + @return: dict plugin names to entrypoints + """ + from ovos_plugin_manager.utils import find_plugins + return find_plugins(PluginTypes.AGENT_COREF) + + +def load_coreference_plugin(module_name: str) -> Type[CoreferenceEngine]: + """ + Get an uninstantiated class for the requested module_name + @param module_name: Plugin entrypoint name to load + @return: Uninstantiated class + """ + from ovos_plugin_manager.utils import 
load_plugin + return load_plugin(module_name, PluginTypes.AGENT_COREF) \ No newline at end of file diff --git a/ovos_plugin_manager/audio2ipa.py b/ovos_plugin_manager/audio2ipa.py index 9c1dcbbd..35186bae 100644 --- a/ovos_plugin_manager/audio2ipa.py +++ b/ovos_plugin_manager/audio2ipa.py @@ -1,5 +1,5 @@ from typing import Optional -from ovos_plugin_manager.utils import normalize_lang, PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_plugin_manager.templates.audio2ipa import Audio2IPA from ovos_utils.log import LOG diff --git a/ovos_plugin_manager/coreference.py b/ovos_plugin_manager/coreference.py index 0f94274c..26a47064 100644 --- a/ovos_plugin_manager/coreference.py +++ b/ovos_plugin_manager/coreference.py @@ -1,11 +1,18 @@ from typing import Optional -from ovos_plugin_manager.utils import normalize_lang, PluginTypes, \ - PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_config import Configuration from ovos_utils.log import LOG from ovos_plugin_manager.templates.coreference import CoreferenceSolverEngine, \ replace_coreferences +from ovos_utils.log import log_deprecation +from ovos_plugin_manager.version import VERSION_MAJOR + + +log_deprecation("coreference plugins have been deprecated and will be removed in the next major release.\n" + "Please migrate your code to use AbstractAgentEngine.\n" + "The new classes live in ovos_plugin_manager.templates.agents", + f"{VERSION_MAJOR + 1}.0.0") def find_coref_plugins() -> dict: diff --git a/ovos_plugin_manager/g2p.py b/ovos_plugin_manager/g2p.py index 63d41996..dfc3e27b 100644 --- a/ovos_plugin_manager/g2p.py +++ b/ovos_plugin_manager/g2p.py @@ -1,6 +1,6 @@ from typing import Optional from ovos_config import Configuration -from ovos_plugin_manager.utils import normalize_lang, PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from 
ovos_plugin_manager.templates.g2p import Grapheme2PhonemePlugin, PhonemeAlphabet from ovos_utils.log import LOG diff --git a/ovos_plugin_manager/keywords.py b/ovos_plugin_manager/keywords.py index 1de2d34b..1406af47 100644 --- a/ovos_plugin_manager/keywords.py +++ b/ovos_plugin_manager/keywords.py @@ -1,4 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_config import Configuration from ovos_utils.log import LOG from ovos_plugin_manager.templates.keywords import KeywordExtractor diff --git a/ovos_plugin_manager/metadata_transformers.py b/ovos_plugin_manager/metadata_transformers.py index c70fbc8e..8ded55ac 100644 --- a/ovos_plugin_manager/metadata_transformers.py +++ b/ovos_plugin_manager/metadata_transformers.py @@ -1,5 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, PluginTypes, \ - PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_plugin_manager.templates.transformers import MetadataTransformer diff --git a/ovos_plugin_manager/postag.py b/ovos_plugin_manager/postag.py index 4a05e09e..47979d7c 100644 --- a/ovos_plugin_manager/postag.py +++ b/ovos_plugin_manager/postag.py @@ -1,5 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, PluginTypes, \ - PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_config import Configuration from ovos_utils.log import LOG from ovos_plugin_manager.templates.postag import PosTagger diff --git a/ovos_plugin_manager/segmentation.py b/ovos_plugin_manager/segmentation.py index 57528334..b3939810 100644 --- a/ovos_plugin_manager/segmentation.py +++ b/ovos_plugin_manager/segmentation.py @@ -1,5 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, \ - PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_config import Configuration 
from ovos_utils.log import LOG from ovos_plugin_manager.templates.segmentation import Segmenter diff --git a/ovos_plugin_manager/solvers.py b/ovos_plugin_manager/solvers.py index bfa2f2a6..46fbc07b 100644 --- a/ovos_plugin_manager/solvers.py +++ b/ovos_plugin_manager/solvers.py @@ -2,6 +2,14 @@ EntailmentSolver, MultipleChoiceSolver, EvidenceSolver, ChatMessageSolver from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes +from ovos_utils.log import log_deprecation +from ovos_plugin_manager.version import VERSION_MAJOR + +log_deprecation("solver plugins have been deprecated and will be removed in the next major release.\n" + "Please migrate your code to use AbstractAgentEngine.\n" + "The new classes live in ovos_plugin_manager.templates.agents", + f"{VERSION_MAJOR + 1}.0.0") + def find_chat_solver_plugins() -> dict: """ diff --git a/ovos_plugin_manager/stt.py b/ovos_plugin_manager/stt.py index 2194c20f..3cd0623e 100644 --- a/ovos_plugin_manager/stt.py +++ b/ovos_plugin_manager/stt.py @@ -1,5 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, \ - PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_config import Configuration from ovos_plugin_manager.utils.config import get_valid_plugin_configs, \ sort_plugin_configs, get_plugin_config diff --git a/ovos_plugin_manager/templates/agents.py b/ovos_plugin_manager/templates/agents.py index 0b99ef3a..59d2908c 100644 --- a/ovos_plugin_manager/templates/agents.py +++ b/ovos_plugin_manager/templates/agents.py @@ -1,8 +1,14 @@ import abc +import difflib +import time from abc import ABC from dataclasses import dataclass, field from enum import Enum -from typing import Optional, List, Dict, Any +from typing import Optional, List, Iterable, Tuple, Union, Dict, Any + +from ovos_bus_client.session import SessionManager, Session +from ovos_utils.lang import standardize_lang_tag +from ovos_utils.log import LOG class MessageRole(str, Enum): @@ -120,9 
+126,9 @@ class MultimodalAgentMessage(AgentMessage): """ role: MessageRole content: str - image_content: List[str] = field(default_factory=list) # b64 encoded - audio_content: List[str] = field(default_factory=list) # b64 encoded - file_content: List[str] = field(default_factory=list) # b64 encoded + image_content: Optional[List[str]] = field(default_factory=list) # b64 encoded + audio_content: Optional[List[str]] = field(default_factory=list) # b64 encoded + file_content: Optional[List[str]] = field(default_factory=list) # b64 encoded class MultimodalAdapter(ABC): @@ -134,3 +140,631 @@ class MultimodalAdapter(ABC): @abc.abstractmethod def convert(self, message: MultimodalAgentMessage) -> AgentMessage: raise NotImplementedError() + + +######## +# Agent engines replace the previous "solver plugins" +# each task now has a well-defined api contract +# automatic translation is no longer implemented +######## +class AbstractAgentEngine(ABC): + """ + Base class for agent engines that process input to produce specific outputs. + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None): + """ + Initializes the engine. + + Args: + config (dict): Configuration mapping for the specific engine. + """ + self.config = config or {} + + @property + def lang(self) -> str: + """Get default language from config or SessionManager.""" + lang = self.config.get("lang") or SessionManager.get().lang + return standardize_lang_tag(lang) + + +class RetrievalEngine(AbstractAgentEngine): + """ + Interface for querying external or internal knowledge bases. + + Supports integrations with remote APIs (Wikipedia, Wolfram Alpha) + or local databases. + """ + + @abc.abstractmethod + def query(self, query: str, lang: Optional[str] = None, k: int = 3) -> List[Tuple[str, float]]: + """ + Searches the knowledge base for relevant documents or data. + + Args: + query: The search string. + lang: BCP-47 language code. + k: The maximum number of results to return. 
+ + Returns: + List of tuples (content, score) for the top k matches. + """ + raise NotImplementedError + + +class ChatEngine(AbstractAgentEngine): + """ + An engine designed for multi-turn conversations using message list formats. + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Knock knock."}, + {"role": "assistant", "content": "Who's there?"}, + {"role": "user", "content": "Orange."}, + ] + + ChatEngine plugins are responsible for filtering any unsupported roles + """ + + @abc.abstractmethod + def continue_chat(self, messages: List[AgentMessage], + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> AgentMessage: + """ + Generate a response message based on the provided chat history. + + Args: + messages (List[AgentMessage]): Full list of messages in the conversation. + session_id (str): Identifier for the session. + lang (str, optional): BCP-47 language code. + units (str, optional): Preferred unit system (e.g., "metric", "imperial"). + + Returns: + AgentMessage: The generated response message from the assistant. + """ + raise NotImplementedError() + + def stream_tokens(self, messages: List[AgentMessage], + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> Iterable[str]: + """ + Stream back response tokens as they are generated. + + Returns partial sentences and is not suitable for direct TTS. + + Once merged the output corresponds to the content of a AgentMessage with MessageRole.ASSISTANT + + Note: + Default implementation yields the full response from continue_chat. + Subclasses should override this for real-time token streaming. + + Args: + messages (List[AgentMessage]): Full list of messages. + session_id (str): Identifier for the session. + lang (str, optional): Language code. + units (str, optional): Unit system. + + Returns: + Iterable[str]: A stream of tokens/partial text. 
+ """ + yield from self.continue_chat(messages, session_id, lang, units).content.split() + + def stream_sentences(self, messages: List[AgentMessage], + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> Iterable[str]: + """ + Stream back response sentences as they are generated. + + Returns full sentences only, suitable for direct TTS. + + Once merged the output corresponds to the content of a AgentMessage with MessageRole.ASSISTANT + + Note: + Default implementation yields the full response from continue_chat. + Subclasses should override this for real-time sentence streaming. + + Args: + messages (List[AgentMessage]): Full list of messages. + session_id (str): Identifier for the session. + lang (str, optional): Language code. + units (str, optional): Unit system. + + Returns: + Iterable[str]: A stream of complete sentences. + """ + yield from self.continue_chat(messages, session_id, lang, units).content.split("\n") + + def get_response(self, utterance: str, + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> str: + """ + High-level wrapper for single-turn text-in/text-out interactions. + + Args: + utterance: The user's input string. + session_id: The session identifier. + lang: BCP-47 language code. + units: Preferred measurement system. + + Returns: + The plain-text content of the assistant's response. + """ + message = AgentMessage(role=MessageRole.USER, content=utterance) + return self.continue_chat(messages=[message], + session_id=session_id, + lang=lang, + units=units).content + + +class MultimodalChatEngine(ChatEngine): + """ + An engine designed for multi-turn conversations using message list formats. 
+ + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Knock knock."}, + {"role": "assistant", "content": "Who's there?"}, + {"role": "user", "content": "Orange."}, + ] + """ + + @abc.abstractmethod + def continue_chat(self, messages: List[MultimodalAgentMessage], + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> MultimodalAgentMessage: + """ + Generate a response message based on the provided chat history. + + Args: + messages (List[AgentMessage]): Full list of messages in the conversation. + session_id (str): Identifier for the session. + lang (str, optional): BCP-47 language code. + units (str, optional): Preferred unit system (e.g., "metric", "imperial"). + + Returns: + AgentMessage: The generated response message from the assistant. + """ + raise NotImplementedError() + + def stream_chat(self, messages: List[MultimodalAgentMessage], + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> Iterable[MultimodalAgentMessage]: + """ + Stream back response messages as they are generated. + + Note: + Default implementation yields the full response from continue_chat. + Subclasses should override this for real-time token streaming. + + Args: + messages (List[AgentMessage]): Full list of messages. + session_id (str): Identifier for the session. + lang (str, optional): Language code. + units (str, optional): Unit system. + + Returns: + Iterable[AgentMessage]: A stream of response messages. 
+ """ + yield self.continue_chat(messages, session_id, lang, units) + + def get_response(self, utterance: str, + image_content: Optional[List[str]] = None, # b64 encoded + audio_content: Optional[List[str]] = None, # b64 encoded + file_content: Optional[List[str]] = None, # b64 encoded + session_id: str = "default", + lang: Optional[str] = None, + units: Optional[str] = None) -> str: + """ + High-level wrapper for single-turn text-in/text-out interactions. + + Args: + utterance: The user's input string. + session_id: The session identifier. + lang: BCP-47 language code. + units: Preferred measurement system. + + Returns: + The plain-text content of the assistant's response. + """ + message = MultimodalAgentMessage(role=MessageRole.USER, content=utterance, + image_content=image_content, + audio_content=audio_content, + file_content=file_content) + return self.continue_chat(messages=[message], + session_id=session_id, + lang=lang, + units=units).content + + +class SummarizerEngine(AbstractAgentEngine): + """Engine designed for condensing long documents into concise summaries.""" + + @abc.abstractmethod + def summarize(self, document: str, lang: Optional[str] = None) -> str: + """ + Create a summary of the provided text. + + Args: + document (str): The full text to be summarized. + lang (str, optional): The language of the document. + + Returns: + str: The summarized text. + """ + raise NotImplementedError + + +class ChatSummarizerEngine(AbstractAgentEngine): + """Engine specialized in summarizing structured chat histories.""" + + @abc.abstractmethod + def summarize(self, messages: List[AgentMessage], lang: Optional[str] = None) -> str: + """ + Converts a list of AgentMessages into a narrative or bulleted summary. + + Args: + messages (List[AgentMessage]): Full list of messages in the conversation. + lang (str, optional): The language of the document. + + Returns: + str: The summarized text. 
+ """ + raise NotImplementedError + + +class ExtractiveQAEngine(AbstractAgentEngine): + """ + Engine for extractive Question Answering (QA). + + Identifies the specific segment of a text (the "evidence") that + answers a given question. + """ + + @abc.abstractmethod + def get_best_passage(self, evidence: str, question: str, + lang: Optional[str] = None) -> str: + """ + Extracts the most relevant passage from the evidence. + + Args: + evidence (str): The source text to search. + question (str): The query to answer. + lang (str, optional): The language of the texts. + + Returns: + str: The extracted passage answering the question. + """ + raise NotImplementedError + + +class ReRankerEngine(AbstractAgentEngine): + """ + Engine for evaluating and sorting a list of candidates against a query. + """ + + @abc.abstractmethod + def rerank(self, query: str, options: List[str], + lang: Optional[str] = None, + return_index: bool = False) -> List[Tuple[float, Union[str, int]]]: + """ + Score and rank a list of options against a query. + + Args: + query (str): The search or selection query. + options (List[str]): Potential candidates to rank. + lang (str, optional): Language code. + return_index (bool): If True, returns the option index instead of text in the tuple. + + Returns: + List[Tuple[float, Union[str, int]]]: A sorted list of (score, option/index) pairs. + """ + raise NotImplementedError + + def select_answer(self, query: str, + options: List[str], + lang: Optional[str] = None, + return_index: bool = False) -> Union[str, int]: + """ + Select the single best answer from a list of options. + + Args: + query (str): The query to match. + options (List[str]): List of possible answers. + lang (str, optional): Language code. + return_index (bool): Whether to return the index of the option or the text. + + Returns: + Union[str, int]: The top-ranked option or its index. 
+ """ + return self.rerank(query, options, lang=lang, return_index=return_index)[0][1] + + +class YesNoEngine(AbstractAgentEngine): + """ + Engine for evaluating answers to yes/no questions. + + Determines if a user input means "yes", "no" or undefined + """ + + @abc.abstractmethod + def yes_or_no(self, question: str, response: str, lang: Optional[str] = None) -> Optional[bool]: + """ + True: user answered yes + False: user answered no + None: invalid/neutral answer + """ + raise NotImplementedError + + +class NaturalLanguageInferenceEngine(AbstractAgentEngine): + """ + Engine for Natural Language Inference (NLI). + + Determines if a 'hypothesis' is logically supported by a 'premise'. + """ + + @abc.abstractmethod + def predict_entailment(self, premise: str, hypothesis: str, + lang: Optional[str] = None) -> bool: + """ + Determine if the premise logically entails the hypothesis. + + Args: + premise (str): The base statement or context. + hypothesis (str): The statement to be verified against the premise. + lang (str, optional): Language code. + + Returns: + bool: True if the premise entails the hypothesis, False otherwise. + """ + raise NotImplementedError + + +class DocumentIndexerEngine(RetrievalEngine): + """ + A RetrievalEngine that supports document ingestion and local indexing. + """ + + @abc.abstractmethod + def ingest_corpus(self, corpus: List[str]): + """ + Adds a collection of documents to the local index. + + Args: + corpus: A list of text documents to be indexed. + """ + raise NotImplementedError + + @abc.abstractmethod + def query(self, query: str, lang: Optional[str] = None, k: int = 3) -> List[Tuple[str, float]]: + """Searches the ingested corpus for matching documents.""" + raise NotImplementedError + + +class QAIndexerEngine(RetrievalEngine): + """ + A RetrievalEngine specialized in indexing Question-Answer pairs. + """ + + @abc.abstractmethod + def ingest_corpus(self, corpus: Dict[str, str]): + """ + Adds question-answer pairs to the index. 
+ + Args: + corpus: A dictionary where keys are questions and values are answers. + """ + raise NotImplementedError + + @abc.abstractmethod + def query(self, query: str, lang: Optional[str] = None, k: int = 3) -> List[Tuple[str, float]]: + """ + Matches a user query against indexed questions and returns the best answers. + + Returns: + An iterable of (answer, score) tuples. + """ + raise NotImplementedError + + +class CoreferenceEngine(AbstractAgentEngine): + """ + Base class for Coreference Resolution engines in OVOS. + + This class manages the "State" (Context History), while the inheriting + plugin class provides the "Intelligence" (NLP Logic). + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None): + """ + Args: + config: Configuration dict. + keys: + 'lang': default language override + 'context_ttl': seconds to keep context (default: 120) + """ + super().__init__(config) + # Structure: { lang: { pronoun: [(entity, timestamp)] } } + self.context_data: Dict[str, Dict[str, List[Tuple[str, float]]]] = {} + + @property + def context_ttl(self) -> int: + """Time in seconds before a context entry is considered 'stale'.""" + return self.config.get("context_ttl", 120) + + # ========================================================================= + # Public API - Consumers call these + # ========================================================================= + def resolve(self, text: str, lang: Optional[str] = None, use_memory: bool = False) -> str: + """ + Main entry point. Resolves coreferences using both historical context + and the active NLP solver. + + Flow: + 1. Prune stale context (older than TTL). + 2. Apply known context (e.g., 'her' -> 'mom') to the text. + 3. Pass the result to the NLP solver plugin. + 4. Compare Input vs Output to learn NEW context for next time. + """ + lang = standardize_lang_tag(lang or self.lang) + + # 1. Cleanup old memories + self._prune_context(lang) + + # 2. 
Apply 'Vault' (Memory) Context + # This handles cases where we manually registered "her" = "mom" + if use_memory: + text_with_context = self._apply_memory(text, lang) + else: + text_with_context = text + + # 3. Apply 'Intelligence' (Plugin NLP) + # Only run expensive NLP if pronouns/ambiguity exist + if self.contains_corefs(text_with_context, lang): + final_solved = self.solve_corefs(text_with_context, lang) + else: + final_solved = text_with_context + + # 4. Update Memory + # If the NLP changed "it" to "the dog", we learn that for next time. + if use_memory: + self._learn_context(text_with_context, final_solved, lang) + + return final_solved + + def set_context(self, pronoun: str, entity: str, lang: Optional[str] = None): + """ + Manually inject context. + Useful for Skills to force a reference. + + Example: set_context("her", "mom") -> "Tell her hi" becomes "Tell mom hi" + """ + lang = standardize_lang_tag(lang or self.lang) + if lang not in self.context_data: + self.context_data[lang] = {} + + pronoun = pronoun.lower() + if pronoun not in self.context_data[lang]: + self.context_data[lang][pronoun] = [] + + # Insert at the top as the most recent + self.context_data[lang][pronoun].insert(0, (entity, time.time())) + + def reset_context(self, lang: Optional[str] = None): + """Clear context history. Call this at end of sessions.""" + if lang: + self.context_data[standardize_lang_tag(lang)] = {} + else: + self.context_data = {} + + # ========================================================================= + # Abstract Interface - Plugin Developers Implement These + # ========================================================================= + + @abc.abstractmethod + def solve_corefs(self, text: str, lang: str) -> str: + """ + Implement the actual coreference resolution logic here. + Example input: "I saw the dog. It was running." + Example output: "I saw the dog. The dog was running." 
+ """ + raise NotImplementedError() + + @abc.abstractmethod + def contains_corefs(self, text: str, lang: str) -> bool: + """ + Return True if the text contains words that need resolving (pronouns, references). + + Used to optimize performance by avoiding calls to self.solve_corefs. + + e.g. a basic implementation can match the input against a wordlist of language-specific pronouns. + """ + raise NotImplementedError() + + # ========================================================================= + # Internal Helpers + # ========================================================================= + + def _prune_context(self, lang: str): + """Remove context entries older than self.context_ttl.""" + if lang not in self.context_data: + return + + now = time.time() + ttl = self.context_ttl + + keys_to_remove = [] + for word, history in self.context_data[lang].items(): + # Keep only fresh entries (younger than the TTL) + valid_entries = [entry for entry in history if (now - entry[1]) < ttl] + + if not valid_entries: + keys_to_remove.append(word) + else: + self.context_data[lang][word] = valid_entries + + for k in keys_to_remove: + del self.context_data[lang][k] + + def _apply_memory(self, text: str, lang: str) -> str: + """Replace words in text based on current memory.""" + if lang not in self.context_data: + return text + + words = text.split() + dirty = False + + for i, word in enumerate(words): + w_lower = word.lower() + if w_lower in self.context_data[lang]: + # Get the most recent entity (index 0) + replacement_entity = self.context_data[lang][w_lower][0][0] + words[i] = replacement_entity + dirty = True + + return " ".join(words) if dirty else text + + def _learn_context(self, original: str, solved: str, lang: str): + """Diff original vs solved to extract new replacements and save them.""" + replacements = self._extract_replacements(original, solved) + + for pronoun, entities in replacements.items(): + # Register all identified replacements + for entity in entities: +
self.set_context(pronoun, entity, lang) + + @staticmethod + def _extract_replacements(original: str, solved: str) -> Dict[str, List[str]]: + """ + Compares the original text with the solved text to identify exactly + which words were replaced using difflib. + """ + + # 1. Tokenize inputs + seq_original = original.lower().split() + seq_solved = solved.lower().split() + + # 2. Diff the sequences + matcher = difflib.SequenceMatcher(None, seq_original, seq_solved) + + replacements: Dict[str, List[str]] = {} + + # 3. Extract replacements + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == 'replace': + old_phrase = " ".join(seq_original[i1:i2]) + new_phrase = " ".join(seq_solved[j1:j2]) + + if old_phrase not in replacements: + replacements[old_phrase] = [] + + if new_phrase not in replacements[old_phrase]: + replacements[old_phrase].append(new_phrase) + + return replacements diff --git a/ovos_plugin_manager/templates/coreference.py b/ovos_plugin_manager/templates/coreference.py index 8a4a9733..1adc3e90 100644 --- a/ovos_plugin_manager/templates/coreference.py +++ b/ovos_plugin_manager/templates/coreference.py @@ -4,10 +4,23 @@ from ovos_utils.process_utils import RuntimeRequirements from quebra_frases import word_tokenize import abc +from ovos_utils.log import LOG, log_deprecation, deprecated +from ovos_plugin_manager.version import VERSION_MAJOR + + +log_deprecation("ovos_plugin_manager.templates.coreference has been deprecated and will be removed in the next major release.\n" + "Please migrate your code to use CoreferenceEngine.\n" + "The new classes live in ovos_plugin_manager.templates.agents", + f"{VERSION_MAJOR + 1}.0.0") class CoreferenceSolverEngine: def __init__(self, config=None): + log_deprecation("CoreferenceSolverEngine has been deprecated and will be removed in the next major release.\n" + "Please migrate your code to use CoreferenceEngine.\n" + "The new classes live in ovos_plugin_manager.templates.agents", + f"{VERSION_MAJOR + 1}.0.0") + 
self.config = config or {} self._prev_sentence = "" self._prev_solved = "" diff --git a/ovos_plugin_manager/templates/solvers.py b/ovos_plugin_manager/templates/solvers.py index 34c9096d..dfbcfd67 100644 --- a/ovos_plugin_manager/templates/solvers.py +++ b/ovos_plugin_manager/templates/solvers.py @@ -5,11 +5,17 @@ from json_database import JsonStorageXDG from ovos_utils.lang import standardize_lang_tag -from ovos_utils.log import LOG, log_deprecation +from ovos_utils.log import LOG, log_deprecation, deprecated from ovos_utils.xdg_utils import xdg_cache_home from ovos_plugin_manager.templates.language import LanguageTranslator, LanguageDetector from ovos_plugin_manager.thirdparty.solvers import AbstractSolver +from ovos_plugin_manager.version import VERSION_MAJOR + +log_deprecation("ovos_plugin_manager.templates.solvers has been deprecated and will be removed in the next major release.\n" + "Please migrate your code to use AbstractAgentEngine.\n" + "The new classes live in ovos_plugin_manager.templates.agents", + f"{VERSION_MAJOR + 1}.0.0") def auto_translate(translate_keys: List[str], translate_str_args=True): @@ -133,6 +139,9 @@ def __init__(self, config: Optional[Dict] = None, enable_cache (bool): Flag to enable caching. internal_lang (Optional[str]): Internal language code. Defaults to None. """ + log_deprecation("QuestionSolver has been deprecated and will be removed in the next major release. " + "Please migrate your code to use ChatEngine / RetrievalEngine", + f"{VERSION_MAJOR + 1}.0.0") super().__init__(config, translator, detector, priority, enable_tx, enable_cache, internal_lang, *args, **kwargs) @@ -438,6 +447,9 @@ def __init__(self, config=None, enable_tx: bool = False, enable_cache: bool = False, *args, **kwargs): + log_deprecation("CorpusSolver has been deprecated and will be removed in the next major release. 
" + "Please migrate your code to use DocumentIndexerEngine/QAIndexerEngine", + f"{VERSION_MAJOR + 1}.0.0") super().__init__(config, translator, detector, priority, enable_tx, enable_cache, *args, **kwargs) @@ -458,7 +470,7 @@ def retrieve_from_corpus(self, query: str, k: int = 3, lang: Optional[str] = Non res = [] for doc, score in self.query(query, lang, k=k): # this log can be very spammy, only enable for debug during dev - #LOG.debug(f"Rank {len(res) + 1} (score: {score}): {doc}") + # LOG.debug(f"Rank {len(res) + 1} (score: {score}): {doc}") if self.config.get("min_conf"): if score >= self.config["min_conf"]: res.append((score, doc)) @@ -509,6 +521,12 @@ class TldrSolver(AbstractSolver): handling automatic translation as needed. """ + def __init__(self, *args, **kwargs): + log_deprecation("TldrSolver has been deprecated and will be removed in the next major release. " + "Please migrate your code to use SummarizerEngine", + f"{VERSION_MAJOR + 1}.0.0") + super().__init__(*args, **kwargs) + @abc.abstractmethod def get_tldr(self, document: str, lang: Optional[str] = None) -> str: @@ -547,6 +565,12 @@ class EvidenceSolver(AbstractSolver): handling automatic translation as needed. """ + def __init__(self, *args, **kwargs): + log_deprecation("EvidenceSolver has been deprecated and will be removed in the next major release. " + "Please migrate your code to use ExtractiveQAEngine", + f"{VERSION_MAJOR + 1}.0.0") + super().__init__(*args, **kwargs) + @abc.abstractmethod def get_best_passage(self, evidence: str, question: str, lang: Optional[str] = None) -> str: @@ -588,6 +612,12 @@ class MultipleChoiceSolver(AbstractSolver): handling automatic translation as needed. """ + def __init__(self, *args, **kwargs): + log_deprecation("MultipleChoiceSolver has been deprecated and will be removed in the next major release. 
" + "Please migrate your code to use ReRankerEngine", + f"{VERSION_MAJOR + 1}.0.0") + super().__init__(*args, **kwargs) + @abc.abstractmethod def rerank(self, query: str, options: List[str], lang: Optional[str] = None, @@ -629,6 +659,12 @@ class EntailmentSolver(AbstractSolver): """ select best answer from question + multiple choice handling automatic translation back and forth as needed""" + def __init__(self, *args, **kwargs): + log_deprecation("EntailmentSolver has been deprecated and will be removed in the next major release. " + "Please migrate your code to use NaturalLanguageInferenceEngine", + f"{VERSION_MAJOR + 1}.0.0") + super().__init__(*args, **kwargs) + @abc.abstractmethod def check_entailment(self, premise: str, hypothesis: str, lang: Optional[str] = None) -> bool: diff --git a/ovos_plugin_manager/text_transformers.py b/ovos_plugin_manager/text_transformers.py index 4f118092..e49d16d2 100644 --- a/ovos_plugin_manager/text_transformers.py +++ b/ovos_plugin_manager/text_transformers.py @@ -1,5 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, \ - PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_plugin_manager.templates.transformers import UtteranceTransformer diff --git a/ovos_plugin_manager/thirdparty/solvers.py b/ovos_plugin_manager/thirdparty/solvers.py index ab08ec81..7556b796 100644 --- a/ovos_plugin_manager/thirdparty/solvers.py +++ b/ovos_plugin_manager/thirdparty/solvers.py @@ -1,3 +1,4 @@ +# TODO - delete this file in next major release when ovos_plugin_manager.templates.solvers gets deprecated # NEON AI (TM) SOFTWARE, Software Development Kit & Application Framework # All trademark and other rights reserved by their respective owners # Copyright 2008-2022 Neongecko.com Inc. 
@@ -106,7 +107,7 @@ def sentence_split(text: str, max_sentences: int = 25) -> List[str]: for t in text.split("\n")])[:max_sentences] except Exception as e: LOG.exception(f"Error in sentence_split: {e}") - return [text] + return text.split("\n") @lru_cache(maxsize=128) def detect_language(self, text: str) -> str: diff --git a/ovos_plugin_manager/thirdparty/sr.py b/ovos_plugin_manager/thirdparty/sr.py new file mode 100644 index 00000000..66522039 --- /dev/null +++ b/ovos_plugin_manager/thirdparty/sr.py @@ -0,0 +1,466 @@ +""" +extracted from https://github.com/Uberi/speech_recognition +""" +import aifc +import audioop +import io +import os +import shutil +import subprocess +import wave + + +class srAudioData: + """ + Creates a new ``AudioData`` instance, which represents mono audio data. + + The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format. + + The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample. + + The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz). + """ + + def __init__(self, frame_data, sample_rate: int, sample_width: int): + """ + Initialize an AudioData instance holding mono PCM audio frames. + + Parameters: + frame_data (bytes-like): Raw PCM frame bytes for a single channel. + sample_rate (int): Sample rate in Hertz; must be greater than zero. + sample_width (int): Sample width in bytes (1 to 4); stored as an integer. + + Raises: + AssertionError: If sample_rate is not greater than zero or sample_width is not an integer between 1 and 4. 
+ """ + assert sample_rate > 0, "Sample rate must be a positive integer" + assert ( + sample_width % 1 == 0 and 1 <= sample_width <= 4 + ), "Sample width must be between 1 and 4 inclusive" + self.frame_data = frame_data + self.sample_rate = sample_rate + self.sample_width = int(sample_width) + + def get_segment(self, start_ms=None, end_ms=None) -> 'srAudioData': + """ + Return an AudioData instance trimmed to the specified millisecond interval. + + Parameters: + start_ms (float | int | None): Start time in milliseconds (inclusive). If None, start at the beginning. + end_ms (float | int | None): End time in milliseconds (exclusive). If None, end at the end of the audio. + + Returns: + AudioData: A new AudioData containing the audio frames from [start_ms, end_ms). + """ + assert ( + start_ms is None or start_ms >= 0 + ), "``start_ms`` must be a non-negative number" + assert end_ms is None or end_ms >= ( + 0 if start_ms is None else start_ms + ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``" + if start_ms is None: + start_byte = 0 + else: + start_byte = int( + (start_ms * self.sample_rate * self.sample_width) // 1000 + ) + if end_ms is None: + end_byte = len(self.frame_data) + else: + end_byte = int( + (end_ms * self.sample_rate * self.sample_width) // 1000 + ) + return srAudioData( + self.frame_data[start_byte:end_byte], + self.sample_rate, + self.sample_width, + ) + + def get_raw_data(self, convert_rate=None, convert_width=None) -> bytes: + """ + Get raw PCM frame bytes for this audio, optionally resampled or converted to a different sample width. + + Parameters: + convert_rate (int|None): If provided, resample audio to this sample rate in Hz. + convert_width (int|None): If provided, convert samples to this width in bytes (1–4). A value of 1 produces unsigned 8-bit samples. + + Returns: + bytes: Raw PCM frame data reflecting any requested rate or width conversions. 
+ """ + assert ( + convert_rate is None or convert_rate > 0 + ), "Sample rate to convert to must be a positive integer" + assert convert_width is None or ( + convert_width % 1 == 0 and 1 <= convert_width <= 4 + ), "Sample width to convert to must be between 1 and 4 inclusive" + + raw_data = self.frame_data + + # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples) + if self.sample_width == 1: + raw_data = audioop.bias( + raw_data, 1, -128 + ) # subtract 128 from every sample to make them act like signed samples + + # resample audio at the desired rate if specified + if convert_rate is not None and self.sample_rate != convert_rate: + raw_data, _ = audioop.ratecv( + raw_data, + self.sample_width, + 1, + self.sample_rate, + convert_rate, + None, + ) + + # convert samples to desired sample width if specified + if convert_width is not None and self.sample_width != convert_width: + if ( + convert_width == 3 + ): # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866) + raw_data = audioop.lin2lin( + raw_data, self.sample_width, 4 + ) # convert audio into 32-bit first, which is always supported + try: + audioop.bias( + b"", 3, 0 + ) # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) + except ( + audioop.error + ): # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) + raw_data = b"".join( + raw_data[i + 1: i + 4] + for i in range(0, len(raw_data), 4) + ) # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample + else: # 24-bit audio fully supported, we don't need to shim anything + raw_data = audioop.lin2lin( + raw_data, self.sample_width, convert_width + ) + else: + raw_data = audioop.lin2lin( + raw_data, self.sample_width, convert_width + ) + + # if the output is 8-bit audio with unsigned 
samples, convert the samples we've been treating as signed to unsigned again + if convert_width == 1: + raw_data = audioop.bias( + raw_data, 1, 128 + ) # add 128 to every sample to make them act like unsigned samples again + + return raw_data + + def get_wav_data(self, convert_rate=None, convert_width=None) -> bytes: + """ + Produce WAV-format file bytes containing this AudioData. + + Parameters: + convert_rate (int or None): If given, resample audio to this sample rate in Hz. + convert_width (int or None): If given, convert sample width to this number of bytes (1–4). + + Returns: + bytes: WAV file bytes (mono) containing the audio, with any requested sample-rate or sample-width conversion applied. + """ + raw_data = self.get_raw_data(convert_rate, convert_width) + sample_rate = ( + self.sample_rate if convert_rate is None else convert_rate + ) + sample_width = ( + self.sample_width if convert_width is None else convert_width + ) + + # generate the WAV file contents + with io.BytesIO() as wav_file: + wav_writer = wave.open(wav_file, "wb") + try: # note that we can't use context manager, since that was only added in Python 3.4 + wav_writer.setframerate(sample_rate) + wav_writer.setsampwidth(sample_width) + wav_writer.setnchannels(1) + wav_writer.writeframes(raw_data) + wav_data = wav_file.getvalue() + finally: # make sure resources are cleaned up + wav_writer.close() + return wav_data + + def get_aiff_data(self, convert_rate=None, convert_width=None) -> bytes: + """ + Produce AIFF-C file bytes for the audio data. + + Parameters: + convert_rate (int, optional): Target sample rate in Hz. If omitted, the instance's sample rate is used. + convert_width (int, optional): Target sample width in bytes (1–4). If omitted, the instance's sample width is used. + + Returns: + bytes: AIFF-C file contents containing the audio with the requested sample rate and sample width. 
+ """ + raw_data = self.get_raw_data(convert_rate, convert_width) + sample_rate = ( + self.sample_rate if convert_rate is None else convert_rate + ) + sample_width = ( + self.sample_width if convert_width is None else convert_width + ) + + # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian + if hasattr( + audioop, "byteswap" + ): # ``audioop.byteswap`` was only added in Python 3.4 + raw_data = audioop.byteswap(raw_data, sample_width) + else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback + raw_data = raw_data[sample_width - 1:: -1] + b"".join( + raw_data[i + sample_width: i: -1] + for i in range(sample_width - 1, len(raw_data), sample_width) + ) + + # generate the AIFF-C file contents + with io.BytesIO() as aiff_file: + aiff_writer = aifc.open(aiff_file, "wb") + try: # note that we can't use context manager, since that was only added in Python 3.4 + aiff_writer.setframerate(sample_rate) + aiff_writer.setsampwidth(sample_width) + aiff_writer.setnchannels(1) + aiff_writer.writeframes(raw_data) + aiff_data = aiff_file.getvalue() + finally: # make sure resources are cleaned up + aiff_writer.close() + return aiff_data + + def get_flac_data(self, convert_rate=None, convert_width=None) -> bytes: + """ + Return FLAC-encoded bytes for this AudioData. + + If `convert_rate` is provided and differs from the instance sample rate, the audio is resampled to `convert_rate` Hz. If `convert_width` is provided, the audio samples are converted to that many bytes per sample; `convert_width` must be 1, 2, or 3 when given. If the source is wider than 3 bytes and `convert_width` is not specified, the output is converted to 3-byte (24-bit) samples because 32-bit FLAC is not supported. + + Returns: + flac_bytes (bytes): A byte string containing a valid FLAC file representing the (optionally converted) audio. 
+ """ + assert convert_width is None or ( + convert_width % 1 == 0 and 1 <= convert_width <= 3 + ), "Sample width to convert to must be between 1 and 3 inclusive" + + if ( + self.sample_width > 3 and convert_width is None + ): # resulting WAV data would be 32-bit, which is not convertible to FLAC using our encoder + convert_width = 3 # the largest supported sample width is 24-bit, so we'll limit the sample width to that + + # run the FLAC converter with the WAV data to get the FLAC data + wav_data = self.get_wav_data(convert_rate, convert_width) + flac_converter = get_flac_converter() + if ( + os.name == "nt" + ): # on Windows, specify that the process is to be started without showing a console window + startup_info = subprocess.STARTUPINFO() + startup_info.dwFlags |= ( + subprocess.STARTF_USESHOWWINDOW + ) # specify that the wShowWindow field of `startup_info` contains a value + startup_info.wShowWindow = ( + subprocess.SW_HIDE + ) # specify that the console window should be hidden + else: + startup_info = None # default startupinfo + process = subprocess.Popen( + [ + flac_converter, + "--stdout", + "--totally-silent", + # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output + "--best", # highest level of compression available + "-", # the input WAV file contents will be given in stdin + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + startupinfo=startup_info, + ) + flac_data, stderr = process.communicate(wav_data) + return flac_data + + +class srAudioFile: + """ + Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. In this extracted copy it is a standalone class, not a subclass of ``AudioSource``. + + If ``filename_or_fileobject`` is a string, then it is interpreted as a path to an audio file on the filesystem. Otherwise, ``filename_or_fileobject`` should be a file-like object such as ``io.BytesIO`` or similar.
+ + Note that functions that read from the audio (such as ``recognizer_instance.record`` or ``recognizer_instance.listen``) will move ahead in the stream. For example, if you execute ``recognizer_instance.record(audiofile_instance, duration=10)`` twice, the first time it will return the first 10 seconds of audio, and the second time it will return the 10 seconds of audio right after that. This is always reset to the beginning when entering an ``AudioFile`` context. + + WAV files must be in PCM/LPCM format; WAVE_FORMAT_EXTENSIBLE and compressed WAV are not supported and may result in undefined behaviour. + + Both AIFF and AIFF-C (compressed AIFF) formats are supported. + + FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour. + """ + + def __init__(self, filename_or_fileobject): + """ + Initialize an AudioFile wrapper for reading WAV/AIFF/FLAC audio sources. + + Parameters: + filename_or_fileobject (str or file-like): Path to an audio file or a readable file-like object. + + Raises: + AssertionError: If `filename_or_fileobject` is not a string path nor an object with a `read()` method. + """ + assert isinstance(filename_or_fileobject, (type(""), type(u""))) or hasattr(filename_or_fileobject, + "read"), "Given audio file must be a filename string or a file-like object" + self.filename_or_fileobject = filename_or_fileobject + self.stream = None + self.DURATION = None + + self.audio_reader = None + self.little_endian = False + self.SAMPLE_RATE = None + self.CHUNK = None + self.FRAME_COUNT = None + + def __enter__(self): + """ + Open the audio source and prepare it for reading, detecting format and configuring stream properties. + + Tries to interpret the provided filename or file-like object as WAV, AIFF/AIFF-C, or FLAC (decoded to AIFF). 
On success, configures the instance for streaming by setting SAMPLE_RATE, SAMPLE_WIDTH (may be adjusted to 4 when 24-bit samples must be handled as 32-bit internally), CHUNK, FRAME_COUNT, DURATION, little_endian flag, and stream (an AudioFileStream instance). Validates that the audio has 1 or 2 channels. If the source cannot be parsed as WAV, AIFF, or native FLAC, raises ValueError. + + @returns + self: the prepared AudioFile instance with an open audio_reader and ready-to-use stream + """ + assert self.stream is None, "This audio source is already inside a context manager" + try: + # attempt to read the file as WAV + self.audio_reader = wave.open(self.filename_or_fileobject, "rb") + self.little_endian = True # RIFF WAV is a little-endian format (most ``audioop`` operations assume that the frames are stored in little-endian form) + except (wave.Error, EOFError): + try: + # attempt to read the file as AIFF + self.audio_reader = aifc.open(self.filename_or_fileobject, "rb") + self.little_endian = False # AIFF is a big-endian format + except (aifc.Error, EOFError): + # attempt to read the file as FLAC + if hasattr(self.filename_or_fileobject, "read"): + flac_data = self.filename_or_fileobject.read() + else: + with open(self.filename_or_fileobject, "rb") as f: + flac_data = f.read() + + # run the FLAC converter with the FLAC data to get the AIFF data + flac_converter = get_flac_converter() + if os.name == "nt": # on Windows, specify that the process is to be started without showing a console window + startup_info = subprocess.STARTUPINFO() + startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW # specify that the wShowWindow field of `startup_info` contains a value + startup_info.wShowWindow = subprocess.SW_HIDE # specify that the console window should be hidden + else: + startup_info = None # default startupinfo + process = subprocess.Popen([ + flac_converter, + "--stdout", "--totally-silent", + # put the resulting AIFF file in stdout, and make sure it's not mixed 
with any program output + "--decode", "--force-aiff-format", # decode the FLAC file into an AIFF file + "-", # the input FLAC file contents will be given in stdin + ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info) + aiff_data, _ = process.communicate(flac_data) + aiff_file = io.BytesIO(aiff_data) + try: + self.audio_reader = aifc.open(aiff_file, "rb") + except (aifc.Error, EOFError): + raise ValueError( + "Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format") + self.little_endian = False # AIFF is a big-endian format + assert 1 <= self.audio_reader.getnchannels() <= 2, "Audio must be mono or stereo" + self.SAMPLE_WIDTH = self.audio_reader.getsampwidth() + + # 24-bit audio needs some special handling for old Python versions (workaround for https://bugs.python.org/issue12866) + samples_24_bit_pretending_to_be_32_bit = False + if self.SAMPLE_WIDTH == 3: # 24-bit audio + try: + audioop.bias(b"", self.SAMPLE_WIDTH, + 0) # test whether this sample width is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) + except audioop.error: # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) + samples_24_bit_pretending_to_be_32_bit = True # while the ``AudioFile`` instance will outwardly appear to be 32-bit, it will actually internally be 24-bit + self.SAMPLE_WIDTH = 4 # the ``AudioFile`` instance should present itself as a 32-bit stream now, since we'll be converting into 32-bit on the fly when reading + + self.SAMPLE_RATE = self.audio_reader.getframerate() + self.CHUNK = 4096 + self.FRAME_COUNT = self.audio_reader.getnframes() + self.DURATION = self.FRAME_COUNT / float(self.SAMPLE_RATE) + self.stream = srAudioFile.AudioFileStream(self.audio_reader, self.little_endian, + samples_24_bit_pretending_to_be_32_bit) + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Close and 
clean up the AudioFile context, releasing any resources opened by this instance. + + If the original source was a filename (not a file-like object), closes the underlying audio reader. Resets the internal stream and duration state. + """ + if not hasattr(self.filename_or_fileobject, "read"): # only close the file if it was opened by this class in the first place (if the file was originally given as a path) + self.audio_reader.close() + self.stream = None + self.DURATION = None + + class AudioFileStream: + def __init__(self, audio_reader, little_endian, samples_24_bit_pretending_to_be_32_bit): + """ + Initialize the AudioFileStream with an underlying audio reader and format flags. + + Parameters: + audio_reader: A file-like audio reader (e.g., a wave.Wave_read or aifc.Aifc_read) that provides a readframes-like interface. + little_endian (bool): True when the source audio frames are little-endian; False when frames are big-endian and must be byte-swapped before processing. + samples_24_bit_pretending_to_be_32_bit (bool): True when the source is 24-bit audio that must be presented as 32-bit (a compatibility mode); the stream then pads each 24-bit sample to 32 bits on read. + """ + self.audio_reader = audio_reader # an audio file object (e.g., a `wave.Wave_read` instance) + self.little_endian = little_endian # whether the audio data is little-endian (when working with big-endian things, we'll have to convert it to little-endian before we process it) + self.samples_24_bit_pretending_to_be_32_bit = samples_24_bit_pretending_to_be_32_bit # this is true if the audio is 24-bit audio, but 24-bit audio isn't supported, so we have to pretend that this is 32-bit audio and convert it on the fly + + def read(self, size=-1): + """ + Read up to `size` frames from the underlying audio reader and return mono, little-endian PCM bytes. + + This method: + - Reads `size` frames (or all frames if `size` is -1).
If the reader returns a non-bytes value, an empty bytes object is returned. + - Converts big-endian input to little-endian on the fly. + - If `samples_24_bit_pretending_to_be_32_bit` is set, expands 24-bit samples into 32-bit little-endian samples. + - Converts multi-channel input to mono by mixing channels equally. + + Parameters: + size (int): Number of frames to read from the underlying reader; -1 means "read all available frames". + + Returns: + bytes: PCM audio data containing mono, little-endian samples (may be 32-bit if 24-bit-to-32-bit expansion occurred). + """ + buffer = self.audio_reader.readframes(self.audio_reader.getnframes() if size == -1 else size) + if not isinstance(buffer, bytes): buffer = b"" # workaround for https://bugs.python.org/issue24608 + + sample_width = self.audio_reader.getsampwidth() + if not self.little_endian: # big endian format, convert to little endian on the fly + if hasattr(audioop, + "byteswap"): # ``audioop.byteswap`` was only added in Python 3.4 (incidentally, that also means that we don't need to worry about 24-bit audio being unsupported, since Python 3.4+ always has that functionality) + buffer = audioop.byteswap(buffer, sample_width) + else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback + buffer = buffer[sample_width - 1::-1] + b"".join( + buffer[i + sample_width:i:-1] for i in range(sample_width - 1, len(buffer), sample_width)) + + # workaround for https://bugs.python.org/issue12866 + if self.samples_24_bit_pretending_to_be_32_bit: # we need to convert samples from 24-bit to 32-bit before we can process them with ``audioop`` functions + buffer = b"".join(b"\x00" + buffer[i:i + sample_width] for i in range(0, len(buffer), + sample_width)) # since we're in little endian, we prepend a zero byte to each 24-bit sample to get a 32-bit sample + sample_width = 4 # make sure we treat the buffer as 32-bit audio now, after converting it from 24-bit audio + if
self.audio_reader.getnchannels() != 1: # stereo audio + buffer = audioop.tomono(buffer, sample_width, 1, 1) # convert stereo audio data to mono + return buffer + + +def get_flac_converter(): + """ + Locate the system FLAC encoder and return its absolute filesystem path. + + Returns: + flac_path (str): Absolute path to the `flac` executable. + + Raises: + OSError: If no `flac` converter can be found on PATH. + """ + flac_converter = shutil.which("flac") # check for installed version first + if flac_converter is None: # flac utility is not installed + raise OSError( + "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent" + ) + return flac_converter + + + diff --git a/ovos_plugin_manager/tokenization.py b/ovos_plugin_manager/tokenization.py index 2ce8f1ff..97d1cada 100644 --- a/ovos_plugin_manager/tokenization.py +++ b/ovos_plugin_manager/tokenization.py @@ -1,5 +1,4 @@ -from ovos_plugin_manager.utils import normalize_lang, \ - PluginTypes, PluginConfigTypes +from ovos_plugin_manager.utils import PluginTypes, PluginConfigTypes from ovos_config import Configuration from ovos_utils.log import LOG from ovos_plugin_manager.templates.tokenization import Tokenizer diff --git a/ovos_plugin_manager/utils/__init__.py b/ovos_plugin_manager/utils/__init__.py index 6a373cff..69f16352 100644 --- a/ovos_plugin_manager/utils/__init__.py +++ b/ovos_plugin_manager/utils/__init__.py @@ -14,7 +14,6 @@ from collections import deque import time -import warnings from enum import Enum from ovos_utils.log import LOG, log_deprecation, deprecated from threading import Event, Lock @@ -78,13 +77,18 @@ class PluginTypes(str, Enum): INTENT_TRANSFORMER = "opm.transformer.intent" AGENT_MEMORY = "opm.agents.memory" AGENT_MULTIMODAL_ADAPTER = "opm.agents.multimodal_adapter" - QUESTION_SOLVER = "opm.solver.question" - CHAT_SOLVER = "opm.solver.chat" - TLDR_SOLVER = 
"opm.solver.summarization" - ENTAILMENT_SOLVER = "opm.solver.entailment" - MULTIPLE_CHOICE_SOLVER = "opm.solver.multiple_choice" - READING_COMPREHENSION_SOLVER = "opm.solver.reading_comprehension" - COREFERENCE_SOLVER = "opm.coreference" + AGENT_CHAT = "opm.agents.chat" + AGENT_CHAT_MULTIMODAL = "opm.agents.chat.multimodal" + AGENT_RETRIEVAL = "opm.agents.retrieval" + AGENT_DOC_RETRIEVAL = "opm.agents.retrieval.documents" + AGENT_QA_RETRIEVAL = "opm.agents.retrieval.qa" + AGENT_RERANKER = "opm.agents.reranker" + AGENT_SUMMARIZER = "opm.agents.summarizer" + AGENT_CHAT_SUMMARIZER = "opm.agents.summarizer.chat" + AGENT_EXTRACTIVE_QA = "opm.agents.extractive_qa" + AGENT_NLI = "opm.agents.nli" + AGENT_COREF = "opm.agents.coref" + AGENT_YES_NO = "opm.agents.yesno" KEYWORD_EXTRACTION = "opm.keywords" UTTERANCE_SEGMENTATION = "opm.segmentation" TOKENIZATION = "opm.tokenization" @@ -95,6 +99,14 @@ class PluginTypes(str, Enum): WEB_PLAYER = "opm.media.web" PERSONA = "opm.plugin.persona" # personas are a dict, they have no config because they ARE a config + # solver plugins are deprecated! 
+ QUESTION_SOLVER = "opm.solver.question" + CHAT_SOLVER = "opm.solver.chat" + TLDR_SOLVER = "opm.solver.summarization" + ENTAILMENT_SOLVER = "opm.solver.entailment" + MULTIPLE_CHOICE_SOLVER = "opm.solver.multiple_choice" + READING_COMPREHENSION_SOLVER = "opm.solver.reading_comprehension" + COREFERENCE_SOLVER = "opm.coreference" class PluginConfigTypes(str, Enum): TRIPLES = "opm.triples.config" @@ -127,13 +139,18 @@ class PluginConfigTypes(str, Enum): INTENT_TRANSFORMER = "opm.transformer.intent.config" AGENT_MEMORY = "opm.agents.memory.config" AGENT_MULTIMODAL_ADAPTER = "opm.agents.multimodal_adapter.config" - QUESTION_SOLVER = "opm.solver.config" - CHAT_SOLVER = "opm.solver.chat.config" - TLDR_SOLVER = "opm.solver.summarization.config" - ENTAILMENT_SOLVER = "opm.solver.entailment.config" - MULTIPLE_CHOICE_SOLVER = "opm.solver.multiple_choice.config" - READING_COMPREHENSION_SOLVER = "opm.solver.reading_comprehension.config" - COREFERENCE_SOLVER = "opm.coreference.config" + AGENT_CHAT = "opm.agents.chat.config" + AGENT_CHAT_MULTIMODAL = "opm.agents.chat.multimodal.config" + AGENT_RETRIEVAL = "opm.agents.retrieval.config" + AGENT_DOC_RETRIEVAL = "opm.agents.retrieval.documents.config" + AGENT_QA_RETRIEVAL = "opm.agents.retrieval.qa.config" + AGENT_RERANKER = "opm.agents.reranker.config" + AGENT_SUMMARIZER = "opm.agents.summarizer.config" + AGENT_CHAT_SUMMARIZER = "opm.agents.summarizer.chat.config" + AGENT_EXTRACTIVE_QA = "opm.agents.extractive_qa.config" + AGENT_NLI = "opm.agents.nli.config" + AGENT_COREF = "opm.agents.coref.config" + AGENT_YES_NO = "opm.agents.yesno.config" KEYWORD_EXTRACTION = "opm.keywords.config" UTTERANCE_SEGMENTATION = "opm.segmentation.config" TOKENIZATION = "opm.tokenization.config" @@ -143,6 +160,14 @@ class PluginConfigTypes(str, Enum): VIDEO_PLAYER = "opm.media.video.config" WEB_PLAYER = "opm.media.web.config" + # solver plugins are deprecated! 
+ QUESTION_SOLVER = "opm.solver.config" + CHAT_SOLVER = "opm.solver.chat.config" + TLDR_SOLVER = "opm.solver.summarization.config" + ENTAILMENT_SOLVER = "opm.solver.entailment.config" + MULTIPLE_CHOICE_SOLVER = "opm.solver.multiple_choice.config" + READING_COMPREHENSION_SOLVER = "opm.solver.reading_comprehension.config" + COREFERENCE_SOLVER = "opm.coreference.config" def find_plugins(plug_type: PluginTypes = None) -> dict: """ @@ -267,17 +292,6 @@ def load_plugin(plug_name: str, plug_type: Optional[PluginTypes] = None): return None -@deprecated("normalize_lang has been deprecated! update to 'from ovos_utils.lang import standardize_lang_tag'", "1.0.0") -def normalize_lang(lang): - warnings.warn( - "update to 'from ovos_utils.lang import standardize_lang_tag'", - DeprecationWarning, - stacklevel=2, - ) - from ovos_utils.lang import standardize_lang_tag - return standardize_lang_tag(lang) - - class ReadWriteStream: """ Class used to support writing binary audio data at any pace, diff --git a/ovos_plugin_manager/utils/audio.py b/ovos_plugin_manager/utils/audio.py index b86f76fd..b868e9dd 100644 --- a/ovos_plugin_manager/utils/audio.py +++ b/ovos_plugin_manager/utils/audio.py @@ -1,15 +1,12 @@ """ -extracted from https://github.com/Uberi/speech_recognition +original unmodified src from https://github.com/Uberi/speech_recognition shipped under ovos_plugin_manager.thirdparty.sr extended with methods to support conversion to/from numpy arrays """ -import aifc -import audioop import io -import os -import subprocess -import wave -import shutil + +from ovos_plugin_manager.thirdparty.sr import srAudioData, srAudioFile, get_flac_converter + try: import numpy as np @@ -22,7 +19,7 @@ Array = Any # Typing helper -class AudioData: +class AudioData(srAudioData): """ Creates a new ``AudioData`` instance, which represents mono audio data. @@ -33,37 +30,34 @@ class AudioData: The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz). 
""" - def __init__(self, frame_data, sample_rate: int, sample_width: int): - """ - Initialize an AudioData instance holding mono PCM audio frames. - - Parameters: - frame_data (bytes-like): Raw PCM frame bytes for a single channel. - sample_rate (int): Sample rate in Hertz; must be greater than zero. - sample_width (int): Sample width in bytes (1 to 4); stored as an integer. - - Raises: - AssertionError: If sample_rate is not greater than zero or sample_width is not an integer between 1 and 4. - """ - assert sample_rate > 0, "Sample rate must be a positive integer" - assert ( - sample_width % 1 == 0 and 1 <= sample_width <= 4 - ), "Sample width must be between 1 and 4 inclusive" - self.frame_data = frame_data - self.sample_rate = sample_rate - self.sample_width = int(sample_width) - @classmethod def from_file(cls, file_path: str) -> 'AudioData': """ Create an AudioData instance from the audio file at the given path. - + + Parameters: + file_path (str): Filesystem path to a WAV/AIFF/FLAC audio file. + Returns: audio_data (AudioData): AudioData containing the file's mono PCM frame data, sample rate, and sample width. """ with AudioFile(file_path) as source: return source.read() + def save(self, file_path: str, convert_rate=None, convert_width=None): + """ + Write the audio data to a WAV file at the given path. + + Optionally convert the sample rate or sample width before writing. + + Parameters: + file_path (str): Filesystem path to write the WAV file to. + convert_rate (int | None): Target sample rate in Hz to convert to, or `None` to keep the current rate. + convert_width (int | None): Target sample width in bytes (e.g., 1, 2, 3, 4), or `None` to keep the current width. 
+ """ + with open(file_path, "wb") as f: + f.write(self.get_wav_data(convert_rate, convert_width)) + @classmethod def from_array(cls, data: Array, sample_rate: int, sample_width: int) -> 'AudioData': """ @@ -139,238 +133,6 @@ def from_array(cls, data: Array, sample_rate: int, sample_width: int) -> 'AudioD frame_data = int_data_le.tobytes() return cls(frame_data, sample_rate, sample_width) - def get_segment(self, start_ms=None, end_ms=None) -> 'AudioData': - """ - Return an AudioData instance trimmed to the specified millisecond interval. - - Parameters: - start_ms (float | int | None): Start time in milliseconds (inclusive). If None, start at the beginning. - end_ms (float | int | None): End time in milliseconds (exclusive). If None, end at the end of the audio. - - Returns: - AudioData: A new AudioData containing the audio frames from [start_ms, end_ms). - """ - assert ( - start_ms is None or start_ms >= 0 - ), "``start_ms`` must be a non-negative number" - assert end_ms is None or end_ms >= ( - 0 if start_ms is None else start_ms - ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``" - if start_ms is None: - start_byte = 0 - else: - start_byte = int( - (start_ms * self.sample_rate * self.sample_width) // 1000 - ) - if end_ms is None: - end_byte = len(self.frame_data) - else: - end_byte = int( - (end_ms * self.sample_rate * self.sample_width) // 1000 - ) - return AudioData( - self.frame_data[start_byte:end_byte], - self.sample_rate, - self.sample_width, - ) - - def get_raw_data(self, convert_rate=None, convert_width=None) -> bytes: - """ - Get raw PCM frame bytes for this audio, optionally resampled or converted to a different sample width. - - Parameters: - convert_rate (int|None): If provided, resample audio to this sample rate in Hz. - convert_width (int|None): If provided, convert samples to this width in bytes (1–4). A value of 1 produces unsigned 8-bit samples. 
- - Returns: - bytes: Raw PCM frame data reflecting any requested rate or width conversions. - """ - assert ( - convert_rate is None or convert_rate > 0 - ), "Sample rate to convert to must be a positive integer" - assert convert_width is None or ( - convert_width % 1 == 0 and 1 <= convert_width <= 4 - ), "Sample width to convert to must be between 1 and 4 inclusive" - - raw_data = self.frame_data - - # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples) - if self.sample_width == 1: - raw_data = audioop.bias( - raw_data, 1, -128 - ) # subtract 128 from every sample to make them act like signed samples - - # resample audio at the desired rate if specified - if convert_rate is not None and self.sample_rate != convert_rate: - raw_data, _ = audioop.ratecv( - raw_data, - self.sample_width, - 1, - self.sample_rate, - convert_rate, - None, - ) - - # convert samples to desired sample width if specified - if convert_width is not None and self.sample_width != convert_width: - if ( - convert_width == 3 - ): # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866) - raw_data = audioop.lin2lin( - raw_data, self.sample_width, 4 - ) # convert audio into 32-bit first, which is always supported - try: - audioop.bias( - b"", 3, 0 - ) # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) - except ( - audioop.error - ): # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) - raw_data = b"".join( - raw_data[i + 1: i + 4] - for i in range(0, len(raw_data), 4) - ) # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample - else: # 24-bit audio fully supported, we don't need to shim anything - raw_data = audioop.lin2lin( - raw_data, self.sample_width, convert_width - ) - else: - raw_data = audioop.lin2lin( - 
raw_data, self.sample_width, convert_width - ) - - # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again - if convert_width == 1: - raw_data = audioop.bias( - raw_data, 1, 128 - ) # add 128 to every sample to make them act like unsigned samples again - - return raw_data - - def get_wav_data(self, convert_rate=None, convert_width=None) -> bytes: - """ - Produce WAV-format file bytes containing this AudioData. - - Parameters: - convert_rate (int or None): If given, resample audio to this sample rate in Hz. - convert_width (int or None): If given, convert sample width to this number of bytes (1–4). - - Returns: - bytes: WAV file bytes (mono) containing the audio, with any requested sample-rate or sample-width conversion applied. - """ - raw_data = self.get_raw_data(convert_rate, convert_width) - sample_rate = ( - self.sample_rate if convert_rate is None else convert_rate - ) - sample_width = ( - self.sample_width if convert_width is None else convert_width - ) - - # generate the WAV file contents - with io.BytesIO() as wav_file: - wav_writer = wave.open(wav_file, "wb") - try: # note that we can't use context manager, since that was only added in Python 3.4 - wav_writer.setframerate(sample_rate) - wav_writer.setsampwidth(sample_width) - wav_writer.setnchannels(1) - wav_writer.writeframes(raw_data) - wav_data = wav_file.getvalue() - finally: # make sure resources are cleaned up - wav_writer.close() - return wav_data - - def get_aiff_data(self, convert_rate=None, convert_width=None) -> bytes: - """ - Produce AIFF-C file bytes for the audio data. - - Parameters: - convert_rate (int, optional): Target sample rate in Hz. If omitted, the instance's sample rate is used. - convert_width (int, optional): Target sample width in bytes (1–4). If omitted, the instance's sample width is used. - - Returns: - bytes: AIFF-C file contents containing the audio with the requested sample rate and sample width. 
- """ - raw_data = self.get_raw_data(convert_rate, convert_width) - sample_rate = ( - self.sample_rate if convert_rate is None else convert_rate - ) - sample_width = ( - self.sample_width if convert_width is None else convert_width - ) - - # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian - if hasattr( - audioop, "byteswap" - ): # ``audioop.byteswap`` was only added in Python 3.4 - raw_data = audioop.byteswap(raw_data, sample_width) - else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback - raw_data = raw_data[sample_width - 1:: -1] + b"".join( - raw_data[i + sample_width: i: -1] - for i in range(sample_width - 1, len(raw_data), sample_width) - ) - - # generate the AIFF-C file contents - with io.BytesIO() as aiff_file: - aiff_writer = aifc.open(aiff_file, "wb") - try: # note that we can't use context manager, since that was only added in Python 3.4 - aiff_writer.setframerate(sample_rate) - aiff_writer.setsampwidth(sample_width) - aiff_writer.setnchannels(1) - aiff_writer.writeframes(raw_data) - aiff_data = aiff_file.getvalue() - finally: # make sure resources are cleaned up - aiff_writer.close() - return aiff_data - - def get_flac_data(self, convert_rate=None, convert_width=None) -> bytes: - """ - Return FLAC-encoded bytes for this AudioData. - - If `convert_rate` is provided and differs from the instance sample rate, the audio is resampled to `convert_rate` Hz. If `convert_width` is provided, the audio samples are converted to that many bytes per sample; `convert_width` must be 1, 2, or 3 when given. If the source is wider than 3 bytes and `convert_width` is not specified, the output is converted to 3-byte (24-bit) samples because 32-bit FLAC is not supported. - - Returns: - flac_bytes (bytes): A byte string containing a valid FLAC file representing the (optionally converted) audio. 
- """ - assert convert_width is None or ( - convert_width % 1 == 0 and 1 <= convert_width <= 3 - ), "Sample width to convert to must be between 1 and 3 inclusive" - - if ( - self.sample_width > 3 and convert_width is None - ): # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder - convert_width = 3 # the largest supported sample width is 24-bit, so we'll limit the sample width to that - - # run the FLAC converter with the WAV data to get the FLAC data - wav_data = self.get_wav_data(convert_rate, convert_width) - flac_converter = get_flac_converter() - if ( - os.name == "nt" - ): # on Windows, specify that the process is to be started without showing a console window - startup_info = subprocess.STARTUPINFO() - startup_info.dwFlags |= ( - subprocess.STARTF_USESHOWWINDOW - ) # specify that the wShowWindow field of `startup_info` contains a value - startup_info.wShowWindow = ( - subprocess.SW_HIDE - ) # specify that the console window should be hidden - else: - startup_info = None # default startupinfo - process = subprocess.Popen( - [ - flac_converter, - "--stdout", - "--totally-silent", - # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output - "--best", # highest level of compression available - "-", # the input FLAC file contents will be given in stdin - ], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - startupinfo=startup_info, - ) - flac_data, stderr = process.communicate(wav_data) - return flac_data - def get_np_int16(self, convert_rate=None) -> Array: """ Produce a NumPy int16 array containing the audio samples. @@ -408,8 +170,27 @@ def get_np_float32(self, normalize=True, convert_rate=None) -> Array: return audio_as_np_float32 / max_int16 return audio_as_np_float32 + def get_segment(self, start_ms=None, end_ms=None) -> 'AudioData': + """ + Return an AudioData instance trimmed to the specified millisecond interval. 
-class AudioFile: + Parameters: + start_ms (float | int | None): Start time in milliseconds (inclusive). If None, start at the beginning. + end_ms (float | int | None): End time in milliseconds (exclusive). If None, end at the end of the audio. + + Returns: + AudioData: A new AudioData containing the audio frames from [start_ms, end_ms). + """ + data: srAudioData = super().get_segment(start_ms, end_ms) + # convert to patched AudioData class + return AudioData( + data.frame_data, + data.sample_rate, + data.sample_width, + ) + + +class AudioFile(srAudioFile): """ Creates a new ``AudioFile`` instance given a WAV/AIFF/FLAC audio file ``filename_or_fileobject``. Subclass of ``AudioSource``. @@ -424,111 +205,6 @@ class AudioFile: FLAC files must be in native FLAC format; OGG-FLAC is not supported and may result in undefined behaviour. """ - def __init__(self, filename_or_fileobject): - """ - Initialize an AudioFile wrapper for reading WAV/AIFF/FLAC audio sources. - - Parameters: - filename_or_fileobject (str or file-like): Path to an audio file or a readable file-like object. - - Raises: - AssertionError: If `filename_or_fileobject` is not a string path nor an object with a `read()` method. - """ - assert isinstance(filename_or_fileobject, (type(""), type(u""))) or hasattr(filename_or_fileobject, - "read"), "Given audio file must be a filename string or a file-like object" - self.filename_or_fileobject = filename_or_fileobject - self.stream = None - self.DURATION = None - - self.audio_reader = None - self.little_endian = False - self.SAMPLE_RATE = None - self.CHUNK = None - self.FRAME_COUNT = None - - def __enter__(self): - """ - Open the audio source and prepare it for reading, detecting format and configuring stream properties. - - Tries to interpret the provided filename or file-like object as WAV, AIFF/AIFF-C, or FLAC (decoded to AIFF). 
On success, configures the instance for streaming by setting SAMPLE_RATE, SAMPLE_WIDTH (may be adjusted to 4 when 24-bit samples must be handled as 32-bit internally), CHUNK, FRAME_COUNT, DURATION, little_endian flag, and stream (an AudioFileStream instance). Validates that the audio has 1 or 2 channels. If the source cannot be parsed as WAV, AIFF, or native FLAC, raises ValueError. - - @returns - self: the prepared AudioFile instance with an open audio_reader and ready-to-use stream - """ - assert self.stream is None, "This audio source is already inside a context manager" - try: - # attempt to read the file as WAV - self.audio_reader = wave.open(self.filename_or_fileobject, "rb") - self.little_endian = True # RIFF WAV is a little-endian format (most ``audioop`` operations assume that the frames are stored in little-endian form) - except (wave.Error, EOFError): - try: - # attempt to read the file as AIFF - self.audio_reader = aifc.open(self.filename_or_fileobject, "rb") - self.little_endian = False # AIFF is a big-endian format - except (aifc.Error, EOFError): - # attempt to read the file as FLAC - if hasattr(self.filename_or_fileobject, "read"): - flac_data = self.filename_or_fileobject.read() - else: - with open(self.filename_or_fileobject, "rb") as f: - flac_data = f.read() - - # run the FLAC converter with the FLAC data to get the AIFF data - flac_converter = get_flac_converter() - if os.name == "nt": # on Windows, specify that the process is to be started without showing a console window - startup_info = subprocess.STARTUPINFO() - startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW # specify that the wShowWindow field of `startup_info` contains a value - startup_info.wShowWindow = subprocess.SW_HIDE # specify that the console window should be hidden - else: - startup_info = None # default startupinfo - process = subprocess.Popen([ - flac_converter, - "--stdout", "--totally-silent", - # put the resulting AIFF file in stdout, and make sure it's not mixed 
with any program output - "--decode", "--force-aiff-format", # decode the FLAC file into an AIFF file - "-", # the input FLAC file contents will be given in stdin - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, startupinfo=startup_info) - aiff_data, _ = process.communicate(flac_data) - aiff_file = io.BytesIO(aiff_data) - try: - self.audio_reader = aifc.open(aiff_file, "rb") - except (aifc.Error, EOFError): - raise ValueError( - "Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format") - self.little_endian = False # AIFF is a big-endian format - assert 1 <= self.audio_reader.getnchannels() <= 2, "Audio must be mono or stereo" - self.SAMPLE_WIDTH = self.audio_reader.getsampwidth() - - # 24-bit audio needs some special handling for old Python versions (workaround for https://bugs.python.org/issue12866) - samples_24_bit_pretending_to_be_32_bit = False - if self.SAMPLE_WIDTH == 3: # 24-bit audio - try: - audioop.bias(b"", self.SAMPLE_WIDTH, - 0) # test whether this sample width is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do) - except audioop.error: # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less) - samples_24_bit_pretending_to_be_32_bit = True # while the ``AudioFile`` instance will outwardly appear to be 32-bit, it will actually internally be 24-bit - self.SAMPLE_WIDTH = 4 # the ``AudioFile`` instance should present itself as a 32-bit stream now, since we'll be converting into 32-bit on the fly when reading - - self.SAMPLE_RATE = self.audio_reader.getframerate() - self.CHUNK = 4096 - self.FRAME_COUNT = self.audio_reader.getnframes() - self.DURATION = self.FRAME_COUNT / float(self.SAMPLE_RATE) - self.stream = AudioFile.AudioFileStream(self.audio_reader, self.little_endian, - samples_24_bit_pretending_to_be_32_bit) - return self - - def __exit__(self, exc_type, exc_value, traceback): - """ - Close and 
clean up the AudioFile context, releasing any resources opened by this instance. - - If the original source was a filename (not a file-like object), closes the underlying audio reader. Resets the internal stream and duration state. - """ - if not hasattr(self.filename_or_fileobject, - "read"): # only close the file if it was opened by this class in the first place (if the file was originally given as a path) - self.audio_reader.close() - self.stream = None - self.DURATION = None - def read(self, duration=None, offset=None) -> AudioData: """ Read up to `duration` seconds from the opened audio stream, beginning at `offset` seconds, and return an AudioData containing the captured PCM frames. @@ -566,79 +242,13 @@ def read(self, duration=None, offset=None) -> AudioData: frames.close() return AudioData(frame_data, self.SAMPLE_RATE, self.SAMPLE_WIDTH) - class AudioFileStream: - def __init__(self, audio_reader, little_endian, samples_24_bit_pretending_to_be_32_bit): - """ - Initialize the AudioFileStream with an underlying audio reader and format flags. - - Parameters: - audio_reader: A file-like audio reader (e.g., a wave.Wave_read or aifc.Aifc_read) that provides a readframes-like interface. - little_endian (bool): True when the source audio frames are little-endian; False when frames are big-endian and must be byte-swapped before processing. - samples_24_bit_pretending_to_be_32_bit (bool): True when the source uses 24-bit samples represented/stored as 32-bit frames (a compatibility mode); the stream will convert these to actual 24-bit data on read. 
- """ - self.audio_reader = audio_reader # an audio file object (e.g., a `wave.Wave_read` instance) - self.little_endian = little_endian # whether the audio data is little-endian (when working with big-endian things, we'll have to convert it to little-endian before we process it) - self.samples_24_bit_pretending_to_be_32_bit = samples_24_bit_pretending_to_be_32_bit # this is true if the audio is 24-bit audio, but 24-bit audio isn't supported, so we have to pretend that this is 32-bit audio and convert it on the fly - - def read(self, size=-1): - """ - Read up to `size` frames from the underlying audio reader and return mono, little-endian PCM bytes. - - This method: - - Reads `size` frames (or all frames if `size` is -1). If the reader returns a non-bytes value, an empty bytes object is returned. - - Converts big-endian input to little-endian on the fly. - - If `samples_24_bit_pretending_to_be_32_bit` is set, expands 24-bit samples into 32-bit little-endian samples. - - Converts multi-channel input to mono by mixing channels equally. - - Parameters: - size (int): Number of frames to read from the underlying reader; -1 means "read all available frames". - - Returns: - bytes: PCM audio data containing mono, little-endian samples (may be 32-bit if 24-bit-to-32-bit expansion occurred). 
- """ - buffer = self.audio_reader.readframes(self.audio_reader.getnframes() if size == -1 else size) - if not isinstance(buffer, bytes): buffer = b"" # workaround for https://bugs.python.org/issue24608 - - sample_width = self.audio_reader.getsampwidth() - if not self.little_endian: # big endian format, convert to little endian on the fly - if hasattr(audioop, - "byteswap"): # ``audioop.byteswap`` was only added in Python 3.4 (incidentally, that also means that we don't need to worry about 24-bit audio being unsupported, since Python 3.4+ always has that functionality) - buffer = audioop.byteswap(buffer, sample_width) - else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback - buffer = buffer[sample_width - 1::-1] + b"".join( - buffer[i + sample_width:i:-1] for i in range(sample_width - 1, len(buffer), sample_width)) - - # workaround for https://bugs.python.org/issue12866 - if self.samples_24_bit_pretending_to_be_32_bit: # we need to convert samples from 24-bit to 32-bit before we can process them with ``audioop`` functions - buffer = b"".join(b"\x00" + buffer[i:i + sample_width] for i in range(0, len(buffer), - sample_width)) # since we're in little endian, we prepend a zero byte to each 24-bit sample to get a 32-bit sample - sample_width = 4 # make sure we thread the buffer as 32-bit audio now, after converting it from 24-bit audio - if self.audio_reader.getnchannels() != 1: # stereo audio - buffer = audioop.tomono(buffer, sample_width, 1, 1) # convert stereo audio data to mono - return buffer - - -def get_flac_converter(): - """ - Locate the system FLAC encoder and return its absolute filesystem path. - - Returns: - flac_path (str): Absolute path to the `flac` executable. - - Raises: - OSError: If no `flac` converter can be found on PATH. 
- """ - flac_converter = shutil.which("flac") # check for installed version first - if flac_converter is None: # flac utility is not installed - raise OSError( - "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent" - ) - return flac_converter - # patch for type checks in plugins to pass +# TODO - remove in next major version try: import speech_recognition + speech_recognition.AudioData = AudioData + speech_recognition.AudioFile = AudioFile except ImportError: pass \ No newline at end of file diff --git a/ovos_plugin_manager/version.py b/ovos_plugin_manager/version.py index c2390d9c..a120cdbd 100644 --- a/ovos_plugin_manager/version.py +++ b/ovos_plugin_manager/version.py @@ -1,6 +1,6 @@ # START_VERSION_BLOCK VERSION_MAJOR = 2 VERSION_MINOR = 2 -VERSION_BUILD = 0 -VERSION_ALPHA = 0 +VERSION_BUILD = 3 +VERSION_ALPHA = 1 # END_VERSION_BLOCK