diff --git a/README.md b/README.md index f530b2e..727d4f4 100644 --- a/README.md +++ b/README.md @@ -62,11 +62,11 @@ Speech Center allows to easily convert an audio resource into its associated tex Speech recognition CLI will retrieve an audio file stored locally and send it through streaming to Speech Center's GRPC interface. To run the client please run the `recognizer_stream.py` from the `cli-client` directory. An example transcription request is shown below, all the flags are then detailed. -**Example for streaming** +**Example for CLI** ```shell cd cli-client/ -python3 recognizer_stream.py --audio-file file.wav --topic GENERIC --language en-US --host us.speechcenter.verbio.com --token token.file --asr-version V1 --label project1 +python3 recognizer_stream.py --audio-file file.wav --topic GENERIC --language en-US --host us.speechcenter.verbio.com --token token.file --asr-version V2 --label project1 --formatting ``` You can use the `--help`command for more options. @@ -92,10 +92,21 @@ This code will generate the following terminal output on success: "duration": 4.460000 [2023-04-04 12:28:35,412][INFO]:Stream inactivity detected, closing stream... [2023-04-04 12:28:35,413][INFO]:Recognition finished +``` + +**Example for CLI with GUI** +Alternatively, you can use the `--gui` parameter to activate the GUI mode: +```shell +cd cli-client/ +python3 recognizer_stream.py --audio-file file.wav --topic GENERIC --language en-US --host us.speechcenter.verbio.com --token token.file --asr-version V2 --label project1 --formatting --gui ``` +This code will generate the following terminal output on success: + +![Real-Time Speech-to-Text GUI](img/STT_gui.png) + You can also run: ```shell python3 recognizer_stream.py --help @@ -242,6 +253,14 @@ This option allows for a one word argument to be sent so that the speech transcr - **Argument must be the same each time for the same project. &#x0D;
If there is a typo another project will be created.** - **There is no limit on the amount of projects that can be created.** +#### GUI Mode + +``` +--gui +``` + +This option enables GUI mode. Check out the **Example for CLI with GUI** above to understand the differences between the standard and the GUI modes. + ## Text-To-Speech diff --git a/cli-client/helpers/common.py b/cli-client/helpers/common.py index 4ab1ba8..23a5b8e 100644 --- a/cli-client/helpers/common.py +++ b/cli-client/helpers/common.py @@ -98,7 +98,7 @@ def parse_tts_command_line() -> SynthesizerOptions: def retrieve_token(options: SynthesizerOptions) -> str: - logging.info("Reading Speech Center JWT token from %s ...", options.token_file) + logging.info("Reading Speech Center JWT token from %s...", options.token_file) if options.client_id: return SpeechCenterCredentials.get_refreshed_token(options.client_id, options.client_secret, options.token_file) else: @@ -130,6 +130,7 @@ def __init__(self): self.inactivity_timeout = False self.asr_version = None self.label = None + self.gui = False self.client_id = None self.client_secret = None @@ -185,6 +186,7 @@ def parse_csr_commandline() -> RecognizerOptions: required=False, default=5.0) parser.add_argument('--asr-version', choices=['V1', 'V2'], help='Selectable asr version', required=True) parser.add_argument('--label', help='Label for the request', required=False, default="") + parser.add_argument('--gui', help='Enables GUI mode', required=False, default=False, action='store_true') credential_group = parser.add_argument_group( 'credentials', @@ -210,6 +212,7 @@ def parse_csr_commandline() -> RecognizerOptions: options.inactivity_timeout = float(args.inactivity_timeout) options.asr_version = args.asr_version options.label = args.label + options.gui = args.gui if args.inline_grammar: options.grammar = VerbioGrammar(VerbioGrammar.INLINE, args.inline_grammar) diff --git a/cli-client/helpers/csr_client.py b/cli-client/helpers/csr_client.py index 
b52d531..6bd6fb6 100644 --- a/cli-client/helpers/csr_client.py +++ b/cli-client/helpers/csr_client.py @@ -2,6 +2,7 @@ import pause import logging import datetime +from typing import Optional sys.path.insert(1, '../proto/generated') import threading @@ -9,6 +10,7 @@ from concurrent.futures import ThreadPoolExecutor import recognition_streaming_request_pb2 +from helpers.csr_gui import CsrGUI from helpers.common import split_audio from helpers.audio_importer import AudioImporter from helpers.common import VerbioGrammar, RecognizerOptions @@ -16,7 +18,7 @@ class CSRClient: - def __init__(self, executor: ThreadPoolExecutor, stub, options: RecognizerOptions, audio_resource: AudioImporter, token: str): + def __init__(self, executor: ThreadPoolExecutor, stub, options: RecognizerOptions, audio_resource: AudioImporter, token: str, gui: Optional[CsrGUI]): self._executor = executor self._stub = stub self._resources = audio_resource @@ -34,6 +36,7 @@ def __init__(self, executor: ThreadPoolExecutor, stub, options: RecognizerOption self._diarization = options.diarization self._hide_partial_results = options.hide_partial_results self._label = options.label + self._gui = gui self._messages = None def _close_stream_by_inactivity(self): @@ -45,6 +48,19 @@ def _start_inactivity_timer(self, inactivity_timeout: float): self._inactivity_timer.start() def _print_result(self, response): + if self._gui: + self._print_result_in_gui(response) + else: + self._print_result_in_logs(response) + + def _print_result_in_gui(self, response): + transcript = response.result.alternatives[0].transcript + if response.result.is_final: + self._gui.add_final_transcript(transcript) + elif not self._hide_partial_results: + self._gui.add_partial_transcript(transcript) + + def _print_result_in_logs(self, response): if response.result.is_final: transcript = "Final result:\n" \ f'\t"transcript": "{response.result.alternatives[0].transcript}",\n' \ @@ -96,6 +112,8 @@ def wait_for_response(self) -> bool: return True 
def __message_iterator(self): + if self._gui: + self._gui.start_progress_bar_task(total_audio_samples=self._resources.n_samples) for message_type, message in self._messages: logging.info("Sending streaming message " + message_type) get_up_time = datetime.datetime.now() @@ -103,6 +121,8 @@ def __message_iterator(self): sent_audio_samples = len(message.audio) // self._resources.sample_width sent_audio_duration = sent_audio_samples / self._resources.sample_rate get_up_time += datetime.timedelta(seconds=sent_audio_duration) + if self._gui: + self._gui.advance_progress_bar(advance=sent_audio_samples) yield message pause.until(get_up_time) logging.info("All audio messages sent") diff --git a/cli-client/helpers/csr_gui.py b/cli-client/helpers/csr_gui.py new file mode 100644 index 0000000..300c536 --- /dev/null +++ b/cli-client/helpers/csr_gui.py @@ -0,0 +1,128 @@ +import os +from rich.live import Live +from rich.panel import Panel +from rich.layout import Layout +from rich.padding import Padding +from rich.console import Console, RenderableType +from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn + +GUI_NUM_REFRESH_PER_SECOND = 4 +GUI_PARTIAL_RESULT_TAG = "<partial>" +GUI_FINAL_RESULT_TAG = "<final>" +GUI_PROGRESS_BAR_TASK_DESCRIPTION_RUNNING = "Sending streaming audio messages" +GUI_PROGRESS_BAR_TASK_DESCRIPTION_FINISHED = "[bold green]All audio sent successfully!" &#x0D;
+ + +class CsrGUI: + def __init__(self): + self.__init_gui_layout() + self.__set_logging_panel() + self.__set_transcript_panel() + self.__set_progress_bar() + + def __init_gui_layout(self): + self._layout = Layout(name="root") + self._layout.split( + Layout(name="status", ratio=3), + Layout(name="transcript", minimum_size=10), + ) + self._layout["status"].split_column( + Layout(name="progress", size=3), + Layout(name="logging"), + ) + + def __set_logging_panel(self) -> RenderableType: + self._logging_console = LogPanel() + logging_panel = Panel(self._logging_console, title="Logs", border_style="green") + self._layout["logging"].update(logging_panel) + + def __set_transcript_panel(self) -> RenderableType: + self._transcript_console = TranscriptPanel() + transcript_panel = Panel(self._transcript_console, title="Transcript", border_style="blue", padding=(2, 2)) + self._layout["transcript"].update(transcript_panel) + + def __set_progress_bar(self) -> RenderableType: + self._progress_bar = Progress( + TextColumn("{task.description}", justify="right"), + BarColumn(bar_width=None), + TextColumn("{task.completed} / {task.total} samples"), + "[progress.percentage]({task.percentage:>3.1f}%)", + TimeRemainingColumn(), + ) + progress_panel = Padding(self._progress_bar, pad=(1, 1)) + self._layout["progress"].update(progress_panel) + + def start(self): + self._screen = Live(self._layout, refresh_per_second=GUI_NUM_REFRESH_PER_SECOND, screen=False) + self._screen.start(refresh=True) + + def stop(self): + self._screen.stop() + + def start_progress_bar_task(self, total_audio_samples: int): + self._streaming_task = self._progress_bar.add_task( + GUI_PROGRESS_BAR_TASK_DESCRIPTION_RUNNING, total=total_audio_samples + ) + + def advance_progress_bar(self, advance: int): + if not self._progress_bar.finished: + self._progress_bar.update(self._streaming_task, advance=advance) + if self._progress_bar.finished: + self._progress_bar.update(self._streaming_task, 
description=GUI_PROGRESS_BAR_TASK_DESCRIPTION_FINISHED) + self._screen.refresh() + + def add_partial_transcript(self, transcript: str): + self._transcript_console.print(GUI_PARTIAL_RESULT_TAG, end='') + self._transcript_console.print(transcript, end='') + self._screen.refresh() + + def add_final_transcript(self, transcript: str): + self._transcript_console.print(GUI_FINAL_RESULT_TAG, end='') + self._transcript_console.print(transcript) + self._screen.refresh() + + +class LogPanel(Console): + def __init__(self, *args, **kwargs): + console_file = open(os.devnull, 'w') + super().__init__(record=True, file=console_file, *args, **kwargs) + self.stored_logs = [] + + def __rich_console__(self, console, options): + logs = self.export_text(clear=True) + clean_logs = [log for log in logs.split('\n') if log] + total_logs = self.stored_logs + clean_logs + self.stored_logs = total_logs[-options.height:] + for line in self.stored_logs: + yield line + + +class TranscriptPanel(Console): + def __init__(self, *args, **kwargs): + console_file = open(os.devnull, 'w') + super().__init__(record=True, file=console_file, *args, **kwargs) + self.stored_transcripts = [] + self.in_partial_line = False + self.last_partial_line = "" + + def __rich_console__(self, console, options): + temporary_transcripts = [] + transcripts = self.export_text(clear=True) + + for line in transcripts.split('\n'): + if GUI_FINAL_RESULT_TAG in line: + final_line = line.split(GUI_FINAL_RESULT_TAG)[-1] + self.stored_transcripts.append(final_line) + self.last_partial_line = "" + self.in_partial_line = False + elif GUI_PARTIAL_RESULT_TAG in line: + temporary_line = line.split(GUI_PARTIAL_RESULT_TAG)[-1] + temporary_transcripts.append(temporary_line) + self.last_partial_line = temporary_line + self.in_partial_line = True + + if self.in_partial_line and len(temporary_transcripts) == 0: + temporary_transcripts = [self.last_partial_line] + + total_transcripts = self.stored_transcripts + temporary_transcripts + yield 
"".join(total_transcripts)[-(options.height * options.max_width):] \ No newline at end of file diff --git a/cli-client/recognizer_stream.py b/cli-client/recognizer_stream.py index 24025dc..4d27fa3 100755 --- a/cli-client/recognizer_stream.py +++ b/cli-client/recognizer_stream.py @@ -1,41 +1,64 @@ #!/usr/bin/env python3 import sys import logging +from typing import Optional +from rich.logging import RichHandler sys.path.insert(1, '../proto/generated') import grpc import recognition_pb2_grpc from concurrent.futures import ThreadPoolExecutor +from helpers.csr_gui import CsrGUI from helpers.csr_client import CSRClient from helpers.audio_importer import AudioImporter from helpers.grpc_connection import GrpcConnection from helpers.common import retrieve_token, parse_csr_commandline, RecognizerOptions -def process_recognition(executor: ThreadPoolExecutor, channel: grpc.Channel, options: RecognizerOptions, access_token: str): +def process_recognition(executor: ThreadPoolExecutor, channel: grpc.Channel, options: RecognizerOptions, access_token: str, gui: Optional[CsrGUI]): audio_resource = AudioImporter(options.audio_file, options.convert_audio) stub = recognition_pb2_grpc.RecognizerStub(channel) - client = CSRClient(executor, stub, options, audio_resource, access_token) + client = CSRClient(executor, stub, options, audio_resource, access_token, gui) client.send_audio() client.wait_for_response() logging.info("Recognition finished") +def init_gui() -> CsrGUI: + gui = CsrGUI() + logging_handler = RichHandler( + console=gui._logging_console, + show_time=False, + show_level=False, + show_path=False + ) + logging.basicConfig( + level=logging.INFO, + format='[%(asctime)s][%(levelname)s]:%(message)s', + handlers=[logging_handler], + force=True + ) + gui.start() + return gui + + def run(options: RecognizerOptions): - logging.info("Connecting to %s", command_line_options.host) + gui = init_gui() if options.gui else None access_token = retrieve_token(command_line_options) 
grpc_connection = GrpcConnection(options.secure_channel, options.client_id, options.client_secret, access_token) with grpc_connection.open(options.host) as grpc_channel: executor = ThreadPoolExecutor() - future = executor.submit(process_recognition, executor, grpc_channel, options, access_token) + future = executor.submit(process_recognition, executor, grpc_channel, options, access_token, gui) future.result() + if gui: + gui.stop() + if __name__ == '__main__': logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s]:%(message)s') - logging.info("Running speechcenter streaming...") command_line_options = parse_csr_commandline() command_line_options.check() run(command_line_options) diff --git a/img/STT_gui.png b/img/STT_gui.png new file mode 100644 index 0000000..f66dab3 Binary files /dev/null and b/img/STT_gui.png differ diff --git a/requirements.txt b/requirements.txt index b6bc6ce..f8e8447 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ grpcio-tools==1.57.0 requests==2.31.0 pyjwt==2.8.0 pause==0.3 +rich==13.9.4