Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ Speech Center allows to easily convert an audio resource into its associated tex
Speech recognition CLI will retrieve an audio file stored locally and send it through streaming to Speech Center's GRPC interface. To run the client please run the `recognizer_stream.py` from the `cli-client` directory. An example transcription request is shown below, all the flags are then detailed.


**Example for streaming**
**Example for CLI**

```shell
cd cli-client/
python3 recognizer_stream.py --audio-file file.wav --topic GENERIC --language en-US --host us.speechcenter.verbio.com --token token.file --asr-version V1 --label project1
python3 recognizer_stream.py --audio-file file.wav --topic GENERIC --language en-US --host us.speechcenter.verbio.com --token token.file --asr-version V2 --label project1 --formatting
```

You can use the `--help` command for more options.
Expand All @@ -92,10 +92,21 @@ This code will generate the following terminal output on success:
"duration": 4.460000
[2023-04-04 12:28:35,412][INFO]:Stream inactivity detected, closing stream...
[2023-04-04 12:28:35,413][INFO]:Recognition finished
```

**Example for CLI with GUI**

Alternatively, you can use the `--gui` parameter to activate the GUI mode:

```shell
cd cli-client/
python3 recognizer_stream.py --audio-file file.wav --topic GENERIC --language en-US --host us.speechcenter.verbio.com --token token.file --asr-version V2 --label project1 --formatting --gui
```

On success, this command will render the following interface in the terminal:

![Real-Time Speech-to-Text GUI](img/STT_gui.png)

You can also run:
```shell
python3 recognizer_stream.py --help
Expand Down Expand Up @@ -242,6 +253,14 @@ This option allows for a one word argument to be sent so that the speech transcr
- **Argument must be the same each time for the same project. If there is a typo another project will be created.**
- **There is no limit on the amount of projects that can be created.**

#### GUI Mode

```
--gui
```

This option enables GUI mode. Check out the **Example for CLI with GUI** above to understand the differences between the standard and the GUI modes.


## Text-To-Speech

Expand Down
5 changes: 4 additions & 1 deletion cli-client/helpers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def parse_tts_command_line() -> SynthesizerOptions:


def retrieve_token(options: SynthesizerOptions) -> str:
logging.info("Reading Speech Center JWT token from %s ...", options.token_file)
logging.info("Reading Speech Center JWT token from %s...", options.token_file)
if options.client_id:
return SpeechCenterCredentials.get_refreshed_token(options.client_id, options.client_secret, options.token_file)
else:
Expand Down Expand Up @@ -130,6 +130,7 @@ def __init__(self):
self.inactivity_timeout = False
self.asr_version = None
self.label = None
self.gui = False
self.client_id = None
self.client_secret = None

Expand Down Expand Up @@ -185,6 +186,7 @@ def parse_csr_commandline() -> RecognizerOptions:
required=False, default=5.0)
parser.add_argument('--asr-version', choices=['V1', 'V2'], help='Selectable asr version', required=True)
parser.add_argument('--label', help='Label for the request', required=False, default="")
parser.add_argument('--gui', help='Enables GUI mode', required=False, default=False, action='store_true')

credential_group = parser.add_argument_group(
'credentials',
Expand All @@ -210,6 +212,7 @@ def parse_csr_commandline() -> RecognizerOptions:
options.inactivity_timeout = float(args.inactivity_timeout)
options.asr_version = args.asr_version
options.label = args.label
options.gui = args.gui

if args.inline_grammar:
options.grammar = VerbioGrammar(VerbioGrammar.INLINE, args.inline_grammar)
Expand Down
22 changes: 21 additions & 1 deletion cli-client/helpers/csr_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,23 @@
import pause
import logging
import datetime
from typing import Optional
sys.path.insert(1, '../proto/generated')

import threading
from threading import Timer
from concurrent.futures import ThreadPoolExecutor
import recognition_streaming_request_pb2

from helpers.csr_gui import CsrGUI
from helpers.common import split_audio
from helpers.audio_importer import AudioImporter
from helpers.common import VerbioGrammar, RecognizerOptions
from helpers.compiled_grammar_processing import get_compiled_grammar


class CSRClient:
def __init__(self, executor: ThreadPoolExecutor, stub, options: RecognizerOptions, audio_resource: AudioImporter, token: str):
def __init__(self, executor: ThreadPoolExecutor, stub, options: RecognizerOptions, audio_resource: AudioImporter, token: str, gui: Optional[CsrGUI]):
self._executor = executor
self._stub = stub
self._resources = audio_resource
Expand All @@ -34,6 +36,7 @@ def __init__(self, executor: ThreadPoolExecutor, stub, options: RecognizerOption
self._diarization = options.diarization
self._hide_partial_results = options.hide_partial_results
self._label = options.label
self._gui = gui
self._messages = None

def _close_stream_by_inactivity(self):
Expand All @@ -45,6 +48,19 @@ def _start_inactivity_timer(self, inactivity_timeout: float):
self._inactivity_timer.start()

def _print_result(self, response):
    """Route a recognition response to the active output sink (GUI or logs)."""
    sink = self._print_result_in_gui if self._gui else self._print_result_in_logs
    sink(response)

def _print_result_in_gui(self, response):
    """Push a transcript alternative into the GUI transcript panel.

    Final hypotheses are always shown; partial hypotheses are shown
    only when partial results are not hidden by the CLI options.
    """
    result = response.result
    text = result.alternatives[0].transcript
    if result.is_final:
        self._gui.add_final_transcript(text)
        return
    if not self._hide_partial_results:
        self._gui.add_partial_transcript(text)

def _print_result_in_logs(self, response):
if response.result.is_final:
transcript = "Final result:\n" \
f'\t"transcript": "{response.result.alternatives[0].transcript}",\n' \
Expand Down Expand Up @@ -96,13 +112,17 @@ def wait_for_response(self) -> bool:
return True

def __message_iterator(self):
if self._gui:
self._gui.start_progress_bar_task(total_audio_samples=self._resources.n_samples)
for message_type, message in self._messages:
logging.info("Sending streaming message " + message_type)
get_up_time = datetime.datetime.now()
if message_type == "audio":
sent_audio_samples = len(message.audio) // self._resources.sample_width
sent_audio_duration = sent_audio_samples / self._resources.sample_rate
get_up_time += datetime.timedelta(seconds=sent_audio_duration)
if self._gui:
self._gui.advance_progress_bar(advance=sent_audio_samples)
yield message
pause.until(get_up_time)
logging.info("All audio messages sent")
Expand Down
128 changes: 128 additions & 0 deletions cli-client/helpers/csr_gui.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import os
from rich.live import Live
from rich.panel import Panel
from rich.layout import Layout
from rich.padding import Padding
from rich.console import Console, RenderableType
from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn

GUI_NUM_REFRESH_PER_SECOND = 4
GUI_PARTIAL_RESULT_TAG = "<PARTIAL>"
GUI_FINAL_RESULT_TAG = "<FINAL>"
GUI_PROGRESS_BAR_TASK_DESCRIPTION_RUNNING = "Sending streaming audio messages"
GUI_PROGRESS_BAR_TASK_DESCRIPTION_FINISHED = "[bold green]All audio sent successfully!"


class CsrGUI:
    """Rich-based live terminal GUI for the streaming recognizer.

    Renders a three-area layout: a progress bar tracking the audio
    samples sent upstream, a scrolling log panel, and a transcript
    panel showing partial/final recognition results.

    Call ``start()`` before any other method (it creates the ``Live``
    screen) and ``stop()`` when recognition finishes to restore the
    terminal.
    """

    def __init__(self):
        self.__init_gui_layout()
        self.__set_logging_panel()
        self.__set_transcript_panel()
        self.__set_progress_bar()

    def __init_gui_layout(self) -> None:
        """Build the root layout: status (progress + logs) above transcript."""
        self._layout = Layout(name="root")
        self._layout.split(
            Layout(name="status", ratio=3),
            Layout(name="transcript", minimum_size=10),
        )
        self._layout["status"].split_column(
            Layout(name="progress", size=3),
            Layout(name="logging"),
        )

    # Fix: the three setters below were annotated ``-> RenderableType`` but
    # never return a value — they mutate the layout in place.

    def __set_logging_panel(self) -> None:
        """Install the log panel into the 'logging' layout slot."""
        self._logging_console = LogPanel()
        logging_panel = Panel(self._logging_console, title="Logs", border_style="green")
        self._layout["logging"].update(logging_panel)

    def __set_transcript_panel(self) -> None:
        """Install the transcript panel into the 'transcript' layout slot."""
        self._transcript_console = TranscriptPanel()
        transcript_panel = Panel(self._transcript_console, title="Transcript", border_style="blue", padding=(2, 2))
        self._layout["transcript"].update(transcript_panel)

    def __set_progress_bar(self) -> None:
        """Install the audio-streaming progress bar into the 'progress' slot."""
        self._progress_bar = Progress(
            TextColumn("{task.description}", justify="right"),
            BarColumn(bar_width=None),
            TextColumn("{task.completed} / {task.total} samples"),
            "[progress.percentage]({task.percentage:>3.1f}%)",
            TimeRemainingColumn(),
        )
        progress_panel = Padding(self._progress_bar, pad=(1, 1))
        self._layout["progress"].update(progress_panel)

    def start(self) -> None:
        """Take over the terminal with the live layout.

        Must be called before the progress-bar/transcript methods, which
        refresh ``self._screen``.
        """
        self._screen = Live(self._layout, refresh_per_second=GUI_NUM_REFRESH_PER_SECOND, screen=False)
        self._screen.start(refresh=True)

    def stop(self) -> None:
        """Tear down the live display and restore normal terminal output."""
        self._screen.stop()

    def start_progress_bar_task(self, total_audio_samples: int) -> None:
        """Create the single progress task sized in audio samples."""
        self._streaming_task = self._progress_bar.add_task(
            GUI_PROGRESS_BAR_TASK_DESCRIPTION_RUNNING, total=total_audio_samples
        )

    def advance_progress_bar(self, advance: int) -> None:
        """Advance the progress task by ``advance`` samples.

        When the task completes, swap its description to the success
        message exactly once (further calls are no-ops on the bar).
        """
        if not self._progress_bar.finished:
            self._progress_bar.update(self._streaming_task, advance=advance)
        if self._progress_bar.finished:
            self._progress_bar.update(self._streaming_task, description=GUI_PROGRESS_BAR_TASK_DESCRIPTION_FINISHED)
        self._screen.refresh()

    def add_partial_transcript(self, transcript: str) -> None:
        """Append a partial hypothesis (tagged, no newline) and refresh."""
        self._transcript_console.print(GUI_PARTIAL_RESULT_TAG, end='')
        self._transcript_console.print(transcript, end='')
        self._screen.refresh()

    def add_final_transcript(self, transcript: str) -> None:
        """Append a final hypothesis (tagged, newline-terminated) and refresh."""
        self._transcript_console.print(GUI_FINAL_RESULT_TAG, end='')
        self._transcript_console.print(transcript)
        self._screen.refresh()


class LogPanel(Console):
    """A recording Console that renders only its most recent log lines.

    Writes go to ``os.devnull``; output is captured through the rich
    ``record=True`` mechanism and replayed on each render, trimmed to
    the panel height.
    """

    def __init__(self, *args, **kwargs):
        # Discard direct output; everything is recovered via export_text().
        super().__init__(*args, record=True, file=open(os.devnull, 'w'), **kwargs)
        self.stored_logs = []

    def __rich_console__(self, console, options):
        # Drain whatever was recorded since the previous render.
        exported = self.export_text(clear=True)
        fresh_lines = [entry for entry in exported.split('\n') if entry]
        # Retain only as many lines as fit in the visible panel.
        self.stored_logs = (self.stored_logs + fresh_lines)[-options.height:]
        yield from self.stored_logs


class TranscriptPanel(Console):
    """Recording Console that accumulates the recognition transcript.

    Text printed to this console is tagged (``GUI_PARTIAL_RESULT_TAG`` /
    ``GUI_FINAL_RESULT_TAG``) by CsrGUI.  On each render, final segments
    are appended permanently while the latest partial hypothesis is shown
    only until the next final result replaces it.
    """

    def __init__(self, *args, **kwargs):
        # Direct console output is discarded; text is recovered via the
        # record=True / export_text mechanism instead.
        console_file = open(os.devnull, 'w')
        super().__init__(record=True, file=console_file, *args, **kwargs)
        # Finalised transcript segments, in arrival order.
        self.stored_transcripts = []
        # True while the most recent text seen was a partial hypothesis.
        self.in_partial_line = False
        # Last partial hypothesis, re-displayed on renders where no new
        # partial text arrived (export_text clears the record each call).
        self.last_partial_line = ""

    def __rich_console__(self, console, options):
        temporary_transcripts = []
        # Drain everything recorded since the previous render; clearing is
        # why partial text must be cached in last_partial_line above.
        transcripts = self.export_text(clear=True)

        for line in transcripts.split('\n'):
            if GUI_FINAL_RESULT_TAG in line:
                # A final result supersedes any pending partial hypothesis.
                final_line = line.split(GUI_FINAL_RESULT_TAG)[-1]
                self.stored_transcripts.append(final_line)
                self.last_partial_line = ""
                self.in_partial_line = False
            elif GUI_PARTIAL_RESULT_TAG in line:
                temporary_line = line.split(GUI_PARTIAL_RESULT_TAG)[-1]
                temporary_transcripts.append(temporary_line)
                self.last_partial_line = temporary_line
                self.in_partial_line = True

        # Nothing new this render but a partial is still live: keep showing it.
        if self.in_partial_line and len(temporary_transcripts) == 0:
            temporary_transcripts = [self.last_partial_line]

        total_transcripts = self.stored_transcripts + temporary_transcripts
        # Clamp to roughly what fits on screen (height * width characters).
        yield "".join(total_transcripts)[-(options.height * options.max_width):]
33 changes: 28 additions & 5 deletions cli-client/recognizer_stream.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,64 @@
#!/usr/bin/env python3
import sys
import logging
from typing import Optional
from rich.logging import RichHandler
sys.path.insert(1, '../proto/generated')

import grpc
import recognition_pb2_grpc
from concurrent.futures import ThreadPoolExecutor

from helpers.csr_gui import CsrGUI
from helpers.csr_client import CSRClient
from helpers.audio_importer import AudioImporter
from helpers.grpc_connection import GrpcConnection
from helpers.common import retrieve_token, parse_csr_commandline, RecognizerOptions


def process_recognition(executor: ThreadPoolExecutor, channel: grpc.Channel, options: RecognizerOptions, access_token: str):
def process_recognition(executor: ThreadPoolExecutor, channel: grpc.Channel, options: RecognizerOptions, access_token: str, gui: Optional[CsrGUI]):
    """Run one full streaming recognition over an open gRPC channel.

    Loads the audio resource, streams it to the recognizer stub and
    blocks until the final response arrives.
    """
    audio = AudioImporter(options.audio_file, options.convert_audio)
    recognizer_stub = recognition_pb2_grpc.RecognizerStub(channel)
    csr_client = CSRClient(executor, recognizer_stub, options, audio, access_token, gui)
    csr_client.send_audio()
    csr_client.wait_for_response()
    logging.info("Recognition finished")


def init_gui() -> CsrGUI:
    """Create the GUI, reroute all logging into its log panel and start it."""
    gui = CsrGUI()
    # NOTE(review): reaches into the private ``_logging_console`` attribute;
    # consider exposing it via a property on CsrGUI.
    gui_log_handler = RichHandler(
        console=gui._logging_console,
        show_time=False,
        show_level=False,
        show_path=False,
    )
    # force=True replaces the plain console handler installed at start-up.
    logging.basicConfig(
        format='[%(asctime)s][%(levelname)s]:%(message)s',
        level=logging.INFO,
        handlers=[gui_log_handler],
        force=True,
    )
    gui.start()
    return gui


def run(options: RecognizerOptions):
    """Connect to the Speech Center host and run a streaming recognition.

    Fixes two defects in the original:
      * it read the module-level ``command_line_options`` global instead of
        the ``options`` parameter (worked only when invoked from __main__);
      * ``gui.stop()`` was skipped on any exception, leaving the terminal
        stuck in rich's Live mode — cleanup now runs in ``finally``.
    """
    logging.info("Connecting to %s", options.host)
    gui = init_gui() if options.gui else None
    try:
        access_token = retrieve_token(options)
        grpc_connection = GrpcConnection(options.secure_channel, options.client_id, options.client_secret, access_token)

        with grpc_connection.open(options.host) as grpc_channel:
            executor = ThreadPoolExecutor()
            future = executor.submit(process_recognition, executor, grpc_channel, options, access_token, gui)
            future.result()
    finally:
        # Always restore the terminal, even when recognition fails.
        if gui:
            gui.stop()


if __name__ == '__main__':
    # Plain console logging; replaced wholesale (basicConfig force=True in
    # init_gui) when the --gui flag is set.
    logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s]:%(message)s')
    logging.info("Running speechcenter streaming...")
    # NOTE(review): run() also references this module-level name directly
    # (not only its parameter), so renaming it would break run() — keep in
    # sync, or make run() use its parameter everywhere.
    command_line_options = parse_csr_commandline()
    command_line_options.check()
    run(command_line_options)
Binary file added img/STT_gui.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ grpcio-tools==1.57.0
requests==2.31.0
pyjwt==2.8.0
pause==0.3
rich==13.9.4