From cea112dce2b5912047c6180a084cac203122fbcd Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 13:14:45 +0200 Subject: [PATCH 01/10] Added basic Voice Activity Detection (VAD) functionality - Implement energy-based VAD node - Add VoiceActivity message type - Create launch file for audio capture with VAD - Update documentation with VAD usage instructions and parameters --- CMakeLists.txt | 31 +- ROS2_Audio_Tools_project_knowledge.md | 812 ++++++++++++++++++++++++ docs/about/implementation.md | 55 ++ docs/usage/overview.md | 12 + docs/usage/voice_activity_detection.md | 123 ++++ launch/audio_vad.launch.py | 31 + msg/VoiceActivity.msg | 8 + src/vad_node.cpp | 205 ++++++ src/vad_node.hpp | 122 ++++ 9 files changed, 1396 insertions(+), 3 deletions(-) create mode 100644 ROS2_Audio_Tools_project_knowledge.md create mode 100644 docs/usage/voice_activity_detection.md create mode 100644 launch/audio_vad.launch.py create mode 100644 msg/VoiceActivity.msg create mode 100644 src/vad_node.cpp create mode 100644 src/vad_node.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 410e587..5be7790 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,12 +12,14 @@ rosidl_generate_interfaces(${PROJECT_NAME} msg/AudioData.msg msg/AudioDataStamped.msg msg/AudioInfo.msg + msg/VoiceActivity.msg DEPENDENCIES std_msgs ) ament_export_dependencies(rosidl_default_runtime) rosidl_get_typesupport_target(cpp_typesupport_target ${PROJECT_NAME} "rosidl_typesupport_cpp") +# Audio Capture Node add_executable(audio_capture_node src/audio_capture_node.cpp) ament_target_dependencies(audio_capture_node @@ -36,7 +38,7 @@ install(TARGETS audio_capture_node DESTINATION lib/${PROJECT_NAME}) - +# Audio Playback Node add_executable(audio_playback_node src/audio_playback_node.cpp) target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) target_include_directories(audio_playback_node PUBLIC ${SDL2_INCLUDE_DIRS}) @@ -50,8 +52,31 @@ ament_target_dependencies(audio_playback_node std_msgs ) +# Voice Activity Detection Node +add_executable(vad_node src/vad_node.cpp) +target_link_libraries(vad_node "${cpp_typesupport_target}") + +if(UNIX AND NOT APPLE) + target_compile_options(vad_node PRIVATE -Wall) +endif() +ament_target_dependencies(vad_node + rclcpp + std_msgs +) + +# Install targets install(TARGETS -audio_playback_node -DESTINATION lib/${PROJECT_NAME}) + audio_capture_node + audio_playback_node + vad_node # Added VAD node + DESTINATION lib/${PROJECT_NAME} +) + +# Install launch files +install(DIRECTORY + launch/ + DESTINATION share/${PROJECT_NAME}/launch +) + ament_package() \ No newline at end of file diff --git a/ROS2_Audio_Tools_project_knowledge.md b/ROS2_Audio_Tools_project_knowledge.md new file mode 100644 index 0000000..080d213 --- /dev/null +++ b/ROS2_Audio_Tools_project_knowledge.md @@ -0,0 +1,812 @@ +# EdgeVox Project Documentation + +# Project Structure +``` +ROS2_Audio_Tools/ +├── conda/ +│ ├── lin/ +│ │ ├── recipe/ +│ │ │ ├── activate.sh +│ │ │ ├── build_ament_cmake.sh +│ │ │ ├── build_ament_python.sh +│ │ │ ├── build_catkin.sh +│ │ │ ├── deactivate.sh +│ │ │ ├── recipe.yaml +│ │ ├── pixi.toml +│ ├── win/ +│ │ ├── recipe/ +│ │ │ ├── activate.bat +│ │ │ ├── bld_ament_cmake.bat +│ │ │ ├── bld_ament_python.bat +│ │ │ ├── bld_catkin.bat +│ │ │ ├── bld_catkin_merge.bat +│ │ │ ├── bld_colcon_merge.bat +│ │ │ ├── deactivate.bat +│ │ │ ├── recipe.yaml +│ │ │ ├── rosdistro_snapshot.yaml +│ │ ├── pixi.toml +├── docs/ +│ ├── about/ +│ │ ├── implementation.md +│ ├── assets/ +│ │ ├── stylesheets/ +│ │ │ ├── logo.css +│ │ ├── favicon.png +│ │ ├── logo.png +│ ├── contributing/ +│ │ ├── license.md +│ │ ├── rules.md +│ ├── usage/ +│ │ ├── overview.md +│ │ ├── ros2_pixi_build_linux_windows.md +│ │ ├── usage_with_native_linux.md +│ ├── index.md +├── msg/ +│ ├── AudioData.msg +│ ├── AudioDataStamped.msg +│ ├── AudioInfo.msg +├── src/ +│ ├── audio_capture_node.cpp +│ ├── audio_capture_node.hpp +│ ├── audio_playback_node.cpp +│ ├── audio_playback_node.hpp +├── CMakeLists.txt +├── LICENSE +├── README.md +├── colcon.pkg +├── mkdocs.yml +├── package.xml +├── pixi.toml +``` + +# src/audio_capture_node.cpp +```cpp +/** + * @file audio_capture_node.cpp + * @brief Audio Capture Node for ROS 2 using SDL2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "audio_capture_node.hpp" +#include + +AudioCaptureNode::AudioCaptureNode() + : Node("audio_capture_node"), _desired_rate(-1.0) { + + // Declare and get parameters + this->declare_parameter("sample_format", "S16LE"); + this->get_parameter("sample_format", _sample_format); + + this->declare_parameter("channels", 1); + this->declare_parameter("sample_rate", 16000); + this->get_parameter("channels", _channels); + this->get_parameter("sample_rate", _sample_rate); + + if (_channels < 1 || _channels > 8) { + RCLCPP_ERROR_STREAM(this->get_logger(), + "Invalid channel count: " << _channels); + throw std::runtime_error("Unsupported media type."); + } + + this->declare_parameter("device", -1); + this->get_parameter("device", _device); + + this->declare_parameter("chunk_size", 1024); + int chunk_size; + this->get_parameter("chunk_size", chunk_size); + + if (chunk_size < 512 || chunk_size > 4096) { + RCLCPP_ERROR_STREAM(this->get_logger(), + "Invalid chunk size: " << chunk_size); + throw std::runtime_error("Unsupported chunk size!"); + } + + if (!ConfigureSDLStream(_device, _sample_format, chunk_size, _channels)) { + throw std::runtime_error("Failed to configure SDL audio capture."); + } + + // Publishers + _pub = this->create_publisher("audio", 10); + auto info_qos = rclcpp::QoS(rclcpp::KeepLast(1)).transient_local(); + _pub_info = this->create_publisher("audio_info", + info_qos); + + _pub_stamped = this->create_publisher( + "audio_stamped", 10); + + _timer_info = rclcpp::create_timer(this, get_clock(), std::chrono::seconds(5), + [this] { publishInfo(); }); + publishInfo(); +} + +void AudioCaptureNode::callback(uint8_t *stream, int len) { + audio_tools::msg::AudioData msg; + audio_tools::msg::AudioDataStamped stamped_msg; + + // Generate frame ID + stamped_msg.header.frame_id = std::to_string(_audio_pos++); + // Compute timestamp based on buffer size + size_t buffer_size_total_nsec = + (static_cast(len) * 1e9) / (_sample_rate * _nbytes); + + _nano_sec += buffer_size_total_nsec; + while (_nano_sec >= RCL_S_TO_NS(1)) { + _sec++; + _nano_sec -= RCL_S_TO_NS(1); + } + + stamped_msg.header.stamp.sec = _sec; + stamped_msg.header.stamp.nanosec = _nano_sec; + + // Copy audio data + msg.data.assign(stream, stream + len); + + stamped_msg.audio = msg; + stamped_msg.info = _info_msg; + // Publish messages + _pub->publish(msg); + _pub_stamped->publish(stamped_msg); +} + +void AudioCaptureNode::publishInfo() { + _info_msg.channels = _channels; + _info_msg.sample_rate = _sample_rate; + _info_msg.sample_format = _sample_format; + + _pub_info->publish(_info_msg); +} + +AudioCaptureNode::~AudioCaptureNode() { + SDL_PauseAudioDevice(_dev_id_in, 1); + SDL_CloseAudioDevice(_dev_id_in); +} + +bool AudioCaptureNode::ConfigureSDLStream(int device, const std::string &format, + int chunk_size, int channels) { + SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO); + + if (SDL_Init(SDL_INIT_AUDIO) < 0) { + RCLCPP_ERROR(this->get_logger(), "SDL initialization failed: %s", + SDL_GetError()); + return false; + } + + SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", + SDL_HINT_OVERRIDE); + + int nDevices = SDL_GetNumAudioDevices(SDL_TRUE); + RCLCPP_INFO(this->get_logger(), "%s: Found %d capture devices:\n", __func__, + nDevices); + + for (int i = 0; i < nDevices; i++) { + RCLCPP_INFO(this->get_logger(), " - Capture device #%d: '%s'\n", i, + SDL_GetAudioDeviceName(i, SDL_TRUE)); + } + + SDL_zero(_capture_spec_requested); + SDL_zero(_capture_spec_obtained); + + auto format_it = _format_map.find(format); + if (format_it == _format_map.end()) { + RCLCPP_ERROR(this->get_logger(), "Unknown audio format: %s", + format.c_str()); + return false; + } + _nbytes = format_it->second.second; + + _capture_spec_requested.freq = _sample_rate; + _capture_spec_requested.format = format_it->second.first; + _capture_spec_requested.channels = channels; + _capture_spec_requested.samples = chunk_size; + _capture_spec_requested.callback = [](void *userdata, uint8_t *stream, + int len) { + static_cast(userdata)->callback(stream, len); + }; + _capture_spec_requested.userdata = this; + + if (device >= 0 && device < nDevices) { + RCLCPP_INFO(this->get_logger(), + "%s: Attempting to open capture device #%d: '%s'...\n", + __func__, device, SDL_GetAudioDeviceName(device, SDL_TRUE)); + + _dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(device, SDL_TRUE), + SDL_TRUE, &_capture_spec_requested, + &_capture_spec_obtained, 0); + } else { + RCLCPP_INFO(this->get_logger(), + "%s: Attempting to open default capture device...\n", __func__); + _dev_id_in = + SDL_OpenAudioDevice(nullptr, SDL_TRUE, &_capture_spec_requested, + &_capture_spec_obtained, 0); + } + + if (!_dev_id_in) { + RCLCPP_INFO(this->get_logger(), "%s: Failed to open audio device: %s\n", + __func__, SDL_GetError()); + return false; + } + + RCLCPP_INFO(this->get_logger(), "%s: Capture device opened (SDL Id = %d):\n", + __func__, _dev_id_in); + RCLCPP_INFO(this->get_logger(), " - Sample rate: %d\n", + _capture_spec_obtained.freq); + RCLCPP_INFO(this->get_logger(), " - Format: %d (Requested: %d)\n", + _capture_spec_obtained.format, _capture_spec_requested.format); + RCLCPP_INFO(this->get_logger(), " - Channels: %d (Requested: %d)\n", + _capture_spec_obtained.channels, + _capture_spec_requested.channels); + RCLCPP_INFO(this->get_logger(), " - Samples per frame: %d\n", + _capture_spec_obtained.samples); + + _sample_rate = _capture_spec_obtained.freq; + SDL_PauseAudioDevice(_dev_id_in, 0); + return true; +} + +int main(int argc, char *argv[]) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} +``` + + +# src/audio_capture_node.hpp +```cpp +/** + * @file audio_capture_node.hpp + * @brief Audio Capture Node for ROS 2 using SDL2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AUDIO_CAPTURE_NODE_HPP +#define AUDIO_CAPTURE_NODE_HPP + +#include +#include +#include + +#include "audio_tools/msg/audio_data.hpp" +#include "audio_tools/msg/audio_data_stamped.hpp" +#include "audio_tools/msg/audio_info.hpp" + +/** + * @class AudioCaptureNode + * @brief A ROS 2 node for capturing and publishing audio data using SDL2. + */ +class AudioCaptureNode : public rclcpp::Node { +public: + /** + * @brief Constructor for the AudioCaptureNode. + * Initializes audio capture, publishers, and diagnostics. + */ + AudioCaptureNode(); + + /** + * @brief Exits the program with the given exit code. + * @param code The exit code. + */ + void exitOnMainThread(int code); + + /** + * @brief SDL2 audio callback function. + * Processes captured audio and publishes it as a ROS message. + * @param stream Pointer to the audio stream buffer. + * @param len Length of the audio buffer in bytes. + */ + void callback(uint8_t *stream, int len); + + /** + * @brief Publishes audio information (sample rate, channels, format). + */ + void publishInfo(); + + /** + * @brief Destructor for AudioCaptureNode. + * Cleans up SDL audio resources. + */ + ~AudioCaptureNode(); + +private: + /** + * @brief ROS publisher for raw audio data. + */ + rclcpp::Publisher::SharedPtr _pub; + + /** + * @brief ROS publisher for audio metadata (e.g., sample rate, format). + */ + rclcpp::Publisher::SharedPtr _pub_info; + + /** + * @brief Timer for periodically publishing audio metadata. + */ + rclcpp::TimerBase::SharedPtr _timer_info; + + /** + * @brief Audio information metadata + */ + audio_tools::msg::AudioInfo _info_msg; + + /** + * @brief SDL audio specifications for requested and obtained formats. + */ + SDL_AudioSpec _capture_spec_requested; + SDL_AudioSpec _capture_spec_obtained; + + /** + * @brief Audio parameters: bitrate, channels, depth, sample rate, and device + * index. + */ + int _bitrate, _channels, _sample_rate, _device; + + /** + * @brief Number of bytes processed. + */ + size_t _nbytes; + + /** + * @brief Position counter for audio frames. + */ + size_t _audio_pos = 0; + + /** + * @brief Nanosecond and second timestamps for synchronization. + */ + size_t _nano_sec = 0; + size_t _sec = 0; + + /** + * @brief Desired publishing rate for diagnostics. + */ + double _desired_rate; + + /** + * @brief Audio sample format (e.g., "S16LE", "F32"). + */ + std::string _sample_format; + + /** + * @brief SDL audio device ID. + */ + SDL_AudioDeviceID _dev_id_in = 0; + + /** + * @brief ROS publisher for timestamped audio data with diagnostics. + */ + rclcpp::Publisher::SharedPtr _pub_stamped; + + /** + * @brief Configures the SDL2 audio capture stream. + * @param device The SDL2 device index (-1 for default). + * @param format The requested audio format (e.g., "S16LE"). + * @param chunk_size The buffer size for each audio chunk (e.g., 1024 + * samples). + * @param channels The number of audio channels (e.g., 1 for mono, 2 for + * stereo). + * @return True if configuration is successful, false otherwise. + */ + bool ConfigureSDLStream(int device, const std::string &format, int chunk_size, + int channels); + + /** + * @brief Maps audio format strings to SDL2 audio formats and corresponding + * byte sizes. + */ + const std::unordered_map> + _format_map = { + {"U8", {AUDIO_U8, 1}}, ///< Unsigned 8-bit (1 byte) + {"S8", {AUDIO_S8, 1}}, ///< Signed 8-bit (1 byte) + {"U16LE", + {AUDIO_U16LSB, 2}}, ///< Unsigned 16-bit little-endian (2 bytes) + {"U16BE", + {AUDIO_U16MSB, 2}}, ///< Unsigned 16-bit big-endian (2 bytes) + {"S16LE", + {AUDIO_S16LSB, 2}}, ///< Signed 16-bit little-endian (2 bytes) + {"S16BE", {AUDIO_S16MSB, 2}}, ///< Signed 16-bit big-endian (2 bytes) + {"S32LE", + {AUDIO_S32LSB, 4}}, ///< Signed 32-bit little-endian (4 bytes) + {"S32BE", {AUDIO_S32MSB, 4}}, ///< Signed 32-bit big-endian (4 bytes) + {"F32LE", + {AUDIO_F32LSB, 4}}, ///< 32-bit float little-endian (4 bytes) + {"F32BE", {AUDIO_F32MSB, 4}}, ///< 32-bit float big-endian (4 bytes) + {"F32", {AUDIO_F32, 4}} ///< 32-bit float (generic) (4 bytes) + }; +}; + +#endif /* AUDIO_CAPTURE_NODE_HPP */ + +``` + + +# src/audio_playback_node.cpp +```cpp +/** + * @file audio_playback_node.cpp + * @brief Audio Capture Node for ROS 2 using SDL2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "audio_playback_node.hpp" +#include + +AudioPlaybackNode::AudioPlaybackNode() + : Node("audio_playback_node"), _audio_device(0), _initialized(false) { + // Declare and get topic name parameter + std::string topic_name = + this->declare_parameter("audio_topic", "/audio_playback"); + int device_index = + this->declare_parameter("device_index", -1); // -1 = default + + _subscription = this->create_subscription( + topic_name, 10, + std::bind(&AudioPlaybackNode::audio_cb, this, std::placeholders::_1)); + + if (SDL_Init(SDL_INIT_AUDIO) < 0) { + RCLCPP_FATAL(get_logger(), "SDL_Init failed: %s", SDL_GetError()); + rclcpp::shutdown(); + } + int num_devices = SDL_GetNumAudioDevices(0); + RCLCPP_INFO(get_logger(), "Available audio output devices:"); + for (int i = 0; i < num_devices; ++i) { + const char *name = SDL_GetAudioDeviceName(i, 0); + RCLCPP_INFO(get_logger(), " [%d]: %s%s", i, name, + (i == device_index ? " (selected)" : "")); + } + + if (device_index >= 0 && device_index < num_devices) { + _device_name = SDL_GetAudioDeviceName(device_index, 0); + RCLCPP_INFO(get_logger(), "Using audio device: %s", _device_name.c_str()); + } else if (device_index == -1) { + _device_name = ""; // SDL default + RCLCPP_INFO(get_logger(), "Using default audio device"); + } else { + RCLCPP_FATAL(get_logger(), "Invalid device_index: %d (available: 0 to %d)", + device_index, num_devices - 1); + rclcpp::shutdown(); + } + + RCLCPP_INFO(get_logger(), "Subscribed to topic: %s", topic_name.c_str()); +} + +AudioPlaybackNode::~AudioPlaybackNode() { + if (_audio_device != 0) { + SDL_CloseAudioDevice(_audio_device); + } + SDL_Quit(); +} + +bool AudioPlaybackNode::init_audio(const audio_tools::msg::AudioInfo &info) { + SDL_AudioSpec desired_spec; + SDL_zero(desired_spec); + + desired_spec.freq = static_cast(info.sample_rate); + desired_spec.channels = info.channels; + desired_spec.samples = 4096; + desired_spec.callback = nullptr; + + if (info.sample_format == "S16LE") { + desired_spec.format = AUDIO_S16LSB; + } else if (info.sample_format == "S16BE") { + desired_spec.format = AUDIO_S16MSB; + } else if (info.sample_format == "U8") { + desired_spec.format = AUDIO_U8; + } else if (info.sample_format == "S8") { + desired_spec.format = AUDIO_S8; + } else if (info.sample_format == "F32LE") { + desired_spec.format = AUDIO_F32LSB; + } else if (info.sample_format == "F32BE") { + desired_spec.format = AUDIO_F32MSB; + } else { + RCLCPP_ERROR(get_logger(), "Unsupported format: %s", + info.sample_format.c_str()); + return false; + } + + _audio_device = + SDL_OpenAudioDevice(_device_name.empty() ? nullptr : _device_name.c_str(), + 0, // 0 = output + &desired_spec, &_obtained_spec, 0); + if (_audio_device == 0) { + RCLCPP_ERROR(get_logger(), "SDL_OpenAudioDevice error: %s", SDL_GetError()); + return false; + } + + SDL_PauseAudioDevice(_audio_device, 0); + RCLCPP_INFO(get_logger(), "Audio device initialized: %d Hz, %d channels", + _obtained_spec.freq, _obtained_spec.channels); + + _initialized = true; + return true; +} + +void AudioPlaybackNode::audio_cb( + const audio_tools::msg::AudioDataStamped::SharedPtr msg) { + const auto &info = msg->info; + const auto &audio = msg->audio; + + if (!_initialized) { + if (!init_audio(info)) { + RCLCPP_ERROR(get_logger(), "Failed to initialize audio output."); + return; + } + } + + if (audio.data.empty()) + return; + + if (SDL_QueueAudio(_audio_device, audio.data.data(), audio.data.size()) < 0) { + RCLCPP_ERROR(get_logger(), "SDL_QueueAudio error: %s", SDL_GetError()); + } +} + +int main(int argc, char *argv[]) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} +``` + + +# src/audio_playback_node.hpp +```cpp +/** + * @file audio_playback_node.hpp + * @brief Audio Playback Node for ROS 2 using SDL2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AUDIO_PLAYBACK_NODE_HPP +#define AUDIO_PLAYBACK_NODE_HPP +#include +#include +#include + +#include "audio_tools/msg/audio_data.hpp" +#include "audio_tools/msg/audio_data_stamped.hpp" +#include "audio_tools/msg/audio_info.hpp" + +/** + * @class AudioPlaybackNode + * @brief A ROS 2 node for capturing and publishing audio data using SDL2. + */ +class AudioPlaybackNode : public rclcpp::Node { +public: + /** + * @brief Constructor for the AudioPlaybackNode. + * Initializes audio capture, publishers, and diagnostics. + */ + AudioPlaybackNode(); + + /** + * @brief Destructor for AudioPlaybackNode. + * Cleans up SDL audio resources. + */ + ~AudioPlaybackNode(); + +private: + /** + * @brief Callback function for received audio_msg on subscribed topic + * @param msg The audio message which got received + */ + void audio_cb(const audio_tools::msg::AudioDataStamped::SharedPtr msg); + /** + * @brief Function which automatically configures the audio_interface based on + * the received audio information + * @param info Custom message which contains the channels, sample_rate and + * encoding format + * @return false if failed e.g. invalid format, true if ok + */ + bool init_audio(const audio_tools::msg::AudioInfo &info); + + /** + * @brief Subscription object/handle for the subscribed audio topic + */ + rclcpp::Subscription::SharedPtr + _subscription; + + /** + * @brief SDL audio device handle. + */ + SDL_AudioDeviceID _audio_device; + + /** + * @brief SDL audio specifications for obtained format. + */ + SDL_AudioSpec _obtained_spec; + + /** + * @brief Full name of audio_device + */ + std::string _device_name; + + /** + * @brief Internal state to prevent initializing audio interface twice + */ + bool _initialized; +}; +#endif /* AUDIO_PLAYBACK_NODE */ +``` + + +# CMakeLists.txt +```cmake +cmake_minimum_required(VERSION 3.1...3.14) +project(audio_tools C CXX) + +find_package(ament_cmake REQUIRED) +find_package(rclcpp REQUIRED) +find_package(builtin_interfaces REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(std_msgs REQUIRED) +find_package(SDL2 REQUIRED) + +rosidl_generate_interfaces(${PROJECT_NAME} + msg/AudioData.msg + msg/AudioDataStamped.msg + msg/AudioInfo.msg + DEPENDENCIES std_msgs +) + +ament_export_dependencies(rosidl_default_runtime) +rosidl_get_typesupport_target(cpp_typesupport_target ${PROJECT_NAME} "rosidl_typesupport_cpp") + +add_executable(audio_capture_node src/audio_capture_node.cpp) + +ament_target_dependencies(audio_capture_node + rclcpp + std_msgs +) + +target_link_libraries(audio_capture_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) +target_include_directories(audio_capture_node PUBLIC ${SDL2_INCLUDE_DIRS}) + +if(UNIX AND NOT APPLE) # Linux only + target_compile_options(audio_capture_node PRIVATE -Wall) +endif() + +install(TARGETS +audio_capture_node +DESTINATION lib/${PROJECT_NAME}) + + +add_executable(audio_playback_node src/audio_playback_node.cpp) +target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) +target_include_directories(audio_playback_node PUBLIC ${SDL2_INCLUDE_DIRS}) + +if(UNIX AND NOT APPLE) + target_compile_options(audio_playback_node PRIVATE -Wall) +endif() + +ament_target_dependencies(audio_playback_node + rclcpp + std_msgs +) + + +install(TARGETS +audio_playback_node +DESTINATION lib/${PROJECT_NAME}) +ament_package() +``` + + +# mkdocs.yml +```text +site_name: ROS2_Audio_Tools +extra_css: + - assets/stylesheets/logo.css +theme: + name: material + # custom_dir: docs/overrides + features: + - announce.dismiss + - content.action.edit + - content.action.view + - content.code.annotate + - content.code.copy + # - content.code.select + # - content.footnote.tooltips + # - content.tabs.link + - content.tooltips + # - header.autohide + # - navigation.expand + - navigation.footer + - navigation.indexes + # - navigation.instant + # - navigation.instant.prefetch + # - navigation.instant.progress + # - navigation.prune + - navigation.sections + - navigation.tabs + # - navigation.tabs.sticky + - navigation.top + - navigation.tracking + - search.highlight + - search.share + - search.suggest + - toc.follow + # - toc.integrate + palette: + scheme: default + primary: green + toggle: + icon: material/toggle-switch + name: Switch to dark mode + font: + text: Avenir Next Condensed + code: Avenir Next Condensed + favicon: assets/favicon.png + logo: assets/logo.png + +markdown_extensions: + - attr_list + - md_in_html + - admonition + - pymdownx.details + - pymdownx.superfences + +nav: + - index.md + - Getting Started: + - usage/overview.md + - Build Environment: + - usage/ros2_pixi_build_linux_windows.md + - usage/usage_with_native_linux.md + - Implementation: + - about/implementation.md + - Contributing: + - contributing/rules.md + - contributing/license.md +``` + diff --git a/docs/about/implementation.md b/docs/about/implementation.md index 292fabc..61a96a2 100644 --- a/docs/about/implementation.md +++ b/docs/about/implementation.md @@ -140,3 +140,58 @@ ros2 run audio_tools audio_playback_node \ - `SDL2` - Custom messages in `audio_tools` + +## `vad_node` + +**Voice Activity Detection Node for ROS 2** + +Analyzes audio streams to detect the presence of speech or voice activity and publishes the detection state. Useful for conversation systems, wake word triggers, and audio recording automation. + +--- +### ✅ Topics +| Topic | Type | Description | +|------------------|------------------------------------------|-----------------------------------| +| `/voice_activity` | `audio_tools/msg/VoiceActivity` | Voice activity detection state | +| Subscribes to: | | | +| `/audio_stamped` | `audio_tools/msg/AudioDataStamped` | Audio input for analysis | +--- + +### ⚙️ Parameters +| Name | Type | Default | Description | +|----------------|----------|---------|---------------------------------------------| +| `energy_threshold`| `float` | `0.01` | Detection sensitivity threshold | +| `hold_time` | `float` | `0.5` | Time in seconds to maintain detection after voice stops | +| `min_samples` | `int` | `160` | Minimum samples needed for detection decision | +--- + +### 📦 Message Notes +- `VoiceActivity`: + - `std_msgs/Header header` with `stamp` and `frame_id` + - `bool active` - Detection state (true/false) + - `float32 energy_level` - Current audio energy level + - `float32 threshold` - Detection threshold in use + - `float32 hold_time` - Time to hold detection state after voice stops +--- + +### 🧩 Implementation Notes +- Uses an **energy-based algorithm** for voice detection. +- Converts all audio formats to normalized float [-1.0, 1.0] for processing. +- Energy calculation: `energy = sum(sample * sample) / num_samples` +- Hold timer prevents rapid switching between states during pauses. +- Format conversion supports U8, S8, S16LE, S16BE, F32, etc. +--- + +### 🏁 Launch Example +```bash +ros2 run audio_tools vad_node \ + --ros-args \ +-p energy_threshold:=0.015 \ +-p hold_time:=0.7 \ +-p min_samples:=160 +``` +--- + +### 🧪 Dependencies +- `rclcpp` +- `audio_tools/msg/AudioDataStamped` +- `audio_tools/msg/VoiceActivity` \ No newline at end of file diff --git a/docs/usage/overview.md b/docs/usage/overview.md index caa16c0..567b3d1 100644 --- a/docs/usage/overview.md +++ b/docs/usage/overview.md @@ -57,3 +57,15 @@ ros2 run audio_tools audio_capture_node ros2 run audio_tools audio_playback_node \ --ros-args -p audio_topic:=/audio_stamped ``` + +### 🗣️ 5. Voice Activity Detection + +Capture microphone input and publish to `/audio_stamped`. Simple energy-based Voice Activity Detection (VAD) node on `/audio_stamped` and publishes detection results to `/voice_activity`. Note that VAD parameters are set in launch file. + +```bash +# Terminal 1 +ros2 run audio_tools audio_vad.launch.py + +# Terminal 2 +ros2 topic echo /voice_activity +``` diff --git a/docs/usage/voice_activity_detection.md b/docs/usage/voice_activity_detection.md new file mode 100644 index 0000000..c9a7547 --- /dev/null +++ b/docs/usage/voice_activity_detection.md @@ -0,0 +1,123 @@ +# Voice Activity Detection + +The Voice Activity Detection (VAD) node provides real-time detection of voice or speech in audio streams. The node uses a simple energy-based algorithm to determine if voice is present in the audio signal. + +## How it works + +The VAD node subscribes to audio data and performs the following steps: + +1. Converts the incoming audio to floating-point samples +2. Calculates the energy level of the audio signal +3. Compares this energy to a threshold +4. Applies a hold time to prevent rapid on/off switching +5. Publishes the detection state + +## Node Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `energy_threshold` | float | 0.01 | Energy threshold for detection | +| `hold_time` | float | 0.5 | Time in seconds to maintain detection after voice stops | +| `min_samples` | int | 160 | Minimum number of samples required for detection | + +## Topics + +### Subscribed Topics + +- `audio_stamped` ([audio_tools/AudioDataStamped](../msg/AudioDataStamped.md)): Audio data with timestamp and format information + +### Published Topics + +- `voice_activity` ([audio_tools/VoiceActivity](../msg/VoiceActivity.md)): Voice activity detection results + +## Message Format + +The `VoiceActivity` message includes: + +``` +std_msgs/Header header # Timestamp and frame information +bool active # True when voice activity is detected +float32 energy_level # Current audio energy level +float32 threshold # Current threshold being used +float32 hold_time # Hold time in seconds for the detection +``` + +## Usage Example + +To run the VAD node separately: + +```bash +ros2 run audio_tools vad_node --ros-args -p energy_threshold:=0.015 -p hold_time:=0.3 +``` + +To run with the audio capture node: + +```bash +ros2 launch audio_tools audio_vad.launch.py +``` + +## Tuning the VAD + +The energy threshold is the most important parameter to tune: + +- **Too low**: Will trigger on background noise +- **Too high**: May miss quiet speech + +### Step-by-Step Tuning Process + +1. **Monitor the energy level of your audio**: + ```bash + ros2 topic echo /voice_activity + ``` + +2. **Determine your noise floor**: + Remain silent and observe the `energy_level` values. Note the maximum level during silence. + +3. **Determine speech energy level**: + Speak at a normal volume and note the typical `energy_level` values. + +4. **Set the threshold**: + Set the threshold slightly above your noise floor: + ```bash + ros2 param set /vad energy_threshold 0.015 # Adjust based on your observations + ``` + +5. **Adjust hold time**: + If detection is fragmented during normal speech, increase the hold time: + ```bash + ros2 param set /vad hold_time 0.7 # Adjust as needed + ``` + +### Visual Tuning + +For visual tuning, use RQT to plot the energy levels: + +```bash +rqt_plot /voice_activity/energy_level /voice_activity/threshold /voice_activity/active +``` + +This will display: +- Your audio energy over time +- The threshold line +- The detection state (0 or 1) + +### Environment-Specific Settings + +Here are suggested starting points for different environments: + +| Environment | Suggested energy_threshold | Suggested hold_time | +|-------------|----------------------------|---------------------| +| Quiet room, close mic | 0.005-0.01 | 0.3-0.5s | +| Office, close mic | 0.01-0.02 | 0.5-0.7s | +| Noisy environment | 0.02-0.05 | 0.7-1.0s | +| Far-field mic | 0.01-0.03 | 0.7-1.0s | + +## Limitations + +This implementation uses a simple energy-based detector which works well in low-noise environments but may not perform optimally in: + +- Noisy environments +- Situations with varying background noise +- When capturing audio from distant microphones + +For more advanced detection in challenging environments, consider integrating a machine learning-based VAD algorithm. \ No newline at end of file diff --git a/launch/audio_vad.launch.py b/launch/audio_vad.launch.py new file mode 100644 index 0000000..3299074 --- /dev/null +++ b/launch/audio_vad.launch.py @@ -0,0 +1,31 @@ +from launch import LaunchDescription +from launch_ros.actions import Node + +def generate_launch_description(): + return LaunchDescription([ + Node( + package='audio_tools', + executable='audio_capture_node', + name='audio_capture', + parameters=[{ + 'sample_format': 'S16LE', + 'channels': 1, + 'sample_rate': 16000, + 'device': -1, # Use default device + 'chunk_size': 1024, + }], + output='screen', + ), + + Node( + package='audio_tools', + executable='vad_node', + name='vad', + parameters=[{ + 'energy_threshold': 0.01, + 'hold_time': 0.5, + 'min_samples': 160, + }], # Office, close mic + output='screen', + ), + ]) \ No newline at end of file diff --git a/msg/VoiceActivity.msg b/msg/VoiceActivity.msg new file mode 100644 index 0000000..8a31c90 --- /dev/null +++ b/msg/VoiceActivity.msg @@ -0,0 +1,8 @@ +std_msgs/Header header +bool active +# Current audio energy level +float32 energy_level +# Current threshold being used +float32 threshold +# Hold time in seconds for the detection +float32 hold_time diff --git a/src/vad_node.cpp b/src/vad_node.cpp new file mode 100644 index 0000000..f4b614a --- /dev/null +++ b/src/vad_node.cpp @@ -0,0 +1,205 @@ +/** + * @file vad_node.cpp + * @brief Voice Activity Detection Node for ROS 2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "vad_node.hpp" +#include +#include +#include + +VADNode::VADNode() : Node("vad_node"), _current_vad_state(false) { + // Declare and get parameters + this->declare_parameter("energy_threshold", 0.01); + this->declare_parameter("hold_time", 0.5); + this->declare_parameter("min_samples", 160); + + this->get_parameter("energy_threshold", _energy_threshold); + this->get_parameter("hold_time", _hold_time); + this->get_parameter("min_samples", _min_samples); + + RCLCPP_INFO(this->get_logger(), "VAD initialized with parameters:"); + RCLCPP_INFO(this->get_logger(), " - Energy threshold: %.4f", _energy_threshold); + RCLCPP_INFO(this->get_logger(), " - Hold time: %.2f seconds", _hold_time); + RCLCPP_INFO(this->get_logger(), " - Min samples: %d", _min_samples); + + // Initialize the last voice time to current time + _last_voice_time = this->now(); + + // Create publisher for voice activity detection + _vad_pub = this->create_publisher("voice_activity", 10); + + // Subscribe to audio data stamped topic + _audio_sub = this->create_subscription( + "audio_stamped", 10, + std::bind(&VADNode::audioCallback, this, std::placeholders::_1)); + + RCLCPP_INFO(this->get_logger(), "VAD node is ready"); +} + +void VADNode::audioCallback(const audio_tools::msg::AudioDataStamped::SharedPtr msg) { + const auto &audio_data = msg->audio.data; + const auto &audio_info = msg->info; + + // Skip if we have no data + if (audio_data.empty()) { + return; + } + + // Convert audio data to float samples for processing + std::vector samples; + if (!convertToFloatSamples(audio_data, audio_info.sample_format, samples)) { + RCLCPP_WARN(this->get_logger(), "Failed to convert audio format %s", + audio_info.sample_format.c_str()); + return; + } + + // Calculate energy level + float energy_level = calculateEnergyLevel(samples); + + // Detect voice activity + bool voice_active = (energy_level > _energy_threshold) && + (samples.size() >= static_cast(_min_samples)); + + // Handle hold time logic + if (voice_active) { + _last_voice_time = this->now(); + + if (!_current_vad_state) { + _current_vad_state = true; + RCLCPP_DEBUG(this->get_logger(), "Voice activity started"); + } + } else { + // Check if we're within the hold time + rclcpp::Duration time_since_voice = this->now() - _last_voice_time; + if (_current_vad_state && time_since_voice.seconds() > _hold_time) { + _current_vad_state = false; + RCLCPP_DEBUG(this->get_logger(), "Voice activity ended"); + } + } + + // Publish the current VAD state + publishVoiceActivity(_current_vad_state, energy_level, msg->header.stamp); +} + +bool VADNode::detectVoiceActivity(const std::vector &audio_data, + const std::string &sample_format) { + // Convert to float samples + std::vector samples; + if (!convertToFloatSamples(audio_data, sample_format, samples)) { + return false; + } + + // Check if we have enough samples + if (samples.size() < static_cast(_min_samples)) { + return false; + } + + // Calculate energy level + float energy_level = calculateEnergyLevel(samples); + + // Compare with threshold + return energy_level > _energy_threshold; +} + +float VADNode::calculateEnergyLevel(const std::vector &samples) { + float energy = 0.0f; + + for (const auto &sample : samples) { + energy += sample * sample; + } + + // Normalize by number of samples + if (!samples.empty()) { + energy /= samples.size(); + } + + return energy; +} + +bool VADNode::convertToFloatSamples(const std::vector &audio_data, + const std::string &sample_format, + std::vector &out_samples) { + // Find the byte size for this format + auto it = _format_byte_size.find(sample_format); + if (it == _format_byte_size.end()) { + RCLCPP_ERROR(this->get_logger(), "Unknown audio format: %s", sample_format.c_str()); + return false; + } + + int bytes_per_sample = it->second; + size_t num_samples = audio_data.size() / bytes_per_sample; + out_samples.resize(num_samples); + + // Convert based on format + if (sample_format == "F32" || sample_format == "F32LE") { + // Direct copy for float format (assuming little endian) + std::memcpy(out_samples.data(), audio_data.data(), audio_data.size()); + } + else if (sample_format == "S16LE") { + // Convert signed 16-bit LE to float + const int16_t *int_samples = reinterpret_cast(audio_data.data()); + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = int_samples[i] / 32768.0f; + } + } + else if (sample_format == "S8") { + // Convert signed 8-bit to float + const int8_t *int_samples = reinterpret_cast(audio_data.data()); + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = int_samples[i] / 128.0f; + } + } + else if (sample_format == "U8") { + // Convert unsigned 8-bit to float + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = (audio_data[i] - 128) / 128.0f; + } + } + else { + RCLCPP_WARN(this->get_logger(), + "Format conversion not implemented for %s, using raw values", + sample_format.c_str()); + + // Just copy raw bytes as a fallback + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = static_cast(audio_data[i * bytes_per_sample]) / 255.0f; + } + } + + return true; +} + +void VADNode::publishVoiceActivity(bool active, float energy_level, + const rclcpp::Time ×tamp) { + auto msg = std::make_unique(); + + msg->header.stamp = timestamp; + msg->header.frame_id = "audio_frame"; + msg->active = active; + msg->energy_level = energy_level; + msg->threshold = _energy_threshold; + msg->hold_time = _hold_time; + + _vad_pub->publish(std::move(msg)); +} + +int main(int argc, char *argv[]) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; +} \ No newline at end of file diff --git a/src/vad_node.hpp b/src/vad_node.hpp new file mode 100644 index 0000000..16c5011 --- /dev/null +++ b/src/vad_node.hpp @@ -0,0 +1,122 @@ +/** + * @file vad_node.hpp + * @brief Voice Activity Detection Node for ROS 2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef VAD_NODE_HPP +#define VAD_NODE_HPP + +#include + +#include "audio_tools/msg/audio_data.hpp" +#include "audio_tools/msg/audio_data_stamped.hpp" +#include "audio_tools/msg/audio_info.hpp" +#include "audio_tools/msg/voice_activity.hpp" + +/** + * @class VADNode + * @brief A ROS 2 node for Voice Activity Detection (VAD) using audio streams. + */ +class VADNode : public rclcpp::Node { +public: + /** + * @brief Constructor for the VADNode. + * Initializes subscribers, publishers, and parameters. + */ + VADNode(); + + /** + * @brief Destructor for VADNode. + */ + ~VADNode() = default; + +private: + /** + * @brief Callback for audio data messages. + * @param msg The audio message received from the audio capture node. + */ + void audioCallback(const audio_tools::msg::AudioDataStamped::SharedPtr msg); + + /** + * @brief Processes audio data to detect voice activity. + * @param audio_data The raw audio data samples. + * @param sample_format The format of the audio samples. + * @return True if voice activity is detected, false otherwise. + */ + bool detectVoiceActivity(const std::vector &audio_data, + const std::string &sample_format); + + /** + * @brief Converts audio data to float samples for processing. + * @param audio_data The raw audio data. + * @param sample_format The format of the audio samples. + * @param out_samples Output vector of float samples. + * @return True if conversion was successful, false otherwise. + */ + bool convertToFloatSamples(const std::vector &audio_data, + const std::string &sample_format, + std::vector &out_samples); + + /** + * @brief Calculates the energy level of the audio samples. + * @param samples The audio samples in float format. + * @return The calculated energy level. + */ + float calculateEnergyLevel(const std::vector &samples); + + /** + * @brief Publishes the current voice activity state. + * @param active Whether voice activity is detected. + * @param energy_level The current audio energy level. + * @param timestamp The timestamp for the message. + */ + void publishVoiceActivity(bool active, float energy_level, + const rclcpp::Time ×tamp); + + /** + * @brief ROS publisher for voice activity detection results. + */ + rclcpp::Publisher::SharedPtr _vad_pub; + + /** + * @brief ROS subscriber for audio data. + */ + rclcpp::Subscription::SharedPtr _audio_sub; + + /** + * @brief Parameters for VAD. + */ + float _energy_threshold; // Threshold for energy-based detection + float _hold_time; // Time in seconds to hold detection state + int _min_samples; // Minimum number of samples for detection + + /** + * @brief State variables for VAD. + */ + bool _current_vad_state; // Current voice activity state + rclcpp::Time _last_voice_time; // Time when voice was last detected + + /** + * @brief Conversion map for audio formats to byte sizes. + */ + const std::unordered_map _format_byte_size = { + {"U8", 1}, {"S8", 1}, {"U16LE", 2}, {"U16BE", 2}, + {"S16LE", 2}, {"S16BE", 2}, {"S32LE", 4}, {"S32BE", 4}, + {"F32LE", 4}, {"F32BE", 4}, {"F32", 4} + }; +}; + +#endif /* VAD_NODE_HPP */ \ No newline at end of file From 65ae51a7ccd6d930eea7650691e0ff12a80b248b Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 13:17:05 +0200 Subject: [PATCH 02/10] Updated mkdocs.yml navigation --- docs/{usage => about}/voice_activity_detection.md | 0 mkdocs.yml | 1 + 2 files changed, 1 insertion(+) rename docs/{usage => about}/voice_activity_detection.md (100%) diff --git a/docs/usage/voice_activity_detection.md b/docs/about/voice_activity_detection.md similarity index 100% rename from docs/usage/voice_activity_detection.md rename to docs/about/voice_activity_detection.md diff --git a/mkdocs.yml b/mkdocs.yml index f9321c3..ae06784 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,6 +60,7 @@ nav: - usage/usage_with_native_linux.md - Implementation: - about/implementation.md + - about/voice_activity_detection.md - Contributing: - contributing/rules.md - contributing/license.md \ No newline at end of file From 8b98a8665d6903e34e939d6d1c3b9890fcde1399 Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 19:45:50 +0200 Subject: [PATCH 03/10] removed notes --- ROS2_Audio_Tools_project_knowledge.md | 812 -------------------------- 1 file changed, 812 deletions(-) delete mode 100644 ROS2_Audio_Tools_project_knowledge.md diff --git a/ROS2_Audio_Tools_project_knowledge.md b/ROS2_Audio_Tools_project_knowledge.md deleted file mode 100644 index 080d213..0000000 --- a/ROS2_Audio_Tools_project_knowledge.md +++ /dev/null @@ -1,812 +0,0 @@ -# EdgeVox Project Documentation - -# Project Structure -``` -ROS2_Audio_Tools/ -├── conda/ -│ ├── lin/ -│ │ ├── recipe/ -│ │ │ ├── activate.sh -│ │ │ ├── build_ament_cmake.sh -│ │ │ ├── build_ament_python.sh -│ │ │ ├── build_catkin.sh -│ │ │ ├── deactivate.sh -│ │ │ ├── recipe.yaml -│ │ ├── pixi.toml -│ ├── win/ -│ │ ├── recipe/ -│ │ │ ├── activate.bat -│ │ │ ├── bld_ament_cmake.bat -│ │ │ ├── bld_ament_python.bat -│ │ │ ├── bld_catkin.bat -│ │ │ ├── bld_catkin_merge.bat -│ │ │ ├── bld_colcon_merge.bat -│ │ │ ├── deactivate.bat -│ │ │ ├── recipe.yaml -│ │ │ ├── rosdistro_snapshot.yaml -│ │ ├── pixi.toml -├── docs/ -│ ├── about/ -│ │ ├── implementation.md -│ ├── assets/ -│ │ ├── stylesheets/ -│ │ │ ├── logo.css -│ │ ├── favicon.png -│ │ ├── logo.png -│ ├── contributing/ -│ │ ├── license.md -│ │ ├── rules.md -│ ├── usage/ -│ │ ├── overview.md -│ │ ├── ros2_pixi_build_linux_windows.md -│ │ ├── usage_with_native_linux.md -│ ├── index.md -├── msg/ -│ ├── AudioData.msg -│ ├── AudioDataStamped.msg -│ ├── AudioInfo.msg -├── src/ -│ ├── audio_capture_node.cpp -│ ├── audio_capture_node.hpp -│ ├── audio_playback_node.cpp -│ ├── audio_playback_node.hpp -├── CMakeLists.txt -├── LICENSE -├── README.md -├── colcon.pkg -├── mkdocs.yml -├── package.xml -├── pixi.toml -``` - -# src/audio_capture_node.cpp -```cpp -/** - * @file audio_capture_node.cpp - * @brief Audio Capture Node for ROS 2 using SDL2 - * - * Copyright 2025 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "audio_capture_node.hpp" -#include - -AudioCaptureNode::AudioCaptureNode() - : Node("audio_capture_node"), _desired_rate(-1.0) { - - // Declare and get parameters - this->declare_parameter("sample_format", "S16LE"); - this->get_parameter("sample_format", _sample_format); - - this->declare_parameter("channels", 1); - this->declare_parameter("sample_rate", 16000); - this->get_parameter("channels", _channels); - this->get_parameter("sample_rate", _sample_rate); - - if (_channels < 1 || _channels > 8) { - RCLCPP_ERROR_STREAM(this->get_logger(), - "Invalid channel count: " << _channels); - throw std::runtime_error("Unsupported media type."); - } - - this->declare_parameter("device", -1); - this->get_parameter("device", _device); - - this->declare_parameter("chunk_size", 1024); - int chunk_size; - this->get_parameter("chunk_size", chunk_size); - - if (chunk_size < 512 || chunk_size > 4096) { - RCLCPP_ERROR_STREAM(this->get_logger(), - "Invalid chunk size: " << chunk_size); - throw std::runtime_error("Unsupported chunk size!"); - } - - if (!ConfigureSDLStream(_device, _sample_format, chunk_size, _channels)) { - throw std::runtime_error("Failed to configure SDL audio capture."); - } - - // Publishers - _pub = this->create_publisher("audio", 10); - auto info_qos = rclcpp::QoS(rclcpp::KeepLast(1)).transient_local(); - _pub_info = this->create_publisher("audio_info", - info_qos); - - _pub_stamped = this->create_publisher( - "audio_stamped", 10); - - _timer_info = rclcpp::create_timer(this, get_clock(), std::chrono::seconds(5), - [this] { publishInfo(); }); - publishInfo(); -} - -void AudioCaptureNode::callback(uint8_t *stream, int len) { - audio_tools::msg::AudioData msg; - audio_tools::msg::AudioDataStamped stamped_msg; - - // Generate frame ID - stamped_msg.header.frame_id = std::to_string(_audio_pos++); - // Compute timestamp based on buffer size - size_t buffer_size_total_nsec = - (static_cast(len) * 1e9) / (_sample_rate * _nbytes); - - _nano_sec += buffer_size_total_nsec; - while (_nano_sec >= RCL_S_TO_NS(1)) { - _sec++; - _nano_sec -= RCL_S_TO_NS(1); - } - - stamped_msg.header.stamp.sec = _sec; - stamped_msg.header.stamp.nanosec = _nano_sec; - - // Copy audio data - msg.data.assign(stream, stream + len); - - stamped_msg.audio = msg; - stamped_msg.info = _info_msg; - // Publish messages - _pub->publish(msg); - _pub_stamped->publish(stamped_msg); -} - -void AudioCaptureNode::publishInfo() { - _info_msg.channels = _channels; - _info_msg.sample_rate = _sample_rate; - _info_msg.sample_format = _sample_format; - - _pub_info->publish(_info_msg); -} - -AudioCaptureNode::~AudioCaptureNode() { - SDL_PauseAudioDevice(_dev_id_in, 1); - SDL_CloseAudioDevice(_dev_id_in); -} - -bool AudioCaptureNode::ConfigureSDLStream(int device, const std::string &format, - int chunk_size, int channels) { - SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO); - - if (SDL_Init(SDL_INIT_AUDIO) < 0) { - RCLCPP_ERROR(this->get_logger(), "SDL initialization failed: %s", - SDL_GetError()); - return false; - } - - SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", - SDL_HINT_OVERRIDE); - - int nDevices = SDL_GetNumAudioDevices(SDL_TRUE); - RCLCPP_INFO(this->get_logger(), "%s: Found %d capture devices:\n", __func__, - nDevices); - - for (int i = 0; i < nDevices; i++) { - RCLCPP_INFO(this->get_logger(), " - Capture device #%d: '%s'\n", i, - SDL_GetAudioDeviceName(i, SDL_TRUE)); - } - - SDL_zero(_capture_spec_requested); - SDL_zero(_capture_spec_obtained); - - auto format_it = _format_map.find(format); - if (format_it == _format_map.end()) { - RCLCPP_ERROR(this->get_logger(), "Unknown audio format: %s", - format.c_str()); - return false; - } - _nbytes = format_it->second.second; - - _capture_spec_requested.freq = _sample_rate; - _capture_spec_requested.format = format_it->second.first; - _capture_spec_requested.channels = channels; - _capture_spec_requested.samples = chunk_size; - _capture_spec_requested.callback = [](void *userdata, uint8_t *stream, - int len) { - static_cast(userdata)->callback(stream, len); - }; - _capture_spec_requested.userdata = this; - - if (device >= 0 && device < nDevices) { - RCLCPP_INFO(this->get_logger(), - "%s: Attempting to open capture device #%d: '%s'...\n", - __func__, device, SDL_GetAudioDeviceName(device, SDL_TRUE)); - - _dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(device, SDL_TRUE), - SDL_TRUE, &_capture_spec_requested, - &_capture_spec_obtained, 0); - } else { - RCLCPP_INFO(this->get_logger(), - "%s: Attempting to open default capture device...\n", __func__); - _dev_id_in = - SDL_OpenAudioDevice(nullptr, SDL_TRUE, &_capture_spec_requested, - &_capture_spec_obtained, 0); - } - - if (!_dev_id_in) { - RCLCPP_INFO(this->get_logger(), "%s: Failed to open audio device: %s\n", - __func__, SDL_GetError()); - return false; - } - - RCLCPP_INFO(this->get_logger(), "%s: Capture device opened (SDL Id = %d):\n", - __func__, _dev_id_in); - RCLCPP_INFO(this->get_logger(), " - Sample rate: %d\n", - _capture_spec_obtained.freq); - RCLCPP_INFO(this->get_logger(), " - Format: %d (Requested: %d)\n", - _capture_spec_obtained.format, _capture_spec_requested.format); - RCLCPP_INFO(this->get_logger(), " - Channels: %d (Requested: %d)\n", - _capture_spec_obtained.channels, - _capture_spec_requested.channels); - RCLCPP_INFO(this->get_logger(), " - Samples per frame: %d\n", - _capture_spec_obtained.samples); - - _sample_rate = _capture_spec_obtained.freq; - SDL_PauseAudioDevice(_dev_id_in, 0); - return true; -} - -int main(int argc, char *argv[]) { - rclcpp::init(argc, argv); - rclcpp::spin(std::make_shared()); - rclcpp::shutdown(); - return 0; -} -``` - - -# src/audio_capture_node.hpp -```cpp -/** - * @file audio_capture_node.hpp - * @brief Audio Capture Node for ROS 2 using SDL2 - * - * Copyright 2025 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef AUDIO_CAPTURE_NODE_HPP -#define AUDIO_CAPTURE_NODE_HPP - -#include -#include -#include - -#include "audio_tools/msg/audio_data.hpp" -#include "audio_tools/msg/audio_data_stamped.hpp" -#include "audio_tools/msg/audio_info.hpp" - -/** - * @class AudioCaptureNode - * @brief A ROS 2 node for capturing and publishing audio data using SDL2. - */ -class AudioCaptureNode : public rclcpp::Node { -public: - /** - * @brief Constructor for the AudioCaptureNode. - * Initializes audio capture, publishers, and diagnostics. - */ - AudioCaptureNode(); - - /** - * @brief Exits the program with the given exit code. - * @param code The exit code. - */ - void exitOnMainThread(int code); - - /** - * @brief SDL2 audio callback function. - * Processes captured audio and publishes it as a ROS message. - * @param stream Pointer to the audio stream buffer. - * @param len Length of the audio buffer in bytes. - */ - void callback(uint8_t *stream, int len); - - /** - * @brief Publishes audio information (sample rate, channels, format). - */ - void publishInfo(); - - /** - * @brief Destructor for AudioCaptureNode. - * Cleans up SDL audio resources. - */ - ~AudioCaptureNode(); - -private: - /** - * @brief ROS publisher for raw audio data. - */ - rclcpp::Publisher::SharedPtr _pub; - - /** - * @brief ROS publisher for audio metadata (e.g., sample rate, format). - */ - rclcpp::Publisher::SharedPtr _pub_info; - - /** - * @brief Timer for periodically publishing audio metadata. - */ - rclcpp::TimerBase::SharedPtr _timer_info; - - /** - * @brief Audio information metadata - */ - audio_tools::msg::AudioInfo _info_msg; - - /** - * @brief SDL audio specifications for requested and obtained formats. - */ - SDL_AudioSpec _capture_spec_requested; - SDL_AudioSpec _capture_spec_obtained; - - /** - * @brief Audio parameters: bitrate, channels, depth, sample rate, and device - * index. - */ - int _bitrate, _channels, _sample_rate, _device; - - /** - * @brief Number of bytes processed. - */ - size_t _nbytes; - - /** - * @brief Position counter for audio frames. - */ - size_t _audio_pos = 0; - - /** - * @brief Nanosecond and second timestamps for synchronization. - */ - size_t _nano_sec = 0; - size_t _sec = 0; - - /** - * @brief Desired publishing rate for diagnostics. - */ - double _desired_rate; - - /** - * @brief Audio sample format (e.g., "S16LE", "F32"). - */ - std::string _sample_format; - - /** - * @brief SDL audio device ID. - */ - SDL_AudioDeviceID _dev_id_in = 0; - - /** - * @brief ROS publisher for timestamped audio data with diagnostics. - */ - rclcpp::Publisher::SharedPtr _pub_stamped; - - /** - * @brief Configures the SDL2 audio capture stream. - * @param device The SDL2 device index (-1 for default). - * @param format The requested audio format (e.g., "S16LE"). - * @param chunk_size The buffer size for each audio chunk (e.g., 1024 - * samples). - * @param channels The number of audio channels (e.g., 1 for mono, 2 for - * stereo). - * @return True if configuration is successful, false otherwise. - */ - bool ConfigureSDLStream(int device, const std::string &format, int chunk_size, - int channels); - - /** - * @brief Maps audio format strings to SDL2 audio formats and corresponding - * byte sizes. - */ - const std::unordered_map> - _format_map = { - {"U8", {AUDIO_U8, 1}}, ///< Unsigned 8-bit (1 byte) - {"S8", {AUDIO_S8, 1}}, ///< Signed 8-bit (1 byte) - {"U16LE", - {AUDIO_U16LSB, 2}}, ///< Unsigned 16-bit little-endian (2 bytes) - {"U16BE", - {AUDIO_U16MSB, 2}}, ///< Unsigned 16-bit big-endian (2 bytes) - {"S16LE", - {AUDIO_S16LSB, 2}}, ///< Signed 16-bit little-endian (2 bytes) - {"S16BE", {AUDIO_S16MSB, 2}}, ///< Signed 16-bit big-endian (2 bytes) - {"S32LE", - {AUDIO_S32LSB, 4}}, ///< Signed 32-bit little-endian (4 bytes) - {"S32BE", {AUDIO_S32MSB, 4}}, ///< Signed 32-bit big-endian (4 bytes) - {"F32LE", - {AUDIO_F32LSB, 4}}, ///< 32-bit float little-endian (4 bytes) - {"F32BE", {AUDIO_F32MSB, 4}}, ///< 32-bit float big-endian (4 bytes) - {"F32", {AUDIO_F32, 4}} ///< 32-bit float (generic) (4 bytes) - }; -}; - -#endif /* AUDIO_CAPTURE_NODE_HPP */ - -``` - - -# src/audio_playback_node.cpp -```cpp -/** - * @file audio_playback_node.cpp - * @brief Audio Capture Node for ROS 2 using SDL2 - * - * Copyright 2025 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "audio_playback_node.hpp" -#include - -AudioPlaybackNode::AudioPlaybackNode() - : Node("audio_playback_node"), _audio_device(0), _initialized(false) { - // Declare and get topic name parameter - std::string topic_name = - this->declare_parameter("audio_topic", "/audio_playback"); - int device_index = - this->declare_parameter("device_index", -1); // -1 = default - - _subscription = this->create_subscription( - topic_name, 10, - std::bind(&AudioPlaybackNode::audio_cb, this, std::placeholders::_1)); - - if (SDL_Init(SDL_INIT_AUDIO) < 0) { - RCLCPP_FATAL(get_logger(), "SDL_Init failed: %s", SDL_GetError()); - rclcpp::shutdown(); - } - int num_devices = SDL_GetNumAudioDevices(0); - RCLCPP_INFO(get_logger(), "Available audio output devices:"); - for (int i = 0; i < num_devices; ++i) { - const char *name = SDL_GetAudioDeviceName(i, 0); - RCLCPP_INFO(get_logger(), " [%d]: %s%s", i, name, - (i == device_index ? " (selected)" : "")); - } - - if (device_index >= 0 && device_index < num_devices) { - _device_name = SDL_GetAudioDeviceName(device_index, 0); - RCLCPP_INFO(get_logger(), "Using audio device: %s", _device_name.c_str()); - } else if (device_index == -1) { - _device_name = ""; // SDL default - RCLCPP_INFO(get_logger(), "Using default audio device"); - } else { - RCLCPP_FATAL(get_logger(), "Invalid device_index: %d (available: 0 to %d)", - device_index, num_devices - 1); - rclcpp::shutdown(); - } - - RCLCPP_INFO(get_logger(), "Subscribed to topic: %s", topic_name.c_str()); -} - -AudioPlaybackNode::~AudioPlaybackNode() { - if (_audio_device != 0) { - SDL_CloseAudioDevice(_audio_device); - } - SDL_Quit(); -} - -bool AudioPlaybackNode::init_audio(const audio_tools::msg::AudioInfo &info) { - SDL_AudioSpec desired_spec; - SDL_zero(desired_spec); - - desired_spec.freq = static_cast(info.sample_rate); - desired_spec.channels = info.channels; - desired_spec.samples = 4096; - desired_spec.callback = nullptr; - - if (info.sample_format == "S16LE") { - desired_spec.format = AUDIO_S16LSB; - } else if (info.sample_format == "S16BE") { - desired_spec.format = AUDIO_S16MSB; - } else if (info.sample_format == "U8") { - desired_spec.format = AUDIO_U8; - } else if (info.sample_format == "S8") { - desired_spec.format = AUDIO_S8; - } else if (info.sample_format == "F32LE") { - desired_spec.format = AUDIO_F32LSB; - } else if (info.sample_format == "F32BE") { - desired_spec.format = AUDIO_F32MSB; - } else { - RCLCPP_ERROR(get_logger(), "Unsupported format: %s", - info.sample_format.c_str()); - return false; - } - - _audio_device = - SDL_OpenAudioDevice(_device_name.empty() ? nullptr : _device_name.c_str(), - 0, // 0 = output - &desired_spec, &_obtained_spec, 0); - if (_audio_device == 0) { - RCLCPP_ERROR(get_logger(), "SDL_OpenAudioDevice error: %s", SDL_GetError()); - return false; - } - - SDL_PauseAudioDevice(_audio_device, 0); - RCLCPP_INFO(get_logger(), "Audio device initialized: %d Hz, %d channels", - _obtained_spec.freq, _obtained_spec.channels); - - _initialized = true; - return true; -} - -void AudioPlaybackNode::audio_cb( - const audio_tools::msg::AudioDataStamped::SharedPtr msg) { - const auto &info = msg->info; - const auto &audio = msg->audio; - - if (!_initialized) { - if (!init_audio(info)) { - RCLCPP_ERROR(get_logger(), "Failed to initialize audio output."); - return; - } - } - - if (audio.data.empty()) - return; - - if (SDL_QueueAudio(_audio_device, audio.data.data(), audio.data.size()) < 0) { - RCLCPP_ERROR(get_logger(), "SDL_QueueAudio error: %s", SDL_GetError()); - } -} - -int main(int argc, char *argv[]) { - rclcpp::init(argc, argv); - rclcpp::spin(std::make_shared()); - rclcpp::shutdown(); - return 0; -} -``` - - -# src/audio_playback_node.hpp -```cpp -/** - * @file audio_playback_node.hpp - * @brief Audio Playback Node for ROS 2 using SDL2 - * - * Copyright 2025 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef AUDIO_PLAYBACK_NODE_HPP -#define AUDIO_PLAYBACK_NODE_HPP -#include -#include -#include - -#include "audio_tools/msg/audio_data.hpp" -#include "audio_tools/msg/audio_data_stamped.hpp" -#include "audio_tools/msg/audio_info.hpp" - -/** - * @class AudioPlaybackNode - * @brief A ROS 2 node for capturing and publishing audio data using SDL2. - */ -class AudioPlaybackNode : public rclcpp::Node { -public: - /** - * @brief Constructor for the AudioPlaybackNode. - * Initializes audio capture, publishers, and diagnostics. - */ - AudioPlaybackNode(); - - /** - * @brief Destructor for AudioPlaybackNode. - * Cleans up SDL audio resources. - */ - ~AudioPlaybackNode(); - -private: - /** - * @brief Callback function for received audio_msg on subscribed topic - * @param msg The audio message which got received - */ - void audio_cb(const audio_tools::msg::AudioDataStamped::SharedPtr msg); - /** - * @brief Function which automatically configures the audio_interface based on - * the received audio information - * @param info Custom message which contains the channels, sample_rate and - * encoding format - * @return false if failed e.g. invalid format, true if ok - */ - bool init_audio(const audio_tools::msg::AudioInfo &info); - - /** - * @brief Subscription object/handle for the subscribed audio topic - */ - rclcpp::Subscription::SharedPtr - _subscription; - - /** - * @brief SDL audio device handle. - */ - SDL_AudioDeviceID _audio_device; - - /** - * @brief SDL audio specifications for obtained format. - */ - SDL_AudioSpec _obtained_spec; - - /** - * @brief Full name of audio_device - */ - std::string _device_name; - - /** - * @brief Internal state to prevent initializing audio interface twice - */ - bool _initialized; -}; -#endif /* AUDIO_PLAYBACK_NODE */ -``` - - -# CMakeLists.txt -```cmake -cmake_minimum_required(VERSION 3.1...3.14) -project(audio_tools C CXX) - -find_package(ament_cmake REQUIRED) -find_package(rclcpp REQUIRED) -find_package(builtin_interfaces REQUIRED) -find_package(rosidl_default_generators REQUIRED) -find_package(std_msgs REQUIRED) -find_package(SDL2 REQUIRED) - -rosidl_generate_interfaces(${PROJECT_NAME} - msg/AudioData.msg - msg/AudioDataStamped.msg - msg/AudioInfo.msg - DEPENDENCIES std_msgs -) - -ament_export_dependencies(rosidl_default_runtime) -rosidl_get_typesupport_target(cpp_typesupport_target ${PROJECT_NAME} "rosidl_typesupport_cpp") - -add_executable(audio_capture_node src/audio_capture_node.cpp) - -ament_target_dependencies(audio_capture_node - rclcpp - std_msgs -) - -target_link_libraries(audio_capture_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) -target_include_directories(audio_capture_node PUBLIC ${SDL2_INCLUDE_DIRS}) - -if(UNIX AND NOT APPLE) # Linux only - target_compile_options(audio_capture_node PRIVATE -Wall) -endif() - -install(TARGETS -audio_capture_node -DESTINATION lib/${PROJECT_NAME}) - - -add_executable(audio_playback_node src/audio_playback_node.cpp) -target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) -target_include_directories(audio_playback_node PUBLIC ${SDL2_INCLUDE_DIRS}) - -if(UNIX AND NOT APPLE) - target_compile_options(audio_playback_node PRIVATE -Wall) -endif() - -ament_target_dependencies(audio_playback_node - rclcpp - std_msgs -) - - -install(TARGETS -audio_playback_node -DESTINATION lib/${PROJECT_NAME}) -ament_package() -``` - - -# mkdocs.yml -```text -site_name: ROS2_Audio_Tools -extra_css: - - assets/stylesheets/logo.css -theme: - name: material - # custom_dir: docs/overrides - features: - - announce.dismiss - - content.action.edit - - content.action.view - - content.code.annotate - - content.code.copy - # - content.code.select - # - content.footnote.tooltips - # - content.tabs.link - - content.tooltips - # - header.autohide - # - navigation.expand - - navigation.footer - - navigation.indexes - # - navigation.instant - # - navigation.instant.prefetch - # - navigation.instant.progress - # - navigation.prune - - navigation.sections - - navigation.tabs - # - navigation.tabs.sticky - - navigation.top - - navigation.tracking - - search.highlight - - search.share - - search.suggest - - toc.follow - # - toc.integrate - palette: - scheme: default - primary: green - toggle: - icon: material/toggle-switch - name: Switch to dark mode - font: - text: Avenir Next Condensed - code: Avenir Next Condensed - favicon: assets/favicon.png - logo: assets/logo.png - -markdown_extensions: - - attr_list - - md_in_html - - admonition - - pymdownx.details - - pymdownx.superfences - -nav: - - index.md - - Getting Started: - - usage/overview.md - - Build Environment: - - usage/ros2_pixi_build_linux_windows.md - - usage/usage_with_native_linux.md - - Implementation: - - about/implementation.md - - Contributing: - - contributing/rules.md - - contributing/license.md -``` - From 058025c58750f78f72100661447610e3faa737cc Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 19:50:21 +0200 Subject: [PATCH 04/10] removed duplicate docs --- docs/about/voice_activity_detection.md | 123 ------------------------ 1 file changed, 123 deletions(-) delete mode 100644 docs/about/voice_activity_detection.md diff --git a/docs/about/voice_activity_detection.md b/docs/about/voice_activity_detection.md deleted file mode 100644 index c9a7547..0000000 --- a/docs/about/voice_activity_detection.md +++ /dev/null @@ -1,123 +0,0 @@ -# Voice Activity Detection - -The Voice Activity Detection (VAD) node provides real-time detection of voice or speech in audio streams. The node uses a simple energy-based algorithm to determine if voice is present in the audio signal. - -## How it works - -The VAD node subscribes to audio data and performs the following steps: - -1. Converts the incoming audio to floating-point samples -2. Calculates the energy level of the audio signal -3. Compares this energy to a threshold -4. Applies a hold time to prevent rapid on/off switching -5. Publishes the detection state - -## Node Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `energy_threshold` | float | 0.01 | Energy threshold for detection | -| `hold_time` | float | 0.5 | Time in seconds to maintain detection after voice stops | -| `min_samples` | int | 160 | Minimum number of samples required for detection | - -## Topics - -### Subscribed Topics - -- `audio_stamped` ([audio_tools/AudioDataStamped](../msg/AudioDataStamped.md)): Audio data with timestamp and format information - -### Published Topics - -- `voice_activity` ([audio_tools/VoiceActivity](../msg/VoiceActivity.md)): Voice activity detection results - -## Message Format - -The `VoiceActivity` message includes: - -``` -std_msgs/Header header # Timestamp and frame information -bool active # True when voice activity is detected -float32 energy_level # Current audio energy level -float32 threshold # Current threshold being used -float32 hold_time # Hold time in seconds for the detection -``` - -## Usage Example - -To run the VAD node separately: - -```bash -ros2 run audio_tools vad_node --ros-args -p energy_threshold:=0.015 -p hold_time:=0.3 -``` - -To run with the audio capture node: - -```bash -ros2 launch audio_tools audio_vad.launch.py -``` - -## Tuning the VAD - -The energy threshold is the most important parameter to tune: - -- **Too low**: Will trigger on background noise -- **Too high**: May miss quiet speech - -### Step-by-Step Tuning Process - -1. **Monitor the energy level of your audio**: - ```bash - ros2 topic echo /voice_activity - ``` - -2. **Determine your noise floor**: - Remain silent and observe the `energy_level` values. Note the maximum level during silence. - -3. **Determine speech energy level**: - Speak at a normal volume and note the typical `energy_level` values. - -4. **Set the threshold**: - Set the threshold slightly above your noise floor: - ```bash - ros2 param set /vad energy_threshold 0.015 # Adjust based on your observations - ``` - -5. **Adjust hold time**: - If detection is fragmented during normal speech, increase the hold time: - ```bash - ros2 param set /vad hold_time 0.7 # Adjust as needed - ``` - -### Visual Tuning - -For visual tuning, use RQT to plot the energy levels: - -```bash -rqt_plot /voice_activity/energy_level /voice_activity/threshold /voice_activity/active -``` - -This will display: -- Your audio energy over time -- The threshold line -- The detection state (0 or 1) - -### Environment-Specific Settings - -Here are suggested starting points for different environments: - -| Environment | Suggested energy_threshold | Suggested hold_time | -|-------------|----------------------------|---------------------| -| Quiet room, close mic | 0.005-0.01 | 0.3-0.5s | -| Office, close mic | 0.01-0.02 | 0.5-0.7s | -| Noisy environment | 0.02-0.05 | 0.7-1.0s | -| Far-field mic | 0.01-0.03 | 0.7-1.0s | - -## Limitations - -This implementation uses a simple energy-based detector which works well in low-noise environments but may not perform optimally in: - -- Noisy environments -- Situations with varying background noise -- When capturing audio from distant microphones - -For more advanced detection in challenging environments, consider integrating a machine learning-based VAD algorithm. \ No newline at end of file From ee109a6df80d344c3609109e7f0f11ff52a7e45b Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 19:52:04 +0200 Subject: [PATCH 05/10] removed double install --- CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5be7790..69c1197 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,10 +34,6 @@ if(UNIX AND NOT APPLE) # Linux only target_compile_options(audio_capture_node PRIVATE -Wall) endif() -install(TARGETS -audio_capture_node -DESTINATION lib/${PROJECT_NAME}) - # Audio Playback Node add_executable(audio_playback_node src/audio_playback_node.cpp) target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) @@ -69,7 +65,7 @@ ament_target_dependencies(vad_node install(TARGETS audio_capture_node audio_playback_node - vad_node # Added VAD node + vad_node DESTINATION lib/${PROJECT_NAME} ) From e9e53b4df4f22e76ac1e1a8c25272843d041e803 Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 19:53:27 +0200 Subject: [PATCH 06/10] removed vad md file --- mkdocs.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index ae06784..f9321c3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,7 +60,6 @@ nav: - usage/usage_with_native_linux.md - Implementation: - about/implementation.md - - about/voice_activity_detection.md - Contributing: - contributing/rules.md - contributing/license.md \ No newline at end of file From 803e58d355c582763d1577c605ac329a50a405ba Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 19:56:09 +0200 Subject: [PATCH 07/10] linted --- src/vad_node.cpp | 97 +++++++++++++++++++++++++----------------------- src/vad_node.hpp | 26 ++++++------- 2 files changed, 63 insertions(+), 60 deletions(-) diff --git a/src/vad_node.cpp b/src/vad_node.cpp index f4b614a..96457b6 100644 --- a/src/vad_node.cpp +++ b/src/vad_node.cpp @@ -17,22 +17,23 @@ * limitations under the License. */ #include "vad_node.hpp" -#include #include #include +#include VADNode::VADNode() : Node("vad_node"), _current_vad_state(false) { // Declare and get parameters this->declare_parameter("energy_threshold", 0.01); this->declare_parameter("hold_time", 0.5); this->declare_parameter("min_samples", 160); - + this->get_parameter("energy_threshold", _energy_threshold); this->get_parameter("hold_time", _hold_time); this->get_parameter("min_samples", _min_samples); RCLCPP_INFO(this->get_logger(), "VAD initialized with parameters:"); - RCLCPP_INFO(this->get_logger(), " - Energy threshold: %.4f", _energy_threshold); + RCLCPP_INFO(this->get_logger(), " - Energy threshold: %.4f", + _energy_threshold); RCLCPP_INFO(this->get_logger(), " - Hold time: %.2f seconds", _hold_time); RCLCPP_INFO(this->get_logger(), " - Min samples: %d", _min_samples); @@ -40,20 +41,22 @@ VADNode::VADNode() : Node("vad_node"), _current_vad_state(false) { _last_voice_time = this->now(); // Create publisher for voice activity detection - _vad_pub = this->create_publisher("voice_activity", 10); - + _vad_pub = this->create_publisher( + "voice_activity", 10); + // Subscribe to audio data stamped topic _audio_sub = this->create_subscription( "audio_stamped", 10, std::bind(&VADNode::audioCallback, this, std::placeholders::_1)); - + RCLCPP_INFO(this->get_logger(), "VAD node is ready"); } -void VADNode::audioCallback(const audio_tools::msg::AudioDataStamped::SharedPtr msg) { +void VADNode::audioCallback( + const audio_tools::msg::AudioDataStamped::SharedPtr msg) { const auto &audio_data = msg->audio.data; const auto &audio_info = msg->info; - + // Skip if we have no data if (audio_data.empty()) { return; @@ -62,22 +65,22 @@ void VADNode::audioCallback(const audio_tools::msg::AudioDataStamped::SharedPtr // Convert audio data to float samples for processing std::vector samples; if (!convertToFloatSamples(audio_data, audio_info.sample_format, samples)) { - RCLCPP_WARN(this->get_logger(), "Failed to convert audio format %s", + RCLCPP_WARN(this->get_logger(), "Failed to convert audio format %s", audio_info.sample_format.c_str()); return; } - + // Calculate energy level float energy_level = calculateEnergyLevel(samples); - + // Detect voice activity - bool voice_active = (energy_level > _energy_threshold) && + bool voice_active = (energy_level > _energy_threshold) && (samples.size() >= static_cast(_min_samples)); - + // Handle hold time logic if (voice_active) { _last_voice_time = this->now(); - + if (!_current_vad_state) { _current_vad_state = true; RCLCPP_DEBUG(this->get_logger(), "Voice activity started"); @@ -90,110 +93,110 @@ void VADNode::audioCallback(const audio_tools::msg::AudioDataStamped::SharedPtr RCLCPP_DEBUG(this->get_logger(), "Voice activity ended"); } } - + // Publish the current VAD state publishVoiceActivity(_current_vad_state, energy_level, msg->header.stamp); } -bool VADNode::detectVoiceActivity(const std::vector &audio_data, - const std::string &sample_format) { +bool VADNode::detectVoiceActivity(const std::vector &audio_data, + const std::string &sample_format) { // Convert to float samples std::vector samples; if (!convertToFloatSamples(audio_data, sample_format, samples)) { return false; } - + // Check if we have enough samples if (samples.size() < static_cast(_min_samples)) { return false; } - + // Calculate energy level float energy_level = calculateEnergyLevel(samples); - + // Compare with threshold return energy_level > _energy_threshold; } float VADNode::calculateEnergyLevel(const std::vector &samples) { float energy = 0.0f; - + for (const auto &sample : samples) { energy += sample * sample; } - + // Normalize by number of samples if (!samples.empty()) { energy /= samples.size(); } - + return energy; } bool VADNode::convertToFloatSamples(const std::vector &audio_data, - const std::string &sample_format, - std::vector &out_samples) { + const std::string &sample_format, + std::vector &out_samples) { // Find the byte size for this format auto it = _format_byte_size.find(sample_format); if (it == _format_byte_size.end()) { - RCLCPP_ERROR(this->get_logger(), "Unknown audio format: %s", sample_format.c_str()); + RCLCPP_ERROR(this->get_logger(), "Unknown audio format: %s", + sample_format.c_str()); return false; } - + int bytes_per_sample = it->second; size_t num_samples = audio_data.size() / bytes_per_sample; out_samples.resize(num_samples); - + // Convert based on format if (sample_format == "F32" || sample_format == "F32LE") { // Direct copy for float format (assuming little endian) std::memcpy(out_samples.data(), audio_data.data(), audio_data.size()); - } - else if (sample_format == "S16LE") { + } else if (sample_format == "S16LE") { // Convert signed 16-bit LE to float - const int16_t *int_samples = reinterpret_cast(audio_data.data()); + const int16_t *int_samples = + reinterpret_cast(audio_data.data()); for (size_t i = 0; i < num_samples; i++) { out_samples[i] = int_samples[i] / 32768.0f; } - } - else if (sample_format == "S8") { + } else if (sample_format == "S8") { // Convert signed 8-bit to float - const int8_t *int_samples = reinterpret_cast(audio_data.data()); + const int8_t *int_samples = + reinterpret_cast(audio_data.data()); for (size_t i = 0; i < num_samples; i++) { out_samples[i] = int_samples[i] / 128.0f; } - } - else if (sample_format == "U8") { + } else if (sample_format == "U8") { // Convert unsigned 8-bit to float for (size_t i = 0; i < num_samples; i++) { out_samples[i] = (audio_data[i] - 128) / 128.0f; } - } - else { - RCLCPP_WARN(this->get_logger(), - "Format conversion not implemented for %s, using raw values", + } else { + RCLCPP_WARN(this->get_logger(), + "Format conversion not implemented for %s, using raw values", sample_format.c_str()); - + // Just copy raw bytes as a fallback for (size_t i = 0; i < num_samples; i++) { - out_samples[i] = static_cast(audio_data[i * bytes_per_sample]) / 255.0f; + out_samples[i] = + static_cast(audio_data[i * bytes_per_sample]) / 255.0f; } } - + return true; } -void VADNode::publishVoiceActivity(bool active, float energy_level, - const rclcpp::Time ×tamp) { +void VADNode::publishVoiceActivity(bool active, float energy_level, + const rclcpp::Time ×tamp) { auto msg = std::make_unique(); - + msg->header.stamp = timestamp; msg->header.frame_id = "audio_frame"; msg->active = active; msg->energy_level = energy_level; msg->threshold = _energy_threshold; msg->hold_time = _hold_time; - + _vad_pub->publish(std::move(msg)); } diff --git a/src/vad_node.hpp b/src/vad_node.hpp index 16c5011..0458ddf 100644 --- a/src/vad_node.hpp +++ b/src/vad_node.hpp @@ -56,9 +56,9 @@ class VADNode : public rclcpp::Node { * @param sample_format The format of the audio samples. * @return True if voice activity is detected, false otherwise. */ - bool detectVoiceActivity(const std::vector &audio_data, + bool detectVoiceActivity(const std::vector &audio_data, const std::string &sample_format); - + /** * @brief Converts audio data to float samples for processing. * @param audio_data The raw audio data. @@ -83,7 +83,7 @@ class VADNode : public rclcpp::Node { * @param energy_level The current audio energy level. * @param timestamp The timestamp for the message. */ - void publishVoiceActivity(bool active, float energy_level, + void publishVoiceActivity(bool active, float energy_level, const rclcpp::Time ×tamp); /** @@ -94,29 +94,29 @@ class VADNode : public rclcpp::Node { /** * @brief ROS subscriber for audio data. */ - rclcpp::Subscription::SharedPtr _audio_sub; + rclcpp::Subscription::SharedPtr + _audio_sub; /** * @brief Parameters for VAD. */ - float _energy_threshold; // Threshold for energy-based detection - float _hold_time; // Time in seconds to hold detection state - int _min_samples; // Minimum number of samples for detection + float _energy_threshold; // Threshold for energy-based detection + float _hold_time; // Time in seconds to hold detection state + int _min_samples; // Minimum number of samples for detection /** * @brief State variables for VAD. */ - bool _current_vad_state; // Current voice activity state - rclcpp::Time _last_voice_time; // Time when voice was last detected - + bool _current_vad_state; // Current voice activity state + rclcpp::Time _last_voice_time; // Time when voice was last detected + /** * @brief Conversion map for audio formats to byte sizes. */ const std::unordered_map _format_byte_size = { - {"U8", 1}, {"S8", 1}, {"U16LE", 2}, {"U16BE", 2}, + {"U8", 1}, {"S8", 1}, {"U16LE", 2}, {"U16BE", 2}, {"S16LE", 2}, {"S16BE", 2}, {"S32LE", 4}, {"S32BE", 4}, - {"F32LE", 4}, {"F32BE", 4}, {"F32", 4} - }; + {"F32LE", 4}, {"F32BE", 4}, {"F32", 4}}; }; #endif /* VAD_NODE_HPP */ \ No newline at end of file From 2441d8f8055b361e506c938ed4f5e5f20ae23e1d Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 20:09:51 +0200 Subject: [PATCH 08/10] Now output topic names are retrieved from parameters. --- src/vad_node.cpp | 26 +++++++++++++++----------- src/vad_node.hpp | 14 ++++++++++++-- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/vad_node.cpp b/src/vad_node.cpp index 96457b6..81ab573 100644 --- a/src/vad_node.cpp +++ b/src/vad_node.cpp @@ -26,27 +26,36 @@ VADNode::VADNode() : Node("vad_node"), _current_vad_state(false) { this->declare_parameter("energy_threshold", 0.01); this->declare_parameter("hold_time", 0.5); this->declare_parameter("min_samples", 160); + this->declare_parameter("voice_activity_topic", + "voice_activity"); + this->declare_parameter("audio_data_topic", "audio_stamped"); this->get_parameter("energy_threshold", _energy_threshold); this->get_parameter("hold_time", _hold_time); this->get_parameter("min_samples", _min_samples); + this->get_parameter("voice_activity_topic", _voice_activity_topic); + this->get_parameter("audio_data_topic", _audio_data_topic); RCLCPP_INFO(this->get_logger(), "VAD initialized with parameters:"); RCLCPP_INFO(this->get_logger(), " - Energy threshold: %.4f", _energy_threshold); RCLCPP_INFO(this->get_logger(), " - Hold time: %.2f seconds", _hold_time); RCLCPP_INFO(this->get_logger(), " - Min samples: %d", _min_samples); + RCLCPP_INFO(this->get_logger(), " - Voice activity topic: %s", + _voice_activity_topic.c_str()); + RCLCPP_INFO(this->get_logger(), " - Audio data topic: %s", + _audio_data_topic.c_str()); // Initialize the last voice time to current time _last_voice_time = this->now(); // Create publisher for voice activity detection _vad_pub = this->create_publisher( - "voice_activity", 10); + _voice_activity_topic, 10); // Subscribe to audio data stamped topic _audio_sub = this->create_subscription( - "audio_stamped", 10, + _audio_data_topic, 10, std::bind(&VADNode::audioCallback, this, std::placeholders::_1)); RCLCPP_INFO(this->get_logger(), "VAD node is ready"); @@ -172,15 +181,10 @@ bool VADNode::convertToFloatSamples(const std::vector &audio_data, out_samples[i] = (audio_data[i] - 128) / 128.0f; } } else { - RCLCPP_WARN(this->get_logger(), - "Format conversion not implemented for %s, using raw values", - sample_format.c_str()); - - // Just copy raw bytes as a fallback - for (size_t i = 0; i < num_samples; i++) { - out_samples[i] = - static_cast(audio_data[i * bytes_per_sample]) / 255.0f; - } + RCLCPP_ERROR(this->get_logger(), "Format conversion not implemented for %s", + sample_format.c_str()); + throw std::runtime_error("Format conversion not implemented for " + + sample_format); } return true; diff --git a/src/vad_node.hpp b/src/vad_node.hpp index 0458ddf..00da1b9 100644 --- a/src/vad_node.hpp +++ b/src/vad_node.hpp @@ -8,10 +8,10 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License is an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. @@ -117,6 +117,16 @@ class VADNode : public rclcpp::Node { {"U8", 1}, {"S8", 1}, {"U16LE", 2}, {"U16BE", 2}, {"S16LE", 2}, {"S16BE", 2}, {"S32LE", 4}, {"S32BE", 4}, {"F32LE", 4}, {"F32BE", 4}, {"F32", 4}}; + + /** + * @brief Parameter for the voice activity topic name. + */ + std::string _voice_activity_topic; + + /** + * @brief Parameter for the audio data topic name. + */ + std::string _audio_data_topic; }; #endif /* VAD_NODE_HPP */ \ No newline at end of file From a7c77ac7240638af9cfde982c2eb29c239aee6b0 Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 20:15:45 +0200 Subject: [PATCH 09/10] VAD node output parameter definitions set --- launch/audio_vad.launch.py | 62 +++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/launch/audio_vad.launch.py b/launch/audio_vad.launch.py index 3299074..c56b88f 100644 --- a/launch/audio_vad.launch.py +++ b/launch/audio_vad.launch.py @@ -1,31 +1,39 @@ from launch import LaunchDescription from launch_ros.actions import Node + def generate_launch_description(): - return LaunchDescription([ - Node( - package='audio_tools', - executable='audio_capture_node', - name='audio_capture', - parameters=[{ - 'sample_format': 'S16LE', - 'channels': 1, - 'sample_rate': 16000, - 'device': -1, # Use default device - 'chunk_size': 1024, - }], - output='screen', - ), - - Node( - package='audio_tools', - executable='vad_node', - name='vad', - parameters=[{ - 'energy_threshold': 0.01, - 'hold_time': 0.5, - 'min_samples': 160, - }], # Office, close mic - output='screen', - ), - ]) \ No newline at end of file + return LaunchDescription( + [ + Node( + package="audio_tools", + executable="audio_capture_node", + name="audio_capture", + parameters=[ + { + "sample_format": "S16LE", + "channels": 1, + "sample_rate": 16000, + "device": -1, # Use default device + "chunk_size": 1024, + } + ], + output="screen", + ), + Node( + package="audio_tools", + executable="vad_node", + name="vad", + parameters=[ + { + "energy_threshold": 0.01, + "hold_time": 0.5, + "min_samples": 160, + "voice_activity_topic": "voice_activity", + "audio_data_topic": "audio_stamped", + } + ], # Office, close mic + output="screen", + ), + ] + ) From 41f304f423711d0b17ec1c51cb29c72115fb123c Mon Sep 17 00:00:00 2001 From: Jeroen Veen Date: Sun, 13 Apr 2025 20:22:49 +0200 Subject: [PATCH 10/10] fixed typo --- docs/usage/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage/overview.md b/docs/usage/overview.md index 567b3d1..9d8a144 100644 --- a/docs/usage/overview.md +++ b/docs/usage/overview.md @@ -64,7 +64,7 @@ Capture microphone input and publish to `/audio_stamped`. Simple energy-based Vo ```bash # Terminal 1 -ros2 run audio_tools audio_vad.launch.py +ros2 launch audio_tools audio_vad.launch.py # Terminal 2 ros2 topic echo /voice_activity