diff --git a/CMakeLists.txt b/CMakeLists.txt index 410e587..69c1197 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,12 +12,14 @@ rosidl_generate_interfaces(${PROJECT_NAME} msg/AudioData.msg msg/AudioDataStamped.msg msg/AudioInfo.msg + msg/VoiceActivity.msg DEPENDENCIES std_msgs ) ament_export_dependencies(rosidl_default_runtime) rosidl_get_typesupport_target(cpp_typesupport_target ${PROJECT_NAME} "rosidl_typesupport_cpp") +# Audio Capture Node add_executable(audio_capture_node src/audio_capture_node.cpp) ament_target_dependencies(audio_capture_node @@ -32,11 +34,7 @@ if(UNIX AND NOT APPLE) # Linux only target_compile_options(audio_capture_node PRIVATE -Wall) endif() -install(TARGETS -audio_capture_node -DESTINATION lib/${PROJECT_NAME}) - - +# Audio Playback Node add_executable(audio_playback_node src/audio_playback_node.cpp) target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) target_include_directories(audio_playback_node PUBLIC ${SDL2_INCLUDE_DIRS}) @@ -50,8 +48,31 @@ ament_target_dependencies(audio_playback_node std_msgs ) +# Voice Activity Detection Node +add_executable(vad_node src/vad_node.cpp) +target_link_libraries(vad_node "${cpp_typesupport_target}") + +if(UNIX AND NOT APPLE) + target_compile_options(vad_node PRIVATE -Wall) +endif() + +ament_target_dependencies(vad_node + rclcpp + std_msgs +) +# Install targets install(TARGETS -audio_playback_node -DESTINATION lib/${PROJECT_NAME}) + audio_capture_node + audio_playback_node + vad_node + DESTINATION lib/${PROJECT_NAME} +) + +# Install launch files +install(DIRECTORY + launch/ + DESTINATION share/${PROJECT_NAME}/launch +) + ament_package() \ No newline at end of file diff --git a/docs/about/implementation.md b/docs/about/implementation.md index 292fabc..61a96a2 100644 --- a/docs/about/implementation.md +++ b/docs/about/implementation.md @@ -140,3 +140,58 @@ ros2 run audio_tools audio_playback_node \ - `SDL2` - Custom messages in `audio_tools` + +## `vad_node` + 
+**Voice Activity Detection Node for ROS 2** + +Analyzes audio streams to detect the presence of speech or voice activity and publishes the detection state. Useful for conversation systems, wake word triggers, and audio recording automation. + +--- +### ✅ Topics +| Topic | Type | Description | +|------------------|------------------------------------------|-----------------------------------| +| `/voice_activity` | `audio_tools/msg/VoiceActivity` | Voice activity detection state | +| Subscribes to: | | | +| `/audio_stamped` | `audio_tools/msg/AudioDataStamped` | Audio input for analysis | +--- + +### ⚙️ Parameters +| Name | Type | Default | Description | +|----------------|----------|---------|---------------------------------------------| +| `energy_threshold`| `float` | `0.01` | Detection sensitivity threshold | +| `hold_time` | `float` | `0.5` | Time in seconds to maintain detection after voice stops | +| `min_samples` | `int` | `160` | Minimum samples needed for detection decision | +--- + +### 📦 Message Notes +- `VoiceActivity`: + - `std_msgs/Header header` with `stamp` and `frame_id` + - `bool active` - Detection state (true/false) + - `float32 energy_level` - Current audio energy level + - `float32 threshold` - Detection threshold in use + - `float32 hold_time` - Time to hold detection state after voice stops +--- + +### 🧩 Implementation Notes +- Uses an **energy-based algorithm** for voice detection. +- Converts supported audio formats to normalized float [-1.0, 1.0] for processing. +- Energy calculation: `energy = sum(sample * sample) / num_samples` +- Hold timer prevents rapid switching between states during pauses. +- Format conversion is implemented for U8, S8, S16LE, and F32/F32LE; other formats (e.g. S16BE) are not converted. 
+--- + +### 🏁 Launch Example +```bash +ros2 run audio_tools vad_node \ + --ros-args \ +-p energy_threshold:=0.015 \ +-p hold_time:=0.7 \ +-p min_samples:=160 +``` +--- + +### 🧪 Dependencies +- `rclcpp` +- `audio_tools/msg/AudioDataStamped` +- `audio_tools/msg/VoiceActivity` \ No newline at end of file diff --git a/docs/usage/overview.md b/docs/usage/overview.md index caa16c0..9d8a144 100644 --- a/docs/usage/overview.md +++ b/docs/usage/overview.md @@ -57,3 +57,15 @@ ros2 run audio_tools audio_capture_node ros2 run audio_tools audio_playback_node \ --ros-args -p audio_topic:=/audio_stamped ``` + +### 🗣️ 5. Voice Activity Detection + +Capture microphone input and publish it to `/audio_stamped`. A simple energy-based Voice Activity Detection (VAD) node subscribes to `/audio_stamped` and publishes detection results to `/voice_activity`. Note that the VAD parameters are set in the launch file. + +```bash +# Terminal 1 +ros2 launch audio_tools audio_vad.launch.py + +# Terminal 2 +ros2 topic echo /voice_activity +``` diff --git a/launch/audio_vad.launch.py b/launch/audio_vad.launch.py new file mode 100644 index 0000000..c56b88f --- /dev/null +++ b/launch/audio_vad.launch.py @@ -0,0 +1,39 @@ +from launch import LaunchDescription +from launch_ros.actions import Node + + +def generate_launch_description(): + return LaunchDescription( + [ + Node( + package="audio_tools", + executable="audio_capture_node", + name="audio_capture", + parameters=[ + { + "sample_format": "S16LE", + "channels": 1, + "sample_rate": 16000, + "device": -1, # Use default device + "chunk_size": 1024, + } + ], + output="screen", + ), + Node( + package="audio_tools", + executable="vad_node", + name="vad", + parameters=[ + { + "energy_threshold": 0.01, + "hold_time": 0.5, + "min_samples": 160, + "voice_activity_topic": "voice_activity", + "audio_data_topic": "audio_stamped", + } + ], # Office, close mic + output="screen", + ), + ] + ) diff --git a/msg/VoiceActivity.msg b/msg/VoiceActivity.msg new file mode 100644 index 
0000000..8a31c90 --- /dev/null +++ b/msg/VoiceActivity.msg @@ -0,0 +1,8 @@ +std_msgs/Header header +bool active +# Current audio energy level +float32 energy_level +# Current threshold being used +float32 threshold +# Hold time in seconds for the detection +float32 hold_time diff --git a/src/vad_node.cpp b/src/vad_node.cpp new file mode 100644 index 0000000..81ab573 --- /dev/null +++ b/src/vad_node.cpp @@ -0,0 +1,212 @@ +/** + * @file vad_node.cpp + * @brief Voice Activity Detection Node for ROS 2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "vad_node.hpp" +#include +#include +#include + +VADNode::VADNode() : Node("vad_node"), _current_vad_state(false) { + // Declare and get parameters + this->declare_parameter("energy_threshold", 0.01); + this->declare_parameter("hold_time", 0.5); + this->declare_parameter("min_samples", 160); + this->declare_parameter("voice_activity_topic", + "voice_activity"); + this->declare_parameter("audio_data_topic", "audio_stamped"); + + this->get_parameter("energy_threshold", _energy_threshold); + this->get_parameter("hold_time", _hold_time); + this->get_parameter("min_samples", _min_samples); + this->get_parameter("voice_activity_topic", _voice_activity_topic); + this->get_parameter("audio_data_topic", _audio_data_topic); + + RCLCPP_INFO(this->get_logger(), "VAD initialized with parameters:"); + RCLCPP_INFO(this->get_logger(), " - Energy threshold: %.4f", + _energy_threshold); + RCLCPP_INFO(this->get_logger(), " - Hold time: %.2f seconds", _hold_time); + RCLCPP_INFO(this->get_logger(), " - Min samples: %d", _min_samples); + RCLCPP_INFO(this->get_logger(), " - Voice activity topic: %s", + _voice_activity_topic.c_str()); + RCLCPP_INFO(this->get_logger(), " - Audio data topic: %s", + _audio_data_topic.c_str()); + + // Initialize the last voice time to current time + _last_voice_time = this->now(); + + // Create publisher for voice activity detection + _vad_pub = this->create_publisher( + _voice_activity_topic, 10); + + // Subscribe to audio data stamped topic + _audio_sub = this->create_subscription( + _audio_data_topic, 10, + std::bind(&VADNode::audioCallback, this, std::placeholders::_1)); + + RCLCPP_INFO(this->get_logger(), "VAD node is ready"); +} + +void VADNode::audioCallback( + const audio_tools::msg::AudioDataStamped::SharedPtr msg) { + const auto &audio_data = msg->audio.data; + const auto &audio_info = msg->info; + + // Skip if we have no data + if (audio_data.empty()) { + return; + } + + // Convert audio data to float samples for processing + 
std::vector samples; + if (!convertToFloatSamples(audio_data, audio_info.sample_format, samples)) { + RCLCPP_WARN(this->get_logger(), "Failed to convert audio format %s", + audio_info.sample_format.c_str()); + return; + } + + // Calculate energy level + float energy_level = calculateEnergyLevel(samples); + + // Detect voice activity + bool voice_active = (energy_level > _energy_threshold) && + (samples.size() >= static_cast(_min_samples)); + + // Handle hold time logic + if (voice_active) { + _last_voice_time = this->now(); + + if (!_current_vad_state) { + _current_vad_state = true; + RCLCPP_DEBUG(this->get_logger(), "Voice activity started"); + } + } else { + // Check if we're within the hold time + rclcpp::Duration time_since_voice = this->now() - _last_voice_time; + if (_current_vad_state && time_since_voice.seconds() > _hold_time) { + _current_vad_state = false; + RCLCPP_DEBUG(this->get_logger(), "Voice activity ended"); + } + } + + // Publish the current VAD state + publishVoiceActivity(_current_vad_state, energy_level, msg->header.stamp); +} + +bool VADNode::detectVoiceActivity(const std::vector &audio_data, + const std::string &sample_format) { + // Convert to float samples + std::vector samples; + if (!convertToFloatSamples(audio_data, sample_format, samples)) { + return false; + } + + // Check if we have enough samples + if (samples.size() < static_cast(_min_samples)) { + return false; + } + + // Calculate energy level + float energy_level = calculateEnergyLevel(samples); + + // Compare with threshold + return energy_level > _energy_threshold; +} + +float VADNode::calculateEnergyLevel(const std::vector &samples) { + float energy = 0.0f; + + for (const auto &sample : samples) { + energy += sample * sample; + } + + // Normalize by number of samples + if (!samples.empty()) { + energy /= samples.size(); + } + + return energy; +} + +bool VADNode::convertToFloatSamples(const std::vector &audio_data, + const std::string &sample_format, + std::vector &out_samples) 
{ + // Find the byte size for this format + auto it = _format_byte_size.find(sample_format); + if (it == _format_byte_size.end()) { + RCLCPP_ERROR(this->get_logger(), "Unknown audio format: %s", + sample_format.c_str()); + return false; + } + + int bytes_per_sample = it->second; + size_t num_samples = audio_data.size() / bytes_per_sample; + out_samples.resize(num_samples); + + // Convert based on format + if (sample_format == "F32" || sample_format == "F32LE") { + // Direct copy for float format (assuming little endian) + std::memcpy(out_samples.data(), audio_data.data(), audio_data.size()); + } else if (sample_format == "S16LE") { + // Convert signed 16-bit LE to float + const int16_t *int_samples = + reinterpret_cast(audio_data.data()); + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = int_samples[i] / 32768.0f; + } + } else if (sample_format == "S8") { + // Convert signed 8-bit to float + const int8_t *int_samples = + reinterpret_cast(audio_data.data()); + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = int_samples[i] / 128.0f; + } + } else if (sample_format == "U8") { + // Convert unsigned 8-bit to float + for (size_t i = 0; i < num_samples; i++) { + out_samples[i] = (audio_data[i] - 128) / 128.0f; + } + } else { + RCLCPP_ERROR(this->get_logger(), "Format conversion not implemented for %s", + sample_format.c_str()); + throw std::runtime_error("Format conversion not implemented for " + + sample_format); + } + + return true; +} + +void VADNode::publishVoiceActivity(bool active, float energy_level, + const rclcpp::Time ×tamp) { + auto msg = std::make_unique(); + + msg->header.stamp = timestamp; + msg->header.frame_id = "audio_frame"; + msg->active = active; + msg->energy_level = energy_level; + msg->threshold = _energy_threshold; + msg->hold_time = _hold_time; + + _vad_pub->publish(std::move(msg)); +} + +int main(int argc, char *argv[]) { + rclcpp::init(argc, argv); + rclcpp::spin(std::make_shared()); + rclcpp::shutdown(); + return 0; 
+} \ No newline at end of file diff --git a/src/vad_node.hpp b/src/vad_node.hpp new file mode 100644 index 0000000..00da1b9 --- /dev/null +++ b/src/vad_node.hpp @@ -0,0 +1,132 @@ +/** + * @file vad_node.hpp + * @brief Voice Activity Detection Node for ROS 2 + * + * Copyright 2025 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef VAD_NODE_HPP +#define VAD_NODE_HPP + +#include + +#include "audio_tools/msg/audio_data.hpp" +#include "audio_tools/msg/audio_data_stamped.hpp" +#include "audio_tools/msg/audio_info.hpp" +#include "audio_tools/msg/voice_activity.hpp" + +/** + * @class VADNode + * @brief A ROS 2 node for Voice Activity Detection (VAD) using audio streams. + * + * Compares the mean-square energy of each incoming AudioDataStamped message + * with a threshold (subject to a minimum sample count) and publishes the + * resulting state, smoothed by a hold time, as a VoiceActivity message. + */ +class VADNode : public rclcpp::Node { +public: + /** + * @brief Constructor for the VADNode. + * Initializes subscribers, publishers, and parameters. + */ + VADNode(); + + /** + * @brief Destructor for VADNode. + */ + ~VADNode() = default; + +private: + /** + * @brief Callback for audio data messages. + * @param msg The audio message received from the audio capture node. + */ + void audioCallback(const audio_tools::msg::AudioDataStamped::SharedPtr msg); + + /** + * @brief Processes audio data to detect voice activity. + * @param audio_data The raw audio data samples. + * @param sample_format The format of the audio samples. + * @return True if voice activity is detected, false otherwise. + * @note Stateless check: the hold time is not applied here. 
+ */ + bool detectVoiceActivity(const std::vector &audio_data, + const std::string &sample_format); + + /** + * @brief Converts audio data to float samples for processing. + * @param audio_data The raw audio data. + * @param sample_format The format of the audio samples. + * @param out_samples Output vector of float samples. + * @return True if conversion was successful, false otherwise. + */ + bool convertToFloatSamples(const std::vector &audio_data, + const std::string &sample_format, + std::vector &out_samples); + + /** + * @brief Calculates the energy level of the audio samples. + * @param samples The audio samples in float format. + * @return The mean of the squared samples (0 for an empty vector). + */ + float calculateEnergyLevel(const std::vector &samples); + + /** + * @brief Publishes the current voice activity state. + * @param active Whether voice activity is detected. + * @param energy_level The current audio energy level. + * @param timestamp The timestamp for the message. + */ + void publishVoiceActivity(bool active, float energy_level, + const rclcpp::Time ×tamp); + + /** + * @brief ROS publisher for voice activity detection results. + */ + rclcpp::Publisher::SharedPtr _vad_pub; + + /** + * @brief ROS subscriber for audio data. + */ + rclcpp::Subscription::SharedPtr + _audio_sub; + + /** + * @brief Parameters for VAD. + */ + float _energy_threshold; // Threshold for energy-based detection + float _hold_time; // Time in seconds to hold detection state + int _min_samples; // Minimum number of samples for detection + + /** + * @brief State variables for VAD. + */ + bool _current_vad_state; // Current voice activity state + rclcpp::Time _last_voice_time; // Time when voice was last detected + + /** + * @brief Conversion map from audio format name to bytes per sample. 
+ */ + const std::unordered_map _format_byte_size = { + {"U8", 1}, {"S8", 1}, {"U16LE", 2}, {"U16BE", 2}, + {"S16LE", 2}, {"S16BE", 2}, {"S32LE", 4}, {"S32BE", 4}, + {"F32LE", 4}, {"F32BE", 4}, {"F32", 4}}; + + /** + * @brief Parameter for the voice activity topic name. + */ + std::string _voice_activity_topic; + + /** + * @brief Parameter for the audio data topic name. + */ + std::string _audio_data_topic; +}; + +#endif /* VAD_NODE_HPP */ \ No newline at end of file