CLFML · ducroq · Apr 14, 2025 · Apr 13, 2025 · Apr 13, 2025 · Apr 13, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -12,12 +12,14 @@ rosidl_generate_interfaces(${PROJECT_NAME}
   msg/AudioData.msg
   msg/AudioDataStamped.msg
   msg/AudioInfo.msg
+  msg/VoiceActivity.msg  
   DEPENDENCIES std_msgs
 )
 
 ament_export_dependencies(rosidl_default_runtime)
 rosidl_get_typesupport_target(cpp_typesupport_target ${PROJECT_NAME} "rosidl_typesupport_cpp")
 
+# Audio Capture Node
 add_executable(audio_capture_node src/audio_capture_node.cpp)
 
 ament_target_dependencies(audio_capture_node
@@ -32,11 +34,7 @@ if(UNIX AND NOT APPLE)  # Linux only
   target_compile_options(audio_capture_node PRIVATE -Wall)
 endif()
 
-install(TARGETS
-audio_capture_node
-DESTINATION lib/${PROJECT_NAME}) 
-
-
+# Audio Playback Node
 add_executable(audio_playback_node src/audio_playback_node.cpp)
 target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES}) 
 target_include_directories(audio_playback_node PUBLIC ${SDL2_INCLUDE_DIRS})
@@ -50,8 +48,31 @@ ament_target_dependencies(audio_playback_node
     std_msgs
 )
 
+# Voice Activity Detection (VAD) node: built from src/vad_node.cpp and linked against this package's generated C++ message typesupport
+add_executable(vad_node src/vad_node.cpp)
+target_link_libraries(vad_node "${cpp_typesupport_target}")  # target obtained via rosidl_get_typesupport_target() above
+
+if(UNIX AND NOT APPLE)  # Linux only
+  target_compile_options(vad_node PRIVATE -Wall)
+endif()
+
+ament_target_dependencies(vad_node
+    rclcpp
+    std_msgs
+)
 
+# Install all node executables (capture, playback, VAD) into lib/${PROJECT_NAME} with a single rule
 install(TARGETS
-audio_playback_node
-DESTINATION lib/${PROJECT_NAME}) 
+  audio_capture_node
+  audio_playback_node
+  vad_node
+  DESTINATION lib/${PROJECT_NAME}
+)
+
+# Install launch files: the trailing slash on launch/ copies the directory's contents (not the directory itself) into share/${PROJECT_NAME}/launch
+install(DIRECTORY
+  launch/
+  DESTINATION share/${PROJECT_NAME}/launch
+)
+
 ament_package()
diff --git a/docs/about/implementation.md b/docs/about/implementation.md
@@ -140,3 +140,58 @@ ros2 run audio_tools audio_playback_node \
 - `SDL2`
 - Custom messages in `audio_tools`
 
+
+## `vad_node`
+
+**Voice Activity Detection Node for ROS 2**
+
+Analyzes audio streams to detect the presence of speech or voice activity and publishes the detection state. Useful for conversation systems, wake word triggers, and audio recording automation.
+
+---
+### ✅ Topics
+| Topic | Type | Description |
+|------------------|------------------------------------------|-----------------------------------|
+| `/voice_activity` | `audio_tools/msg/VoiceActivity` | Voice activity detection state |
+| Subscribes to: | | |
+| `/audio_stamped` | `audio_tools/msg/AudioDataStamped` | Audio input for analysis |
+---
+
+### ⚙️ Parameters
+| Name | Type | Default | Description |
+|----------------|----------|---------|---------------------------------------------|
+| `energy_threshold`| `float` | `0.01` | Detection sensitivity threshold |
+| `hold_time` | `float` | `0.5` | Time in seconds to maintain detection after voice stops |
+| `min_samples` | `int` | `160` | Minimum samples needed for detection decision |
+---
+
+### 📦 Message Notes
+- `VoiceActivity`:
+  - `std_msgs/Header header` with `stamp` and `frame_id`
+  - `bool active` - Detection state (true/false)
+  - `float32 energy_level` - Current audio energy level
+  - `float32 threshold` - Detection threshold in use
+  - `float32 hold_time` - Time to hold detection state after voice stops
+---
+
+### 🧩 Implementation Notes
+- Uses an **energy-based algorithm** for voice detection.
+- Converts all audio formats to normalized float [-1.0, 1.0] for processing.
+- Energy calculation: `energy = sum(sample * sample) / num_samples`
+- Hold timer prevents rapid switching between states during pauses.
+- Format conversion supports U8, S8, S16LE, S16BE, F32, etc.
+---
+
+### 🏁 Launch Example
+```bash
+ros2 run audio_tools vad_node \
+  --ros-args \
+  -p energy_threshold:=0.015 \
+  -p hold_time:=0.7 \
+  -p min_samples:=160
+```
+---
+
+### 🧪 Dependencies
+- `rclcpp`
+- `audio_tools/msg/AudioDataStamped`
+- `audio_tools/msg/VoiceActivity`
diff --git a/docs/usage/overview.md b/docs/usage/overview.md
@@ -57,3 +57,15 @@ ros2 run audio_tools audio_capture_node
 ros2 run audio_tools audio_playback_node \
   --ros-args -p audio_topic:=/audio_stamped
 ```
+
+### 🗣️ 5. Voice Activity Detection
+
+Captures microphone input and publishes it to `/audio_stamped`. A simple energy-based Voice Activity Detection (VAD) node subscribes to `/audio_stamped` and publishes detection results to `/voice_activity`. Note that the VAD parameters are set in the launch file.
+
+```bash
+# Terminal 1
+ros2 launch audio_tools audio_vad.launch.py
+
+# Terminal 2
+ros2 topic echo /voice_activity
+```
diff --git a/launch/audio_vad.launch.py b/launch/audio_vad.launch.py
@@ -0,0 +1,39 @@
+from launch import LaunchDescription
+from launch_ros.actions import Node
+
+
+def generate_launch_description():  # Builds a launch description that starts audio_capture_node and vad_node together
+    return LaunchDescription(
+        [
+            Node(  # Audio capture node from the audio_tools package
+                package="audio_tools",
+                executable="audio_capture_node",
+                name="audio_capture",
+                parameters=[
+                    {
+                        "sample_format": "S16LE",
+                        "channels": 1,
+                        "sample_rate": 16000,
+                        "device": -1,  # -1 selects the default device
+                        "chunk_size": 1024,
+                    }
+                ],
+                output="screen",
+            ),
+            Node(  # Voice activity detection node from the audio_tools package
+                package="audio_tools",
+                executable="vad_node",
+                name="vad",
+                parameters=[
+                    {
+                        "energy_threshold": 0.01,
+                        "hold_time": 0.5,
+                        "min_samples": 160,
+                        "voice_activity_topic": "voice_activity",
+                        "audio_data_topic": "audio_stamped",
+                    }
+                ],  # NOTE(review): VAD values above are presumably tuned for an office with a close mic - confirm
+                output="screen",
+            ),
+        ]
+    )
diff --git a/msg/VoiceActivity.msg b/msg/VoiceActivity.msg
@@ -0,0 +1,8 @@
+std_msgs/Header header
+bool active
+# Current audio energy level
+float32 energy_level
+# Current threshold being used
+float32 threshold
+# Hold time in seconds: how long detection is held after voice stops
+float32 hold_time