Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ rosidl_generate_interfaces(${PROJECT_NAME}
msg/AudioData.msg
msg/AudioDataStamped.msg
msg/AudioInfo.msg
msg/VoiceActivity.msg
DEPENDENCIES std_msgs
)

# Export rosidl_default_runtime as a dependency of this package.
ament_export_dependencies(
  rosidl_default_runtime
)

# Store the C++ typesupport target of this package's generated interfaces in
# cpp_typesupport_target; the node executables below link against it.
rosidl_get_typesupport_target(
  cpp_typesupport_target
  ${PROJECT_NAME}
  "rosidl_typesupport_cpp"
)

# Audio Capture Node
add_executable(audio_capture_node src/audio_capture_node.cpp)

ament_target_dependencies(audio_capture_node
Expand All @@ -32,11 +34,7 @@ if(UNIX AND NOT APPLE) # Linux only
target_compile_options(audio_capture_node PRIVATE -Wall)
endif()

# Install the audio_capture_node executable into lib/${PROJECT_NAME}.
install(TARGETS audio_capture_node DESTINATION lib/${PROJECT_NAME})


# Audio Playback Node
add_executable(audio_playback_node src/audio_playback_node.cpp)
target_link_libraries(audio_playback_node "${cpp_typesupport_target}" ${SDL2_LIBRARIES})
target_include_directories(audio_playback_node PUBLIC ${SDL2_INCLUDE_DIRS})
Expand All @@ -50,8 +48,31 @@ ament_target_dependencies(audio_playback_node
std_msgs
)

# Voice Activity Detection Node
add_executable(vad_node
  src/vad_node.cpp
)

# Link the typesupport target first, then the ROS dependencies.
# NOTE(review): the keyword-less signature is kept on purpose. CMake rejects
# mixing plain and keyword target_link_libraries signatures on one target, and
# ament_target_dependencies without a visibility keyword presumably uses the
# plain form - confirm before adding PRIVATE/PUBLIC here.
target_link_libraries(vad_node
  "${cpp_typesupport_target}"
)
ament_target_dependencies(vad_node rclcpp std_msgs)

# Enable -Wall on UNIX platforms other than Apple.
if(UNIX AND NOT APPLE)
  target_compile_options(vad_node PRIVATE -Wall)
endif()

# Install targets
# A single install() rule places every node executable of this package in
# lib/${PROJECT_NAME}. Each target is listed exactly once and the argument
# list is closed only after DESTINATION, so no arguments are left outside
# the command.
install(TARGETS
  audio_capture_node
  audio_playback_node
  vad_node
  DESTINATION lib/${PROJECT_NAME}
)

# Install launch files
# The trailing slash copies the contents of launch/ rather than the directory
# itself, so the files end up directly under share/${PROJECT_NAME}/launch.
install(DIRECTORY launch/ DESTINATION share/${PROJECT_NAME}/launch)

# Register the package with ament (kept as the last call in this file).
ament_package()
55 changes: 55 additions & 0 deletions docs/about/implementation.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,58 @@ ros2 run audio_tools audio_playback_node \
- `SDL2`
- Custom messages in `audio_tools`


## `vad_node`

**Voice Activity Detection Node for ROS 2**

Analyzes audio streams to detect the presence of speech or voice activity and publishes the detection state. Useful for conversation systems, wake word triggers, and audio recording automation.

---
### ✅ Topics
| Topic | Type | Description |
|------------------|------------------------------------------|-----------------------------------|
| `/voice_activity` | `audio_tools/msg/VoiceActivity` | Voice activity detection state |
| Subscribes to: | | |
| `/audio_stamped` | `audio_tools/msg/AudioDataStamped` | Audio input for analysis |
---

### ⚙️ Parameters
| Name | Type | Default | Description |
|----------------|----------|---------|---------------------------------------------|
| `energy_threshold`| `float` | `0.01` | Detection sensitivity threshold |
| `hold_time` | `float` | `0.5` | Time in seconds to maintain detection after voice stops |
| `min_samples` | `int` | `160` | Minimum samples needed for detection decision |
---

### 📦 Message Notes
- `VoiceActivity`:
- `std_msgs/Header header` with `stamp` and `frame_id`
- `bool active` - Detection state (true/false)
- `float32 energy_level` - Current audio energy level
- `float32 threshold` - Detection threshold in use
- `float32 hold_time` - Time to hold detection state after voice stops
---

### 🧩 Implementation Notes
- Uses an **energy-based algorithm** for voice detection.
- Converts all audio formats to normalized float [-1.0, 1.0] for processing.
- Energy calculation: `energy = sum(sample * sample) / num_samples`
- Hold timer prevents rapid switching between states during pauses.
- Format conversion supports U8, S8, S16LE, S16BE, F32, etc.
---

### 🏁 Launch Example
```bash
ros2 run audio_tools vad_node \
--ros-args \
-p energy_threshold:=0.015 \
-p hold_time:=0.7 \
-p min_samples:=160
```
---

### 🧪 Dependencies
- `rclcpp`
- `audio_tools/msg/AudioDataStamped`
- `audio_tools/msg/VoiceActivity`
12 changes: 12 additions & 0 deletions docs/usage/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,15 @@ ros2 run audio_tools audio_capture_node
ros2 run audio_tools audio_playback_node \
--ros-args -p audio_topic:=/audio_stamped
```

### 🗣️ 5. Voice Activity Detection

Capture microphone input and publish it to `/audio_stamped`. A simple energy-based Voice Activity Detection (VAD) node subscribes to `/audio_stamped` and publishes detection results to `/voice_activity`. Note that the VAD parameters are set in the launch file.

```bash
# Terminal 1
ros2 launch audio_tools audio_vad.launch.py

# Terminal 2
ros2 topic echo /voice_activity
```
39 changes: 39 additions & 0 deletions launch/audio_vad.launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from launch import LaunchDescription
from launch_ros.actions import Node


def generate_launch_description():
    """Build the launch description for audio capture plus voice activity detection.

    Returns:
        LaunchDescription containing two ``audio_tools`` nodes, in this order:
        ``audio_capture_node`` (named ``audio_capture``) and ``vad_node``
        (named ``vad``). Both are started with ``output="screen"``.
    """
    capture_settings = {
        "sample_format": "S16LE",
        "channels": 1,
        "sample_rate": 16000,
        "device": -1,  # Use default device
        "chunk_size": 1024,
    }

    # The author labelled this parameter set "Office, close mic".
    vad_settings = {
        "energy_threshold": 0.01,
        "hold_time": 0.5,
        "min_samples": 160,
        "voice_activity_topic": "voice_activity",
        "audio_data_topic": "audio_stamped",
    }

    capture = Node(
        package="audio_tools",
        executable="audio_capture_node",
        name="audio_capture",
        parameters=[capture_settings],
        output="screen",
    )
    detector = Node(
        package="audio_tools",
        executable="vad_node",
        name="vad",
        parameters=[vad_settings],
        output="screen",
    )

    return LaunchDescription([capture, detector])
8 changes: 8 additions & 0 deletions msg/VoiceActivity.msg
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Voice activity detection result published by vad_node.
# Standard header; the package docs describe it as carrying `stamp` and `frame_id`
std_msgs/Header header
# Detection state (true/false)
bool active
# Current audio energy level
float32 energy_level
# Current threshold being used
float32 threshold
# Hold time in seconds for the detection
float32 hold_time
Loading