From 4f8628afb0b9159a2ac512c7f2713877c193983a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Dec 2025 04:23:17 +0000
Subject: [PATCH 01/10] Initial plan
From 23c5ea0f37ee19f898bd7cf4e0516cdb5423c0ba Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 4 Dec 2025 04:34:53 +0000
Subject: [PATCH 02/10] Add AudioQuery models and basic UI implementation
Co-authored-by: yuto-trd <66758394+yuto-trd@users.noreply.github.com>
---
.../Models/AccentPhrase.cs | 33 ++++
.../Models/AudioQuery.cs | 69 +++++++
src/Beutl.Extensions.Voice/Models/Mora.cs | 45 +++++
.../ViewModels/AccentPhraseViewModel.cs | 53 ++++++
.../ViewModels/TtsTabViewModel.cs | 112 ++++++++++-
.../Views/TtsTabView.axaml | 176 ++++++++++++++++++
6 files changed, 480 insertions(+), 8 deletions(-)
create mode 100644 src/Beutl.Extensions.Voice/Models/AccentPhrase.cs
create mode 100644 src/Beutl.Extensions.Voice/Models/AudioQuery.cs
create mode 100644 src/Beutl.Extensions.Voice/Models/Mora.cs
create mode 100644 src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs
diff --git a/src/Beutl.Extensions.Voice/Models/AccentPhrase.cs b/src/Beutl.Extensions.Voice/Models/AccentPhrase.cs
new file mode 100644
index 0000000..83da860
--- /dev/null
+++ b/src/Beutl.Extensions.Voice/Models/AccentPhrase.cs
@@ -0,0 +1,33 @@
+using System.Text.Json.Serialization;
+
+namespace Beutl.Extensions.Voice.Models;
+
+/// <summary>
+/// Accent phrase.
+/// </summary>
+public class AccentPhrase
+{
+ /// <summary>
+ /// List of moras. Serialized under the JSON name "moras".
+ /// </summary>
+ [JsonPropertyName("moras")]
+ public Mora[] Moras { get; set; } = [];
+
+ /// <summary>
+ /// Accent position (1-based). Serialized under the JSON name "accent".
+ /// </summary>
+ [JsonPropertyName("accent")]
+ public int Accent { get; set; }
+
+ /// <summary>
+ /// Whether the phrase is interrogative. Serialized under the JSON name "is_interrogative".
+ /// </summary>
+ [JsonPropertyName("is_interrogative")]
+ public bool IsInterrogative { get; set; }
+
+ /// <summary>
+ /// Silence (pause mora) appended after the phrase, if any. Serialized under the JSON name "pause_mora".
+ /// </summary>
+ [JsonPropertyName("pause_mora")]
+ public Mora? PauseMora { get; set; }
+}
diff --git a/src/Beutl.Extensions.Voice/Models/AudioQuery.cs b/src/Beutl.Extensions.Voice/Models/AudioQuery.cs
new file mode 100644
index 0000000..7642d1f
--- /dev/null
+++ b/src/Beutl.Extensions.Voice/Models/AudioQuery.cs
@@ -0,0 +1,69 @@
+using System.Text.Json.Serialization;
+
+namespace Beutl.Extensions.Voice.Models;
+
+/// <summary>
+/// Query used for speech synthesis.
+/// </summary>
+public class AudioQuery
+{
+ /// <summary>
+ /// List of accent phrases. Serialized under the JSON name "accent_phrases".
+ /// </summary>
+ [JsonPropertyName("accent_phrases")]
+ public AccentPhrase[] AccentPhrases { get; set; } = [];
+
+ /// <summary>
+ /// Overall speech speed. Defaults to 1.0.
+ /// </summary>
+ [JsonPropertyName("speedScale")]
+ public float SpeedScale { get; set; } = 1.0f;
+
+ /// <summary>
+ /// Overall pitch. Defaults to 0.0.
+ /// </summary>
+ [JsonPropertyName("pitchScale")]
+ public float PitchScale { get; set; } = 0.0f;
+
+ /// <summary>
+ /// Overall intonation. Defaults to 1.0.
+ /// </summary>
+ [JsonPropertyName("intonationScale")]
+ public float IntonationScale { get; set; } = 1.0f;
+
+ /// <summary>
+ /// Overall volume. Defaults to 1.0.
+ /// </summary>
+ [JsonPropertyName("volumeScale")]
+ public float VolumeScale { get; set; } = 1.0f;
+
+ /// <summary>
+ /// Length of the silence before the speech, in seconds. Defaults to 0.1.
+ /// </summary>
+ [JsonPropertyName("prePhonemeLength")]
+ public float PrePhonemeLength { get; set; } = 0.1f;
+
+ /// <summary>
+ /// Length of the silence after the speech, in seconds. Defaults to 0.1.
+ /// </summary>
+ [JsonPropertyName("postPhonemeLength")]
+ public float PostPhonemeLength { get; set; } = 0.1f;
+
+ /// <summary>
+ /// Output sampling rate of the audio data. Defaults to 24000.
+ /// </summary>
+ [JsonPropertyName("outputSamplingRate")]
+ public int OutputSamplingRate { get; set; } = 24000;
+
+ /// <summary>
+ /// Whether the audio data is output in stereo. Defaults to false.
+ /// </summary>
+ [JsonPropertyName("outputStereo")]
+ public bool OutputStereo { get; set; } = false;
+
+ /// <summary>
+ /// [Read-only] AquesTalk-style notation.
+ /// </summary>
+ [JsonPropertyName("kana")]
+ public string? Kana { get; set; }
+}
diff --git a/src/Beutl.Extensions.Voice/Models/Mora.cs b/src/Beutl.Extensions.Voice/Models/Mora.cs
new file mode 100644
index 0000000..27108ae
--- /dev/null
+++ b/src/Beutl.Extensions.Voice/Models/Mora.cs
@@ -0,0 +1,45 @@
+using System.Text.Json.Serialization;
+
+namespace Beutl.Extensions.Voice.Models;
+
+/// <summary>
+/// Mora (the smallest unit of speech).
+/// </summary>
+public class Mora
+{
+ /// <summary>
+ /// Text (character) of the mora. Serialized under the JSON name "text".
+ /// </summary>
+ [JsonPropertyName("text")]
+ public string Text { get; set; } = "";
+
+ /// <summary>
+ /// Consonant phoneme; nullable. Serialized under the JSON name "consonant".
+ /// </summary>
+ [JsonPropertyName("consonant")]
+ public string? Consonant { get; set; }
+
+ /// <summary>
+ /// Consonant length, in seconds; nullable. Serialized under the JSON name "consonant_length".
+ /// </summary>
+ [JsonPropertyName("consonant_length")]
+ public float? ConsonantLength { get; set; }
+
+ /// <summary>
+ /// Vowel phoneme. Serialized under the JSON name "vowel".
+ /// </summary>
+ [JsonPropertyName("vowel")]
+ public string Vowel { get; set; } = "";
+
+ /// <summary>
+ /// Vowel length, in seconds. Serialized under the JSON name "vowel_length".
+ /// </summary>
+ [JsonPropertyName("vowel_length")]
+ public float VowelLength { get; set; }
+
+ /// <summary>
+ /// Pitch (Hz). Serialized under the JSON name "pitch".
+ /// </summary>
+ [JsonPropertyName("pitch")]
+ public float Pitch { get; set; }
+}
diff --git a/src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs b/src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs
new file mode 100644
index 0000000..aa3b0a5
--- /dev/null
+++ b/src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs
@@ -0,0 +1,53 @@
+using System.Collections.ObjectModel;
+using Beutl.Extensions.Voice.Models;
+using Reactive.Bindings;
+
+namespace Beutl.Extensions.Voice.ViewModels;
+
+public class AccentPhraseViewModel // Editable wrapper around one AccentPhrase and its moras.
+{
+ public AccentPhraseViewModel(AccentPhrase accentPhrase, int phraseIndex) // phraseIndex: position of the phrase in the owning query's array.
+ {
+ Model = accentPhrase;
+ PhraseIndex = phraseIndex;
+ Accent = new ReactiveProperty<int>(accentPhrase.Accent);
+ IsInterrogative = new ReactiveProperty<bool>(accentPhrase.IsInterrogative);
+
+ Moras = new ObservableCollection<MoraViewModel>(
+ accentPhrase.Moras.Select((m, i) => new MoraViewModel(m, i)));
+
+ // Copy every edited value straight back into the wrapped model instance.
+ Accent.Subscribe(value => Model.Accent = value);
+ IsInterrogative.Subscribe(value => Model.IsInterrogative = value);
+ }
+
+ public AccentPhrase Model { get; } // The wrapped model; mutated in place by the subscriptions above.
+ public int PhraseIndex { get; } // Index passed to the constructor.
+ public ReactiveProperty<int> Accent { get; } // Mirrors AccentPhrase.Accent.
+ public ReactiveProperty<bool> IsInterrogative { get; } // Mirrors AccentPhrase.IsInterrogative.
+ public ObservableCollection<MoraViewModel> Moras { get; } // One view model per entry of AccentPhrase.Moras, in order.
+
+ public string DisplayText => string.Join("", Moras.Select(m => m.Text.Value)); // Concatenated mora texts; recomputed on every read.
+}
+
+public class MoraViewModel // Editable wrapper around a single Mora.
+{
+ public MoraViewModel(Mora mora, int moraIndex) // moraIndex: position of the mora inside its accent phrase.
+ {
+ Model = mora;
+ MoraIndex = moraIndex;
+ Text = new ReactiveProperty<string>(mora.Text);
+ Pitch = new ReactiveProperty<float>(mora.Pitch);
+ VowelLength = new ReactiveProperty<float>(mora.VowelLength);
+
+ // Only Pitch and VowelLength are copied back to the model; Text has no write-back subscription.
+ Pitch.Subscribe(value => Model.Pitch = value);
+ VowelLength.Subscribe(value => Model.VowelLength = value);
+ }
+
+ public Mora Model { get; } // The wrapped model; mutated in place by the subscriptions above.
+ public int MoraIndex { get; } // Index passed to the constructor.
+ public ReactiveProperty<string> Text { get; } // Initialised from Mora.Text.
+ public ReactiveProperty<float> Pitch { get; } // Mirrors Mora.Pitch.
+ public ReactiveProperty<float> VowelLength { get; } // Mirrors Mora.VowelLength.
+}
diff --git a/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs b/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs
index 5c012f3..23fabe4 100644
--- a/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs
+++ b/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs
@@ -1,3 +1,5 @@
+using System.Collections.ObjectModel;
+using System.Text.Json;
using System.Text.Json.Nodes;
using Avalonia.Threading;
using Beutl.Extensibility;
@@ -68,6 +70,12 @@ public TtsTabViewModel(TtsTabExtension extension, IEditorContext editorContext)
public ReactiveProperty IsVoiceVoxInstalled { get; } = new(true);
+ public ReactiveProperty<AudioQuery?> CurrentAudioQuery { get; } = new(); // Last generated query; null until GenerateAudioQuery succeeds.
+
+ public ReactiveProperty<bool> IsAudioQueryGenerated { get; } = new(); // Set by GenerateAudioQuery once a query has been produced.
+
+ public ObservableCollection<AccentPhraseViewModel> AccentPhrases { get; } = new(); // Editable view models for the current query's accent phrases.
+
public void OnLoaded()
{
var loader = TtsLoader.VoiceVoxLoader.Value;
@@ -88,6 +96,75 @@ public void OnLoaded()
_initTcs.SetResult();
}
+ public Task GenerateAudioQuery() // Builds an AudioQuery for the current text via Task.Run and publishes it on the UI thread.
+ {
+ return Task.Run(() =>
+ {
+ try
+ {
+ IsGenerating.Value = true;
+ var synthesizer = TtsLoader.VoiceVoxLoader.Value?.Synthesizer;
+ var voice = SelectedVoice.Value;
+ var style = SelectedStyle.Value ?? voice?.Styles.FirstOrDefault(); // Fall back to the voice's first style when none is selected.
+
+ if (synthesizer == null)
+ {
+ _logger.LogError("Synthesizer is not initialized");
+ return;
+ }
+
+ if (style == null)
+ {
+ _logger.LogError("Style is not selected");
+ return;
+ }
+
+ if (string.IsNullOrWhiteSpace(Text.Value))
+ {
+ _logger.LogError("Text is empty");
+ return;
+ }
+
+ _logger.LogInformation("Generating AudioQuery...");
+ var result = synthesizer.CreateAudioQuery(
+ Text.Value, style.Id, AudioQueryOptions.Default(),
+ out var audioQueryJson);
+
+ if (result != ResultCode.RESULT_OK)
+ {
+ _logger.LogError("Failed to generate AudioQuery: {Result}", result.ToMessage());
+ return;
+ }
+
+ _logger.LogInformation("AudioQuery generated successfully");
+ var audioQuery = JsonSerializer.Deserialize<AudioQuery>(audioQueryJson!); // Explicit target type; returns null for a JSON "null" payload.
+ Dispatcher.UIThread.Post(() =>
+ {
+ CurrentAudioQuery.Value = audioQuery;
+ IsAudioQueryGenerated.Value = audioQuery is not null; // Do not report success when there is no query to edit.
+
+ // Rebuild the editable phrase list from the new query (left empty when there are no phrases).
+ AccentPhrases.Clear();
+ if (audioQuery?.AccentPhrases != null)
+ {
+ for (int i = 0; i < audioQuery.AccentPhrases.Length; i++)
+ {
+ AccentPhrases.Add(new AccentPhraseViewModel(audioQuery.AccentPhrases[i], i));
+ }
+ }
+ });
+ }
+ catch (Exception ex)
+ {
+ _logger.LogError(ex, "Failed to generate AudioQuery"); // Best effort: failures are logged, not rethrown to the caller.
+ }
+ finally
+ {
+ IsGenerating.Value = false; // Always clear the busy flag, including on the early returns above.
+ }
+ });
+ }
+
public Task Generate()
{
return Task.Run(async () =>
@@ -200,16 +277,35 @@ public Task Play()
return null;
}
- var result = synthesizer.Tts(
- Text.Value, style.Id, TtsOptions.Default(),
- out var outputWavSize, out var outputWav);
- if (result != ResultCode.RESULT_OK)
+ // Use AudioQuery if available, otherwise use direct TTS
+ if (CurrentAudioQuery.Value != null)
{
- _logger.LogError("Failed to generate TTS: {Result}", result.ToMessage());
- return null;
+ _logger.LogInformation("Synthesizing from AudioQuery...");
+ var audioQueryJson = JsonSerializer.Serialize(CurrentAudioQuery.Value);
+ var result = synthesizer.Synthesis(
+ audioQueryJson, style.Id, SynthesisOptions.Default(),
+ out var outputWavSize, out var outputWav);
+ if (result != ResultCode.RESULT_OK)
+ {
+ _logger.LogError("Failed to synthesize: {Result}", result.ToMessage());
+ return null;
+ }
+ return outputWav;
+ }
+ else
+ {
+ _logger.LogInformation("Generating TTS directly...");
+ var result = synthesizer.Tts(
+ Text.Value, style.Id, TtsOptions.Default(),
+ out var outputWavSize, out var outputWav);
+ if (result != ResultCode.RESULT_OK)
+ {
+ _logger.LogError("Failed to generate TTS: {Result}", result.ToMessage());
+ return null;
+ }
+
+ return outputWav;
}
-
- return outputWav;
}
catch (Exception ex)
{
diff --git a/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml b/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml
index 718fb23..30673d8 100644
--- a/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml
+++ b/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml
@@ -62,6 +62,182 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+