From 4f8628afb0b9159a2ac512c7f2713877c193983a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 04:23:17 +0000 Subject: [PATCH 01/10] Initial plan From 23c5ea0f37ee19f898bd7cf4e0516cdb5423c0ba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 04:34:53 +0000 Subject: [PATCH 02/10] Add AudioQuery models and basic UI implementation Co-authored-by: yuto-trd <66758394+yuto-trd@users.noreply.github.com> --- .../Models/AccentPhrase.cs | 33 ++++ .../Models/AudioQuery.cs | 69 +++++++ src/Beutl.Extensions.Voice/Models/Mora.cs | 45 +++++ .../ViewModels/AccentPhraseViewModel.cs | 53 ++++++ .../ViewModels/TtsTabViewModel.cs | 112 ++++++++++- .../Views/TtsTabView.axaml | 176 ++++++++++++++++++ 6 files changed, 480 insertions(+), 8 deletions(-) create mode 100644 src/Beutl.Extensions.Voice/Models/AccentPhrase.cs create mode 100644 src/Beutl.Extensions.Voice/Models/AudioQuery.cs create mode 100644 src/Beutl.Extensions.Voice/Models/Mora.cs create mode 100644 src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs diff --git a/src/Beutl.Extensions.Voice/Models/AccentPhrase.cs b/src/Beutl.Extensions.Voice/Models/AccentPhrase.cs new file mode 100644 index 0000000..83da860 --- /dev/null +++ b/src/Beutl.Extensions.Voice/Models/AccentPhrase.cs @@ -0,0 +1,33 @@ +using System.Text.Json.Serialization; + +namespace Beutl.Extensions.Voice.Models; + +/// +/// アクセント句 +/// +public class AccentPhrase +{ + /// + /// モーラのリスト + /// + [JsonPropertyName("moras")] + public Mora[] Moras { get; set; } = []; + + /// + /// アクセント位置(1から始まる) + /// + [JsonPropertyName("accent")] + public int Accent { get; set; } + + /// + /// 疑問文かどうか + /// + [JsonPropertyName("is_interrogative")] + public bool IsInterrogative { get; set; } + + /// + /// 後ろに無音を付けるか + /// + [JsonPropertyName("pause_mora")] + public Mora? PauseMora { get; set; } +} diff --git a/src/Beutl.Extensions.Voice/Models/AudioQuery.cs b/src/Beutl.Extensions.Voice/Models/AudioQuery.cs new file mode 100644 index 0000000..7642d1f --- /dev/null +++ b/src/Beutl.Extensions.Voice/Models/AudioQuery.cs @@ -0,0 +1,69 @@ +using System.Text.Json.Serialization; + +namespace Beutl.Extensions.Voice.Models; + +/// +/// 音声合成用のクエリ +/// +public class AudioQuery +{ + /// + /// アクセント句のリスト + /// + [JsonPropertyName("accent_phrases")] + public AccentPhrase[] AccentPhrases { get; set; } = []; + + /// + /// 全体の話速 + /// + [JsonPropertyName("speedScale")] + public float SpeedScale { get; set; } = 1.0f; + + /// + /// 全体の音高 + /// + [JsonPropertyName("pitchScale")] + public float PitchScale { get; set; } = 0.0f; + + /// + /// 全体の抑揚 + /// + [JsonPropertyName("intonationScale")] + public float IntonationScale { get; set; } = 1.0f; + + /// + /// 全体の音量 + /// + [JsonPropertyName("volumeScale")] + public float VolumeScale { get; set; } = 1.0f; + + /// + /// 音声の前の無音時間(秒) + /// + [JsonPropertyName("prePhonemeLength")] + public float PrePhonemeLength { get; set; } = 0.1f; + + /// + /// 音声の後の無音時間(秒) + /// + [JsonPropertyName("postPhonemeLength")] + public float PostPhonemeLength { get; set; } = 0.1f; + + /// + /// 音声データの出力サンプリングレート + /// + [JsonPropertyName("outputSamplingRate")] + public int OutputSamplingRate { get; set; } = 24000; + + /// + /// 音声データをステレオ出力するか否か + /// + [JsonPropertyName("outputStereo")] + public bool OutputStereo { get; set; } = false; + + /// + /// [読み取り専用] AquesTalk風記法 + /// + [JsonPropertyName("kana")] + public string? Kana { get; set; } +} diff --git a/src/Beutl.Extensions.Voice/Models/Mora.cs b/src/Beutl.Extensions.Voice/Models/Mora.cs new file mode 100644 index 0000000..27108ae --- /dev/null +++ b/src/Beutl.Extensions.Voice/Models/Mora.cs @@ -0,0 +1,45 @@ +using System.Text.Json.Serialization; + +namespace Beutl.Extensions.Voice.Models; + +/// +/// モーラ(音声の最小単位) +/// +public class Mora +{ + /// + /// 文字 + /// + [JsonPropertyName("text")] + public string Text { get; set; } = ""; + + /// + /// 子音の音素 + /// + [JsonPropertyName("consonant")] + public string? Consonant { get; set; } + + /// + /// 子音の音長(秒) + /// + [JsonPropertyName("consonant_length")] + public float? ConsonantLength { get; set; } + + /// + /// 母音の音素 + /// + [JsonPropertyName("vowel")] + public string Vowel { get; set; } = ""; + + /// + /// 母音の音長(秒) + /// + [JsonPropertyName("vowel_length")] + public float VowelLength { get; set; } + + /// + /// 音高(Hz) + /// + [JsonPropertyName("pitch")] + public float Pitch { get; set; } +} diff --git a/src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs b/src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs new file mode 100644 index 0000000..aa3b0a5 --- /dev/null +++ b/src/Beutl.Extensions.Voice/ViewModels/AccentPhraseViewModel.cs @@ -0,0 +1,53 @@ +using System.Collections.ObjectModel; +using Beutl.Extensions.Voice.Models; +using Reactive.Bindings; + +namespace Beutl.Extensions.Voice.ViewModels; + +public class AccentPhraseViewModel +{ + public AccentPhraseViewModel(AccentPhrase accentPhrase, int phraseIndex) + { + Model = accentPhrase; + PhraseIndex = phraseIndex; + Accent = new ReactiveProperty(accentPhrase.Accent); + IsInterrogative = new ReactiveProperty(accentPhrase.IsInterrogative); + + Moras = new ObservableCollection( + accentPhrase.Moras.Select((m, i) => new MoraViewModel(m, i))); + + // Update model when properties change + Accent.Subscribe(value => Model.Accent = value); + IsInterrogative.Subscribe(value => Model.IsInterrogative = value); + } + + public AccentPhrase Model { get; } + public int PhraseIndex { get; } + public ReactiveProperty Accent { get; } + public ReactiveProperty IsInterrogative { get; } + public ObservableCollection Moras { get; } + + public string DisplayText => string.Join("", Moras.Select(m => m.Text.Value)); +} + +public class MoraViewModel +{ + public MoraViewModel(Mora mora, int moraIndex) + { + Model = mora; + MoraIndex = moraIndex; + Text = new ReactiveProperty(mora.Text); + Pitch = new ReactiveProperty(mora.Pitch); + VowelLength = new ReactiveProperty(mora.VowelLength); + + // Update model when properties change + Pitch.Subscribe(value => Model.Pitch = value); + VowelLength.Subscribe(value => Model.VowelLength = value); + } + + public Mora Model { get; } + public int MoraIndex { get; } + public ReactiveProperty Text { get; } + public ReactiveProperty Pitch { get; } + public ReactiveProperty VowelLength { get; } +} diff --git a/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs b/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs index 5c012f3..23fabe4 100644 --- a/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs +++ b/src/Beutl.Extensions.Voice/ViewModels/TtsTabViewModel.cs @@ -1,3 +1,5 @@ +using System.Collections.ObjectModel; +using System.Text.Json; using System.Text.Json.Nodes; using Avalonia.Threading; using Beutl.Extensibility; @@ -68,6 +70,12 @@ public TtsTabViewModel(TtsTabExtension extension, IEditorContext editorContext) public ReactiveProperty IsVoiceVoxInstalled { get; } = new(true); + public ReactiveProperty CurrentAudioQuery { get; } = new(); + + public ReactiveProperty IsAudioQueryGenerated { get; } = new(); + + public ObservableCollection AccentPhrases { get; } = new(); + public void OnLoaded() { var loader = TtsLoader.VoiceVoxLoader.Value; @@ -88,6 +96,75 @@ public void OnLoaded() _initTcs.SetResult(); } + public Task GenerateAudioQuery() + { + return Task.Run(() => + { + try + { + IsGenerating.Value = true; + var synthesizer = TtsLoader.VoiceVoxLoader.Value?.Synthesizer; + var voice = SelectedVoice.Value; + var style = SelectedStyle.Value ?? voice?.Styles.FirstOrDefault(); + + if (synthesizer == null) + { + _logger.LogError("Synthesizer is not initialized"); + return; + } + + if (style == null) + { + _logger.LogError("Style is not selected"); + return; + } + + if (string.IsNullOrWhiteSpace(Text.Value)) + { + _logger.LogError("Text is empty"); + return; + } + + _logger.LogInformation("Generating AudioQuery..."); + var result = synthesizer.CreateAudioQuery( + Text.Value, style.Id, AudioQueryOptions.Default(), + out var audioQueryJson); + + if (result != ResultCode.RESULT_OK) + { + _logger.LogError("Failed to generate AudioQuery: {Result}", result.ToMessage()); + return; + } + + _logger.LogInformation("AudioQuery generated successfully"); + var audioQuery = JsonSerializer.Deserialize(audioQueryJson!); + Dispatcher.UIThread.Post(() => + { + CurrentAudioQuery.Value = audioQuery; + IsAudioQueryGenerated.Value = true; + + // Populate AccentPhrases collection + AccentPhrases.Clear(); + if (audioQuery?.AccentPhrases != null) + { + for (int i = 0; i < audioQuery.AccentPhrases.Length; i++) + { + AccentPhrases.Add(new AccentPhraseViewModel(audioQuery.AccentPhrases[i], i)); + } + } + }); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to generate AudioQuery"); + } + finally + { + IsGenerating.Value = false; + } + }); + } + public Task Generate() { return Task.Run(async () => @@ -200,16 +277,35 @@ public Task Play() return null; } - var result = synthesizer.Tts( - Text.Value, style.Id, TtsOptions.Default(), - out var outputWavSize, out var outputWav); - if (result != ResultCode.RESULT_OK) + // Use AudioQuery if available, otherwise use direct TTS + if (CurrentAudioQuery.Value != null) { - _logger.LogError("Failed to generate TTS: {Result}", result.ToMessage()); - return null; + _logger.LogInformation("Synthesizing from AudioQuery..."); + var audioQueryJson = JsonSerializer.Serialize(CurrentAudioQuery.Value); + var result = synthesizer.Synthesis( + audioQueryJson, style.Id, SynthesisOptions.Default(), + out var outputWavSize, out var outputWav); + if (result != ResultCode.RESULT_OK) + { + _logger.LogError("Failed to synthesize: {Result}", result.ToMessage()); + return null; + } + return outputWav; + } + else + { + _logger.LogInformation("Generating TTS directly..."); + var result = synthesizer.Tts( + Text.Value, style.Id, TtsOptions.Default(), + out var outputWavSize, out var outputWav); + if (result != ResultCode.RESULT_OK) + { + _logger.LogError("Failed to generate TTS: {Result}", result.ToMessage()); + return null; + } + + return outputWav; } - - return outputWav; } catch (Exception ex) { diff --git a/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml b/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml index 718fb23..30673d8 100644 --- a/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml +++ b/src/Beutl.Extensions.Voice/Views/TtsTabView.axaml @@ -62,6 +62,182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +