9 changes: 9 additions & 0 deletions README.md
@@ -253,6 +253,15 @@ Allow Microphone access when prompted.
,)
```

You can run the same flow against the local WebGPU microservice (see
`webgpu-local-pipeline/`) by passing `:pipeline/provider :webgpu` and, if
necessary, a custom `:webgpu/base-url`:

```clojure
(make-local-flow {:pipeline/provider :webgpu
:webgpu/base-url "http://localhost:4173"})
```

Which roughly translates to:

![Flow Diagram](./resources/flow.png)
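
The constructor returns a core.async flow. Below is a minimal REPL sketch for driving it, assuming the standard `clojure.core.async.flow` lifecycle helpers (`start`, `resume`, `stop`) and a WebGPU service already listening on the default port:

```clojure
(require '[clojure.core.async.flow :as flow])

;; Assumes the local WebGPU microservice is already running on port 4173.
(def local-flow
  (make-local-flow {:pipeline/provider :webgpu
                    :webgpu/base-url "http://localhost:4173"}))

(flow/start local-flow)  ;; allocate processes and channels
(flow/resume local-flow) ;; begin streaming microphone audio through the pipeline
;; ...talk to the agent...
(flow/stop local-flow)   ;; shut the pipeline down
```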
10 changes: 9 additions & 1 deletion bin/chat
@@ -27,7 +27,7 @@ fi
if [ ! -f "resources/secrets.edn" ]; then
print_color $RED "❌ Error: resources/secrets.edn not found"
print_color $YELLOW "Please create resources/secrets.edn with your OpenAI API key:"
echo '{:openai {:new-api-sk "your-openai-api-key-here"}}'
echo '{:openai {:api-key "your-openai-api-key-here"}}'
exit 1
fi

@@ -45,6 +45,10 @@ while [[ $# -gt 0 ]]; do
ARGS="$ARGS --scenario"
shift
;;
--webgpu)
ARGS="$ARGS --webgpu"
shift
;;
--help|-h)
HELP=true
shift
@@ -95,6 +99,10 @@ if [[ "$ARGS" == *"--scenario"* ]]; then
print_color $YELLOW "🎭 Scenario mode enabled"
fi

if [[ "$ARGS" == *"--webgpu"* ]]; then
print_color $YELLOW "🧠 Using local WebGPU service"
fi

print_color $GREEN "🔧 Loading Clojure dependencies..."

# Run the chat demo
243 changes: 136 additions & 107 deletions examples/src/simulflow_examples/local.clj
@@ -15,6 +15,7 @@
[simulflow.transport.out :as transport-out]
[simulflow.utils.core :as u]
[simulflow.vad.silero :as silero]
[simulflow-examples.webgpu-processors :as webgpu]
[taoensso.telemere :as t]))

(t/set-min-level! :debug)
@@ -28,117 +29,145 @@
these are the defaults. See each process for details
"
([] (make-local-flow {}))
([{:keys [llm-context extra-procs extra-conns debug?
language chunk-duration-ms]
:or {llm-context {:messages
[{:role "system"
:content "You are a voice agent operating via phone. Be
([opts]
(let [{:keys [llm-context extra-procs extra-conns debug?
language chunk-duration-ms]
:or {llm-context {:messages
[{:role "system"
:content "You are a voice agent operating via phone. Be
concise in your answers. The input you receive comes from a
speech-to-text (transcription) system that isn't always
efficient and may send unclear text. Ask for
clarification when you're unsure what the person said."}]
:tools
[{:type :function
:function
{:name "get_weather"
:handler (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))
:description "Get the current weather of a location"
:parameters {:type :object
:required [:town]
:properties {:town {:type :string
:description "Town for which to retrieve the current weather"}}
:additionalProperties false}
:strict true}}]}

language :en

debug? false
chunk-duration-ms 20
extra-procs {}
extra-conns []}}]

(flow/create-flow
{:procs
(u/deep-merge
{;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
:transport-in {:proc transport-in/microphone-transport-in
:args {:vad/analyser :vad.analyser/silero}}
;; raw-audio-input -> transcription frames
:transcriptor {:proc deepgram/deepgram-processor
:args {:transcription/api-key (secret [:deepgram :api-key])
:transcription/interim-results? true
:transcription/punctuate? false
:transcription/vad-events? false
:transcription/smart-format? true
:transcription/model :nova-2
:transcription/utterance-end-ms 1000
:transcription/language language}}

;; user transcription & llm message frames -> llm-context frames
;; responsible for keeping the full conversation history
:context-aggregator {:proc context/context-aggregator
:args {:llm/context llm-context
:aggregator/debug? debug?}}

;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
:llm {:proc openai/openai-llm-process
:args {:openai/api-key (secret [:openai :new-api-sk])
:llm/model :gpt-4.1-mini}}

;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
:assistant-context-assembler {:proc context/assistant-context-assembler
:args {:debug? debug?}}

;; llm-text-chunk -> sentence speak frames (faster for text to speech)
:llm-sentence-assembler {:proc context/llm-sentence-assembler}

;; speak-frames -> audio-output-raw frames
:tts {:proc xi/elevenlabs-tts-process
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
:elevenlabs/model-id "eleven_flash_v2_5"
:elevenlabs/voice-id (secret [:elevenlabs :voice-id])
:voice/stability 0.5
:voice/similarity-boost 0.8
:voice/use-speaker-boost? true
:pipeline/language language}}

;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
:audio-splitter {:proc transport/audio-splitter
:args {:audio.out/duration-ms chunk-duration-ms}}

;; speakers out
:transport-out {:proc transport-out/realtime-speakers-out-processor
:args {:audio.out/sending-interval chunk-duration-ms
:audio.out/duration-ms chunk-duration-ms}}

:activity-monitor {:proc activity-monitor/process
:args {::activity-monitor/timeout-ms 5000}}}
extra-procs)
:conns (concat
[[[:transport-in :out] [:transcriptor :in]]

[[:transcriptor :out] [:context-aggregator :in]]
[[:transport-in :sys-out] [:context-aggregator :sys-in]]
[[:context-aggregator :out] [:llm :in]]

;; Aggregate full context
[[:llm :out] [:assistant-context-assembler :in]]
[[:assistant-context-assembler :out] [:context-aggregator :in]]

;; Assemble sentence by sentence for fast speech
[[:llm :out] [:llm-sentence-assembler :in]]
[[:llm-sentence-assembler :sys-out] [:tts :sys-in]]

[[:tts :out] [:audio-splitter :in]]
[[:audio-splitter :out] [:transport-out :in]]

;; Activity detection
[[:transport-out :sys-out] [:activity-monitor :sys-in]]
[[:transport-in :sys-out] [:activity-monitor :sys-in]]
[[:transcriptor :sys-out] [:activity-monitor :sys-in]]
[[:activity-monitor :out] [:context-aggregator :in]]
[[:activity-monitor :out] [:tts :in]]]
extra-conns)})))
:tools
[{:type :function
:function
{:name "get_weather"
:handler (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))
:description "Get the current weather of a location"
:parameters {:type :object
:required [:town]
:properties {:town {:type :string
:description "Town for which to retrieve the current weather"}}
:additionalProperties false}
:strict true}}]}

language :en

debug? false
chunk-duration-ms 20
extra-procs {}
extra-conns []}} opts
provider (keyword (or (:pipeline/provider opts) :cloud))
base-url (:webgpu/base-url opts)
stt (:webgpu/stt opts)
llm (:webgpu/llm opts)
tts (:webgpu/tts opts)
webgpu-base (or base-url (System/getenv "WEBGPU_SERVICE_URL") "http://localhost:4173")
webgpu-common {:service/base-url webgpu-base}
stt-args (merge webgpu-common {:audio/sample-rate 16000} stt)
llm-args (merge webgpu-common llm)
tts-args (merge webgpu-common tts)
transcriptor (if (= provider :webgpu)
{:proc webgpu/speech-stt-process
:args stt-args}
{:proc deepgram/deepgram-processor
:args {:transcription/api-key (secret [:deepgram :api-key])
:transcription/interim-results? true
:transcription/punctuate? false
:transcription/vad-events? false
:transcription/smart-format? true
:transcription/model :nova-2
:transcription/utterance-end-ms 1000
:transcription/language language}})
llm-proc (if (= provider :webgpu)
{:proc webgpu/text-llm-process
:args llm-args}
{:proc openai/openai-llm-process
:args {:openai/api-key (secret [:openai :api-key])
:llm/model :gpt-4.1-mini}})
tts-proc (if (= provider :webgpu)
{:proc webgpu/speech-tts-process
:args (merge {:service/path "/tts"} tts-args)}
{:proc xi/elevenlabs-tts-process
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
:elevenlabs/model-id "eleven_flash_v2_5"
:elevenlabs/voice-id (secret [:elevenlabs :voice-id])
:voice/stability 0.5
:voice/similarity-boost 0.8
:voice/use-speaker-boost? true
:pipeline/language language}})]

(when (= provider :webgpu)
(t/log! {:level :info :id :simulflow.local}
(str "Using WebGPU service " webgpu-base " for STT/LLM/TTS")))

(flow/create-flow
{:procs
(u/deep-merge
{;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
:transport-in {:proc transport-in/microphone-transport-in
:args {:vad/analyser :vad.analyser/silero}}
;; raw-audio-input -> transcription frames
:transcriptor transcriptor

;; user transcription & llm message frames -> llm-context frames
;; responsible for keeping the full conversation history
:context-aggregator {:proc context/context-aggregator
:args {:llm/context llm-context
:aggregator/debug? debug?}}

;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
:llm llm-proc

;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
:assistant-context-assembler {:proc context/assistant-context-assembler
:args {:debug? debug?}}

;; llm-text-chunk -> sentence speak frames (faster for text to speech)
:llm-sentence-assembler {:proc context/llm-sentence-assembler}

;; speak-frames -> audio-output-raw frames
:tts tts-proc

;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
:audio-splitter {:proc transport/audio-splitter
:args {:audio.out/duration-ms chunk-duration-ms}}

;; speakers out
:transport-out {:proc transport-out/realtime-speakers-out-processor
:args {:audio.out/sending-interval chunk-duration-ms
:audio.out/duration-ms chunk-duration-ms}}

:activity-monitor {:proc activity-monitor/process
:args {::activity-monitor/timeout-ms 5000}}}
extra-procs)
:conns (concat
[[[:transport-in :out] [:transcriptor :in]]

[[:transcriptor :out] [:context-aggregator :in]]
[[:transport-in :sys-out] [:context-aggregator :sys-in]]
[[:context-aggregator :out] [:llm :in]]

;; Aggregate full context
[[:llm :out] [:assistant-context-assembler :in]]
[[:assistant-context-assembler :out] [:context-aggregator :in]]

;; Assemble sentence by sentence for fast speech
[[:llm :out] [:llm-sentence-assembler :in]]
[[:llm-sentence-assembler :sys-out] [:tts :sys-in]]

[[:tts :out] [:audio-splitter :in]]
[[:audio-splitter :out] [:transport-out :in]]

;; Activity detection
[[:transport-out :sys-out] [:activity-monitor :sys-in]]
[[:transport-in :sys-out] [:activity-monitor :sys-in]]
[[:transcriptor :sys-out] [:activity-monitor :sys-in]]
[[:activity-monitor :out] [:context-aggregator :in]]
[[:activity-monitor :out] [:tts :in]]]
extra-conns)})))
)
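
A note on the option handling above: each caller-supplied `:webgpu/*` map is merged last, so user keys win over both the shared base URL and the per-service defaults. A minimal sketch of that precedence, with illustrative values only:

```clojure
;; webgpu-common supplies the shared base URL, the middle map a
;; per-service default, and the caller-supplied map wins on conflicts.
(merge {:service/base-url "http://localhost:4173"} ; webgpu-common
       {:audio/sample-rate 16000}                  ; STT default
       {:audio/sample-rate 24000})                 ; caller override
;; => {:service/base-url "http://localhost:4173", :audio/sample-rate 24000}
```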

(comment
