9 changes: 9 additions & 0 deletions README.md
@@ -253,6 +253,15 @@ Allow Microphone access when prompted.
,)
```

You can run the same flow against the local WebGPU microservice (see
`webgpu-local-pipeline/`) by passing `:pipeline/provider :webgpu` and, if
necessary, a custom `:webgpu/base-url`:

```clojure
(make-local-flow {:pipeline/provider :webgpu
:webgpu/base-url "http://localhost:4173"})
```

Which roughly translates to:

![Flow Diagram](./resources/flow.png)
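
The constructor returns a core.async flow. Below is a minimal REPL sketch for driving it, assuming the standard `clojure.core.async.flow` lifecycle helpers (`start`, `resume`, `stop`) and a WebGPU service already listening on the default port:

```clojure
(require '[clojure.core.async.flow :as flow])

;; Assumes the local WebGPU microservice is already running on port 4173.
(def local-flow
  (make-local-flow {:pipeline/provider :webgpu
                    :webgpu/base-url "http://localhost:4173"}))

(flow/start local-flow)  ;; allocate processes and channels
(flow/resume local-flow) ;; begin streaming microphone audio through the pipeline
;; ...talk to the agent...
(flow/stop local-flow)   ;; shut the pipeline down
```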
10 changes: 9 additions & 1 deletion bin/chat
@@ -27,7 +27,7 @@ fi
if [ ! -f "resources/secrets.edn" ]; then
print_color $RED "❌ Error: resources/secrets.edn not found"
print_color $YELLOW "Please create resources/secrets.edn with your OpenAI API key:"
echo '{:openai {:new-api-sk "your-openai-api-key-here"}}'
echo '{:openai {:api-key "your-openai-api-key-here"}}'
exit 1
fi

@@ -45,6 +45,10 @@ while [[ $# -gt 0 ]]; do
ARGS="$ARGS --scenario"
shift
;;
--webgpu)
ARGS="$ARGS --webgpu"
shift
;;
--help|-h)
HELP=true
shift
@@ -95,6 +99,10 @@ if [[ "$ARGS" == *"--scenario"* ]]; then
print_color $YELLOW "🎭 Scenario mode enabled"
fi

if [[ "$ARGS" == *"--webgpu"* ]]; then
print_color $YELLOW "🧠 Using local WebGPU service"
fi

print_color $GREEN "🔧 Loading Clojure dependencies..."

# Run the chat demo
243 changes: 136 additions & 107 deletions examples/src/simulflow_examples/local.clj
@@ -15,6 +15,7 @@
[simulflow.transport.out :as transport-out]
[simulflow.utils.core :as u]
[simulflow.vad.silero :as silero]
[simulflow-examples.webgpu-processors :as webgpu]
[taoensso.telemere :as t]))

(t/set-min-level! :debug)
@@ -28,117 +29,145 @@
these are the defaults. See each process for details
"
([] (make-local-flow {}))
([{:keys [llm-context extra-procs extra-conns debug?
language chunk-duration-ms]
:or {llm-context {:messages
[{:role "system"
:content "You are a voice agent operating via phone. Be
([opts]
(let [{:keys [llm-context extra-procs extra-conns debug?
language chunk-duration-ms]
:or {llm-context {:messages
[{:role "system"
:content "You are a voice agent operating via phone. Be
concise in your answers. The input you receive comes from a
speech-to-text (transcription) system that isn't always
efficient and may send unclear text. Ask for
clarification when you're unsure what the person said."}]
:tools
[{:type :function
:function
{:name "get_weather"
:handler (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))
:description "Get the current weather of a location"
:parameters {:type :object
:required [:town]
:properties {:town {:type :string
:description "Town for which to retrieve the current weather"}}
:additionalProperties false}
:strict true}}]}

language :en

debug? false
chunk-duration-ms 20
extra-procs {}
extra-conns []}}]

(flow/create-flow
{:procs
(u/deep-merge
{;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
:transport-in {:proc transport-in/microphone-transport-in
:args {:vad/analyser :vad.analyser/silero}}
;; raw-audio-input -> transcription frames
:transcriptor {:proc deepgram/deepgram-processor
:args {:transcription/api-key (secret [:deepgram :api-key])
:transcription/interim-results? true
:transcription/punctuate? false
:transcription/vad-events? false
:transcription/smart-format? true
:transcription/model :nova-2
:transcription/utterance-end-ms 1000
:transcription/language language}}

;; user transcription & llm message frames -> llm-context frames
;; responsible for keeping the full conversation history
:context-aggregator {:proc context/context-aggregator
:args {:llm/context llm-context
:aggregator/debug? debug?}}

;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
:llm {:proc openai/openai-llm-process
:args {:openai/api-key (secret [:openai :new-api-sk])
:llm/model :gpt-4.1-mini}}

;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
:assistant-context-assembler {:proc context/assistant-context-assembler
:args {:debug? debug?}}

;; llm-text-chunk -> sentence speak frames (faster for text to speech)
:llm-sentence-assembler {:proc context/llm-sentence-assembler}

;; speak-frames -> audio-output-raw frames
:tts {:proc xi/elevenlabs-tts-process
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
:elevenlabs/model-id "eleven_flash_v2_5"
:elevenlabs/voice-id (secret [:elevenlabs :voice-id])
:voice/stability 0.5
:voice/similarity-boost 0.8
:voice/use-speaker-boost? true
:pipeline/language language}}

;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
:audio-splitter {:proc transport/audio-splitter
:args {:audio.out/duration-ms chunk-duration-ms}}

;; speakers out
:transport-out {:proc transport-out/realtime-speakers-out-processor
:args {:audio.out/sending-interval chunk-duration-ms
:audio.out/duration-ms chunk-duration-ms}}

:activity-monitor {:proc activity-monitor/process
:args {::activity-monitor/timeout-ms 5000}}}
extra-procs)
:conns (concat
[[[:transport-in :out] [:transcriptor :in]]

[[:transcriptor :out] [:context-aggregator :in]]
[[:transport-in :sys-out] [:context-aggregator :sys-in]]
[[:context-aggregator :out] [:llm :in]]

;; Aggregate full context
[[:llm :out] [:assistant-context-assembler :in]]
[[:assistant-context-assembler :out] [:context-aggregator :in]]

;; Assemble sentence by sentence for fast speech
[[:llm :out] [:llm-sentence-assembler :in]]
[[:llm-sentence-assembler :sys-out] [:tts :sys-in]]

[[:tts :out] [:audio-splitter :in]]
[[:audio-splitter :out] [:transport-out :in]]

;; Activity detection
[[:transport-out :sys-out] [:activity-monitor :sys-in]]
[[:transport-in :sys-out] [:activity-monitor :sys-in]]
[[:transcriptor :sys-out] [:activity-monitor :sys-in]]
[[:activity-monitor :out] [:context-aggregator :in]]
[[:activity-monitor :out] [:tts :in]]]
extra-conns)})))
:tools
[{:type :function
:function
{:name "get_weather"
:handler (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))
:description "Get the current weather of a location"
:parameters {:type :object
:required [:town]
:properties {:town {:type :string
:description "Town for which to retrieve the current weather"}}
:additionalProperties false}
:strict true}}]}

language :en

debug? false
chunk-duration-ms 20
extra-procs {}
extra-conns []}} opts
provider (keyword (or (:pipeline/provider opts) :cloud))
base-url (:webgpu/base-url opts)
stt (:webgpu/stt opts)
llm (:webgpu/llm opts)
tts (:webgpu/tts opts)
webgpu-base (or base-url (System/getenv "WEBGPU_SERVICE_URL") "http://localhost:4173")
webgpu-common {:service/base-url webgpu-base}
stt-args (merge webgpu-common {:audio/sample-rate 16000} stt)
llm-args (merge webgpu-common llm)
tts-args (merge webgpu-common tts)
transcriptor (if (= provider :webgpu)
{:proc webgpu/speech-stt-process
:args stt-args}
{:proc deepgram/deepgram-processor
:args {:transcription/api-key (secret [:deepgram :api-key])
:transcription/interim-results? true
:transcription/punctuate? false
:transcription/vad-events? false
:transcription/smart-format? true
:transcription/model :nova-2
:transcription/utterance-end-ms 1000
:transcription/language language}})
llm-proc (if (= provider :webgpu)
{:proc webgpu/text-llm-process
:args llm-args}
{:proc openai/openai-llm-process
:args {:openai/api-key (secret [:openai :api-key])
:llm/model :gpt-4.1-mini}})
tts-proc (if (= provider :webgpu)
{:proc webgpu/speech-tts-process
:args (merge {:service/path "/tts"} tts-args)}
{:proc xi/elevenlabs-tts-process
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
:elevenlabs/model-id "eleven_flash_v2_5"
:elevenlabs/voice-id (secret [:elevenlabs :voice-id])
:voice/stability 0.5
:voice/similarity-boost 0.8
:voice/use-speaker-boost? true
:pipeline/language language}})]

(when (= provider :webgpu)
(t/log! {:level :info :id :simulflow.local}
(str "Using WebGPU service " webgpu-base " for STT/LLM/TTS")))

(flow/create-flow
{:procs
(u/deep-merge
{;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
:transport-in {:proc transport-in/microphone-transport-in
:args {:vad/analyser :vad.analyser/silero}}
;; raw-audio-input -> transcription frames
:transcriptor transcriptor

;; user transcription & llm message frames -> llm-context frames
;; responsible for keeping the full conversation history
:context-aggregator {:proc context/context-aggregator
:args {:llm/context llm-context
:aggregator/debug? debug?}}

;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
:llm llm-proc

;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
:assistant-context-assembler {:proc context/assistant-context-assembler
:args {:debug? debug?}}

;; llm-text-chunk -> sentence speak frames (faster for text to speech)
:llm-sentence-assembler {:proc context/llm-sentence-assembler}

;; speak-frames -> audio-output-raw frames
:tts tts-proc

;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
:audio-splitter {:proc transport/audio-splitter
:args {:audio.out/duration-ms chunk-duration-ms}}

;; speakers out
:transport-out {:proc transport-out/realtime-speakers-out-processor
:args {:audio.out/sending-interval chunk-duration-ms
:audio.out/duration-ms chunk-duration-ms}}

:activity-monitor {:proc activity-monitor/process
:args {::activity-monitor/timeout-ms 5000}}}
extra-procs)
:conns (concat
[[[:transport-in :out] [:transcriptor :in]]

[[:transcriptor :out] [:context-aggregator :in]]
[[:transport-in :sys-out] [:context-aggregator :sys-in]]
[[:context-aggregator :out] [:llm :in]]

;; Aggregate full context
[[:llm :out] [:assistant-context-assembler :in]]
[[:assistant-context-assembler :out] [:context-aggregator :in]]

;; Assemble sentence by sentence for fast speech
[[:llm :out] [:llm-sentence-assembler :in]]
[[:llm-sentence-assembler :sys-out] [:tts :sys-in]]

[[:tts :out] [:audio-splitter :in]]
[[:audio-splitter :out] [:transport-out :in]]

;; Activity detection
[[:transport-out :sys-out] [:activity-monitor :sys-in]]
[[:transport-in :sys-out] [:activity-monitor :sys-in]]
[[:transcriptor :sys-out] [:activity-monitor :sys-in]]
[[:activity-monitor :out] [:context-aggregator :in]]
[[:activity-monitor :out] [:tts :in]]]
extra-conns)})))
)
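
A note on the option handling above: each caller-supplied `:webgpu/*` map is merged last, so user keys win over both the shared base URL and the per-service defaults. A minimal sketch of that precedence, with illustrative values only:

```clojure
;; webgpu-common supplies the shared base URL, the middle map a
;; per-service default, and the caller-supplied map wins on conflicts.
(merge {:service/base-url "http://localhost:4173"} ; webgpu-common
       {:audio/sample-rate 16000}                  ; STT default
       {:audio/sample-rate 24000})                 ; caller override
;; => {:service/base-url "http://localhost:4173", :audio/sample-rate 24000}
```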

(comment
