From ca1526f8d890f763d14c146e3e2e599ddb17c2b0 Mon Sep 17 00:00:00 2001 From: Adir Amsalem Date: Mon, 16 Feb 2026 15:42:41 +0200 Subject: [PATCH 1/4] refactor(realtime): replace isAvatarLive flag with modelName for extensible model-specific behavior Replace the boolean isAvatarLive flag with a typed modelName parameter (RealTimeModels) throughout the WebRTC stack. This enables model-specific branching beyond a binary avatar/non-avatar check. Unify initial prompt delivery so all models send the prompt via WebSocket during the connection handshake, rather than only live_avatar doing so pre-handshake while other models sent it post-connection via setPrompt. Update the event replay test to match the new prompt delivery flow. --- packages/sdk/src/realtime/client.ts | 22 ++--- .../sdk/src/realtime/webrtc-connection.ts | 90 +++++++++++++++---- packages/sdk/src/realtime/webrtc-manager.ts | 5 +- packages/sdk/tests/unit.test.ts | 13 ++- 4 files changed, 96 insertions(+), 34 deletions(-) diff --git a/packages/sdk/src/realtime/client.ts b/packages/sdk/src/realtime/client.ts index 017239c..9d92f27 100644 --- a/packages/sdk/src/realtime/client.ts +++ b/packages/sdk/src/realtime/client.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { modelDefinitionSchema } from "../shared/model"; +import { modelDefinitionSchema, type RealTimeModels } from "../shared/model"; import { modelStateSchema } from "../shared/types"; import { createWebrtcError, type DecartSDKError } from "../utils/errors"; import { AudioStreamManager } from "./audio-stream-manager"; @@ -165,11 +165,13 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => { } } - // For live_avatar: prepare initial prompt to send before WebRTC handshake - const initialPrompt = - isAvatarLive && initialState?.prompt - ? { text: initialState.prompt.text, enhance: initialState.prompt.enhance } - : undefined; + // Prepare initial prompt to send via WebSocket before WebRTC handshake + const initialPrompt = initialState?.prompt + ? 
{ + text: initialState.prompt.text, + enhance: initialState.prompt.enhance, + } + : undefined; const url = `${baseUrl}${options.model.urlPath}`; @@ -189,7 +191,7 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => { customizeOffer: options.customizeOffer as ((offer: RTCSessionDescriptionInit) => Promise) | undefined, vp8MinBitrate: 300, vp8StartBitrate: 600, - isAvatarLive, + modelName: options.model.name as RealTimeModels, avatarImageBase64, initialPrompt, }); @@ -213,12 +215,6 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => { const methods = realtimeMethods(manager, imageToBase64); - // For non-live_avatar models: send initial prompt after connection is established - if (!isAvatarLive && initialState?.prompt) { - const { text, enhance } = initialState.prompt; - await methods.setPrompt(text, { enhance }); - } - const client: RealTimeClient = { set: methods.set, setPrompt: methods.setPrompt, diff --git a/packages/sdk/src/realtime/webrtc-connection.ts b/packages/sdk/src/realtime/webrtc-connection.ts index a412615..584a145 100644 --- a/packages/sdk/src/realtime/webrtc-connection.ts +++ b/packages/sdk/src/realtime/webrtc-connection.ts @@ -10,6 +10,7 @@ import type { SetImageAckMessage, TurnConfig, } from "./types"; +import type { RealTimeModels } from "../shared/model"; const ICE_SERVERS: RTCIceServer[] = [{ urls: "stun:stun.l.google.com:19302" }]; const AVATAR_SETUP_TIMEOUT_MS = 30_000; // 30 seconds @@ -21,7 +22,7 @@ interface ConnectionCallbacks { customizeOffer?: (offer: RTCSessionDescriptionInit) => Promise; vp8MinBitrate?: number; vp8StartBitrate?: number; - isAvatarLive?: boolean; + modelName?: RealTimeModels; avatarImageBase64?: string; initialPrompt?: { text: string; enhance?: boolean }; } @@ -42,7 +43,12 @@ export class WebRTCConnection { websocketMessagesEmitter = mitt(); constructor(private callbacks: ConnectionCallbacks = {}) {} - async connect(url: string, localStream: MediaStream | null, timeout: number, integration?: string): Promise { + async connect( + url: string, + localStream: MediaStream | null, + timeout: number, + integration?: string, + ): Promise { const deadline = Date.now() + timeout; this.localStream = localStream; @@ -66,7 +72,10 @@ export class WebRTCConnection { // Phase 1: WebSocket setup await Promise.race([ new Promise((resolve, reject) => { - const timer = setTimeout(() => reject(new Error("WebSocket timeout")), timeout); + const timer = setTimeout( + () => reject(new Error("WebSocket timeout")), + timeout, + ); this.ws = new WebSocket(wsUrl); this.ws.onopen = () => { @@ -89,7 +98,9 @@ export class WebRTCConnection { this.ws.onclose = () => { this.setState("disconnected"); clearTimeout(timer); - reject(new Error("WebSocket closed before connection was established")); + reject( + new Error("WebSocket closed before connection was established"), + ); rejectConnect(new Error("WebSocket closed")); }; }), @@ -99,10 +110,16 @@ export class WebRTCConnection { // Phase 2: Pre-handshake setup (avatar image + initial prompt) // connectionReject is already active, so ws.onclose or server errors abort these too if (this.callbacks.avatarImageBase64) { - await Promise.race([this.sendAvatarImage(this.callbacks.avatarImageBase64), connectAbort]); + await Promise.race([ + this.sendAvatarImage(this.callbacks.avatarImageBase64), + connectAbort, + ]); } if (this.callbacks.initialPrompt) { - await Promise.race([this.sendInitialPrompt(this.callbacks.initialPrompt), connectAbort]); + await Promise.race([ + 
this.sendInitialPrompt(this.callbacks.initialPrompt), + connectAbort, + ]); } // Phase 3: WebRTC handshake @@ -131,7 +148,9 @@ export class WebRTCConnection { } } - private async handleSignalingMessage(msg: IncomingWebRTCMessage): Promise { + private async handleSignalingMessage( + msg: IncomingWebRTCMessage, + ): Promise { try { // Handle messages that don't require peer connection first if (msg.type === "error") { @@ -260,7 +279,12 @@ export class WebRTCConnection { this.websocketMessagesEmitter.on("setImageAck", listener); - const message: { type: "set_image"; image_data: string | null; prompt?: string; enhance_prompt?: boolean } = { + const message: { + type: "set_image"; + image_data: string | null; + prompt?: string; + enhance_prompt?: boolean; + } = { type: "set_image", image_data: imageBase64, }; @@ -283,7 +307,10 @@ export class WebRTCConnection { /** * Send the initial prompt to the server before WebRTC handshake. */ - private async sendInitialPrompt(prompt: { text: string; enhance?: boolean }): Promise { + private async sendInitialPrompt(prompt: { + text: string; + enhance?: boolean; + }): Promise { return new Promise((resolve, reject) => { const timeoutId = setTimeout(() => { this.websocketMessagesEmitter.off("promptAck", listener); @@ -304,7 +331,13 @@ export class WebRTCConnection { this.websocketMessagesEmitter.on("promptAck", listener); - if (!this.send({ type: "prompt", prompt: prompt.text, enhance_prompt: prompt.enhance ?? true })) { + if ( + !this.send({ + type: "prompt", + prompt: prompt.text, + enhance_prompt: prompt.enhance ?? true, + }) + ) { clearTimeout(timeoutId); this.websocketMessagesEmitter.off("promptAck", listener); reject(new Error("WebSocket is not open")); @@ -341,7 +374,7 @@ export class WebRTCConnection { if (this.localStream) { // For live_avatar: add receive-only video transceiver (sends audio only, receives audio+video) - if (this.callbacks.isAvatarLive) { + if (this.callbacks.modelName === "live_avatar") { this.pc.addTransceiver("video", { direction: "recvonly" }); } @@ -375,7 +408,11 @@ export class WebRTCConnection { if (!this.pc) return; const s = this.pc.connectionState; const nextState = - s === "connected" ? "connected" : ["connecting", "new"].includes(s) ? "connecting" : "disconnected"; + s === "connected" + ? "connected" + : ["connecting", "new"].includes(s) + ? "connecting" + : "disconnected"; // Keep "generating" sticky unless the connection is actually lost. if (this.state === "generating" && nextState !== "disconnected") return; this.setState(nextState); @@ -406,16 +443,27 @@ export class WebRTCConnection { applyCodecPreference(preferredCodecName: "video/VP8" | "video/H264") { if (!this.pc) return; - if (typeof RTCRtpSender === "undefined" || typeof RTCRtpSender.getCapabilities !== "function") { - console.warn("RTCRtpSender capabilities are not available in this environment."); + if ( + typeof RTCRtpSender === "undefined" || + typeof RTCRtpSender.getCapabilities !== "function" + ) { + console.warn( + "RTCRtpSender capabilities are not available in this environment.", + ); return; } const videoTransceiver = this.pc .getTransceivers() - .find((r) => r.sender.track?.kind === "video" || r.receiver.track?.kind === "video"); + .find( + (r) => + r.sender.track?.kind === "video" || + r.receiver.track?.kind === "video", + ); if (!videoTransceiver) { - console.error("Could not find video transceiver. Ensure track is added to peer connection."); + console.error( + "Could not find video transceiver. 
Ensure track is added to peer connection.", + ); return; } @@ -443,7 +491,9 @@ export class WebRTCConnection { try { videoTransceiver.setCodecPreferences(orderedCodecs); } catch { - console.warn("[WebRTC] setCodecPreferences not supported, skipping codec preference."); + console.warn( + "[WebRTC] setCodecPreferences not supported, skipping codec preference.", + ); } } @@ -477,7 +527,11 @@ export class WebRTCConnection { let fmtpIndex = -1; let insertAfterIndex = i; // Default: insert after rtpmap line - for (let j = i + 1; j < sdpLines.length && sdpLines[j].startsWith("a="); j++) { + for ( + let j = i + 1; + j < sdpLines.length && sdpLines[j].startsWith("a="); + j++ + ) { // Check if fmtp already exists if (sdpLines[j].startsWith(`a=fmtp:${payloadType}`)) { fmtpIndex = j; diff --git a/packages/sdk/src/realtime/webrtc-manager.ts b/packages/sdk/src/realtime/webrtc-manager.ts index f2ae2c7..24ce7ca 100644 --- a/packages/sdk/src/realtime/webrtc-manager.ts +++ b/packages/sdk/src/realtime/webrtc-manager.ts @@ -1,6 +1,7 @@ import pRetry, { AbortError } from "p-retry"; import type { ConnectionState, OutgoingMessage } from "./types"; import { WebRTCConnection } from "./webrtc-connection"; +import type { RealTimeModels } from "../shared/model"; export interface WebRTCConfig { webrtcUrl: string; @@ -11,7 +12,7 @@ export interface WebRTCConfig { customizeOffer?: (offer: RTCSessionDescriptionInit) => Promise; vp8MinBitrate?: number; vp8StartBitrate?: number; - isAvatarLive?: boolean; + modelName?: RealTimeModels; avatarImageBase64?: string; initialPrompt?: { text: string; enhance?: boolean }; } @@ -54,7 +55,7 @@ export class WebRTCManager { customizeOffer: config.customizeOffer, vp8MinBitrate: config.vp8MinBitrate, vp8StartBitrate: config.vp8StartBitrate, - isAvatarLive: config.isAvatarLive, + modelName: config.modelName, avatarImageBase64: config.avatarImageBase64, initialPrompt: config.initialPrompt, }); diff --git a/packages/sdk/tests/unit.test.ts b/packages/sdk/tests/unit.test.ts index bcf4623..cc5bf6b 100644 --- a/packages/sdk/tests/unit.test.ts +++ b/packages/sdk/tests/unit.test.ts @@ -1705,11 +1705,22 @@ describe("WebSockets Connection", () => { const connectSpy = vi.spyOn(WebRTCManager.prototype, "connect").mockImplementation(async function () { const manager = this as unknown as { - config: { onConnectionStateChange?: (state: import("../src/realtime/types").ConnectionState) => void }; + config: { + onConnectionStateChange?: (state: import("../src/realtime/types").ConnectionState) => void; + initialPrompt?: { text: string; enhance?: boolean }; + }; managerState: import("../src/realtime/types").ConnectionState; }; manager.managerState = "connected"; manager.config.onConnectionStateChange?.("connected"); + + // Simulate initial prompt sent via WebSocket during connection setup + if (manager.config.initialPrompt) { + await new Promise((resolve) => setTimeout(resolve, 0)); + manager.managerState = "generating"; + manager.config.onConnectionStateChange?.("generating"); + } + return true; }); const stateSpy = vi.spyOn(WebRTCManager.prototype, "getConnectionState").mockImplementation(function () { From 487e14e2984483c5b7c306d36047124345c82c7e Mon Sep 17 00:00:00 2001 From: Adir Amsalem Date: Mon, 16 Feb 2026 15:44:30 +0200 Subject: [PATCH 2/4] style: apply Biome formatting and import ordering --- .../sdk/src/realtime/webrtc-connection.ts | 72 ++++--------------- packages/sdk/src/realtime/webrtc-manager.ts | 2 +- 2 files changed, 16 insertions(+), 58 deletions(-) diff --git 
a/packages/sdk/src/realtime/webrtc-connection.ts b/packages/sdk/src/realtime/webrtc-connection.ts index 584a145..c78836d 100644 --- a/packages/sdk/src/realtime/webrtc-connection.ts +++ b/packages/sdk/src/realtime/webrtc-connection.ts @@ -1,4 +1,5 @@ import mitt from "mitt"; +import type { RealTimeModels } from "../shared/model"; import { buildUserAgent } from "../utils/user-agent"; import type { ConnectionState, @@ -10,7 +11,6 @@ import type { SetImageAckMessage, TurnConfig, } from "./types"; -import type { RealTimeModels } from "../shared/model"; const ICE_SERVERS: RTCIceServer[] = [{ urls: "stun:stun.l.google.com:19302" }]; const AVATAR_SETUP_TIMEOUT_MS = 30_000; // 30 seconds @@ -43,12 +43,7 @@ export class WebRTCConnection { websocketMessagesEmitter = mitt(); constructor(private callbacks: ConnectionCallbacks = {}) {} - async connect( - url: string, - localStream: MediaStream | null, - timeout: number, - integration?: string, - ): Promise { + async connect(url: string, localStream: MediaStream | null, timeout: number, integration?: string): Promise { const deadline = Date.now() + timeout; this.localStream = localStream; @@ -72,10 +67,7 @@ export class WebRTCConnection { // Phase 1: WebSocket setup await Promise.race([ new Promise((resolve, reject) => { - const timer = setTimeout( - () => reject(new Error("WebSocket timeout")), - timeout, - ); + const timer = setTimeout(() => reject(new Error("WebSocket timeout")), timeout); this.ws = new WebSocket(wsUrl); this.ws.onopen = () => { @@ -98,9 +90,7 @@ export class WebRTCConnection { this.ws.onclose = () => { this.setState("disconnected"); clearTimeout(timer); - reject( - new Error("WebSocket closed before connection was established"), - ); + reject(new Error("WebSocket closed before connection was established")); rejectConnect(new Error("WebSocket closed")); }; }), @@ -110,16 +100,10 @@ export class WebRTCConnection { // Phase 2: Pre-handshake setup (avatar image + initial prompt) // connectionReject is already active, so ws.onclose or server errors abort these too if (this.callbacks.avatarImageBase64) { - await Promise.race([ - this.sendAvatarImage(this.callbacks.avatarImageBase64), - connectAbort, - ]); + await Promise.race([this.sendAvatarImage(this.callbacks.avatarImageBase64), connectAbort]); } if (this.callbacks.initialPrompt) { - await Promise.race([ - this.sendInitialPrompt(this.callbacks.initialPrompt), - connectAbort, - ]); + await Promise.race([this.sendInitialPrompt(this.callbacks.initialPrompt), connectAbort]); } // Phase 3: WebRTC handshake @@ -148,9 +132,7 @@ export class WebRTCConnection { } } - private async handleSignalingMessage( - msg: IncomingWebRTCMessage, - ): Promise { + private async handleSignalingMessage(msg: IncomingWebRTCMessage): Promise { try { // Handle messages that don't require peer connection first if (msg.type === "error") { @@ -307,10 +289,7 @@ export class WebRTCConnection { /** * Send the initial prompt to the server before WebRTC handshake. */ - private async sendInitialPrompt(prompt: { - text: string; - enhance?: boolean; - }): Promise { + private async sendInitialPrompt(prompt: { text: string; enhance?: boolean }): Promise { return new Promise((resolve, reject) => { const timeoutId = setTimeout(() => { this.websocketMessagesEmitter.off("promptAck", listener); @@ -408,11 +387,7 @@ export class WebRTCConnection { if (!this.pc) return; const s = this.pc.connectionState; const nextState = - s === "connected" - ? "connected" - : ["connecting", "new"].includes(s) - ? 
"connecting" - : "disconnected"; + s === "connected" ? "connected" : ["connecting", "new"].includes(s) ? "connecting" : "disconnected"; // Keep "generating" sticky unless the connection is actually lost. if (this.state === "generating" && nextState !== "disconnected") return; this.setState(nextState); @@ -443,27 +418,16 @@ export class WebRTCConnection { applyCodecPreference(preferredCodecName: "video/VP8" | "video/H264") { if (!this.pc) return; - if ( - typeof RTCRtpSender === "undefined" || - typeof RTCRtpSender.getCapabilities !== "function" - ) { - console.warn( - "RTCRtpSender capabilities are not available in this environment.", - ); + if (typeof RTCRtpSender === "undefined" || typeof RTCRtpSender.getCapabilities !== "function") { + console.warn("RTCRtpSender capabilities are not available in this environment."); return; } const videoTransceiver = this.pc .getTransceivers() - .find( - (r) => - r.sender.track?.kind === "video" || - r.receiver.track?.kind === "video", - ); + .find((r) => r.sender.track?.kind === "video" || r.receiver.track?.kind === "video"); if (!videoTransceiver) { - console.error( - "Could not find video transceiver. Ensure track is added to peer connection.", - ); + console.error("Could not find video transceiver. Ensure track is added to peer connection."); return; } @@ -491,9 +455,7 @@ export class WebRTCConnection { try { videoTransceiver.setCodecPreferences(orderedCodecs); } catch { - console.warn( - "[WebRTC] setCodecPreferences not supported, skipping codec preference.", - ); + console.warn("[WebRTC] setCodecPreferences not supported, skipping codec preference."); } } @@ -527,11 +489,7 @@ export class WebRTCConnection { let fmtpIndex = -1; let insertAfterIndex = i; // Default: insert after rtpmap line - for ( - let j = i + 1; - j < sdpLines.length && sdpLines[j].startsWith("a="); - j++ - ) { + for (let j = i + 1; j < sdpLines.length && sdpLines[j].startsWith("a="); j++) { // Check if fmtp already exists if (sdpLines[j].startsWith(`a=fmtp:${payloadType}`)) { fmtpIndex = j; diff --git a/packages/sdk/src/realtime/webrtc-manager.ts b/packages/sdk/src/realtime/webrtc-manager.ts index 24ce7ca..8d0399b 100644 --- a/packages/sdk/src/realtime/webrtc-manager.ts +++ b/packages/sdk/src/realtime/webrtc-manager.ts @@ -1,7 +1,7 @@ import pRetry, { AbortError } from "p-retry"; +import type { RealTimeModels } from "../shared/model"; import type { ConnectionState, OutgoingMessage } from "./types"; import { WebRTCConnection } from "./webrtc-connection"; -import type { RealTimeModels } from "../shared/model"; export interface WebRTCConfig { webrtcUrl: string; From cf5a21e4cbaed3efc7f25f513ae4aa26e97d3a0e Mon Sep 17 00:00:00 2001 From: Adir Amsalem Date: Mon, 16 Feb 2026 16:07:02 +0200 Subject: [PATCH 3/4] refactor(realtime): replace avatar option with initialState.image Remove the dedicated avatar/avatarImage connect option and AvatarOptions type. Image is now passed as initialState.image alongside prompt, using the more capable imageToBase64 utility (supports Blob, File, URL, data URL, raw base64). In the WebRTC connection Phase 2 pre-handshake: if image is provided, send via setImageBase64() which can carry prompt+enhance in the same message; otherwise fall back to sendInitialPrompt() for prompt-only. 
- Add image field to modelStateSchema (Blob | File | string) - Remove avatarOptionsSchema, AvatarOptions type, avatar connect option - Replace avatarImageBase64 with initialImage through the WebRTC stack - Remove sendAvatarImage() wrapper method - Update examples, README, and tests --- examples/sdk-core/README.md | 12 ++++++--- examples/sdk-core/realtime/live-avatar.ts | 8 ++---- packages/sdk/src/index.ts | 1 - packages/sdk/src/realtime/client.ts | 26 +++--------------- .../sdk/src/realtime/webrtc-connection.ts | 27 ++++++++----------- packages/sdk/src/realtime/webrtc-manager.ts | 4 +-- packages/sdk/src/shared/types.ts | 1 + packages/sdk/tests/unit.test.ts | 4 +-- 8 files changed, 30 insertions(+), 53 deletions(-) diff --git a/examples/sdk-core/README.md b/examples/sdk-core/README.md index ad24a49..d3b1059 100644 --- a/examples/sdk-core/README.md +++ b/examples/sdk-core/README.md @@ -114,8 +114,10 @@ realtimeClient.disconnect(); const realtimeClient = await client.realtime.connect(null, { model: models.realtime("live_avatar"), onRemoteStream: (videoStream) => { ... }, - avatar: { avatarImage: "https://example.com/avatar.png" }, - initialState: { prompt: { text: "A friendly assistant", enhance: true } }, + initialState: { + image: "https://example.com/avatar.png", + prompt: { text: "A friendly assistant", enhance: true }, + }, }); await realtimeClient.playAudio(audioBlob); @@ -124,7 +126,9 @@ const micStream = await navigator.mediaDevices.getUserMedia({ audio: true, video const realtimeClient = await client.realtime.connect(micStream, { model: models.realtime("live_avatar"), onRemoteStream: (videoStream) => { ... }, - avatar: { avatarImage: avatarFile }, - initialState: { prompt: { text: "A friendly assistant", enhance: true } }, + initialState: { + image: avatarFile, + prompt: { text: "A friendly assistant", enhance: true }, + }, }); ``` diff --git a/examples/sdk-core/realtime/live-avatar.ts b/examples/sdk-core/realtime/live-avatar.ts index 9d55584..10cb72e 100644 --- a/examples/sdk-core/realtime/live-avatar.ts +++ b/examples/sdk-core/realtime/live-avatar.ts @@ -22,10 +22,8 @@ async function withPlayAudio() { const video = document.getElementById("output") as HTMLVideoElement; video.srcObject = videoStream; }, - avatar: { - avatarImage: "https://example.com/avatar.png", // or File/Blob - }, initialState: { + image: "https://example.com/avatar.png", // or File/Blob prompt: { text: "A friendly assistant", enhance: true }, }, }); @@ -63,10 +61,8 @@ async function withMicInput() { const video = document.getElementById("output") as HTMLVideoElement; video.srcObject = videoStream; }, - avatar: { - avatarImage: "https://example.com/avatar.png", - }, initialState: { + image: "https://example.com/avatar.png", prompt: { text: "A friendly assistant", enhance: true }, }, }); diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index 0a86932..6f2b506 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -18,7 +18,6 @@ export type { QueueSubmitOptions, } from "./queue/types"; export type { - AvatarOptions, Events as RealTimeEvents, RealTimeClient, RealTimeClientConnectOptions, diff --git a/packages/sdk/src/realtime/client.ts b/packages/sdk/src/realtime/client.ts index 9d92f27..498dd12 100644 --- a/packages/sdk/src/realtime/client.ts +++ b/packages/sdk/src/realtime/client.ts @@ -80,11 +80,6 @@ export type RealTimeClientInitialState = z.infer(schema: T) => z.custom[0]>((fn) => schema.implementAsync(fn as Parameters[0])); -const avatarOptionsSchema = z.object({ - 
avatarImage: z.union([z.instanceof(Blob), z.instanceof(File), z.string()]), -}); -export type AvatarOptions = z.infer; - const realTimeClientConnectOptionsSchema = z.object({ model: modelDefinitionSchema, onRemoteStream: z.custom((val) => typeof val === "function", { @@ -92,7 +87,6 @@ const realTimeClientConnectOptionsSchema = z.object({ }), initialState: realTimeClientInitialStateSchema.optional(), customizeOffer: createAsyncFunctionSchema(z.function()).optional(), - avatar: avatarOptionsSchema.optional(), }); export type RealTimeClientConnectOptions = z.infer; @@ -133,7 +127,7 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => { const isAvatarLive = options.model.name === "live_avatar"; - const { onRemoteStream, initialState, avatar } = parsedOptions.data; + const { onRemoteStream, initialState } = parsedOptions.data; // For live_avatar without user-provided stream: create AudioStreamManager for continuous silent stream with audio injection // If user provides their own stream (e.g., mic input), use it directly @@ -150,20 +144,8 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => { let webrtcManager: WebRTCManager | undefined; try { - // For live_avatar: prepare avatar image base64 before connection - let avatarImageBase64: string | undefined; - if (isAvatarLive && avatar?.avatarImage) { - if (typeof avatar.avatarImage === "string") { - const response = await fetch(avatar.avatarImage); - if (!response.ok) { - throw new Error(`Failed to fetch image: ${response.status} ${response.statusText}`); - } - const imageBlob = await response.blob(); - avatarImageBase64 = await blobToBase64(imageBlob); - } else { - avatarImageBase64 = await blobToBase64(avatar.avatarImage); - } - } + // Prepare initial image base64 before connection + const initialImage = initialState?.image ? 
await imageToBase64(initialState.image) : undefined; // Prepare initial prompt to send via WebSocket before WebRTC handshake const initialPrompt = initialState?.prompt @@ -192,7 +174,7 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => { vp8MinBitrate: 300, vp8StartBitrate: 600, modelName: options.model.name as RealTimeModels, - avatarImageBase64, + initialImage, initialPrompt, }); diff --git a/packages/sdk/src/realtime/webrtc-connection.ts b/packages/sdk/src/realtime/webrtc-connection.ts index c78836d..000936c 100644 --- a/packages/sdk/src/realtime/webrtc-connection.ts +++ b/packages/sdk/src/realtime/webrtc-connection.ts @@ -23,7 +23,7 @@ interface ConnectionCallbacks { vp8MinBitrate?: number; vp8StartBitrate?: number; modelName?: RealTimeModels; - avatarImageBase64?: string; + initialImage?: string; initialPrompt?: { text: string; enhance?: boolean }; } @@ -97,12 +97,17 @@ export class WebRTCConnection { connectAbort, ]); - // Phase 2: Pre-handshake setup (avatar image + initial prompt) + // Phase 2: Pre-handshake setup (initial image and/or prompt) // connectionReject is already active, so ws.onclose or server errors abort these too - if (this.callbacks.avatarImageBase64) { - await Promise.race([this.sendAvatarImage(this.callbacks.avatarImageBase64), connectAbort]); - } - if (this.callbacks.initialPrompt) { + if (this.callbacks.initialImage) { + await Promise.race([ + this.setImageBase64(this.callbacks.initialImage, { + prompt: this.callbacks.initialPrompt?.text, + enhance: this.callbacks.initialPrompt?.enhance, + }), + connectAbort, + ]); + } else if (this.callbacks.initialPrompt) { await Promise.race([this.sendInitialPrompt(this.callbacks.initialPrompt), connectAbort]); } @@ -229,16 +234,6 @@ export class WebRTCConnection { return false; } - private async sendAvatarImage(imageBase64: string): Promise { - return this.setImageBase64(imageBase64); - } - - /** - * Send an image to the server (e.g., as a reference for inference). - * Can be called after connection is established. - * Pass null to clear the reference image or use a placeholder. - * Optionally include a prompt to send with the image. 
- */ async setImageBase64( imageBase64: string | null, options?: { prompt?: string; enhance?: boolean; timeout?: number }, diff --git a/packages/sdk/src/realtime/webrtc-manager.ts b/packages/sdk/src/realtime/webrtc-manager.ts index 8d0399b..b29b276 100644 --- a/packages/sdk/src/realtime/webrtc-manager.ts +++ b/packages/sdk/src/realtime/webrtc-manager.ts @@ -13,7 +13,7 @@ export interface WebRTCConfig { vp8MinBitrate?: number; vp8StartBitrate?: number; modelName?: RealTimeModels; - avatarImageBase64?: string; + initialImage?: string; initialPrompt?: { text: string; enhance?: boolean }; } @@ -56,7 +56,7 @@ export class WebRTCManager { vp8MinBitrate: config.vp8MinBitrate, vp8StartBitrate: config.vp8StartBitrate, modelName: config.modelName, - avatarImageBase64: config.avatarImageBase64, + initialImage: config.initialImage, initialPrompt: config.initialPrompt, }); } diff --git a/packages/sdk/src/shared/types.ts b/packages/sdk/src/shared/types.ts index c725c93..37f7d0a 100644 --- a/packages/sdk/src/shared/types.ts +++ b/packages/sdk/src/shared/types.ts @@ -7,5 +7,6 @@ export const modelStateSchema = z.object({ enhance: z.boolean().optional().default(true), }) .optional(), + image: z.union([z.instanceof(Blob), z.instanceof(File), z.string()]).optional(), }); export type ModelState = z.infer; diff --git a/packages/sdk/tests/unit.test.ts b/packages/sdk/tests/unit.test.ts index cc5bf6b..eecc208 100644 --- a/packages/sdk/tests/unit.test.ts +++ b/packages/sdk/tests/unit.test.ts @@ -1084,7 +1084,7 @@ describe("WebRTCConnection", () => { }); describe("RealTimeClient cleanup", () => { - it("cleans up AudioStreamManager when avatar fetch fails before WebRTC connect", async () => { + it("cleans up AudioStreamManager when initial image fetch fails before WebRTC connect", async () => { class FakeAudioContext { createMediaStreamDestination() { return { stream: {} }; } @@ -1121,7 +1121,7 @@ describe("RealTimeClient cleanup", () => { realtime.connect(null, { model: models.realtime("live_avatar"), onRemoteStream: vi.fn(), - avatar: { avatarImage: "https://example.com/avatar.png" }, + initialState: { image: "https://example.com/avatar.png" }, }), ).rejects.toThrow("Failed to fetch image: 404 Not Found"); From 4a2b43298f83222d28a50c94acf6e0301923b2df Mon Sep 17 00:00:00 2001 From: Adir Amsalem Date: Tue, 17 Feb 2026 14:26:06 +0200 Subject: [PATCH 4/4] feat(realtime): load initial reference image for specific models Load an initial reference image for the 'lucy_2_rt' and 'mirage_v2' models in the SDK demo page during connection setup. The image is fetched as a Blob and passed to realtime.connect as initialState.image alongside the prompt. --- packages/sdk/index.html | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/sdk/index.html b/packages/sdk/index.html index fbe0c4c..fe1fd0d 100644 --- a/packages/sdk/index.html +++ b/packages/sdk/index.html @@ -498,6 +498,13 @@

Console Logs

addLog('Connecting to Decart server...', 'info'); + // Load initial reference image for models that support it + let initialImage; + if (model.name === 'lucy_2_rt' || model.name === 'mirage_v2') { + const initialImageResponse = await fetch('./tests/fixtures/image.png'); + initialImage = await initialImageResponse.blob(); + } + decartRealtime = await decartClient.realtime.connect(localStream, { model, onRemoteStream: (stream) => { @@ -507,7 +514,8 @@

Console Logs

initialState: { prompt: { text: "Lego World", - } + }, + ...(initialImage && { image: initialImage }), } });
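
Usage sketch of the connect call once this series lands, assuming a `client` and `models` set up as in the README examples above; the model name, image URL, prompt text, and element id are placeholders:

```ts
// After this series: the reference image rides on initialState.image
// (Blob | File | URL string | data URL | raw base64) instead of the removed
// avatar.avatarImage option, and the initial prompt is delivered over the
// WebSocket during the connection handshake for every model, not just live_avatar.
const micStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });

const imageResponse = await fetch("https://example.com/reference.png"); // placeholder URL
const image = await imageResponse.blob();

const realtimeClient = await client.realtime.connect(micStream, {
  model: models.realtime("live_avatar"),
  onRemoteStream: (videoStream) => {
    const video = document.getElementById("output") as HTMLVideoElement; // placeholder element id
    video.srcObject = videoStream;
  },
  initialState: {
    image, // also accepts a File, a URL string, a data URL, or raw base64
    prompt: { text: "A friendly assistant", enhance: true },
  },
});
```

Because the image and prompt are sent and acknowledged over the WebSocket in the pre-handshake phase, they are in place before the WebRTC handshake completes, so no follow-up setPrompt call is needed after connect returns.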