Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions examples/sdk-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,10 @@ realtimeClient.disconnect();
const realtimeClient = await client.realtime.connect(null, {
model: models.realtime("live_avatar"),
onRemoteStream: (videoStream) => { ... },
avatar: { avatarImage: "https://example.com/avatar.png" },
initialState: { prompt: { text: "A friendly assistant", enhance: true } },
initialState: {
image: "https://example.com/avatar.png",
prompt: { text: "A friendly assistant", enhance: true },
},
});
await realtimeClient.playAudio(audioBlob);

Expand All @@ -124,7 +126,9 @@ const micStream = await navigator.mediaDevices.getUserMedia({ audio: true, video
const realtimeClient = await client.realtime.connect(micStream, {
model: models.realtime("live_avatar"),
onRemoteStream: (videoStream) => { ... },
avatar: { avatarImage: avatarFile },
initialState: { prompt: { text: "A friendly assistant", enhance: true } },
initialState: {
image: avatarFile,
prompt: { text: "A friendly assistant", enhance: true },
},
});
```
8 changes: 2 additions & 6 deletions examples/sdk-core/realtime/live-avatar.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@ async function withPlayAudio() {
const video = document.getElementById("output") as HTMLVideoElement;
video.srcObject = videoStream;
},
avatar: {
avatarImage: "https://example.com/avatar.png", // or File/Blob
},
initialState: {
image: "https://example.com/avatar.png", // or File/Blob
prompt: { text: "A friendly assistant", enhance: true },
},
});
Expand Down Expand Up @@ -63,10 +61,8 @@ async function withMicInput() {
const video = document.getElementById("output") as HTMLVideoElement;
video.srcObject = videoStream;
},
avatar: {
avatarImage: "https://example.com/avatar.png",
},
initialState: {
image: "https://example.com/avatar.png",
prompt: { text: "A friendly assistant", enhance: true },
},
});
Expand Down
10 changes: 9 additions & 1 deletion packages/sdk/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,13 @@ <h3>Console Logs</h3>

addLog('Connecting to Decart server...', 'info');

// Load initial reference image for models that support it
let initialImage;
if (model.name === 'lucy_2_rt' || model.name === 'mirage_v2') {
const initialImageResponse = await fetch('./tests/fixtures/image.png');
initialImage = await initialImageResponse.blob();
}

decartRealtime = await decartClient.realtime.connect(localStream, {
model,
onRemoteStream: (stream) => {
Expand All @@ -507,7 +514,8 @@ <h3>Console Logs</h3>
initialState: {
prompt: {
text: "Lego World",
}
},
...(initialImage && { image: initialImage }),
}
});

Expand Down
1 change: 0 additions & 1 deletion packages/sdk/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ export type {
QueueSubmitOptions,
} from "./queue/types";
export type {
AvatarOptions,
Events as RealTimeEvents,
RealTimeClient,
RealTimeClientConnectOptions,
Expand Down
48 changes: 13 additions & 35 deletions packages/sdk/src/realtime/client.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { z } from "zod";
import { modelDefinitionSchema } from "../shared/model";
import { modelDefinitionSchema, type RealTimeModels } from "../shared/model";
import { modelStateSchema } from "../shared/types";
import { createWebrtcError, type DecartSDKError } from "../utils/errors";
import { AudioStreamManager } from "./audio-stream-manager";
Expand Down Expand Up @@ -80,19 +80,13 @@ export type RealTimeClientInitialState = z.infer<typeof realTimeClientInitialSta
const createAsyncFunctionSchema = <T extends z.core.$ZodFunction>(schema: T) =>
z.custom<Parameters<T["implementAsync"]>[0]>((fn) => schema.implementAsync(fn as Parameters<T["implementAsync"]>[0]));

const avatarOptionsSchema = z.object({
avatarImage: z.union([z.instanceof(Blob), z.instanceof(File), z.string()]),
});
export type AvatarOptions = z.infer<typeof avatarOptionsSchema>;

const realTimeClientConnectOptionsSchema = z.object({
model: modelDefinitionSchema,
onRemoteStream: z.custom<OnRemoteStreamFn>((val) => typeof val === "function", {
message: "onRemoteStream must be a function",
}),
initialState: realTimeClientInitialStateSchema.optional(),
customizeOffer: createAsyncFunctionSchema(z.function()).optional(),
avatar: avatarOptionsSchema.optional(),
});
export type RealTimeClientConnectOptions = z.infer<typeof realTimeClientConnectOptionsSchema>;

Expand Down Expand Up @@ -133,7 +127,7 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => {

const isAvatarLive = options.model.name === "live_avatar";

const { onRemoteStream, initialState, avatar } = parsedOptions.data;
const { onRemoteStream, initialState } = parsedOptions.data;

// For live_avatar without user-provided stream: create AudioStreamManager for continuous silent stream with audio injection
// If user provides their own stream (e.g., mic input), use it directly
Expand All @@ -150,26 +144,16 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => {
let webrtcManager: WebRTCManager | undefined;

try {
// For live_avatar: prepare avatar image base64 before connection
let avatarImageBase64: string | undefined;
if (isAvatarLive && avatar?.avatarImage) {
if (typeof avatar.avatarImage === "string") {
const response = await fetch(avatar.avatarImage);
if (!response.ok) {
throw new Error(`Failed to fetch image: ${response.status} ${response.statusText}`);
// Prepare initial image base64 before connection
const initialImage = initialState?.image ? await imageToBase64(initialState.image) : undefined;

// Prepare initial prompt to send via WebSocket before WebRTC handshake
const initialPrompt = initialState?.prompt
? {
text: initialState.prompt.text,
enhance: initialState.prompt.enhance,
}
const imageBlob = await response.blob();
avatarImageBase64 = await blobToBase64(imageBlob);
} else {
avatarImageBase64 = await blobToBase64(avatar.avatarImage);
}
}

// For live_avatar: prepare initial prompt to send before WebRTC handshake
const initialPrompt =
isAvatarLive && initialState?.prompt
? { text: initialState.prompt.text, enhance: initialState.prompt.enhance }
: undefined;
: undefined;

const url = `${baseUrl}${options.model.urlPath}`;

Expand All @@ -189,8 +173,8 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => {
customizeOffer: options.customizeOffer as ((offer: RTCSessionDescriptionInit) => Promise<void>) | undefined,
vp8MinBitrate: 300,
vp8StartBitrate: 600,
isAvatarLive,
avatarImageBase64,
modelName: options.model.name as RealTimeModels,
initialImage,
initialPrompt,
});

Expand All @@ -213,12 +197,6 @@ export const createRealTimeClient = (opts: RealTimeClientOptions) => {

const methods = realtimeMethods(manager, imageToBase64);

// For non-live_avatar models: send initial prompt after connection is established
if (!isAvatarLive && initialState?.prompt) {
const { text, enhance } = initialState.prompt;
await methods.setPrompt(text, { enhance });
}

const client: RealTimeClient = {
set: methods.set,
setPrompt: methods.setPrompt,
Expand Down
47 changes: 27 additions & 20 deletions packages/sdk/src/realtime/webrtc-connection.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import mitt from "mitt";
import type { RealTimeModels } from "../shared/model";
import { buildUserAgent } from "../utils/user-agent";
import type {
ConnectionState,
Expand All @@ -21,8 +22,8 @@ interface ConnectionCallbacks {
customizeOffer?: (offer: RTCSessionDescriptionInit) => Promise<void>;
vp8MinBitrate?: number;
vp8StartBitrate?: number;
isAvatarLive?: boolean;
avatarImageBase64?: string;
modelName?: RealTimeModels;
initialImage?: string;
initialPrompt?: { text: string; enhance?: boolean };
}

Expand Down Expand Up @@ -96,12 +97,17 @@ export class WebRTCConnection {
connectAbort,
]);

// Phase 2: Pre-handshake setup (avatar image + initial prompt)
// Phase 2: Pre-handshake setup (initial image and/or prompt)
// connectionReject is already active, so ws.onclose or server errors abort these too
if (this.callbacks.avatarImageBase64) {
await Promise.race([this.sendAvatarImage(this.callbacks.avatarImageBase64), connectAbort]);
}
if (this.callbacks.initialPrompt) {
if (this.callbacks.initialImage) {
await Promise.race([
this.setImageBase64(this.callbacks.initialImage, {
prompt: this.callbacks.initialPrompt?.text,
enhance: this.callbacks.initialPrompt?.enhance,
}),
connectAbort,
]);
} else if (this.callbacks.initialPrompt) {
await Promise.race([this.sendInitialPrompt(this.callbacks.initialPrompt), connectAbort]);
}

Expand Down Expand Up @@ -228,16 +234,6 @@ export class WebRTCConnection {
return false;
}

private async sendAvatarImage(imageBase64: string): Promise<void> {
return this.setImageBase64(imageBase64);
}

/**
* Send an image to the server (e.g., as a reference for inference).
* Can be called after connection is established.
* Pass null to clear the reference image or use a placeholder.
* Optionally include a prompt to send with the image.
*/
async setImageBase64(
imageBase64: string | null,
options?: { prompt?: string; enhance?: boolean; timeout?: number },
Expand All @@ -260,7 +256,12 @@ export class WebRTCConnection {

this.websocketMessagesEmitter.on("setImageAck", listener);

const message: { type: "set_image"; image_data: string | null; prompt?: string; enhance_prompt?: boolean } = {
const message: {
type: "set_image";
image_data: string | null;
prompt?: string;
enhance_prompt?: boolean;
} = {
type: "set_image",
image_data: imageBase64,
};
Expand Down Expand Up @@ -304,7 +305,13 @@ export class WebRTCConnection {

this.websocketMessagesEmitter.on("promptAck", listener);

if (!this.send({ type: "prompt", prompt: prompt.text, enhance_prompt: prompt.enhance ?? true })) {
if (
!this.send({
type: "prompt",
prompt: prompt.text,
enhance_prompt: prompt.enhance ?? true,
})
) {
clearTimeout(timeoutId);
this.websocketMessagesEmitter.off("promptAck", listener);
reject(new Error("WebSocket is not open"));
Expand Down Expand Up @@ -341,7 +348,7 @@ export class WebRTCConnection {

if (this.localStream) {
// For live_avatar: add receive-only video transceiver (sends audio only, receives audio+video)
if (this.callbacks.isAvatarLive) {
if (this.callbacks.modelName === "live_avatar") {
this.pc.addTransceiver("video", { direction: "recvonly" });
}

Expand Down
9 changes: 5 additions & 4 deletions packages/sdk/src/realtime/webrtc-manager.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pRetry, { AbortError } from "p-retry";
import type { RealTimeModels } from "../shared/model";
import type { ConnectionState, OutgoingMessage } from "./types";
import { WebRTCConnection } from "./webrtc-connection";

Expand All @@ -11,8 +12,8 @@ export interface WebRTCConfig {
customizeOffer?: (offer: RTCSessionDescriptionInit) => Promise<void>;
vp8MinBitrate?: number;
vp8StartBitrate?: number;
isAvatarLive?: boolean;
avatarImageBase64?: string;
modelName?: RealTimeModels;
initialImage?: string;
initialPrompt?: { text: string; enhance?: boolean };
}

Expand Down Expand Up @@ -54,8 +55,8 @@ export class WebRTCManager {
customizeOffer: config.customizeOffer,
vp8MinBitrate: config.vp8MinBitrate,
vp8StartBitrate: config.vp8StartBitrate,
isAvatarLive: config.isAvatarLive,
avatarImageBase64: config.avatarImageBase64,
modelName: config.modelName,
initialImage: config.initialImage,
initialPrompt: config.initialPrompt,
});
}
Expand Down
1 change: 1 addition & 0 deletions packages/sdk/src/shared/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ export const modelStateSchema = z.object({
enhance: z.boolean().optional().default(true),
})
.optional(),
image: z.union([z.instanceof(Blob), z.instanceof(File), z.string()]).optional(),
});
export type ModelState = z.infer<typeof modelStateSchema>;
17 changes: 14 additions & 3 deletions packages/sdk/tests/unit.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1084,7 +1084,7 @@ describe("WebRTCConnection", () => {
});

describe("RealTimeClient cleanup", () => {
it("cleans up AudioStreamManager when avatar fetch fails before WebRTC connect", async () => {
it("cleans up AudioStreamManager when initial image fetch fails before WebRTC connect", async () => {
class FakeAudioContext {
createMediaStreamDestination() {
return { stream: {} };
Expand Down Expand Up @@ -1121,7 +1121,7 @@ describe("RealTimeClient cleanup", () => {
realtime.connect(null, {
model: models.realtime("live_avatar"),
onRemoteStream: vi.fn(),
avatar: { avatarImage: "https://example.com/avatar.png" },
initialState: { image: "https://example.com/avatar.png" },
}),
).rejects.toThrow("Failed to fetch image: 404 Not Found");

Expand Down Expand Up @@ -1705,11 +1705,22 @@ describe("WebSockets Connection", () => {

const connectSpy = vi.spyOn(WebRTCManager.prototype, "connect").mockImplementation(async function () {
const manager = this as unknown as {
config: { onConnectionStateChange?: (state: import("../src/realtime/types").ConnectionState) => void };
config: {
onConnectionStateChange?: (state: import("../src/realtime/types").ConnectionState) => void;
initialPrompt?: { text: string; enhance?: boolean };
};
managerState: import("../src/realtime/types").ConnectionState;
};
manager.managerState = "connected";
manager.config.onConnectionStateChange?.("connected");

// Simulate initial prompt sent via WebSocket during connection setup
if (manager.config.initialPrompt) {
await new Promise((resolve) => setTimeout(resolve, 0));
manager.managerState = "generating";
manager.config.onConnectionStateChange?.("generating");
}

return true;
});
const stateSpy = vi.spyOn(WebRTCManager.prototype, "getConnectionState").mockImplementation(function () {
Expand Down
Loading