From 02e782a0cd08186b9428d7fadafcacb2103cd0b4 Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 25 Jun 2026 16:06:27 -0700 Subject: [PATCH 1/3] feat(phonic): stream ahead of real time at native pcm_24000 Bump phonic to 0.32.5 and enable stream_ahead_of_real_time so assistant audio is sent to the client as soon as it is generated. Switch the input and output formats to pcm_24000 (the STS buffer's native rate) to avoid resampling. Drop the ConfigOptions cast now that stream_ahead_of_real_time is typed, and align tool definitions with the 0.32.5 ToolDefinition types. Co-authored-by: Cursor --- plugins/phonic/package.json | 2 +- plugins/phonic/src/realtime/realtime_model.ts | 55 +++++++++++++------ pnpm-lock.yaml | 10 ++-- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/plugins/phonic/package.json b/plugins/phonic/package.json index 7ac06ed77..252fb3ee6 100644 --- a/plugins/phonic/package.json +++ b/plugins/phonic/package.json @@ -41,7 +41,7 @@ "typescript": "^5.0.0" }, "dependencies": { - "phonic": "^0.31.10" + "phonic": "^0.32.5" }, "peerDependencies": { "@livekit/agents": "workspace:*", diff --git a/plugins/phonic/src/realtime/realtime_model.ts b/plugins/phonic/src/realtime/realtime_model.ts index bfbe1eb6c..4a619f45e 100644 --- a/plugins/phonic/src/realtime/realtime_model.ts +++ b/plugins/phonic/src/realtime/realtime_model.ts @@ -7,18 +7,20 @@ import { DEFAULT_API_CONNECT_OPTIONS, Future, asError, + createTimedString, llm, log, shortuuid, stream, + type TimedString, } from '@livekit/agents'; import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import type { Phonic } from 'phonic'; import { PhonicClient } from 'phonic'; import type { ServerEvent, Voice } from './api_proto.js'; -const PHONIC_INPUT_SAMPLE_RATE = 44100; -const PHONIC_OUTPUT_SAMPLE_RATE = 44100; +const PHONIC_INPUT_SAMPLE_RATE = 24000; +const PHONIC_OUTPUT_SAMPLE_RATE = 24000; const PHONIC_NUM_CHANNELS = 1; const PHONIC_INPUT_FRAME_MS = 20; const DEFAULT_MODEL = 'merritt'; @@ -183,7 +185,7 @@ export class RealtimeModel extends llm.RealtimeModel { midSessionInstructionsUpdate: true, midSessionToolsUpdate: true, perResponseToolChoice: false, - nativeTranscriptSync: true, + nativeTranscriptSync: false, }); const apiKey = options.apiKey || process.env.PHONIC_API_KEY; @@ -245,9 +247,10 @@ interface GenerationState { responseId: string; messageChannel: stream.StreamChannel; functionChannel: stream.StreamChannel; - textChannel: stream.StreamChannel; + textChannel: stream.StreamChannel; audioChannel: stream.StreamChannel; outputText: string; + audioCursorSec: number; } /** @@ -274,7 +277,7 @@ export class RealtimeSession extends llm.RealtimeSession { private toolsReady = new Future(); private closedFuture = new Future(); private connectTask: Promise; - private toolDefinitions: Record[] = []; + private toolDefinitions: Phonic.InlineWebSocketTool[] = []; private forbidSpeechAfterToolCall = new Set(); private pendingToolCallIds = new Set(); private readyToStart = new Future(); @@ -407,7 +410,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.toolsReady.resolve(); } - private buildToolDefinitions(tools: llm.ToolContext): Record[] { + private buildToolDefinitions(tools: llm.ToolContext): Phonic.InlineWebSocketTool[] { this.forbidSpeechAfterToolCall = new Set(this.options.forbidSpeechAfterToolCall ?? []); return Object.entries(tools) .filter(([, tool]) => llm.isFunctionTool(tool)) @@ -418,7 +421,7 @@ export class RealtimeSession extends llm.RealtimeSession { function: { name, description: tool.description, - parameters: llm.toJsonSchema(tool.parameters), + parameters: llm.toJsonSchema(tool.parameters) as Phonic.OpenAiFunctionParameters, strict: true, }, }, @@ -466,7 +469,7 @@ export class RealtimeSession extends llm.RealtimeSession { this.closeCurrentGeneration({ interrupted: true }); this.pendingUserText = undefined; - const toolsPayload: Phonic.ConfigOptions.Tools.Item[] = [ + const toolsPayload: Phonic.ToolDefinition[] = [ ...(this.options.phonicTools ?? []), ...this.toolDefinitions, ]; @@ -749,10 +752,8 @@ export class RealtimeSession extends llm.RealtimeSession { const gen = this.currentGeneration; if (gen === undefined) return; - if (message.text) { - gen.outputText += message.text; - gen.textChannel.write(message.text); - } + let audioFrame: AudioFrame | undefined; + let audioDurationSec = 0; if (message.audio) { const bytes = Buffer.from(message.audio, 'base64'); @@ -764,15 +765,31 @@ export class RealtimeSession extends llm.RealtimeSession { bytes.byteOffset + sampleCount * Int16Array.BYTES_PER_ELEMENT, ), ); - const frame = new AudioFrame( + audioFrame = new AudioFrame( pcm, PHONIC_OUTPUT_SAMPLE_RATE, PHONIC_NUM_CHANNELS, sampleCount / PHONIC_NUM_CHANNELS, ); - gen.audioChannel.write(frame); + audioDurationSec = audioFrame.samplesPerChannel / PHONIC_OUTPUT_SAMPLE_RATE; } } + + if (message.text) { + gen.outputText += message.text; + gen.textChannel.write( + createTimedString({ + text: message.text, + startTime: gen.audioCursorSec, + endTime: gen.audioCursorSec + audioDurationSec, + }), + ); + } + + if (audioFrame) { + gen.audioChannel.write(audioFrame); + gen.audioCursorSec += audioDurationSec; + } } private handleInputText(message: Phonic.InputTextPayload): void { @@ -838,7 +855,7 @@ export class RealtimeSession extends llm.RealtimeSession { const responseId = shortuuid('PS_'); - const textChannel = stream.createStreamChannel(); + const textChannel = stream.createStreamChannel(); const audioChannel = stream.createStreamChannel(); const functionChannel = stream.createStreamChannel(); const messageChannel = stream.createStreamChannel(); @@ -857,6 +874,7 @@ export class RealtimeSession extends llm.RealtimeSession { textChannel, audioChannel, outputText: '', + audioCursorSec: 0, }; const generationEvent: llm.GenerationCreatedEvent = { @@ -922,7 +940,7 @@ export class RealtimeSession extends llm.RealtimeSession { toolsPayload, }: { systemPrompt: string; - toolsPayload: Phonic.ConfigOptions.Tools.Item[]; + toolsPayload: Phonic.ToolDefinition[]; }): Phonic.ConfigOptions { return { agent: this.options.phonicAgent, @@ -931,8 +949,8 @@ export class RealtimeSession extends llm.RealtimeSession { generate_welcome_message: this.options.generateWelcomeMessage, system_prompt: systemPrompt, voice_id: this.options.voice, - input_format: 'pcm_44100', - output_format: 'pcm_44100', + input_format: 'pcm_24000', + output_format: 'pcm_24000', ...(this.options.defaultLanguage !== undefined && { default_language: this.options.defaultLanguage, }), @@ -952,6 +970,7 @@ export class RealtimeSession extends llm.RealtimeSession { no_input_poke_sec: this.options.noInputPokeSec, no_input_poke_text: this.options.noInputPokeText, no_input_end_conversation_sec: this.options.noInputEndConversationSec, + stream_ahead_of_real_time: true, }; } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fd07b1a76..7af8c92fc 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1100,8 +1100,8 @@ importers: plugins/phonic: dependencies: phonic: - specifier: ^0.31.10 - version: 0.31.12 + specifier: ^0.32.5 + version: 0.32.5 devDependencies: '@livekit/agents': specifier: workspace:* @@ -4556,8 +4556,8 @@ packages: pathe@2.0.3: resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} - phonic@0.31.12: - resolution: {integrity: sha512-y4WKcivisgCYAfsahYs3n18W+uB9PrHDUaooAT8gCGn5MjO+nBMljnBhamqURn8vit2ZFh47ZOED54KY15Ln3w==} + phonic@0.32.5: + resolution: {integrity: sha512-Zyd7ZCuT9ozjGzAxaPYCNRRZUzlYYckQNoQi2QH5xCqEGiBcj9uWKLMkilPHZ3yZ2Uy7IA6ZeX0Mqrkv5sa+Fg==} engines: {node: '>=18.0.0'} picocolors@1.1.1: @@ -8613,7 +8613,7 @@ snapshots: pathe@2.0.3: {} - phonic@0.31.12: + phonic@0.32.5: dependencies: ws: 8.20.1 transitivePeerDependencies: From 78a0497005d3783dc81e1ef0653dad4959f6c7c1 Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 25 Jun 2026 16:41:52 -0700 Subject: [PATCH 2/3] chore: add changeset for phonic stream ahead Co-authored-by: Cursor --- .changeset/phonic-stream-ahead.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/phonic-stream-ahead.md diff --git a/.changeset/phonic-stream-ahead.md b/.changeset/phonic-stream-ahead.md new file mode 100644 index 000000000..699a56def --- /dev/null +++ b/.changeset/phonic-stream-ahead.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents-plugin-phonic': patch +--- + +Enable stream_ahead_of_real_time mode for phonic From 33febb0d9c450fcb310db7323aec46d304592311 Mon Sep 17 00:00:00 2001 From: Qiong Zhou Huang Date: Thu, 25 Jun 2026 17:11:23 -0700 Subject: [PATCH 3/3] refactor(phonic): build session config from a single source Make buildConfigOptions the single source of truth for session config so sendConfig and sendReset stay in sync. Drop the redundant field overrides in sendConfig and move min_words_to_interrupt into buildConfigOptions so it is also applied on mid-session resets. Co-authored-by: Cursor --- plugins/phonic/src/realtime/realtime_model.ts | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/plugins/phonic/src/realtime/realtime_model.ts b/plugins/phonic/src/realtime/realtime_model.ts index 4a619f45e..d6af99c56 100644 --- a/plugins/phonic/src/realtime/realtime_model.ts +++ b/plugins/phonic/src/realtime/realtime_model.ts @@ -6,13 +6,13 @@ import { AudioByteStream, DEFAULT_API_CONNECT_OPTIONS, Future, + type TimedString, asError, createTimedString, llm, log, shortuuid, stream, - type TimedString, } from '@livekit/agents'; import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import type { Phonic } from 'phonic'; @@ -661,22 +661,6 @@ export class RealtimeSession extends llm.RealtimeSession { systemPrompt: this.options.instructions + this.systemPromptPostfix, toolsPayload: [...(this.options.phonicTools ?? []), ...this.toolDefinitions], }), - ...(this.options.additionalLanguages !== undefined && { - additional_languages: this.options.additionalLanguages, - }), - ...(this.options.multilingualMode !== undefined && { - multilingual_mode: this.options.multilingualMode, - }), - audio_speed: this.options.audioSpeed, - tools: [...(this.options.phonicTools ?? []), ...this.toolDefinitions], - boosted_keywords: this.options.boostedKeywords, - ...(this.options.minWordsToInterrupt !== undefined && { - min_words_to_interrupt: this.options.minWordsToInterrupt, - }), - generate_no_input_poke_text: this.options.generateNoInputPokeText, - no_input_poke_sec: this.options.noInputPokeSec, - no_input_poke_text: this.options.noInputPokeText, - no_input_end_conversation_sec: this.options.noInputEndConversationSec, }); } @@ -963,9 +947,9 @@ export class RealtimeSession extends llm.RealtimeSession { audio_speed: this.options.audioSpeed, tools: toolsPayload, boosted_keywords: this.options.boostedKeywords, - // ...(this.options.minWordsToInterrupt !== undefined && { - // min_words_to_interrupt: this.options.minWordsToInterrupt, - // }), + ...(this.options.minWordsToInterrupt !== undefined && { + min_words_to_interrupt: this.options.minWordsToInterrupt, + }), generate_no_input_poke_text: this.options.generateNoInputPokeText, no_input_poke_sec: this.options.noInputPokeSec, no_input_poke_text: this.options.noInputPokeText,