diff --git a/.changeset/preserve-transcription-item-id.md b/.changeset/preserve-transcription-item-id.md new file mode 100644 index 000000000..f24be5cd6 --- /dev/null +++ b/.changeset/preserve-transcription-item-id.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents': patch +--- + +Preserve realtime transcription item IDs on user input transcription events. diff --git a/agents/src/voice/agent_activity.test.ts b/agents/src/voice/agent_activity.test.ts index 5600418d3..17e18d7cd 100644 --- a/agents/src/voice/agent_activity.test.ts +++ b/agents/src/voice/agent_activity.test.ts @@ -22,6 +22,7 @@ import { Future, Task } from '../utils.js'; import { _getActivityTaskInfo } from './agent.js'; import { AgentActivity } from './agent_activity.js'; import type { PreemptiveGenerationInfo } from './audio_recognition.js'; +import type { AgentSessionEventTypes, UserInputTranscribedEvent } from './events.js'; import { SpeechHandle } from './speech_handle.js'; const agentMocks = vi.hoisted(() => ({ @@ -132,6 +133,29 @@ function buildMainTaskRunner() { } describe('AgentActivity - mainTask', () => { + it('preserves realtime user input transcription item IDs', () => { + const capturedEvents: UserInputTranscribedEvent[] = []; + const activity = Object.create(AgentActivity.prototype) as AgentActivity; + Object.assign(activity, { + agentSession: { + emit: (_type: AgentSessionEventTypes, ev: UserInputTranscribedEvent) => { + capturedEvents.push(ev); + }, + }, + }); + + activity.onInputAudioTranscriptionCompleted({ + itemId: 'item_123', + transcript: 'hello', + isFinal: false, + }); + + expect(capturedEvents).toHaveLength(1); + expect(capturedEvents[0]?.transcript).toBe('hello'); + expect(capturedEvents[0]?.isFinal).toBe(false); + expect(capturedEvents[0]?.itemId).toBe('item_123'); + }); + it('should recover when speech handle is interrupted after authorization', async () => { const { fakeActivity, mainTask, speechQueue, q_updated } = buildMainTaskRunner(); diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index c9cdbf384..b93e5cce9 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -1168,6 +1168,7 @@ export class AgentActivity implements RecognitionHooks { createUserInputTranscribedEvent({ transcript: ev.transcript, isFinal: ev.isFinal, + itemId: ev.itemId, }), ); diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts index b79b72930..3a3050f4b 100644 --- a/agents/src/voice/events.ts +++ b/agents/src/voice/events.ts @@ -94,6 +94,8 @@ export type UserInputTranscribedEvent = { type: 'user_input_transcribed'; transcript: string; isFinal: boolean; + /** Provider-specific ID for the transcribed input item, when available. */ + itemId: string | null; // TODO(AJS-106): add multi participant support /** Not supported yet. Always null by default. */ speakerId: string | null; @@ -104,12 +106,14 @@ export type UserInputTranscribedEvent = { export const createUserInputTranscribedEvent = ({ transcript, isFinal, + itemId = null, speakerId = null, language = null, createdAt = Date.now(), }: { transcript: string; isFinal: boolean; + itemId?: string | null; speakerId?: string | null; language?: LanguageCode | null; createdAt?: number; @@ -117,6 +121,7 @@ export const createUserInputTranscribedEvent = ({ type: 'user_input_transcribed', transcript, isFinal, + itemId, speakerId, language, createdAt, diff --git a/agents/src/voice/report.test.ts b/agents/src/voice/report.test.ts index f29b5201a..e401c9c6a 100644 --- a/agents/src/voice/report.test.ts +++ b/agents/src/voice/report.test.ts @@ -254,6 +254,7 @@ describe('sessionReportToJSON', () => { type: 'user_input_transcribed', transcript: 'hello', is_final: true, + item_id: null, speaker_id: 'spk_1', language: null, created_at: 9,