diff --git a/apps/site/docs/en/api.mdx b/apps/site/docs/en/api.mdx index f604563de8..2a92793e34 100644 --- a/apps/site/docs/en/api.mdx +++ b/apps/site/docs/en/api.mdx @@ -292,6 +292,7 @@ function aiInput( cacheable?: boolean; autoDismissKeyboard?: boolean; mode?: 'replace' | 'clear' | 'typeOnly'; + caret?: 'start' | 'end'; }, ): Promise; @@ -320,6 +321,23 @@ function aiInput( - `'replace'`: Clear the input field first, then input the text. - `'typeOnly'`: Type the value directly without clearing the field first. - `'clear'`: Clear the input field without entering new text. + - `caret?: 'start' | 'end'` - Web only. When `mode` is `'typeOnly'`, Midscene will try to move the caret to the start or end of the existing text before typing. Omit this option to keep Midscene's default focus-and-type behavior without controlling the caret position. + + :::note Web caret behavior + + `mode: 'append'` is kept only as a backward-compatible alias for `typeOnly`; it does not mean the text will be appended at the end. To explicitly request append-at-end behavior on Web, use: + + ```typescript + await agent.aiInput('The search input box', { + value: 'Hello World', + mode: 'typeOnly', + caret: 'end', + }); + ``` + + Due to the complexity of input types on Web pages, such as rich-text editors, iframes, and shadow DOM, caret movement and input clearing are best-effort and cannot be guaranteed to work for every input field. + + ::: **Backward compatible usage (deprecated but still supported):** - `value: string | number` - The text content to input. 
diff --git a/apps/site/docs/zh/api.mdx b/apps/site/docs/zh/api.mdx index 803ff40a4c..7f64efc0fa 100644 --- a/apps/site/docs/zh/api.mdx +++ b/apps/site/docs/zh/api.mdx @@ -285,6 +285,7 @@ function aiInput( cacheable?: boolean; autoDismissKeyboard?: boolean; mode?: 'replace' | 'clear' | 'typeOnly'; + caret?: 'start' | 'end'; }, ): Promise; @@ -313,6 +314,23 @@ function aiInput( - `'replace'`: 先清空输入框,然后输入文本。 - `'typeOnly'`: 直接输入文本,不会先清空输入框。 - `'clear'`: 清空输入框,不会输入新的文本。 + - `caret?: 'start' | 'end'` - 仅 Web 平台生效。当 `mode` 为 `'typeOnly'` 时,输入前会尝试移动光标。不传该参数时,Midscene 只执行默认的聚焦和输入流程,不主动控制光标位置。 + + :::note Web 光标行为 + + `mode: 'append'` 只作为 `typeOnly` 的向后兼容别名保留,并不表示会将文本追加到末尾。如果需要在 Web 平台明确请求“追加到末尾”,请使用: + + ```typescript + await agent.aiInput('搜索框', { + value: 'Hello World', + mode: 'typeOnly', + caret: 'end', + }); + ``` + + 受限于 Web 页面中复杂的输入框类型(富文本编辑器、iframe、shadow DOM 等),移动光标或清空输入框内容的能力无法对所有的输入框百分百生效。 + + ::: **兼容用法**(已过时,但仍然支持): - `value: string | number` - 要输入的文本内容。 diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 9a7848cd20..25adbec48f 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -63,7 +63,7 @@ import { } from '@midscene/shared/env'; import { getDebug } from '@midscene/shared/logger'; import { assert, ifInBrowser, uuid } from '@midscene/shared/utils'; -import { defineActionSleep } from '../device'; +import { type ActionInputCaret, defineActionSleep } from '../device'; import { TaskCache } from './task-cache'; import { TaskExecutionError, @@ -144,6 +144,20 @@ export type AiActOptions = { abortSignal?: AbortSignal; }; +type AiInputMode = 'replace' | 'clear' | 'typeOnly' | 'append'; +type BaseAiInputOption = LocateOption & { + // Platform-sensitive legacy option exposed through core aiInput. + // It mainly affects mobile soft-keyboard flows and is ignored by platforms + // that do not implement keyboard dismissal. 
+ autoDismissKeyboard?: boolean; + mode?: AiInputMode; + caret?: ActionInputCaret; +}; +type AiInputOption = BaseAiInputOption & { + value: string | number; +}; +type LegacyAiInputOption = BaseAiInputOption; + export class Agent< InterfaceType extends AbstractInterface = AbstractInterface, > { @@ -616,12 +630,7 @@ export class Agent< } // New signature, always use locatePrompt as the first param - async aiInput( - locatePrompt: TUserPrompt, - opt: LocateOption & { value: string | number } & { - autoDismissKeyboard?: boolean; - } & { mode?: 'replace' | 'clear' | 'typeOnly' | 'append' }, - ): Promise; + async aiInput(locatePrompt: TUserPrompt, opt: AiInputOption): Promise; // Legacy signature - deprecated /** @@ -630,29 +639,18 @@ export class Agent< async aiInput( value: string | number, locatePrompt: TUserPrompt, - opt?: LocateOption & { autoDismissKeyboard?: boolean } & { - mode?: 'replace' | 'clear' | 'typeOnly' | 'append'; - }, // AndroidDeviceInputOpt & + opt?: LegacyAiInputOption, ): Promise; // Implementation async aiInput( locatePromptOrValue: TUserPrompt | string | number, - locatePromptOrOpt: - | TUserPrompt - | (LocateOption & { value: string | number } & { - autoDismissKeyboard?: boolean; - } & { mode?: 'replace' | 'clear' | 'typeOnly' | 'append' }) // AndroidDeviceInputOpt & - | undefined, - optOrUndefined?: LocateOption, // AndroidDeviceInputOpt & + locatePromptOrOpt: TUserPrompt | AiInputOption | undefined, + optOrUndefined?: LegacyAiInputOption, ) { let value: string | number; let locatePrompt: TUserPrompt; - let opt: - | (LocateOption & { value: string | number } & { - autoDismissKeyboard?: boolean; - } & { mode?: 'replace' | 'clear' | 'typeOnly' | 'append' }) // AndroidDeviceInputOpt & - | undefined; + let opt: AiInputOption | undefined; // Check if using new signature (first param is locatePrompt, second has value) if ( @@ -662,11 +660,7 @@ export class Agent< ) { // New signature: aiInput(locatePrompt, opt) locatePrompt = locatePromptOrValue as 
TUserPrompt; - const optWithValue = locatePromptOrOpt as LocateOption & { - // AndroidDeviceInputOpt & - value: string | number; - autoDismissKeyboard?: boolean; - }; + const optWithValue = locatePromptOrOpt as AiInputOption; value = optWithValue.value; opt = optWithValue; } else { diff --git a/packages/core/src/device/index.ts b/packages/core/src/device/index.ts index 141a25008d..a638493a86 100644 --- a/packages/core/src/device/index.ts +++ b/packages/core/src/device/index.ts @@ -68,6 +68,7 @@ export interface KeyboardInputPrimitives { target?: unknown; replace?: boolean; focusOnly?: boolean; + caret?: ActionInputCaret; }, ): Promise; clearInput(target?: unknown): Promise; @@ -408,21 +409,41 @@ export const actionInputParamSchema = z.object({ 'If true, the keyboard will be dismissed after the input is completed. Do not set it unless the user asks you to do so.', ), }); +const actionInputCaretParamSchema = z + .enum(['start', 'end']) + .optional() + .describe( + 'Web only. In typeOnly mode, best-effort move the caret before typing. Use only when the user explicitly asks to insert at the start or append at the end.', + ); +const webActionInputParamSchema = actionInputParamSchema.extend({ + caret: actionInputCaretParamSchema, +}); +export type ActionInputCaret = 'start' | 'end'; export type ActionInputParam = { value: string; locate?: LocateResultElement; mode?: 'replace' | 'clear' | 'typeOnly' | 'append'; autoDismissKeyboard?: boolean; + caret?: ActionInputCaret; +}; + +export type DefineActionInputOptions = { + caret?: boolean; }; export const defineActionInput = ( keyboard: KeyboardInputPrimitives, + options: DefineActionInputOptions = {}, ): DeviceAction => { - return defineAction({ + const paramSchema = options.caret + ? 
webActionInputParamSchema + : actionInputParamSchema; + + return defineAction({ name: 'Input', description: 'Input the value into the element', interfaceAlias: 'aiInput', - paramSchema: actionInputParamSchema, + paramSchema, sample: { value: 'test@example.com', locate: { prompt: 'the email input field' }, @@ -446,6 +467,7 @@ export const defineActionInput = ( target: param.locate, replace: param.mode !== 'typeOnly', autoDismissKeyboard: param.autoDismissKeyboard, + caret: param.caret, }); }, }); @@ -952,6 +974,7 @@ export interface InputPrimitiveActionOptions { sleep?: (timeMs: number) => Promise; includeSwipe?: boolean; includePinch?: boolean; + inputCaret?: boolean; systemActions?: SystemInputActionOptions; } @@ -997,7 +1020,7 @@ export function defineActionsFromInputPrimitives( if (keyboard) { actions.push( - defineActionInput(keyboard), + defineActionInput(keyboard, { caret: options.inputCaret }), defineActionClearInput(keyboard.clearInput), defineActionKeyboardPress(keyboard.keyboardPress), defineActionCursorMove({ keyboard, sleep: options.sleep }), diff --git a/packages/web-integration/src/chrome-extension/page.ts b/packages/web-integration/src/chrome-extension/page.ts index b1404e9317..61a8e07dca 100644 --- a/packages/web-integration/src/chrome-extension/page.ts +++ b/packages/web-integration/src/chrome-extension/page.ts @@ -13,7 +13,11 @@ import type { Rect, Size, } from '@midscene/core'; -import type { AbstractInterface, DeviceAction } from '@midscene/core/device'; +import type { + AbstractInterface, + ActionInputCaret, + DeviceAction, +} from '@midscene/core/device'; import type { ElementInfo } from '@midscene/shared/extractor'; import { treeToList } from '@midscene/shared/extractor'; import { createImgBase64ByFormat } from '@midscene/shared/img'; @@ -28,6 +32,7 @@ import { sanitizeXpaths, } from '../common/cache-helper'; import { + type FocusedInputCapability, type KeyInput, type MouseButton, commonWebActionsForWebPage, @@ -40,6 +45,7 @@ import { } from 
'./dynamic-scripts'; const debug = getDebug('web:chrome-extension:page'); +const warn = getDebug('web:chrome-extension:page', { console: true }); function sleep(ms: number) { return new Promise((resolve) => setTimeout(resolve, ms)); @@ -608,13 +614,23 @@ export default class ChromeExtensionProxyPage implements AbstractInterface { ); } - async clearInput(element: ElementInfo) { - if (!element) { - console.warn('No element to clear input'); + async clearInput(element?: ElementInfo, capability?: FocusedInputCapability) { + if (element) { + await this.mouse.click(element.center[0], element.center[1]); + } + const focusedInputCapability = + capability ?? (await this.getFocusedInputCapability()); + if (focusedInputCapability?.supportsClear === false) { + warn( + `[midscene:warning] clearInput skipped: focused input capability is ${focusedInputCapability.kind}`, + ); return; } - - await this.mouse.click(element.center[0], element.center[1]); + if (focusedInputCapability?.supportsClear === 'unknown') { + warn( + `[midscene:warning] clearInput continued for unknown focused input capability: ${focusedInputCapability.kind}`, + ); + } await this.sendCommandToDebugger('Input.dispatchKeyEvent', { type: 'keyDown', @@ -633,6 +649,175 @@ export default class ChromeExtensionProxyPage implements AbstractInterface { }); } + async getFocusedInputCapability(): Promise< + FocusedInputCapability | undefined + > { + const result = await this.sendCommandToDebugger<{ + result?: { value?: FocusedInputCapability }; + }>('Runtime.evaluate', { + expression: `(() => { + const inspectDocument = (targetDocument, depth = 0) => { + const activeElement = targetDocument.activeElement; + if (!activeElement) { + return { + kind: 'no-active-element', + supportsClear: false, + supportsCaret: false, + }; + } + + const tagName = activeElement.tagName.toLowerCase(); + if (tagName === 'iframe' || tagName === 'frame') { + if (depth >= 2) { + return { + kind: 'unknown-frame-out-of-recursion-limit', + 
supportsClear: 'unknown', + supportsCaret: false, + }; + } + + if (depth < 2) { + try { + const childDocument = activeElement.contentDocument; + if (childDocument) { + return inspectDocument(childDocument, depth + 1); + } + } catch (error) { + // Ignore and return unknown-frame below. + } + } + return { + kind: 'unknown-frame', + supportsClear: 'unknown', + supportsCaret: false, + }; + } + + if (activeElement.shadowRoot) { + return { + kind: 'unknown-shadow-root', + supportsClear: 'unknown', + supportsCaret: false, + }; + } + + const supportsClear = + typeof activeElement.matches === 'function' && + activeElement.matches(':read-write'); + let kind = 'non-input'; + let supportsCaret = false; + if (activeElement instanceof HTMLInputElement) { + kind = 'native-input'; + supportsCaret = true; + } else if (activeElement instanceof HTMLTextAreaElement) { + kind = 'native-textarea'; + supportsCaret = true; + } else if (activeElement.isContentEditable) { + kind = 'contenteditable'; + } + + return { + kind, + supportsClear, + supportsCaret, + }; + }; + + return inspectDocument(document); + })()`, + returnByValue: true, + }); + return result.result?.value; + } + + async setFocusedInputCaret( + caret: ActionInputCaret, + capability?: FocusedInputCapability, + ): Promise { + if (capability && !capability.supportsCaret) { + warn( + `[midscene:warning] caret movement skipped: focused input capability is ${capability.kind}`, + ); + return; + } + + const result = await this.sendCommandToDebugger<{ + result?: { value?: { success: boolean; reason?: string } }; + }>('Runtime.evaluate', { + expression: `(() => { + const setCaretInDocument = (targetDocument, depth = 0) => { + const activeElement = targetDocument.activeElement; + if (!activeElement) { + return { + success: false, + reason: 'no active element', + }; + } + + const tagName = activeElement.tagName.toLowerCase(); + if (tagName === 'iframe' || tagName === 'frame') { + if (depth >= 2) { + return { + success: false, + reason: 
'focused frame recursion limit exceeded', + }; + } + + if (depth < 2) { + try { + const childDocument = activeElement.contentDocument; + if (childDocument) { + return setCaretInDocument(childDocument, depth + 1); + } + } catch (error) { + // Ignore and return unknown-frame below. + } + } + return { + success: false, + reason: 'focused frame cannot be inspected', + }; + } + + if ( + !( + activeElement instanceof HTMLInputElement || + activeElement instanceof HTMLTextAreaElement + ) + ) { + return { + success: false, + reason: 'focused element is not a native input or textarea', + }; + } + + const targetCaret = ${JSON.stringify(caret)}; + const offset = + targetCaret === 'start' ? 0 : activeElement.value.length; + try { + activeElement.setSelectionRange(offset, offset); + return { success: true }; + } catch (error) { + return { + success: false, + reason: error instanceof Error ? error.message : String(error), + }; + } + }; + + return setCaretInDocument(document); + })()`, + returnByValue: true, + }); + + const value = result.result?.value; + if (!value?.success) { + warn( + `[midscene:warning] caret movement skipped: ${value?.reason || 'unknown reason'}`, + ); + } + } + private latestMouseX = 100; private latestMouseY = 100; diff --git a/packages/web-integration/src/puppeteer/base-page.ts b/packages/web-integration/src/puppeteer/base-page.ts index 71c8f9abb3..feeaab68e1 100644 --- a/packages/web-integration/src/puppeteer/base-page.ts +++ b/packages/web-integration/src/puppeteer/base-page.ts @@ -9,6 +9,7 @@ import type { } from '@midscene/core'; import type { AbstractInterface, + ActionInputCaret, MjpegStreamHandle, MjpegStreamOptions, } from '@midscene/core/device'; @@ -27,8 +28,16 @@ import { getExtraReturnLogic, } from '@midscene/shared/node'; import { assert } from '@midscene/shared/utils'; -import type { Page as PlaywrightPage } from 'playwright'; -import type { CDPSession, Protocol, Page as PuppeteerPage } from 'puppeteer'; +import type { + Frame as 
PlaywrightFrame, + Page as PlaywrightPage, +} from 'playwright'; +import type { + CDPSession, + Protocol, + Frame as PuppeteerFrame, + Page as PuppeteerPage, +} from 'puppeteer'; import { type CacheFeatureOptions, type WebElementCacheFeature, @@ -37,6 +46,7 @@ import { sanitizeXpaths, } from '../common/cache-helper'; import { + type FocusedInputCapability, type KeyInput, type MouseButton, commonWebActionsForWebPage, @@ -45,6 +55,29 @@ import { export const debugPage = getDebug('web:page'); const warnPage = getDebug('web:page', { console: true }); +type FocusedInputContext = + | PuppeteerPage + | PlaywrightPage + | PuppeteerFrame + | PlaywrightFrame; +type PuppeteerInputContext = PuppeteerPage | PuppeteerFrame; +type PlaywrightInputContext = PlaywrightPage | PlaywrightFrame; + +type ActiveElementHandleWithFrame = { + contentFrame(): Promise; + dispose?(): Promise; +}; + +type ActiveElementProbe = { + isFrame: boolean; + capability: FocusedInputCapability; +}; + +type SetCaretResult = { + success: boolean; + reason?: string; +}; + export const BROWSER_NAVIGATION_ERROR_PATTERN = /execution context was destroyed|frame was detached|target closed|page has been closed|context was destroyed|net::ERR_ABORTED/i; @@ -710,39 +743,254 @@ export class Page< }; } - async clearInput(element?: ElementInfo): Promise { + async clearInput( + element?: ElementInfo, + capability?: FocusedInputCapability, + ): Promise { const backspace = async () => { await sleep(100); await this.keyboard.press([{ key: 'Backspace' }]); }; const isMac = process.platform === 'darwin'; + const modifierKey = isMac ? 
'Meta' : 'Control'; debugPage('clearInput begin'); - if (isMac) { - if (this.interfaceType === 'puppeteer') { - // https://github.com/segment-boneyard/nightmare/issues/810#issuecomment-452669866 - element && - (await this.mouse.click(element.center[0], element.center[1], { - count: 3, - })); - await backspace(); - } - - element && (await this.mouse.click(element.center[0], element.center[1])); - await this.underlyingPage.keyboard.down('Meta'); - await this.underlyingPage.keyboard.press('a'); - await this.underlyingPage.keyboard.up('Meta'); + if (element) { + await this.mouse.click(element.center[0], element.center[1]); + } + const focusedInputCapability = + capability ?? (await this.getFocusedInputCapability()); + if (focusedInputCapability?.supportsClear === false) { + warnPage( + `[midscene:warning] clearInput skipped: focused input capability is ${focusedInputCapability.kind}`, + ); + return; + } + if (focusedInputCapability?.supportsClear === 'unknown') { + warnPage( + `[midscene:warning] clearInput continued for unknown focused input capability: ${focusedInputCapability.kind}`, + ); + } + if (isMac && this.interfaceType === 'puppeteer' && element) { + // Puppeteer on macOS does not reliably clear a focused input with + // Meta+A Backspace alone. Use the legacy triple-click Backspace fallback. + // See: + // https://github.com/segment-boneyard/nightmare/issues/810#issuecomment-452669866 + // tests/ai/web/puppeteer/clear-input-keyboard-shortcut.test.ts. 
+ await this.mouse.click(element.center[0], element.center[1], { + count: 3, + }); await backspace(); } else { - element && (await this.mouse.click(element.center[0], element.center[1])); - await this.underlyingPage.keyboard.down('Control'); + await this.underlyingPage.keyboard.down(modifierKey); await this.underlyingPage.keyboard.press('a'); - await this.underlyingPage.keyboard.up('Control'); + await this.underlyingPage.keyboard.up(modifierKey); await backspace(); } debugPage('clearInput end'); } + private async getFocusedInputContextAndCapability(): Promise<{ + context: FocusedInputContext; + capability: FocusedInputCapability; + }> { + let context: FocusedInputContext = this.underlyingPage; + const maxFrameDepth = 2; + + for (let depth = 0; depth < maxFrameDepth; depth++) { + const probe = await this.evaluateFocusedInputProbe(context); + + if (probe.isFrame) { + const activeElementHandle = + await this.evaluateFocusedInputHandle(context); + const frameHandle = + activeElementHandle as unknown as ActiveElementHandleWithFrame; + const frame = await frameHandle.contentFrame(); + await activeElementHandle.dispose?.(); + if (frame) { + context = frame; + continue; + } + + return { + context, + capability: probe.capability, + }; + } + + return { + context, + capability: probe.capability, + }; + } + + return { + context, + capability: { + kind: 'unknown-frame-out-of-recursion-limit', + supportsClear: 'unknown', + supportsCaret: false, + }, + }; + } + + private async evaluateFocusedInputHandle(context: FocusedInputContext) { + if (this.interfaceType === 'puppeteer') { + return (context as PuppeteerInputContext).evaluateHandle( + () => document.activeElement, + ); + } + + return (context as PlaywrightInputContext).evaluateHandle( + () => document.activeElement, + ); + } + + private async evaluateFocusedInputProbe( + context: FocusedInputContext, + ): Promise { + const probeActiveElement = () => { + const activeElement = document.activeElement; + const 
noActiveElementCapability: FocusedInputCapability = { + kind: 'no-active-element', + supportsClear: false, + supportsCaret: false, + }; + if (!activeElement) { + return { + isFrame: false, + capability: noActiveElementCapability, + }; + } + + const tagName = activeElement.tagName.toLowerCase(); + if (tagName === 'iframe' || tagName === 'frame') { + return { + isFrame: true, + capability: { + kind: 'unknown-frame', + supportsClear: 'unknown', + supportsCaret: false, + } satisfies FocusedInputCapability, + }; + } + + if ((activeElement as HTMLElement).shadowRoot) { + return { + isFrame: false, + capability: { + kind: 'unknown-shadow-root', + supportsClear: 'unknown', + supportsCaret: false, + } satisfies FocusedInputCapability, + }; + } + + const supportsClear = + typeof activeElement.matches === 'function' && + activeElement.matches(':read-write'); + let kind: FocusedInputCapability['kind'] = 'non-input'; + let supportsCaret = false; + + if (activeElement instanceof HTMLInputElement) { + kind = 'native-input'; + supportsCaret = true; + } else if (activeElement instanceof HTMLTextAreaElement) { + kind = 'native-textarea'; + supportsCaret = true; + } else if ((activeElement as HTMLElement).isContentEditable) { + kind = 'contenteditable'; + } + + return { + isFrame: false, + capability: { + kind, + supportsClear, + supportsCaret, + } satisfies FocusedInputCapability, + }; + }; + + if (this.interfaceType === 'puppeteer') { + return (context as PuppeteerInputContext).evaluate(probeActiveElement); + } + + return (context as PlaywrightInputContext).evaluate(probeActiveElement); + } + + private async evaluateSetFocusedInputCaret( + context: FocusedInputContext, + caret: ActionInputCaret, + ): Promise { + const setCaret = (targetCaret: ActionInputCaret) => { + const activeElement = document.activeElement; + if ( + !( + activeElement instanceof HTMLInputElement || + activeElement instanceof HTMLTextAreaElement + ) + ) { + return { + success: false, + reason: 'focused element 
is not a native input or textarea', + }; + } + + const offset = targetCaret === 'start' ? 0 : activeElement.value.length; + try { + activeElement.setSelectionRange(offset, offset); + return { success: true }; + } catch (error) { + return { + success: false, + reason: error instanceof Error ? error.message : String(error), + }; + } + }; + + if (this.interfaceType === 'puppeteer') { + return (context as PuppeteerInputContext).evaluate(setCaret, caret); + } + + return (context as PlaywrightInputContext).evaluate(setCaret, caret); + } + + async getFocusedInputCapability(): Promise< + FocusedInputCapability | undefined + > { + return (await this.getFocusedInputContextAndCapability()).capability; + } + + async setFocusedInputCaret( + caret: ActionInputCaret, + capability?: FocusedInputCapability, + ): Promise { + if (capability && !capability.supportsCaret) { + warnPage( + `[midscene:warning] caret movement skipped: focused input capability is ${capability.kind}`, + ); + return; + } + + const focusedInput = await this.getFocusedInputContextAndCapability(); + if (!focusedInput.capability.supportsCaret) { + warnPage( + `[midscene:warning] caret movement skipped: focused input capability is ${focusedInput.capability.kind}`, + ); + return; + } + + const result = await this.evaluateSetFocusedInputCaret( + focusedInput.context, + caret, + ); + + if (!result.success) { + warnPage(`[midscene:warning] caret movement skipped: ${result.reason}`); + } + } + private everMoved = false; private async moveToPointBeforeScroll(point?: Point): Promise { if (point) { diff --git a/packages/web-integration/src/web-page.ts b/packages/web-integration/src/web-page.ts index 610f9f796d..644f99847b 100644 --- a/packages/web-integration/src/web-page.ts +++ b/packages/web-integration/src/web-page.ts @@ -2,6 +2,7 @@ import type { Point } from '@midscene/core'; import { z } from '@midscene/core'; import { AbstractInterface, + type ActionInputCaret, type BrowserInputPrimitives, type DeviceAction, 
defineAction, @@ -363,6 +364,23 @@ export interface KeyboardAction { ) => Promise; } +export type FocusedInputCapabilityKind = + | 'native-input' + | 'native-textarea' + | 'contenteditable' + | 'non-input' + | 'unknown-frame' + | 'unknown-frame-out-of-recursion-limit' + | 'unknown-shadow-root' + | 'no-active-element' + | 'unknown'; + +export type FocusedInputCapability = { + kind: FocusedInputCapabilityKind; + supportsClear: boolean | 'unknown'; + supportsCaret: boolean; +}; + export interface ChromePageDestroyOptions { closeTab?: boolean; // should close the tab when the page object is destroyed } @@ -403,7 +421,21 @@ export abstract class AbstractWebPage extends AbstractInterface { }; } - async clearInput(element?: ElementInfo): Promise {} + async getFocusedInputCapability(): Promise< + FocusedInputCapability | undefined + > { + return undefined; + } + + async setFocusedInputCaret( + _caret: ActionInputCaret, + _capability?: FocusedInputCapability, + ): Promise {} + + async clearInput( + _element?: ElementInfo, + _capability?: FocusedInputCapability, + ): Promise {} abstract scrollUntilTop(startingPoint?: Point): Promise; abstract scrollUntilBottom(startingPoint?: Point): Promise; @@ -455,14 +487,20 @@ export function createWebInputPrimitives( keyboard: { typeText: async (value, opts) => { const element = opts?.target; - if (element && opts?.replace !== false) { - await page.clearInput(element as ElementInfo); - } else if (element) { - const target = element as ElementInfo; + let target: ElementInfo | undefined; + if (element) { + target = element as ElementInfo; await page.mouse.click(target.center[0], target.center[1], { button: 'left', }); - await page.keyboard.press([{ key: 'End' }]); + } + + if (opts?.replace !== false) { + const capability = await page.getFocusedInputCapability(); + await page.clearInput(target, capability); + } else if (opts?.caret) { + const capability = await page.getFocusedInputCapability(); + await page.setFocusedInputCaret(opts.caret, 
capability); } if (opts?.focusOnly) { @@ -494,7 +532,8 @@ export function createWebInputPrimitives( } }, clearInput: async (target) => { - await page.clearInput(target as ElementInfo | undefined); + const element = target as ElementInfo | undefined; + await page.clearInput(element); }, }, touch: { @@ -563,6 +602,7 @@ export const commonWebActionsForWebPage = ( ...defineActionsFromInputPrimitives(input, { size: () => page.size(), includeSwipe: includeTouchEvents, + inputCaret: true, }), defineAction({ diff --git a/packages/web-integration/tests/ai/fixtures/read-write-inputs.html b/packages/web-integration/tests/ai/fixtures/read-write-inputs.html new file mode 100644 index 0000000000..b3e365cec2 --- /dev/null +++ b/packages/web-integration/tests/ai/fixtures/read-write-inputs.html @@ -0,0 +1,184 @@ + + + + + :read-write editable fixture + + + +
+ + + + +
+ div contenteditable +
editable div value
+
+ +
+ p contenteditable +

+ editable paragraph value +

+
+ + + + + + + + + + + +
+ role textbox without contenteditable +
+ +
+ visually input-like but not editable +
+ +
+ + +
+ + + + diff --git a/packages/web-integration/tests/ai/web/playwright/clear-input-read-write.spec.ts b/packages/web-integration/tests/ai/web/playwright/clear-input-read-write.spec.ts new file mode 100644 index 0000000000..066c4bdda6 --- /dev/null +++ b/packages/web-integration/tests/ai/web/playwright/clear-input-read-write.spec.ts @@ -0,0 +1,95 @@ +import { join } from 'node:path'; +import { PlaywrightWebPage } from '@/playwright'; +import type { ElementInfo } from '@midscene/shared/extractor'; +import { type Page, expect, test } from '@playwright/test'; + +const fixtureUrl = `file://${join( + __dirname, + '../../fixtures/read-write-inputs.html', +)}`; + +test.describe('clearInput read-write guard', () => { + test(':read-write should only match editable text targets', async ({ + page, + }) => { + await page.goto(fixtureUrl); + + const readWriteIds = await page.evaluate( + `Array.from(document.querySelectorAll('*:read-write')) + .map((element) => element.id) + .filter(Boolean) + .sort()`, + ); + + expect(readWriteIds).toEqual([ + 'editable-div', + 'editable-paragraph', + 'text-input', + 'textarea-input', + ]); + + const iframe = await page.locator('#editable-iframe').elementHandle(); + const frame = await iframe?.contentFrame(); + if (!frame) { + throw new Error('Failed to resolve editable iframe'); + } + + const iframeReadWriteIds = await frame.evaluate( + `Array.from(document.querySelectorAll('*:read-write')) + .map((element) => element.id) + .filter(Boolean) + .sort()`, + ); + + expect(iframeReadWriteIds).toEqual([ + 'iframe-editable-div', + 'iframe-text-input', + ]); + }); + + test('clearInput should clear read-write elements', async ({ page }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await webPage.clearInput(await getElementInfo(page, '#text-input')); + await webPage.clearInput(await getElementInfo(page, '#editable-div')); + + await expect(page.locator('#text-input')).toHaveValue(''); + await 
expect(page.locator('#editable-div')).toHaveText(''); + }); + + test('clearInput should skip non-read-write elements', async ({ page }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await webPage.clearInput(await getElementInfo(page, '#button-like-input')); + await webPage.clearInput(await getElementInfo(page, '#role-textbox-only')); + await webPage.clearInput(await getElementInfo(page, '#fake-input')); + + const probe = await page.evaluate<{ backspaceCount: number }>( + 'window.clearInputProbe', + ); + expect(probe.backspaceCount).toBe(0); + await expect(page.locator('#button-like-input')).toHaveText('button text'); + await expect(page.locator('#role-textbox-only')).toHaveText( + 'role textbox without contenteditable', + ); + await expect(page.locator('#fake-input')).toHaveText( + 'visually input-like but not editable', + ); + }); +}); + +async function getElementInfo( + page: Page, + selector: string, +): Promise { + const box = await page.locator(selector).boundingBox(); + if (!box) { + throw new Error(`Element is not visible: ${selector}`); + } + + return { + center: [box.x + box.width / 2, box.y + box.height / 2], + } as ElementInfo; +} diff --git a/packages/web-integration/tests/ai/web/playwright/focused-input-capability.spec.ts b/packages/web-integration/tests/ai/web/playwright/focused-input-capability.spec.ts new file mode 100644 index 0000000000..26dcd62001 --- /dev/null +++ b/packages/web-integration/tests/ai/web/playwright/focused-input-capability.spec.ts @@ -0,0 +1,106 @@ +import { join } from 'node:path'; +import { PlaywrightWebPage } from '@/playwright'; +import type { FocusedInputCapability } from '@/web-page'; +import { type Page, expect, test } from '@playwright/test'; + +const fixtureUrl = `file://${join( + __dirname, + '../../fixtures/read-write-inputs.html', +)}`; + +test.describe('focused input capability', () => { + test('should detect native and editable input capabilities', async ({ + page, + }) => { 
+ await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await expectFocusedInputCapability(page, webPage, '#text-input', { + kind: 'native-input', + supportsClear: true, + supportsCaret: true, + }); + await expectFocusedInputCapability(page, webPage, '#textarea-input', { + kind: 'native-textarea', + supportsClear: true, + supportsCaret: true, + }); + await expectFocusedInputCapability(page, webPage, '#editable-div', { + kind: 'contenteditable', + supportsClear: true, + supportsCaret: false, + }); + }); + + test('should detect non-clearable or non-input capabilities', async ({ + page, + }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await expectFocusedInputCapability(page, webPage, '#readonly-input', { + kind: 'native-input', + supportsClear: false, + supportsCaret: true, + }); + await expectFocusedInputCapability(page, webPage, '#checkbox-input', { + kind: 'native-input', + supportsClear: false, + supportsCaret: true, + }); + await expectFocusedInputCapability(page, webPage, '#button-like-input', { + kind: 'non-input', + supportsClear: false, + supportsCaret: false, + }); + await expectFocusedInputCapability(page, webPage, '#role-textbox-only', { + kind: 'non-input', + supportsClear: false, + supportsCaret: false, + }); + await expectFocusedInputCapability(page, webPage, '#fake-input', { + kind: 'non-input', + supportsClear: false, + supportsCaret: false, + }); + }); + + test('should recurse into focused iframe input', async ({ page }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + const frame = page.frameLocator('#editable-iframe'); + + await frame.locator('#iframe-text-input').click(); + + await expect(webPage.getFocusedInputCapability()).resolves.toEqual({ + kind: 'native-input', + supportsClear: true, + supportsCaret: true, + }); + }); + + test('should mark focused shadow host as unknown shadow root', async ({ + page, + }) => { + await 
page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await page.locator('#shadow-host').click(); + + await expect(webPage.getFocusedInputCapability()).resolves.toEqual({ + kind: 'unknown-shadow-root', + supportsClear: 'unknown', + supportsCaret: false, + }); + }); +}); + +async function expectFocusedInputCapability( + page: Page, + webPage: PlaywrightWebPage, + selector: string, + expected: FocusedInputCapability, +) { + await page.locator(selector).click(); + await expect(webPage.getFocusedInputCapability()).resolves.toEqual(expected); +} diff --git a/packages/web-integration/tests/ai/web/playwright/focused-input-caret.spec.ts b/packages/web-integration/tests/ai/web/playwright/focused-input-caret.spec.ts new file mode 100644 index 0000000000..739fae82d5 --- /dev/null +++ b/packages/web-integration/tests/ai/web/playwright/focused-input-caret.spec.ts @@ -0,0 +1,100 @@ +import { join } from 'node:path'; +import { PlaywrightWebPage } from '@/playwright'; +import { expect, test } from '@playwright/test'; + +const fixtureUrl = `file://${join( + __dirname, + '../../fixtures/read-write-inputs.html', +)}`; + +test.describe('focused input caret', () => { + test('should move native input caret to start and end', async ({ page }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await page.locator('#text-input').click(); + await webPage.setFocusedInputCaret('start'); + await expect(page.locator('#text-input')).toHaveJSProperty( + 'selectionStart', + 0, + ); + await expect(page.locator('#text-input')).toHaveJSProperty( + 'selectionEnd', + 0, + ); + + await webPage.setFocusedInputCaret('end'); + await expect(page.locator('#text-input')).toHaveJSProperty( + 'selectionStart', + 'text value'.length, + ); + await expect(page.locator('#text-input')).toHaveJSProperty( + 'selectionEnd', + 'text value'.length, + ); + }); + + test('should move textarea caret to start and end', async ({ page }) => { + await page.goto(fixtureUrl); 
+ const webPage = new PlaywrightWebPage(page); + + await page.locator('#textarea-input').click(); + await webPage.setFocusedInputCaret('start'); + await expect(page.locator('#textarea-input')).toHaveJSProperty( + 'selectionStart', + 0, + ); + await expect(page.locator('#textarea-input')).toHaveJSProperty( + 'selectionEnd', + 0, + ); + + await webPage.setFocusedInputCaret('end'); + await expect(page.locator('#textarea-input')).toHaveJSProperty( + 'selectionStart', + 'textarea value'.length, + ); + await expect(page.locator('#textarea-input')).toHaveJSProperty( + 'selectionEnd', + 'textarea value'.length, + ); + }); + + test('should move caret inside focused iframe input', async ({ page }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + const frame = page.frameLocator('#editable-iframe'); + const input = frame.locator('#iframe-text-input'); + + await input.click(); + await webPage.setFocusedInputCaret('start'); + await expect(input).toHaveJSProperty('selectionStart', 0); + await expect(input).toHaveJSProperty('selectionEnd', 0); + + await webPage.setFocusedInputCaret('end'); + await expect(input).toHaveJSProperty( + 'selectionStart', + 'iframe text value'.length, + ); + await expect(input).toHaveJSProperty( + 'selectionEnd', + 'iframe text value'.length, + ); + }); + + test('should not throw when focused element does not support caret movement', async ({ + page, + }) => { + await page.goto(fixtureUrl); + const webPage = new PlaywrightWebPage(page); + + await page.locator('#editable-div').click(); + await expect(webPage.setFocusedInputCaret('end')).resolves.toBeUndefined(); + + await page.locator('#fake-input').click(); + await expect(webPage.setFocusedInputCaret('end')).resolves.toBeUndefined(); + + await page.locator('#shadow-host').click(); + await expect(webPage.setFocusedInputCaret('end')).resolves.toBeUndefined(); + }); +}); diff --git a/packages/web-integration/tests/ai/web/puppeteer/clear-input-keyboard-shortcut.test.ts 
b/packages/web-integration/tests/ai/web/puppeteer/clear-input-keyboard-shortcut.test.ts new file mode 100644 index 0000000000..f5c9b07257 --- /dev/null +++ b/packages/web-integration/tests/ai/web/puppeteer/clear-input-keyboard-shortcut.test.ts @@ -0,0 +1,60 @@ +import puppeteer from 'puppeteer'; +import { describe, expect, it } from 'vitest'; + +describe.skipIf(process.platform !== 'darwin')( + 'Puppeteer macOS input clearing', + () => { + it('Meta+A Backspace does not reliably clear focused input content', async () => { + const browser = await puppeteer.launch({ + headless: true, + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + const page = await browser.newPage(); + + try { + await page.setContent(` + + + + + + + `); + + const input = await page.$('#test-input'); + if (!input) { + throw new Error('test input not found'); + } + + await input.click(); + await page.keyboard.down('Meta'); + await page.keyboard.press('a'); + await page.keyboard.up('Meta'); + await page.keyboard.press('Backspace'); + + const valueAfterKeyboardClear = await page.$eval( + '#test-input', + (element) => (element as HTMLInputElement).value, + ); + + expect(valueAfterKeyboardClear).not.toBe(''); + + await page.$eval('#test-input', (element) => { + (element as HTMLInputElement).value = 'hello'; + }); + + await input.click({ count: 3 }); + await page.keyboard.press('Backspace'); + + const valueAfterTripleClickClear = await page.$eval( + '#test-input', + (element) => (element as HTMLInputElement).value, + ); + + expect(valueAfterTripleClickClear).toBe(''); + } finally { + await browser.close(); + } + }); + }, +); diff --git a/packages/web-integration/tests/unit-test/yaml/input-mode-typeonly.test.ts b/packages/web-integration/tests/unit-test/yaml/input-mode-typeonly.test.ts index 8e8b0a2e62..1f18a7426f 100644 --- a/packages/web-integration/tests/unit-test/yaml/input-mode-typeonly.test.ts +++ b/packages/web-integration/tests/unit-test/yaml/input-mode-typeonly.test.ts @@ -1,133 +1,191 
@@ -import { commonWebActionsForWebPage } from '@/web-page'; +import { + commonWebActionsForWebPage, + createWebInputPrimitives, +} from '@/web-page'; +import { defineActionInput } from '@midscene/core/device'; import { describe, expect, test, vi } from 'vitest'; -describe('Input action typeOnly mode', () => { - test('typeOnly mode should click to focus but not clear input', async () => { - const clearInputMock = vi.fn(); - const mouseClickMock = vi.fn(); - const keyboardTypeMock = vi.fn(); - - // Create a mock page object - const mockPage = { - clearInput: clearInputMock, - mouse: { - click: mouseClickMock, - move: vi.fn(), - wheel: vi.fn(), - drag: vi.fn(), - }, - keyboard: { - type: keyboardTypeMock, - press: vi.fn(), - }, - } as any; - - // Get actions from commonWebActionsForWebPage - const actions = commonWebActionsForWebPage(mockPage, false); +describe('Input action mode and caret behavior', () => { + const target = { center: [100, 200] }; + const capability = { + kind: 'native-input', + supportsClear: true, + supportsCaret: true, + }; + + const createMockPage = () => ({ + mouse: { + click: vi.fn(), + move: vi.fn(), + wheel: vi.fn(), + drag: vi.fn(), + }, + keyboard: { + type: vi.fn(), + press: vi.fn(), + }, + getFocusedInputCapability: vi.fn().mockResolvedValue(capability), + setFocusedInputCaret: vi.fn(), + clearInput: vi.fn(), + flushPendingVisualUpdate: vi.fn(), + }); - // Find the Input action + test('typeOnly without caret should focus and type without moving caret', async () => { + const mockPage = createMockPage(); + const actions = commonWebActionsForWebPage(mockPage as any, false); const inputAction = actions.find((a) => a.name === 'Input'); - expect(inputAction).toBeDefined(); - // Test with mode = 'typeOnly' await inputAction!.call( { value: 'new text', - locate: { center: [100, 200] }, + locate: target, mode: 'typeOnly', }, {} as any, ); - // Verify: clearInput should NOT be called - expect(clearInputMock).not.toHaveBeenCalled(); - - // Verify: 
mouse.click should be called to focus the element - expect(mouseClickMock).toHaveBeenCalledTimes(1); - expect(mouseClickMock).toHaveBeenCalledWith(100, 200, { button: 'left' }); - - // Verify: keyboard.type should be called with the value - expect(keyboardTypeMock).toHaveBeenCalledWith('new text'); + expect(mockPage.mouse.click).toHaveBeenCalledWith(100, 200, { + button: 'left', + }); + expect(mockPage.getFocusedInputCapability).not.toHaveBeenCalled(); + expect(mockPage.setFocusedInputCaret).not.toHaveBeenCalled(); + expect(mockPage.clearInput).not.toHaveBeenCalled(); + expect(mockPage.keyboard.press).not.toHaveBeenCalled(); + expect(mockPage.keyboard.type).toHaveBeenCalledWith('new text'); }); - test('replace mode should clear input', async () => { - const clearInputMock = vi.fn(); - const mouseClickMock = vi.fn(); - const keyboardTypeMock = vi.fn(); + test('typeOnly with caret should focus, move caret, and type', async () => { + const mockPage = createMockPage(); + const actions = commonWebActionsForWebPage(mockPage as any, false); + const inputAction = actions.find((a) => a.name === 'Input'); - const mockPage = { - clearInput: clearInputMock, - mouse: { - click: mouseClickMock, - move: vi.fn(), - wheel: vi.fn(), - drag: vi.fn(), - }, - keyboard: { - type: keyboardTypeMock, - press: vi.fn(), + await inputAction!.call( + { + value: 'new text', + locate: target, + mode: 'typeOnly', + caret: 'end', }, - } as any; + {} as any, + ); + + expect(mockPage.mouse.click).toHaveBeenCalledWith(100, 200, { + button: 'left', + }); + expect(mockPage.getFocusedInputCapability).toHaveBeenCalledTimes(1); + expect(mockPage.setFocusedInputCaret).toHaveBeenCalledWith( + 'end', + capability, + ); + expect(mockPage.clearInput).not.toHaveBeenCalled(); + expect(mockPage.keyboard.type).toHaveBeenCalledWith('new text'); + }); - const actions = commonWebActionsForWebPage(mockPage, false); + test('replace mode should focus, clear, and type', async () => { + const mockPage = createMockPage(); 
+ const actions = commonWebActionsForWebPage(mockPage as any, false); const inputAction = actions.find((a) => a.name === 'Input'); - // Test with mode = 'replace' (default) await inputAction!.call( { value: 'replaced text', - locate: { center: [100, 200] }, + locate: target, mode: 'replace', }, {} as any, ); - // Verify: clearInput should be called - expect(clearInputMock).toHaveBeenCalledTimes(1); - - // Verify: direct mouse.click should NOT be called (clearInput handles focusing) - expect(mouseClickMock).not.toHaveBeenCalled(); - - // Verify: keyboard.type should be called - expect(keyboardTypeMock).toHaveBeenCalledWith('replaced text'); + expect(mockPage.mouse.click).toHaveBeenCalledWith(100, 200, { + button: 'left', + }); + expect(mockPage.getFocusedInputCapability).toHaveBeenCalledTimes(1); + expect(mockPage.clearInput).toHaveBeenCalledWith(target, capability); + expect(mockPage.setFocusedInputCaret).not.toHaveBeenCalled(); + expect(mockPage.keyboard.type).toHaveBeenCalledWith('replaced text'); }); - test('clear mode should only clear without typing', async () => { - const clearInputMock = vi.fn(); - const mouseClickMock = vi.fn(); - const keyboardTypeMock = vi.fn(); - - const mockPage = { - clearInput: clearInputMock, - mouse: { - click: mouseClickMock, - move: vi.fn(), - wheel: vi.fn(), - drag: vi.fn(), - }, - keyboard: { - type: keyboardTypeMock, - press: vi.fn(), - }, - } as any; - - const actions = commonWebActionsForWebPage(mockPage, false); + test('clear mode should clear without typing', async () => { + const mockPage = createMockPage(); + const actions = commonWebActionsForWebPage(mockPage as any, false); const inputAction = actions.find((a) => a.name === 'Input'); - // Test with mode = 'clear' await inputAction!.call( { value: 'this should not be typed', - locate: { center: [100, 200] }, + locate: target, mode: 'clear', }, {} as any, ); - // Verify: clearInput should be called - expect(clearInputMock).toHaveBeenCalledTimes(1); + 
expect(mockPage.mouse.click).not.toHaveBeenCalled(); + expect(mockPage.getFocusedInputCapability).not.toHaveBeenCalled(); + expect(mockPage.clearInput).toHaveBeenCalledWith(target); + expect(mockPage.keyboard.type).not.toHaveBeenCalled(); + }); + + test('Web Input action schema should expose caret', () => { + const actions = commonWebActionsForWebPage(createMockPage() as any, false); + const inputAction = actions.find((a) => a.name === 'Input'); + const schemaShape = (inputAction!.paramSchema as any).shape; + + expect(schemaShape.caret).toBeDefined(); + expect(inputAction!.paramSchema.safeParse({ value: 'text' }).success).toBe( + true, + ); + expect( + inputAction!.paramSchema.safeParse({ + value: 'text', + mode: 'typeOnly', + caret: 'end', + }).success, + ).toBe(true); + }); + + test('default Input action schema should not expose caret', () => { + const inputAction = defineActionInput({ + clearInput: async () => {}, + typeText: async () => {}, + keyboardPress: async () => {}, + }); + const schemaShape = (inputAction.paramSchema as any).shape; - // Verify: keyboard.type should NOT be called - expect(keyboardTypeMock).not.toHaveBeenCalled(); + expect(schemaShape.caret).toBeUndefined(); + }); +}); + +describe('Web input primitives', () => { + test('typeText should pass caret to the focused page capability flow', async () => { + const capability = { + kind: 'native-textarea', + supportsClear: true, + supportsCaret: true, + }; + const mockPage = { + mouse: { click: vi.fn() }, + keyboard: { type: vi.fn(), press: vi.fn() }, + getFocusedInputCapability: vi.fn().mockResolvedValue(capability), + setFocusedInputCaret: vi.fn(), + clearInput: vi.fn(), + flushPendingVisualUpdate: vi.fn(), + }; + const input = createWebInputPrimitives(mockPage as any); + + await input.keyboard.typeText('abc', { + target: { center: [10, 20] } as any, + replace: false, + caret: 'start', + }); + + expect(mockPage.mouse.click).toHaveBeenCalledWith(10, 20, { + button: 'left', + }); + 
expect(mockPage.getFocusedInputCapability).toHaveBeenCalledTimes(1); + expect(mockPage.setFocusedInputCaret).toHaveBeenCalledWith( + 'start', + capability, + ); + expect(mockPage.clearInput).not.toHaveBeenCalled(); + expect(mockPage.keyboard.type).toHaveBeenCalledWith('abc'); }); }); diff --git a/packages/web-integration/tests/unit-test/yaml/player.test.ts b/packages/web-integration/tests/unit-test/yaml/player.test.ts index 3c5eeba94c..2374098f37 100644 --- a/packages/web-integration/tests/unit-test/yaml/player.test.ts +++ b/packages/web-integration/tests/unit-test/yaml/player.test.ts @@ -856,6 +856,69 @@ tasks: ).toHaveLength(0); }); + test('aiInput should pass caret through to Input action', async () => { + const yamlString = ` +target: + url: "https://example.com" +tasks: + - name: input_with_caret + flow: + - aiInput: 'append input' + value: ' tail' + mode: 'typeOnly' + caret: 'end' + - aiInput: 'prefix' + locate: 'legacy input' + mode: 'typeOnly' + caret: 'start' +`; + + const script = parseYamlScript(yamlString); + const mockAgent = await getMockAgent(); + const player = new ScriptPlayer( + script, + async () => mockAgent, + ); + + await player.run(); + + expect(player.errorInSetup).toBeUndefined(); + expect(player.status).toBe('done'); + expect( + (mockAgent.agent.callActionInActionSpace as MockedFunction).mock + .calls, + ).toEqual([ + [ + 'Input', + { + locate: { + cacheable: true, + deepLocate: false, + prompt: 'append input', + xpath: undefined, + }, + value: ' tail', + mode: 'typeOnly', + caret: 'end', + }, + ], + [ + 'Input', + { + locate: { + cacheable: true, + deepLocate: false, + prompt: 'legacy input', + xpath: undefined, + }, + value: 'prefix', + mode: 'typeOnly', + caret: 'start', + }, + ], + ]); + }); + test('aiScroll without locate keeps global scroll semantics', async () => { const yamlString = ` android: