chore: turn vision into capability (#679)

Fixes https://github.com/microsoft/playwright-mcp/issues/420
This commit is contained in:
Pavel Feldman
2025-07-16 16:40:00 -07:00
committed by GitHub
parent 012c906500
commit d61aa16fee
23 changed files with 366 additions and 575 deletions

View File

@@ -15,7 +15,7 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const close = defineTool({
capability: 'core',
@@ -38,7 +38,7 @@ const close = defineTool({
},
});
const resize: ToolFactory = captureSnapshot => defineTool({
const resize = defineTool({
capability: 'core',
schema: {
name: 'browser_resize',
@@ -66,13 +66,13 @@ const resize: ToolFactory = captureSnapshot => defineTool({
return {
code,
action,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: true
};
},
});
export default (captureSnapshot: boolean) => [
export default [
close,
resize(captureSnapshot)
resize
];

View File

@@ -15,9 +15,9 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const handleDialog: ToolFactory = captureSnapshot => defineTool({
const handleDialog = defineTool({
capability: 'core',
schema: {
@@ -49,7 +49,7 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
@@ -57,6 +57,6 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({
clearsModalState: 'dialog',
});
export default (captureSnapshot: boolean) => [
handleDialog(captureSnapshot),
export default [
handleDialog,
];

View File

@@ -15,10 +15,10 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const uploadFile: ToolFactory = captureSnapshot => defineTool({
capability: 'files',
const uploadFile = defineTool({
capability: 'core',
schema: {
name: 'browser_file_upload',
@@ -47,13 +47,13 @@ const uploadFile: ToolFactory = captureSnapshot => defineTool({
return {
code,
action,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: true,
};
},
clearsModalState: 'fileChooser',
});
export default (captureSnapshot: boolean) => [
uploadFile(captureSnapshot),
export default [
uploadFile,
];

View File

@@ -23,7 +23,7 @@ import { defineTool } from './tool.js';
import { fileURLToPath } from 'node:url';
const install = defineTool({
capability: 'install',
capability: 'core-install',
schema: {
name: 'browser_install',
title: 'Install the browser specified in the config',

View File

@@ -15,9 +15,13 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
const pressKey: ToolFactory = captureSnapshot => defineTool({
import { defineTool } from './tool.js';
import { elementSchema } from './snapshot.js';
import { generateLocator } from './utils.js';
import * as javascript from '../javascript.js';
const pressKey = defineTool({
capability: 'core',
schema: {
@@ -43,12 +47,61 @@ const pressKey: ToolFactory = captureSnapshot => defineTool({
return {
code,
action,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: true
};
},
});
export default (captureSnapshot: boolean) => [
pressKey(captureSnapshot),
// Input schema for the browser_type tool: the fields of elementSchema (imported from
// ./snapshot.js) plus the text to enter and two optional flags controlling how it is entered.
const typeSchema = elementSchema.extend({
text: z.string().describe('Text to type into the element'),
submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
});
/**
 * `browser_type` tool: enters text into the element addressed by a snapshot ref.
 *
 * The handler resolves a locator from the current tab's snapshot, then builds two
 * parallel lists: the Playwright code lines reported back to the caller and the
 * deferred steps that actually perform the input. With `slowly` the text is pressed
 * sequentially; otherwise it is filled in one call. With `submit` an Enter key press
 * is appended. The returned action runs the steps strictly one after another.
 */
const type = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_type',
    title: 'Type text',
    description: 'Type text into editable element',
    inputSchema: typeSchema,
    type: 'destructive',
  },

  handle: async (context, params) => {
    const { text, element, submit, slowly } = params;
    const target = context.currentTabOrDie().snapshotOrDie().refLocator(params);

    const generated: string[] = [];
    const pending: (() => Promise<void>)[] = [];

    if (!slowly) {
      // Default path: replace the field content in a single fill.
      generated.push(`// Fill "${text}" into "${element}"`);
      generated.push(`await page.${await generateLocator(target)}.fill(${javascript.quote(text)});`);
      pending.push(() => target.fill(text));
    } else {
      // Character-by-character path.
      generated.push(`// Press "${text}" sequentially into "${element}"`);
      generated.push(`await page.${await generateLocator(target)}.pressSequentially(${javascript.quote(text)});`);
      pending.push(() => target.pressSequentially(text));
    }

    if (submit) {
      generated.push(`// Submit text`);
      generated.push(`await page.${await generateLocator(target)}.press('Enter');`);
      pending.push(() => target.press('Enter'));
    }

    // Chain the steps onto an already-resolved promise so each starts only after the previous one settles.
    const action = () => {
      let chain: Promise<void> = Promise.resolve();
      for (const run of pending)
        chain = chain.then(run);
      return chain;
    };

    return {
      code: generated,
      action,
      captureSnapshot: true,
      waitForNetwork: true,
    };
  },
});
export default [
pressKey,
type,
];

View File

@@ -17,50 +17,14 @@
import { z } from 'zod';
import { defineTool } from './tool.js';
import * as javascript from '../javascript.js';
// Base input schema for the coordinate-based mouse tools in this file; each tool
// extends it with its own coordinate fields.
const elementSchema = z.object({
element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
});
const screenshot = defineTool({
capability: 'core',
const mouseMove = defineTool({
capability: 'vision',
schema: {
name: 'browser_screen_capture',
title: 'Take a screenshot',
description: 'Take a screenshot of the current page',
inputSchema: z.object({}),
type: 'readOnly',
},
handle: async context => {
const tab = await context.ensureTab();
const options = { type: 'jpeg' as 'jpeg', quality: 50, scale: 'css' as 'css' };
const code = [
`// Take a screenshot of the current page`,
`await page.screenshot(${javascript.formatObject(options)});`,
];
const action = () => tab.page.screenshot(options).then(buffer => {
return {
content: [{ type: 'image' as 'image', data: buffer.toString('base64'), mimeType: 'image/jpeg' }],
};
});
return {
code,
action,
captureSnapshot: false,
waitForNetwork: false
};
},
});
const moveMouse = defineTool({
capability: 'core',
schema: {
name: 'browser_screen_move_mouse',
name: 'browser_mouse_move_xy',
title: 'Move mouse',
description: 'Move mouse to a given position',
inputSchema: elementSchema.extend({
@@ -86,12 +50,12 @@ const moveMouse = defineTool({
},
});
const click = defineTool({
capability: 'core',
const mouseClick = defineTool({
capability: 'vision',
schema: {
name: 'browser_screen_click',
name: 'browser_mouse_click_xy',
title: 'Click',
description: 'Click left mouse button',
description: 'Click left mouse button at a given position',
inputSchema: elementSchema.extend({
x: z.number().describe('X coordinate'),
y: z.number().describe('Y coordinate'),
@@ -121,12 +85,12 @@ const click = defineTool({
},
});
const drag = defineTool({
capability: 'core',
const mouseDrag = defineTool({
capability: 'vision',
schema: {
name: 'browser_screen_drag',
name: 'browser_mouse_drag_xy',
title: 'Drag mouse',
description: 'Drag left mouse button',
description: 'Drag left mouse button to a given position',
inputSchema: elementSchema.extend({
startX: z.number().describe('Start X coordinate'),
startY: z.number().describe('Start Y coordinate'),
@@ -163,51 +127,8 @@ const drag = defineTool({
},
});
/**
 * `browser_screen_type` tool: types text through the page keyboard and, when
 * `submit` is set, presses Enter afterwards.
 *
 * The handler requires a current tab and returns the generated Playwright code
 * together with the action that performs the typing. No snapshot is captured
 * after the action; network activity is waited for.
 */
const type = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_screen_type',
    title: 'Type text',
    description: 'Type text',
    inputSchema: z.object({
      text: z.string().describe('Text to type into the element'),
      submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
    }),
    type: 'destructive',
  },

  handle: async (context, params) => {
    const tab = context.currentTabOrDie();

    const code = [
      `// Type ${params.text}`,
      // Emit an escaped string literal: interpolating the raw text between quotes
      // produced broken code whenever the text contained a quote, backslash or newline.
      `await page.keyboard.type(${javascript.quote(params.text)});`,
    ];
    if (params.submit) {
      code.push(`// Submit text`);
      code.push(`await page.keyboard.press('Enter');`);
    }

    const action = async () => {
      await tab.page.keyboard.type(params.text);
      if (params.submit)
        await tab.page.keyboard.press('Enter');
    };

    return {
      code,
      action,
      captureSnapshot: false,
      waitForNetwork: true,
    };
  },
});
export default [
screenshot,
moveMouse,
click,
drag,
type,
mouseMove,
mouseClick,
mouseDrag,
];

View File

@@ -15,9 +15,9 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const navigate: ToolFactory = captureSnapshot => defineTool({
const navigate = defineTool({
capability: 'core',
schema: {
@@ -41,14 +41,14 @@ const navigate: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
const goBack: ToolFactory = captureSnapshot => defineTool({
capability: 'history',
const goBack = defineTool({
capability: 'core',
schema: {
name: 'browser_navigate_back',
title: 'Go back',
@@ -67,14 +67,14 @@ const goBack: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
const goForward: ToolFactory = captureSnapshot => defineTool({
capability: 'history',
const goForward = defineTool({
capability: 'core',
schema: {
name: 'browser_navigate_forward',
title: 'Go forward',
@@ -91,14 +91,14 @@ const goForward: ToolFactory = captureSnapshot => defineTool({
];
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
export default (captureSnapshot: boolean) => [
navigate(captureSnapshot),
goBack(captureSnapshot),
goForward(captureSnapshot),
export default [
navigate,
goBack,
goForward,
];

View File

@@ -79,7 +79,7 @@ const screenshot = defineTool({
return {
code,
action,
captureSnapshot: true,
captureSnapshot: false,
waitForNetwork: false,
};
}

View File

@@ -41,7 +41,7 @@ const snapshot = defineTool({
},
});
const elementSchema = z.object({
export const elementSchema = z.object({
element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
ref: z.string().describe('Exact target element reference from the page snapshot'),
});
@@ -144,54 +144,6 @@ const hover = defineTool({
},
});
// Input schema for the browser_type tool: the element/ref fields of elementSchema
// plus the text to enter and two optional flags controlling how it is entered.
const typeSchema = elementSchema.extend({
text: z.string().describe('Text to type into the element'),
submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
});
/**
 * `browser_type` tool: enters text into the element addressed by a snapshot ref.
 *
 * The handler resolves a locator from the current tab's snapshot, then collects the
 * Playwright code lines to report and the deferred steps that perform the input.
 * `slowly` selects sequential key presses instead of a single fill; `submit`
 * appends an Enter key press. The returned action runs the steps in order.
 */
const type = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_type',
    title: 'Type text',
    description: 'Type text into editable element',
    inputSchema: typeSchema,
    type: 'destructive',
  },

  handle: async (context, params) => {
    const { text, element, submit, slowly } = params;
    const target = context.currentTabOrDie().snapshotOrDie().refLocator(params);

    const generated: string[] = [];
    const pending: (() => Promise<void>)[] = [];

    if (!slowly) {
      // Default path: replace the field content in a single fill.
      generated.push(`// Fill "${text}" into "${element}"`);
      generated.push(`await page.${await generateLocator(target)}.fill(${javascript.quote(text)});`);
      pending.push(() => target.fill(text));
    } else {
      // Character-by-character path.
      generated.push(`// Press "${text}" sequentially into "${element}"`);
      generated.push(`await page.${await generateLocator(target)}.pressSequentially(${javascript.quote(text)});`);
      pending.push(() => target.pressSequentially(text));
    }

    if (submit) {
      generated.push(`// Submit text`);
      generated.push(`await page.${await generateLocator(target)}.press('Enter');`);
      pending.push(() => target.press('Enter'));
    }

    // Chain the steps onto an already-resolved promise so each starts only after the previous one settles.
    const action = () => {
      let chain: Promise<void> = Promise.resolve();
      for (const run of pending)
        chain = chain.then(run);
      return chain;
    };

    return {
      code: generated,
      action,
      captureSnapshot: true,
      waitForNetwork: true,
    };
  },
});
const selectOptionSchema = elementSchema.extend({
values: z.array(z.string()).describe('Array of values to select in the dropdown. This can be a single value or multiple values.'),
});
@@ -229,6 +181,5 @@ export default [
click,
drag,
hover,
type,
selectOption,
];

View File

@@ -15,10 +15,10 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const listTabs = defineTool({
capability: 'tabs',
capability: 'core-tabs',
schema: {
name: 'browser_tab_list',
@@ -44,8 +44,8 @@ const listTabs = defineTool({
},
});
const selectTab: ToolFactory = captureSnapshot => defineTool({
capability: 'tabs',
const selectTab = defineTool({
capability: 'core-tabs',
schema: {
name: 'browser_tab_select',
@@ -65,14 +65,14 @@ const selectTab: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false
};
},
});
const newTab: ToolFactory = captureSnapshot => defineTool({
capability: 'tabs',
const newTab = defineTool({
capability: 'core-tabs',
schema: {
name: 'browser_tab_new',
@@ -94,14 +94,14 @@ const newTab: ToolFactory = captureSnapshot => defineTool({
];
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false
};
},
});
const closeTab: ToolFactory = captureSnapshot => defineTool({
capability: 'tabs',
const closeTab = defineTool({
capability: 'core-tabs',
schema: {
name: 'browser_tab_close',
@@ -120,15 +120,15 @@ const closeTab: ToolFactory = captureSnapshot => defineTool({
];
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false
};
},
});
export default (captureSnapshot: boolean) => [
export default [
listTabs,
newTab(captureSnapshot),
selectTab(captureSnapshot),
closeTab(captureSnapshot),
newTab,
selectTab,
closeTab,
];

View File

@@ -61,8 +61,6 @@ export type Tool<Input extends InputType = InputType> = {
handle: (context: Context, params: z.output<Input>) => Promise<ToolResult>;
};
export type ToolFactory = (snapshot: boolean) => Tool<any>;
/**
 * Identity helper for declaring a tool: returns the given object unchanged.
 *
 * The generic parameter links the tool's input schema type to the `params`
 * argument of its `handle` callback (typed as `z.output<Input>`).
 *
 * @param tool The tool definition.
 * @returns The same `tool` object.
 */
export function defineTool<Input extends InputType>(tool: Tool<Input>): Tool<Input> {
return tool;
}

View File

@@ -15,10 +15,10 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const wait: ToolFactory = captureSnapshot => defineTool({
capability: 'wait',
const wait = defineTool({
capability: 'core',
schema: {
name: 'browser_wait_for',
@@ -40,7 +40,7 @@ const wait: ToolFactory = captureSnapshot => defineTool({
if (params.time) {
code.push(`await new Promise(f => setTimeout(f, ${params.time!} * 1000));`);
await new Promise(f => setTimeout(f, Math.min(10000, params.time! * 1000)));
await new Promise(f => setTimeout(f, Math.min(30000, params.time! * 1000)));
}
const tab = context.currentTabOrDie();
@@ -59,12 +59,12 @@ const wait: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
export default (captureSnapshot: boolean) => [
wait(captureSnapshot),
export default [
wait,
];