chore: turn vision into capability (#679)

Fixes https://github.com/microsoft/playwright-mcp/issues/420
This commit is contained in:
Pavel Feldman
2025-07-16 16:40:00 -07:00
committed by GitHub
parent 012c906500
commit d61aa16fee
23 changed files with 366 additions and 575 deletions

View File

@@ -49,7 +49,6 @@ export type CLIOptions = {
userAgent?: string;
userDataDir?: string;
viewportSize?: string;
vision?: boolean;
};
const defaultConfig: FullConfig = {
@@ -185,7 +184,6 @@ export async function configFromCLIOptions(cliOptions: CLIOptions): Promise<Conf
host: cliOptions.host,
},
capabilities: cliOptions.caps?.split(',').map((c: string) => c.trim() as ToolCapability),
vision: !!cliOptions.vision,
network: {
allowedOrigins: cliOptions.allowedOrigins,
blockedOrigins: cliOptions.blockedOrigins,

View File

@@ -19,7 +19,7 @@ import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from '
import { zodToJsonSchema } from 'zod-to-json-schema';
import { Context } from './context.js';
import { snapshotTools, visionTools } from './tools.js';
import { allTools } from './tools.js';
import { packageJSON } from './package.js';
import { FullConfig } from './config.js';
@@ -27,8 +27,7 @@ import { FullConfig } from './config.js';
import type { BrowserContextFactory } from './browserContextFactory.js';
export function createConnection(config: FullConfig, browserContextFactory: BrowserContextFactory): Connection {
const allTools = config.vision ? visionTools : snapshotTools;
const tools = allTools.filter(tool => !config.capabilities || tool.capability === 'core' || config.capabilities.includes(tool.capability));
const tools = allTools.filter(tool => tool.capability.startsWith('core') || config.capabilities?.includes(tool.capability));
const context = new Context(tools, config, browserContextFactory);
const server = new McpServer({ name: 'Playwright', version: packageJSON.version }, {
capabilities: {

View File

@@ -14,7 +14,7 @@
* limitations under the License.
*/
import { program } from 'commander';
import { program, Option } from 'commander';
// @ts-ignore
import { startTraceViewerServer } from 'playwright-core/lib/server';
@@ -31,7 +31,7 @@ program
.option('--block-service-workers', 'block service workers')
.option('--browser <browser>', 'browser or chrome channel to use, possible values: chrome, firefox, webkit, msedge.')
.option('--browser-agent <endpoint>', 'Use browser agent (experimental).')
.option('--caps <caps>', 'comma-separated list of capabilities to enable, possible values: tabs, pdf, history, wait, files, install. Default is all.')
.option('--caps <caps>', 'comma-separated list of additional capabilities to enable, possible values: vision, pdf.')
.option('--cdp-endpoint <endpoint>', 'CDP endpoint to connect to.')
.option('--config <path>', 'path to the configuration file.')
.option('--device <device>', 'device to emulate, for example: "iPhone 15"')
@@ -51,8 +51,13 @@ program
.option('--user-agent <ua string>', 'specify user agent string')
.option('--user-data-dir <path>', 'path to the user data directory. If not specified, a temporary directory will be created.')
.option('--viewport-size <size>', 'specify browser viewport size in pixels, for example "1280, 720"')
.option('--vision', 'Run server that uses screenshots (Aria snapshots are used by default)')
.addOption(new Option('--vision', 'Legacy option, use --caps=vision instead').hideHelp())
.action(async options => {
if (options.vision) {
  // eslint-disable-next-line no-console
  console.error('The --vision option is deprecated, use --caps=vision instead');
  // The legacy --vision flag maps onto the `vision` capability. Merge it into
  // any --caps value the user also passed instead of overwriting it, so that
  // e.g. `--vision --caps=pdf` still enables `pdf`.
  const requestedCaps: string[] = typeof options.caps === 'string'
    ? options.caps.split(',').map((cap: string) => cap.trim()).filter((cap: string) => cap.length > 0)
    : [];
  if (!requestedCaps.includes('vision'))
    requestedCaps.push('vision');
  options.caps = requestedCaps.join(',');
}
const config = await resolveCLIConfig(options);
const httpServer = config.server.port !== undefined ? await startHttpServer(config.server) : undefined;

View File

@@ -27,39 +27,25 @@ import pdf from './tools/pdf.js';
import snapshot from './tools/snapshot.js';
import tabs from './tools/tabs.js';
import screenshot from './tools/screenshot.js';
import vision from './tools/vision.js';
import wait from './tools/wait.js';
import mouse from './tools/mouse.js';
import type { Tool } from './tools/tool.js';
export const snapshotTools: Tool<any>[] = [
...common(true),
export const allTools: Tool<any>[] = [
...common,
...console,
...dialogs(true),
...dialogs,
...evaluate,
...files(true),
...files,
...install,
...keyboard(true),
...navigate(true),
...keyboard,
...navigate,
...network,
...mouse,
...pdf,
...screenshot,
...snapshot,
...tabs(true),
...wait(true),
];
export const visionTools: Tool<any>[] = [
...common(false),
...console,
...dialogs(false),
...files(false),
...install,
...keyboard(false),
...navigate(false),
...network,
...pdf,
...tabs(false),
...vision,
...wait(false),
...tabs,
...wait,
];

View File

@@ -15,7 +15,7 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const close = defineTool({
capability: 'core',
@@ -38,7 +38,7 @@ const close = defineTool({
},
});
const resize: ToolFactory = captureSnapshot => defineTool({
const resize = defineTool({
capability: 'core',
schema: {
name: 'browser_resize',
@@ -66,13 +66,13 @@ const resize: ToolFactory = captureSnapshot => defineTool({
return {
code,
action,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: true
};
},
});
export default (captureSnapshot: boolean) => [
export default [
close,
resize(captureSnapshot)
resize
];

View File

@@ -15,9 +15,9 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const handleDialog: ToolFactory = captureSnapshot => defineTool({
const handleDialog = defineTool({
capability: 'core',
schema: {
@@ -49,7 +49,7 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
@@ -57,6 +57,6 @@ const handleDialog: ToolFactory = captureSnapshot => defineTool({
clearsModalState: 'dialog',
});
export default (captureSnapshot: boolean) => [
handleDialog(captureSnapshot),
export default [
handleDialog,
];

View File

@@ -15,10 +15,10 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const uploadFile: ToolFactory = captureSnapshot => defineTool({
capability: 'files',
const uploadFile = defineTool({
capability: 'core',
schema: {
name: 'browser_file_upload',
@@ -47,13 +47,13 @@ const uploadFile: ToolFactory = captureSnapshot => defineTool({
return {
code,
action,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: true,
};
},
clearsModalState: 'fileChooser',
});
export default (captureSnapshot: boolean) => [
uploadFile(captureSnapshot),
export default [
uploadFile,
];

View File

@@ -23,7 +23,7 @@ import { defineTool } from './tool.js';
import { fileURLToPath } from 'node:url';
const install = defineTool({
capability: 'install',
capability: 'core-install',
schema: {
name: 'browser_install',
title: 'Install the browser specified in the config',

View File

@@ -15,9 +15,13 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
const pressKey: ToolFactory = captureSnapshot => defineTool({
import { defineTool } from './tool.js';
import { elementSchema } from './snapshot.js';
import { generateLocator } from './utils.js';
import * as javascript from '../javascript.js';
const pressKey = defineTool({
capability: 'core',
schema: {
@@ -43,12 +47,61 @@ const pressKey: ToolFactory = captureSnapshot => defineTool({
return {
code,
action,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: true
};
},
});
export default (captureSnapshot: boolean) => [
pressKey(captureSnapshot),
// Input schema for the `browser_type` tool: the fields of `elementSchema`
// plus the text to enter and two optional typing flags.
const typeSchema = elementSchema.extend({
text: z.string().describe('Text to type into the element'),
submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
});
/**
 * `browser_type` tool: enters text into the element resolved from the
 * current tab's snapshot, either with a single `fill` or one character at a
 * time (`slowly`), and optionally presses Enter afterwards (`submit`).
 */
const type = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_type',
    title: 'Type text',
    description: 'Type text into editable element',
    inputSchema: typeSchema,
    type: 'destructive',
  },

  handle: async (context, params) => {
    const target = context.currentTabOrDie().snapshotOrDie().refLocator(params);

    // Generated source lines and the matching runtime operations are
    // collected side by side so they always describe the same steps.
    const generated: string[] = [];
    const operations: (() => Promise<void>)[] = [];

    if (!params.slowly) {
      generated.push(`// Fill "${params.text}" into "${params.element}"`);
      generated.push(`await page.${await generateLocator(target)}.fill(${javascript.quote(params.text)});`);
      operations.push(() => target.fill(params.text));
    } else {
      generated.push(`// Press "${params.text}" sequentially into "${params.element}"`);
      generated.push(`await page.${await generateLocator(target)}.pressSequentially(${javascript.quote(params.text)});`);
      operations.push(() => target.pressSequentially(params.text));
    }

    if (params.submit) {
      generated.push(`// Submit text`);
      generated.push(`await page.${await generateLocator(target)}.press('Enter');`);
      operations.push(() => target.press('Enter'));
    }

    // Chain the operations onto a resolved promise so they run strictly in order.
    const runInOrder = (): Promise<void> => {
      let chain: Promise<void> = Promise.resolve();
      for (const operation of operations)
        chain = chain.then(operation);
      return chain;
    };

    return {
      code: generated,
      action: runInOrder,
      captureSnapshot: true,
      waitForNetwork: true,
    };
  },
});
// Tools exported by this module: the key-press tool and the `browser_type` tool.
export default [
pressKey,
type,
];

View File

@@ -17,50 +17,14 @@
import { z } from 'zod';
import { defineTool } from './tool.js';
import * as javascript from '../javascript.js';
// Base input for the mouse tools in this file, which extend it with their
// own fields: a human-readable description of the element being targeted.
const elementSchema = z.object({
element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
});
/**
 * `browser_screen_capture` tool: takes a JPEG screenshot (quality 50, CSS
 * scale) of the page in the tab returned by `context.ensureTab()` and
 * returns it as base64-encoded image content. No snapshot is captured and
 * network settling is not awaited.
 */
const screenshot = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_screen_capture',
    title: 'Take a screenshot',
    description: 'Take a screenshot of the current page',
    inputSchema: z.object({}),
    type: 'readOnly',
  },

  handle: async context => {
    const activeTab = await context.ensureTab();
    const screenshotOptions = { type: 'jpeg' as const, quality: 50, scale: 'css' as const };

    // Deferred so the screenshot is only taken when the caller runs the action.
    const capture = () => activeTab.page.screenshot(screenshotOptions).then(image => ({
      content: [{ type: 'image' as const, data: image.toString('base64'), mimeType: 'image/jpeg' }],
    }));

    return {
      code: [
        `// Take a screenshot of the current page`,
        `await page.screenshot(${javascript.formatObject(screenshotOptions)});`,
      ],
      action: capture,
      captureSnapshot: false,
      waitForNetwork: false
    };
  },
});
const moveMouse = defineTool({
capability: 'core',
schema: {
name: 'browser_screen_move_mouse',
name: 'browser_mouse_move_xy',
title: 'Move mouse',
description: 'Move mouse to a given position',
inputSchema: elementSchema.extend({
@@ -86,12 +50,12 @@ const moveMouse = defineTool({
},
});
const click = defineTool({
capability: 'core',
const mouseClick = defineTool({
capability: 'vision',
schema: {
name: 'browser_screen_click',
name: 'browser_mouse_click_xy',
title: 'Click',
description: 'Click left mouse button',
description: 'Click left mouse button at a given position',
inputSchema: elementSchema.extend({
x: z.number().describe('X coordinate'),
y: z.number().describe('Y coordinate'),
@@ -121,12 +85,12 @@ const click = defineTool({
},
});
const drag = defineTool({
capability: 'core',
const mouseDrag = defineTool({
capability: 'vision',
schema: {
name: 'browser_screen_drag',
name: 'browser_mouse_drag_xy',
title: 'Drag mouse',
description: 'Drag left mouse button',
description: 'Drag left mouse button to a given position',
inputSchema: elementSchema.extend({
startX: z.number().describe('Start X coordinate'),
startY: z.number().describe('Start Y coordinate'),
@@ -163,51 +127,8 @@ const drag = defineTool({
},
});
/**
 * `browser_screen_type` tool: types text through the page keyboard of the
 * current tab and optionally presses Enter afterwards (`submit`).
 *
 * The generated code quotes the text with `javascript.quote` rather than
 * splicing it between hand-written quotes, so text containing quotes,
 * backslashes or newlines still yields a valid string literal.
 */
const type = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_screen_type',
    title: 'Type text',
    description: 'Type text',
    inputSchema: z.object({
      text: z.string().describe('Text to type into the element'),
      submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
    }),
    type: 'destructive',
  },

  handle: async (context, params) => {
    const tab = context.currentTabOrDie();

    const code = [
      `// Type ${params.text}`,
      `await page.keyboard.type(${javascript.quote(params.text)});`,
    ];
    if (params.submit) {
      code.push(`// Submit text`);
      code.push(`await page.keyboard.press('Enter');`);
    }

    // Mirrors the generated code: type the text, then optionally press Enter.
    const action = async () => {
      await tab.page.keyboard.type(params.text);
      if (params.submit)
        await tab.page.keyboard.press('Enter');
    };

    return {
      code,
      action,
      captureSnapshot: false,
      waitForNetwork: true,
    };
  },
});
export default [
screenshot,
moveMouse,
click,
drag,
type,
mouseMove,
mouseClick,
mouseDrag,
];

View File

@@ -15,9 +15,9 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const navigate: ToolFactory = captureSnapshot => defineTool({
const navigate = defineTool({
capability: 'core',
schema: {
@@ -41,14 +41,14 @@ const navigate: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
const goBack: ToolFactory = captureSnapshot => defineTool({
capability: 'history',
const goBack = defineTool({
capability: 'core',
schema: {
name: 'browser_navigate_back',
title: 'Go back',
@@ -67,14 +67,14 @@ const goBack: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
const goForward: ToolFactory = captureSnapshot => defineTool({
capability: 'history',
const goForward = defineTool({
capability: 'core',
schema: {
name: 'browser_navigate_forward',
title: 'Go forward',
@@ -91,14 +91,14 @@ const goForward: ToolFactory = captureSnapshot => defineTool({
];
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
export default (captureSnapshot: boolean) => [
navigate(captureSnapshot),
goBack(captureSnapshot),
goForward(captureSnapshot),
export default [
navigate,
goBack,
goForward,
];

View File

@@ -79,7 +79,7 @@ const screenshot = defineTool({
return {
code,
action,
captureSnapshot: true,
captureSnapshot: false,
waitForNetwork: false,
};
}

View File

@@ -41,7 +41,7 @@ const snapshot = defineTool({
},
});
const elementSchema = z.object({
// Element target used by snapshot-based tools: a human-readable description
// plus the exact `ref` taken from the page snapshot. Exported so other tool
// modules can build their input schemas on top of it.
export const elementSchema = z.object({
element: z.string().describe('Human-readable element description used to obtain permission to interact with the element'),
ref: z.string().describe('Exact target element reference from the page snapshot'),
});
@@ -144,54 +144,6 @@ const hover = defineTool({
},
});
// Input schema for the `browser_type` tool: the fields of `elementSchema`
// plus the text to enter and two optional typing flags.
const typeSchema = elementSchema.extend({
text: z.string().describe('Text to type into the element'),
submit: z.boolean().optional().describe('Whether to submit entered text (press Enter after)'),
slowly: z.boolean().optional().describe('Whether to type one character at a time. Useful for triggering key handlers in the page. By default entire text is filled in at once.'),
});
/**
 * `browser_type` tool: enters text into the element resolved from the
 * current tab's snapshot, either with a single `fill` or one character at a
 * time (`slowly`), and optionally presses Enter afterwards (`submit`).
 */
const type = defineTool({
  capability: 'core',

  schema: {
    name: 'browser_type',
    title: 'Type text',
    description: 'Type text into editable element',
    inputSchema: typeSchema,
    type: 'destructive',
  },

  handle: async (context, params) => {
    const target = context.currentTabOrDie().snapshotOrDie().refLocator(params);

    // Generated source lines and the matching runtime operations are
    // collected side by side so they always describe the same steps.
    const generated: string[] = [];
    const operations: (() => Promise<void>)[] = [];

    if (!params.slowly) {
      generated.push(`// Fill "${params.text}" into "${params.element}"`);
      generated.push(`await page.${await generateLocator(target)}.fill(${javascript.quote(params.text)});`);
      operations.push(() => target.fill(params.text));
    } else {
      generated.push(`// Press "${params.text}" sequentially into "${params.element}"`);
      generated.push(`await page.${await generateLocator(target)}.pressSequentially(${javascript.quote(params.text)});`);
      operations.push(() => target.pressSequentially(params.text));
    }

    if (params.submit) {
      generated.push(`// Submit text`);
      generated.push(`await page.${await generateLocator(target)}.press('Enter');`);
      operations.push(() => target.press('Enter'));
    }

    // Chain the operations onto a resolved promise so they run strictly in order.
    const runInOrder = (): Promise<void> => {
      let chain: Promise<void> = Promise.resolve();
      for (const operation of operations)
        chain = chain.then(operation);
      return chain;
    };

    return {
      code: generated,
      action: runInOrder,
      captureSnapshot: true,
      waitForNetwork: true,
    };
  },
});
// Input schema for selecting dropdown options: the fields of `elementSchema`
// plus the list of values to select.
const selectOptionSchema = elementSchema.extend({
values: z.array(z.string()).describe('Array of values to select in the dropdown. This can be a single value or multiple values.'),
});
@@ -229,6 +181,5 @@ export default [
click,
drag,
hover,
type,
selectOption,
];

View File

@@ -15,10 +15,10 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const listTabs = defineTool({
capability: 'tabs',
capability: 'core-tabs',
schema: {
name: 'browser_tab_list',
@@ -44,8 +44,8 @@ const listTabs = defineTool({
},
});
const selectTab: ToolFactory = captureSnapshot => defineTool({
capability: 'tabs',
const selectTab = defineTool({
capability: 'core-tabs',
schema: {
name: 'browser_tab_select',
@@ -65,14 +65,14 @@ const selectTab: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false
};
},
});
const newTab: ToolFactory = captureSnapshot => defineTool({
capability: 'tabs',
const newTab = defineTool({
capability: 'core-tabs',
schema: {
name: 'browser_tab_new',
@@ -94,14 +94,14 @@ const newTab: ToolFactory = captureSnapshot => defineTool({
];
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false
};
},
});
const closeTab: ToolFactory = captureSnapshot => defineTool({
capability: 'tabs',
const closeTab = defineTool({
capability: 'core-tabs',
schema: {
name: 'browser_tab_close',
@@ -120,15 +120,15 @@ const closeTab: ToolFactory = captureSnapshot => defineTool({
];
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false
};
},
});
export default (captureSnapshot: boolean) => [
export default [
listTabs,
newTab(captureSnapshot),
selectTab(captureSnapshot),
closeTab(captureSnapshot),
newTab,
selectTab,
closeTab,
];

View File

@@ -61,8 +61,6 @@ export type Tool<Input extends InputType = InputType> = {
handle: (context: Context, params: z.output<Input>) => Promise<ToolResult>;
};
/** A function that builds a tool from a boolean `snapshot` flag. */
export type ToolFactory = (snapshot: boolean) => Tool<any>;
/**
 * Returns the given tool definition unchanged.
 *
 * The generic signature makes TypeScript check `tool` against `Tool<Input>`,
 * so `handle` receives params typed from the tool's own input schema.
 *
 * @param tool The tool definition.
 * @returns The same `tool` object.
 */
export function defineTool<Input extends InputType>(tool: Tool<Input>): Tool<Input> {
return tool;
}

View File

@@ -15,10 +15,10 @@
*/
import { z } from 'zod';
import { defineTool, type ToolFactory } from './tool.js';
import { defineTool } from './tool.js';
const wait: ToolFactory = captureSnapshot => defineTool({
capability: 'wait',
const wait = defineTool({
capability: 'core',
schema: {
name: 'browser_wait_for',
@@ -40,7 +40,7 @@ const wait: ToolFactory = captureSnapshot => defineTool({
if (params.time) {
code.push(`await new Promise(f => setTimeout(f, ${params.time!} * 1000));`);
await new Promise(f => setTimeout(f, Math.min(10000, params.time! * 1000)));
await new Promise(f => setTimeout(f, Math.min(30000, params.time! * 1000)));
}
const tab = context.currentTabOrDie();
@@ -59,12 +59,12 @@ const wait: ToolFactory = captureSnapshot => defineTool({
return {
code,
captureSnapshot,
captureSnapshot: true,
waitForNetwork: false,
};
},
});
export default (captureSnapshot: boolean) => [
wait(captureSnapshot),
export default [
wait,
];