chore: add eval script (#743)

This commit is contained in:
Pavel Feldman
2025-07-23 10:31:37 -07:00
committed by GitHub
parent 288f1b863b
commit 2c5eac89a8
8 changed files with 405 additions and 59 deletions

View File

@@ -17,7 +17,6 @@
import { Server as McpServer } from '@modelcontextprotocol/sdk/server/index.js';
import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from '@modelcontextprotocol/sdk/types.js';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { Context } from './context.js';
import { Response } from './response.js';
import { allTools } from './tools.js';

119
src/eval/loopClaude.ts Normal file
View File

@@ -0,0 +1,119 @@
/**
* Copyright (c) Microsoft Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import Anthropic from '@anthropic-ai/sdk';
import debug from 'debug';
import type { Tool, ImageContent, TextContent } from '@modelcontextprotocol/sdk/types.js';
import type { Client } from '@modelcontextprotocol/sdk/client/index.js';
// Anthropic model identifier used for every completion request in this loop.
const model = 'claude-sonnet-4-20250514';
/**
 * Drives a Claude tool-use agent loop against an MCP client.
 *
 * The model is prompted with the task and repeatedly offered the MCP tools.
 * The loop ends when the model replies with no tool calls (its text is
 * returned), when it invokes the "done" tool (its input is returned as
 * pretty-printed JSON), or after 5 iterations (throws).
 *
 * @param client connected MCP client whose tools are exposed to the model
 * @param task natural-language task description
 * @returns the model's final answer, or the "done" tool payload as JSON
 * @throws Error when the iteration budget is exhausted
 */
export async function runTask(client: Client, task: string): Promise<string | undefined> {
  const anthropic = new Anthropic();
  const history: Anthropic.Messages.MessageParam[] = [];
  const { tools } = await client.listTools();
  const claudeTools = tools.map(tool => asClaudeDeclaration(tool));

  // Seed the conversation with the task prompt.
  history.push({
    role: 'user',
    content: `Perform following task: ${task}.`
  });

  for (let attempt = 0; attempt < 5; ++attempt) {
    debug('history')(history);
    const response = await anthropic.messages.create({
      model,
      max_tokens: 10000,
      messages: history,
      tools: claudeTools,
    });

    const blocks = response.content;
    const toolUses = blocks.filter(block => block.type === 'tool_use');
    const texts = blocks.filter(block => block.type === 'text');

    // Record the assistant turn before deciding how to proceed.
    history.push({
      role: 'assistant',
      content: blocks
    });

    // No tool requests: the model considers the task finished — return its prose.
    if (!toolUses.length)
      return texts.map(block => block.text).join('\n');

    const toolResults: Anthropic.Messages.ToolResultBlockParam[] = [];
    for (let index = 0; index < toolUses.length; ++index) {
      const toolUse = toolUses[index];
      // The "done" tool carries the final structured answer.
      if (toolUse.name === 'done')
        return JSON.stringify(toolUse.input, null, 2);
      try {
        debug('tool')(toolUse.name, toolUse.input);
        const result = await client.callTool({
          name: toolUse.name,
          arguments: toolUse.input as any,
        });
        const parts = (result.content || []) as (TextContent | ImageContent)[];
        debug('tool')(parts);
        const text = parts.filter(part => part.type === 'text').map(part => part.text).join('\n');
        toolResults.push({
          type: 'tool_result',
          tool_use_id: toolUse.id,
          content: text,
        });
      } catch (error) {
        debug('tool')(error);
        toolResults.push({
          type: 'tool_result',
          tool_use_id: toolUse.id,
          content: `Error while executing tool "${toolUse.name}": ${error instanceof Error ? error.message : String(error)}\n\nPlease try to recover and complete the task.`,
          is_error: true,
        });
        // Every tool call after the failing one is reported as skipped so the
        // model sees a result for each requested call.
        for (const skipped of toolUses.slice(index + 1)) {
          toolResults.push({
            type: 'tool_result',
            tool_use_id: skipped.id,
            content: `This tool call is skipped due to previous error.`,
            is_error: true,
          });
        }
        break;
      }
    }

    // Tool results go back to the model as a user turn.
    history.push({
      role: 'user',
      content: toolResults
    });
  }
  throw new Error('Failed to perform step, max attempts reached');
}
// Converts an MCP tool descriptor into Anthropic's tool declaration shape
// (only the schema field name differs: inputSchema -> input_schema).
function asClaudeDeclaration(tool: Tool): Anthropic.Messages.Tool {
  const { name, description, inputSchema } = tool;
  return { name, description, input_schema: inputSchema };
}

105
src/eval/loopOpenAI.ts Normal file
View File

@@ -0,0 +1,105 @@
/**
* Copyright (c) Microsoft Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import OpenAI from 'openai';
import debug from 'debug';
import type { Tool, ImageContent, TextContent } from '@modelcontextprotocol/sdk/types.js';
import type { Client } from '@modelcontextprotocol/sdk/client/index.js';
// OpenAI model identifier used for every completion request in this loop.
const model = 'gpt-4.1';
/**
 * Drives an OpenAI function-calling agent loop against an MCP client.
 *
 * The model is prompted with the task and offered the MCP tools. The loop
 * ends when the model replies without tool calls (its text is returned),
 * when it invokes the "done" tool (its arguments are returned as
 * pretty-printed JSON), or after 5 iterations (throws).
 *
 * Fixes over the previous version:
 * - "Peform" typo in the task prompt.
 * - The final text answer was passed through JSON.stringify, wrapping it in
 *   JSON quotes; it is now returned as-is.
 * - The "done" tool's `arguments` field is already a JSON-encoded string
 *   (per the chat-completions API); stringifying it again double-encoded
 *   the payload. It is now parsed first, then pretty-printed.
 *
 * @param client connected MCP client whose tools are exposed to the model
 * @param task natural-language task description
 * @returns the model's final answer, or the "done" tool payload as JSON
 * @throws Error when the iteration budget is exhausted
 */
export async function runTask(client: Client, task: string): Promise<string | undefined> {
  const openai = new OpenAI();
  const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [
    {
      role: 'user',
      content: `Perform following task: ${task}. Once the task is complete, call the "done" tool.`
    }
  ];
  const { tools } = await client.listTools();

  for (let iteration = 0; iteration < 5; ++iteration) {
    debug('history')(messages);
    const response = await openai.chat.completions.create({
      model,
      messages,
      tools: tools.map(tool => asOpenAIDeclaration(tool)),
      tool_choice: 'auto'
    });
    const message = response.choices[0].message;

    // No tool calls: the model considers the task finished — return its text.
    if (!message.tool_calls?.length)
      return message.content ?? undefined;

    messages.push({
      role: 'assistant',
      tool_calls: message.tool_calls
    });

    for (const toolCall of message.tool_calls) {
      const functionCall = toolCall.function;
      if (functionCall.name === 'done') {
        // `arguments` is JSON text; parse before re-encoding so the result is
        // pretty-printed JSON rather than a double-escaped string literal.
        try {
          return JSON.stringify(JSON.parse(functionCall.arguments), null, 2);
        } catch {
          return functionCall.arguments;
        }
      }
      try {
        debug('tool')(functionCall.name, functionCall.arguments);
        const response = await client.callTool({
          name: functionCall.name,
          arguments: JSON.parse(functionCall.arguments)
        });
        const content = (response.content || []) as (TextContent | ImageContent)[];
        debug('tool')(content);
        const text = content.filter(part => part.type === 'text').map(part => part.text).join('\n');
        messages.push({
          role: 'tool',
          tool_call_id: toolCall.id,
          content: text,
        });
      } catch (error) {
        debug('tool')(error);
        messages.push({
          role: 'tool',
          tool_call_id: toolCall.id,
          content: `Error while executing tool "${functionCall.name}": ${error instanceof Error ? error.message : String(error)}\n\nPlease try to recover and complete the task.`,
        });
        // Every tool call after the failing one is reported as skipped so the
        // API sees a tool message for each requested call.
        for (const ignoredToolCall of message.tool_calls.slice(message.tool_calls.indexOf(toolCall) + 1)) {
          messages.push({
            role: 'tool',
            tool_call_id: ignoredToolCall.id,
            content: `This tool call is skipped due to previous error.`,
          });
        }
        break;
      }
    }
  }
  throw new Error('Failed to perform step, max attempts reached');
}
// Converts an MCP tool descriptor into OpenAI's function-tool declaration
// shape (the schema is nested under `function.parameters`).
function asOpenAIDeclaration(tool: Tool): OpenAI.Chat.Completions.ChatCompletionTool {
  const { name, description, inputSchema } = tool;
  return {
    type: 'function',
    function: { name, description, parameters: inputSchema },
  };
}

68
src/eval/main.ts Normal file
View File

@@ -0,0 +1,68 @@
/**
* Copyright (c) Microsoft Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* eslint-disable no-console */
import path from 'path';
import url from 'url';
import dotenv from 'dotenv';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { program } from 'commander';
import { runTask as runTaskOpenAI } from './loopOpenAI.js';
import { runTask as runTaskClaude } from './loopClaude.js';
// Load API credentials (e.g. OPENAI_API_KEY / ANTHROPIC_API_KEY) from a local .env file.
dotenv.config();
// Absolute path of this module (ESM has no __filename global); used to resolve sibling files.
const __filename = url.fileURLToPath(import.meta.url);
/**
 * Spawns the Playwright MCP server over stdio, runs every task in `tasks`
 * sequentially with the given agent loop, and prints the last task's result.
 *
 * Fix: the client is now closed in a `finally` block — previously a throwing
 * task skipped `client.close()`, leaving the child process and its stdio
 * transport open so the eval script would hang.
 *
 * @param runTask agent loop implementation (OpenAI or Claude variant)
 */
async function run(runTask: (client: Client, task: string) => Promise<string | undefined>) {
  const transport = new StdioClientTransport({
    command: 'node',
    args: [
      path.resolve(__filename, '../../../cli.js'),
      '--save-session',
      '--output-dir', path.resolve(__filename, '../../../sessions')
    ],
    stderr: 'inherit',
    env: process.env as Record<string, string>,
  });
  const client = new Client({ name: 'test', version: '1.0.0' });
  await client.connect(transport);
  try {
    await client.ping();
    let lastResult: string | undefined;
    for (const task of tasks)
      lastResult = await runTask(client, task);
    console.log(lastResult);
  } finally {
    // Always tear down the MCP child process, even when a task throws.
    await client.close();
  }
}
// Tasks executed sequentially by run(); only the last task's result is printed.
const tasks = [
'Open https://playwright.dev/',
];
// CLI entry point: `--model claude` selects the Anthropic loop,
// anything else (including no flag) selects the OpenAI loop.
program
    .option('--model <model>', 'model to use')
    .action(async options => {
      const loop = options.model === 'claude' ? runTaskClaude : runTaskOpenAI;
      await run(loop);
    });

void program.parseAsync(process.argv);

View File

@@ -14,9 +14,9 @@
* limitations under the License.
*/
import fs from 'node:fs';
import url from 'node:url';
import path from 'node:path';
import fs from 'fs';
import path from 'path';
import url from 'url';
// Absolute path of this module (ESM has no __filename global).
const __filename = url.fileURLToPath(import.meta.url);
// Parsed contents of package.json from the directory above this module.
export const packageJSON = JSON.parse(fs.readFileSync(path.join(path.dirname(__filename), '..', 'package.json'), 'utf8'));