chore: add eval script (#743)

This commit is contained in:
Pavel Feldman
2025-07-23 10:31:37 -07:00
committed by GitHub
parent 288f1b863b
commit 2c5eac89a8
8 changed files with 405 additions and 59 deletions

View File

@@ -17,7 +17,6 @@
import { Server as McpServer } from '@modelcontextprotocol/sdk/server/index.js';
import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from '@modelcontextprotocol/sdk/types.js';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { Context } from './context.js';
import { Response } from './response.js';
import { allTools } from './tools.js';

119
src/eval/loopClaude.ts Normal file
View File

@@ -0,0 +1,119 @@
/**
* Copyright (c) Microsoft Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import Anthropic from '@anthropic-ai/sdk';
import debug from 'debug';
import type { Tool, ImageContent, TextContent } from '@modelcontextprotocol/sdk/types.js';
import type { Client } from '@modelcontextprotocol/sdk/client/index.js';
// Anthropic model identifier used for every completion request in this loop.
const model = 'claude-sonnet-4-20250514';
/**
 * Drives a Claude tool-use agent loop against an MCP client.
 *
 * The model is prompted with the task and repeatedly offered the MCP tools.
 * The loop ends when the model replies with no tool calls (its text is
 * returned), when it invokes the "done" tool (its input is returned as
 * pretty-printed JSON), or after 5 iterations (throws).
 *
 * @param client connected MCP client whose tools are exposed to the model
 * @param task natural-language task description
 * @returns the model's final answer, or the "done" tool payload as JSON
 * @throws Error when the iteration budget is exhausted
 */
export async function runTask(client: Client, task: string): Promise<string | undefined> {
  const anthropic = new Anthropic();
  const history: Anthropic.Messages.MessageParam[] = [];
  const { tools } = await client.listTools();
  const claudeTools = tools.map(tool => asClaudeDeclaration(tool));

  // Seed the conversation with the task prompt.
  history.push({
    role: 'user',
    content: `Perform following task: ${task}.`
  });

  for (let attempt = 0; attempt < 5; ++attempt) {
    debug('history')(history);
    const response = await anthropic.messages.create({
      model,
      max_tokens: 10000,
      messages: history,
      tools: claudeTools,
    });

    const blocks = response.content;
    const toolUses = blocks.filter(block => block.type === 'tool_use');
    const texts = blocks.filter(block => block.type === 'text');

    // Record the assistant turn before deciding how to proceed.
    history.push({
      role: 'assistant',
      content: blocks
    });

    // No tool requests: the model considers the task finished — return its prose.
    if (!toolUses.length)
      return texts.map(block => block.text).join('\n');

    const toolResults: Anthropic.Messages.ToolResultBlockParam[] = [];
    for (let index = 0; index < toolUses.length; ++index) {
      const toolUse = toolUses[index];
      // The "done" tool carries the final structured answer.
      if (toolUse.name === 'done')
        return JSON.stringify(toolUse.input, null, 2);
      try {
        debug('tool')(toolUse.name, toolUse.input);
        const result = await client.callTool({
          name: toolUse.name,
          arguments: toolUse.input as any,
        });
        const parts = (result.content || []) as (TextContent | ImageContent)[];
        debug('tool')(parts);
        const text = parts.filter(part => part.type === 'text').map(part => part.text).join('\n');
        toolResults.push({
          type: 'tool_result',
          tool_use_id: toolUse.id,
          content: text,
        });
      } catch (error) {
        debug('tool')(error);
        toolResults.push({
          type: 'tool_result',
          tool_use_id: toolUse.id,
          content: `Error while executing tool "${toolUse.name}": ${error instanceof Error ? error.message : String(error)}\n\nPlease try to recover and complete the task.`,
          is_error: true,
        });
        // Every tool call after the failing one is reported as skipped so the
        // model sees a result for each requested call.
        for (const skipped of toolUses.slice(index + 1)) {
          toolResults.push({
            type: 'tool_result',
            tool_use_id: skipped.id,
            content: `This tool call is skipped due to previous error.`,
            is_error: true,
          });
        }
        break;
      }
    }

    // Tool results go back to the model as a user turn.
    history.push({
      role: 'user',
      content: toolResults
    });
  }
  throw new Error('Failed to perform step, max attempts reached');
}
// Converts an MCP tool descriptor into Anthropic's tool declaration shape
// (only the schema field name differs: inputSchema -> input_schema).
function asClaudeDeclaration(tool: Tool): Anthropic.Messages.Tool {
  const { name, description, inputSchema } = tool;
  return { name, description, input_schema: inputSchema };
}

105
src/eval/loopOpenAI.ts Normal file
View File

@@ -0,0 +1,105 @@
/**
* Copyright (c) Microsoft Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import OpenAI from 'openai';
import debug from 'debug';
import type { Tool, ImageContent, TextContent } from '@modelcontextprotocol/sdk/types.js';
import type { Client } from '@modelcontextprotocol/sdk/client/index.js';
// OpenAI model identifier used for every completion request in this loop.
const model = 'gpt-4.1';
/**
 * Drives an OpenAI function-calling agent loop against an MCP client.
 *
 * The model is prompted with the task and offered the MCP tools. The loop
 * ends when the model replies without tool calls (its text is returned),
 * when it invokes the "done" tool (its arguments are returned as
 * pretty-printed JSON), or after 5 iterations (throws).
 *
 * Fixes over the previous version:
 * - "Peform" typo in the task prompt.
 * - The final text answer was passed through JSON.stringify, wrapping it in
 *   JSON quotes; it is now returned as-is.
 * - The "done" tool's `arguments` field is already a JSON-encoded string
 *   (per the chat-completions API); stringifying it again double-encoded
 *   the payload. It is now parsed first, then pretty-printed.
 *
 * @param client connected MCP client whose tools are exposed to the model
 * @param task natural-language task description
 * @returns the model's final answer, or the "done" tool payload as JSON
 * @throws Error when the iteration budget is exhausted
 */
export async function runTask(client: Client, task: string): Promise<string | undefined> {
  const openai = new OpenAI();
  const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [
    {
      role: 'user',
      content: `Perform following task: ${task}. Once the task is complete, call the "done" tool.`
    }
  ];
  const { tools } = await client.listTools();

  for (let iteration = 0; iteration < 5; ++iteration) {
    debug('history')(messages);
    const response = await openai.chat.completions.create({
      model,
      messages,
      tools: tools.map(tool => asOpenAIDeclaration(tool)),
      tool_choice: 'auto'
    });
    const message = response.choices[0].message;

    // No tool calls: the model considers the task finished — return its text.
    if (!message.tool_calls?.length)
      return message.content ?? undefined;

    messages.push({
      role: 'assistant',
      tool_calls: message.tool_calls
    });

    for (const toolCall of message.tool_calls) {
      const functionCall = toolCall.function;
      if (functionCall.name === 'done') {
        // `arguments` is JSON text; parse before re-encoding so the result is
        // pretty-printed JSON rather than a double-escaped string literal.
        try {
          return JSON.stringify(JSON.parse(functionCall.arguments), null, 2);
        } catch {
          return functionCall.arguments;
        }
      }
      try {
        debug('tool')(functionCall.name, functionCall.arguments);
        const response = await client.callTool({
          name: functionCall.name,
          arguments: JSON.parse(functionCall.arguments)
        });
        const content = (response.content || []) as (TextContent | ImageContent)[];
        debug('tool')(content);
        const text = content.filter(part => part.type === 'text').map(part => part.text).join('\n');
        messages.push({
          role: 'tool',
          tool_call_id: toolCall.id,
          content: text,
        });
      } catch (error) {
        debug('tool')(error);
        messages.push({
          role: 'tool',
          tool_call_id: toolCall.id,
          content: `Error while executing tool "${functionCall.name}": ${error instanceof Error ? error.message : String(error)}\n\nPlease try to recover and complete the task.`,
        });
        // Every tool call after the failing one is reported as skipped so the
        // API sees a tool message for each requested call.
        for (const ignoredToolCall of message.tool_calls.slice(message.tool_calls.indexOf(toolCall) + 1)) {
          messages.push({
            role: 'tool',
            tool_call_id: ignoredToolCall.id,
            content: `This tool call is skipped due to previous error.`,
          });
        }
        break;
      }
    }
  }
  throw new Error('Failed to perform step, max attempts reached');
}
// Converts an MCP tool descriptor into OpenAI's function-tool declaration
// shape (the schema is nested under `function.parameters`).
function asOpenAIDeclaration(tool: Tool): OpenAI.Chat.Completions.ChatCompletionTool {
  const { name, description, inputSchema } = tool;
  return {
    type: 'function',
    function: { name, description, parameters: inputSchema },
  };
}

68
src/eval/main.ts Normal file
View File

@@ -0,0 +1,68 @@
/**
* Copyright (c) Microsoft Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* eslint-disable no-console */
import path from 'path';
import url from 'url';
import dotenv from 'dotenv';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { program } from 'commander';
import { runTask as runTaskOpenAI } from './loopOpenAI.js';
import { runTask as runTaskClaude } from './loopClaude.js';
// Load API credentials (e.g. OPENAI_API_KEY / ANTHROPIC_API_KEY) from a local .env file.
dotenv.config();
// Absolute path of this module (ESM has no __filename global); used to resolve sibling files.
const __filename = url.fileURLToPath(import.meta.url);
/**
 * Spawns the Playwright MCP server over stdio, runs every task in `tasks`
 * sequentially with the given agent loop, and prints the last task's result.
 *
 * Fix: the client is now closed in a `finally` block — previously a throwing
 * task skipped `client.close()`, leaving the child process and its stdio
 * transport open so the eval script would hang.
 *
 * @param runTask agent loop implementation (OpenAI or Claude variant)
 */
async function run(runTask: (client: Client, task: string) => Promise<string | undefined>) {
  const transport = new StdioClientTransport({
    command: 'node',
    args: [
      path.resolve(__filename, '../../../cli.js'),
      '--save-session',
      '--output-dir', path.resolve(__filename, '../../../sessions')
    ],
    stderr: 'inherit',
    env: process.env as Record<string, string>,
  });
  const client = new Client({ name: 'test', version: '1.0.0' });
  await client.connect(transport);
  try {
    await client.ping();
    let lastResult: string | undefined;
    for (const task of tasks)
      lastResult = await runTask(client, task);
    console.log(lastResult);
  } finally {
    // Always tear down the MCP child process, even when a task throws.
    await client.close();
  }
}
// Tasks executed sequentially by run(); only the last task's result is printed.
const tasks = [
'Open https://playwright.dev/',
];
// CLI entry point: `--model claude` selects the Anthropic loop,
// anything else (including no flag) selects the OpenAI loop.
program
    .option('--model <model>', 'model to use')
    .action(async options => {
      const loop = options.model === 'claude' ? runTaskClaude : runTaskOpenAI;
      await run(loop);
    });

void program.parseAsync(process.argv);

View File

@@ -14,9 +14,9 @@
* limitations under the License.
*/
import fs from 'node:fs';
import url from 'node:url';
import path from 'node:path';
import fs from 'fs';
import path from 'path';
import url from 'url';
// Absolute path of this module (ESM has no __filename global).
const __filename = url.fileURLToPath(import.meta.url);
// Parsed contents of package.json from the directory above this module.
export const packageJSON = JSON.parse(fs.readFileSync(path.join(path.dirname(__filename), '..', 'package.json'), 'utf8'));