claude-task-master/tests/e2e/utils/llm-analyzer.js

import { readFileSync } from 'fs';
import fetch from 'node-fetch';
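/**
 * Sends a completed E2E test log to the Anthropic Messages API and asks the
 * model to verify the test steps, compare per-provider add-task results, and
 * flag errors, returning the result as structured JSON.
 */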
export class LLMAnalyzer {
  constructor(config, logger) {
    this.config = config;
    this.logger = logger;
    this.apiKey = process.env.ANTHROPIC_API_KEY;
    this.apiEndpoint = 'https://api.anthropic.com/v1/messages';
  }
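  /**
   * Analyze an E2E log file (plus an optional provider summary file) with the
   * configured model. Returns the parsed analysis object, or null when
   * analysis is disabled, the API key is missing, or the call/parsing fails.
   */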
  async analyzeLog(logFile, providerSummaryFile = null) {
    if (!this.config.llmAnalysis.enabled) {
      this.logger.info('LLM analysis is disabled in configuration');
      return null;
    }
    if (!this.apiKey) {
      this.logger.error('ANTHROPIC_API_KEY not found in environment');
      return null;
    }
    try {
      const logContent = readFileSync(logFile, 'utf8');
      const prompt = this.buildAnalysisPrompt(logContent, providerSummaryFile);
      const response = await this.callLLM(prompt);
      const analysis = this.parseResponse(response);
      // Calculate and log cost
      if (response.usage) {
        const cost = this.calculateCost(response.usage);
        this.logger.addCost(cost);
        this.logger.info(`LLM Analysis AI Cost: $${cost.toFixed(6)} USD`);
      }
      return analysis;
    } catch (error) {
      this.logger.error(`LLM analysis failed: ${error.message}`);
      return null;
    }
  }
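  /**
   * Build the analysis prompt from the main log content and, when available,
   * the provider add-task summary. The prompt instructs the model to reply
   * with JSON only, in the schema embedded below.
   */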
  buildAnalysisPrompt(logContent, providerSummaryFile) {
    let providerSummary = '';
    if (providerSummaryFile) {
      try {
        providerSummary = readFileSync(providerSummaryFile, 'utf8');
      } catch (error) {
        this.logger.warning(
          `Could not read provider summary file: ${error.message}`
        );
      }
    }
    return `Analyze the following E2E test log for the task-master tool. The log contains output from various 'task-master' commands executed sequentially.
Your goal is to:
1. Verify if the key E2E steps completed successfully based on the log messages (e.g., init, parse PRD, list tasks, analyze complexity, expand task, set status, manage models, add/remove dependencies, add/update/remove tasks/subtasks, generate files).
2. **Specifically analyze the Multi-Provider Add-Task Test Sequence:**
a. Identify which providers were tested for \`add-task\`. Look for log steps like "Testing Add-Task with Provider: ..." and the summary log 'provider_add_task_summary.log'.
b. For each tested provider, determine if \`add-task\` succeeded or failed. Note the created task ID if successful.
c. Review the corresponding \`add_task_show_output_<provider>_id_<id>.log\` file (if created) for each successful \`add-task\` execution.
d. **Compare the quality and completeness** of the task generated by each successful provider based on their \`show\` output. Assign a score (e.g., 1-10, 10 being best) based on relevance to the prompt, detail level, and correctness.
e. Note any providers where \`add-task\` failed or where the task ID could not be extracted.
3. Identify any general explicit "[ERROR]" messages or stack traces throughout the *entire* log.
4. Identify any potential warnings or unusual output that might indicate a problem even if not marked as an explicit error.
5. Provide an overall assessment of the test run's health based *only* on the log content.
${providerSummary ? `\nProvider Summary:\n${providerSummary}\n` : ''}
Return your analysis **strictly** in the following JSON format. Do not include any text outside of the JSON structure:
{
"overall_status": "Success|Failure|Warning",
"verified_steps": [ "Initialization", "PRD Parsing", /* ...other general steps observed... */ ],
"provider_add_task_comparison": {
"prompt_used": "... (extract from log if possible or state 'standard auth prompt') ...",
"provider_results": {
"anthropic": { "status": "Success|Failure|ID_Extraction_Failed|Set_Model_Failed", "task_id": "...", "score": "X/10 | N/A", "notes": "..." },
"openai": { "status": "Success|Failure|...", "task_id": "...", "score": "X/10 | N/A", "notes": "..." },
/* ... include all tested providers ... */
},
"comparison_summary": "Brief overall comparison of generated tasks..."
},
"detected_issues": [ { "severity": "Error|Warning|Anomaly", "description": "...", "log_context": "[Optional, short snippet from log near the issue]" } ],
"llm_summary_points": [ "Overall summary point 1", "Provider comparison highlight", "Any major issues noted" ]
}
Here is the main log content:
${logContent}`;
  }
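  /**
   * POST the prompt to the Anthropic Messages API and return the parsed JSON
   * response body. Throws on any non-2xx status.
   */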
  async callLLM(prompt) {
    const payload = {
      model: this.config.llmAnalysis.model,
      max_tokens: this.config.llmAnalysis.maxTokens,
      messages: [{ role: 'user', content: prompt }]
    };
    const response = await fetch(this.apiEndpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': this.apiKey,
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify(payload)
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`LLM API call failed: ${response.status} - ${error}`);
    }
    return response.json();
  }
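  /**
   * Extract the JSON object from the model's text reply. The model is asked
   * to return JSON only, but this tolerates surrounding text by slicing from
   * the first '{' to the last '}'. Returns null if parsing fails.
   */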
  parseResponse(response) {
    try {
      const content = response.content[0].text;
      const jsonStart = content.indexOf('{');
      const jsonEnd = content.lastIndexOf('}');
      if (jsonStart === -1 || jsonEnd === -1) {
        throw new Error('No JSON found in response');
      }
      const jsonString = content.substring(jsonStart, jsonEnd + 1);
      return JSON.parse(jsonString);
    } catch (error) {
      this.logger.error(`Failed to parse LLM response: ${error.message}`);
      return null;
    }
  }
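  /**
   * Estimate the cost of a call in USD from token usage. Only models listed
   * in the pricing table below are priced; unknown models cost $0.
   */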
  calculateCost(usage) {
    const modelCosts = {
      'claude-3-7-sonnet-20250219': {
        input: 3.0, // per 1M tokens
        output: 15.0 // per 1M tokens
      }
    };
    const costs = modelCosts[this.config.llmAnalysis.model] || {
      input: 0,
      output: 0
    };
    const inputCost = (usage.input_tokens / 1000000) * costs.input;
    const outputCost = (usage.output_tokens / 1000000) * costs.output;
    return inputCost + outputCost;
  }
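  /**
   * Wrap a parsed analysis in a report object with a title and timestamp.
   * Returns null when there is no analysis to report.
   */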
  formatReport(analysis) {
    if (!analysis) return null;
    const report = {
      title: 'TASKMASTER E2E Test Analysis Report',
      timestamp: new Date().toISOString(),
      status: analysis.overall_status,
      summary: analysis.llm_summary_points,
      verifiedSteps: analysis.verified_steps,
      providerComparison: analysis.provider_add_task_comparison,
      issues: analysis.detected_issues
    };
    return report;
  }
}
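
// Example usage (illustrative sketch, not part of the module): the config
// shape and logger methods below are assumptions inferred from how this class
// uses them (config.llmAnalysis.{enabled, model, maxTokens}; a logger with
// info/warning/error/addCost), and the log file paths are hypothetical.
//
//   const analyzer = new LLMAnalyzer(
//     {
//       llmAnalysis: {
//         enabled: true,
//         model: 'claude-3-7-sonnet-20250219',
//         maxTokens: 4096
//       }
//     },
//     logger
//   );
//   const analysis = await analyzer.analyzeLog(
//     'e2e_run.log',
//     'provider_add_task_summary.log'
//   );
//   if (analysis) {
//     console.log(JSON.stringify(analyzer.formatReport(analysis), null, 2));
//   }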