claude-task-master/tests/e2e/utils/llm-analyzer.js

import { readFileSync } from 'fs';
import fetch from 'node-fetch';
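/**
 * Sends a completed E2E test log to the Anthropic Messages API and asks the
 * model to verify the test steps, compare per-provider add-task results, and
 * flag errors, returning the result as structured JSON.
 */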
export class LLMAnalyzer {
  constructor(config, logger) {
    this.config = config;
    this.logger = logger;
    this.apiKey = process.env.ANTHROPIC_API_KEY;
    this.apiEndpoint = 'https://api.anthropic.com/v1/messages';
  }
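  /**
   * Analyze an E2E log file (plus an optional provider summary file) with the
   * configured model. Returns the parsed analysis object, or null when
   * analysis is disabled, the API key is missing, or the call/parsing fails.
   */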
  async analyzeLog(logFile, providerSummaryFile = null) {
    if (!this.config.llmAnalysis.enabled) {
      this.logger.info('LLM analysis is disabled in configuration');
      return null;
    }
    if (!this.apiKey) {
      this.logger.error('ANTHROPIC_API_KEY not found in environment');
      return null;
    }
    try {
      const logContent = readFileSync(logFile, 'utf8');
      const prompt = this.buildAnalysisPrompt(logContent, providerSummaryFile);
      const response = await this.callLLM(prompt);
      const analysis = this.parseResponse(response);
      // Calculate and log cost
      if (response.usage) {
        const cost = this.calculateCost(response.usage);
        this.logger.addCost(cost);
        this.logger.info(`LLM Analysis AI Cost: $${cost.toFixed(6)} USD`);
      }
      return analysis;
    } catch (error) {
      this.logger.error(`LLM analysis failed: ${error.message}`);
      return null;
    }
  }
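  /**
   * Build the analysis prompt from the main log content and, when available,
   * the provider add-task summary. The prompt instructs the model to reply
   * with JSON only, in the schema embedded below.
   */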
  buildAnalysisPrompt(logContent, providerSummaryFile) {
    let providerSummary = '';
    if (providerSummaryFile) {
      try {
        providerSummary = readFileSync(providerSummaryFile, 'utf8');
      } catch (error) {
        this.logger.warning(
          `Could not read provider summary file: ${error.message}`
        );
      }
    }
    return `Analyze the following E2E test log for the task-master tool. The log contains output from various 'task-master' commands executed sequentially.
Your goal is to:
1. Verify if the key E2E steps completed successfully based on the log messages (e.g., init, parse PRD, list tasks, analyze complexity, expand task, set status, manage models, add/remove dependencies, add/update/remove tasks/subtasks, generate files).
2. **Specifically analyze the Multi-Provider Add-Task Test Sequence:**
a. Identify which providers were tested for \`add-task\`. Look for log steps like "Testing Add-Task with Provider: ..." and the summary log 'provider_add_task_summary.log'.
b. For each tested provider, determine if \`add-task\` succeeded or failed. Note the created task ID if successful.
c. Review the corresponding \`add_task_show_output_<provider>_id_<id>.log\` file (if created) for each successful \`add-task\` execution.
d. **Compare the quality and completeness** of the task generated by each successful provider based on their \`show\` output. Assign a score (e.g., 1-10, 10 being best) based on relevance to the prompt, detail level, and correctness.
e. Note any providers where \`add-task\` failed or where the task ID could not be extracted.
3. Identify any general explicit "[ERROR]" messages or stack traces throughout the *entire* log.
4. Identify any potential warnings or unusual output that might indicate a problem even if not marked as an explicit error.
5. Provide an overall assessment of the test run's health based *only* on the log content.
${providerSummary ? `\nProvider Summary:\n${providerSummary}\n` : ''}
Return your analysis **strictly** in the following JSON format. Do not include any text outside of the JSON structure:
{
"overall_status": "Success|Failure|Warning",
"verified_steps": [ "Initialization", "PRD Parsing", /* ...other general steps observed... */ ],
"provider_add_task_comparison": {
"prompt_used": "... (extract from log if possible or state 'standard auth prompt') ...",
"provider_results": {
"anthropic": { "status": "Success|Failure|ID_Extraction_Failed|Set_Model_Failed", "task_id": "...", "score": "X/10 | N/A", "notes": "..." },
"openai": { "status": "Success|Failure|...", "task_id": "...", "score": "X/10 | N/A", "notes": "..." },
/* ... include all tested providers ... */
},
"comparison_summary": "Brief overall comparison of generated tasks..."
},
"detected_issues": [ { "severity": "Error|Warning|Anomaly", "description": "...", "log_context": "[Optional, short snippet from log near the issue]" } ],
"llm_summary_points": [ "Overall summary point 1", "Provider comparison highlight", "Any major issues noted" ]
}
Here is the main log content:
${logContent}`;
  }
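  /**
   * POST the prompt to the Anthropic Messages API and return the parsed JSON
   * response body. Throws on any non-2xx status.
   */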
  async callLLM(prompt) {
    const payload = {
      model: this.config.llmAnalysis.model,
      max_tokens: this.config.llmAnalysis.maxTokens,
      messages: [{ role: 'user', content: prompt }]
    };
    const response = await fetch(this.apiEndpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': this.apiKey,
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify(payload)
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`LLM API call failed: ${response.status} - ${error}`);
    }
    return response.json();
  }
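  /**
   * Extract the JSON object from the model's text reply. The model is asked
   * to return JSON only, but this tolerates surrounding text by slicing from
   * the first '{' to the last '}'. Returns null if parsing fails.
   */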
  parseResponse(response) {
    try {
      const content = response.content[0].text;
      const jsonStart = content.indexOf('{');
      const jsonEnd = content.lastIndexOf('}');
      if (jsonStart === -1 || jsonEnd === -1) {
        throw new Error('No JSON found in response');
      }
      const jsonString = content.substring(jsonStart, jsonEnd + 1);
      return JSON.parse(jsonString);
    } catch (error) {
      this.logger.error(`Failed to parse LLM response: ${error.message}`);
      return null;
    }
  }
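  /**
   * Estimate the cost of a call in USD from token usage. Only models listed
   * in the pricing table below are priced; unknown models cost $0.
   */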
  calculateCost(usage) {
    const modelCosts = {
      'claude-3-7-sonnet-20250219': {
        input: 3.0, // per 1M tokens
        output: 15.0 // per 1M tokens
      }
    };
    const costs = modelCosts[this.config.llmAnalysis.model] || {
      input: 0,
      output: 0
    };
    const inputCost = (usage.input_tokens / 1000000) * costs.input;
    const outputCost = (usage.output_tokens / 1000000) * costs.output;
    return inputCost + outputCost;
  }
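  /**
   * Wrap a parsed analysis in a report object with a title and timestamp.
   * Returns null when there is no analysis to report.
   */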
  formatReport(analysis) {
    if (!analysis) return null;
    const report = {
      title: 'TASKMASTER E2E Test Analysis Report',
      timestamp: new Date().toISOString(),
      status: analysis.overall_status,
      summary: analysis.llm_summary_points,
      verifiedSteps: analysis.verified_steps,
      providerComparison: analysis.provider_add_task_comparison,
      issues: analysis.detected_issues
    };
    return report;
  }
}
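
// Example usage (illustrative sketch, not part of the module): the config
// shape and logger methods below are assumptions inferred from how this class
// uses them (config.llmAnalysis.{enabled, model, maxTokens}; a logger with
// info/warning/error/addCost), and the log file paths are hypothetical.
//
//   const analyzer = new LLMAnalyzer(
//     {
//       llmAnalysis: {
//         enabled: true,
//         model: 'claude-3-7-sonnet-20250219',
//         maxTokens: 4096
//       }
//     },
//     logger
//   );
//   const analysis = await analyzer.analyzeLog(
//     'e2e_run.log',
//     'provider_add_task_summary.log'
//   );
//   if (analysis) {
//     console.log(JSON.stringify(analyzer.formatReport(analysis), null, 2));
//   }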