Refactor: Improve MCP logging, update E2E & tests

Refactors MCP server logging and updates testing infrastructure. - MCP Server: - Replaced manual logger wrappers with centralized `createLogWrapper` utility. - Updated direct function calls to use `{ session, mcpLog }` context. - Removed deprecated `model` parameter from analyze, expand-all, expand-task tools. - Adjusted MCP tool import paths and parameter descriptions. - Documentation: - Modified `docs/configuration.md`. - Modified `docs/tutorial.md`. - Testing: - E2E Script (`run_e2e.sh`): - Removed `set -e`. - Added LLM analysis function (`analyze_log_with_llm`) & integration. - Adjusted test run directory creation timing. - Added debug echo statements. - Deleted Unit Tests: Removed `ai-client-factory.test.js`, `ai-client-utils.test.js`, `ai-services.test.js`. - Modified Fixtures: Updated `scripts/task-complexity-report.json`. - Dev Scripts: - Modified `scripts/dev.js`.
2025-04-28 14:38:01 -04:00
parent 5f504fafb8
commit 4cf7e8a74a
37 changed files with 687 additions and 1736 deletions
--- a/tests/e2e/run_e2e.sh
+++ b/tests/e2e/run_e2e.sh
@@ -1,7 +1,5 @@
 #!/bin/bash

-# Exit immediately if a command exits with a non-zero status.
-set -e
 # Treat unset variables as an error when substituting.
 set -u
 # Prevent errors in pipelines from being masked.
@@ -33,6 +31,11 @@ mkdir -p "$LOG_DIR"
 TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
 LOG_FILE="$LOG_DIR/e2e_run_$TIMESTAMP.log"

+# Define and create the test run directory *before* the main pipe
+mkdir -p "$BASE_TEST_DIR" # Ensure base exists first
+TEST_RUN_DIR="$BASE_TEST_DIR/run_$TIMESTAMP"
+mkdir -p "$TEST_RUN_DIR"
+
 # Echo starting message to the original terminal BEFORE the main piped block
 echo "Starting E2E test. Output will be shown here and saved to: $LOG_FILE"
 echo "Running from directory: $(pwd)"
@@ -82,6 +85,125 @@ overall_start_time=$(date +%s)
    echo "  STEP ${test_step_count}: [$(_get_elapsed_time_for_log)] $(date +"%Y-%m-%d %H:%M:%S") $1"
    echo "============================================="
  }
+
+  analyze_log_with_llm() {
+    local log_file="$1"
+    local provider_summary_log="provider_add_task_summary.log" # File summarizing provider test outcomes
+    local api_key=""
+    local api_endpoint="https://api.anthropic.com/v1/messages"
+    local api_key_name="CLAUDE_API_KEY"
+
+    echo "" # Add a newline before analysis starts
+    log_info "Attempting LLM analysis of log: $log_file"
+
+    # Check for jq and curl
+    if ! command -v jq &> /dev/null; then
+      log_error "LLM Analysis requires 'jq'. Skipping analysis."
+      return 1
+    fi
+    if ! command -v curl &> /dev/null; then
+      log_error "LLM Analysis requires 'curl'. Skipping analysis."
+      return 1
+    fi
+
+    # Check for API Key in the TEST_RUN_DIR/.env (copied earlier)
+    if [ -f ".env" ]; then
+      # Using grep and sed for better handling of potential quotes/spaces
+      api_key=$(grep "^${api_key_name}=" .env | sed -e "s/^${api_key_name}=//" -e 's/^[[:space:]"]*//' -e 's/[[:space:]"]*$//')
+    fi
+
+    if [ -z "$api_key" ]; then
+      log_error "${api_key_name} not found or empty in .env file in the test run directory ($(pwd)/.env). Skipping LLM analysis."
+      return 1
+    fi
+
+    if [ ! -f "$log_file" ]; then
+      log_error "Log file not found: $log_file. Skipping LLM analysis."
+      return 1
+    fi
+
+    log_info "Reading log file content..."
+    local log_content
+    # Read entire file, handle potential errors
+    log_content=$(cat "$log_file") || {
+      log_error "Failed to read log file: $log_file. Skipping LLM analysis."
+      return 1
+    }
+
+    # Prepare the prompt
+    # Using printf with %s for the log content is generally safer than direct variable expansion
+    local prompt_template='Analyze the following E2E test log for the task-master tool. The log contains output from various '\''task-master'\'' commands executed sequentially.\n\nYour goal is to:\n1. Verify if the key E2E steps completed successfully based on the log messages (e.g., init, parse PRD, list tasks, analyze complexity, expand task, set status, manage models, add/remove dependencies, add/update/remove tasks/subtasks, generate files).\n2. **Specifically analyze the Multi-Provider Add-Task Test Sequence:**\n   a. Identify which providers were tested for `add-task`. Look for log steps like "Testing Add-Task with Provider: ..." and the summary log `'"$provider_summary_log"'`.\n   b. For each tested provider, determine if `add-task` succeeded or failed. Note the created task ID if successful.\n   c. Review the corresponding `add_task_show_output_<provider>_id_<id>.log` file (if created) for each successful `add-task` execution.\n   d. **Compare the quality and completeness** of the task generated by each successful provider based on their `show` output. Assign a score (e.g., 1-10, 10 being best) based on relevance to the prompt, detail level, and correctness.\n   e. Note any providers where `add-task` failed or where the task ID could not be extracted.\n3. Identify any general explicit "[ERROR]" messages or stack traces throughout the *entire* log.\n4. Identify any potential warnings or unusual output that might indicate a problem even if not marked as an explicit error.\n5. Provide an overall assessment of the test run'\''s health based *only* on the log content.\n\nReturn your analysis **strictly** in the following JSON format. Do not include any text outside of the JSON structure:\n\n{\n  "overall_status": "Success|Failure|Warning",\n  "verified_steps": [ "Initialization", "PRD Parsing", /* ...other general steps observed... */ ],\n  "provider_add_task_comparison": {\n     "prompt_used": "... (extract from log if possible or state 'standard auth prompt') ...",\n     "provider_results": {\n       "anthropic": { "status": "Success|Failure|ID_Extraction_Failed|Set_Model_Failed", "task_id": "...", "score": "X/10 | N/A", "notes": "..." },\n       "openai": { "status": "Success|Failure|...", "task_id": "...", "score": "X/10 | N/A", "notes": "..." },\n       /* ... include all tested providers ... */\n     },\n     "comparison_summary": "Brief overall comparison of generated tasks..."\n   },\n  "detected_issues": [ { "severity": "Error|Warning|Anomaly", "description": "...", "log_context": "[Optional, short snippet from log near the issue]" } ],\n  "llm_summary_points": [ "Overall summary point 1", "Provider comparison highlight", "Any major issues noted" ]\n}\n\nHere is the main log content:\n\n%s'
+
+    local full_prompt
+    printf -v full_prompt "$prompt_template" "$log_content"
+
+    # Construct the JSON payload for Claude Messages API
+    # Using jq for robust JSON construction
+    local payload
+    payload=$(jq -n --arg prompt "$full_prompt" '{
+      "model": "claude-3-7-sonnet-20250219", 
+      "max_tokens": 10000,
+      "messages": [
+        {"role": "user", "content": $prompt}
+      ],
+      "temperature": 0.0
+    }') || {
+        log_error "Failed to create JSON payload using jq."
+        return 1
+    }
+
+    log_info "Sending request to LLM API endpoint: $api_endpoint ..."
+    local response_raw response_http_code response_body
+    # Capture body and HTTP status code separately
+    response_raw=$(curl -s -w "\nHTTP_STATUS_CODE:%{http_code}" -X POST "$api_endpoint" \
+         -H "Content-Type: application/json" \
+         -H "x-api-key: $api_key" \
+         -H "anthropic-version: 2023-06-01" \
+         --data "$payload")
+
+    # Extract status code and body
+    response_http_code=$(echo "$response_raw" | grep '^HTTP_STATUS_CODE:' | sed 's/HTTP_STATUS_CODE://')
+    response_body=$(echo "$response_raw" | sed '$d') # Remove last line (status code)
+
+    if [ "$response_http_code" != "200" ]; then
+        log_error "LLM API call failed with HTTP status $response_http_code."
+        log_error "Response Body: $response_body"
+        return 1
+    fi
+
+    if [ -z "$response_body" ]; then
+        log_error "LLM API call returned empty response body."
+        return 1
+    fi
+
+    log_info "Received LLM response (HTTP 200). Parsing analysis JSON..."
+
+    # Extract the analysis JSON string from the API response (adjust jq path if needed)
+    local analysis_json_string
+    analysis_json_string=$(echo "$response_body" | jq -r '.content[0].text' 2>/dev/null) # Assumes Messages API structure
+
+    if [ -z "$analysis_json_string" ]; then
+        log_error "Failed to extract 'content[0].text' from LLM response JSON."
+        log_error "Full API response body: $response_body"
+        return 1
+    fi
+
+    # Validate and pretty-print the extracted JSON
+    if ! echo "$analysis_json_string" | jq -e . > /dev/null 2>&1; then
+        log_error "Extracted content from LLM is not valid JSON."
+        log_error "Raw extracted content: $analysis_json_string"
+        return 1
+    fi
+
+    log_success "LLM analysis completed successfully."
+    echo ""
+    echo "--- LLM Analysis ---"
+    # Pretty print the JSON analysis
+    echo "$analysis_json_string" | jq '.'
+    echo "--------------------"
+
+    return 0
+  }
  # ---

  # --- Test Setup (Output to tee) ---
@@ -95,12 +217,9 @@ overall_start_time=$(date +%s)
    exit 1
  fi

-  mkdir -p "$BASE_TEST_DIR"
  log_info "Ensured base test directory exists: $BASE_TEST_DIR"

-  TEST_RUN_DIR="$BASE_TEST_DIR/run_$TIMESTAMP"
-  mkdir -p "$TEST_RUN_DIR"
-  log_info "Created test run directory: $TEST_RUN_DIR"
+  log_info "Using test run directory (created earlier): $TEST_RUN_DIR"

  # Check if source .env file exists
  if [ ! -f "$MAIN_ENV_FILE" ]; then
@@ -209,8 +328,103 @@ overall_start_time=$(date +%s)
  log_step "Checking final model configuration"
  task-master models > models_final_config.log
  log_success "Final model config saved to models_final_config.log"
+
+  log_step "Resetting main model to default (Claude Sonnet) before provider tests"
+  task-master models --set-main claude-3-7-sonnet-20250219
+  log_success "Main model reset to claude-3-7-sonnet-20250219."
+
  # === End Model Commands Test ===

+  # === Multi-Provider Add-Task Test ===
+  log_step "Starting Multi-Provider Add-Task Test Sequence"
+
+  # Define providers, models, and flags
+  # Array order matters: providers[i] corresponds to models[i] and flags[i]
+  declare -a providers=("anthropic" "openai" "google" "perplexity" "xai" "openrouter")
+  declare -a models=(
+    "claude-3-7-sonnet-20250219"
+    "gpt-4o"
+    "gemini-2.5-pro-exp-03-25"
+    "sonar-pro"
+    "grok-3"
+    "anthropic/claude-3.7-sonnet" # OpenRouter uses Claude 3.7 
+  )
+  # Flags: Add provider-specific flags here, e.g., --openrouter. Use empty string if none.
+  declare -a flags=("" "" "" "" "" "--openrouter")
+
+  # Consistent prompt for all providers
+  add_task_prompt="Create a task to implement user authentication using OAuth 2.0 with Google as the provider. Include steps for registering the app, handling the callback, and storing user sessions."
+  log_info "Using consistent prompt for add-task tests: \"$add_task_prompt\""
+
+  for i in "${!providers[@]}"; do
+    provider="${providers[$i]}"
+    model="${models[$i]}"
+    flag="${flags[$i]}"
+
+    log_step "Testing Add-Task with Provider: $provider (Model: $model)"
+
+    # 1. Set the main model for this provider
+    log_info "Setting main model to $model for $provider ${flag:+using flag $flag}..."
+    set_model_cmd="task-master models --set-main \"$model\" $flag"
+    echo "Executing: $set_model_cmd"
+    if eval $set_model_cmd; then
+      log_success "Successfully set main model for $provider."
+    else
+      log_error "Failed to set main model for $provider. Skipping add-task for this provider."
+      # Optionally save failure info here if needed for LLM analysis
+      echo "Provider $provider set-main FAILED" >> provider_add_task_summary.log
+      continue # Skip to the next provider
+    fi
+
+    # 2. Run add-task
+    log_info "Running add-task with prompt..."
+    add_task_output_file="add_task_raw_output_${provider}.log"
+    # Run add-task and capture ALL output (stdout & stderr) to a file AND a variable
+    add_task_cmd_output=$(task-master add-task --prompt "$add_task_prompt" 2>&1 | tee "$add_task_output_file")
+    add_task_exit_code=${PIPESTATUS[0]}
+
+    # 3. Check for success and extract task ID
+    new_task_id=""
+    if [ $add_task_exit_code -eq 0 ] && echo "$add_task_cmd_output" | grep -q "Successfully added task with ID:"; then
+      # Attempt to extract the ID (adjust grep/sed/awk as needed based on actual output format)
+      new_task_id=$(echo "$add_task_cmd_output" | grep "Successfully added task with ID:" | sed 's/.*Successfully added task with ID: \([0-9.]\+\).*/\1/')
+      if [ -n "$new_task_id" ]; then
+        log_success "Add-task succeeded for $provider. New task ID: $new_task_id"
+        echo "Provider $provider add-task SUCCESS (ID: $new_task_id)" >> provider_add_task_summary.log
+      else
+        # Succeeded but couldn't parse ID - treat as warning/anomaly
+        log_error "Add-task command succeeded for $provider, but failed to extract task ID from output."
+        echo "Provider $provider add-task SUCCESS (ID extraction FAILED)" >> provider_add_task_summary.log
+        new_task_id="UNKNOWN_ID_EXTRACTION_FAILED"
+      fi
+    else
+      log_error "Add-task command failed for $provider (Exit Code: $add_task_exit_code). See $add_task_output_file for details."
+      echo "Provider $provider add-task FAILED (Exit Code: $add_task_exit_code)" >> provider_add_task_summary.log
+      new_task_id="FAILED"
+    fi
+
+    # 4. Run task show if ID was obtained (even if extraction failed, use placeholder)
+    if [ "$new_task_id" != "FAILED" ] && [ "$new_task_id" != "UNKNOWN_ID_EXTRACTION_FAILED" ]; then
+      log_info "Running task show for new task ID: $new_task_id"
+      show_output_file="add_task_show_output_${provider}_id_${new_task_id}.log"
+      if task-master show "$new_task_id" > "$show_output_file"; then
+        log_success "Task show output saved to $show_output_file"
+      else
+        log_error "task show command failed for ID $new_task_id. Check log."
+        # Still keep the file, it might contain error output
+      fi
+    elif [ "$new_task_id" == "UNKNOWN_ID_EXTRACTION_FAILED" ]; then
+       log_info "Skipping task show for $provider due to ID extraction failure."
+    else
+       log_info "Skipping task show for $provider due to add-task failure."
+    fi
+
+  done # End of provider loop
+
+  log_step "Finished Multi-Provider Add-Task Test Sequence"
+  echo "Provider add-task summary log available at: provider_add_task_summary.log"
+  # === End Multi-Provider Add-Task Test ===
+
  log_step "Listing tasks again (final)"
  task-master list --with-subtasks > task_list_final.log
  log_success "Final task list saved to task_list_final.log"
@@ -386,4 +600,26 @@ else
 fi
 echo "-------------------------"

+# --- Attempt LLM Analysis ---
+echo "DEBUG: Entering LLM Analysis section..."
+# Run this *after* the main execution block and tee pipe finish writing the log file
+# It will read the completed log file and append its output to the terminal (and the log via subsequent writes if tee is still active, though it shouldn't be)
+# Change directory back into the test run dir where .env is located
+if [ -d "$TEST_RUN_DIR" ]; then
+  echo "DEBUG: Found TEST_RUN_DIR: $TEST_RUN_DIR. Attempting cd..."
+  cd "$TEST_RUN_DIR"
+  echo "DEBUG: Changed directory to $(pwd). Calling analyze_log_with_llm..."
+  analyze_log_with_llm "$LOG_FILE"
+  echo "DEBUG: analyze_log_with_llm function call finished."
+  # Optional: cd back again if needed, though script is ending
+  # cd "$ORIGINAL_DIR"
+else
+  # Use log_error format even outside the pipe for consistency
+  current_time_for_error=$(date +%s)
+  elapsed_seconds_for_error=$((current_time_for_error - overall_start_time)) # Use overall start time
+  formatted_duration_for_error=$(_format_duration "$elapsed_seconds_for_error")
+  echo "[ERROR] [$formatted_duration_for_error] $(date +"%Y-%m-%d %H:%M:%S") Test run directory $TEST_RUN_DIR not found. Cannot perform LLM analysis." >&2
+fi
+
+echo "DEBUG: Reached end of script before final exit."
 exit $EXIT_CODE # Exit with the status of the main script block