refactor: optimize token usage, deduplicate code, fix bugs across agents

Token reduction (~40% per session, ~2.3M fewer tokens per 200-feature project): - Agent-type-specific tool lists: coding 9, testing 5, init 5 (was 19 for all) - Right-sized max_turns: coding 300, testing 100 (was 1000 for all) - Trimmed coding prompt template (~150 lines removed) - Streamlined testing prompt with batch support - YOLO mode now strips browser testing instructions from prompt - Added Grep, WebFetch, WebSearch to expand project session Performance improvements: - Rate limit retries start at ~15s with jitter (was fixed 60s) - Post-spawn delay reduced to 0.5s (was 2s) - Orchestrator consolidated to 1 DB query per loop (was 5-7) - Testing agents batch 3 features per session (was 1) - Smart context compaction preserves critical state, discards noise Bug fixes: - Removed ghost feature_release_testing MCP tool (wasted tokens every test session) - Forward all 9 Vertex AI env vars to chat sessions (was missing 3) - Fix DetachedInstanceError risk in test batch ORM access - Prevent duplicate testing of same features in parallel mode Code deduplication: - _get_project_path(): 9 copies -> 1 shared utility (project_helpers.py) - validate_project_name(): 9 copies -> 2 variants in 1 file (validation.py) - ROOT_DIR: 10 copies -> 1 definition (chat_constants.py) - API_ENV_VARS: 4 copies -> 1 source of truth (env_constants.py) Security hardening: - Unified sensitive directory blocklist (14 dirs, was two divergent lists) - Cached get_blocked_paths() for O(1) directory listing checks - Terminal security warning when ALLOW_REMOTE=1 exposes WebSocket - 20 new security tests for EXTRA_READ_PATHS blocking - Extracted _validate_command_list() and _validate_pkill_processes() helpers Type safety: - 87 mypy errors -> 0 across 58 source files - Installed types-PyYAML for proper yaml stub types - Fixed SQLAlchemy Column[T] coercions across all routers Dead code removed: - 13 files deleted (~2,679 lines): unused UI components, debug logs, outdated docs - 7 unused npm packages removed (Radix UI components with 0 imports) - AgentAvatar.tsx reduced from 615 -> 119 lines (SVGs extracted to mascotData.tsx) New CLI options: - --testing-batch-size (1-5) for parallel mode test batching - --testing-feature-ids for direct multi-feature testing Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-19 11:53:09 +00:00 · 2026-02-01 13:16:24 +02:00
parent dc5bcc4ae9
commit 94e0b05cb1
57 changed files with 1974 additions and 4300 deletions
--- a/client.py
+++ b/client.py
@@ -16,7 +16,8 @@ from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient
 from claude_agent_sdk.types import HookContext, HookInput, HookMatcher, SyncHookJSONOutput
 from dotenv import load_dotenv

-from security import bash_security_hook
+from env_constants import API_ENV_VARS
+from security import SENSITIVE_DIRECTORIES, bash_security_hook

 # Load environment variables from .env file if present
 load_dotenv()
@@ -31,43 +32,15 @@ DEFAULT_PLAYWRIGHT_HEADLESS = True
 # Firefox is recommended for lower CPU usage
 DEFAULT_PLAYWRIGHT_BROWSER = "firefox"

-# Environment variables to pass through to Claude CLI for API configuration
-# These allow using alternative API endpoints (e.g., GLM via z.ai, Vertex AI) without
-# affecting the user's global Claude Code settings
-API_ENV_VARS = [
-    "ANTHROPIC_BASE_URL",              # Custom API endpoint (e.g., https://api.z.ai/api/anthropic)
-    "ANTHROPIC_AUTH_TOKEN",            # API authentication token
-    "API_TIMEOUT_MS",                  # Request timeout in milliseconds
-    "ANTHROPIC_DEFAULT_SONNET_MODEL",  # Model override for Sonnet
-    "ANTHROPIC_DEFAULT_OPUS_MODEL",    # Model override for Opus
-    "ANTHROPIC_DEFAULT_HAIKU_MODEL",   # Model override for Haiku
-    # Vertex AI configuration
-    "CLAUDE_CODE_USE_VERTEX",          # Enable Vertex AI mode (set to "1")
-    "CLOUD_ML_REGION",                 # GCP region (e.g., us-east5)
-    "ANTHROPIC_VERTEX_PROJECT_ID",     # GCP project ID
-]
-
 # Extra read paths for cross-project file access (read-only)
 # Set EXTRA_READ_PATHS environment variable with comma-separated absolute paths
 # Example: EXTRA_READ_PATHS=/Volumes/Data/dev,/Users/shared/libs
 EXTRA_READ_PATHS_VAR = "EXTRA_READ_PATHS"

-# Sensitive directories that should never be allowed via EXTRA_READ_PATHS
-# These contain credentials, keys, or system-critical files
-EXTRA_READ_PATHS_BLOCKLIST = {
-    ".ssh",
-    ".aws",
-    ".azure",
-    ".kube",
-    ".gnupg",
-    ".gpg",
-    ".password-store",
-    ".docker",
-    ".config/gcloud",
-    ".npmrc",
-    ".pypirc",
-    ".netrc",
-}
+# Sensitive directories that should never be allowed via EXTRA_READ_PATHS.
+# Delegates to the canonical SENSITIVE_DIRECTORIES set in security.py so that
+# this blocklist and the filesystem browser API share a single source of truth.
+EXTRA_READ_PATHS_BLOCKLIST = SENSITIVE_DIRECTORIES

 def convert_model_for_vertex(model: str) -> str:
    """
@@ -209,32 +182,55 @@ def get_extra_read_paths() -> list[Path]:
    return validated_paths


-# Feature MCP tools for feature/test management
-FEATURE_MCP_TOOLS = [
-    # Core feature operations
+# Per-agent-type MCP tool lists.
+# Only expose the tools each agent type actually needs, reducing tool schema
+# overhead and preventing agents from calling tools meant for other roles.
+#
+# Tools intentionally omitted from ALL agent lists (UI/orchestrator only):
+#   feature_get_ready, feature_get_blocked, feature_get_graph,
+#   feature_remove_dependency
+#
+# The ghost tool "feature_release_testing" was removed entirely -- it was
+# listed here but never implemented in mcp_server/feature_mcp.py.
+
+CODING_AGENT_TOOLS = [
    "mcp__features__feature_get_stats",
-    "mcp__features__feature_get_by_id",  # Get assigned feature details
-    "mcp__features__feature_get_summary",  # Lightweight: id, name, status, deps only
+    "mcp__features__feature_get_by_id",
+    "mcp__features__feature_get_summary",
+    "mcp__features__feature_claim_and_get",
    "mcp__features__feature_mark_in_progress",
-    "mcp__features__feature_claim_and_get",  # Atomic claim + get details
    "mcp__features__feature_mark_passing",
-    "mcp__features__feature_mark_failing",  # Mark regression detected
+    "mcp__features__feature_mark_failing",
    "mcp__features__feature_skip",
-    "mcp__features__feature_create_bulk",
-    "mcp__features__feature_create",
    "mcp__features__feature_clear_in_progress",
-    "mcp__features__feature_release_testing",  # Release testing claim
-    # Dependency management
-    "mcp__features__feature_add_dependency",
-    "mcp__features__feature_remove_dependency",
-    "mcp__features__feature_set_dependencies",
-    # Query tools
-    "mcp__features__feature_get_ready",
-    "mcp__features__feature_get_blocked",
-    "mcp__features__feature_get_graph",
 ]

-# Playwright MCP tools for browser automation
+TESTING_AGENT_TOOLS = [
+    "mcp__features__feature_get_stats",
+    "mcp__features__feature_get_by_id",
+    "mcp__features__feature_get_summary",
+    "mcp__features__feature_mark_passing",
+    "mcp__features__feature_mark_failing",
+]
+
+INITIALIZER_AGENT_TOOLS = [
+    "mcp__features__feature_get_stats",
+    "mcp__features__feature_create_bulk",
+    "mcp__features__feature_create",
+    "mcp__features__feature_add_dependency",
+    "mcp__features__feature_set_dependencies",
+]
+
+# Union of all agent tool lists -- used for permissions (all tools remain
+# *permitted* so the MCP server can respond, but only the agent-type-specific
+# list is included in allowed_tools, which controls what the LLM sees).
+ALL_FEATURE_MCP_TOOLS = sorted(
+    set(CODING_AGENT_TOOLS) | set(TESTING_AGENT_TOOLS) | set(INITIALIZER_AGENT_TOOLS)
+)
+
+# Playwright MCP tools for browser automation.
+# Full set of tools for comprehensive UI testing including drag-and-drop,
+# hover menus, file uploads, tab management, etc.
 PLAYWRIGHT_TOOLS = [
    # Core navigation & screenshots
    "mcp__playwright__browser_navigate",
@@ -247,9 +243,10 @@ PLAYWRIGHT_TOOLS = [
    "mcp__playwright__browser_type",
    "mcp__playwright__browser_fill_form",
    "mcp__playwright__browser_select_option",
-    "mcp__playwright__browser_hover",
-    "mcp__playwright__browser_drag",
    "mcp__playwright__browser_press_key",
+    "mcp__playwright__browser_drag",
+    "mcp__playwright__browser_hover",
+    "mcp__playwright__browser_file_upload",

    # JavaScript & debugging
    "mcp__playwright__browser_evaluate",
@@ -258,16 +255,17 @@ PLAYWRIGHT_TOOLS = [
    "mcp__playwright__browser_network_requests",

    # Browser management
-    "mcp__playwright__browser_close",
    "mcp__playwright__browser_resize",
-    "mcp__playwright__browser_tabs",
    "mcp__playwright__browser_wait_for",
    "mcp__playwright__browser_handle_dialog",
-    "mcp__playwright__browser_file_upload",
    "mcp__playwright__browser_install",
+    "mcp__playwright__browser_close",
+    "mcp__playwright__browser_tabs",
 ]

-# Built-in tools
+# Built-in tools available to agents.
+# WebFetch and WebSearch are included so coding agents can look up current
+# documentation for frameworks and libraries they are implementing.
 BUILTIN_TOOLS = [
    "Read",
    "Write",
@@ -285,6 +283,7 @@ def create_client(
    model: str,
    yolo_mode: bool = False,
    agent_id: str | None = None,
+    agent_type: str = "coding",
 ):
    """
    Create a Claude Agent SDK client with multi-layered security.
@@ -295,6 +294,8 @@ def create_client(
        yolo_mode: If True, skip Playwright MCP server for rapid prototyping
        agent_id: Optional unique identifier for browser isolation in parallel mode.
                  When provided, each agent gets its own browser profile.
+        agent_type: One of "coding", "testing", or "initializer". Controls which
+                    MCP tools are exposed and the max_turns limit.

    Returns:
        Configured ClaudeSDKClient (from claude_agent_sdk)
@@ -308,13 +309,34 @@ def create_client(
    Note: Authentication is handled by start.bat/start.sh before this runs.
    The Claude SDK auto-detects credentials from the Claude CLI configuration
    """
-    # Build allowed tools list based on mode
-    # In YOLO mode, exclude Playwright tools for faster prototyping
-    allowed_tools = [*BUILTIN_TOOLS, *FEATURE_MCP_TOOLS]
+    # Select the feature MCP tools appropriate for this agent type
+    feature_tools_map = {
+        "coding": CODING_AGENT_TOOLS,
+        "testing": TESTING_AGENT_TOOLS,
+        "initializer": INITIALIZER_AGENT_TOOLS,
+    }
+    feature_tools = feature_tools_map.get(agent_type, CODING_AGENT_TOOLS)
+
+    # Select max_turns based on agent type:
+    #   - coding/initializer: 300 turns (complex multi-step implementation)
+    #   - testing: 100 turns (focused verification of a single feature)
+    max_turns_map = {
+        "coding": 300,
+        "testing": 100,
+        "initializer": 300,
+    }
+    max_turns = max_turns_map.get(agent_type, 300)
+
+    # Build allowed tools list based on mode and agent type.
+    # In YOLO mode, exclude Playwright tools for faster prototyping.
+    allowed_tools = [*BUILTIN_TOOLS, *feature_tools]
    if not yolo_mode:
        allowed_tools.extend(PLAYWRIGHT_TOOLS)

-    # Build permissions list
+    # Build permissions list.
+    # We permit ALL feature MCP tools at the security layer (so the MCP server
+    # can respond if called), but the LLM only *sees* the agent-type-specific
+    # subset via allowed_tools above.
    permissions_list = [
        # Allow all file operations within the project directory
        "Read(./**)",
@@ -325,11 +347,11 @@ def create_client(
        # Bash permission granted here, but actual commands are validated
        # by the bash_security_hook (see security.py for allowed commands)
        "Bash(*)",
-        # Allow web tools for documentation lookup
-        "WebFetch",
-        "WebSearch",
+        # Allow web tools for looking up framework/library documentation
+        "WebFetch(*)",
+        "WebSearch(*)",
        # Allow Feature MCP tools for feature management
-        *FEATURE_MCP_TOOLS,
+        *ALL_FEATURE_MCP_TOOLS,
    ]

    # Add extra read paths from environment variable (read-only access)
@@ -461,9 +483,10 @@ def create_client(
        context["project_dir"] = str(project_dir.resolve())
        return await bash_security_hook(input_data, tool_use_id, context)

-    # PreCompact hook for logging and customizing context compaction
+    # PreCompact hook for logging and customizing context compaction.
    # Compaction is handled automatically by Claude Code CLI when context approaches limits.
-    # This hook allows us to log when compaction occurs and optionally provide custom instructions.
+    # This hook provides custom instructions that guide the summarizer to preserve
+    # critical workflow state while discarding verbose/redundant content.
    async def pre_compact_hook(
        input_data: HookInput,
        tool_use_id: str | None,
@@ -476,8 +499,9 @@ def create_client(
        - "auto": Automatic compaction when context approaches token limits
        - "manual": User-initiated compaction via /compact command

-        The hook can customize compaction via hookSpecificOutput:
-        - customInstructions: String with focus areas for summarization
+        Returns custom instructions that guide the compaction summarizer to:
+        1. Preserve critical workflow state (feature ID, modified files, test results)
+        2. Discard verbose content (screenshots, long grep outputs, repeated reads)
        """
        trigger = input_data.get("trigger", "auto")
        custom_instructions = input_data.get("custom_instructions")
@@ -488,18 +512,53 @@ def create_client(
            print("[Context] Manual compaction requested")

        if custom_instructions:
-            print(f"[Context] Custom instructions: {custom_instructions}")
+            print(f"[Context] Custom instructions provided: {custom_instructions}")

-        # Return empty dict to allow compaction to proceed with default behavior
-        # To customize, return:
-        # {
-        #     "hookSpecificOutput": {
-        #         "hookEventName": "PreCompact",
-        #         "customInstructions": "Focus on preserving file paths and test results"
-        #     }
-        # }
-        return SyncHookJSONOutput()
+        # Build compaction instructions that preserve workflow-critical context
+        # while discarding verbose content that inflates token usage.
+        #
+        # The summarizer receives these instructions and uses them to decide
+        # what to keep vs. discard during context compaction.
+        compaction_guidance = "\n".join([
+            "## PRESERVE (critical workflow state)",
+            "- Current feature ID, feature name, and feature status (pending/in_progress/passing/failing)",
+            "- List of all files created or modified during this session, with their paths",
+            "- Last test/lint/type-check results: command run, pass/fail status, and key error messages",
+            "- Current step in the workflow (e.g., implementing, testing, fixing lint errors)",
+            "- Any dependency information (which features block this one)",
+            "- Git operations performed (commits, branches created)",
+            "- MCP tool call results (feature_claim_and_get, feature_mark_passing, etc.)",
+            "- Key architectural decisions made during this session",
+            "",
+            "## DISCARD (verbose content safe to drop)",
+            "- Full screenshot base64 data (just note that a screenshot was taken and what it showed)",
+            "- Long grep/find/glob output listings (summarize to: searched for X, found Y relevant files)",
+            "- Repeated file reads of the same file (keep only the latest read or a summary of changes)",
+            "- Full file contents from Read tool (summarize to: read file X, key sections were Y)",
+            "- Verbose npm/pip install output (just note: dependencies installed successfully/failed)",
+            "- Full lint/type-check output when passing (just note: lint passed with no errors)",
+            "- Browser console message dumps (summarize to: N errors found, key error was X)",
+            "- Redundant tool result confirmations ([Done] markers)",
+        ])

+        print("[Context] Applying custom compaction instructions (preserve workflow state, discard verbose content)")
+
+        # The SDK's HookSpecificOutput union type does not yet include a
+        # PreCompactHookSpecificOutput variant, but the CLI protocol accepts
+        # {"hookEventName": "PreCompact", "customInstructions": "..."}.
+        # The dict is serialized to JSON and sent to the CLI process directly,
+        # so the runtime behavior is correct despite the type mismatch.
+        return SyncHookJSONOutput(
+            hookSpecificOutput={  # type: ignore[typeddict-item]
+                "hookEventName": "PreCompact",
+                "customInstructions": compaction_guidance,
+            }
+        )
+
+    # PROMPT CACHING: The Claude Code CLI applies cache_control breakpoints internally.
+    # Our system_prompt benefits from automatic caching without explicit configuration.
+    # If explicit cache_control is needed, the SDK would need to accept content blocks
+    # with cache_control fields (not currently supported in v0.1.x).
    return ClaudeSDKClient(
        options=ClaudeAgentOptions(
            model=model,
@@ -508,7 +567,7 @@ def create_client(
            setting_sources=["project"],  # Enable skills, commands, and CLAUDE.md from project dir
            max_buffer_size=10 * 1024 * 1024,  # 10MB for large Playwright screenshots
            allowed_tools=allowed_tools,
-            mcp_servers=mcp_servers,
+            mcp_servers=mcp_servers,  # type: ignore[arg-type]  # SDK accepts dict config at runtime
            hooks={
                "PreToolUse": [
                    HookMatcher(matcher="Bash", hooks=[bash_hook_with_context]),
@@ -520,7 +579,7 @@ def create_client(
                    HookMatcher(hooks=[pre_compact_hook]),
                ],
            },
-            max_turns=1000,
+            max_turns=max_turns,
            cwd=str(project_dir.resolve()),
            settings=str(settings_file.resolve()),  # Use absolute path
            env=sdk_env,  # Pass API configuration overrides to CLI subprocess
@@ -538,7 +597,7 @@ def create_client(
            # parameters. Instead, context is managed via:
            # 1. betas=["context-1m-2025-08-07"] - Extended context window
            # 2. PreCompact hook - Intercept and customize compaction behavior
-            # 3. max_turns - Limit conversation turns (set to 1000 for long sessions)
+            # 3. max_turns - Limit conversation turns (per agent type: coding=300, testing=100)
            #
            # Future SDK versions may add explicit compaction controls. When available,
            # consider adding: