From 13128361b032e60bac70dabcf8d8cbcc98a9bc7a Mon Sep 17 00:00:00 2001 From: Auto Date: Sun, 18 Jan 2026 13:49:50 +0200 Subject: [PATCH] feat: add dedicated testing agents and enhanced parallel orchestration Introduce a new testing agent architecture that runs regression tests independently from coding agents, improving quality assurance in parallel mode. Key changes: Testing Agent System: - Add testing_prompt.template.md for dedicated testing agent role - Add feature_mark_failing MCP tool for regression detection - Add --agent-type flag to select initializer/coding/testing mode - Remove regression testing from coding prompt (now handled by testing agents) Parallel Orchestrator Enhancements: - Add testing agent spawning with configurable ratio (--testing-agent-ratio) - Add comprehensive debug logging system (DebugLog class) - Improve database session management to prevent stale reads - Add engine.dispose() calls to refresh connections after subprocess commits - Fix f-string linting issues (remove unnecessary f-prefixes) UI Improvements: - Add testing agent mascot (Chip) to AgentAvatar - Enhance AgentCard to display testing agent status - Add testing agent ratio slider in SettingsModal - Update WebSocket handling for testing agent updates - Improve ActivityFeed to show testing agent activity API & Server Updates: - Add testing_agent_ratio to settings schema and endpoints - Update process manager to support testing agent type - Enhance WebSocket messages for agent_update events Template Changes: - Delete coding_prompt_yolo.template.md (consolidated into main prompt) - Update initializer_prompt.template.md with improved structure - Streamline coding_prompt.template.md workflow Co-Authored-By: Claude Opus 4.5 --- .claude/templates/coding_prompt.template.md | 61 +- .../templates/coding_prompt_yolo.template.md | 274 -------- .../templates/initializer_prompt.template.md | 144 +++-- .claude/templates/testing_prompt.template.md | 190 ++++++ .gitignore | 1 + agent.py | 75 ++- autonomous_agent_demo.py | 130 ++-- client.py | 1 + mcp_server/feature_mcp.py | 47 +- parallel_orchestrator.py | 605 ++++++++++++++++-- prompts.py | 21 +- server/routers/agent.py | 32 +- server/routers/settings.py | 27 + server/schemas.py | 33 +- server/services/process_manager.py | 39 +- server/websocket.py | 139 +++- ui/src/components/ActivityFeed.tsx | 19 + ui/src/components/AgentAvatar.tsx | 344 +++++++++- ui/src/components/AgentCard.tsx | 57 +- ui/src/components/AgentControl.tsx | 20 +- ui/src/components/NewProjectModal.tsx | 6 +- ui/src/components/SettingsModal.tsx | 82 +++ ui/src/components/SpecCreationChat.tsx | 37 +- ui/src/hooks/useProjects.ts | 4 + ui/src/hooks/useWebSocket.ts | 7 +- ui/src/lib/api.ts | 4 + ui/src/lib/types.ts | 22 +- 27 files changed, 1885 insertions(+), 536 deletions(-) delete mode 100644 .claude/templates/coding_prompt_yolo.template.md create mode 100644 .claude/templates/testing_prompt.template.md diff --git a/.claude/templates/coding_prompt.template.md b/.claude/templates/coding_prompt.template.md index 823d297..6c24ed6 100644 --- a/.claude/templates/coding_prompt.template.md +++ b/.claude/templates/coding_prompt.template.md @@ -48,38 +48,7 @@ chmod +x init.sh Otherwise, start servers manually and document the process. -### STEP 3: VERIFICATION TEST (CRITICAL!) - -**MANDATORY BEFORE NEW WORK:** - -The previous session may have introduced bugs. Before implementing anything -new, you MUST run verification tests. 
- -Run 1-2 of the features marked as passing that are most core to the app's functionality to verify they still work. - -To get passing features for regression testing: - -``` -Use the feature_get_for_regression tool (returns up to 3 random passing features) -``` - -For example, if this were a chat app, you should perform a test that logs into the app, sends a message, and gets a response. - -**If you find ANY issues (functional or visual):** - -- Mark that feature as "passes": false immediately -- Add issues to a list -- Fix all issues BEFORE moving to new features -- This includes UI bugs like: - - White-on-white text or poor contrast - - Random characters displayed - - Incorrect timestamps - - Layout issues or overflow - - Buttons too close together - - Missing hover states - - Console errors - -### STEP 4: CHOOSE ONE FEATURE TO IMPLEMENT +### STEP 3: CHOOSE ONE FEATURE TO IMPLEMENT #### TEST-DRIVEN DEVELOPMENT MINDSET (CRITICAL) @@ -140,16 +109,16 @@ Use the feature_skip tool with feature_id={id} Document the SPECIFIC external blocker in `claude-progress.txt`. "Functionality not built" is NEVER a valid reason. -### STEP 5: IMPLEMENT THE FEATURE +### STEP 4: IMPLEMENT THE FEATURE Implement the chosen feature thoroughly: 1. Write the code (frontend and/or backend as needed) -2. Test manually using browser automation (see Step 6) +2. Test manually using browser automation (see Step 5) 3. Fix any issues discovered 4. Verify the feature works end-to-end -### STEP 6: VERIFY WITH BROWSER AUTOMATION +### STEP 5: VERIFY WITH BROWSER AUTOMATION **CRITICAL:** You MUST verify features through the actual UI. @@ -174,7 +143,7 @@ Use browser automation tools: - Skip visual verification - Mark tests passing without thorough verification -### STEP 6.5: MANDATORY VERIFICATION CHECKLIST (BEFORE MARKING ANY TEST PASSING) +### STEP 5.5: MANDATORY VERIFICATION CHECKLIST (BEFORE MARKING ANY TEST PASSING) **You MUST complete ALL of these checks before marking any feature as "passes": true** @@ -209,7 +178,7 @@ Use browser automation tools: - [ ] Loading states appeared during API calls - [ ] Error states handle failures gracefully -### STEP 6.6: MOCK DATA DETECTION SWEEP +### STEP 5.6: MOCK DATA DETECTION SWEEP **Run this sweep AFTER EVERY FEATURE before marking it as passing:** @@ -252,7 +221,7 @@ For API endpoints used by this feature: - Verify response contains actual database data - Empty database = empty response (not pre-populated mock data) -### STEP 7: UPDATE FEATURE STATUS (CAREFULLY!) +### STEP 6: UPDATE FEATURE STATUS (CAREFULLY!) **YOU CAN ONLY MODIFY ONE FIELD: "passes"** @@ -273,7 +242,7 @@ Use the feature_mark_passing tool with feature_id=42 **ONLY MARK A FEATURE AS PASSING AFTER VERIFICATION WITH SCREENSHOTS.** -### STEP 8: COMMIT YOUR PROGRESS +### STEP 7: COMMIT YOUR PROGRESS Make a descriptive git commit: @@ -288,7 +257,7 @@ git commit -m "Implement [feature name] - verified end-to-end " ``` -### STEP 9: UPDATE PROGRESS NOTES +### STEP 8: UPDATE PROGRESS NOTES Update `claude-progress.txt` with: @@ -298,7 +267,7 @@ Update `claude-progress.txt` with: - What should be worked on next - Current completion status (e.g., "45/200 tests passing") -### STEP 10: END SESSION CLEANLY +### STEP 9: END SESSION CLEANLY Before context fills up: @@ -374,12 +343,12 @@ feature_get_next # 3. Mark a feature as in-progress (call immediately after feature_get_next) feature_mark_in_progress with feature_id={id} -# 4. Get up to 3 random passing features for regression testing -feature_get_for_regression - -# 5. 
Mark a feature as passing (after verification) +# 4. Mark a feature as passing (after verification) feature_mark_passing with feature_id={id} +# 5. Mark a feature as failing (if you discover it's broken) +feature_mark_failing with feature_id={id} + # 6. Skip a feature (moves to end of queue) - ONLY when blocked by dependency feature_skip with feature_id={id} @@ -436,7 +405,7 @@ This allows you to fully test email-dependent flows without needing external ema - **All navigation works - no 404s or broken links** **You have unlimited time.** Take as long as needed to get it right. The most important thing is that you -leave the code base in a clean state before terminating the session (Step 10). +leave the code base in a clean state before terminating the session (Step 9). --- diff --git a/.claude/templates/coding_prompt_yolo.template.md b/.claude/templates/coding_prompt_yolo.template.md deleted file mode 100644 index 1ab2179..0000000 --- a/.claude/templates/coding_prompt_yolo.template.md +++ /dev/null @@ -1,274 +0,0 @@ - - - -## YOLO MODE - Rapid Prototyping (Testing Disabled) - -**WARNING:** This mode skips all browser testing and regression tests. -Features are marked as passing after lint/type-check succeeds. -Use for rapid prototyping only - not for production-quality development. - ---- - -## YOUR ROLE - CODING AGENT (YOLO MODE) - -You are continuing work on a long-running autonomous development task. -This is a FRESH context window - you have no memory of previous sessions. - -### STEP 1: GET YOUR BEARINGS (MANDATORY) - -Start by orienting yourself: - -```bash -# 1. See your working directory -pwd - -# 2. List files to understand project structure -ls -la - -# 3. Read the project specification to understand what you're building -cat app_spec.txt - -# 4. Read progress notes from previous sessions (last 500 lines to avoid context overflow) -tail -500 claude-progress.txt - -# 5. Check recent git history -git log --oneline -20 -``` - -Then use MCP tools to check feature status: - -``` -# 6. Get progress statistics (passing/total counts) -Use the feature_get_stats tool - -# 7. Get the next feature to work on -Use the feature_get_next tool -``` - -Understanding the `app_spec.txt` is critical - it contains the full requirements -for the application you're building. - -### STEP 2: START SERVERS (IF NOT RUNNING) - -If `init.sh` exists, run it: - -```bash -chmod +x init.sh -./init.sh -``` - -Otherwise, start servers manually and document the process. - -### STEP 3: CHOOSE ONE FEATURE TO IMPLEMENT - -Get the next feature to implement: - -``` -# Get the highest-priority pending feature -Use the feature_get_next tool -``` - -Once you've retrieved the feature, **immediately mark it as in-progress**: - -``` -# Mark feature as in-progress to prevent other sessions from working on it -Use the feature_mark_in_progress tool with feature_id=42 -``` - -Focus on completing one feature in this session before moving on to other features. -It's ok if you only complete one feature in this session, as there will be more sessions later that continue to make progress. 
- -#### When to Skip a Feature (EXTREMELY RARE) - -**Skipping should almost NEVER happen.** Only skip for truly external blockers you cannot control: - -- **External API not configured**: Third-party service credentials missing (e.g., Stripe keys, OAuth secrets) -- **External service unavailable**: Dependency on service that's down or inaccessible -- **Environment limitation**: Hardware or system requirement you cannot fulfill - -**NEVER skip because:** - -| Situation | Wrong Action | Correct Action | -|-----------|--------------|----------------| -| "Page doesn't exist" | Skip | Create the page | -| "API endpoint missing" | Skip | Implement the endpoint | -| "Database table not ready" | Skip | Create the migration | -| "Component not built" | Skip | Build the component | -| "No data to test with" | Skip | Create test data or build data entry flow | -| "Feature X needs to be done first" | Skip | Build feature X as part of this feature | - -If a feature requires building other functionality first, **build that functionality**. You are the coding agent - your job is to make the feature work, not to defer it. - -If you must skip (truly external blocker only): - -``` -Use the feature_skip tool with feature_id={id} -``` - -Document the SPECIFIC external blocker in `claude-progress.txt`. "Functionality not built" is NEVER a valid reason. - -### STEP 4: IMPLEMENT THE FEATURE - -Implement the chosen feature thoroughly: - -1. Write the code (frontend and/or backend as needed) -2. Ensure proper error handling -3. Follow existing code patterns in the codebase - -### STEP 5: VERIFY WITH LINT AND TYPE CHECK (YOLO MODE) - -**In YOLO mode, verification is done through static analysis only.** - -Run the appropriate lint and type-check commands for your project: - -**For TypeScript/JavaScript projects:** -```bash -npm run lint -npm run typecheck # or: npx tsc --noEmit -``` - -**For Python projects:** -```bash -ruff check . -mypy . -``` - -**If lint/type-check passes:** Proceed to mark the feature as passing. - -**If lint/type-check fails:** Fix the errors before proceeding. - -### STEP 6: UPDATE FEATURE STATUS - -**YOU CAN ONLY MODIFY ONE FIELD: "passes"** - -After lint/type-check passes, mark the feature as passing: - -``` -# Mark feature #42 as passing (replace 42 with the actual feature ID) -Use the feature_mark_passing tool with feature_id=42 -``` - -**NEVER:** - -- Delete features -- Edit feature descriptions -- Modify feature steps -- Combine or consolidate features -- Reorder features - -### STEP 7: COMMIT YOUR PROGRESS - -Make a descriptive git commit: - -```bash -git add . -git commit -m "Implement [feature name] - YOLO mode - -- Added [specific changes] -- Lint/type-check passing -- Marked feature #X as passing -" -``` - -### STEP 8: UPDATE PROGRESS NOTES - -Update `claude-progress.txt` with: - -- What you accomplished this session -- Which feature(s) you completed -- Any issues discovered or fixed -- What should be worked on next -- Current completion status (e.g., "45/200 features passing") - -### STEP 9: END SESSION CLEANLY - -Before context fills up: - -1. Commit all working code -2. Update claude-progress.txt -3. Mark features as passing if lint/type-check verified -4. Ensure no uncommitted changes -5. Leave app in working state - ---- - -## FEATURE TOOL USAGE RULES (CRITICAL - DO NOT VIOLATE) - -The feature tools exist to reduce token usage. **DO NOT make exploratory queries.** - -### ALLOWED Feature Tools (ONLY these): - -``` -# 1. 
Get progress stats (passing/in_progress/total counts) -feature_get_stats - -# 2. Get the NEXT feature to work on (one feature only) -feature_get_next - -# 3. Mark a feature as in-progress (call immediately after feature_get_next) -feature_mark_in_progress with feature_id={id} - -# 4. Mark a feature as passing (after lint/type-check succeeds) -feature_mark_passing with feature_id={id} - -# 5. Skip a feature (moves to end of queue) - ONLY when blocked by dependency -feature_skip with feature_id={id} - -# 6. Clear in-progress status (when abandoning a feature) -feature_clear_in_progress with feature_id={id} -``` - -### RULES: - -- Do NOT try to fetch lists of all features -- Do NOT query features by category -- Do NOT list all pending features - -**You do NOT need to see all features.** The feature_get_next tool tells you exactly what to work on. Trust it. - ---- - -## EMAIL INTEGRATION (DEVELOPMENT MODE) - -When building applications that require email functionality (password resets, email verification, notifications, etc.), you typically won't have access to a real email service or the ability to read email inboxes. - -**Solution:** Configure the application to log emails to the terminal instead of sending them. - -- Password reset links should be printed to the console -- Email verification links should be printed to the console -- Any notification content should be logged to the terminal - -**During testing:** - -1. Trigger the email action (e.g., click "Forgot Password") -2. Check the terminal/server logs for the generated link -3. Use that link directly to verify the functionality works - -This allows you to fully test email-dependent flows without needing external email services. - ---- - -## IMPORTANT REMINDERS (YOLO MODE) - -**Your Goal:** Rapidly prototype the application with all features implemented - -**This Session's Goal:** Complete at least one feature - -**Quality Bar (YOLO Mode):** - -- Code compiles without errors (lint/type-check passing) -- Follows existing code patterns -- Basic error handling in place -- Features are implemented according to spec - -**Note:** Browser testing and regression testing are SKIPPED in YOLO mode. -Features may have bugs that would be caught by manual testing. -Use standard mode for production-quality verification. - -**You have unlimited time.** Take as long as needed to implement features correctly. -The most important thing is that you leave the code base in a clean state before -terminating the session (Step 9). - ---- - -Begin by running Step 1 (Get Your Bearings). diff --git a/.claude/templates/initializer_prompt.template.md b/.claude/templates/initializer_prompt.template.md index 080e81c..f0baffb 100644 --- a/.claude/templates/initializer_prompt.template.md +++ b/.claude/templates/initializer_prompt.template.md @@ -26,10 +26,22 @@ which is the single source of truth for what needs to be built. **Creating Features:** -Use the feature_create_bulk tool to add all features at once: +Use the feature_create_bulk tool to add all features at once. Note: You MUST include `depends_on_indices` +to specify dependencies. Features with no dependencies can run first and enable parallel execution. 
``` Use the feature_create_bulk tool with features=[ + { + "category": "functional", + "name": "App loads without errors", + "description": "Application starts and renders homepage", + "steps": [ + "Step 1: Navigate to homepage", + "Step 2: Verify no console errors", + "Step 3: Verify main content renders" + ] + // No depends_on_indices = FOUNDATION feature (runs first) + }, { "category": "functional", "name": "User can create an account", @@ -38,7 +50,8 @@ Use the feature_create_bulk tool with features=[ "Step 1: Navigate to registration page", "Step 2: Fill in required fields", "Step 3: Submit form and verify account created" - ] + ], + "depends_on_indices": [0] // Depends on app loading }, { "category": "functional", @@ -49,7 +62,7 @@ Use the feature_create_bulk tool with features=[ "Step 2: Enter credentials", "Step 3: Verify successful login and redirect" ], - "depends_on_indices": [0] + "depends_on_indices": [0, 1] // Depends on app loading AND registration }, { "category": "functional", @@ -60,7 +73,18 @@ Use the feature_create_bulk tool with features=[ "Step 2: Navigate to dashboard", "Step 3: Verify personalized content displays" ], - "depends_on_indices": [1] + "depends_on_indices": [2] // Depends on login only + }, + { + "category": "functional", + "name": "User can update profile", + "description": "User can modify their profile information", + "steps": [ + "Step 1: Log in as user", + "Step 2: Navigate to profile settings", + "Step 3: Update and save profile" + ], + "depends_on_indices": [2] // ALSO depends on login (WIDE GRAPH - can run parallel with dashboard!) } ] ``` @@ -69,7 +93,15 @@ Use the feature_create_bulk tool with features=[ - IDs and priorities are assigned automatically based on order - All features start with `passes: false` by default - You can create features in batches if there are many (e.g., 50 at a time) -- Use `depends_on_indices` to specify dependencies (see FEATURE DEPENDENCIES section below) +- **CRITICAL:** Use `depends_on_indices` to specify dependencies (see FEATURE DEPENDENCIES section below) + +**DEPENDENCY REQUIREMENT:** +You MUST specify dependencies using `depends_on_indices` for features that logically depend on others. +- Features 0-9 should have NO dependencies (foundation/setup features) +- Features 10+ MUST have at least some dependencies where logical +- Create WIDE dependency graphs, not linear chains: + - BAD: A -> B -> C -> D -> E (linear chain, only 1 feature can run at a time) + - GOOD: A -> B, A -> C, A -> D, B -> E, C -> E (wide graph, multiple features can run in parallel) **Requirements for features:** @@ -88,10 +120,19 @@ Use the feature_create_bulk tool with features=[ --- -## FEATURE DEPENDENCIES +## FEATURE DEPENDENCIES (MANDATORY) + +**THIS SECTION IS MANDATORY. You MUST specify dependencies for features.** Dependencies enable **parallel execution** of independent features. When you specify dependencies correctly, multiple agents can work on unrelated features simultaneously, dramatically speeding up development. +**WARNING:** If you do not specify dependencies, ALL features will be ready immediately, which: +1. Overwhelms the parallel agents trying to work on unrelated features +2. Results in features being implemented in random order +3. Causes logical issues (e.g., "Edit user" attempted before "Create user") + +You MUST analyze each feature and specify its dependencies using `depends_on_indices`. + ### Why Dependencies Matter 1. 
**Parallel Execution**: Features without dependencies can run in parallel @@ -137,35 +178,64 @@ Since feature IDs aren't assigned until after creation, use **array indices** (0 1. **Start with foundation features** (index 0-10): Core setup, basic navigation, authentication 2. **Group related features together**: Keep CRUD operations adjacent -3. **Chain complex flows**: Registration → Login → Dashboard → Settings +3. **Chain complex flows**: Registration -> Login -> Dashboard -> Settings 4. **Keep dependencies shallow**: Prefer 1-2 dependencies over deep chains 5. **Skip dependencies for independent features**: Visual tests often have no dependencies -### Example: Todo App Feature Chain +### Minimum Dependency Coverage + +**REQUIREMENT:** At least 60% of your features (after index 10) should have at least one dependency. + +Target structure for a 150-feature project: +- Features 0-9: Foundation (0 dependencies) - App loads, basic setup +- Features 10-149: At least 84 should have dependencies (60% of 140) + +This ensures: +- A good mix of parallelizable features (foundation) +- Logical ordering for dependent features + +### Example: Todo App Feature Chain (Wide Graph Pattern) + +This example shows the CORRECT wide graph pattern where multiple features share the same dependency, +enabling parallel execution: ```json [ - // Foundation (no dependencies) + // FOUNDATION TIER (indices 0-2, no dependencies) + // These run first and enable everything else { "name": "App loads without errors", "category": "functional" }, { "name": "Navigation bar displays", "category": "style" }, + { "name": "Homepage renders correctly", "category": "functional" }, - // Auth chain + // AUTH TIER (indices 3-5, depend on foundation) + // These can all run in parallel once foundation passes { "name": "User can register", "depends_on_indices": [0] }, - { "name": "User can login", "depends_on_indices": [2] }, - { "name": "User can logout", "depends_on_indices": [3] }, + { "name": "User can login", "depends_on_indices": [0, 3] }, + { "name": "User can logout", "depends_on_indices": [4] }, - // Todo CRUD (depends on auth) - { "name": "User can create todo", "depends_on_indices": [3] }, - { "name": "User can view todos", "depends_on_indices": [5] }, - { "name": "User can edit todo", "depends_on_indices": [5] }, - { "name": "User can delete todo", "depends_on_indices": [5] }, + // CORE CRUD TIER (indices 6-9, depend on auth) + // WIDE GRAPH: All 4 of these depend on login (index 4) + // This means all 4 can start as soon as login passes! + { "name": "User can create todo", "depends_on_indices": [4] }, + { "name": "User can view todos", "depends_on_indices": [4] }, + { "name": "User can edit todo", "depends_on_indices": [4, 6] }, + { "name": "User can delete todo", "depends_on_indices": [4, 6] }, - // Advanced features (multiple dependencies) - { "name": "User can filter todos", "depends_on_indices": [6] }, - { "name": "User can search todos", "depends_on_indices": [6] } + // ADVANCED TIER (indices 10-11, depend on CRUD) + // Note: filter and search both depend on view (7), not on each other + { "name": "User can filter todos", "depends_on_indices": [7] }, + { "name": "User can search todos", "depends_on_indices": [7] } ] ``` +**Parallelism analysis of this example:** +- Foundation tier: 3 features can run in parallel +- Auth tier: 3 features wait for foundation, then can run (mostly parallel) +- CRUD tier: 4 features can start once login passes (all 4 in parallel!) 
+- Advanced tier: 2 features can run once view passes (both in parallel) + +**Result:** With 3 parallel agents, this 12-feature project completes in ~5-6 cycles instead of 12 sequential cycles. + --- ## MANDATORY TEST CATEGORIES @@ -585,32 +655,16 @@ Set up the basic project structure based on what's specified in `app_spec.txt`. This typically includes directories for frontend, backend, and any other components mentioned in the spec. -### OPTIONAL: Start Implementation - -If you have time remaining in this session, you may begin implementing -the highest-priority features. Get the next feature with: - -``` -Use the feature_get_next tool -``` - -Remember: -- Work on ONE feature at a time -- Test thoroughly before marking as passing -- Commit your progress before session ends - ### ENDING THIS SESSION -Before your context fills up: +Once you have completed the four tasks above: -1. Commit all work with descriptive messages -2. Create `claude-progress.txt` with a summary of what you accomplished -3. Verify features were created using the feature_get_stats tool -4. Leave the environment in a clean, working state +1. Commit all work with a descriptive message +2. Verify features were created using the feature_get_stats tool +3. Leave the environment in a clean, working state +4. Exit cleanly -The next agent will continue from here with a fresh context window. - ---- - -**Remember:** You have unlimited time across many sessions. Focus on -quality over speed. Production-ready is the goal. +**IMPORTANT:** Do NOT attempt to implement any features. Your job is setup only. +Feature implementation will be handled by parallel coding agents that spawn after +you complete initialization. Starting implementation here would create a bottleneck +and defeat the purpose of the parallel architecture. diff --git a/.claude/templates/testing_prompt.template.md b/.claude/templates/testing_prompt.template.md new file mode 100644 index 0000000..c6c8447 --- /dev/null +++ b/.claude/templates/testing_prompt.template.md @@ -0,0 +1,190 @@ +## YOUR ROLE - TESTING AGENT + +You are a **testing agent** responsible for **regression testing** previously-passing features. + +Your job is to ensure that features marked as "passing" still work correctly. If you find a regression (a feature that no longer works), you must fix it. + +### STEP 1: GET YOUR BEARINGS (MANDATORY) + +Start by orienting yourself: + +```bash +# 1. See your working directory +pwd + +# 2. List files to understand project structure +ls -la + +# 3. Read progress notes from previous sessions (last 200 lines) +tail -200 claude-progress.txt + +# 4. Check recent git history +git log --oneline -10 +``` + +Then use MCP tools to check feature status: + +``` +# 5. Get progress statistics +Use the feature_get_stats tool +``` + +### STEP 2: START SERVERS (IF NOT RUNNING) + +If `init.sh` exists, run it: + +```bash +chmod +x init.sh +./init.sh +``` + +Otherwise, start servers manually. + +### STEP 3: GET A FEATURE TO TEST + +Request ONE passing feature for regression testing: + +``` +Use the feature_get_for_regression tool with limit=1 +``` + +This returns a random feature that is currently marked as passing. Your job is to verify it still works. + +### STEP 4: VERIFY THE FEATURE + +**CRITICAL:** You MUST verify the feature through the actual UI using browser automation. + +For the feature returned: +1. Read and understand the feature's verification steps +2. Navigate to the relevant part of the application +3. Execute each verification step using browser automation +4. 
Take screenshots to document the verification +5. Check for console errors + +Use browser automation tools: + +**Navigation & Screenshots:** +- browser_navigate - Navigate to a URL +- browser_take_screenshot - Capture screenshot (use for visual verification) +- browser_snapshot - Get accessibility tree snapshot + +**Element Interaction:** +- browser_click - Click elements +- browser_type - Type text into editable elements +- browser_fill_form - Fill multiple form fields +- browser_select_option - Select dropdown options +- browser_press_key - Press keyboard keys + +**Debugging:** +- browser_console_messages - Get browser console output (check for errors) +- browser_network_requests - Monitor API calls + +### STEP 5: HANDLE RESULTS + +#### If the feature PASSES: + +The feature still works correctly. Simply confirm this and end your session: + +``` +# Log the successful verification +echo "[Testing] Feature #{id} verified - still passing" >> claude-progress.txt +``` + +**DO NOT** call feature_mark_passing again - it's already passing. + +#### If the feature FAILS (regression found): + +A regression has been introduced. You MUST fix it: + +1. **Mark the feature as failing:** + ``` + Use the feature_mark_failing tool with feature_id={id} + ``` + +2. **Investigate the root cause:** + - Check console errors + - Review network requests + - Examine recent git commits that might have caused the regression + +3. **Fix the regression:** + - Make the necessary code changes + - Test your fix using browser automation + - Ensure the feature works correctly again + +4. **Verify the fix:** + - Run through all verification steps again + - Take screenshots confirming the fix + +5. **Mark as passing after fix:** + ``` + Use the feature_mark_passing tool with feature_id={id} + ``` + +6. **Commit the fix:** + ```bash + git add . + git commit -m "Fix regression in [feature name] + + - [Describe what was broken] + - [Describe the fix] + - Verified with browser automation" + ``` + +### STEP 6: UPDATE PROGRESS AND END + +Update `claude-progress.txt`: + +```bash +echo "[Testing] Session complete - verified/fixed feature #{id}" >> claude-progress.txt +``` + +--- + +## AVAILABLE MCP TOOLS + +### Feature Management +- `feature_get_stats` - Get progress overview (passing/in_progress/total counts) +- `feature_get_for_regression` - Get a random passing feature to test +- `feature_mark_failing` - Mark a feature as failing (when you find a regression) +- `feature_mark_passing` - Mark a feature as passing (after fixing a regression) + +### Browser Automation (Playwright) +All interaction tools have **built-in auto-wait** - no manual timeouts needed. + +- `browser_navigate` - Navigate to URL +- `browser_take_screenshot` - Capture screenshot +- `browser_snapshot` - Get accessibility tree +- `browser_click` - Click elements +- `browser_type` - Type text +- `browser_fill_form` - Fill form fields +- `browser_select_option` - Select dropdown +- `browser_press_key` - Keyboard input +- `browser_console_messages` - Check for JS errors +- `browser_network_requests` - Monitor API calls + +--- + +## IMPORTANT REMINDERS + +**Your Goal:** Verify that passing features still work, and fix any regressions found. + +**This Session's Goal:** Test ONE feature thoroughly. + +**Quality Bar:** +- Zero console errors +- All verification steps pass +- Visual appearance correct +- API calls succeed + +**If you find a regression:** +1. Mark the feature as failing immediately +2. Fix the issue +3. Verify the fix with browser automation +4. 
Mark as passing only after thorough verification +5. Commit the fix + +**You have one iteration.** Focus on testing ONE feature thoroughly. + +--- + +Begin by running Step 1 (Get Your Bearings). diff --git a/.gitignore b/.gitignore index 6935128..6a4175e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Agent-generated output directories generations/ automaker/ +temp/ nul issues/ diff --git a/agent.py b/agent.py index 79d585c..59b6354 100644 --- a/agent.py +++ b/agent.py @@ -27,9 +27,9 @@ from progress import has_features, print_progress_summary, print_session_header from prompts import ( copy_spec_to_project, get_coding_prompt, - get_coding_prompt_yolo, get_initializer_prompt, get_single_feature_prompt, + get_testing_prompt, ) # Configuration @@ -116,6 +116,7 @@ async def run_autonomous_agent( max_iterations: Optional[int] = None, yolo_mode: bool = False, feature_id: Optional[int] = None, + agent_type: Optional[str] = None, ) -> None: """ Run the autonomous agent loop. @@ -124,20 +125,21 @@ async def run_autonomous_agent( project_dir: Directory for the project model: Claude model to use max_iterations: Maximum number of iterations (None for unlimited) - yolo_mode: If True, skip browser testing and use YOLO prompt - feature_id: If set, work only on this specific feature (used by parallel orchestrator) + yolo_mode: If True, skip browser testing in coding agent prompts + feature_id: If set, work only on this specific feature (used by orchestrator for coding agents) + agent_type: Type of agent: "initializer", "coding", "testing", or None (auto-detect) """ print("\n" + "=" * 70) - print(" AUTONOMOUS CODING AGENT DEMO") + print(" AUTONOMOUS CODING AGENT") print("=" * 70) print(f"\nProject directory: {project_dir}") print(f"Model: {model}") + if agent_type: + print(f"Agent type: {agent_type}") if yolo_mode: - print("Mode: YOLO (testing disabled)") - else: - print("Mode: Standard (full testing)") + print("Mode: YOLO (testing agents disabled)") if feature_id: - print(f"Single-feature mode: Feature #{feature_id}") + print(f"Feature assignment: #{feature_id}") if max_iterations: print(f"Max iterations: {max_iterations}") else: @@ -147,24 +149,34 @@ async def run_autonomous_agent( # Create project directory project_dir.mkdir(parents=True, exist_ok=True) - # Check if this is a fresh start or continuation - # Uses has_features() which checks if the database actually has features, - # not just if the file exists (empty db should still trigger initializer) - is_first_run = not has_features(project_dir) + # Determine agent type if not explicitly set + if agent_type is None: + # Auto-detect based on whether we have features + # (This path is for legacy compatibility - orchestrator should always set agent_type) + is_first_run = not has_features(project_dir) + if is_first_run: + agent_type = "initializer" + else: + agent_type = "coding" - if is_first_run: - print("Fresh start - will use initializer agent") + is_initializer = agent_type == "initializer" + + if is_initializer: + print("Running as INITIALIZER agent") print() print("=" * 70) - print(" NOTE: First session takes 10-20+ minutes!") - print(" The agent is generating 200 detailed test cases.") + print(" NOTE: Initialization takes 10-20+ minutes!") + print(" The agent is generating detailed test cases.") print(" This may appear to hang - it's working. Watch for [Tool: ...] 
output.") print("=" * 70) print() # Copy the app spec into the project directory for the agent to read copy_spec_to_project(project_dir) + elif agent_type == "testing": + print("Running as TESTING agent (regression testing)") + print_progress_summary(project_dir) else: - print("Continuing existing project") + print("Running as CODING agent") print_progress_summary(project_dir) # Main loop @@ -180,27 +192,30 @@ async def run_autonomous_agent( break # Print session header - print_session_header(iteration, is_first_run) + print_session_header(iteration, is_initializer) # Create client (fresh context) - # In single-feature mode, pass agent_id for browser isolation - agent_id = f"feature-{feature_id}" if feature_id else None + # Pass agent_id for browser isolation in multi-agent scenarios + import os + if agent_type == "testing": + agent_id = f"testing-{os.getpid()}" # Unique ID for testing agents + elif feature_id: + agent_id = f"feature-{feature_id}" + else: + agent_id = None client = create_client(project_dir, model, yolo_mode=yolo_mode, agent_id=agent_id) - # Choose prompt based on session type - # Pass project_dir to enable project-specific prompts - if is_first_run: + # Choose prompt based on agent type + if agent_type == "initializer": prompt = get_initializer_prompt(project_dir) - is_first_run = False # Only use initializer once + elif agent_type == "testing": + prompt = get_testing_prompt(project_dir) elif feature_id: - # Single-feature mode (used by parallel orchestrator) + # Single-feature mode (used by orchestrator for coding agents) prompt = get_single_feature_prompt(feature_id, project_dir, yolo_mode) else: - # Use YOLO prompt if in YOLO mode - if yolo_mode: - prompt = get_coding_prompt_yolo(project_dir) - else: - prompt = get_coding_prompt(project_dir) + # General coding prompt (legacy path) + prompt = get_coding_prompt(project_dir) # Run session with async context manager # Wrap in try/except to handle MCP server startup failures gracefully diff --git a/autonomous_agent_demo.py b/autonomous_agent_demo.py index 47fdcb3..abe8992 100644 --- a/autonomous_agent_demo.py +++ b/autonomous_agent_demo.py @@ -4,8 +4,10 @@ Autonomous Coding Agent Demo ============================ A minimal harness demonstrating long-running autonomous coding with Claude. -This script implements the two-agent pattern (initializer + coding agent) and -incorporates all the strategies from the long-running agents guide. 
+This script implements a unified orchestrator pattern that handles: +- Initialization (creating features from app_spec) +- Coding agents (implementing features) +- Testing agents (regression testing) Example Usage: # Using absolute path directly @@ -14,17 +16,22 @@ Example Usage: # Using registered project name (looked up from registry) python autonomous_agent_demo.py --project-dir my-app - # Limit iterations for testing + # Limit iterations for testing (when running as subprocess) python autonomous_agent_demo.py --project-dir my-app --max-iterations 5 - # YOLO mode: rapid prototyping without browser testing + # YOLO mode: rapid prototyping without testing agents python autonomous_agent_demo.py --project-dir my-app --yolo - # Parallel execution with 3 concurrent agents (default) - python autonomous_agent_demo.py --project-dir my-app --parallel + # Parallel execution with 3 concurrent coding agents + python autonomous_agent_demo.py --project-dir my-app --concurrency 3 - # Parallel execution with 5 concurrent agents - python autonomous_agent_demo.py --project-dir my-app --parallel 5 + # Single agent mode (orchestrator with concurrency=1, the default) + python autonomous_agent_demo.py --project-dir my-app + + # Run as specific agent type (used by orchestrator to spawn subprocesses) + python autonomous_agent_demo.py --project-dir my-app --agent-type initializer + python autonomous_agent_demo.py --project-dir my-app --agent-type coding --feature-id 42 + python autonomous_agent_demo.py --project-dir my-app --agent-type testing """ import argparse @@ -44,25 +51,28 @@ from registry import DEFAULT_MODEL, get_project_path def parse_args() -> argparse.Namespace: """Parse command line arguments.""" parser = argparse.ArgumentParser( - description="Autonomous Coding Agent Demo - Long-running agent harness", + description="Autonomous Coding Agent Demo - Unified orchestrator pattern", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Use absolute path directly + # Use absolute path directly (single agent, default) python autonomous_agent_demo.py --project-dir C:/Projects/my-app # Use registered project name (looked up from registry) python autonomous_agent_demo.py --project-dir my-app - # Use a specific model - python autonomous_agent_demo.py --project-dir my-app --model claude-sonnet-4-5-20250929 + # Parallel execution with 3 concurrent agents + python autonomous_agent_demo.py --project-dir my-app --concurrency 3 - # Limit iterations for testing - python autonomous_agent_demo.py --project-dir my-app --max-iterations 5 - - # YOLO mode: rapid prototyping without browser testing + # YOLO mode: rapid prototyping without testing agents python autonomous_agent_demo.py --project-dir my-app --yolo + # Configure testing agent ratio (2 testing agents per coding agent) + python autonomous_agent_demo.py --project-dir my-app --testing-ratio 2 + + # Disable testing agents (similar to YOLO but with verification) + python autonomous_agent_demo.py --project-dir my-app --testing-ratio 0 + Authentication: Uses Claude CLI authentication (run 'claude login' if not logged in) Authentication is handled by start.bat/start.sh before this runs @@ -80,7 +90,7 @@ Authentication: "--max-iterations", type=int, default=None, - help="Maximum number of agent iterations (default: unlimited)", + help="Maximum number of agent iterations (default: unlimited, typically 1 for subprocesses)", ) parser.add_argument( @@ -94,25 +104,56 @@ Authentication: "--yolo", action="store_true", default=False, - help="Enable 
YOLO mode: rapid prototyping without browser testing", + help="Enable YOLO mode: skip testing agents for rapid prototyping", ) + # Unified orchestrator mode (replaces --parallel) parser.add_argument( - "--parallel", - "-p", + "--concurrency", "-c", + type=int, + default=1, + help="Number of concurrent coding agents (default: 1, max: 5)", + ) + + # Backward compatibility: --parallel is deprecated alias for --concurrency + parser.add_argument( + "--parallel", "-p", type=int, nargs="?", const=3, default=None, metavar="N", - help="Enable parallel execution with N concurrent agents (default: 3, max: 5)", + help="DEPRECATED: Use --concurrency instead. Alias for --concurrency.", ) parser.add_argument( "--feature-id", type=int, default=None, - help="Work on a specific feature ID only (used by parallel orchestrator)", + help="Work on a specific feature ID only (used by orchestrator for coding agents)", + ) + + # Agent type for subprocess mode + parser.add_argument( + "--agent-type", + choices=["initializer", "coding", "testing"], + default=None, + help="Agent type (used by orchestrator to spawn specialized subprocesses)", + ) + + # Testing agent configuration + parser.add_argument( + "--testing-ratio", + type=int, + default=1, + help="Testing agents per coding agent (0-3, default: 1). Set to 0 to disable testing agents.", + ) + + parser.add_argument( + "--count-testing", + action="store_true", + default=False, + help="Count testing agents toward concurrency limit (default: false)", ) return parser.parse_args() @@ -120,11 +161,17 @@ Authentication: def main() -> None: """Main entry point.""" + print("[ENTRY] autonomous_agent_demo.py starting...", flush=True) args = parse_args() # Note: Authentication is handled by start.bat/start.sh before this script runs. # The Claude SDK auto-detects credentials from ~/.claude/.credentials.json + # Handle deprecated --parallel flag + if args.parallel is not None: + print("WARNING: --parallel is deprecated. Use --concurrency instead.", flush=True) + args.concurrency = args.parallel + # Resolve project directory: # 1. If absolute path, use as-is # 2. 
Otherwise, look up from registry by name @@ -147,28 +194,35 @@ def main() -> None: return try: - if args.parallel is not None: - # Parallel execution mode - from parallel_orchestrator import run_parallel_orchestrator - - print(f"Running in parallel mode with {args.parallel} concurrent agents") - asyncio.run( - run_parallel_orchestrator( - project_dir=project_dir, - max_concurrency=args.parallel, - model=args.model, - yolo_mode=args.yolo, - ) - ) - else: - # Standard single-agent mode (MCP server handles feature database) + if args.agent_type: + # Subprocess mode - spawned by orchestrator for a specific role asyncio.run( run_autonomous_agent( project_dir=project_dir, model=args.model, - max_iterations=args.max_iterations, + max_iterations=args.max_iterations or 1, yolo_mode=args.yolo, feature_id=args.feature_id, + agent_type=args.agent_type, + ) + ) + else: + # Entry point mode - always use unified orchestrator + from parallel_orchestrator import run_parallel_orchestrator + + # Clamp concurrency to valid range (1-5) + concurrency = max(1, min(args.concurrency, 5)) + if concurrency != args.concurrency: + print(f"Clamping concurrency to valid range: {concurrency}", flush=True) + + asyncio.run( + run_parallel_orchestrator( + project_dir=project_dir, + max_concurrency=concurrency, + model=args.model, + yolo_mode=args.yolo, + testing_agent_ratio=args.testing_ratio, + count_testing_in_concurrency=args.count_testing, ) ) except KeyboardInterrupt: diff --git a/client.py b/client.py index 6ce7dfb..ef7dc34 100644 --- a/client.py +++ b/client.py @@ -59,6 +59,7 @@ FEATURE_MCP_TOOLS = [ "mcp__features__feature_get_for_regression", "mcp__features__feature_mark_in_progress", "mcp__features__feature_mark_passing", + "mcp__features__feature_mark_failing", # Mark regression detected "mcp__features__feature_skip", "mcp__features__feature_create_bulk", "mcp__features__feature_create", diff --git a/mcp_server/feature_mcp.py b/mcp_server/feature_mcp.py index 20abc77..e46403b 100755 --- a/mcp_server/feature_mcp.py +++ b/mcp_server/feature_mcp.py @@ -11,6 +11,7 @@ Tools: - feature_get_next: Get next feature to implement - feature_get_for_regression: Get random passing features for testing - feature_mark_passing: Mark a feature as passing +- feature_mark_failing: Mark a feature as failing (regression detected) - feature_skip: Skip a feature (move to end of queue) - feature_mark_in_progress: Mark a feature as in-progress - feature_clear_in_progress: Clear in-progress status @@ -358,7 +359,8 @@ def feature_get_for_regression( ) -> str: """Get random passing features for regression testing. - Returns a random selection of features that are currently passing. + Returns a random selection of features that are currently passing + and NOT currently in progress (to avoid conflicts with coding agents). Use this to verify that previously implemented features still work after making changes. @@ -373,6 +375,7 @@ def feature_get_for_regression( features = ( session.query(Feature) .filter(Feature.passes == True) + .filter(Feature.in_progress == False) # Avoid conflicts with coding agents .order_by(func.random()) .limit(limit) .all() @@ -418,6 +421,48 @@ def feature_mark_passing( session.close() +@mcp.tool() +def feature_mark_failing( + feature_id: Annotated[int, Field(description="The ID of the feature to mark as failing", ge=1)] +) -> str: + """Mark a feature as failing after finding a regression. + + Updates the feature's passes field to false and clears the in_progress flag. 
+ Use this when a testing agent discovers that a previously-passing feature + no longer works correctly (regression detected). + + After marking as failing, you should: + 1. Investigate the root cause + 2. Fix the regression + 3. Verify the fix + 4. Call feature_mark_passing once fixed + + Args: + feature_id: The ID of the feature to mark as failing + + Returns: + JSON with the updated feature details, or error if not found. + """ + session = get_session() + try: + feature = session.query(Feature).filter(Feature.id == feature_id).first() + + if feature is None: + return json.dumps({"error": f"Feature with ID {feature_id} not found"}) + + feature.passes = False + feature.in_progress = False + session.commit() + session.refresh(feature) + + return json.dumps({ + "message": f"Feature #{feature_id} marked as failing - regression detected", + "feature": feature.to_dict() + }, indent=2) + finally: + session.close() + + @mcp.tool() def feature_skip( feature_id: Annotated[int, Field(description="The ID of the feature to skip", ge=1)] diff --git a/parallel_orchestrator.py b/parallel_orchestrator.py index da348c8..09f4e22 100644 --- a/parallel_orchestrator.py +++ b/parallel_orchestrator.py @@ -2,11 +2,19 @@ Parallel Orchestrator ===================== -Coordinates parallel execution of independent features using multiple agent processes. +Unified orchestrator that handles all agent lifecycle: +- Initialization: Creates features from app_spec if needed +- Coding agents: Implement features one at a time +- Testing agents: Regression test passing features (optional) + Uses dependency-aware scheduling to ensure features are only started when their dependencies are satisfied. Usage: + # Entry point (always uses orchestrator) + python autonomous_agent_demo.py --project-dir my-app --concurrency 3 + + # Direct orchestrator usage python parallel_orchestrator.py --project-dir my-app --max-concurrency 3 """ @@ -15,22 +23,88 @@ import os import subprocess import sys import threading +from datetime import datetime from pathlib import Path -from typing import Callable +from typing import Callable, Literal import psutil from api.database import Feature, create_database from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores +from progress import has_features # Root directory of autocoder (where this script and autonomous_agent_demo.py live) AUTOCODER_ROOT = Path(__file__).parent.resolve() +# Debug log file path +DEBUG_LOG_FILE = AUTOCODER_ROOT / "orchestrator_debug.log" + + +class DebugLogger: + """Thread-safe debug logger that writes to a file.""" + + def __init__(self, log_file: Path = DEBUG_LOG_FILE): + self.log_file = log_file + self._lock = threading.Lock() + self._session_started = False + # DON'T clear on import - only mark session start when run_loop begins + + def start_session(self): + """Mark the start of a new orchestrator session. 
Clears previous logs.""" + with self._lock: + self._session_started = True + with open(self.log_file, "w") as f: + f.write(f"=== Orchestrator Debug Log Started: {datetime.now().isoformat()} ===\n") + f.write(f"=== PID: {os.getpid()} ===\n\n") + + def log(self, category: str, message: str, **kwargs): + """Write a timestamped log entry.""" + timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3] + with self._lock: + with open(self.log_file, "a") as f: + f.write(f"[{timestamp}] [{category}] {message}\n") + for key, value in kwargs.items(): + f.write(f" {key}: {value}\n") + f.write("\n") + + def section(self, title: str): + """Write a section header.""" + with self._lock: + with open(self.log_file, "a") as f: + f.write(f"\n{'='*60}\n") + f.write(f" {title}\n") + f.write(f"{'='*60}\n\n") + + +# Global debug logger instance +debug_log = DebugLogger() + + +def _dump_database_state(session, label: str = ""): + """Helper to dump full database state to debug log.""" + from api.database import Feature + all_features = session.query(Feature).all() + + passing = [f for f in all_features if f.passes] + in_progress = [f for f in all_features if f.in_progress and not f.passes] + pending = [f for f in all_features if not f.passes and not f.in_progress] + + debug_log.log("DB_DUMP", f"Full database state {label}", + total_features=len(all_features), + passing_count=len(passing), + passing_ids=[f.id for f in passing], + in_progress_count=len(in_progress), + in_progress_ids=[f.id for f in in_progress], + pending_count=len(pending), + pending_ids=[f.id for f in pending[:10]]) # First 10 pending only + # Performance: Limit parallel agents to prevent memory exhaustion MAX_PARALLEL_AGENTS = 5 +MAX_TOTAL_AGENTS = 10 # Hard limit on total agents (coding + testing) DEFAULT_CONCURRENCY = 3 POLL_INTERVAL = 5 # seconds between checking for ready features MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature +INITIALIZER_TIMEOUT = 1800 # 30 minutes timeout for initializer def _kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> None: @@ -95,6 +169,8 @@ class ParallelOrchestrator: max_concurrency: int = DEFAULT_CONCURRENCY, model: str = None, yolo_mode: bool = False, + testing_agent_ratio: int = 1, + count_testing_in_concurrency: bool = False, on_output: Callable[[int, str], None] = None, on_status: Callable[[int, str], None] = None, ): @@ -102,9 +178,11 @@ class ParallelOrchestrator: Args: project_dir: Path to the project directory - max_concurrency: Maximum number of concurrent agents (1-5) + max_concurrency: Maximum number of concurrent coding agents (1-5) model: Claude model to use (or None for default) - yolo_mode: Whether to run in YOLO mode (skip browser testing) + yolo_mode: Whether to run in YOLO mode (skip testing agents) + testing_agent_ratio: Testing agents per coding agent (0-3, default 1) + count_testing_in_concurrency: If True, testing agents count toward concurrency limit on_output: Callback for agent output (feature_id, line) on_status: Callback for agent status changes (feature_id, status) """ @@ -112,12 +190,19 @@ class ParallelOrchestrator: self.max_concurrency = min(max(max_concurrency, 1), MAX_PARALLEL_AGENTS) self.model = model self.yolo_mode = yolo_mode + self.testing_agent_ratio = min(max(testing_agent_ratio, 0), 3) # Clamp 0-3 + self.count_testing_in_concurrency = count_testing_in_concurrency self.on_output = on_output self.on_status = on_status # Thread-safe state self._lock = threading.Lock() - self.running_agents: dict[int, subprocess.Popen] = {} + # Coding agents: 
feature_id -> process + self.running_coding_agents: dict[int, subprocess.Popen] = {} + # Testing agents: list of processes (not tied to specific features) + self.running_testing_agents: list[subprocess.Popen] = [] + # Legacy alias for backward compatibility + self.running_agents = self.running_coding_agents self.abort_events: dict[int, threading.Event] = {} self.is_running = False @@ -154,7 +239,7 @@ class ParallelOrchestrator: for f in stale: # Skip if already running in this orchestrator instance with self._lock: - if f.id in self.running_agents: + if f.id in self.running_coding_agents: continue # Skip if feature has failed too many times if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES: @@ -181,19 +266,28 @@ class ParallelOrchestrator: all_dicts = [f.to_dict() for f in all_features] ready = [] + skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0} for f in all_features: - if f.passes or f.in_progress: + if f.passes: + skipped_reasons["passes"] += 1 + continue + if f.in_progress: + skipped_reasons["in_progress"] += 1 continue # Skip if already running in this orchestrator with self._lock: - if f.id in self.running_agents: + if f.id in self.running_coding_agents: + skipped_reasons["running"] += 1 continue # Skip if feature has failed too many times if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES: + skipped_reasons["failed"] += 1 continue # Check dependencies if are_dependencies_satisfied(f.to_dict(), all_dicts): ready.append(f.to_dict()) + else: + skipped_reasons["deps"] += 1 # Sort by scheduling score (higher = first), then priority, then id scores = compute_scheduling_scores(all_dicts) @@ -207,12 +301,30 @@ class ParallelOrchestrator: f"{passing} passing, {in_progress} in_progress, {len(all_features)} total", flush=True ) + print( + f"[DEBUG] Skipped: {skipped_reasons['passes']} passing, {skipped_reasons['in_progress']} in_progress, " + f"{skipped_reasons['running']} running, {skipped_reasons['failed']} failed, {skipped_reasons['deps']} blocked by deps", + flush=True + ) + + # Log to debug file (but not every call to avoid spam) + debug_log.log("READY", "get_ready_features() called", + ready_count=len(ready), + ready_ids=[f['id'] for f in ready[:5]], # First 5 only + passing=passing, + in_progress=in_progress, + total=len(all_features), + skipped=skipped_reasons) + return ready finally: session.close() def get_all_complete(self) -> bool: - """Check if all features are complete or permanently failed.""" + """Check if all features are complete or permanently failed. + + Returns False if there are no features (initialization needed). + """ session = self.get_session() try: # Force fresh read from database to avoid stale cached data @@ -220,6 +332,11 @@ class ParallelOrchestrator: session.expire_all() all_features = session.query(Feature).all() + + # No features = NOT complete, need initialization + if len(all_features) == 0: + return False + passing_count = 0 failed_count = 0 pending_count = 0 @@ -243,8 +360,17 @@ class ParallelOrchestrator: finally: session.close() + def get_passing_count(self) -> int: + """Get the number of passing features.""" + session = self.get_session() + try: + session.expire_all() + return session.query(Feature).filter(Feature.passes == True).count() + finally: + session.close() + def start_feature(self, feature_id: int, resume: bool = False) -> tuple[bool, str]: - """Start a single feature agent. + """Start a single coding agent for a feature. 
Args: feature_id: ID of the feature to start @@ -254,9 +380,9 @@ class ParallelOrchestrator: Tuple of (success, message) """ with self._lock: - if feature_id in self.running_agents: + if feature_id in self.running_coding_agents: return False, "Feature already running" - if len(self.running_agents) >= self.max_concurrency: + if len(self.running_coding_agents) >= self.max_concurrency: return False, "At max concurrency" # Mark as in_progress in database (or verify it's resumable) @@ -281,6 +407,19 @@ class ParallelOrchestrator: finally: session.close() + # Start coding agent subprocess + success, message = self._spawn_coding_agent(feature_id) + if not success: + return False, message + + # NOTE: Testing agents are spawned in _on_agent_complete() after a coding agent + # succeeds, not here. This ensures we only spawn testing agents when there are + # actually passing features to test. + + return True, f"Started feature {feature_id}" + + def _spawn_coding_agent(self, feature_id: int) -> tuple[bool, str]: + """Spawn a coding agent subprocess for a specific feature.""" # Create abort event abort_event = threading.Event() @@ -290,8 +429,9 @@ class ParallelOrchestrator: "-u", # Force unbuffered stdout/stderr str(AUTOCODER_ROOT / "autonomous_agent_demo.py"), "--project-dir", str(self.project_dir), - "--max-iterations", "1", # Single feature mode - "--feature-id", str(feature_id), # Work on this specific feature only + "--max-iterations", "1", + "--agent-type", "coding", + "--feature-id", str(feature_id), ] if self.model: cmd.extend(["--model", self.model]) @@ -304,7 +444,7 @@ class ParallelOrchestrator: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, - cwd=str(AUTOCODER_ROOT), # Run from autocoder root for proper imports + cwd=str(AUTOCODER_ROOT), env={**os.environ, "PYTHONUNBUFFERED": "1"}, ) except Exception as e: @@ -320,23 +460,157 @@ class ParallelOrchestrator: return False, f"Failed to start agent: {e}" with self._lock: - self.running_agents[feature_id] = proc + self.running_coding_agents[feature_id] = proc self.abort_events[feature_id] = abort_event # Start output reader thread threading.Thread( target=self._read_output, - args=(feature_id, proc, abort_event), + args=(feature_id, proc, abort_event, "coding"), daemon=True ).start() if self.on_status: self.on_status(feature_id, "running") - print(f"Started agent for feature #{feature_id}", flush=True) + print(f"Started coding agent for feature #{feature_id}", flush=True) return True, f"Started feature {feature_id}" - def _read_output(self, feature_id: int, proc: subprocess.Popen, abort: threading.Event): + def _spawn_testing_agents(self) -> None: + """Spawn testing agents based on testing_agent_ratio.""" + for _ in range(self.testing_agent_ratio): + # Check resource limits + with self._lock: + total_agents = len(self.running_coding_agents) + len(self.running_testing_agents) + if total_agents >= MAX_TOTAL_AGENTS: + print(f"[DEBUG] At max total agents ({MAX_TOTAL_AGENTS}), skipping testing agent", flush=True) + break + + if self.count_testing_in_concurrency: + if total_agents >= self.max_concurrency: + print("[DEBUG] Testing agents count toward concurrency, at limit", flush=True) + break + + # Spawn a testing agent + self._spawn_testing_agent() + + def _spawn_testing_agent(self) -> tuple[bool, str]: + """Spawn a testing agent subprocess for regression testing.""" + debug_log.log("TESTING", "Attempting to spawn testing agent subprocess") + + cmd = [ + sys.executable, + "-u", + str(AUTOCODER_ROOT / "autonomous_agent_demo.py"), + 
"--project-dir", str(self.project_dir), + "--max-iterations", "1", + "--agent-type", "testing", + ] + if self.model: + cmd.extend(["--model", self.model]) + # Testing agents don't need --yolo flag (they use testing prompt regardless) + + try: + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + cwd=str(AUTOCODER_ROOT), + env={**os.environ, "PYTHONUNBUFFERED": "1"}, + ) + except Exception as e: + debug_log.log("TESTING", f"FAILED to spawn testing agent: {e}") + return False, f"Failed to start testing agent: {e}" + + with self._lock: + self.running_testing_agents.append(proc) + testing_count = len(self.running_testing_agents) + + # Start output reader thread (feature_id=None for testing agents) + threading.Thread( + target=self._read_output, + args=(None, proc, threading.Event(), "testing"), + daemon=True + ).start() + + print(f"Started testing agent (PID {proc.pid})", flush=True) + debug_log.log("TESTING", "Successfully spawned testing agent", + pid=proc.pid, + total_testing_agents=testing_count) + return True, "Started testing agent" + + async def _run_initializer(self) -> bool: + """Run initializer agent as blocking subprocess. + + Returns True if initialization succeeded (features were created). + """ + debug_log.section("INITIALIZER PHASE") + debug_log.log("INIT", "Starting initializer subprocess", + project_dir=str(self.project_dir)) + + cmd = [ + sys.executable, "-u", + str(AUTOCODER_ROOT / "autonomous_agent_demo.py"), + "--project-dir", str(self.project_dir), + "--agent-type", "initializer", + "--max-iterations", "1", + ] + if self.model: + cmd.extend(["--model", self.model]) + + print("Running initializer agent...", flush=True) + + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + cwd=str(AUTOCODER_ROOT), + env={**os.environ, "PYTHONUNBUFFERED": "1"}, + ) + + debug_log.log("INIT", "Initializer subprocess started", pid=proc.pid) + + # Stream output with timeout + loop = asyncio.get_running_loop() + try: + async def stream_output(): + while True: + line = await loop.run_in_executor(None, proc.stdout.readline) + if not line: + break + print(line.rstrip(), flush=True) + if self.on_output: + self.on_output(0, line.rstrip()) # Use 0 as feature_id for initializer + proc.wait() + + await asyncio.wait_for(stream_output(), timeout=INITIALIZER_TIMEOUT) + + except asyncio.TimeoutError: + print(f"ERROR: Initializer timed out after {INITIALIZER_TIMEOUT // 60} minutes", flush=True) + debug_log.log("INIT", "TIMEOUT - Initializer exceeded time limit", + timeout_minutes=INITIALIZER_TIMEOUT // 60) + _kill_process_tree(proc) + return False + + debug_log.log("INIT", "Initializer subprocess completed", + return_code=proc.returncode, + success=proc.returncode == 0) + + if proc.returncode != 0: + print(f"ERROR: Initializer failed with exit code {proc.returncode}", flush=True) + return False + + return True + + def _read_output( + self, + feature_id: int | None, + proc: subprocess.Popen, + abort: threading.Event, + agent_type: Literal["coding", "testing"] = "coding", + ): """Read output from subprocess and emit events.""" try: for line in proc.stdout: @@ -344,34 +618,93 @@ class ParallelOrchestrator: break line = line.rstrip() if self.on_output: - self.on_output(feature_id, line) + self.on_output(feature_id or 0, line) else: - print(f"[Feature #{feature_id}] {line}", flush=True) + if agent_type == "testing": + print(f"[Testing] {line}", flush=True) + else: + print(f"[Feature #{feature_id}] {line}", 
flush=True) proc.wait() finally: - self._on_feature_complete(feature_id, proc.returncode) + self._on_agent_complete(feature_id, proc.returncode, agent_type, proc) - def _on_feature_complete(self, feature_id: int, return_code: int): - """Handle feature completion. + def _on_agent_complete( + self, + feature_id: int | None, + return_code: int, + agent_type: Literal["coding", "testing"], + proc: subprocess.Popen, + ): + """Handle agent completion. - ALWAYS clears in_progress when agent exits, regardless of success/failure. - This prevents features from getting stuck if an agent crashes or is killed. - The agent marks features as passing BEFORE clearing in_progress, so this - is safe - we won't accidentally clear a feature that's being worked on. + For coding agents: + - ALWAYS clears in_progress when agent exits, regardless of success/failure. + - This prevents features from getting stuck if an agent crashes or is killed. + - The agent marks features as passing BEFORE clearing in_progress, so this + is safe. + + For testing agents: + - Just remove from the running list. """ + if agent_type == "testing": + with self._lock: + if proc in self.running_testing_agents: + self.running_testing_agents.remove(proc) + + status = "completed" if return_code == 0 else "failed" + print(f"Testing agent (PID {proc.pid}) {status}", flush=True) + debug_log.log("COMPLETE", "Testing agent finished", + pid=proc.pid, + status=status) + return + + # Coding agent completion + debug_log.log("COMPLETE", f"Coding agent for feature #{feature_id} finished", + return_code=return_code, + status="success" if return_code == 0 else "failed") + with self._lock: - self.running_agents.pop(feature_id, None) + self.running_coding_agents.pop(feature_id, None) self.abort_events.pop(feature_id, None) - # ALWAYS clear in_progress when agent exits to prevent stuck features - # The agent marks features as passing before clearing in_progress, - # so if in_progress is still True here, the feature didn't complete successfully + # BEFORE dispose: Query database state to see if it's stale + session_before = self.get_session() + try: + session_before.expire_all() + feature_before = session_before.query(Feature).filter(Feature.id == feature_id).first() + all_before = session_before.query(Feature).all() + passing_before = sum(1 for f in all_before if f.passes) + debug_log.log("DB", f"BEFORE engine.dispose() - Feature #{feature_id} state", + passes=feature_before.passes if feature_before else None, + in_progress=feature_before.in_progress if feature_before else None, + total_passing_in_db=passing_before) + finally: + session_before.close() + + # CRITICAL: Refresh database connection to see subprocess commits + # The coding agent runs as a subprocess and commits changes (e.g., passes=True). + # SQLAlchemy may have stale connections. Disposing the engine forces new connections + # that will see the subprocess's committed changes. 
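To make the stale-read issue described in the comment above concrete, here is a minimal, self-contained sketch of the dispose-then-reread pattern being relied on. It is illustrative only: the database path and the raw-SQL table name are assumptions, and only the `passes` flag mirrors the `Feature.passes` column used throughout this file.

```python
from sqlalchemy import create_engine, text

# Illustrative path; the real orchestrator builds its engine from project_dir.
engine = create_engine("sqlite:////tmp/example_project/features.db")

def passing_count_after_subprocess_commit() -> int:
    # Drop every pooled connection. A connection checked out before the agent
    # subprocess committed may otherwise keep serving its old view of the file.
    engine.dispose()
    # The next connect() opens a brand-new connection that sees the commit.
    with engine.connect() as conn:
        return conn.execute(
            text("SELECT COUNT(*) FROM features WHERE passes = 1")
        ).scalar_one()
```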
+ debug_log.log("DB", "Disposing database engine now...") + self._engine.dispose() + + # AFTER dispose: Query again to compare session = self.get_session() try: feature = session.query(Feature).filter(Feature.id == feature_id).first() + all_after = session.query(Feature).all() + passing_after = sum(1 for f in all_after if f.passes) + feature_passes = feature.passes if feature else None + feature_in_progress = feature.in_progress if feature else None + debug_log.log("DB", f"AFTER engine.dispose() - Feature #{feature_id} state", + passes=feature_passes, + in_progress=feature_in_progress, + total_passing_in_db=passing_after, + passing_changed=(passing_after != passing_before) if 'passing_before' in dir() else "unknown") if feature and feature.in_progress and not feature.passes: feature.in_progress = False session.commit() + debug_log.log("DB", f"Cleared in_progress for feature #{feature_id} (agent failed)") finally: session.close() @@ -382,6 +715,8 @@ class ParallelOrchestrator: failure_count = self._failure_counts[feature_id] if failure_count >= MAX_FEATURE_RETRIES: print(f"Feature #{feature_id} has failed {failure_count} times, will not retry", flush=True) + debug_log.log("COMPLETE", f"Feature #{feature_id} exceeded max retries", + failure_count=failure_count) status = "completed" if return_code == 0 else "failed" if self.on_status: @@ -389,14 +724,32 @@ class ParallelOrchestrator: # CRITICAL: This print triggers the WebSocket to emit agent_update with state='error' or 'success' print(f"Feature #{feature_id} {status}", flush=True) + # Spawn testing agents after successful coding agent completion + # This is the correct place to spawn testing agents - after we know there are + # passing features (the one this agent just completed, plus any previous ones) + if return_code == 0 and not self.yolo_mode and self.testing_agent_ratio > 0: + passing_count = self.get_passing_count() + print(f"[DEBUG] Coding agent completed successfully, passing_count={passing_count}", flush=True) + debug_log.log("TESTING", "Checking if testing agents should spawn", + yolo_mode=self.yolo_mode, + testing_agent_ratio=self.testing_agent_ratio, + passing_count=passing_count) + if passing_count > 0: + print(f"[DEBUG] Spawning testing agents (ratio={self.testing_agent_ratio})", flush=True) + debug_log.log("TESTING", f"Spawning {self.testing_agent_ratio} testing agent(s)") + self._spawn_testing_agents() + elif return_code == 0: + debug_log.log("TESTING", "Skipping testing agents", + reason="yolo_mode" if self.yolo_mode else f"ratio={self.testing_agent_ratio}") + def stop_feature(self, feature_id: int) -> tuple[bool, str]: - """Stop a running feature agent and all its child processes.""" + """Stop a running coding agent and all its child processes.""" with self._lock: - if feature_id not in self.running_agents: + if feature_id not in self.running_coding_agents: return False, "Feature not running" abort = self.abort_events.get(feature_id) - proc = self.running_agents.get(feature_id) + proc = self.running_coding_agents.get(feature_id) if abort: abort.set() @@ -407,22 +760,106 @@ class ParallelOrchestrator: return True, f"Stopped feature {feature_id}" def stop_all(self) -> None: - """Stop all running feature agents.""" + """Stop all running agents (coding and testing).""" self.is_running = False + + # Stop coding agents with self._lock: - feature_ids = list(self.running_agents.keys()) + feature_ids = list(self.running_coding_agents.keys()) for fid in feature_ids: self.stop_feature(fid) + # Stop testing agents + with self._lock: 
+ testing_procs = list(self.running_testing_agents) + + for proc in testing_procs: + _kill_process_tree(proc, timeout=5.0) + async def run_loop(self): """Main orchestration loop.""" self.is_running = True - print(f"Starting parallel orchestrator with max_concurrency={self.max_concurrency}", flush=True) + # Start debug logging session (clears previous logs) + debug_log.start_session() + + # Log startup to debug file + debug_log.section("ORCHESTRATOR STARTUP") + debug_log.log("STARTUP", "Orchestrator run_loop starting", + project_dir=str(self.project_dir), + max_concurrency=self.max_concurrency, + yolo_mode=self.yolo_mode, + testing_agent_ratio=self.testing_agent_ratio, + count_testing_in_concurrency=self.count_testing_in_concurrency) + + print("=" * 70, flush=True) + print(" UNIFIED ORCHESTRATOR SETTINGS", flush=True) + print("=" * 70, flush=True) print(f"Project: {self.project_dir}", flush=True) + print(f"Max concurrency: {self.max_concurrency} coding agents", flush=True) + print(f"YOLO mode: {self.yolo_mode}", flush=True) + print(f"Testing agent ratio: {self.testing_agent_ratio} per coding agent", flush=True) + print(f"Count testing in concurrency: {self.count_testing_in_concurrency}", flush=True) + print("=" * 70, flush=True) print(flush=True) + # Phase 1: Check if initialization needed + if not has_features(self.project_dir): + print("=" * 70, flush=True) + print(" INITIALIZATION PHASE", flush=True) + print("=" * 70, flush=True) + print("No features found - running initializer agent first...", flush=True) + print("NOTE: This may take 10-20+ minutes to generate features.", flush=True) + print(flush=True) + + success = await self._run_initializer() + + if not success or not has_features(self.project_dir): + print("ERROR: Initializer did not create features. Exiting.", flush=True) + return + + print(flush=True) + print("=" * 70, flush=True) + print(" INITIALIZATION COMPLETE - Starting feature loop", flush=True) + print("=" * 70, flush=True) + print(flush=True) + + # CRITICAL: Recreate database connection after initializer subprocess commits + # The initializer runs as a subprocess and commits to the database file. + # SQLAlchemy may have stale connections or cached state. Disposing the old + # engine and creating a fresh engine/session_maker ensures we see all the + # newly created features. 
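A rough sketch of what this refresh amounts to; the wrapper name and the injected factory parameter are illustrative, while the real code below calls the project's `create_database(project_dir)` helper directly.

```python
from pathlib import Path
from typing import Callable

from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker

EngineFactory = Callable[[Path], tuple[Engine, sessionmaker]]

def rebuild_engine(
    old_engine: Engine | None,
    project_dir: Path,
    make_db: EngineFactory,  # e.g. the project's create_database helper
) -> tuple[Engine, sessionmaker]:
    """Drop the old engine's pooled connections, then build a fresh engine
    against the database file the initializer subprocess just populated."""
    if old_engine is not None:
        old_engine.dispose()
    return make_db(project_dir)
```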
+ debug_log.section("INITIALIZATION COMPLETE") + debug_log.log("INIT", "Disposing old database engine and creating fresh connection") + print("[DEBUG] Recreating database connection after initialization...", flush=True) + if self._engine is not None: + self._engine.dispose() + self._engine, self._session_maker = create_database(self.project_dir) + + # Debug: Show state immediately after initialization + print("[DEBUG] Post-initialization state check:", flush=True) + print(f"[DEBUG] max_concurrency={self.max_concurrency}", flush=True) + print(f"[DEBUG] yolo_mode={self.yolo_mode}", flush=True) + print(f"[DEBUG] testing_agent_ratio={self.testing_agent_ratio}", flush=True) + + # Verify features were created and are visible + session = self.get_session() + try: + feature_count = session.query(Feature).count() + all_features = session.query(Feature).all() + feature_names = [f"{f.id}: {f.name}" for f in all_features[:10]] + print(f"[DEBUG] features in database={feature_count}", flush=True) + debug_log.log("INIT", "Post-initialization database state", + max_concurrency=self.max_concurrency, + yolo_mode=self.yolo_mode, + testing_agent_ratio=self.testing_agent_ratio, + feature_count=feature_count, + first_10_features=feature_names) + finally: + session.close() + + # Phase 2: Feature loop # Check for features to resume from previous session resumable = self.get_resumable_features() if resumable: @@ -431,7 +868,31 @@ class ParallelOrchestrator: print(f" - Feature #{f['id']}: {f['name']}", flush=True) print(flush=True) + debug_log.section("FEATURE LOOP STARTING") + loop_iteration = 0 while self.is_running: + loop_iteration += 1 + if loop_iteration <= 3: + print(f"[DEBUG] === Loop iteration {loop_iteration} ===", flush=True) + + # Log every iteration to debug file (first 10, then every 5th) + if loop_iteration <= 10 or loop_iteration % 5 == 0: + with self._lock: + running_ids = list(self.running_coding_agents.keys()) + testing_count = len(self.running_testing_agents) + debug_log.log("LOOP", f"Iteration {loop_iteration}", + running_coding_agents=running_ids, + running_testing_agents=testing_count, + max_concurrency=self.max_concurrency) + + # Full database dump every 5 iterations + if loop_iteration == 1 or loop_iteration % 5 == 0: + session = self.get_session() + try: + _dump_database_state(session, f"(iteration {loop_iteration})") + finally: + session.close() + try: # Check if all complete if self.get_all_complete(): @@ -440,8 +901,19 @@ class ParallelOrchestrator: # Check capacity with self._lock: - current = len(self.running_agents) + current = len(self.running_coding_agents) + current_testing = len(self.running_testing_agents) + running_ids = list(self.running_coding_agents.keys()) + + debug_log.log("CAPACITY", "Checking capacity", + current_coding=current, + current_testing=current_testing, + running_coding_ids=running_ids, + max_concurrency=self.max_concurrency, + at_capacity=(current >= self.max_concurrency)) + if current >= self.max_concurrency: + debug_log.log("CAPACITY", "At max capacity, sleeping...") await asyncio.sleep(POLL_INTERVAL) continue @@ -484,9 +956,32 @@ class ParallelOrchestrator: # Start features up to capacity slots = self.max_concurrency - current - for feature in ready[:slots]: - print(f"Starting feature #{feature['id']}: {feature['name']}", flush=True) - self.start_feature(feature["id"]) + print(f"[DEBUG] Spawning loop: {len(ready)} ready, {slots} slots available, max_concurrency={self.max_concurrency}", flush=True) + print(f"[DEBUG] Will attempt to start {min(len(ready), 
slots)} features", flush=True) + features_to_start = ready[:slots] + print(f"[DEBUG] Features to start: {[f['id'] for f in features_to_start]}", flush=True) + + debug_log.log("SPAWN", "Starting features batch", + ready_count=len(ready), + slots_available=slots, + features_to_start=[f['id'] for f in features_to_start]) + + for i, feature in enumerate(features_to_start): + print(f"[DEBUG] Starting feature {i+1}/{len(features_to_start)}: #{feature['id']} - {feature['name']}", flush=True) + success, msg = self.start_feature(feature["id"]) + if not success: + print(f"[DEBUG] Failed to start feature #{feature['id']}: {msg}", flush=True) + debug_log.log("SPAWN", f"FAILED to start feature #{feature['id']}", + feature_name=feature['name'], + error=msg) + else: + print(f"[DEBUG] Successfully started feature #{feature['id']}", flush=True) + with self._lock: + running_count = len(self.running_coding_agents) + print(f"[DEBUG] Running coding agents after start: {running_count}", flush=True) + debug_log.log("SPAWN", f"Successfully started feature #{feature['id']}", + feature_name=feature['name'], + running_coding_agents=running_count) await asyncio.sleep(2) # Brief pause between starts @@ -498,7 +993,9 @@ class ParallelOrchestrator: print("Waiting for running agents to complete...", flush=True) while True: with self._lock: - if not self.running_agents: + coding_done = len(self.running_coding_agents) == 0 + testing_done = len(self.running_testing_agents) == 0 + if coding_done and testing_done: break await asyncio.sleep(1) @@ -508,10 +1005,15 @@ class ParallelOrchestrator: """Get current orchestrator status.""" with self._lock: return { - "running_features": list(self.running_agents.keys()), - "count": len(self.running_agents), + "running_features": list(self.running_coding_agents.keys()), + "coding_agent_count": len(self.running_coding_agents), + "testing_agent_count": len(self.running_testing_agents), + "count": len(self.running_coding_agents), # Legacy compatibility "max_concurrency": self.max_concurrency, + "testing_agent_ratio": self.testing_agent_ratio, + "count_testing_in_concurrency": self.count_testing_in_concurrency, "is_running": self.is_running, + "yolo_mode": self.yolo_mode, } @@ -520,20 +1022,27 @@ async def run_parallel_orchestrator( max_concurrency: int = DEFAULT_CONCURRENCY, model: str = None, yolo_mode: bool = False, + testing_agent_ratio: int = 1, + count_testing_in_concurrency: bool = False, ) -> None: - """Run the parallel orchestrator. + """Run the unified orchestrator. 
Args: project_dir: Path to the project directory - max_concurrency: Maximum number of concurrent agents + max_concurrency: Maximum number of concurrent coding agents model: Claude model to use - yolo_mode: Whether to run in YOLO mode + yolo_mode: Whether to run in YOLO mode (skip testing agents) + testing_agent_ratio: Testing agents per coding agent (0-3) + count_testing_in_concurrency: If True, testing agents count toward concurrency limit """ + print(f"[ORCHESTRATOR] run_parallel_orchestrator called with max_concurrency={max_concurrency}", flush=True) orchestrator = ParallelOrchestrator( project_dir=project_dir, max_concurrency=max_concurrency, model=model, yolo_mode=yolo_mode, + testing_agent_ratio=testing_agent_ratio, + count_testing_in_concurrency=count_testing_in_concurrency, ) try: diff --git a/prompts.py b/prompts.py index 2c0dcfc..ad76ff0 100644 --- a/prompts.py +++ b/prompts.py @@ -74,31 +74,30 @@ def get_coding_prompt(project_dir: Path | None = None) -> str: return load_prompt("coding_prompt", project_dir) -def get_coding_prompt_yolo(project_dir: Path | None = None) -> str: - """Load the YOLO mode coding agent prompt (project-specific if available).""" - return load_prompt("coding_prompt_yolo", project_dir) +def get_testing_prompt(project_dir: Path | None = None) -> str: + """Load the testing agent prompt (project-specific if available).""" + return load_prompt("testing_prompt", project_dir) def get_single_feature_prompt(feature_id: int, project_dir: Path | None = None, yolo_mode: bool = False) -> str: """ Load the coding prompt with single-feature focus instructions prepended. - When the parallel orchestrator assigns a specific feature to an agent, + When the orchestrator assigns a specific feature to a coding agent, this prompt ensures the agent works ONLY on that feature. Args: feature_id: The specific feature ID to work on project_dir: Optional project directory for project-specific prompts - yolo_mode: If True, use the YOLO prompt variant + yolo_mode: Ignored (kept for backward compatibility). Testing is now + handled by separate testing agents, not YOLO prompts. Returns: The prompt with single-feature instructions prepended """ - # Get the base prompt - if yolo_mode: - base_prompt = get_coding_prompt_yolo(project_dir) - else: - base_prompt = get_coding_prompt(project_dir) + # Always use the standard coding prompt + # (Testing/regression is handled by separate testing agents) + base_prompt = get_coding_prompt(project_dir) # Prepend single-feature instructions single_feature_header = f"""## SINGLE FEATURE MODE @@ -185,8 +184,8 @@ def scaffold_project_prompts(project_dir: Path) -> Path: templates = [ ("app_spec.template.txt", "app_spec.txt"), ("coding_prompt.template.md", "coding_prompt.md"), - ("coding_prompt_yolo.template.md", "coding_prompt_yolo.md"), ("initializer_prompt.template.md", "initializer_prompt.md"), + ("testing_prompt.template.md", "testing_prompt.md"), ] copied_files = [] diff --git a/server/routers/agent.py b/server/routers/agent.py index a6d121b..25871c4 100644 --- a/server/routers/agent.py +++ b/server/routers/agent.py @@ -26,8 +26,12 @@ def _get_project_path(project_name: str) -> Path: return get_project_path(project_name) -def _get_settings_defaults() -> tuple[bool, str]: - """Get YOLO mode and model defaults from global settings.""" +def _get_settings_defaults() -> tuple[bool, str, int, bool]: + """Get defaults from global settings. 
+ + Returns: + Tuple of (yolo_mode, model, testing_agent_ratio, count_testing_in_concurrency) + """ import sys root = Path(__file__).parent.parent.parent if str(root) not in sys.path: @@ -38,7 +42,16 @@ def _get_settings_defaults() -> tuple[bool, str]: settings = get_all_settings() yolo_mode = (settings.get("yolo_mode") or "false").lower() == "true" model = settings.get("model", DEFAULT_MODEL) - return yolo_mode, model + + # Parse testing agent settings with defaults + try: + testing_agent_ratio = int(settings.get("testing_agent_ratio", "1")) + except (ValueError, TypeError): + testing_agent_ratio = 1 + + count_testing = (settings.get("count_testing_in_concurrency") or "false").lower() == "true" + + return yolo_mode, model, testing_agent_ratio, count_testing router = APIRouter(prefix="/api/projects/{project_name}/agent", tags=["agent"]) @@ -87,6 +100,8 @@ async def get_agent_status(project_name: str): model=manager.model, parallel_mode=manager.parallel_mode, max_concurrency=manager.max_concurrency, + testing_agent_ratio=manager.testing_agent_ratio, + count_testing_in_concurrency=manager.count_testing_in_concurrency, ) @@ -99,17 +114,20 @@ async def start_agent( manager = get_project_manager(project_name) # Get defaults from global settings if not provided in request - default_yolo, default_model = _get_settings_defaults() + default_yolo, default_model, default_testing_ratio, default_count_testing = _get_settings_defaults() + yolo_mode = request.yolo_mode if request.yolo_mode is not None else default_yolo model = request.model if request.model else default_model - parallel_mode = request.parallel_mode or False - max_concurrency = request.max_concurrency + max_concurrency = request.max_concurrency or 1 + testing_agent_ratio = request.testing_agent_ratio if request.testing_agent_ratio is not None else default_testing_ratio + count_testing = request.count_testing_in_concurrency if request.count_testing_in_concurrency is not None else default_count_testing success, message = await manager.start( yolo_mode=yolo_mode, model=model, - parallel_mode=parallel_mode, max_concurrency=max_concurrency, + testing_agent_ratio=testing_agent_ratio, + count_testing_in_concurrency=count_testing, ) return AgentActionResponse( diff --git a/server/routers/settings.py b/server/routers/settings.py index 78d6ff8..66bf88d 100644 --- a/server/routers/settings.py +++ b/server/routers/settings.py @@ -52,6 +52,23 @@ async def get_available_models(): ) +def _parse_int(value: str | None, default: int) -> int: + """Parse integer setting with default fallback.""" + if value is None: + return default + try: + return int(value) + except (ValueError, TypeError): + return default + + +def _parse_bool(value: str | None, default: bool = False) -> bool: + """Parse boolean setting with default fallback.""" + if value is None: + return default + return value.lower() == "true" + + @router.get("", response_model=SettingsResponse) async def get_settings(): """Get current global settings.""" @@ -61,6 +78,8 @@ async def get_settings(): yolo_mode=_parse_yolo_mode(all_settings.get("yolo_mode")), model=all_settings.get("model", DEFAULT_MODEL), glm_mode=_is_glm_mode(), + testing_agent_ratio=_parse_int(all_settings.get("testing_agent_ratio"), 1), + count_testing_in_concurrency=_parse_bool(all_settings.get("count_testing_in_concurrency")), ) @@ -73,10 +92,18 @@ async def update_settings(update: SettingsUpdate): if update.model is not None: set_setting("model", update.model) + if update.testing_agent_ratio is not None: + 
set_setting("testing_agent_ratio", str(update.testing_agent_ratio)) + + if update.count_testing_in_concurrency is not None: + set_setting("count_testing_in_concurrency", "true" if update.count_testing_in_concurrency else "false") + # Return updated settings all_settings = get_all_settings() return SettingsResponse( yolo_mode=_parse_yolo_mode(all_settings.get("yolo_mode")), model=all_settings.get("model", DEFAULT_MODEL), glm_mode=_is_glm_mode(), + testing_agent_ratio=_parse_int(all_settings.get("testing_agent_ratio"), 1), + count_testing_in_concurrency=_parse_bool(all_settings.get("count_testing_in_concurrency")), ) diff --git a/server/schemas.py b/server/schemas.py index b91ba5a..1140b84 100644 --- a/server/schemas.py +++ b/server/schemas.py @@ -169,8 +169,10 @@ class AgentStartRequest(BaseModel): """Request schema for starting the agent.""" yolo_mode: bool | None = None # None means use global settings model: str | None = None # None means use global settings - parallel_mode: bool | None = None # Enable parallel execution - max_concurrency: int | None = None # Max concurrent agents (1-5) + parallel_mode: bool | None = None # DEPRECATED: Use max_concurrency instead + max_concurrency: int | None = None # Max concurrent coding agents (1-5) + testing_agent_ratio: int | None = None # Testing agents per coding agent (0-3) + count_testing_in_concurrency: bool | None = None # Count testing toward limit @field_validator('model') @classmethod @@ -188,6 +190,14 @@ class AgentStartRequest(BaseModel): raise ValueError("max_concurrency must be between 1 and 5") return v + @field_validator('testing_agent_ratio') + @classmethod + def validate_testing_ratio(cls, v: int | None) -> int | None: + """Validate testing_agent_ratio is between 0 and 3.""" + if v is not None and (v < 0 or v > 3): + raise ValueError("testing_agent_ratio must be between 0 and 3") + return v + class AgentStatus(BaseModel): """Current agent status.""" @@ -196,8 +206,10 @@ class AgentStatus(BaseModel): started_at: datetime | None = None yolo_mode: bool = False model: str | None = None # Model being used by running agent - parallel_mode: bool = False + parallel_mode: bool = False # DEPRECATED: Always True now (unified orchestrator) max_concurrency: int | None = None + testing_agent_ratio: int = 1 # Testing agents per coding agent + count_testing_in_concurrency: bool = False # Count testing toward limit class AgentActionResponse(BaseModel): @@ -257,6 +269,9 @@ class WSAgentStatusMessage(BaseModel): # Agent state for multi-agent tracking AgentState = Literal["idle", "thinking", "working", "testing", "success", "error", "struggling"] +# Agent type (coding vs testing) +AgentType = Literal["coding", "testing"] + # Agent mascot names assigned by index AGENT_MASCOTS = ["Spark", "Fizz", "Octo", "Hoot", "Buzz"] @@ -266,6 +281,7 @@ class WSAgentUpdateMessage(BaseModel): type: Literal["agent_update"] = "agent_update" agentIndex: int agentName: str # One of AGENT_MASCOTS + agentType: AgentType = "coding" # "coding" or "testing" featureId: int featureName: str state: AgentState @@ -368,6 +384,8 @@ class SettingsResponse(BaseModel): yolo_mode: bool = False model: str = DEFAULT_MODEL glm_mode: bool = False # True if GLM API is configured via .env + testing_agent_ratio: int = 1 # Testing agents per coding agent (0-3) + count_testing_in_concurrency: bool = False # Count testing toward concurrency class ModelsResponse(BaseModel): @@ -380,6 +398,8 @@ class SettingsUpdate(BaseModel): """Request schema for updating global settings.""" yolo_mode: bool | None 
= None model: str | None = None + testing_agent_ratio: int | None = None # 0-3 + count_testing_in_concurrency: bool | None = None @field_validator('model') @classmethod @@ -388,6 +408,13 @@ class SettingsUpdate(BaseModel): raise ValueError(f"Invalid model. Must be one of: {VALID_MODELS}") return v + @field_validator('testing_agent_ratio') + @classmethod + def validate_testing_ratio(cls, v: int | None) -> int | None: + if v is not None and (v < 0 or v > 3): + raise ValueError("testing_agent_ratio must be between 0 and 3") + return v + # ============================================================================ # Dev Server Schemas diff --git a/server/services/process_manager.py b/server/services/process_manager.py index 2dc1137..0c50fd3 100644 --- a/server/services/process_manager.py +++ b/server/services/process_manager.py @@ -8,6 +8,7 @@ Provides start/stop/pause/resume functionality with cross-platform support. import asyncio import logging +import os import re import subprocess import sys @@ -82,6 +83,8 @@ class AgentProcessManager: self.model: str | None = None # Model being used self.parallel_mode: bool = False # Parallel execution mode self.max_concurrency: int | None = None # Max concurrent agents + self.testing_agent_ratio: int = 1 # Testing agents per coding agent + self.count_testing_in_concurrency: bool = False # Count testing toward limit # Support multiple callbacks (for multiple WebSocket clients) self._output_callbacks: Set[Callable[[str], Awaitable[None]]] = set() @@ -292,15 +295,19 @@ class AgentProcessManager: model: str | None = None, parallel_mode: bool = False, max_concurrency: int | None = None, + testing_agent_ratio: int = 1, + count_testing_in_concurrency: bool = False, ) -> tuple[bool, str]: """ Start the agent as a subprocess. 
Args: - yolo_mode: If True, run in YOLO mode (no browser testing) + yolo_mode: If True, run in YOLO mode (skip testing agents) model: Model to use (e.g., claude-opus-4-5-20251101) - parallel_mode: If True, run multiple features in parallel - max_concurrency: Max concurrent agents (default 3 if parallel enabled) + parallel_mode: DEPRECATED - ignored, always uses unified orchestrator + max_concurrency: Max concurrent coding agents (1-5, default 1) + testing_agent_ratio: Testing agents per coding agent (0-3, default 1) + count_testing_in_concurrency: If True, testing agents count toward limit Returns: Tuple of (success, message) @@ -314,12 +321,15 @@ class AgentProcessManager: # Store for status queries self.yolo_mode = yolo_mode self.model = model - self.parallel_mode = parallel_mode - self.max_concurrency = max_concurrency + self.parallel_mode = True # Always True now (unified orchestrator) + self.max_concurrency = max_concurrency or 1 + self.testing_agent_ratio = testing_agent_ratio + self.count_testing_in_concurrency = count_testing_in_concurrency - # Build command - pass absolute path to project directory + # Build command - unified orchestrator with --concurrency cmd = [ sys.executable, + "-u", # Force unbuffered stdout/stderr for real-time output str(self.root_dir / "autonomous_agent_demo.py"), "--project-dir", str(self.project_dir.resolve()), @@ -333,19 +343,24 @@ class AgentProcessManager: if yolo_mode: cmd.append("--yolo") - # Add --parallel flag if parallel mode is enabled - if parallel_mode: - cmd.append("--parallel") - cmd.append(str(max_concurrency or 3)) # Default to 3 concurrent agents + # Add --concurrency flag (unified orchestrator always uses this) + cmd.extend(["--concurrency", str(max_concurrency or 1)]) + + # Add testing agent configuration + cmd.extend(["--testing-ratio", str(testing_agent_ratio)]) + if count_testing_in_concurrency: + cmd.append("--count-testing") try: # Start subprocess with piped stdout/stderr # Use project_dir as cwd so Claude SDK sandbox allows access to project files + # IMPORTANT: Set PYTHONUNBUFFERED to ensure output isn't delayed self.process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=str(self.project_dir), + env={**os.environ, "PYTHONUNBUFFERED": "1"}, ) # Atomic lock creation - if it fails, another process beat us @@ -412,6 +427,8 @@ class AgentProcessManager: self.model = None # Reset model self.parallel_mode = False # Reset parallel mode self.max_concurrency = None # Reset concurrency + self.testing_agent_ratio = 1 # Reset testing ratio + self.count_testing_in_concurrency = False # Reset count testing return True, "Agent stopped" except Exception as e: @@ -496,6 +513,8 @@ class AgentProcessManager: "model": self.model, "parallel_mode": self.parallel_mode, "max_concurrency": self.max_concurrency, + "testing_agent_ratio": self.testing_agent_ratio, + "count_testing_in_concurrency": self.count_testing_in_concurrency, } diff --git a/server/websocket.py b/server/websocket.py index 63a2a1d..6d8c849 100644 --- a/server/websocket.py +++ b/server/websocket.py @@ -24,9 +24,12 @@ _count_passing_tests = None logger = logging.getLogger(__name__) -# Pattern to extract feature ID from parallel orchestrator output +# Pattern to extract feature ID from parallel orchestrator output (coding agents) FEATURE_ID_PATTERN = re.compile(r'\[Feature #(\d+)\]\s*(.*)') +# Pattern to extract testing agent output +TESTING_AGENT_PATTERN = re.compile(r'\[Testing\]\s*(.*)') + # Patterns for detecting agent activity and thoughts 
THOUGHT_PATTERNS = [ # Claude's tool usage patterns (actual format: [Tool: name]) @@ -49,8 +52,12 @@ THOUGHT_PATTERNS = [ class AgentTracker: """Tracks active agents and their states for multi-agent mode.""" + # Use a special key for the testing agent since it doesn't have a fixed feature ID + TESTING_AGENT_KEY = -1 + def __init__(self): - # feature_id -> {name, state, last_thought, agent_index} + # feature_id -> {name, state, last_thought, agent_index, agent_type} + # For testing agents, use TESTING_AGENT_KEY as the key self.active_agents: dict[int, dict] = {} self._next_agent_index = 0 self._lock = asyncio.Lock() @@ -61,16 +68,24 @@ class AgentTracker: Returns None if no update should be emitted. """ - # Check for feature-specific output + # Check for testing agent output first + testing_match = TESTING_AGENT_PATTERN.match(line) + if testing_match: + content = testing_match.group(1) + return await self._process_testing_agent_line(content) + + # Check for feature-specific output (coding agents) match = FEATURE_ID_PATTERN.match(line) if not match: # Also check for orchestrator status messages - if line.startswith("Started agent for feature #"): + if line.startswith("Started coding agent for feature #"): try: feature_id = int(re.search(r'#(\d+)', line).group(1)) - return await self._handle_agent_start(feature_id, line) + return await self._handle_agent_start(feature_id, line, agent_type="coding") except (AttributeError, ValueError): pass + elif line.startswith("Started testing agent"): + return await self._handle_testing_agent_start(line) elif line.startswith("Feature #") and ("completed" in line or "failed" in line): try: feature_id = int(re.search(r'#(\d+)', line).group(1)) @@ -78,6 +93,10 @@ class AgentTracker: return await self._handle_agent_complete(feature_id, is_success) except (AttributeError, ValueError): pass + elif line.startswith("Testing agent") and ("completed" in line or "failed" in line): + # Format: "Testing agent (PID xxx) completed" or "Testing agent (PID xxx) failed" + is_success = "completed" in line + return await self._handle_testing_agent_complete(is_success) return None feature_id = int(match.group(1)) @@ -91,6 +110,7 @@ class AgentTracker: self.active_agents[feature_id] = { 'name': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)], 'agent_index': agent_index, + 'agent_type': 'coding', 'state': 'thinking', 'feature_name': f'Feature #{feature_id}', 'last_thought': None, @@ -119,6 +139,7 @@ class AgentTracker: 'type': 'agent_update', 'agentIndex': agent['agent_index'], 'agentName': agent['name'], + 'agentType': agent['agent_type'], 'featureId': feature_id, 'featureName': agent['feature_name'], 'state': state, @@ -128,6 +149,108 @@ class AgentTracker: return None + async def _process_testing_agent_line(self, content: str) -> dict | None: + """Process output from a testing agent.""" + async with self._lock: + # Ensure testing agent is tracked + if self.TESTING_AGENT_KEY not in self.active_agents: + agent_index = self._next_agent_index + self._next_agent_index += 1 + self.active_agents[self.TESTING_AGENT_KEY] = { + 'name': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)], + 'agent_index': agent_index, + 'agent_type': 'testing', + 'state': 'testing', + 'feature_name': 'Regression Testing', + 'last_thought': None, + } + + agent = self.active_agents[self.TESTING_AGENT_KEY] + + # Detect state and thought from content + state = 'testing' + thought = None + + for pattern, detected_state in THOUGHT_PATTERNS: + m = pattern.search(content) + if m: + state = detected_state + thought 
= m.group(1) if m.lastindex else content[:100] + break + + # Only emit update if state changed or we have a new thought + if state != agent['state'] or thought != agent['last_thought']: + agent['state'] = state + if thought: + agent['last_thought'] = thought + + return { + 'type': 'agent_update', + 'agentIndex': agent['agent_index'], + 'agentName': agent['name'], + 'agentType': 'testing', + 'featureId': 0, # Testing agents work on random features + 'featureName': agent['feature_name'], + 'state': state, + 'thought': thought, + 'timestamp': datetime.now().isoformat(), + } + + return None + + async def _handle_testing_agent_start(self, line: str) -> dict | None: + """Handle testing agent start message from orchestrator.""" + async with self._lock: + agent_index = self._next_agent_index + self._next_agent_index += 1 + + self.active_agents[self.TESTING_AGENT_KEY] = { + 'name': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)], + 'agent_index': agent_index, + 'agent_type': 'testing', + 'state': 'testing', + 'feature_name': 'Regression Testing', + 'last_thought': 'Starting regression tests...', + } + + return { + 'type': 'agent_update', + 'agentIndex': agent_index, + 'agentName': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)], + 'agentType': 'testing', + 'featureId': 0, + 'featureName': 'Regression Testing', + 'state': 'testing', + 'thought': 'Starting regression tests...', + 'timestamp': datetime.now().isoformat(), + } + + async def _handle_testing_agent_complete(self, is_success: bool) -> dict | None: + """Handle testing agent completion.""" + async with self._lock: + if self.TESTING_AGENT_KEY not in self.active_agents: + return None + + agent = self.active_agents[self.TESTING_AGENT_KEY] + state = 'success' if is_success else 'error' + + result = { + 'type': 'agent_update', + 'agentIndex': agent['agent_index'], + 'agentName': agent['name'], + 'agentType': 'testing', + 'featureId': 0, + 'featureName': agent['feature_name'], + 'state': state, + 'thought': 'Tests passed!' if is_success else 'Found regressions', + 'timestamp': datetime.now().isoformat(), + } + + # Remove from active agents + del self.active_agents[self.TESTING_AGENT_KEY] + + return result + def get_agent_info(self, feature_id: int) -> tuple[int | None, str | None]: """Get agent index and name for a feature ID. 
@@ -139,7 +262,7 @@ class AgentTracker: return agent['agent_index'], agent['name'] return None, None - async def _handle_agent_start(self, feature_id: int, line: str) -> dict | None: + async def _handle_agent_start(self, feature_id: int, line: str, agent_type: str = "coding") -> dict | None: """Handle agent start message from orchestrator.""" async with self._lock: agent_index = self._next_agent_index @@ -154,6 +277,7 @@ class AgentTracker: self.active_agents[feature_id] = { 'name': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)], 'agent_index': agent_index, + 'agent_type': agent_type, 'state': 'thinking', 'feature_name': feature_name, 'last_thought': 'Starting work...', @@ -163,6 +287,7 @@ class AgentTracker: 'type': 'agent_update', 'agentIndex': agent_index, 'agentName': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)], + 'agentType': agent_type, 'featureId': feature_id, 'featureName': feature_name, 'state': 'thinking', @@ -178,11 +303,13 @@ class AgentTracker: agent = self.active_agents[feature_id] state = 'success' if is_success else 'error' + agent_type = agent.get('agent_type', 'coding') result = { 'type': 'agent_update', 'agentIndex': agent['agent_index'], 'agentName': agent['name'], + 'agentType': agent_type, 'featureId': feature_id, 'featureName': agent['feature_name'], 'state': state, diff --git a/ui/src/components/ActivityFeed.tsx b/ui/src/components/ActivityFeed.tsx index b986b0f..46a695b 100644 --- a/ui/src/components/ActivityFeed.tsx +++ b/ui/src/components/ActivityFeed.tsx @@ -83,11 +83,30 @@ export function ActivityFeed({ activities, maxItems = 5, showHeader = true }: Ac function getMascotColor(name: AgentMascot): string { const colors: Record = { + // Original 5 Spark: '#3B82F6', Fizz: '#F97316', Octo: '#8B5CF6', Hoot: '#22C55E', Buzz: '#EAB308', + // Tech-inspired + Pixel: '#EC4899', + Byte: '#06B6D4', + Nova: '#F43F5E', + Chip: '#84CC16', + Bolt: '#FBBF24', + // Energetic + Dash: '#14B8A6', + Zap: '#A855F7', + Gizmo: '#64748B', + Turbo: '#EF4444', + Blip: '#10B981', + // Playful + Neon: '#D946EF', + Widget: '#6366F1', + Zippy: '#F59E0B', + Quirk: '#0EA5E9', + Flux: '#7C3AED', } return colors[name] || '#6B7280' } diff --git a/ui/src/components/AgentAvatar.tsx b/ui/src/components/AgentAvatar.tsx index 5d0c9f1..72a798b 100644 --- a/ui/src/components/AgentAvatar.tsx +++ b/ui/src/components/AgentAvatar.tsx @@ -8,11 +8,30 @@ interface AgentAvatarProps { } const AVATAR_COLORS: Record = { + // Original 5 Spark: { primary: '#3B82F6', secondary: '#60A5FA', accent: '#DBEAFE' }, // Blue robot Fizz: { primary: '#F97316', secondary: '#FB923C', accent: '#FFEDD5' }, // Orange fox Octo: { primary: '#8B5CF6', secondary: '#A78BFA', accent: '#EDE9FE' }, // Purple octopus Hoot: { primary: '#22C55E', secondary: '#4ADE80', accent: '#DCFCE7' }, // Green owl Buzz: { primary: '#EAB308', secondary: '#FACC15', accent: '#FEF9C3' }, // Yellow bee + // Tech-inspired + Pixel: { primary: '#EC4899', secondary: '#F472B6', accent: '#FCE7F3' }, // Pink + Byte: { primary: '#06B6D4', secondary: '#22D3EE', accent: '#CFFAFE' }, // Cyan + Nova: { primary: '#F43F5E', secondary: '#FB7185', accent: '#FFE4E6' }, // Rose + Chip: { primary: '#84CC16', secondary: '#A3E635', accent: '#ECFCCB' }, // Lime + Bolt: { primary: '#FBBF24', secondary: '#FCD34D', accent: '#FEF3C7' }, // Amber + // Energetic + Dash: { primary: '#14B8A6', secondary: '#2DD4BF', accent: '#CCFBF1' }, // Teal + Zap: { primary: '#A855F7', secondary: '#C084FC', accent: '#F3E8FF' }, // Violet + Gizmo: { primary: '#64748B', secondary: '#94A3B8', accent: 
'#F1F5F9' }, // Slate + Turbo: { primary: '#EF4444', secondary: '#F87171', accent: '#FEE2E2' }, // Red + Blip: { primary: '#10B981', secondary: '#34D399', accent: '#D1FAE5' }, // Emerald + // Playful + Neon: { primary: '#D946EF', secondary: '#E879F9', accent: '#FAE8FF' }, // Fuchsia + Widget: { primary: '#6366F1', secondary: '#818CF8', accent: '#E0E7FF' }, // Indigo + Zippy: { primary: '#F59E0B', secondary: '#FBBF24', accent: '#FEF3C7' }, // Orange-yellow + Quirk: { primary: '#0EA5E9', secondary: '#38BDF8', accent: '#E0F2FE' }, // Sky + Flux: { primary: '#7C3AED', secondary: '#8B5CF6', accent: '#EDE9FE' }, // Purple } const SIZES = { @@ -150,12 +169,335 @@ function BuzzSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Buzz; size: nu ) } +// Pixel - cute pixel art style character +function PixelSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Pixel; size: number }) { + return ( + + {/* Blocky body */} + + + + {/* Head */} + + {/* Eyes */} + + + + + {/* Mouth */} + + + ) +} + +// Byte - data cube character +function ByteSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Byte; size: number }) { + return ( + + {/* 3D cube body */} + + + + {/* Face */} + + + + + + + ) +} + +// Nova - star character +function NovaSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Nova; size: number }) { + return ( + + {/* Star points */} + + + {/* Face */} + + + + + + + ) +} + +// Chip - circuit board character +function ChipSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Chip; size: number }) { + return ( + + {/* Chip body */} + + {/* Pins */} + + + + + + + {/* Face */} + + + + + + + ) +} + +// Bolt - lightning character +function BoltSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Bolt; size: number }) { + return ( + + {/* Lightning bolt body */} + + + {/* Face */} + + + + + + ) +} + +// Dash - speedy character +function DashSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Dash; size: number }) { + return ( + + {/* Speed lines */} + + + {/* Aerodynamic body */} + + + {/* Face */} + + + + + + + ) +} + +// Zap - electric orb +function ZapSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Zap; size: number }) { + return ( + + {/* Electric sparks */} + + + {/* Orb */} + + + {/* Face */} + + + + + + + ) +} + +// Gizmo - gear character +function GizmoSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Gizmo; size: number }) { + return ( + + {/* Gear teeth */} + + + + + {/* Gear body */} + + + {/* Face */} + + + + + + + ) +} + +// Turbo - rocket character +function TurboSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Turbo; size: number }) { + return ( + + {/* Flames */} + + + {/* Rocket body */} + + {/* Nose cone */} + + {/* Fins */} + + + {/* Window/Face */} + + + + + + ) +} + +// Blip - radar dot character +function BlipSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Blip; size: number }) { + return ( + + {/* Radar rings */} + + + {/* Main dot */} + + + {/* Face */} + + + + + + + ) +} + +// Neon - glowing character +function NeonSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Neon; size: number }) { + return ( + + {/* Glow effect */} + + + {/* Body */} + + {/* Inner glow */} + + {/* Face */} + + + + + + + ) +} + +// Widget - UI component character +function WidgetSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Widget; size: number }) { + return ( + + {/* Window frame */} + + {/* Title bar */} + + + + + {/* Content area / Face */} + + + + + + + + ) +} + +// Zippy - fast bunny-like character +function ZippySVG({ colors, size }: { colors: typeof AVATAR_COLORS.Zippy; 
size: number }) { + return ( + + {/* Ears */} + + + + + {/* Head */} + + {/* Face */} + + + + + {/* Nose and mouth */} + + + + ) +} + +// Quirk - question mark character +function QuirkSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Quirk; size: number }) { + return ( + + {/* Question mark body */} + + + {/* Face on the dot */} + + + + + {/* Decorative swirl */} + + + ) +} + +// Flux - flowing wave character +function FluxSVG({ colors, size }: { colors: typeof AVATAR_COLORS.Flux; size: number }) { + return ( + + {/* Wave body */} + + + {/* Face */} + + + + + {/* Sparkles */} + + + + ) +} + const MASCOT_SVGS: Record = { + // Original 5 Spark: SparkSVG, Fizz: FizzSVG, Octo: OctoSVG, Hoot: HootSVG, Buzz: BuzzSVG, + // Tech-inspired + Pixel: PixelSVG, + Byte: ByteSVG, + Nova: NovaSVG, + Chip: ChipSVG, + Bolt: BoltSVG, + // Energetic + Dash: DashSVG, + Zap: ZapSVG, + Gizmo: GizmoSVG, + Turbo: TurboSVG, + Blip: BlipSVG, + // Playful + Neon: NeonSVG, + Widget: WidgetSVG, + Zippy: ZippySVG, + Quirk: QuirkSVG, + Flux: FluxSVG, } // Animation classes based on state @@ -256,6 +598,6 @@ export function AgentAvatar({ name, state, size = 'md', showName = false }: Agen // Get mascot name by index (cycles through available mascots) export function getMascotName(index: number): AgentMascot { - const mascots: AgentMascot[] = ['Spark', 'Fizz', 'Octo', 'Hoot', 'Buzz'] + const mascots = Object.keys(MASCOT_SVGS) as AgentMascot[] return mascots[index % mascots.length] } diff --git a/ui/src/components/AgentCard.tsx b/ui/src/components/AgentCard.tsx index 2c027b2..befe63b 100644 --- a/ui/src/components/AgentCard.tsx +++ b/ui/src/components/AgentCard.tsx @@ -1,8 +1,8 @@ -import { MessageCircle, ScrollText, X, Copy, Check } from 'lucide-react' +import { MessageCircle, ScrollText, X, Copy, Check, Code, FlaskConical } from 'lucide-react' import { useState } from 'react' import { createPortal } from 'react-dom' import { AgentAvatar } from './AgentAvatar' -import type { ActiveAgent, AgentLogEntry } from '../lib/types' +import type { ActiveAgent, AgentLogEntry, AgentType } from '../lib/types' interface AgentCardProps { agent: ActiveAgent @@ -50,9 +50,28 @@ function getStateColor(state: ActiveAgent['state']): string { } } +// Get agent type badge config +function getAgentTypeBadge(agentType: AgentType): { label: string; className: string; icon: typeof Code } { + if (agentType === 'testing') { + return { + label: 'TEST', + className: 'bg-purple-100 text-purple-700 border-purple-300', + icon: FlaskConical, + } + } + // Default to coding + return { + label: 'CODE', + className: 'bg-blue-100 text-blue-700 border-blue-300', + icon: Code, + } +} + export function AgentCard({ agent, onShowLogs }: AgentCardProps) { const isActive = ['thinking', 'working', 'testing'].includes(agent.state) const hasLogs = agent.logs && agent.logs.length > 0 + const typeBadge = getAgentTypeBadge(agent.agentType || 'coding') + const TypeIcon = typeBadge.icon return (
+ {/* Agent type badge */} +
+ + + {typeBadge.label} + +
+ {/* Header with avatar and name */}
@@ -122,6 +155,8 @@ interface AgentLogModalProps { export function AgentLogModal({ agent, logs, onClose }: AgentLogModalProps) { const [copied, setCopied] = useState(false) + const typeBadge = getAgentTypeBadge(agent.agentType || 'coding') + const TypeIcon = typeBadge.icon const handleCopy = async () => { const logText = logs @@ -159,9 +194,21 @@ export function AgentLogModal({ agent, logs, onClose }: AgentLogModalProps) {
-

- {agent.agentName} Logs -

+
+

+ {agent.agentName} Logs +

+ + + {typeBadge.label} + +

Feature #{agent.featureId}: {agent.featureName}

diff --git a/ui/src/components/AgentControl.tsx b/ui/src/components/AgentControl.tsx index e3d0a92..616e709 100644 --- a/ui/src/components/AgentControl.tsx +++ b/ui/src/components/AgentControl.tsx @@ -24,21 +24,24 @@ export function AgentControl({ projectName, status }: AgentControlProps) { const isLoading = startAgent.isPending || stopAgent.isPending const isRunning = status === 'running' || status === 'paused' + const isLoadingStatus = status === 'loading' // Status unknown, waiting for WebSocket const isParallel = concurrency > 1 const handleStart = () => startAgent.mutate({ yoloMode, parallelMode: isParallel, - maxConcurrency: isParallel ? concurrency : undefined, + maxConcurrency: concurrency, // Always pass concurrency (1-5) + testingAgentRatio: settings?.testing_agent_ratio, + countTestingInConcurrency: settings?.count_testing_in_concurrency, }) const handleStop = () => stopAgent.mutate() - // Simplified: either show Start (when stopped/crashed) or Stop (when running/paused) + // Simplified: either show Start (when stopped/crashed), Stop (when running/paused), or loading spinner const isStopped = status === 'stopped' || status === 'crashed' return (
- {/* Concurrency slider - always visible when stopped */} + {/* Concurrency slider - visible when stopped (not during loading or running) */} {isStopped && (
@@ -67,7 +70,16 @@ export function AgentControl({ projectName, status }: AgentControlProps) {
)} - {isStopped ? ( + {isLoadingStatus ? ( + + ) : isStopped ? (
+ {/* Testing Agent Ratio */} +
+ +

+ Regression testing agents spawned per coding agent (0 = disabled) +

+
+ {[0, 1, 2, 3].map((ratio) => ( + + ))} +
+
+ + {/* Count Testing in Concurrency Toggle */} +
+
+
+ +

+ If enabled, testing agents count toward the concurrency limit +

+
+ +
+
+ {/* Update Error */} {updateSettings.isError && (
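Both of these settings feed the capacity check the orchestrator performs before spawning a testing agent (see `_spawn_testing_agents` earlier in this patch). A minimal sketch of that accounting, with illustrative names and the total-agent cap passed in rather than read from the real `MAX_TOTAL_AGENTS` constant:

```python
def may_spawn_testing_agent(
    running_coding: int,
    running_testing: int,
    max_concurrency: int,
    count_testing_in_concurrency: bool,
    max_total_agents: int,
) -> bool:
    """Illustrative mirror of the orchestrator's pre-spawn capacity check."""
    total = running_coding + running_testing
    if total >= max_total_agents:
        # Hard cap across coding and testing agents combined.
        return False
    if count_testing_in_concurrency and total >= max_concurrency:
        # With the toggle on, testing agents consume coding-agent slots.
        return False
    return True
```

The ratio control above determines how many times this check is attempted after each successful coding agent (`for _ in range(testing_agent_ratio)`), so a ratio of 0 disables testing agents entirely.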
diff --git a/ui/src/components/SpecCreationChat.tsx b/ui/src/components/SpecCreationChat.tsx index 6fcf2e8..9a12cc6 100644 --- a/ui/src/components/SpecCreationChat.tsx +++ b/ui/src/components/SpecCreationChat.tsx @@ -6,7 +6,7 @@ */ import { useCallback, useEffect, useRef, useState } from 'react' -import { Send, X, CheckCircle2, AlertCircle, Wifi, WifiOff, RotateCcw, Loader2, ArrowRight, Zap, Paperclip, ExternalLink } from 'lucide-react' +import { Send, X, CheckCircle2, AlertCircle, Wifi, WifiOff, RotateCcw, Loader2, ArrowRight, Zap, Paperclip, ExternalLink, FileText } from 'lucide-react' import { useSpecChat } from '../hooks/useSpecChat' import { ChatMessage } from './ChatMessage' import { QuestionOptions } from './QuestionOptions' @@ -17,6 +17,24 @@ import type { ImageAttachment } from '../lib/types' const MAX_FILE_SIZE = 5 * 1024 * 1024 // 5 MB const ALLOWED_TYPES = ['image/jpeg', 'image/png'] +// Sample prompt for quick testing +const SAMPLE_PROMPT = `Let's call it Simple Todo. This is a really simple web app that I can use to track my to-do items using a Kanban board. I should be able to add to-dos and then drag and drop them through the Kanban board. The different columns in the Kanban board are: + +- To Do +- In Progress +- Done + +The app should use a neobrutalism design. + +There is no need for user authentication either. All the to-dos will be stored in local storage, so each user has access to all of their to-dos when they open their browser. So do not worry about implementing a backend with user authentication or a database. Simply store everything in local storage. As for the design, please try to avoid AI slop, so use your front-end design skills to design something beautiful and practical. As for the content of the to-dos, we should store: + +- The name or the title at the very least +- Optionally, we can also set tags, due dates, and priorities which should be represented as beautiful little badges on the to-do card + +Users should have the ability to easily clear out all the completed To-Dos. They should also be able to filter and search for To-Dos as well. + +You choose the rest. Keep it simple. Should be 25 features.` + type InitializerStatus = 'idle' | 'starting' | 'error' interface SpecCreationChatProps { @@ -223,6 +241,23 @@ export function SpecCreationChat({ )} + {/* Load Sample Prompt */} + + {/* Exit to Project - always visible escape hatch */}