feat: add concurrent agents with dependency system and delightful UI

Major feature implementation for parallel agent execution with dependency-aware
scheduling and an engaging multi-agent UI experience.

Backend Changes:
- Add parallel_orchestrator.py for concurrent feature processing
- Add api/dependency_resolver.py with cycle detection (Kahn's algorithm + DFS; see the sketch after this list)
- Add atomic feature_claim_next() with retry limit and exponential backoff
- Fix circular dependency check arguments in 4 locations
- Add AgentTracker class for parsing agent output and emitting updates
- Add browser isolation with --isolated flag for Playwright MCP
- Extend WebSocket protocol with agent_update messages and log attribution
- Add WSAgentUpdateMessage schema with agent states and mascot names
- Fix WSProgressMessage to include in_progress field
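
The dependency resolver itself is not part of this file's diff; as referenced above, its cycle check uses Kahn's algorithm. A minimal, illustrative Python sketch of such a check (the function name and the shape of `deps` are assumptions, not the actual api/dependency_resolver.py API):

    from collections import deque

    def has_cycle(deps: dict[int, list[int]]) -> bool:
        """Return True if the feature dependency graph contains a cycle.

        deps maps a feature id to the ids it depends on (illustrative shape only).
        Uses Kahn's algorithm: repeatedly remove features with no unmet
        dependencies; anything left over sits on a cycle.
        """
        # Collect every node that appears anywhere in the graph.
        nodes = set(deps) | {d for reqs in deps.values() for d in reqs}
        # in-degree = number of unresolved dependencies for each feature
        in_degree = {n: 0 for n in nodes}
        dependents: dict[int, list[int]] = {n: [] for n in nodes}
        for feature, requires in deps.items():
            for dep in requires:
                in_degree[feature] += 1
                dependents[dep].append(feature)

        # Peel off features whose dependencies are all satisfied.
        queue = deque(n for n in nodes if in_degree[n] == 0)
        processed = 0
        while queue:
            node = queue.popleft()
            processed += 1
            for dependent in dependents[node]:
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    queue.append(dependent)

        # If some features were never processed, they are part of a cycle.
        return processed != len(nodes)

The commit mentions both Kahn's algorithm and a DFS check; only the general approach is illustrated here.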

New UI Components:
- AgentMissionControl: Dashboard showing active agents with collapsible activity
- AgentCard: Individual agent status with avatar and thought bubble
- AgentAvatar: SVG mascots (Spark, Fizz, Octo, Hoot, Buzz) with animations
- ActivityFeed: Recent activity stream with stable keys (no flickering)
- CelebrationOverlay: Confetti animation with click/Escape dismiss
- DependencyGraph: Interactive node graph visualization with dagre layout
- DependencyBadge: Visual indicator for feature dependencies
- ViewToggle: Switch between Kanban and Graph views
- KeyboardShortcutsHelp: Help overlay accessible via ? key

UI/UX Improvements:
- Celebration queue system to handle rapid success messages
- Accessibility attributes on AgentAvatar (role, aria-label, aria-live)
- Collapsible Recent Activity section with persisted preference
- Agent count display in header
- Keyboard shortcut G to toggle Kanban/Graph view
- Real-time thought bubbles and state animations

Bug Fixes:
- Fix circular dependency validation (swapped source/target arguments)
- Add MAX_CLAIM_RETRIES=10 to cap claim retries and prevent stack overflow under contention (see the sketch after this list)
- Fix THOUGHT_PATTERNS to match actual [Tool: name] format
- Fix ActivityFeed key prop to prevent re-renders on new items
- Add featureId/agentIndex to log messages for proper attribution
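
The atomic claim path itself is not included in this diff; below is a minimal sketch of the bounded, backoff-based retry described above. `claim_with_backoff` and `try_claim` are hypothetical names standing in for the real feature_claim_next() query:

    import random
    import time
    from typing import Callable, Optional

    MAX_CLAIM_RETRIES = 10  # matches the retry cap described above

    def claim_with_backoff(
        try_claim: Callable[[], Optional[int]],
        base_delay: float = 0.05,
    ) -> Optional[int]:
        """Retry an atomic claim with exponential backoff instead of recursion.

        try_claim is a placeholder for the real atomic claim; it returns a
        feature id on success or None when another agent won the race.
        """
        for attempt in range(MAX_CLAIM_RETRIES):
            feature_id = try_claim()
            if feature_id is not None:
                return feature_id
            # Exponential backoff with jitter, capped so late retries stay short.
            delay = min(base_delay * (2 ** attempt), 5.0)
            time.sleep(delay + random.uniform(0, base_delay))
        return None  # bounded retries: give up rather than loop or recurse forever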

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Contained in: Auto
Date: 2026-01-17 12:59:42 +02:00
parent 91cc00a9d0
commit 85f6940a54
39 changed files with 4532 additions and 157 deletions

parallel_orchestrator.py (new file, 504 lines)

@@ -0,0 +1,504 @@
"""
Parallel Orchestrator
=====================
Coordinates parallel execution of independent features using multiple agent processes.
Uses dependency-aware scheduling to ensure features are only started when their
dependencies are satisfied.
Usage:
python parallel_orchestrator.py --project-dir my-app --max-concurrency 3
"""
import asyncio
import os
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import Callable, Awaitable
from api.database import Feature, create_database
from api.dependency_resolver import are_dependencies_satisfied
# Root directory of autocoder (where this script and autonomous_agent_demo.py live)
AUTOCODER_ROOT = Path(__file__).parent.resolve()
# Performance: Limit parallel agents to prevent memory exhaustion
MAX_PARALLEL_AGENTS = 5
DEFAULT_CONCURRENCY = 3
POLL_INTERVAL = 5 # seconds between checking for ready features
MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature
class ParallelOrchestrator:
    """Orchestrates parallel execution of independent features."""

    def __init__(
        self,
        project_dir: Path,
        max_concurrency: int = DEFAULT_CONCURRENCY,
        model: str = None,
        yolo_mode: bool = False,
        on_output: Callable[[int, str], None] = None,
        on_status: Callable[[int, str], None] = None,
    ):
        """Initialize the orchestrator.

        Args:
            project_dir: Path to the project directory
            max_concurrency: Maximum number of concurrent agents (1-5)
            model: Claude model to use (or None for default)
            yolo_mode: Whether to run in YOLO mode (skip browser testing)
            on_output: Callback for agent output (feature_id, line)
            on_status: Callback for agent status changes (feature_id, status)
        """
        self.project_dir = project_dir
        self.max_concurrency = min(max(max_concurrency, 1), MAX_PARALLEL_AGENTS)
        self.model = model
        self.yolo_mode = yolo_mode
        self.on_output = on_output
        self.on_status = on_status

        # Thread-safe state
        self._lock = threading.Lock()
        self.running_agents: dict[int, subprocess.Popen] = {}
        self.abort_events: dict[int, threading.Event] = {}
        self.is_running = False

        # Track feature failures to prevent infinite retry loops
        self._failure_counts: dict[int, int] = {}

        # Database session for this orchestrator
        self._engine, self._session_maker = create_database(project_dir)

    def get_session(self):
        """Get a new database session."""
        return self._session_maker()
    def get_resumable_features(self) -> list[dict]:
        """Get features that were left in_progress from a previous session.

        These are features where in_progress=True but passes=False, and they're
        not currently being worked on by this orchestrator. This handles the case
        where a previous session was interrupted before completing the feature.
        """
        session = self.get_session()
        try:
            # Find features that are in_progress but not complete
            stale = session.query(Feature).filter(
                Feature.in_progress == True,
                Feature.passes == False
            ).all()
            resumable = []
            for f in stale:
                # Skip if already running in this orchestrator instance
                with self._lock:
                    if f.id in self.running_agents:
                        continue
                # Skip if feature has failed too many times
                if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
                    continue
                resumable.append(f.to_dict())
            # Sort by priority (highest priority first)
            resumable.sort(key=lambda f: (f["priority"], f["id"]))
            return resumable
        finally:
            session.close()
    def get_ready_features(self) -> list[dict]:
        """Get features with satisfied dependencies, not already running."""
        session = self.get_session()
        try:
            all_features = session.query(Feature).all()
            all_dicts = [f.to_dict() for f in all_features]
            ready = []
            for f in all_features:
                if f.passes or f.in_progress:
                    continue
                # Skip if already running in this orchestrator
                with self._lock:
                    if f.id in self.running_agents:
                        continue
                # Skip if feature has failed too many times
                if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
                    continue
                # Check dependencies
                if are_dependencies_satisfied(f.to_dict(), all_dicts):
                    ready.append(f.to_dict())
            # Sort by priority
            ready.sort(key=lambda f: (f["priority"], f["id"]))
            return ready
        finally:
            session.close()

    def get_all_complete(self) -> bool:
        """Check if all features are complete."""
        session = self.get_session()
        try:
            pending = session.query(Feature).filter(Feature.passes == False).count()
            return pending == 0
        finally:
            session.close()
    def start_feature(self, feature_id: int, resume: bool = False) -> tuple[bool, str]:
        """Start a single feature agent.

        Args:
            feature_id: ID of the feature to start
            resume: If True, resume a feature that's already in_progress from a previous session

        Returns:
            Tuple of (success, message)
        """
        with self._lock:
            if feature_id in self.running_agents:
                return False, "Feature already running"
            if len(self.running_agents) >= self.max_concurrency:
                return False, "At max concurrency"

        # Mark as in_progress in database (or verify it's resumable)
        session = self.get_session()
        try:
            feature = session.query(Feature).filter(Feature.id == feature_id).first()
            if not feature:
                return False, "Feature not found"
            if feature.passes:
                return False, "Feature already complete"
            if resume:
                # Resuming: feature should already be in_progress
                if not feature.in_progress:
                    return False, "Feature not in progress, cannot resume"
            else:
                # Starting fresh: feature should not be in_progress
                if feature.in_progress:
                    return False, "Feature already in progress"
                feature.in_progress = True
                session.commit()
        finally:
            session.close()

        # Create abort event
        abort_event = threading.Event()

        # Start subprocess for this feature
        cmd = [
            sys.executable,
            "-u",  # Force unbuffered stdout/stderr
            str(AUTOCODER_ROOT / "autonomous_agent_demo.py"),
            "--project-dir", str(self.project_dir),
            "--max-iterations", "1",  # Single feature mode
            "--feature-id", str(feature_id),  # Work on this specific feature only
        ]
        if self.model:
            cmd.extend(["--model", self.model])
        if self.yolo_mode:
            cmd.append("--yolo")

        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=str(AUTOCODER_ROOT),  # Run from autocoder root for proper imports
                env={**os.environ, "PYTHONUNBUFFERED": "1"},
            )
        except Exception as e:
            # Reset in_progress on failure
            session = self.get_session()
            try:
                feature = session.query(Feature).filter(Feature.id == feature_id).first()
                if feature:
                    feature.in_progress = False
                    session.commit()
            finally:
                session.close()
            return False, f"Failed to start agent: {e}"

        with self._lock:
            self.running_agents[feature_id] = proc
            self.abort_events[feature_id] = abort_event

        # Start output reader thread
        threading.Thread(
            target=self._read_output,
            args=(feature_id, proc, abort_event),
            daemon=True
        ).start()

        if self.on_status:
            self.on_status(feature_id, "running")
        print(f"Started agent for feature #{feature_id}", flush=True)
        return True, f"Started feature {feature_id}"
    def _read_output(self, feature_id: int, proc: subprocess.Popen, abort: threading.Event):
        """Read output from subprocess and emit events."""
        try:
            for line in proc.stdout:
                if abort.is_set():
                    break
                line = line.rstrip()
                if self.on_output:
                    self.on_output(feature_id, line)
                else:
                    print(f"[Feature #{feature_id}] {line}", flush=True)
            proc.wait()
        finally:
            self._on_feature_complete(feature_id, proc.returncode)

    def _on_feature_complete(self, feature_id: int, return_code: int):
        """Handle feature completion.

        ALWAYS clears in_progress when agent exits, regardless of success/failure.
        This prevents features from getting stuck if an agent crashes or is killed.
        The agent marks features as passing BEFORE clearing in_progress, so this
        is safe - we won't accidentally clear a feature that's being worked on.
        """
        with self._lock:
            self.running_agents.pop(feature_id, None)
            self.abort_events.pop(feature_id, None)

        # ALWAYS clear in_progress when agent exits to prevent stuck features
        # The agent marks features as passing before clearing in_progress,
        # so if in_progress is still True here, the feature didn't complete successfully
        session = self.get_session()
        try:
            feature = session.query(Feature).filter(Feature.id == feature_id).first()
            if feature and feature.in_progress and not feature.passes:
                feature.in_progress = False
                session.commit()
        finally:
            session.close()

        # Track failures to prevent infinite retry loops
        if return_code != 0:
            with self._lock:
                self._failure_counts[feature_id] = self._failure_counts.get(feature_id, 0) + 1
                failure_count = self._failure_counts[feature_id]
            if failure_count >= MAX_FEATURE_RETRIES:
                print(f"Feature #{feature_id} has failed {failure_count} times, will not retry", flush=True)

        status = "completed" if return_code == 0 else "failed"
        if self.on_status:
            self.on_status(feature_id, status)
        print(f"Feature #{feature_id} {status}", flush=True)
    def stop_feature(self, feature_id: int) -> tuple[bool, str]:
        """Stop a running feature agent."""
        with self._lock:
            if feature_id not in self.running_agents:
                return False, "Feature not running"
            abort = self.abort_events.get(feature_id)
            proc = self.running_agents.get(feature_id)

        if abort:
            abort.set()
        if proc:
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()
        return True, f"Stopped feature {feature_id}"

    def stop_all(self) -> None:
        """Stop all running feature agents."""
        self.is_running = False
        with self._lock:
            feature_ids = list(self.running_agents.keys())
        for fid in feature_ids:
            self.stop_feature(fid)
    async def run_loop(self):
        """Main orchestration loop."""
        self.is_running = True
        print(f"Starting parallel orchestrator with max_concurrency={self.max_concurrency}", flush=True)
        print(f"Project: {self.project_dir}", flush=True)
        print(flush=True)

        # Check for features to resume from previous session
        resumable = self.get_resumable_features()
        if resumable:
            print(f"Found {len(resumable)} feature(s) to resume from previous session:", flush=True)
            for f in resumable:
                print(f"  - Feature #{f['id']}: {f['name']}", flush=True)
            print(flush=True)

        while self.is_running:
            try:
                # Check if all complete
                if self.get_all_complete():
                    print("\nAll features complete!", flush=True)
                    break

                # Check capacity
                with self._lock:
                    current = len(self.running_agents)
                if current >= self.max_concurrency:
                    await asyncio.sleep(POLL_INTERVAL)
                    continue

                # Priority 1: Resume features from previous session
                resumable = self.get_resumable_features()
                if resumable:
                    slots = self.max_concurrency - current
                    for feature in resumable[:slots]:
                        print(f"Resuming feature #{feature['id']}: {feature['name']}", flush=True)
                        self.start_feature(feature["id"], resume=True)
                        await asyncio.sleep(2)
                    continue

                # Priority 2: Start new ready features
                ready = self.get_ready_features()
                if not ready:
                    # Wait for running features to complete
                    if current > 0:
                        await asyncio.sleep(POLL_INTERVAL)
                        continue
                    else:
                        # No ready features and nothing running - might be blocked
                        print("No ready features available. All remaining features may be blocked by dependencies.", flush=True)
                        await asyncio.sleep(POLL_INTERVAL * 2)
                        continue

                # Start features up to capacity
                slots = self.max_concurrency - current
                for feature in ready[:slots]:
                    print(f"Starting feature #{feature['id']}: {feature['name']}", flush=True)
                    self.start_feature(feature["id"])
                    await asyncio.sleep(2)  # Brief pause between starts

            except Exception as e:
                print(f"Orchestrator error: {e}", flush=True)
                await asyncio.sleep(POLL_INTERVAL)

        # Wait for remaining agents to complete
        print("Waiting for running agents to complete...", flush=True)
        while True:
            with self._lock:
                if not self.running_agents:
                    break
            await asyncio.sleep(1)
        print("Orchestrator finished.", flush=True)

    def get_status(self) -> dict:
        """Get current orchestrator status."""
        with self._lock:
            return {
                "running_features": list(self.running_agents.keys()),
                "count": len(self.running_agents),
                "max_concurrency": self.max_concurrency,
                "is_running": self.is_running,
            }
async def run_parallel_orchestrator(
    project_dir: Path,
    max_concurrency: int = DEFAULT_CONCURRENCY,
    model: str = None,
    yolo_mode: bool = False,
) -> None:
    """Run the parallel orchestrator.

    Args:
        project_dir: Path to the project directory
        max_concurrency: Maximum number of concurrent agents
        model: Claude model to use
        yolo_mode: Whether to run in YOLO mode
    """
    orchestrator = ParallelOrchestrator(
        project_dir=project_dir,
        max_concurrency=max_concurrency,
        model=model,
        yolo_mode=yolo_mode,
    )
    try:
        await orchestrator.run_loop()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user. Stopping agents...", flush=True)
        orchestrator.stop_all()
def main():
    """Main entry point for parallel orchestration."""
    import argparse
    from dotenv import load_dotenv
    from registry import DEFAULT_MODEL, get_project_path

    load_dotenv()

    parser = argparse.ArgumentParser(
        description="Parallel Feature Orchestrator - Run multiple agent instances",
    )
    parser.add_argument(
        "--project-dir",
        type=str,
        required=True,
        help="Project directory path (absolute) or registered project name",
    )
    parser.add_argument(
        "--max-concurrency",
        "-p",
        type=int,
        default=DEFAULT_CONCURRENCY,
        help=f"Maximum concurrent agents (1-{MAX_PARALLEL_AGENTS}, default: {DEFAULT_CONCURRENCY})",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Claude model to use (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--yolo",
        action="store_true",
        default=False,
        help="Enable YOLO mode: rapid prototyping without browser testing",
    )
    args = parser.parse_args()

    # Resolve project directory
    project_dir_input = args.project_dir
    project_dir = Path(project_dir_input)
    if project_dir.is_absolute():
        if not project_dir.exists():
            print(f"Error: Project directory does not exist: {project_dir}", flush=True)
            sys.exit(1)
    else:
        registered_path = get_project_path(project_dir_input)
        if registered_path:
            project_dir = registered_path
        else:
            print(f"Error: Project '{project_dir_input}' not found in registry", flush=True)
            sys.exit(1)

    try:
        asyncio.run(run_parallel_orchestrator(
            project_dir=project_dir,
            max_concurrency=args.max_concurrency,
            model=args.model,
            yolo_mode=args.yolo,
        ))
    except KeyboardInterrupt:
        print("\n\nInterrupted by user", flush=True)


if __name__ == "__main__":
    main()
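
Beyond the CLI usage shown in the module docstring, a minimal programmatic usage sketch (not part of the committed file; the project path is illustrative, and the module is assumed to be importable as parallel_orchestrator):

    import asyncio
    from pathlib import Path

    from parallel_orchestrator import ParallelOrchestrator

    def log_line(feature_id: int, line: str) -> None:
        # Receives each raw output line from the per-feature agent subprocess.
        print(f"[#{feature_id}] {line}")

    def log_status(feature_id: int, status: str) -> None:
        # Receives "running", "completed", or "failed" status transitions.
        print(f"[#{feature_id}] -> {status}")

    orchestrator = ParallelOrchestrator(
        project_dir=Path("/abs/path/to/my-app"),  # illustrative path
        max_concurrency=3,
        on_output=log_line,
        on_status=log_status,
    )
    asyncio.run(orchestrator.run_loop())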