Mirror of https://github.com/leonvanzyl/autocoder.git (synced 2026-01-30 06:12:06 +00:00)
feat: add concurrent agents with dependency system and delightful UI
Major feature implementation for parallel agent execution with dependency-aware scheduling and an engaging multi-agent UI experience.

Backend Changes:
- Add parallel_orchestrator.py for concurrent feature processing
- Add api/dependency_resolver.py with cycle detection (Kahn's algorithm + DFS; sketched below)
- Add atomic feature_claim_next() with retry limit and exponential backoff
- Fix circular dependency check arguments in 4 locations
- Add AgentTracker class for parsing agent output and emitting updates
- Add browser isolation with --isolated flag for Playwright MCP
- Extend WebSocket protocol with agent_update messages and log attribution
- Add WSAgentUpdateMessage schema with agent states and mascot names
- Fix WSProgressMessage to include in_progress field

New UI Components:
- AgentMissionControl: Dashboard showing active agents with collapsible activity
- AgentCard: Individual agent status with avatar and thought bubble
- AgentAvatar: SVG mascots (Spark, Fizz, Octo, Hoot, Buzz) with animations
- ActivityFeed: Recent activity stream with stable keys (no flickering)
- CelebrationOverlay: Confetti animation with click/Escape dismiss
- DependencyGraph: Interactive node graph visualization with dagre layout
- DependencyBadge: Visual indicator for feature dependencies
- ViewToggle: Switch between Kanban and Graph views
- KeyboardShortcutsHelp: Help overlay accessible via ? key

UI/UX Improvements:
- Celebration queue system to handle rapid success messages
- Accessibility attributes on AgentAvatar (role, aria-label, aria-live)
- Collapsible Recent Activity section with persisted preference
- Agent count display in header
- Keyboard shortcut G to toggle Kanban/Graph view
- Real-time thought bubbles and state animations

Bug Fixes:
- Fix circular dependency validation (swapped source/target arguments)
- Add MAX_CLAIM_RETRIES=10 to prevent stack overflow under contention
- Fix THOUGHT_PATTERNS to match actual [Tool: name] format
- Fix ActivityFeed key prop to prevent re-renders on new items
- Add featureId/agentIndex to log messages for proper attribution

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
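The dependency resolver named above (api/dependency_resolver.py) is not part of the file shown in this diff. As an illustration only, the sketch below shows how a Kahn's-algorithm cycle check and the are_dependencies_satisfied() readiness test imported by parallel_orchestrator.py could be structured; the "dependencies" field name is hypothetical, while "id" and "passes" match the feature dicts used in the orchestrator.

# Illustrative sketch only - the real api/dependency_resolver.py is not shown in this diff.
# Assumed feature shape: {"id": int, "passes": bool, "dependencies": [int, ...]}
# (the "dependencies" key is a hypothetical name for this sketch).
from collections import deque


def has_cycle(features: list[dict]) -> bool:
    """Kahn's algorithm: if a topological sort cannot consume every node,
    the leftover nodes form at least one cycle."""
    ids = {f["id"] for f in features}
    indegree = {fid: 0 for fid in ids}
    dependents: dict[int, list[int]] = {fid: [] for fid in ids}
    for f in features:
        for dep in f.get("dependencies", []):
            if dep in ids:
                indegree[f["id"]] += 1
                dependents[dep].append(f["id"])

    queue = deque(fid for fid, deg in indegree.items() if deg == 0)
    visited = 0
    while queue:
        fid = queue.popleft()
        visited += 1
        for child in dependents[fid]:
            indegree[child] -= 1
            if indegree[child] == 0:
                queue.append(child)
    return visited != len(ids)


def are_dependencies_satisfied(feature: dict, all_features: list[dict]) -> bool:
    """A feature is ready when every dependency it names has already passed."""
    passed = {f["id"] for f in all_features if f.get("passes")}
    return all(dep in passed for dep in feature.get("dependencies", []))

If has_cycle() reports a cycle, no topological order exists and the features involved can never all become ready, which is presumably why the resolver includes cycle detection in the first place.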
parallel_orchestrator.py (new file, 504 additions)
@@ -0,0 +1,504 @@
"""
Parallel Orchestrator
=====================

Coordinates parallel execution of independent features using multiple agent processes.
Uses dependency-aware scheduling to ensure features are only started when their
dependencies are satisfied.

Usage:
    python parallel_orchestrator.py --project-dir my-app --max-concurrency 3
"""

import asyncio
import os
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import Callable, Awaitable

from api.database import Feature, create_database
from api.dependency_resolver import are_dependencies_satisfied

# Root directory of autocoder (where this script and autonomous_agent_demo.py live)
AUTOCODER_ROOT = Path(__file__).parent.resolve()

# Performance: Limit parallel agents to prevent memory exhaustion
MAX_PARALLEL_AGENTS = 5
DEFAULT_CONCURRENCY = 3
POLL_INTERVAL = 5  # seconds between checking for ready features
MAX_FEATURE_RETRIES = 3  # Maximum times to retry a failed feature


class ParallelOrchestrator:
    """Orchestrates parallel execution of independent features."""

    def __init__(
        self,
        project_dir: Path,
        max_concurrency: int = DEFAULT_CONCURRENCY,
        model: str = None,
        yolo_mode: bool = False,
        on_output: Callable[[int, str], None] = None,
        on_status: Callable[[int, str], None] = None,
    ):
        """Initialize the orchestrator.

        Args:
            project_dir: Path to the project directory
            max_concurrency: Maximum number of concurrent agents (1-5)
            model: Claude model to use (or None for default)
            yolo_mode: Whether to run in YOLO mode (skip browser testing)
            on_output: Callback for agent output (feature_id, line)
            on_status: Callback for agent status changes (feature_id, status)
        """
        self.project_dir = project_dir
        self.max_concurrency = min(max(max_concurrency, 1), MAX_PARALLEL_AGENTS)
        self.model = model
        self.yolo_mode = yolo_mode
        self.on_output = on_output
        self.on_status = on_status

        # Thread-safe state
        self._lock = threading.Lock()
        self.running_agents: dict[int, subprocess.Popen] = {}
        self.abort_events: dict[int, threading.Event] = {}
        self.is_running = False

        # Track feature failures to prevent infinite retry loops
        self._failure_counts: dict[int, int] = {}

        # Database session for this orchestrator
        self._engine, self._session_maker = create_database(project_dir)

    def get_session(self):
        """Get a new database session."""
        return self._session_maker()

    def get_resumable_features(self) -> list[dict]:
        """Get features that were left in_progress from a previous session.

        These are features where in_progress=True but passes=False, and they're
        not currently being worked on by this orchestrator. This handles the case
        where a previous session was interrupted before completing the feature.
        """
        session = self.get_session()
        try:
            # Find features that are in_progress but not complete
            stale = session.query(Feature).filter(
                Feature.in_progress == True,
                Feature.passes == False
            ).all()

            resumable = []
            for f in stale:
                # Skip if already running in this orchestrator instance
                with self._lock:
                    if f.id in self.running_agents:
                        continue
                # Skip if feature has failed too many times
                if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
                    continue
                resumable.append(f.to_dict())

            # Sort by priority (highest priority first)
            resumable.sort(key=lambda f: (f["priority"], f["id"]))
            return resumable
        finally:
            session.close()

    def get_ready_features(self) -> list[dict]:
        """Get features with satisfied dependencies, not already running."""
        session = self.get_session()
        try:
            all_features = session.query(Feature).all()
            all_dicts = [f.to_dict() for f in all_features]

            ready = []
            for f in all_features:
                if f.passes or f.in_progress:
                    continue
                # Skip if already running in this orchestrator
                with self._lock:
                    if f.id in self.running_agents:
                        continue
                # Skip if feature has failed too many times
                if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
                    continue
                # Check dependencies
                if are_dependencies_satisfied(f.to_dict(), all_dicts):
                    ready.append(f.to_dict())

            # Sort by priority
            ready.sort(key=lambda f: (f["priority"], f["id"]))
            return ready
        finally:
            session.close()

    def get_all_complete(self) -> bool:
        """Check if all features are complete."""
        session = self.get_session()
        try:
            pending = session.query(Feature).filter(Feature.passes == False).count()
            return pending == 0
        finally:
            session.close()

    def start_feature(self, feature_id: int, resume: bool = False) -> tuple[bool, str]:
        """Start a single feature agent.

        Args:
            feature_id: ID of the feature to start
            resume: If True, resume a feature that's already in_progress from a previous session

        Returns:
            Tuple of (success, message)
        """
        with self._lock:
            if feature_id in self.running_agents:
                return False, "Feature already running"
            if len(self.running_agents) >= self.max_concurrency:
                return False, "At max concurrency"

        # Mark as in_progress in database (or verify it's resumable)
        session = self.get_session()
        try:
            feature = session.query(Feature).filter(Feature.id == feature_id).first()
            if not feature:
                return False, "Feature not found"
            if feature.passes:
                return False, "Feature already complete"

            if resume:
                # Resuming: feature should already be in_progress
                if not feature.in_progress:
                    return False, "Feature not in progress, cannot resume"
            else:
                # Starting fresh: feature should not be in_progress
                if feature.in_progress:
                    return False, "Feature already in progress"
                feature.in_progress = True
                session.commit()
        finally:
            session.close()

        # Create abort event
        abort_event = threading.Event()

        # Start subprocess for this feature
        cmd = [
            sys.executable,
            "-u",  # Force unbuffered stdout/stderr
            str(AUTOCODER_ROOT / "autonomous_agent_demo.py"),
            "--project-dir", str(self.project_dir),
            "--max-iterations", "1",  # Single feature mode
            "--feature-id", str(feature_id),  # Work on this specific feature only
        ]
        if self.model:
            cmd.extend(["--model", self.model])
        if self.yolo_mode:
            cmd.append("--yolo")

        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=str(AUTOCODER_ROOT),  # Run from autocoder root for proper imports
                env={**os.environ, "PYTHONUNBUFFERED": "1"},
            )
        except Exception as e:
            # Reset in_progress on failure
            session = self.get_session()
            try:
                feature = session.query(Feature).filter(Feature.id == feature_id).first()
                if feature:
                    feature.in_progress = False
                    session.commit()
            finally:
                session.close()
            return False, f"Failed to start agent: {e}"

        with self._lock:
            self.running_agents[feature_id] = proc
            self.abort_events[feature_id] = abort_event

        # Start output reader thread
        threading.Thread(
            target=self._read_output,
            args=(feature_id, proc, abort_event),
            daemon=True
        ).start()

        if self.on_status:
            self.on_status(feature_id, "running")

        print(f"Started agent for feature #{feature_id}", flush=True)
        return True, f"Started feature {feature_id}"

    def _read_output(self, feature_id: int, proc: subprocess.Popen, abort: threading.Event):
        """Read output from subprocess and emit events."""
        try:
            for line in proc.stdout:
                if abort.is_set():
                    break
                line = line.rstrip()
                if self.on_output:
                    self.on_output(feature_id, line)
                else:
                    print(f"[Feature #{feature_id}] {line}", flush=True)
            proc.wait()
        finally:
            self._on_feature_complete(feature_id, proc.returncode)

    def _on_feature_complete(self, feature_id: int, return_code: int):
        """Handle feature completion.

        ALWAYS clears in_progress when agent exits, regardless of success/failure.
        This prevents features from getting stuck if an agent crashes or is killed.
        The agent marks features as passing BEFORE clearing in_progress, so this
        is safe - we won't accidentally clear a feature that's being worked on.
        """
        with self._lock:
            self.running_agents.pop(feature_id, None)
            self.abort_events.pop(feature_id, None)

        # ALWAYS clear in_progress when agent exits to prevent stuck features
        # The agent marks features as passing before clearing in_progress,
        # so if in_progress is still True here, the feature didn't complete successfully
        session = self.get_session()
        try:
            feature = session.query(Feature).filter(Feature.id == feature_id).first()
            if feature and feature.in_progress and not feature.passes:
                feature.in_progress = False
                session.commit()
        finally:
            session.close()

        # Track failures to prevent infinite retry loops
        if return_code != 0:
            with self._lock:
                self._failure_counts[feature_id] = self._failure_counts.get(feature_id, 0) + 1
                failure_count = self._failure_counts[feature_id]
            if failure_count >= MAX_FEATURE_RETRIES:
                print(f"Feature #{feature_id} has failed {failure_count} times, will not retry", flush=True)

        status = "completed" if return_code == 0 else "failed"
        if self.on_status:
            self.on_status(feature_id, status)
        print(f"Feature #{feature_id} {status}", flush=True)

    def stop_feature(self, feature_id: int) -> tuple[bool, str]:
        """Stop a running feature agent."""
        with self._lock:
            if feature_id not in self.running_agents:
                return False, "Feature not running"

            abort = self.abort_events.get(feature_id)
            proc = self.running_agents.get(feature_id)

        if abort:
            abort.set()
        if proc:
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()

        return True, f"Stopped feature {feature_id}"

    def stop_all(self) -> None:
        """Stop all running feature agents."""
        self.is_running = False
        with self._lock:
            feature_ids = list(self.running_agents.keys())

        for fid in feature_ids:
            self.stop_feature(fid)

    async def run_loop(self):
        """Main orchestration loop."""
        self.is_running = True

        print(f"Starting parallel orchestrator with max_concurrency={self.max_concurrency}", flush=True)
        print(f"Project: {self.project_dir}", flush=True)
        print(flush=True)

        # Check for features to resume from previous session
        resumable = self.get_resumable_features()
        if resumable:
            print(f"Found {len(resumable)} feature(s) to resume from previous session:", flush=True)
            for f in resumable:
                print(f"  - Feature #{f['id']}: {f['name']}", flush=True)
            print(flush=True)

        while self.is_running:
            try:
                # Check if all complete
                if self.get_all_complete():
                    print("\nAll features complete!", flush=True)
                    break

                # Check capacity
                with self._lock:
                    current = len(self.running_agents)
                if current >= self.max_concurrency:
                    await asyncio.sleep(POLL_INTERVAL)
                    continue

                # Priority 1: Resume features from previous session
                resumable = self.get_resumable_features()
                if resumable:
                    slots = self.max_concurrency - current
                    for feature in resumable[:slots]:
                        print(f"Resuming feature #{feature['id']}: {feature['name']}", flush=True)
                        self.start_feature(feature["id"], resume=True)
                        await asyncio.sleep(2)
                    continue

                # Priority 2: Start new ready features
                ready = self.get_ready_features()
                if not ready:
                    # Wait for running features to complete
                    if current > 0:
                        await asyncio.sleep(POLL_INTERVAL)
                        continue
                    else:
                        # No ready features and nothing running - might be blocked
                        print("No ready features available. All remaining features may be blocked by dependencies.", flush=True)
                        await asyncio.sleep(POLL_INTERVAL * 2)
                        continue

                # Start features up to capacity
                slots = self.max_concurrency - current
                for feature in ready[:slots]:
                    print(f"Starting feature #{feature['id']}: {feature['name']}", flush=True)
                    self.start_feature(feature["id"])

                await asyncio.sleep(2)  # Brief pause between starts

            except Exception as e:
                print(f"Orchestrator error: {e}", flush=True)
                await asyncio.sleep(POLL_INTERVAL)

        # Wait for remaining agents to complete
        print("Waiting for running agents to complete...", flush=True)
        while True:
            with self._lock:
                if not self.running_agents:
                    break
            await asyncio.sleep(1)

        print("Orchestrator finished.", flush=True)

    def get_status(self) -> dict:
        """Get current orchestrator status."""
        with self._lock:
            return {
                "running_features": list(self.running_agents.keys()),
                "count": len(self.running_agents),
                "max_concurrency": self.max_concurrency,
                "is_running": self.is_running,
            }


async def run_parallel_orchestrator(
    project_dir: Path,
    max_concurrency: int = DEFAULT_CONCURRENCY,
    model: str = None,
    yolo_mode: bool = False,
) -> None:
    """Run the parallel orchestrator.

    Args:
        project_dir: Path to the project directory
        max_concurrency: Maximum number of concurrent agents
        model: Claude model to use
        yolo_mode: Whether to run in YOLO mode
    """
    orchestrator = ParallelOrchestrator(
        project_dir=project_dir,
        max_concurrency=max_concurrency,
        model=model,
        yolo_mode=yolo_mode,
    )

    try:
        await orchestrator.run_loop()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user. Stopping agents...", flush=True)
        orchestrator.stop_all()


def main():
    """Main entry point for parallel orchestration."""
    import argparse
    from dotenv import load_dotenv
    from registry import DEFAULT_MODEL, get_project_path

    load_dotenv()

    parser = argparse.ArgumentParser(
        description="Parallel Feature Orchestrator - Run multiple agent instances",
    )
    parser.add_argument(
        "--project-dir",
        type=str,
        required=True,
        help="Project directory path (absolute) or registered project name",
    )
    parser.add_argument(
        "--max-concurrency",
        "-p",
        type=int,
        default=DEFAULT_CONCURRENCY,
        help=f"Maximum concurrent agents (1-{MAX_PARALLEL_AGENTS}, default: {DEFAULT_CONCURRENCY})",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Claude model to use (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--yolo",
        action="store_true",
        default=False,
        help="Enable YOLO mode: rapid prototyping without browser testing",
    )

    args = parser.parse_args()

    # Resolve project directory
    project_dir_input = args.project_dir
    project_dir = Path(project_dir_input)

    if project_dir.is_absolute():
        if not project_dir.exists():
            print(f"Error: Project directory does not exist: {project_dir}", flush=True)
            sys.exit(1)
    else:
        registered_path = get_project_path(project_dir_input)
        if registered_path:
            project_dir = registered_path
        else:
            print(f"Error: Project '{project_dir_input}' not found in registry", flush=True)
            sys.exit(1)

    try:
        asyncio.run(run_parallel_orchestrator(
            project_dir=project_dir,
            max_concurrency=args.max_concurrency,
            model=args.model,
            yolo_mode=args.yolo,
        ))
    except KeyboardInterrupt:
        print("\n\nInterrupted by user", flush=True)


if __name__ == "__main__":
    main()
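Besides the CLI invocation shown in the module docstring, the orchestrator can also be driven programmatically. A minimal sketch, assuming this file is importable as parallel_orchestrator and the target project directory already contains an initialized feature database; the project path is hypothetical:

import asyncio
from pathlib import Path

from parallel_orchestrator import run_parallel_orchestrator

# Run the dependency-aware scheduling loop with up to three concurrent agents.
asyncio.run(run_parallel_orchestrator(
    project_dir=Path("/abs/path/to/my-app"),
    max_concurrency=3,
    yolo_mode=False,
))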