Mirror of https://github.com/leonvanzyl/autocoder.git (synced 2026-01-30 22:32:06 +00:00)
feat: decouple regression testing agents from coding agents
Major refactoring of the parallel orchestrator to run regression testing agents independently from coding agents. This improves system reliability and provides better control over testing behavior.

Key changes:

Database & MCP Layer:
- Add testing_in_progress and last_tested_at columns to Feature model
- Add feature_claim_for_testing() for atomic test claim with retry
- Add feature_release_testing() to release claims after testing
- Refactor claim functions to iterative loops (no recursion)
- Add OperationalError retry handling for transient DB errors
- Reduce MAX_CLAIM_RETRIES from 10 to 5

Orchestrator:
- Decouple testing agent lifecycle from coding agents
- Add _maintain_testing_agents() for continuous testing maintenance
- Fix TOCTOU race in _spawn_testing_agent() - hold lock during spawn
- Add _cleanup_stale_testing_locks() with 30-min timeout
- Fix log ordering - start_session() before stale flag cleanup
- Add stale testing_in_progress cleanup on startup

Dead Code Removal:
- Remove count_testing_in_concurrency from entire stack (12+ files)
- Remove ineffective with_for_update() from features router

API & UI:
- Pass testing_agent_ratio via CLI to orchestrator
- Update testing prompt template to use new claim/release tools
- Rename UI label to "Regression Agents" with clearer description
- Add process_utils.py for cross-platform process tree management

Testing agents now:
- Run continuously as long as passing features exist
- Can re-test features multiple times to catch regressions
- Are controlled by fixed count (0-3) via testing_agent_ratio setting
- Have atomic claiming to prevent concurrent testing of same feature

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
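For reference, a minimal sketch of the atomic claim/release pattern the message describes. It assumes the Feature model and get_db_session() helper visible in the hunks below, plus the new testing_in_progress and last_tested_at columns; the boolean passing column, module paths, and function bodies are illustrative, and the repository's actual MCP tool implementations may differ.

# Sketch only: illustrates the claim/release contract described in the commit.
# Feature and get_db_session come from the project's DB layer (their module
# paths are not shown in this diff), so imports for them are omitted here.
from datetime import datetime, timezone

from sqlalchemy.exc import OperationalError

MAX_CLAIM_RETRIES = 5  # reduced from 10 per the commit message


def feature_claim_for_testing(project_dir) -> int | None:
    """Atomically claim one passing feature for regression testing.

    Iterative retry loop (no recursion); transient DB errors are retried.
    Returns the claimed feature id, or None if nothing is claimable.
    """
    for _ in range(MAX_CLAIM_RETRIES):
        try:
            with get_db_session(project_dir) as session:
                feature = (
                    session.query(Feature)
                    .filter(Feature.passing.is_(True))               # assumed column name
                    .filter(Feature.testing_in_progress.is_(False))
                    .order_by(Feature.last_tested_at.asc())          # re-test oldest first
                    .first()
                )
                if feature is None:
                    return None
                feature.testing_in_progress = True
                session.commit()
                return feature.id
        except OperationalError:
            continue  # e.g. SQLite "database is locked"; try again
    return None


def feature_release_testing(project_dir, feature_id: int) -> None:
    """Release a testing claim and record when the feature was last tested."""
    with get_db_session(project_dir) as session:
        feature = session.get(Feature, feature_id)
        if feature is not None:
            feature.testing_in_progress = False
            feature.last_tested_at = datetime.now(timezone.utc)
            session.commit()

The iterative loop avoids unbounded recursion on a contended claim, and retrying OperationalError covers SQLite's transient locking failures.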
@@ -26,11 +26,11 @@ def _get_project_path(project_name: str) -> Path:
     return get_project_path(project_name)


-def _get_settings_defaults() -> tuple[bool, str, int, bool]:
+def _get_settings_defaults() -> tuple[bool, str, int]:
     """Get defaults from global settings.

     Returns:
-        Tuple of (yolo_mode, model, testing_agent_ratio, count_testing_in_concurrency)
+        Tuple of (yolo_mode, model, testing_agent_ratio)
     """
     import sys
     root = Path(__file__).parent.parent.parent
@@ -49,9 +49,7 @@ def _get_settings_defaults() -> tuple[bool, str, int, bool]:
     except (ValueError, TypeError):
         testing_agent_ratio = 1

-    count_testing = (settings.get("count_testing_in_concurrency") or "false").lower() == "true"
-
-    return yolo_mode, model, testing_agent_ratio, count_testing
+    return yolo_mode, model, testing_agent_ratio


 router = APIRouter(prefix="/api/projects/{project_name}/agent", tags=["agent"])
@@ -101,7 +99,6 @@ async def get_agent_status(project_name: str):
         parallel_mode=manager.parallel_mode,
         max_concurrency=manager.max_concurrency,
         testing_agent_ratio=manager.testing_agent_ratio,
-        count_testing_in_concurrency=manager.count_testing_in_concurrency,
     )


@@ -114,20 +111,18 @@ async def start_agent(
     manager = get_project_manager(project_name)

     # Get defaults from global settings if not provided in request
-    default_yolo, default_model, default_testing_ratio, default_count_testing = _get_settings_defaults()
+    default_yolo, default_model, default_testing_ratio = _get_settings_defaults()

     yolo_mode = request.yolo_mode if request.yolo_mode is not None else default_yolo
     model = request.model if request.model else default_model
     max_concurrency = request.max_concurrency or 1
     testing_agent_ratio = request.testing_agent_ratio if request.testing_agent_ratio is not None else default_testing_ratio
-    count_testing = request.count_testing_in_concurrency if request.count_testing_in_concurrency is not None else default_count_testing

     success, message = await manager.start(
         yolo_mode=yolo_mode,
         model=model,
         max_concurrency=max_concurrency,
         testing_agent_ratio=testing_agent_ratio,
-        count_testing_in_concurrency=count_testing,
     )

     # Notify scheduler of manual start (to prevent auto-stop during scheduled window)
@@ -258,15 +258,16 @@ async def create_features_bulk(project_name: str, bulk: FeatureBulkCreate):

     try:
         with get_db_session(project_dir) as session:
-            # Determine starting priority with row-level lock to prevent race conditions
+            # Determine starting priority
+            # Note: SQLite uses file-level locking, not row-level locking, so we rely on
+            # SQLite's transaction isolation. Concurrent bulk creates may get overlapping
+            # priorities, but this is acceptable since priorities can be reordered.
             if bulk.starting_priority is not None:
                 current_priority = bulk.starting_priority
             else:
-                # Lock the max priority row to prevent concurrent inserts from getting same priority
                 max_priority_feature = (
                     session.query(Feature)
                     .order_by(Feature.priority.desc())
-                    .with_for_update()
                     .first()
                 )
                 current_priority = (max_priority_feature.priority + 1) if max_priority_feature else 1
@@ -79,7 +79,6 @@ async def get_settings():
         model=all_settings.get("model", DEFAULT_MODEL),
         glm_mode=_is_glm_mode(),
         testing_agent_ratio=_parse_int(all_settings.get("testing_agent_ratio"), 1),
-        count_testing_in_concurrency=_parse_bool(all_settings.get("count_testing_in_concurrency")),
     )


@@ -95,9 +94,6 @@ async def update_settings(update: SettingsUpdate):
     if update.testing_agent_ratio is not None:
         set_setting("testing_agent_ratio", str(update.testing_agent_ratio))

-    if update.count_testing_in_concurrency is not None:
-        set_setting("count_testing_in_concurrency", "true" if update.count_testing_in_concurrency else "false")
-
     # Return updated settings
     all_settings = get_all_settings()
     return SettingsResponse(
@@ -105,5 +101,4 @@ async def update_settings(update: SettingsUpdate):
         model=all_settings.get("model", DEFAULT_MODEL),
         glm_mode=_is_glm_mode(),
         testing_agent_ratio=_parse_int(all_settings.get("testing_agent_ratio"), 1),
-        count_testing_in_concurrency=_parse_bool(all_settings.get("count_testing_in_concurrency")),
     )
@@ -171,8 +171,7 @@ class AgentStartRequest(BaseModel):
     model: str | None = None  # None means use global settings
     parallel_mode: bool | None = None  # DEPRECATED: Use max_concurrency instead
     max_concurrency: int | None = None  # Max concurrent coding agents (1-5)
-    testing_agent_ratio: int | None = None  # Testing agents per coding agent (0-3)
-    count_testing_in_concurrency: bool | None = None  # Count testing toward limit
+    testing_agent_ratio: int | None = None  # Regression testing agents (0-3)

     @field_validator('model')
     @classmethod
@@ -208,8 +207,7 @@ class AgentStatus(BaseModel):
     model: str | None = None  # Model being used by running agent
     parallel_mode: bool = False  # DEPRECATED: Always True now (unified orchestrator)
     max_concurrency: int | None = None
-    testing_agent_ratio: int = 1  # Testing agents per coding agent
-    count_testing_in_concurrency: bool = False  # Count testing toward limit
+    testing_agent_ratio: int = 1  # Regression testing agents (0-3)


 class AgentActionResponse(BaseModel):
@@ -384,8 +382,7 @@ class SettingsResponse(BaseModel):
     yolo_mode: bool = False
     model: str = DEFAULT_MODEL
     glm_mode: bool = False  # True if GLM API is configured via .env
-    testing_agent_ratio: int = 1  # Testing agents per coding agent (0-3)
-    count_testing_in_concurrency: bool = False  # Count testing toward concurrency
+    testing_agent_ratio: int = 1  # Regression testing agents (0-3)


 class ModelsResponse(BaseModel):
@@ -399,7 +396,6 @@ class SettingsUpdate(BaseModel):
     yolo_mode: bool | None = None
     model: str | None = None
     testing_agent_ratio: int | None = None  # 0-3
-    count_testing_in_concurrency: bool | None = None

     @field_validator('model')
     @classmethod
@@ -24,6 +24,7 @@ from typing import Awaitable, Callable, Literal, Set
 import psutil

 from registry import list_registered_projects
+from server.utils.process_utils import kill_process_tree

 logger = logging.getLogger(__name__)

@@ -370,37 +371,16 @@ class DevServerProcessManager:
             except asyncio.CancelledError:
                 pass

-        # Use psutil to terminate the entire process tree
-        # This is important for dev servers that spawn child processes
-        try:
-            parent = psutil.Process(self.process.pid)
-            children = parent.children(recursive=True)
-
-            # Terminate children first
-            for child in children:
-                try:
-                    child.terminate()
-                except psutil.NoSuchProcess:
-                    pass
-
-            # Terminate parent
-            parent.terminate()
-
-            # Wait for graceful shutdown
-            _, still_alive = psutil.wait_procs(
-                [parent] + children, timeout=5
-            )
-
-            # Force kill any remaining processes
-            for proc in still_alive:
-                try:
-                    proc.kill()
-                except psutil.NoSuchProcess:
-                    pass
-
-        except psutil.NoSuchProcess:
-            # Process already gone
-            pass
+        # Use shared utility to terminate the entire process tree
+        # This is important for dev servers that spawn child processes (like Node.js)
+        proc = self.process  # Capture reference before async call
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(None, kill_process_tree, proc, 5.0)
+        logger.debug(
+            "Process tree kill result: status=%s, children=%d (terminated=%d, killed=%d)",
+            result.status, result.children_found,
+            result.children_terminated, result.children_killed
+        )

         self._remove_lock()
         self.status = "stopped"
@@ -23,6 +23,7 @@ import psutil
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from auth import AUTH_ERROR_HELP_SERVER as AUTH_ERROR_HELP  # noqa: E402
 from auth import is_auth_error
+from server.utils.process_utils import kill_process_tree

 logger = logging.getLogger(__name__)

@@ -83,8 +84,7 @@ class AgentProcessManager:
         self.model: str | None = None  # Model being used
         self.parallel_mode: bool = False  # Parallel execution mode
         self.max_concurrency: int | None = None  # Max concurrent agents
-        self.testing_agent_ratio: int = 1  # Testing agents per coding agent
-        self.count_testing_in_concurrency: bool = False  # Count testing toward limit
+        self.testing_agent_ratio: int = 1  # Regression testing agents (0-3)

         # Support multiple callbacks (for multiple WebSocket clients)
         self._output_callbacks: Set[Callable[[str], Awaitable[None]]] = set()
@@ -296,7 +296,6 @@ class AgentProcessManager:
         parallel_mode: bool = False,
         max_concurrency: int | None = None,
         testing_agent_ratio: int = 1,
-        count_testing_in_concurrency: bool = False,
     ) -> tuple[bool, str]:
         """
         Start the agent as a subprocess.
@@ -306,8 +305,7 @@
             model: Model to use (e.g., claude-opus-4-5-20251101)
             parallel_mode: DEPRECATED - ignored, always uses unified orchestrator
             max_concurrency: Max concurrent coding agents (1-5, default 1)
-            testing_agent_ratio: Testing agents per coding agent (0-3, default 1)
-            count_testing_in_concurrency: If True, testing agents count toward limit
+            testing_agent_ratio: Number of regression testing agents (0-3, default 1)

         Returns:
             Tuple of (success, message)
@@ -324,7 +322,6 @@
         self.parallel_mode = True  # Always True now (unified orchestrator)
         self.max_concurrency = max_concurrency or 1
         self.testing_agent_ratio = testing_agent_ratio
-        self.count_testing_in_concurrency = count_testing_in_concurrency

         # Build command - unified orchestrator with --concurrency
         cmd = [
@@ -348,8 +345,6 @@

         # Add testing agent configuration
         cmd.extend(["--testing-ratio", str(testing_agent_ratio)])
-        if count_testing_in_concurrency:
-            cmd.append("--count-testing")

         try:
             # Start subprocess with piped stdout/stderr
@@ -387,7 +382,9 @@

     async def stop(self) -> tuple[bool, str]:
         """
-        Stop the agent (SIGTERM then SIGKILL if needed).
+        Stop the agent and all its child processes (SIGTERM then SIGKILL if needed).
+
+        CRITICAL: Kills entire process tree to prevent orphaned coding/testing agents.

         Returns:
             Tuple of (success, message)
@@ -404,20 +401,16 @@
                 except asyncio.CancelledError:
                     pass

-            # Terminate gracefully first
-            self.process.terminate()
-
-            # Wait up to 5 seconds for graceful shutdown
+            # CRITICAL: Kill entire process tree, not just orchestrator
+            # This ensures all spawned coding/testing agents are also terminated
+            proc = self.process  # Capture reference before async call
             loop = asyncio.get_running_loop()
-            try:
-                await asyncio.wait_for(
-                    loop.run_in_executor(None, self.process.wait),
-                    timeout=5.0
-                )
-            except asyncio.TimeoutError:
-                # Force kill if still running
-                self.process.kill()
-                await loop.run_in_executor(None, self.process.wait)
+            result = await loop.run_in_executor(None, kill_process_tree, proc, 10.0)
+            logger.debug(
+                "Process tree kill result: status=%s, children=%d (terminated=%d, killed=%d)",
+                result.status, result.children_found,
+                result.children_terminated, result.children_killed
+            )

             self._remove_lock()
             self.status = "stopped"
@@ -428,7 +421,6 @@
             self.parallel_mode = False  # Reset parallel mode
             self.max_concurrency = None  # Reset concurrency
             self.testing_agent_ratio = 1  # Reset testing ratio
-            self.count_testing_in_concurrency = False  # Reset count testing

             return True, "Agent stopped"
         except Exception as e:
@@ -514,7 +506,6 @@
             "parallel_mode": self.parallel_mode,
             "max_concurrency": self.max_concurrency,
             "testing_agent_ratio": self.testing_agent_ratio,
-            "count_testing_in_concurrency": self.count_testing_in_concurrency,
         }


server/utils/process_utils.py (new file, +134 lines)
@@ -0,0 +1,134 @@
+"""
+Process Utilities
+=================
+
+Shared utilities for process management across the codebase.
+"""
+
+import logging
+import subprocess
+from dataclasses import dataclass
+from typing import Literal
+
+import psutil
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class KillResult:
+    """Result of a process tree kill operation.
+
+    Attributes:
+        status: "success" if all processes terminated, "partial" if some required
+            force-kill, "failure" if parent couldn't be killed
+        parent_pid: PID of the parent process
+        children_found: Number of child processes found
+        children_terminated: Number of children that terminated gracefully
+        children_killed: Number of children that required SIGKILL
+        parent_forcekilled: Whether the parent required SIGKILL
+    """
+
+    status: Literal["success", "partial", "failure"]
+    parent_pid: int
+    children_found: int = 0
+    children_terminated: int = 0
+    children_killed: int = 0
+    parent_forcekilled: bool = False
+
+
+def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResult:
+    """Kill a process and all its child processes.
+
+    On Windows, subprocess.terminate() only kills the immediate process, leaving
+    orphaned child processes (e.g., spawned browser instances, coding/testing agents).
+    This function uses psutil to kill the entire process tree.
+
+    Args:
+        proc: The subprocess.Popen object to kill
+        timeout: Seconds to wait for graceful termination before force-killing
+
+    Returns:
+        KillResult with status and statistics about the termination
+    """
+    result = KillResult(status="success", parent_pid=proc.pid)
+
+    try:
+        parent = psutil.Process(proc.pid)
+        # Get all children recursively before terminating
+        children = parent.children(recursive=True)
+        result.children_found = len(children)
+
+        logger.debug(
+            "Killing process tree: PID %d with %d children",
+            proc.pid, len(children)
+        )
+
+        # Terminate children first (graceful)
+        for child in children:
+            try:
+                logger.debug("Terminating child PID %d (%s)", child.pid, child.name())
+                child.terminate()
+            except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
+                # NoSuchProcess: already dead
+                # AccessDenied: Windows can raise this for system processes or already-exited processes
+                logger.debug("Child PID %d already gone or inaccessible: %s", child.pid, e)
+
+        # Wait for children to terminate
+        gone, still_alive = psutil.wait_procs(children, timeout=timeout)
+        result.children_terminated = len(gone)
+
+        logger.debug(
+            "Children after graceful wait: %d terminated, %d still alive",
+            len(gone), len(still_alive)
+        )
+
+        # Force kill any remaining children
+        for child in still_alive:
+            try:
+                logger.debug("Force-killing child PID %d", child.pid)
+                child.kill()
+                result.children_killed += 1
+            except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
+                logger.debug("Child PID %d gone during force-kill: %s", child.pid, e)
+
+        if result.children_killed > 0:
+            result.status = "partial"
+
+        # Now terminate the parent
+        logger.debug("Terminating parent PID %d", proc.pid)
+        proc.terminate()
+        try:
+            proc.wait(timeout=timeout)
+            logger.debug("Parent PID %d terminated gracefully", proc.pid)
+        except subprocess.TimeoutExpired:
+            logger.debug("Parent PID %d did not terminate, force-killing", proc.pid)
+            proc.kill()
+            proc.wait()
+            result.parent_forcekilled = True
+            result.status = "partial"
+
+        logger.debug(
+            "Process tree kill complete: status=%s, children=%d (terminated=%d, killed=%d)",
+            result.status, result.children_found,
+            result.children_terminated, result.children_killed
+        )
+
+    except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
+        # NoSuchProcess: Process already dead
+        # AccessDenied: Windows can raise this for protected/system processes
+        # In either case, just ensure cleanup
+        logger.debug("Parent PID %d inaccessible (%s), attempting direct cleanup", proc.pid, e)
+        try:
+            proc.terminate()
+            proc.wait(timeout=1)
+            logger.debug("Direct termination of PID %d succeeded", proc.pid)
+        except (subprocess.TimeoutExpired, OSError):
+            try:
+                proc.kill()
+                logger.debug("Direct force-kill of PID %d succeeded", proc.pid)
+            except OSError as kill_error:
+                logger.debug("Direct force-kill of PID %d failed: %s", proc.pid, kill_error)
+                result.status = "failure"
+
+    return result
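For orientation, a brief usage sketch of the new helper. The function name, signature, and KillResult fields come from the file above; the calling code here is illustrative and simply mirrors the run_in_executor pattern the managers use in the earlier hunks so the blocking waits stay off the event loop.

import asyncio
import subprocess

from server.utils.process_utils import kill_process_tree


async def stop_child(proc: subprocess.Popen) -> None:
    # kill_process_tree blocks on wait_procs/wait, so run it in a worker thread
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, kill_process_tree, proc, 5.0)
    if result.status != "success":
        print(f"Shutdown of PID {result.parent_pid} finished with status {result.status}")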
@@ -199,13 +199,23 @@ class AgentTracker:
         return None

     async def _handle_testing_agent_start(self, line: str) -> dict | None:
-        """Handle testing agent start message from orchestrator."""
+        """Handle testing agent start message from orchestrator.
+
+        Reuses existing testing agent entry if present to avoid ghost agents in UI.
+        """
         async with self._lock:
-            agent_index = self._next_agent_index
-            self._next_agent_index += 1
+            # Reuse existing testing agent entry if present
+            existing = self.active_agents.get(self.TESTING_AGENT_KEY)
+            if existing:
+                agent_index = existing['agent_index']
+                agent_name = existing['name']
+            else:
+                agent_index = self._next_agent_index
+                self._next_agent_index += 1
+                agent_name = AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)]

             self.active_agents[self.TESTING_AGENT_KEY] = {
-                'name': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)],
+                'name': agent_name,
                 'agent_index': agent_index,
                 'agent_type': 'testing',
                 'state': 'testing',
@@ -216,7 +226,7 @@ class AgentTracker:
             return {
                 'type': 'agent_update',
                 'agentIndex': agent_index,
-                'agentName': AGENT_MASCOTS[agent_index % len(AGENT_MASCOTS)],
+                'agentName': agent_name,
                 'agentType': 'testing',
                 'featureId': 0,
                 'featureName': 'Regression Testing',
@@ -251,16 +261,31 @@ class AgentTracker:

             return result

-    def get_agent_info(self, feature_id: int) -> tuple[int | None, str | None]:
+    async def get_agent_info(self, feature_id: int) -> tuple[int | None, str | None]:
         """Get agent index and name for a feature ID.

+        Thread-safe method that acquires the lock before reading state.
+
         Returns:
             Tuple of (agentIndex, agentName) or (None, None) if not tracked.
         """
-        agent = self.active_agents.get(feature_id)
-        if agent:
-            return agent['agent_index'], agent['name']
-        return None, None
+        async with self._lock:
+            agent = self.active_agents.get(feature_id)
+            if agent:
+                return agent['agent_index'], agent['name']
+            return None, None

+    async def reset(self):
+        """Reset tracker state when orchestrator stops or crashes.
+
+        Clears all active agents and resets the index counter to prevent
+        ghost agents accumulating across start/stop cycles.
+
+        Must be called with await since it acquires the async lock.
+        """
+        async with self._lock:
+            self.active_agents.clear()
+            self._next_agent_index = 0
+
     async def _handle_agent_start(self, feature_id: int, line: str, agent_type: str = "coding") -> dict | None:
         """Handle agent start message from orchestrator."""
@@ -482,7 +507,7 @@ async def project_websocket(websocket: WebSocket, project_name: str):
                 match = FEATURE_ID_PATTERN.match(line)
                 if match:
                     feature_id = int(match.group(1))
-                    agent_index, _ = agent_tracker.get_agent_info(feature_id)
+                    agent_index, _ = await agent_tracker.get_agent_info(feature_id)

                 # Send the raw log line with optional feature/agent attribution
                 log_msg = {
@@ -512,6 +537,9 @@ async def project_websocket(websocket: WebSocket, project_name: str):
                     "type": "agent_status",
                     "status": status,
                 })
+                # Reset tracker when agent stops OR crashes to prevent ghost agents on restart
+                if status in ("stopped", "crashed"):
+                    await agent_tracker.reset()
             except Exception:
                 pass  # Connection may be closed
