mirror of
https://github.com/leonvanzyl/autocoder.git
synced 2026-01-31 06:42:06 +00:00
feat: decouple regression testing agents from coding agents
Major refactoring of the parallel orchestrator to run regression testing agents independently from coding agents. This improves system reliability and provides better control over testing behavior.

Key changes:

Database & MCP Layer:
- Add testing_in_progress and last_tested_at columns to Feature model
- Add feature_claim_for_testing() for atomic test claim with retry
- Add feature_release_testing() to release claims after testing
- Refactor claim functions to iterative loops (no recursion)
- Add OperationalError retry handling for transient DB errors
- Reduce MAX_CLAIM_RETRIES from 10 to 5

Orchestrator:
- Decouple testing agent lifecycle from coding agents
- Add _maintain_testing_agents() for continuous testing maintenance
- Fix TOCTOU race in _spawn_testing_agent() - hold lock during spawn
- Add _cleanup_stale_testing_locks() with 30-min timeout
- Fix log ordering - start_session() before stale flag cleanup
- Add stale testing_in_progress cleanup on startup

Dead Code Removal:
- Remove count_testing_in_concurrency from entire stack (12+ files)
- Remove ineffective with_for_update() from features router

API & UI:
- Pass testing_agent_ratio via CLI to orchestrator
- Update testing prompt template to use new claim/release tools
- Rename UI label to "Regression Agents" with clearer description
- Add process_utils.py for cross-platform process tree management

Testing agents now:
- Run continuously as long as passing features exist
- Can re-test features multiple times to catch regressions
- Are controlled by fixed count (0-3) via testing_agent_ratio setting
- Have atomic claiming to prevent concurrent testing of same feature

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
134
server/utils/process_utils.py
Normal file
134
server/utils/process_utils.py
Normal file
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Process Utilities
|
||||
=================
|
||||
|
||||
Shared utilities for process management across the codebase.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
import psutil
|
||||
|
||||
# Module-level logger, named after this module so log output can be
# filtered per module by the logging configuration.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class KillResult:
    """Outcome and statistics of one process-tree kill operation."""

    # Overall outcome: "success" when everything terminated gracefully,
    # "partial" when at least one process had to be force-killed,
    # "failure" when the parent process could not be killed.
    status: Literal["success", "partial", "failure"]

    # PID of the parent (root) process of the tree.
    parent_pid: int

    # How many child processes were discovered under the parent.
    children_found: int = 0

    # How many of those children exited during the graceful phase.
    children_terminated: int = 0

    # How many children had to be force-killed (SIGKILL).
    children_killed: int = 0

    # True when the parent itself had to be force-killed (SIGKILL).
    parent_forcekilled: bool = False
|
||||
|
||||
|
||||
def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResult:
    """Kill a process and all its child processes.

    On Windows, subprocess.terminate() only kills the immediate process, leaving
    orphaned child processes (e.g., spawned browser instances, coding/testing agents).
    This function uses psutil to kill the entire process tree.

    Children are terminated first (graceful, then force-kill for any still
    alive after ``timeout``), then the parent is terminated the same way.
    If psutil cannot inspect the parent (already gone, or access denied), the
    children are skipped and only the parent is stopped directly through the
    ``Popen`` handle.

    Args:
        proc: The subprocess.Popen object to kill
        timeout: Seconds to wait for graceful termination before force-killing.
            The same bound is applied to the wait that follows a force-kill,
            so this function never blocks indefinitely on the parent.

    Returns:
        KillResult with status and statistics about the termination.
        ``status`` is "partial" whenever any process (child or parent)
        required a force-kill, and "failure" when the parent could not be
        killed or did not exit after being force-killed.
    """
    result = KillResult(status="success", parent_pid=proc.pid)

    # Only the psutil lookups are guarded here: the per-child operations
    # handle their own psutil errors, and the parent is stopped through the
    # Popen handle, which raises OSError rather than psutil exceptions.
    try:
        parent = psutil.Process(proc.pid)
        # Get all children recursively before terminating
        children = parent.children(recursive=True)
    except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
        # NoSuchProcess: Process already dead
        # AccessDenied: Windows can raise this for protected/system processes
        # In either case, just ensure cleanup
        logger.debug("Parent PID %d inaccessible (%s), attempting direct cleanup", proc.pid, e)
        # Use a short grace period here: the process is most likely already
        # gone, so there is no point waiting the full timeout.
        _stop_parent(proc, min(timeout, 1.0), result)
        return result

    result.children_found = len(children)
    logger.debug(
        "Killing process tree: PID %d with %d children",
        proc.pid, len(children)
    )

    _terminate_children(children, timeout, result)
    _stop_parent(proc, timeout, result)

    logger.debug(
        "Process tree kill complete: status=%s, children=%d (terminated=%d, killed=%d)",
        result.status, result.children_found,
        result.children_terminated, result.children_killed
    )
    return result


def _terminate_children(children: list, timeout: float, result: KillResult) -> None:
    """Terminate child processes, force-kill stragglers, and record counts on ``result``."""
    # Terminate children first (graceful)
    for child in children:
        try:
            logger.debug("Terminating child PID %d (%s)", child.pid, child.name())
            child.terminate()
        except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
            # NoSuchProcess: already dead
            # AccessDenied: Windows can raise this for system processes or already-exited processes
            logger.debug("Child PID %d already gone or inaccessible: %s", child.pid, e)

    # Wait for children to terminate
    gone, still_alive = psutil.wait_procs(children, timeout=timeout)
    result.children_terminated = len(gone)
    logger.debug(
        "Children after graceful wait: %d terminated, %d still alive",
        len(gone), len(still_alive)
    )

    # Force kill any remaining children
    for child in still_alive:
        try:
            logger.debug("Force-killing child PID %d", child.pid)
            child.kill()
            result.children_killed += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
            logger.debug("Child PID %d gone during force-kill: %s", child.pid, e)

    if result.children_killed > 0:
        result.status = "partial"


def _stop_parent(proc: subprocess.Popen, timeout: float, result: KillResult) -> None:
    """Terminate ``proc``, escalating to a force-kill, and record the outcome on ``result``."""
    logger.debug("Terminating parent PID %d", proc.pid)
    try:
        proc.terminate()
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        logger.debug("Parent PID %d did not terminate, force-killing", proc.pid)
    except OSError as term_error:
        logger.debug(
            "Terminating parent PID %d failed (%s), force-killing", proc.pid, term_error
        )
    else:
        logger.debug("Parent PID %d terminated gracefully", proc.pid)
        return

    try:
        proc.kill()
        # Always wait after kill() so the child is reaped (no zombie / leaked
        # handle); the wait is bounded so an unkillable process cannot hang us.
        proc.wait(timeout=timeout)
    except (subprocess.TimeoutExpired, OSError) as kill_error:
        logger.debug("Direct force-kill of PID %d failed: %s", proc.pid, kill_error)
        result.status = "failure"
        return

    logger.debug("Direct force-kill of PID %d succeeded", proc.pid)
    result.parent_forcekilled = True
    result.status = "partial"
|
||||
Reference in New Issue
Block a user