feat: decouple regression testing agents from coding agents

Major refactoring of the parallel orchestrator to run regression testing
agents independently from coding agents. This improves system reliability
and provides better control over testing behavior.

Key changes:

Database & MCP Layer:
- Add testing_in_progress and last_tested_at columns to Feature model
- Add feature_claim_for_testing() for atomic test claim with retry
- Add feature_release_testing() to release claims after testing
- Refactor claim functions to iterative loops (no recursion)
- Add OperationalError retry handling for transient DB errors
- Reduce MAX_CLAIM_RETRIES from 10 to 5

Orchestrator:
- Decouple testing agent lifecycle from coding agents
- Add _maintain_testing_agents() for continuous testing maintenance
- Fix TOCTOU race in _spawn_testing_agent() - hold lock during spawn
- Add _cleanup_stale_testing_locks() with 30-min timeout
- Fix log ordering - start_session() before stale flag cleanup
- Add stale testing_in_progress cleanup on startup

Dead Code Removal:
- Remove count_testing_in_concurrency from the entire stack (12+ files)
- Remove ineffective with_for_update() from features router

API & UI:
- Pass testing_agent_ratio via CLI to orchestrator
- Update testing prompt template to use new claim/release tools
- Rename UI label to "Regression Agents" with clearer description
- Add process_utils.py for cross-platform process tree management

Testing agents now:
- Run continuously as long as passing features exist
- Can re-test features multiple times to catch regressions
- Are controlled by a fixed count (0-3) via the testing_agent_ratio setting
- Have atomic claiming to prevent concurrent testing of same feature

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Auto
2026-01-22 15:22:48 +02:00
parent 29c6b252a9
commit 357083dbae
20 changed files with 841 additions and 382 deletions

View File

@@ -23,15 +23,16 @@ import os
import subprocess
import sys
import threading
from datetime import datetime
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Callable, Literal
import psutil
from sqlalchemy import text
from api.database import Feature, create_database
from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores
from progress import has_features
from server.utils.process_utils import kill_process_tree
# Root directory of autocoder (where this script and autonomous_agent_demo.py live)
AUTOCODER_ROOT = Path(__file__).parent.resolve()
@@ -98,70 +99,44 @@ def _dump_database_state(session, label: str = ""):
pending_count=len(pending),
pending_ids=[f.id for f in pending[:10]]) # First 10 pending only
# Performance: Limit parallel agents to prevent memory exhaustion
# =============================================================================
# Process Limits
# =============================================================================
# These constants bound the number of concurrent agent processes to prevent
# resource exhaustion (memory, CPU, API rate limits).
#
# MAX_PARALLEL_AGENTS: Max concurrent coding agents (each is a Claude session)
# MAX_TOTAL_AGENTS: Hard limit on total child processes (coding + testing)
#
# Expected process count during normal operation:
# - 1 orchestrator process (this script)
# - Up to MAX_PARALLEL_AGENTS coding agents
# - Up to max_concurrency testing agents
# - Total never exceeds MAX_TOTAL_AGENTS + 1 (including orchestrator)
#
# Stress test verification:
# 1. Note baseline: tasklist | findstr python | find /c /v ""
# 2. Run: python autonomous_agent_demo.py --project-dir test --parallel --max-concurrency 5
# 3. During run: count should never exceed baseline + 11 (1 orchestrator + 10 agents)
# 4. After stop: should return to baseline
# =============================================================================
MAX_PARALLEL_AGENTS = 5
MAX_TOTAL_AGENTS = 10 # Hard limit on total agents (coding + testing)
MAX_TOTAL_AGENTS = 10
DEFAULT_CONCURRENCY = 3
POLL_INTERVAL = 5 # seconds between checking for ready features
MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature
INITIALIZER_TIMEOUT = 1800 # 30 minutes timeout for initializer
def _kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> None:
    """Kill a process and all its child processes.

    On Windows, subprocess.terminate() only kills the immediate process, leaving
    orphaned child processes (e.g., spawned browser instances). This function
    uses psutil to kill the entire process tree.

    Args:
        proc: The subprocess.Popen object to kill
        timeout: Seconds to wait for graceful termination before force-killing
    """
    try:
        parent = psutil.Process(proc.pid)
        # Snapshot all descendants BEFORE terminating the parent; once the
        # parent exits we can no longer enumerate its children.
        children = parent.children(recursive=True)
        # Terminate children first (graceful)
        for child in children:
            try:
                child.terminate()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Child already exited, or we lack permission (e.g. a child
                # that elevated privileges on Windows) -- skip rather than
                # crash the whole cleanup path.
                pass
        # Wait for children to terminate
        _, still_alive = psutil.wait_procs(children, timeout=timeout)
        # Force kill any remaining children
        for child in still_alive:
            try:
                child.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass
        # Now terminate the parent
        proc.terminate()
        try:
            proc.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
    except psutil.NoSuchProcess:
        # Process already dead; still run Popen cleanup so the handle is reaped
        try:
            proc.terminate()
            proc.wait(timeout=1)
        except (subprocess.TimeoutExpired, OSError):
            try:
                proc.kill()
            except OSError:
                pass
STALE_TESTING_LOCK_MINUTES = 30 # Auto-release testing locks older than this
class ParallelOrchestrator:
"""Orchestrates parallel execution of independent features."""
"""Orchestrates parallel execution of independent features.
Process bounds:
- Up to MAX_PARALLEL_AGENTS (5) coding agents concurrently
- Up to max_concurrency testing agents concurrently
- Hard limit of MAX_TOTAL_AGENTS (10) total child processes
"""
def __init__(
self,
@@ -170,7 +145,6 @@ class ParallelOrchestrator:
model: str = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
count_testing_in_concurrency: bool = False,
on_output: Callable[[int, str], None] = None,
on_status: Callable[[int, str], None] = None,
):
@@ -178,11 +152,12 @@ class ParallelOrchestrator:
Args:
project_dir: Path to the project directory
max_concurrency: Maximum number of concurrent coding agents (1-5)
max_concurrency: Maximum number of concurrent coding agents (1-5).
Also caps testing agents at the same limit.
model: Claude model to use (or None for default)
yolo_mode: Whether to run in YOLO mode (skip testing agents)
testing_agent_ratio: Testing agents per coding agent (0-3, default 1)
count_testing_in_concurrency: If True, testing agents count toward concurrency limit
yolo_mode: Whether to run in YOLO mode (skip testing agents entirely)
testing_agent_ratio: Number of regression testing agents to maintain (0-3).
0 = disabled, 1-3 = maintain that many testing agents running independently.
on_output: Callback for agent output (feature_id, line)
on_status: Callback for agent status changes (feature_id, status)
"""
@@ -191,7 +166,6 @@ class ParallelOrchestrator:
self.model = model
self.yolo_mode = yolo_mode
self.testing_agent_ratio = min(max(testing_agent_ratio, 0), 3) # Clamp 0-3
self.count_testing_in_concurrency = count_testing_in_concurrency
self.on_output = on_output
self.on_status = on_status
@@ -209,6 +183,9 @@ class ParallelOrchestrator:
# Track feature failures to prevent infinite retry loops
self._failure_counts: dict[int, int] = {}
# Session tracking for logging/debugging
self.session_start_time: datetime = None
# Database session for this orchestrator
self._engine, self._session_maker = create_database(project_dir)
@@ -369,6 +346,110 @@ class ParallelOrchestrator:
finally:
session.close()
def _cleanup_stale_testing_locks(self) -> None:
    """Release stale testing locks from crashed testing agents.

    A feature counts as stale when testing_in_progress is True AND
    last_tested_at is non-NULL and older than STALE_TESTING_LOCK_MINUTES.

    Features with last_tested_at=NULL are deliberately left alone: those may
    legitimately be in the middle of their very first test (last_tested_at is
    only written when a test completes), so releasing them would be a false
    positive.

    Running this periodically lets the orchestrator recover features locked by
    a testing agent that crashed mid-test, without requiring a restart.
    """
    session = self.get_session()
    try:
        # Timezone-aware UTC, stripped to naive for SQLite comparison
        # (SQLite stores naive datetime strings; the rest of the codebase
        # uses datetime.now(timezone.utc) as its reference clock).
        cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(
            minutes=STALE_TESTING_LOCK_MINUTES
        )
        # Stale = locked AND has a completed test older than the cutoff;
        # NULL last_tested_at is excluded (first test still running).
        stale = session.query(Feature).filter(
            Feature.testing_in_progress == True,  # noqa: E712 (SQLAlchemy expression)
            Feature.last_tested_at.isnot(None),
            Feature.last_tested_at < cutoff,
        ).all()
        if not stale:
            return
        stale_ids = [feature.id for feature in stale]
        # Bulk ORM update (avoids raw-SQL IN-clause quirks on SQLite)
        session.query(Feature).filter(Feature.id.in_(stale_ids)).update(
            {"testing_in_progress": False},
            synchronize_session=False,
        )
        session.commit()
        print(f"[CLEANUP] Released {len(stale_ids)} stale testing locks: {stale_ids}", flush=True)
        debug_log.log("CLEANUP", "Released stale testing locks", feature_ids=stale_ids)
    except Exception as e:
        session.rollback()
        print(f"[CLEANUP] Error cleaning stale locks: {e}", flush=True)
        debug_log.log("CLEANUP", f"Error cleaning stale locks: {e}")
    finally:
        session.close()
def _maintain_testing_agents(self) -> None:
    """Keep the configured number of regression testing agents running.

    Called on every main-loop iteration. Testing agents are decoupled from
    coding agents: they continuously re-test passing features to catch
    regressions, and are topped up here whenever the live count drops below
    testing_agent_ratio.

    Also triggers periodic release of stale testing locks (features stuck
    with testing_in_progress=True longer than STALE_TESTING_LOCK_MINUTES).

    No-ops when:
    - YOLO mode is on
    - testing_agent_ratio is 0
    - there are no passing features yet
    """
    # Testing disabled entirely?
    if self.yolo_mode or self.testing_agent_ratio == 0:
        return

    # Recover features left locked by crashed testing agents
    self._cleanup_stale_testing_locks()

    # Nothing to regression-test until at least one feature passes
    passing = self.get_passing_count()
    if not passing:
        return

    # Top up one agent at a time, re-validating limits under the lock each
    # iteration so concurrent spawns can't overshoot (TOCTOU-safe).
    while True:
        with self._lock:
            testing_now = len(self.running_testing_agents)
            target = self.testing_agent_ratio
            if testing_now >= target:
                return  # Desired count already running
            if len(self.running_coding_agents) + testing_now >= MAX_TOTAL_AGENTS:
                return  # Hard cap on total child processes
            next_slot = testing_now + 1
            # Log the decision while still holding the lock
            debug_log.log("TESTING", f"Spawning testing agent ({next_slot}/{target})",
                          passing_count=passing)
        # Actual spawn happens outside the lock (I/O bound)
        print(f"[DEBUG] Spawning testing agent ({next_slot}/{target})", flush=True)
        self._spawn_testing_agent()
def start_feature(self, feature_id: int, resume: bool = False) -> tuple[bool, str]:
"""Start a single coding agent for a feature.
@@ -384,6 +465,10 @@ class ParallelOrchestrator:
return False, "Feature already running"
if len(self.running_coding_agents) >= self.max_concurrency:
return False, "At max concurrency"
# Enforce hard limit on total agents (coding + testing)
total_agents = len(self.running_coding_agents) + len(self.running_testing_agents)
if total_agents >= MAX_TOTAL_AGENTS:
return False, f"At max total agents ({total_agents}/{MAX_TOTAL_AGENTS})"
# Mark as in_progress in database (or verify it's resumable)
session = self.get_session()
@@ -412,16 +497,8 @@ class ParallelOrchestrator:
if not success:
return False, message
# Spawn ONE testing agent when coding agent STARTS (if not YOLO mode and passing features exist)
# Testing agents exit after one test, so we spawn fresh ones with each coding agent start
if not self.yolo_mode and self.testing_agent_ratio > 0:
passing_count = self.get_passing_count()
if passing_count > 0:
print(f"[DEBUG] Coding agent started, spawning testing agent (passing_count={passing_count})", flush=True)
debug_log.log("TESTING", "Spawning testing agent on coding agent start",
feature_id=feature_id,
passing_count=passing_count)
self._spawn_testing_agent()
# NOTE: Testing agents are now maintained independently via _maintain_testing_agents()
# called in the main loop, rather than being spawned when coding agents start.
return True, f"Started feature {feature_id}"
@@ -483,58 +560,58 @@ class ParallelOrchestrator:
print(f"Started coding agent for feature #{feature_id}", flush=True)
return True, f"Started feature {feature_id}"
def _spawn_testing_agents(self) -> None:
    """Spawn testing agents based on testing_agent_ratio."""
    for _ in range(self.testing_agent_ratio):
        # Re-check resource limits before each spawn
        with self._lock:
            active = len(self.running_coding_agents) + len(self.running_testing_agents)
            if active >= MAX_TOTAL_AGENTS:
                print(f"[DEBUG] At max total agents ({MAX_TOTAL_AGENTS}), skipping testing agent", flush=True)
                break
            if self.count_testing_in_concurrency and active >= self.max_concurrency:
                print("[DEBUG] Testing agents count toward concurrency, at limit", flush=True)
                break
        # Launch one testing agent subprocess
        self._spawn_testing_agent()
def _spawn_testing_agent(self) -> tuple[bool, str]:
"""Spawn a testing agent subprocess for regression testing."""
debug_log.log("TESTING", "Attempting to spawn testing agent subprocess")
cmd = [
sys.executable,
"-u",
str(AUTOCODER_ROOT / "autonomous_agent_demo.py"),
"--project-dir", str(self.project_dir),
"--max-iterations", "1",
"--agent-type", "testing",
]
if self.model:
cmd.extend(["--model", self.model])
# Testing agents don't need --yolo flag (they use testing prompt regardless)
try:
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
cwd=str(AUTOCODER_ROOT),
env={**os.environ, "PYTHONUNBUFFERED": "1"},
)
except Exception as e:
debug_log.log("TESTING", f"FAILED to spawn testing agent: {e}")
return False, f"Failed to start testing agent: {e}"
"""Spawn a testing agent subprocess for regression testing.
CRITICAL: Lock is held during the entire spawn operation to prevent
TOCTOU race conditions where multiple threads could pass limit checks
and spawn excess agents.
"""
# Hold lock for entire operation to prevent TOCTOU race
with self._lock:
# Check limits
current_testing_count = len(self.running_testing_agents)
if current_testing_count >= self.max_concurrency:
debug_log.log("TESTING", f"Skipped spawn - at max testing agents ({current_testing_count}/{self.max_concurrency})")
return False, f"At max testing agents ({current_testing_count})"
total_agents = len(self.running_coding_agents) + len(self.running_testing_agents)
if total_agents >= MAX_TOTAL_AGENTS:
debug_log.log("TESTING", f"Skipped spawn - at max total agents ({total_agents}/{MAX_TOTAL_AGENTS})")
return False, f"At max total agents ({total_agents})"
debug_log.log("TESTING", "Attempting to spawn testing agent subprocess")
cmd = [
sys.executable,
"-u",
str(AUTOCODER_ROOT / "autonomous_agent_demo.py"),
"--project-dir", str(self.project_dir),
"--max-iterations", "1",
"--agent-type", "testing",
]
if self.model:
cmd.extend(["--model", self.model])
# Testing agents don't need --yolo flag (they use testing prompt regardless)
try:
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
cwd=str(AUTOCODER_ROOT),
env={**os.environ, "PYTHONUNBUFFERED": "1"},
)
except Exception as e:
debug_log.log("TESTING", f"FAILED to spawn testing agent: {e}")
return False, f"Failed to start testing agent: {e}"
# Register process immediately while still holding lock
self.running_testing_agents.append(proc)
testing_count = len(self.running_testing_agents)
# Start output reader thread (feature_id=None for testing agents)
# This can be outside lock since process is already registered
threading.Thread(
target=self._read_output,
args=(None, proc, threading.Event(), "testing"),
@@ -598,7 +675,9 @@ class ParallelOrchestrator:
print(f"ERROR: Initializer timed out after {INITIALIZER_TIMEOUT // 60} minutes", flush=True)
debug_log.log("INIT", "TIMEOUT - Initializer exceeded time limit",
timeout_minutes=INITIALIZER_TIMEOUT // 60)
_kill_process_tree(proc)
result = kill_process_tree(proc)
debug_log.log("INIT", "Killed timed-out initializer process tree",
status=result.status, children_found=result.children_found)
return False
debug_log.log("INIT", "Initializer subprocess completed",
@@ -747,7 +826,10 @@ class ParallelOrchestrator:
abort.set()
if proc:
# Kill entire process tree to avoid orphaned children (e.g., browser instances)
_kill_process_tree(proc, timeout=5.0)
result = kill_process_tree(proc, timeout=5.0)
debug_log.log("STOP", f"Killed feature {feature_id} process tree",
status=result.status, children_found=result.children_found,
children_terminated=result.children_terminated, children_killed=result.children_killed)
return True, f"Stopped feature {feature_id}"
@@ -767,15 +849,35 @@ class ParallelOrchestrator:
testing_procs = list(self.running_testing_agents)
for proc in testing_procs:
_kill_process_tree(proc, timeout=5.0)
result = kill_process_tree(proc, timeout=5.0)
debug_log.log("STOP", f"Killed testing agent PID {proc.pid} process tree",
status=result.status, children_found=result.children_found,
children_terminated=result.children_terminated, children_killed=result.children_killed)
async def run_loop(self):
"""Main orchestration loop."""
self.is_running = True
# Start debug logging session (clears previous logs)
# Track session start for regression testing (UTC for consistency with last_tested_at)
self.session_start_time = datetime.now(timezone.utc)
# Start debug logging session FIRST (clears previous logs)
# Must happen before any debug_log.log() calls
debug_log.start_session()
# Clear stale testing_in_progress flags from crashed testing agents
# This ensures features aren't permanently locked if a previous session crashed
session = self.get_session()
try:
stale_count = session.query(Feature).filter(Feature.testing_in_progress == True).count()
if stale_count > 0:
session.execute(text("UPDATE features SET testing_in_progress = 0 WHERE testing_in_progress = 1"))
session.commit()
print(f"[STARTUP] Cleared {stale_count} stale testing_in_progress flags", flush=True)
debug_log.log("STARTUP", f"Cleared {stale_count} stale testing_in_progress flags")
finally:
session.close()
# Log startup to debug file
debug_log.section("ORCHESTRATOR STARTUP")
debug_log.log("STARTUP", "Orchestrator run_loop starting",
@@ -783,7 +885,7 @@ class ParallelOrchestrator:
max_concurrency=self.max_concurrency,
yolo_mode=self.yolo_mode,
testing_agent_ratio=self.testing_agent_ratio,
count_testing_in_concurrency=self.count_testing_in_concurrency)
session_start_time=self.session_start_time.isoformat())
print("=" * 70, flush=True)
print(" UNIFIED ORCHESTRATOR SETTINGS", flush=True)
@@ -791,8 +893,7 @@ class ParallelOrchestrator:
print(f"Project: {self.project_dir}", flush=True)
print(f"Max concurrency: {self.max_concurrency} coding agents", flush=True)
print(f"YOLO mode: {self.yolo_mode}", flush=True)
print(f"Testing agent ratio: {self.testing_agent_ratio} per coding agent", flush=True)
print(f"Count testing in concurrency: {self.count_testing_in_concurrency}", flush=True)
print(f"Regression agents: {self.testing_agent_ratio} (maintained independently)", flush=True)
print("=" * 70, flush=True)
print(flush=True)
@@ -891,6 +992,9 @@ class ParallelOrchestrator:
print("\nAll features complete!", flush=True)
break
# Maintain testing agents independently (runs every iteration)
self._maintain_testing_agents()
# Check capacity
with self._lock:
current = len(self.running_coding_agents)
@@ -1003,7 +1107,6 @@ class ParallelOrchestrator:
"count": len(self.running_coding_agents), # Legacy compatibility
"max_concurrency": self.max_concurrency,
"testing_agent_ratio": self.testing_agent_ratio,
"count_testing_in_concurrency": self.count_testing_in_concurrency,
"is_running": self.is_running,
"yolo_mode": self.yolo_mode,
}
@@ -1015,7 +1118,6 @@ async def run_parallel_orchestrator(
model: str = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
count_testing_in_concurrency: bool = False,
) -> None:
"""Run the unified orchestrator.
@@ -1024,8 +1126,7 @@ async def run_parallel_orchestrator(
max_concurrency: Maximum number of concurrent coding agents
model: Claude model to use
yolo_mode: Whether to run in YOLO mode (skip testing agents)
testing_agent_ratio: Testing agents per coding agent (0-3)
count_testing_in_concurrency: If True, testing agents count toward concurrency limit
testing_agent_ratio: Number of regression agents to maintain (0-3)
"""
print(f"[ORCHESTRATOR] run_parallel_orchestrator called with max_concurrency={max_concurrency}", flush=True)
orchestrator = ParallelOrchestrator(
@@ -1034,7 +1135,6 @@ async def run_parallel_orchestrator(
model=model,
yolo_mode=yolo_mode,
testing_agent_ratio=testing_agent_ratio,
count_testing_in_concurrency=count_testing_in_concurrency,
)
try:
@@ -1082,6 +1182,12 @@ def main():
default=False,
help="Enable YOLO mode: rapid prototyping without browser testing",
)
parser.add_argument(
"--testing-agent-ratio",
type=int,
default=1,
help="Number of regression testing agents (0-3, default: 1). Set to 0 to disable testing agents.",
)
args = parser.parse_args()
@@ -1107,6 +1213,7 @@ def main():
max_concurrency=args.max_concurrency,
model=args.model,
yolo_mode=args.yolo,
testing_agent_ratio=args.testing_agent_ratio,
))
except KeyboardInterrupt:
print("\n\nInterrupted by user", flush=True)