fix: prevent runaway testing agent spawning (critical)

running_testing_agents was keyed by feature_id, so when multiple agents tested the same feature, each spawn overwrote the previous dict entry. As a result the count stayed at 1 regardless of how many processes were actually running, and the maintain loop kept spawning agents indefinitely (~130+).

Re-key the dict by PID so that each agent gets a unique entry and the existing max-agent guards work correctly. Also check the return value of _spawn_testing_agent() and stop spawning on failure.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-17 19:03:09 +00:00 · 2026-01-29 15:02:08 +01:00
parent 3edb380b58
commit 80b6af7b2b
1 changed files with 14 additions and 12 deletions
--- a/parallel_orchestrator.py
+++ b/parallel_orchestrator.py
@@ -170,8 +170,9 @@ class ParallelOrchestrator:
        self._lock = threading.Lock()
        # Coding agents: feature_id -> process
        self.running_coding_agents: dict[int, subprocess.Popen] = {}
-        # Testing agents: feature_id -> process (feature being tested)
-        self.running_testing_agents: dict[int, subprocess.Popen] = {}
+        # Testing agents: pid -> (feature_id, process)
+        # Keyed by PID (not feature_id) because multiple agents can test the same feature
+        self.running_testing_agents: dict[int, tuple[int, subprocess.Popen]] = {}
        # Legacy alias for backward compatibility
        self.running_agents = self.running_coding_agents
        self.abort_events: dict[int, threading.Event] = {}
@@ -429,7 +430,10 @@ class ParallelOrchestrator:

            # Spawn outside lock (I/O bound operation)
            print(f"[DEBUG] Spawning testing agent ({spawn_index}/{desired})", flush=True)
-            self._spawn_testing_agent()
+            success, msg = self._spawn_testing_agent()
+            if not success:
+                debug_log.log("TESTING", f"Spawn failed, stopping: {msg}")
+                return

    def start_feature(self, feature_id: int, resume: bool = False) -> tuple[bool, str]:
        """Start a single coding agent for a feature.
@@ -611,8 +615,9 @@ class ParallelOrchestrator:
                debug_log.log("TESTING", f"FAILED to spawn testing agent: {e}")
                return False, f"Failed to start testing agent: {e}"

-            # Register process with feature ID (same pattern as coding agents)
-            self.running_testing_agents[feature_id] = proc
+            # Register process by PID (not feature_id) to avoid overwrites
+            # when multiple agents test the same feature
+            self.running_testing_agents[proc.pid] = (feature_id, proc)
            testing_count = len(self.running_testing_agents)

        # Start output reader thread with feature ID (same as coding agents)
@@ -795,11 +800,8 @@ class ParallelOrchestrator:
        """
        if agent_type == "testing":
            with self._lock:
-                # Remove from dict by finding the feature_id for this proc
-                for fid, p in list(self.running_testing_agents.items()):
-                    if p is proc:
-                        del self.running_testing_agents[fid]
-                        break
+                # Remove by PID
+                self.running_testing_agents.pop(proc.pid, None)

            status = "completed" if return_code == 0 else "failed"
            print(f"Feature #{feature_id} testing {status}", flush=True)
@@ -898,9 +900,9 @@ class ParallelOrchestrator:
        with self._lock:
            testing_items = list(self.running_testing_agents.items())

-        for feature_id, proc in testing_items:
+        for pid, (feature_id, proc) in testing_items:
            result = kill_process_tree(proc, timeout=5.0)
-            debug_log.log("STOP", f"Killed testing agent for feature #{feature_id} (PID {proc.pid})",
+            debug_log.log("STOP", f"Killed testing agent for feature #{feature_id} (PID {pid})",
                status=result.status, children_found=result.children_found,
                children_terminated=result.children_terminated, children_killed=result.children_killed)