fix: prevent runaway testing agent spawning (critical)

running_testing_agents was keyed by feature_id, so when multiple agents
tested the same feature, each spawn overwrote the previous dict entry.
The dict length (used as the running-agent count) therefore stayed at 1
regardless of how many processes were actually running, causing the
maintain loop to spawn agents indefinitely (130+ agents).

Re-key the dict by PID, storing (feature_id, process) as the value, so
each agent gets a unique entry and the existing max-agent guards work
correctly. Also check the return value of _spawn_testing_agent() and
stop spawning (return from the maintain loop) on failure.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Marian Paul
2026-01-29 15:02:08 +01:00
parent 3edb380b58
commit 80b6af7b2b

View File

@@ -170,8 +170,9 @@ class ParallelOrchestrator:
self._lock = threading.Lock()
# Coding agents: feature_id -> process
self.running_coding_agents: dict[int, subprocess.Popen] = {}
# Testing agents: feature_id -> process (feature being tested)
self.running_testing_agents: dict[int, subprocess.Popen] = {}
# Testing agents: pid -> (feature_id, process)
# Keyed by PID (not feature_id) because multiple agents can test the same feature
self.running_testing_agents: dict[int, tuple[int, subprocess.Popen]] = {}
# Legacy alias for backward compatibility
self.running_agents = self.running_coding_agents
self.abort_events: dict[int, threading.Event] = {}
@@ -429,7 +430,10 @@ class ParallelOrchestrator:
# Spawn outside lock (I/O bound operation)
print(f"[DEBUG] Spawning testing agent ({spawn_index}/{desired})", flush=True)
self._spawn_testing_agent()
success, msg = self._spawn_testing_agent()
if not success:
debug_log.log("TESTING", f"Spawn failed, stopping: {msg}")
return
def start_feature(self, feature_id: int, resume: bool = False) -> tuple[bool, str]:
"""Start a single coding agent for a feature.
@@ -611,8 +615,9 @@ class ParallelOrchestrator:
debug_log.log("TESTING", f"FAILED to spawn testing agent: {e}")
return False, f"Failed to start testing agent: {e}"
# Register process with feature ID (same pattern as coding agents)
self.running_testing_agents[feature_id] = proc
# Register process by PID (not feature_id) to avoid overwrites
# when multiple agents test the same feature
self.running_testing_agents[proc.pid] = (feature_id, proc)
testing_count = len(self.running_testing_agents)
# Start output reader thread with feature ID (same as coding agents)
@@ -795,11 +800,8 @@ class ParallelOrchestrator:
"""
if agent_type == "testing":
with self._lock:
# Remove from dict by finding the feature_id for this proc
for fid, p in list(self.running_testing_agents.items()):
if p is proc:
del self.running_testing_agents[fid]
break
# Remove by PID
self.running_testing_agents.pop(proc.pid, None)
status = "completed" if return_code == 0 else "failed"
print(f"Feature #{feature_id} testing {status}", flush=True)
@@ -898,9 +900,9 @@ class ParallelOrchestrator:
with self._lock:
testing_items = list(self.running_testing_agents.items())
for feature_id, proc in testing_items:
for pid, (feature_id, proc) in testing_items:
result = kill_process_tree(proc, timeout=5.0)
debug_log.log("STOP", f"Killed testing agent for feature #{feature_id} (PID {proc.pid})",
debug_log.log("STOP", f"Killed testing agent for feature #{feature_id} (PID {pid})",
status=result.status, children_found=result.children_found,
children_terminated=result.children_terminated, children_killed=result.children_killed)