Merge remote-tracking branch 'origin/master' into feature/blocked-for-human-input

# Conflicts:
#	server/services/process_manager.py
This commit is contained in:
Auto
2026-02-12 07:36:11 +02:00
49 changed files with 2558 additions and 252 deletions

View File

@@ -194,6 +194,7 @@ class ParallelOrchestrator:
# Legacy alias for backward compatibility
self.running_agents = self.running_coding_agents
self.abort_events: dict[int, threading.Event] = {}
self._testing_session_counter = 0
self.is_running = False
# Track feature failures to prevent infinite retry loops
@@ -212,6 +213,9 @@ class ParallelOrchestrator:
# Signal handlers only set this flag; cleanup happens in the main loop
self._shutdown_requested = False
# Graceful pause (drain mode) flag
self._drain_requested = False
# Session tracking for logging/debugging
self.session_start_time: datetime | None = None
@@ -852,7 +856,7 @@ class ParallelOrchestrator:
"encoding": "utf-8",
"errors": "replace",
"cwd": str(self.project_dir), # Run from project dir so CLI creates .claude/ in project
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
"env": {**os.environ, "PYTHONUNBUFFERED": "1", "NODE_COMPILE_CACHE": "", "PLAYWRIGHT_CLI_SESSION": f"coding-{feature_id}"},
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
@@ -915,7 +919,7 @@ class ParallelOrchestrator:
"encoding": "utf-8",
"errors": "replace",
"cwd": str(self.project_dir), # Run from project dir so CLI creates .claude/ in project
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
"env": {**os.environ, "PYTHONUNBUFFERED": "1", "NODE_COMPILE_CACHE": "", "PLAYWRIGHT_CLI_SESSION": f"coding-{primary_id}"},
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
@@ -1019,8 +1023,9 @@ class ParallelOrchestrator:
"encoding": "utf-8",
"errors": "replace",
"cwd": str(self.project_dir), # Run from project dir so CLI creates .claude/ in project
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
"env": {**os.environ, "PYTHONUNBUFFERED": "1", "NODE_COMPILE_CACHE": "", "PLAYWRIGHT_CLI_SESSION": f"testing-{self._testing_session_counter}"},
}
self._testing_session_counter += 1
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
@@ -1080,7 +1085,7 @@ class ParallelOrchestrator:
"encoding": "utf-8",
"errors": "replace",
"cwd": str(AUTOFORGE_ROOT),
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
"env": {**os.environ, "PYTHONUNBUFFERED": "1", "NODE_COMPILE_CACHE": ""},
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
@@ -1166,6 +1171,19 @@ class ParallelOrchestrator:
debug_log.log("CLEANUP", f"Error killing process tree for {agent_type} agent", error=str(e))
self._on_agent_complete(feature_id, proc.returncode, agent_type, proc)
def _run_inter_session_cleanup(self) -> None:
    """Run lightweight, best-effort cleanup between agent sessions.

    Removes stale temp files and project screenshots to prevent
    disk space accumulation during long overnight runs.

    Each cleanup step is executed independently, so a failure in one step
    does not prevent the remaining step from running. Any exception is
    logged through ``debug_log`` and otherwise ignored; this method never
    raises for an ``Exception`` subclass.
    """
    try:
        # Imported at call time; an import failure is treated like any
        # other cleanup failure (logged, non-fatal).
        from temp_cleanup import cleanup_project_screenshots, cleanup_stale_temp
    except Exception as e:
        debug_log.log("CLEANUP", f"Inter-session cleanup failed (non-fatal): {e}")
        return

    # (callable, positional args) pairs, run in the original order.
    steps = (
        (cleanup_stale_temp, ()),
        (cleanup_project_screenshots, (self.project_dir,)),
    )
    for step, args in steps:
        try:
            step(*args)
        except Exception as e:
            # Keep going: one failed step must not skip the others.
            debug_log.log("CLEANUP", f"Inter-session cleanup failed (non-fatal): {e}")
def _signal_agent_completed(self):
"""Signal that an agent has completed, waking the main loop.
@@ -1241,6 +1259,8 @@ class ParallelOrchestrator:
pid=proc.pid,
feature_id=feature_id,
status=status)
# Run lightweight cleanup between sessions
self._run_inter_session_cleanup()
# Signal main loop that an agent slot is available
self._signal_agent_completed()
return
@@ -1307,6 +1327,8 @@ class ParallelOrchestrator:
else:
print(f"Feature #{feature_id} {status}", flush=True)
# Run lightweight cleanup between sessions
self._run_inter_session_cleanup()
# Signal main loop that an agent slot is available
self._signal_agent_completed()
@@ -1374,6 +1396,9 @@ class ParallelOrchestrator:
# Must happen before any debug_log.log() calls
debug_log.start_session()
# Clear any stale drain signal from a previous session
self._clear_drain_signal()
# Log startup to debug file
debug_log.section("ORCHESTRATOR STARTUP")
debug_log.log("STARTUP", "Orchestrator run_loop starting",
@@ -1495,6 +1520,34 @@ class ParallelOrchestrator:
print("\nAll features complete!", flush=True)
break
# --- Graceful pause (drain mode) ---
if not self._drain_requested and self._check_drain_signal():
self._drain_requested = True
print("Graceful pause requested - draining running agents...", flush=True)
debug_log.log("DRAIN", "Graceful pause requested, draining running agents")
if self._drain_requested:
with self._lock:
coding_count = len(self.running_coding_agents)
testing_count = len(self.running_testing_agents)
if coding_count == 0 and testing_count == 0:
print("All agents drained - paused.", flush=True)
debug_log.log("DRAIN", "All agents drained, entering paused state")
# Wait until signal file is removed (resume) or shutdown
while self._check_drain_signal() and self.is_running and not self._shutdown_requested:
await asyncio.sleep(1)
if not self.is_running or self._shutdown_requested:
break
self._drain_requested = False
print("Resuming from graceful pause...", flush=True)
debug_log.log("DRAIN", "Resuming from graceful pause")
continue
else:
debug_log.log("DRAIN", f"Waiting for agents to finish: coding={coding_count}, testing={testing_count}")
await self._wait_for_agent_completion()
continue
# Maintain testing agents independently (runs every iteration)
self._maintain_testing_agents(feature_dicts)
@@ -1619,6 +1672,17 @@ class ParallelOrchestrator:
"yolo_mode": self.yolo_mode,
}
def _check_drain_signal(self) -> bool:
    """Report whether the graceful pause (drain) signal file is present.

    The path of the signal file is obtained from ``get_pause_drain_path``
    for this orchestrator's ``project_dir``.

    Returns:
        True if the signal file currently exists, False otherwise.
    """
    # Imported lazily at call time.
    from autoforge_paths import get_pause_drain_path

    signal_file = get_pause_drain_path(self.project_dir)
    return signal_file.exists()
def _clear_drain_signal(self) -> None:
    """Remove the drain signal file and clear the in-memory drain flag.

    A missing signal file is not an error (``missing_ok=True``). The
    ``_drain_requested`` flag is reset only after the unlink call returns.
    """
    # Imported lazily at call time.
    from autoforge_paths import get_pause_drain_path

    signal_file = get_pause_drain_path(self.project_dir)
    signal_file.unlink(missing_ok=True)
    self._drain_requested = False
def cleanup(self) -> None:
"""Clean up database resources. Safe to call multiple times.