mirror of
https://github.com/leonvanzyl/autocoder.git
synced 2026-01-30 06:12:06 +00:00
fix: production readiness fixes for dependency trees and parallel agents
Critical fixes:
- Lock file TOCTOU race condition: Use atomic O_CREAT|O_EXCL for lock creation
- PID reuse vulnerability on Windows: Store PID:CREATE_TIME in lock file to detect when a different process has reused the same PID
- WAL mode on network drives: Detect network paths (UNC, mapped drives, NFS, CIFS) and fall back to DELETE journal mode to prevent corruption

High priority fixes:
- JSON migration now preserves dependencies field during legacy migration
- Process tree termination on Windows: Use psutil to kill child processes recursively to prevent orphaned browser instances
- Retry backoff jitter: Add random 30% jitter to prevent synchronized retries under high contention with 5 concurrent agents

Files changed:
- server/services/process_manager.py: Atomic lock creation, PID+create_time
- api/database.py: Network filesystem detection for WAL mode fallback
- api/migration.py: Add dependencies field to JSON migration
- parallel_orchestrator.py: _kill_process_tree helper function
- mcp_server/feature_mcp.py: Add jitter to exponential backoff

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,8 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import Callable, Awaitable
|
||||
|
||||
import psutil
|
||||
|
||||
from api.database import Feature, create_database
|
||||
from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores
|
||||
|
||||
@@ -32,6 +34,59 @@ POLL_INTERVAL = 5 # seconds between checking for ready features
|
||||
MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature
|
||||
|
||||
|
||||
def _kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> None:
    """Kill a process and all its child processes.

    On Windows, subprocess.terminate() only kills the immediate process, leaving
    orphaned child processes (e.g., spawned browser instances). This function
    uses psutil to kill the entire process tree.

    Cleanup is best-effort: a child that has already exited or that we are not
    permitted to signal is skipped, and the parent is always terminated
    afterwards regardless of what happened to the children.

    Args:
        proc: The subprocess.Popen object to kill
        timeout: Seconds to wait for graceful termination before force-killing
    """
    try:
        # Snapshot all descendants before signalling anything; once the parent
        # is gone the parent/child relationship can no longer be walked.
        children = psutil.Process(proc.pid).children(recursive=True)
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # Parent already dead (or not inspectable): nothing to enumerate, but
        # the Popen object below still needs to be terminated and reaped.
        children = []

    # Terminate children first (graceful).
    for child in children:
        try:
            child.terminate()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # One unkillable/vanished child must not abort the whole cleanup.
            pass

    # Wait for children to terminate, then force kill any that remain.
    _, still_alive = psutil.wait_procs(children, timeout=timeout)
    for child in still_alive:
        try:
            child.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # Now terminate the parent: graceful first, force kill on timeout or error.
    try:
        proc.terminate()
        proc.wait(timeout=timeout)
    except (subprocess.TimeoutExpired, OSError):
        try:
            proc.kill()
            proc.wait()
        except OSError:
            # Process disappeared between the two attempts; nothing left to do.
            pass
|
||||
|
||||
|
||||
class ParallelOrchestrator:
|
||||
"""Orchestrates parallel execution of independent features."""
|
||||
|
||||
@@ -302,7 +357,7 @@ class ParallelOrchestrator:
|
||||
print(f"Feature #{feature_id} {status}", flush=True)
|
||||
|
||||
def stop_feature(self, feature_id: int) -> tuple[bool, str]:
|
||||
"""Stop a running feature agent."""
|
||||
"""Stop a running feature agent and all its child processes."""
|
||||
with self._lock:
|
||||
if feature_id not in self.running_agents:
|
||||
return False, "Feature not running"
|
||||
@@ -313,11 +368,8 @@ class ParallelOrchestrator:
|
||||
if abort:
|
||||
abort.set()
|
||||
if proc:
|
||||
proc.terminate()
|
||||
try:
|
||||
proc.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
proc.kill()
|
||||
# Kill entire process tree to avoid orphaned children (e.g., browser instances)
|
||||
_kill_process_tree(proc, timeout=5.0)
|
||||
|
||||
return True, f"Stopped feature {feature_id}"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user