Mirror of https://github.com/leonvanzyl/autocoder.git (synced 2026-01-30 06:12:06 +00:00)
fix: production readiness fixes for dependency trees and parallel agents
Critical fixes:
- Lock file TOCTOU race condition: Use atomic O_CREAT|O_EXCL for lock creation
- PID reuse vulnerability on Windows: Store PID:CREATE_TIME in lock file to
  detect when a different process has reused the same PID
- WAL mode on network drives: Detect network paths (UNC, mapped drives, NFS,
  CIFS) and fall back to DELETE journal mode to prevent corruption

High priority fixes:
- JSON migration now preserves dependencies field during legacy migration
- Process tree termination on Windows: Use psutil to kill child processes
  recursively to prevent orphaned browser instances
- Retry backoff jitter: Add random 30% jitter to prevent synchronized retries
  under high contention with 5 concurrent agents

Files changed:
- server/services/process_manager.py: Atomic lock creation, PID+create_time
- api/database.py: Network filesystem detection for WAL mode fallback
- api/migration.py: Add dependencies field to JSON migration
- parallel_orchestrator.py: _kill_process_tree helper function
- mcp_server/feature_mcp.py: Add jitter to exponential backoff

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
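For reference, the lock file described above holds a single line; the values below are hypothetical, and the legacy bare-PID format remains accepted:

    48212:1769753526.123    (new format: PID:CREATE_TIME)
    48212                   (legacy format: bare PID)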
api/database.py
@@ -5,6 +5,7 @@ Database Models and Connection
 SQLite database schema for feature storage using SQLAlchemy.
 """
 
+import sys
 from pathlib import Path
 from typing import Optional
 
@@ -112,6 +113,57 @@ def _migrate_add_dependencies_column(engine) -> None:
     conn.commit()
 
 
+def _is_network_path(path: Path) -> bool:
+    """Detect if path is on a network filesystem.
+
+    WAL mode doesn't work reliably on network filesystems (NFS, SMB, CIFS)
+    and can cause database corruption. This function detects common network
+    path patterns so we can fall back to DELETE mode.
+
+    Args:
+        path: The path to check
+
+    Returns:
+        True if the path appears to be on a network filesystem
+    """
+    path_str = str(path.resolve())
+
+    if sys.platform == "win32":
+        # Windows UNC paths: \\server\share or \\?\UNC\server\share
+        if path_str.startswith("\\\\"):
+            return True
+        # Mapped network drives - check if the drive is a network drive
+        try:
+            import ctypes
+            drive = path_str[:2]  # e.g., "Z:"
+            if len(drive) == 2 and drive[1] == ":":
+                # DRIVE_REMOTE = 4
+                drive_type = ctypes.windll.kernel32.GetDriveTypeW(drive + "\\")
+                if drive_type == 4:  # DRIVE_REMOTE
+                    return True
+        except (AttributeError, OSError):
+            pass
+    else:
+        # Unix: Check mount type via /proc/mounts or mount command
+        try:
+            with open("/proc/mounts", "r") as f:
+                mounts = f.read()
+            # Check each mount point to find which one contains our path
+            for line in mounts.splitlines():
+                parts = line.split()
+                if len(parts) >= 3:
+                    mount_point = parts[1]
+                    fs_type = parts[2]
+                    # Check if path is under this mount point and if it's a network FS
+                    if path_str.startswith(mount_point):
+                        if fs_type in ("nfs", "nfs4", "cifs", "smbfs", "fuse.sshfs"):
+                            return True
+        except (FileNotFoundError, PermissionError):
+            pass
+
+    return False
+
+
 def create_database(project_dir: Path) -> tuple:
     """
     Create database and return engine + session maker.
@@ -129,9 +181,13 @@ def create_database(project_dir: Path) -> tuple:
     })
     Base.metadata.create_all(bind=engine)
 
-    # Enable WAL mode for better concurrent read/write performance
+    # Choose journal mode based on filesystem type
+    # WAL mode doesn't work reliably on network filesystems and can cause corruption
+    is_network = _is_network_path(project_dir)
+    journal_mode = "DELETE" if is_network else "WAL"
+
     with engine.connect() as conn:
-        conn.execute(text("PRAGMA journal_mode=WAL"))
+        conn.execute(text(f"PRAGMA journal_mode={journal_mode}"))
         conn.execute(text("PRAGMA busy_timeout=30000"))
         conn.commit()
 
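A minimal usage sketch of the fallback above, assuming api.database is importable; actual results depend on the machine's mounts, so the paths here are hypothetical:

    from pathlib import Path

    from api.database import _is_network_path

    for p in [Path(r"\\fileserver\share\proj"),  # UNC path (Windows)
              Path("/mnt/nfs_share/proj"),       # network mount, if listed in /proc/mounts
              Path("/home/dev/proj")]:           # local disk
        mode = "DELETE" if _is_network_path(p) else "WAL"
        print(f"{p} -> PRAGMA journal_mode={mode}")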
api/migration.py
@@ -83,6 +83,7 @@ def migrate_json_to_sqlite(
             steps=feature_dict.get("steps", []),
             passes=feature_dict.get("passes", False),
             in_progress=feature_dict.get("in_progress", False),
+            dependencies=feature_dict.get("dependencies"),
         )
         session.add(feature)
         imported_count += 1
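Before this change, a legacy record like the hypothetical one below would lose its dependencies list on import; now the field is carried into the Feature row:

    # Hypothetical legacy JSON record from a features file
    feature_dict = {
        "steps": ["write failing test", "implement"],
        "passes": False,
        "in_progress": False,
        "dependencies": [3, 7],  # IDs of features that must pass first
    }
    # Records predating the field yield None, which presumably maps to NULL
    assert feature_dict.get("dependencies") == [3, 7]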
mcp_server/feature_mcp.py
@@ -20,6 +20,7 @@ Tools:
 
 import json
 import os
+import random
 import sys
 import threading
 import time as _time
@@ -313,9 +314,11 @@ def _feature_claim_next_internal(attempt: int = 0) -> str:
         if result.rowcount == 0:
             # Another process claimed it first - retry with backoff
             session.close()
-            # Exponential backoff: 0.1s, 0.2s, 0.4s, ... up to 1.0s
+            # Exponential backoff with jitter: base 0.1s, 0.2s, 0.4s, ... up to 1.0s
+            # Jitter of up to 30% prevents synchronized retries under high contention
             backoff = min(0.1 * (2 ** attempt), 1.0)
-            _time.sleep(backoff)
+            jitter = random.uniform(0, backoff * 0.3)
+            _time.sleep(backoff + jitter)
             return _feature_claim_next_internal(attempt + 1)
 
         # Fetch the claimed feature
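To see the retry schedule this produces, here is a standalone sketch of the same formulas, printing the range each attempt can sleep:

    import random

    for attempt in range(6):
        backoff = min(0.1 * (2 ** attempt), 1.0)  # 0.1, 0.2, 0.4, 0.8, 1.0, 1.0
        max_jitter = backoff * 0.3                # sleep lands in [backoff, backoff * 1.3]
        sleep_for = backoff + random.uniform(0, max_jitter)
        print(f"attempt {attempt}: {backoff:.2f}s to {backoff + max_jitter:.2f}s (sampled {sleep_for:.3f}s)")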
parallel_orchestrator.py
@@ -19,6 +19,8 @@ import time
 from pathlib import Path
 from typing import Callable, Awaitable
 
+import psutil
+
 from api.database import Feature, create_database
 from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores
 
@@ -32,6 +34,59 @@ POLL_INTERVAL = 5 # seconds between checking for ready features
 MAX_FEATURE_RETRIES = 3  # Maximum times to retry a failed feature
 
 
+def _kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> None:
+    """Kill a process and all its child processes.
+
+    On Windows, subprocess.terminate() only kills the immediate process, leaving
+    orphaned child processes (e.g., spawned browser instances). This function
+    uses psutil to kill the entire process tree.
+
+    Args:
+        proc: The subprocess.Popen object to kill
+        timeout: Seconds to wait for graceful termination before force-killing
+    """
+    try:
+        parent = psutil.Process(proc.pid)
+        # Get all children recursively before terminating
+        children = parent.children(recursive=True)
+
+        # Terminate children first (graceful)
+        for child in children:
+            try:
+                child.terminate()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Wait for children to terminate
+        _, still_alive = psutil.wait_procs(children, timeout=timeout)
+
+        # Force kill any remaining children
+        for child in still_alive:
+            try:
+                child.kill()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Now terminate the parent
+        proc.terminate()
+        try:
+            proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+            proc.wait()
+
+    except psutil.NoSuchProcess:
+        # Process already dead, just ensure cleanup
+        try:
+            proc.terminate()
+            proc.wait(timeout=1)
+        except (subprocess.TimeoutExpired, OSError):
+            try:
+                proc.kill()
+            except OSError:
+                pass
+
+
 class ParallelOrchestrator:
     """Orchestrates parallel execution of independent features."""
 
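A minimal smoke test for the helper (a sketch: it assumes psutil is installed and that _kill_process_tree is importable from parallel_orchestrator):

    import subprocess
    import sys

    from parallel_orchestrator import _kill_process_tree

    # A throwaway child; with a plain terminate() on Windows, anything it
    # spawned (e.g., a browser) could survive - the tree kill removes them too
    proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
    _kill_process_tree(proc, timeout=5.0)
    assert proc.poll() is not None  # the whole tree is gone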
@@ -302,7 +357,7 @@ class ParallelOrchestrator:
         print(f"Feature #{feature_id} {status}", flush=True)
 
     def stop_feature(self, feature_id: int) -> tuple[bool, str]:
-        """Stop a running feature agent."""
+        """Stop a running feature agent and all its child processes."""
         with self._lock:
             if feature_id not in self.running_agents:
                 return False, "Feature not running"
@@ -313,11 +368,8 @@ class ParallelOrchestrator:
         if abort:
             abort.set()
         if proc:
-            proc.terminate()
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                proc.kill()
+            # Kill entire process tree to avoid orphaned children (e.g., browser instances)
+            _kill_process_tree(proc, timeout=5.0)
 
         return True, f"Stopped feature {feature_id}"
 
server/services/process_manager.py
@@ -148,16 +148,36 @@ class AgentProcessManager:
         return self.process.pid if self.process else None
 
     def _check_lock(self) -> bool:
-        """Check if another agent is already running for this project."""
+        """Check if another agent is already running for this project.
+
+        Uses PID + process creation time to handle PID reuse on Windows.
+        """
         if not self.lock_file.exists():
             return True
 
         try:
-            pid = int(self.lock_file.read_text().strip())
+            lock_content = self.lock_file.read_text().strip()
+            # Support both legacy format (just PID) and new format (PID:CREATE_TIME)
+            if ":" in lock_content:
+                pid_str, create_time_str = lock_content.split(":", 1)
+                pid = int(pid_str)
+                stored_create_time = float(create_time_str)
+            else:
+                # Legacy format - just PID
+                pid = int(lock_content)
+                stored_create_time = None
+
             if psutil.pid_exists(pid):
                 # Check if it's actually our agent process
                 try:
                     proc = psutil.Process(pid)
+                    # Verify it's the same process using creation time (handles PID reuse)
+                    if stored_create_time is not None:
+                        # Allow 1 second tolerance for creation time comparison
+                        if abs(proc.create_time() - stored_create_time) > 1.0:
+                            # Different process reused the PID - stale lock
+                            self.lock_file.unlink(missing_ok=True)
+                            return True
                     cmdline = " ".join(proc.cmdline())
                     if "autonomous_agent_demo.py" in cmdline:
                         return False  # Another agent is running
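The dual-format parsing above could be read as a small standalone helper (hypothetical, not part of the diff):

    from typing import Optional, Tuple

    def parse_lock_content(content: str) -> Tuple[int, Optional[float]]:
        """Return (pid, create_time); create_time is None for legacy bare-PID locks."""
        if ":" in content:
            pid_str, create_time_str = content.split(":", 1)
            return int(pid_str), float(create_time_str)
        return int(content), None

    assert parse_lock_content("4321:1769753526.12") == (4321, 1769753526.12)
    assert parse_lock_content("4321") == (4321, None)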
@@ -170,11 +190,34 @@ class AgentProcessManager:
             self.lock_file.unlink(missing_ok=True)
             return True
 
-    def _create_lock(self) -> None:
-        """Create lock file with current process PID."""
+    def _create_lock(self) -> bool:
+        """Atomically create lock file with current process PID and creation time.
+
+        Returns:
+            True if lock was created successfully, False if lock already exists.
+        """
         self.lock_file.parent.mkdir(parents=True, exist_ok=True)
-        if self.process:
-            self.lock_file.write_text(str(self.process.pid))
+        if not self.process:
+            return False
+
+        try:
+            # Get process creation time for PID reuse detection
+            create_time = psutil.Process(self.process.pid).create_time()
+            lock_content = f"{self.process.pid}:{create_time}"
+
+            # Atomic lock creation using O_CREAT | O_EXCL
+            # This prevents TOCTOU race conditions
+            import os
+            fd = os.open(str(self.lock_file), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+            os.write(fd, lock_content.encode())
+            os.close(fd)
+            return True
+        except FileExistsError:
+            # Another process beat us to it
+            return False
+        except (psutil.NoSuchProcess, OSError) as e:
+            logger.warning(f"Failed to create lock file: {e}")
+            return False
 
     def _remove_lock(self) -> None:
         """Remove lock file."""
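The O_CREAT | O_EXCL pattern guarantees exactly one winner even when two processes race to create the lock; distilled into a standalone sketch:

    import os

    def try_acquire(lock_path: str, payload: str) -> bool:
        """Atomically create lock_path; True if this caller won the race."""
        try:
            # O_EXCL makes creation fail if the file already exists, so there
            # is no check-then-create window for another process to exploit
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except FileExistsError:
            return False
        try:
            os.write(fd, payload.encode())
        finally:
            os.close(fd)
        return True

    print(try_acquire("demo.lock", "1234:1.0"))  # True on a clean directory
    print(try_acquire("demo.lock", "5678:2.0"))  # False - first caller holds it
    os.remove("demo.lock")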
@@ -305,7 +348,17 @@ class AgentProcessManager:
             cwd=str(self.project_dir),
         )
 
-        self._create_lock()
+        # Atomic lock creation - if it fails, another process beat us
+        if not self._create_lock():
+            # Kill the process we just started since we couldn't get the lock
+            self.process.terminate()
+            try:
+                self.process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                self.process.kill()
+            self.process = None
+            return False, "Another agent instance is already running for this project"
+
         self.started_at = datetime.now()
         self.status = "running"
 
@@ -511,13 +564,29 @@ def cleanup_orphaned_locks() -> int:
             continue
 
         try:
-            pid_str = lock_file.read_text().strip()
-            pid = int(pid_str)
+            lock_content = lock_file.read_text().strip()
+            # Support both legacy format (just PID) and new format (PID:CREATE_TIME)
+            if ":" in lock_content:
+                pid_str, create_time_str = lock_content.split(":", 1)
+                pid = int(pid_str)
+                stored_create_time = float(create_time_str)
+            else:
+                # Legacy format - just PID
+                pid = int(lock_content)
+                stored_create_time = None
+
             # Check if process is still running
             if psutil.pid_exists(pid):
                 try:
                     proc = psutil.Process(pid)
+                    # Verify it's the same process using creation time (handles PID reuse)
+                    if stored_create_time is not None:
+                        if abs(proc.create_time() - stored_create_time) > 1.0:
+                            # Different process reused the PID - stale lock
+                            lock_file.unlink(missing_ok=True)
+                            cleaned += 1
+                            logger.info("Removed orphaned lock file for project '%s' (PID reused)", name)
+                            continue
                     cmdline = " ".join(proc.cmdline())
                     if "autonomous_agent_demo.py" in cmdline:
                         # Process is still running, don't remove