fix: production readiness fixes for dependency trees and parallel agents

Critical fixes:
- Lock file TOCTOU race condition: Use atomic O_CREAT|O_EXCL for lock creation
  (a minimal sketch of the pattern follows these lists)
- PID reuse vulnerability on Windows: Store PID:CREATE_TIME in the lock file to
  detect when a different process has reused the same PID
- WAL mode on network drives: Detect network paths (UNC, mapped drives, NFS,
  CIFS) and fall back to DELETE journal mode to prevent corruption

High priority fixes:
- JSON migration now preserves the dependencies field when importing legacy
  feature files
- Process tree termination on Windows: Use psutil to kill child processes
  recursively to prevent orphaned browser instances
- Retry backoff jitter: Add up to 30% random jitter to prevent synchronized
  retries under high contention with 5 concurrent agents
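
As a minimal standalone sketch of the atomic-creation pattern from the first
critical fix (file path and lock contents here are illustrative, not the
project's real values):

import os
import tempfile

lock_path = os.path.join(tempfile.gettempdir(), "example-agent.lock")  # hypothetical path
try:
    # O_CREAT|O_EXCL makes "create only if absent" a single atomic syscall,
    # so no other process can slip in between the existence check and the create.
    fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    os.write(fd, b"12345:1705495527.0")  # illustrative PID:CREATE_TIME payload
    os.close(fd)
    print("lock acquired")
except FileExistsError:
    print("another process holds the lock")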

Files changed:
- server/services/process_manager.py: Atomic lock creation, PID+create_time
- api/database.py: Network filesystem detection for WAL mode fallback
- api/migration.py: Add dependencies field to JSON migration
- parallel_orchestrator.py: _kill_process_tree helper function
- mcp_server/feature_mcp.py: Add jitter to exponential backoff

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 126151dccd (parent 92450a0029)
Author: Auto
Date: 2026-01-17 14:45:27 +02:00
5 changed files with 200 additions and 19 deletions

api/database.py

@@ -5,6 +5,7 @@ Database Models and Connection
 SQLite database schema for feature storage using SQLAlchemy.
 """
+import sys
 from pathlib import Path
 from typing import Optional

@@ -112,6 +113,57 @@ def _migrate_add_dependencies_column(engine) -> None:
         conn.commit()
+
+
+def _is_network_path(path: Path) -> bool:
+    """Detect if path is on a network filesystem.
+
+    WAL mode doesn't work reliably on network filesystems (NFS, SMB, CIFS)
+    and can cause database corruption. This function detects common network
+    path patterns so we can fall back to DELETE mode.
+
+    Args:
+        path: The path to check
+
+    Returns:
+        True if the path appears to be on a network filesystem
+    """
+    path_str = str(path.resolve())
+
+    if sys.platform == "win32":
+        # Windows UNC paths: \\server\share or \\?\UNC\server\share
+        if path_str.startswith("\\\\"):
+            return True
+        # Mapped network drives - check if the drive is a network drive
+        try:
+            import ctypes
+            drive = path_str[:2]  # e.g., "Z:"
+            if len(drive) == 2 and drive[1] == ":":
+                # DRIVE_REMOTE = 4
+                drive_type = ctypes.windll.kernel32.GetDriveTypeW(drive + "\\")
+                if drive_type == 4:  # DRIVE_REMOTE
+                    return True
+        except (AttributeError, OSError):
+            pass
+    else:
+        # Unix: Check mount type via /proc/mounts
+        try:
+            with open("/proc/mounts", "r") as f:
+                mounts = f.read()
+            # Check each mount point to find which one contains our path
+            for line in mounts.splitlines():
+                parts = line.split()
+                if len(parts) >= 3:
+                    mount_point = parts[1]
+                    fs_type = parts[2]
+                    # Check if path is under this mount point and if it's a network FS
+                    if path_str.startswith(mount_point):
+                        if fs_type in ("nfs", "nfs4", "cifs", "smbfs", "fuse.sshfs"):
+                            return True
+        except (FileNotFoundError, PermissionError):
+            pass
+
+    return False
+
+
 def create_database(project_dir: Path) -> tuple:
     """
     Create database and return engine + session maker.

@@ -129,9 +181,13 @@ def create_database(project_dir: Path) -> tuple:
     })
     Base.metadata.create_all(bind=engine)

-    # Enable WAL mode for better concurrent read/write performance
+    # Choose journal mode based on filesystem type
+    # WAL mode doesn't work reliably on network filesystems and can cause corruption
+    is_network = _is_network_path(project_dir)
+    journal_mode = "DELETE" if is_network else "WAL"
+
     with engine.connect() as conn:
-        conn.execute(text("PRAGMA journal_mode=WAL"))
+        conn.execute(text(f"PRAGMA journal_mode={journal_mode}"))
         conn.execute(text("PRAGMA busy_timeout=30000"))
         conn.commit()
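
A quick way to see which mode the detection picks, assuming the private
helper _is_network_path is importable from api.database (the sample paths
are illustrative):

from pathlib import Path

from api.database import _is_network_path  # private helper; import shown for illustration

for p in (Path(r"\\fileserver\share\project"), Path("/home/user/project")):
    journal_mode = "DELETE" if _is_network_path(p) else "WAL"
    print(f"{p} -> PRAGMA journal_mode={journal_mode}")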

api/migration.py

@@ -83,6 +83,7 @@ def migrate_json_to_sqlite(
             steps=feature_dict.get("steps", []),
             passes=feature_dict.get("passes", False),
             in_progress=feature_dict.get("in_progress", False),
+            dependencies=feature_dict.get("dependencies"),
         )
         session.add(feature)
         imported_count += 1
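
The effect on legacy records is plain dict.get semantics: feature dicts
written before the field existed migrate with dependencies=None, while newer
ones keep their list. A tiny illustration:

legacy = {"steps": [], "passes": False}                          # pre-dependencies JSON
current = {"steps": [], "passes": True, "dependencies": [3, 7]}

print(legacy.get("dependencies"))    # None - column stays empty after migration
print(current.get("dependencies"))   # [3, 7] - preserved instead of silently dropped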

mcp_server/feature_mcp.py

@@ -20,6 +20,7 @@ Tools:
 import json
 import os
+import random
 import sys
 import threading
 import time as _time

@@ -313,9 +314,11 @@ def _feature_claim_next_internal(attempt: int = 0) -> str:
         if result.rowcount == 0:
             # Another process claimed it first - retry with backoff
             session.close()
-            # Exponential backoff: 0.1s, 0.2s, 0.4s, ... up to 1.0s
+            # Exponential backoff with jitter: base 0.1s, 0.2s, 0.4s, ... up to 1.0s
+            # Jitter of up to 30% prevents synchronized retries under high contention
            backoff = min(0.1 * (2 ** attempt), 1.0)
-            _time.sleep(backoff)
+            jitter = random.uniform(0, backoff * 0.3)
+            _time.sleep(backoff + jitter)
             return _feature_claim_next_internal(attempt + 1)

         # Fetch the claimed feature
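
To sanity-check the retry envelope, the same arithmetic can be run standalone;
each attempt sleeps somewhere in [backoff, 1.3 * backoff]:

import random

for attempt in range(5):
    backoff = min(0.1 * (2 ** attempt), 1.0)
    jitter = random.uniform(0, backoff * 0.3)
    print(f"attempt {attempt}: {backoff:.2f}s base, sampled sleep {backoff + jitter:.3f}s")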

parallel_orchestrator.py

@@ -19,6 +19,8 @@ import time
 from pathlib import Path
 from typing import Callable, Awaitable

+import psutil
+
 from api.database import Feature, create_database
 from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores

@@ -32,6 +34,59 @@ POLL_INTERVAL = 5  # seconds between checking for ready features
 MAX_FEATURE_RETRIES = 3  # Maximum times to retry a failed feature
+
+
+def _kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> None:
+    """Kill a process and all its child processes.
+
+    On Windows, subprocess.terminate() only kills the immediate process, leaving
+    orphaned child processes (e.g., spawned browser instances). This function
+    uses psutil to kill the entire process tree.
+
+    Args:
+        proc: The subprocess.Popen object to kill
+        timeout: Seconds to wait for graceful termination before force-killing
+    """
+    try:
+        parent = psutil.Process(proc.pid)
+        # Get all children recursively before terminating
+        children = parent.children(recursive=True)
+
+        # Terminate children first (graceful)
+        for child in children:
+            try:
+                child.terminate()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Wait for children to terminate
+        _, still_alive = psutil.wait_procs(children, timeout=timeout)
+
+        # Force kill any remaining children
+        for child in still_alive:
+            try:
+                child.kill()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Now terminate the parent
+        proc.terminate()
+        try:
+            proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+            proc.wait()
+    except psutil.NoSuchProcess:
+        # Process already dead, just ensure cleanup
+        try:
+            proc.terminate()
+            proc.wait(timeout=1)
+        except (subprocess.TimeoutExpired, OSError):
+            try:
+                proc.kill()
+            except OSError:
+                pass
+
+
 class ParallelOrchestrator:
     """Orchestrates parallel execution of independent features."""

@@ -302,7 +357,7 @@ class ParallelOrchestrator:
             print(f"Feature #{feature_id} {status}", flush=True)

     def stop_feature(self, feature_id: int) -> tuple[bool, str]:
-        """Stop a running feature agent."""
+        """Stop a running feature agent and all its child processes."""
         with self._lock:
             if feature_id not in self.running_agents:
                 return False, "Feature not running"

@@ -313,11 +368,8 @@ class ParallelOrchestrator:
         if abort:
             abort.set()
         if proc:
-            proc.terminate()
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                proc.kill()
+            # Kill entire process tree to avoid orphaned children (e.g., browser instances)
+            _kill_process_tree(proc, timeout=5.0)

         return True, f"Stopped feature {feature_id}"
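
A minimal exercise of the helper, assuming _kill_process_tree can be imported
from parallel_orchestrator and psutil is installed:

import subprocess
import sys

from parallel_orchestrator import _kill_process_tree

# Spawn a throwaway child that would otherwise outlive a plain terminate().
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
_kill_process_tree(proc, timeout=5.0)
assert proc.poll() is not None  # parent is gone; any grandchildren were reaped first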

server/services/process_manager.py

@@ -148,16 +148,36 @@ class AgentProcessManager:
         return self.process.pid if self.process else None

     def _check_lock(self) -> bool:
-        """Check if another agent is already running for this project."""
+        """Check if another agent is already running for this project.
+
+        Uses PID + process creation time to handle PID reuse on Windows.
+        """
         if not self.lock_file.exists():
             return True
         try:
-            pid = int(self.lock_file.read_text().strip())
+            lock_content = self.lock_file.read_text().strip()
+            # Support both legacy format (just PID) and new format (PID:CREATE_TIME)
+            if ":" in lock_content:
+                pid_str, create_time_str = lock_content.split(":", 1)
+                pid = int(pid_str)
+                stored_create_time = float(create_time_str)
+            else:
+                # Legacy format - just PID
+                pid = int(lock_content)
+                stored_create_time = None
+
             if psutil.pid_exists(pid):
                 # Check if it's actually our agent process
                 try:
                     proc = psutil.Process(pid)
+                    # Verify it's the same process using creation time (handles PID reuse)
+                    if stored_create_time is not None:
+                        # Allow 1 second tolerance for creation time comparison
+                        if abs(proc.create_time() - stored_create_time) > 1.0:
+                            # Different process reused the PID - stale lock
+                            self.lock_file.unlink(missing_ok=True)
+                            return True
                     cmdline = " ".join(proc.cmdline())
                     if "autonomous_agent_demo.py" in cmdline:
                         return False  # Another agent is running

@@ -170,11 +190,34 @@ class AgentProcessManager:
         self.lock_file.unlink(missing_ok=True)
         return True

-    def _create_lock(self) -> None:
-        """Create lock file with current process PID."""
+    def _create_lock(self) -> bool:
+        """Atomically create lock file with current process PID and creation time.
+
+        Returns:
+            True if lock was created successfully, False if lock already exists.
+        """
         self.lock_file.parent.mkdir(parents=True, exist_ok=True)
-        if self.process:
-            self.lock_file.write_text(str(self.process.pid))
+        if not self.process:
+            return False
+        try:
+            # Get process creation time for PID reuse detection
+            create_time = psutil.Process(self.process.pid).create_time()
+            lock_content = f"{self.process.pid}:{create_time}"
+            # Atomic lock creation using O_CREAT | O_EXCL
+            # This prevents TOCTOU race conditions
+            import os
+            fd = os.open(str(self.lock_file), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+            os.write(fd, lock_content.encode())
+            os.close(fd)
+            return True
+        except FileExistsError:
+            # Another process beat us to it
+            return False
+        except (psutil.NoSuchProcess, OSError) as e:
+            logger.warning(f"Failed to create lock file: {e}")
+            return False

     def _remove_lock(self) -> None:
         """Remove lock file."""

@@ -305,7 +348,17 @@ class AgentProcessManager:
             cwd=str(self.project_dir),
         )

-        self._create_lock()
+        # Atomic lock creation - if it fails, another process beat us
+        if not self._create_lock():
+            # Kill the process we just started since we couldn't get the lock
+            self.process.terminate()
+            try:
+                self.process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                self.process.kill()
+            self.process = None
+            return False, "Another agent instance is already running for this project"
+
         self.started_at = datetime.now()
         self.status = "running"

@@ -511,13 +564,29 @@ def cleanup_orphaned_locks() -> int:
             continue

         try:
-            pid_str = lock_file.read_text().strip()
-            pid = int(pid_str)
+            lock_content = lock_file.read_text().strip()
+            # Support both legacy format (just PID) and new format (PID:CREATE_TIME)
+            if ":" in lock_content:
+                pid_str, create_time_str = lock_content.split(":", 1)
+                pid = int(pid_str)
+                stored_create_time = float(create_time_str)
+            else:
+                # Legacy format - just PID
+                pid = int(lock_content)
+                stored_create_time = None
+
             # Check if process is still running
             if psutil.pid_exists(pid):
                 try:
                     proc = psutil.Process(pid)
+                    # Verify it's the same process using creation time (handles PID reuse)
+                    if stored_create_time is not None:
+                        if abs(proc.create_time() - stored_create_time) > 1.0:
+                            # Different process reused the PID - stale lock
+                            lock_file.unlink(missing_ok=True)
+                            cleaned += 1
+                            logger.info("Removed orphaned lock file for project '%s' (PID reused)", name)
+                            continue
                     cmdline = " ".join(proc.cmdline())
                     if "autonomous_agent_demo.py" in cmdline:
                         # Process is still running, don't remove
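
The PID-reuse defense can be demonstrated in isolation with psutil: record PID
plus creation time, then treat any creation-time mismatch beyond the 1-second
tolerance as a different process (names here are illustrative):

import os

import psutil

me = psutil.Process(os.getpid())
lock_content = f"{me.pid}:{me.create_time()}"  # same format the lock file stores

pid_str, create_time_str = lock_content.split(":", 1)
candidate = psutil.Process(int(pid_str))
is_same_process = abs(candidate.create_time() - float(create_time_str)) <= 1.0
print("lock still owned by original process:", is_same_process)  # True here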