fix: production readiness fixes for dependency trees and parallel agents

Critical fixes:
- Lock file TOCTOU race condition: Use atomic O_CREAT|O_EXCL for lock creation
  (a minimal sketch of the pattern follows these lists)
- PID reuse vulnerability on Windows: Store PID:CREATE_TIME in the lock file to
  detect when a different process has reused the same PID
- WAL mode on network drives: Detect network paths (UNC, mapped drives, NFS,
  CIFS) and fall back to DELETE journal mode to prevent corruption

High priority fixes:
- JSON migration now preserves the dependencies field when importing legacy
  feature files
- Process tree termination on Windows: Use psutil to kill child processes
  recursively to prevent orphaned browser instances
- Retry backoff jitter: Add up to 30% random jitter to prevent synchronized
  retries under high contention with 5 concurrent agents
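
As a minimal standalone sketch of the atomic-creation pattern from the first
critical fix (file path and lock contents here are illustrative, not the
project's real values):

import os
import tempfile

lock_path = os.path.join(tempfile.gettempdir(), "example-agent.lock")  # hypothetical path
try:
    # O_CREAT|O_EXCL makes "create only if absent" a single atomic syscall,
    # so no other process can slip in between the existence check and the create.
    fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    os.write(fd, b"12345:1705495527.0")  # illustrative PID:CREATE_TIME payload
    os.close(fd)
    print("lock acquired")
except FileExistsError:
    print("another process holds the lock")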

Files changed:
- server/services/process_manager.py: Atomic lock creation, PID+create_time
- api/database.py: Network filesystem detection for WAL mode fallback
- api/migration.py: Add dependencies field to JSON migration
- parallel_orchestrator.py: _kill_process_tree helper function
- mcp_server/feature_mcp.py: Add jitter to exponential backoff

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 126151dccd (parent 92450a0029)
Author: Auto
Date: 2026-01-17 14:45:27 +02:00
5 changed files with 200 additions and 19 deletions

api/database.py

@@ -5,6 +5,7 @@ Database Models and Connection
 SQLite database schema for feature storage using SQLAlchemy.
 """
+import sys
 from pathlib import Path
 from typing import Optional

@@ -112,6 +113,57 @@ def _migrate_add_dependencies_column(engine) -> None:
         conn.commit()
+
+
+def _is_network_path(path: Path) -> bool:
+    """Detect if path is on a network filesystem.
+
+    WAL mode doesn't work reliably on network filesystems (NFS, SMB, CIFS)
+    and can cause database corruption. This function detects common network
+    path patterns so we can fall back to DELETE mode.
+
+    Args:
+        path: The path to check
+
+    Returns:
+        True if the path appears to be on a network filesystem
+    """
+    path_str = str(path.resolve())
+
+    if sys.platform == "win32":
+        # Windows UNC paths: \\server\share or \\?\UNC\server\share
+        if path_str.startswith("\\\\"):
+            return True
+        # Mapped network drives - check if the drive is a network drive
+        try:
+            import ctypes
+            drive = path_str[:2]  # e.g., "Z:"
+            if len(drive) == 2 and drive[1] == ":":
+                # DRIVE_REMOTE = 4
+                drive_type = ctypes.windll.kernel32.GetDriveTypeW(drive + "\\")
+                if drive_type == 4:  # DRIVE_REMOTE
+                    return True
+        except (AttributeError, OSError):
+            pass
+    else:
+        # Unix: Check mount type via /proc/mounts
+        try:
+            with open("/proc/mounts", "r") as f:
+                mounts = f.read()
+            # Check each mount point to find which one contains our path
+            for line in mounts.splitlines():
+                parts = line.split()
+                if len(parts) >= 3:
+                    mount_point = parts[1]
+                    fs_type = parts[2]
+                    # Check if path is under this mount point and if it's a network FS
+                    if path_str.startswith(mount_point):
+                        if fs_type in ("nfs", "nfs4", "cifs", "smbfs", "fuse.sshfs"):
+                            return True
+        except (FileNotFoundError, PermissionError):
+            pass
+
+    return False
+
+
 def create_database(project_dir: Path) -> tuple:
     """
     Create database and return engine + session maker.

@@ -129,9 +181,13 @@ def create_database(project_dir: Path) -> tuple:
     })
     Base.metadata.create_all(bind=engine)

-    # Enable WAL mode for better concurrent read/write performance
+    # Choose journal mode based on filesystem type
+    # WAL mode doesn't work reliably on network filesystems and can cause corruption
+    is_network = _is_network_path(project_dir)
+    journal_mode = "DELETE" if is_network else "WAL"
+
     with engine.connect() as conn:
-        conn.execute(text("PRAGMA journal_mode=WAL"))
+        conn.execute(text(f"PRAGMA journal_mode={journal_mode}"))
         conn.execute(text("PRAGMA busy_timeout=30000"))
         conn.commit()
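
A quick way to see which mode the detection picks, assuming the private
helper _is_network_path is importable from api.database (the sample paths
are illustrative):

from pathlib import Path

from api.database import _is_network_path  # private helper; import shown for illustration

for p in (Path(r"\\fileserver\share\project"), Path("/home/user/project")):
    journal_mode = "DELETE" if _is_network_path(p) else "WAL"
    print(f"{p} -> PRAGMA journal_mode={journal_mode}")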

api/migration.py

@@ -83,6 +83,7 @@ def migrate_json_to_sqlite(
             steps=feature_dict.get("steps", []),
             passes=feature_dict.get("passes", False),
             in_progress=feature_dict.get("in_progress", False),
+            dependencies=feature_dict.get("dependencies"),
         )
         session.add(feature)
         imported_count += 1
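
The effect on legacy records is plain dict.get semantics: feature dicts
written before the field existed migrate with dependencies=None, while newer
ones keep their list. A tiny illustration:

legacy = {"steps": [], "passes": False}                          # pre-dependencies JSON
current = {"steps": [], "passes": True, "dependencies": [3, 7]}

print(legacy.get("dependencies"))    # None - column stays empty after migration
print(current.get("dependencies"))   # [3, 7] - preserved instead of silently dropped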

mcp_server/feature_mcp.py

@@ -20,6 +20,7 @@ Tools:
 import json
 import os
+import random
 import sys
 import threading
 import time as _time

@@ -313,9 +314,11 @@ def _feature_claim_next_internal(attempt: int = 0) -> str:
         if result.rowcount == 0:
             # Another process claimed it first - retry with backoff
             session.close()
-            # Exponential backoff: 0.1s, 0.2s, 0.4s, ... up to 1.0s
+            # Exponential backoff with jitter: base 0.1s, 0.2s, 0.4s, ... up to 1.0s
+            # Jitter of up to 30% prevents synchronized retries under high contention
            backoff = min(0.1 * (2 ** attempt), 1.0)
-            _time.sleep(backoff)
+            jitter = random.uniform(0, backoff * 0.3)
+            _time.sleep(backoff + jitter)
             return _feature_claim_next_internal(attempt + 1)

         # Fetch the claimed feature
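
To sanity-check the retry envelope, the same arithmetic can be run standalone;
each attempt sleeps somewhere in [backoff, 1.3 * backoff]:

import random

for attempt in range(5):
    backoff = min(0.1 * (2 ** attempt), 1.0)
    jitter = random.uniform(0, backoff * 0.3)
    print(f"attempt {attempt}: {backoff:.2f}s base, sampled sleep {backoff + jitter:.3f}s")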

parallel_orchestrator.py

@@ -19,6 +19,8 @@ import time
 from pathlib import Path
 from typing import Callable, Awaitable

+import psutil
+
 from api.database import Feature, create_database
 from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores

@@ -32,6 +34,59 @@ POLL_INTERVAL = 5  # seconds between checking for ready features
 MAX_FEATURE_RETRIES = 3  # Maximum times to retry a failed feature
+
+
+def _kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> None:
+    """Kill a process and all its child processes.
+
+    On Windows, subprocess.terminate() only kills the immediate process, leaving
+    orphaned child processes (e.g., spawned browser instances). This function
+    uses psutil to kill the entire process tree.
+
+    Args:
+        proc: The subprocess.Popen object to kill
+        timeout: Seconds to wait for graceful termination before force-killing
+    """
+    try:
+        parent = psutil.Process(proc.pid)
+        # Get all children recursively before terminating
+        children = parent.children(recursive=True)
+
+        # Terminate children first (graceful)
+        for child in children:
+            try:
+                child.terminate()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Wait for children to terminate
+        _, still_alive = psutil.wait_procs(children, timeout=timeout)
+
+        # Force kill any remaining children
+        for child in still_alive:
+            try:
+                child.kill()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Now terminate the parent
+        proc.terminate()
+        try:
+            proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+            proc.wait()
+    except psutil.NoSuchProcess:
+        # Process already dead, just ensure cleanup
+        try:
+            proc.terminate()
+            proc.wait(timeout=1)
+        except (subprocess.TimeoutExpired, OSError):
+            try:
+                proc.kill()
+            except OSError:
+                pass
+
+
 class ParallelOrchestrator:
     """Orchestrates parallel execution of independent features."""

@@ -302,7 +357,7 @@ class ParallelOrchestrator:
             print(f"Feature #{feature_id} {status}", flush=True)

     def stop_feature(self, feature_id: int) -> tuple[bool, str]:
-        """Stop a running feature agent."""
+        """Stop a running feature agent and all its child processes."""
         with self._lock:
             if feature_id not in self.running_agents:
                 return False, "Feature not running"

@@ -313,11 +368,8 @@ class ParallelOrchestrator:
         if abort:
             abort.set()
         if proc:
-            proc.terminate()
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                proc.kill()
+            # Kill entire process tree to avoid orphaned children (e.g., browser instances)
+            _kill_process_tree(proc, timeout=5.0)

         return True, f"Stopped feature {feature_id}"
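
A minimal exercise of the helper, assuming _kill_process_tree can be imported
from parallel_orchestrator and psutil is installed:

import subprocess
import sys

from parallel_orchestrator import _kill_process_tree

# Spawn a throwaway child that would otherwise outlive a plain terminate().
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
_kill_process_tree(proc, timeout=5.0)
assert proc.poll() is not None  # parent is gone; any grandchildren were reaped first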

server/services/process_manager.py

@@ -148,16 +148,36 @@ class AgentProcessManager:
         return self.process.pid if self.process else None

     def _check_lock(self) -> bool:
-        """Check if another agent is already running for this project."""
+        """Check if another agent is already running for this project.
+
+        Uses PID + process creation time to handle PID reuse on Windows.
+        """
         if not self.lock_file.exists():
             return True
         try:
-            pid = int(self.lock_file.read_text().strip())
+            lock_content = self.lock_file.read_text().strip()
+            # Support both legacy format (just PID) and new format (PID:CREATE_TIME)
+            if ":" in lock_content:
+                pid_str, create_time_str = lock_content.split(":", 1)
+                pid = int(pid_str)
+                stored_create_time = float(create_time_str)
+            else:
+                # Legacy format - just PID
+                pid = int(lock_content)
+                stored_create_time = None
+
             if psutil.pid_exists(pid):
                 # Check if it's actually our agent process
                 try:
                     proc = psutil.Process(pid)
+                    # Verify it's the same process using creation time (handles PID reuse)
+                    if stored_create_time is not None:
+                        # Allow 1 second tolerance for creation time comparison
+                        if abs(proc.create_time() - stored_create_time) > 1.0:
+                            # Different process reused the PID - stale lock
+                            self.lock_file.unlink(missing_ok=True)
+                            return True
                     cmdline = " ".join(proc.cmdline())
                     if "autonomous_agent_demo.py" in cmdline:
                         return False  # Another agent is running

@@ -170,11 +190,34 @@ class AgentProcessManager:
         self.lock_file.unlink(missing_ok=True)
         return True

-    def _create_lock(self) -> None:
-        """Create lock file with current process PID."""
+    def _create_lock(self) -> bool:
+        """Atomically create lock file with current process PID and creation time.
+
+        Returns:
+            True if lock was created successfully, False if lock already exists.
+        """
         self.lock_file.parent.mkdir(parents=True, exist_ok=True)
-        if self.process:
-            self.lock_file.write_text(str(self.process.pid))
+        if not self.process:
+            return False
+        try:
+            # Get process creation time for PID reuse detection
+            create_time = psutil.Process(self.process.pid).create_time()
+            lock_content = f"{self.process.pid}:{create_time}"
+            # Atomic lock creation using O_CREAT | O_EXCL
+            # This prevents TOCTOU race conditions
+            import os
+            fd = os.open(str(self.lock_file), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+            os.write(fd, lock_content.encode())
+            os.close(fd)
+            return True
+        except FileExistsError:
+            # Another process beat us to it
+            return False
+        except (psutil.NoSuchProcess, OSError) as e:
+            logger.warning(f"Failed to create lock file: {e}")
+            return False

     def _remove_lock(self) -> None:
         """Remove lock file."""

@@ -305,7 +348,17 @@ class AgentProcessManager:
             cwd=str(self.project_dir),
         )

-        self._create_lock()
+        # Atomic lock creation - if it fails, another process beat us
+        if not self._create_lock():
+            # Kill the process we just started since we couldn't get the lock
+            self.process.terminate()
+            try:
+                self.process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                self.process.kill()
+            self.process = None
+            return False, "Another agent instance is already running for this project"
+
         self.started_at = datetime.now()
         self.status = "running"

@@ -511,13 +564,29 @@ def cleanup_orphaned_locks() -> int:
             continue

         try:
-            pid_str = lock_file.read_text().strip()
-            pid = int(pid_str)
+            lock_content = lock_file.read_text().strip()
+            # Support both legacy format (just PID) and new format (PID:CREATE_TIME)
+            if ":" in lock_content:
+                pid_str, create_time_str = lock_content.split(":", 1)
+                pid = int(pid_str)
+                stored_create_time = float(create_time_str)
+            else:
+                # Legacy format - just PID
+                pid = int(lock_content)
+                stored_create_time = None
+
             # Check if process is still running
             if psutil.pid_exists(pid):
                 try:
                     proc = psutil.Process(pid)
+                    # Verify it's the same process using creation time (handles PID reuse)
+                    if stored_create_time is not None:
+                        if abs(proc.create_time() - stored_create_time) > 1.0:
+                            # Different process reused the PID - stale lock
+                            lock_file.unlink(missing_ok=True)
+                            cleaned += 1
+                            logger.info("Removed orphaned lock file for project '%s' (PID reused)", name)
+                            continue
                     cmdline = " ".join(proc.cmdline())
                     if "autonomous_agent_demo.py" in cmdline:
                         # Process is still running, don't remove
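
The PID-reuse defense can be demonstrated in isolation with psutil: record PID
plus creation time, then treat any creation-time mismatch beyond the 1-second
tolerance as a different process (names here are illustrative):

import os

import psutil

me = psutil.Process(os.getpid())
lock_content = f"{me.pid}:{me.create_time()}"  # same format the lock file stores

pid_str, create_time_str = lock_content.split(":", 1)
candidate = psutil.Process(int(pid_str))
is_same_process = abs(candidate.create_time() - float(create_time_str)) <= 1.0
print("lock still owned by original process:", is_same_process)  # True here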