feat: decouple regression testing agents from coding agents

Major refactoring of the parallel orchestrator to run regression testing
agents independently from coding agents. Decoupling the two lifecycles
removes races in testing-agent spawning and puts testing concurrency under
direct, explicit control.

Key changes:

Database & MCP Layer:
- Add testing_in_progress and last_tested_at columns to Feature model
- Add feature_claim_for_testing() for atomic test claim with retry
- Add feature_release_testing() to release claims after testing
- Refactor claim functions to iterative loops (no recursion)
- Add OperationalError retry handling for transient DB errors
- Reduce MAX_CLAIM_RETRIES from 10 to 5
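
The claim tools use a conditional UPDATE so the claimability check and the
write happen in a single statement. A minimal sketch of the pattern (the
wrapper function and get_session wiring are illustrative; the SQL mirrors
the diff below):

    import random
    import time

    from sqlalchemy import text
    from sqlalchemy.exc import OperationalError

    MAX_CLAIM_RETRIES = 5

    def try_claim(get_session, feature_id: int) -> bool:
        """Return True if this process won the claim for feature_id."""
        for attempt in range(MAX_CLAIM_RETRIES):
            session = get_session()
            try:
                # The WHERE clause re-checks claimability, so check and
                # write are one atomic statement - no TOCTOU window even
                # across processes.
                result = session.execute(
                    text("""
                        UPDATE features
                        SET in_progress = 1
                        WHERE id = :id AND in_progress = 0 AND passes = 0
                    """),
                    {"id": feature_id},
                )
                session.commit()
                # rowcount == 0 means another agent claimed it first.
                return result.rowcount == 1
            except OperationalError:
                session.rollback()  # transient (e.g. SQLITE_BUSY): retry
            finally:
                session.close()
            # Exponential backoff with jitter, capped at 1.0s, as in the diff.
            backoff = min(0.1 * (2 ** attempt), 1.0)
            time.sleep(backoff + random.uniform(0, backoff * 0.3))
        return False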

Orchestrator:
- Decouple testing agent lifecycle from coding agents
- Add _maintain_testing_agents() for continuous testing maintenance
- Fix TOCTOU race in _spawn_testing_agent() - hold lock during spawn
- Add _cleanup_stale_testing_locks() with 30-min timeout
- Fix log ordering - start_session() before stale flag cleanup
- Add stale testing_in_progress cleanup on startup
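
The orchestrator itself is not part of this file's diff. As a rough sketch
(only the method and setting names come from this commit message;
self.testing_agents, stop_event, the agent handle API, and the poll interval
are assumptions), the maintenance loop pairs with the lock-held spawn like so:

    import time

    def _maintain_testing_agents(self) -> None:
        """Keep the configured number of regression testers running."""
        while not self.stop_event.is_set():
            with self.testing_lock:
                # Drop handles of agents that have exited.
                self.testing_agents = [a for a in self.testing_agents
                                       if a.is_alive()]
                # Top up to the fixed count (0-3) from testing_agent_ratio.
                # Holding the lock across the spawn closes the TOCTOU race:
                # the count check and the spawn are one critical section.
                while len(self.testing_agents) < self.testing_agent_ratio:
                    self.testing_agents.append(self._spawn_testing_agent())
            time.sleep(5)  # assumed poll interval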

Dead Code Removal:
- Remove count_testing_in_concurrency from the entire stack (12+ files)
- Remove ineffective with_for_update() from features router

API & UI:
- Pass testing_agent_ratio via CLI to orchestrator
- Update testing prompt template to use new claim/release tools
- Rename UI label to "Regression Agents" with clearer description
- Add process_utils.py for cross-platform process tree management
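
process_utils.py is not shown in this diff; one common cross-platform
approach, sketched here under the assumption that psutil (or something
like it) is available:

    import psutil

    def terminate_process_tree(pid: int, timeout: float = 5.0) -> None:
        """Terminate a process and its descendants, escalating to kill.

        psutil hides the platform split: SIGTERM/SIGKILL on POSIX,
        TerminateProcess on Windows.
        """
        try:
            parent = psutil.Process(pid)
        except psutil.NoSuchProcess:
            return
        procs = parent.children(recursive=True) + [parent]
        for proc in procs:
            try:
                proc.terminate()  # polite request first
            except psutil.NoSuchProcess:
                pass
        # Grace period, then force-kill whatever is still alive.
        _, alive = psutil.wait_procs(procs, timeout=timeout)
        for proc in alive:
            try:
                proc.kill()
            except psutil.NoSuchProcess:
                pass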

Testing agents now:
- Run continuously as long as passing features exist
- Can re-test features multiple times to catch regressions
- Are controlled by a fixed count (0-3) via the testing_agent_ratio setting
- Have atomic claiming to prevent concurrent testing of same feature
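
End to end, a testing agent's loop over the new tools looks roughly like
this (tool names and their JSON shapes come from the diff below;
mcp_client.call and run_regression_suite are illustrative stand-ins):

    import json

    def run_regression_suite(feature: dict) -> bool:
        """Placeholder: exercise the feature; True if it still passes."""
        return True

    def testing_agent_loop(mcp_client) -> None:
        while True:
            claimed = json.loads(mcp_client.call("feature_claim_for_testing"))
            if "error" in claimed or "message" in claimed:
                break  # nothing claimable, or claim retries exhausted
            feature_id = claimed["id"]
            tested_ok = False
            try:
                tested_ok = run_regression_suite(claimed)
                if not tested_ok:
                    # Record the regression before releasing the claim.
                    mcp_client.call("feature_mark_failing",
                                    feature_id=feature_id)
            finally:
                # Always release; an unreleased claim blocks re-testing until
                # the orchestrator's 30-minute stale-lock sweep frees it.
                mcp_client.call("feature_release_testing",
                                feature_id=feature_id, tested_ok=tested_ok)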

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Auto
2026-01-22 15:22:48 +02:00
parent 29c6b252a9
commit 357083dbae
20 changed files with 841 additions and 382 deletions


@@ -26,12 +26,14 @@ import sys
import threading
import time as _time
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from pathlib import Path
from typing import Annotated
from mcp.server.fastmcp import FastMCP
from pydantic import BaseModel, Field
from sqlalchemy import text
from sqlalchemy.exc import OperationalError
from sqlalchemy.sql.expression import func
# Add parent directory to path so we can import from api module
@@ -224,112 +226,119 @@ def feature_get_next() -> str:
# Maximum retry attempts for feature claiming under contention
MAX_CLAIM_RETRIES = 10
MAX_CLAIM_RETRIES = 5
def _feature_claim_next_internal(attempt: int = 0) -> str:
"""Internal implementation of feature claiming with retry tracking.
def _feature_claim_next_internal() -> str:
"""Internal implementation of feature claiming with iterative retry.
Args:
attempt: Current retry attempt (0-indexed)
Uses an iterative loop instead of recursion to avoid double session.close()
issues and deep call stacks under contention.
Returns:
JSON with claimed feature details, or error message if no feature available.
"""
if attempt >= MAX_CLAIM_RETRIES:
return json.dumps({
"error": "Failed to claim feature after maximum retries",
"hint": "High contention detected - try again or reduce parallel agents"
})
for attempt in range(MAX_CLAIM_RETRIES):
session = get_session()
try:
# Use a lock to prevent concurrent claims within this process
with _priority_lock:
all_features = session.query(Feature).all()
all_feature_ids = {f.id for f in all_features}
passing_ids = {f.id for f in all_features if f.passes}
session = get_session()
try:
# Use a lock to prevent concurrent claims within this process
with _priority_lock:
all_features = session.query(Feature).all()
all_feature_ids = {f.id for f in all_features}
passing_ids = {f.id for f in all_features if f.passes}
# Get pending, non-in-progress features
pending = [f for f in all_features if not f.passes and not f.in_progress]
# Get pending, non-in-progress features
pending = [f for f in all_features if not f.passes and not f.in_progress]
# Sort by scheduling score (higher = first), then priority, then id
all_dicts = [f.to_dict() for f in all_features]
scores = compute_scheduling_scores(all_dicts)
pending.sort(key=lambda f: (-scores.get(f.id, 0), f.priority, f.id))
# Sort by scheduling score (higher = first), then priority, then id
all_dicts = [f.to_dict() for f in all_features]
scores = compute_scheduling_scores(all_dicts)
pending.sort(key=lambda f: (-scores.get(f.id, 0), f.priority, f.id))
if not pending:
if any(f.in_progress for f in all_features if not f.passes):
return json.dumps({"error": "All pending features are in progress by other agents"})
return json.dumps({"error": "All features are passing! No more work to do."})
if not pending:
if any(f.in_progress for f in all_features if not f.passes):
return json.dumps({"error": "All pending features are in progress by other agents"})
return json.dumps({"error": "All features are passing! No more work to do."})
# Find first feature with satisfied dependencies
candidate_id = None
for feature in pending:
deps = feature.dependencies or []
# Filter out orphaned dependencies (IDs that no longer exist)
valid_deps = [d for d in deps if d in all_feature_ids]
if all(dep_id in passing_ids for dep_id in valid_deps):
candidate_id = feature.id
break
if candidate_id is None:
# All pending features are blocked by unmet dependencies
blocking_info = []
for feature in pending[:3]:
# Find first feature with satisfied dependencies
candidate_id = None
for feature in pending:
deps = feature.dependencies or []
# Filter out orphaned dependencies (IDs that no longer exist)
valid_deps = [d for d in deps if d in all_feature_ids]
orphaned = [d for d in deps if d not in all_feature_ids]
unmet = [d for d in valid_deps if d not in passing_ids]
info = f"#{feature.id} '{feature.name}'"
if unmet:
info += f" blocked by: {unmet}"
if orphaned:
info += f" (orphaned deps ignored: {orphaned})"
blocking_info.append(info)
if all(dep_id in passing_ids for dep_id in valid_deps):
candidate_id = feature.id
break
return json.dumps({
"error": "All pending features are blocked by unmet dependencies",
"blocked_features": len(pending),
"examples": blocking_info,
"hint": "Complete the blocking dependencies first, or remove invalid dependencies"
}, indent=2)
if candidate_id is None:
# All pending features are blocked by unmet dependencies
blocking_info = []
for feature in pending[:3]:
deps = feature.dependencies or []
valid_deps = [d for d in deps if d in all_feature_ids]
orphaned = [d for d in deps if d not in all_feature_ids]
unmet = [d for d in valid_deps if d not in passing_ids]
info = f"#{feature.id} '{feature.name}'"
if unmet:
info += f" blocked by: {unmet}"
if orphaned:
info += f" (orphaned deps ignored: {orphaned})"
blocking_info.append(info)
# Atomic claim: UPDATE only if still claimable
# This prevents race conditions even across processes
result = session.execute(
text("""
UPDATE features
SET in_progress = 1
WHERE id = :feature_id
AND in_progress = 0
AND passes = 0
"""),
{"feature_id": candidate_id}
)
session.commit()
return json.dumps({
"error": "All pending features are blocked by unmet dependencies",
"blocked_features": len(pending),
"examples": blocking_info,
"hint": "Complete the blocking dependencies first, or remove invalid dependencies"
}, indent=2)
# Check if we actually claimed it
if result.rowcount == 0:
# Another process claimed it first - retry with backoff
session.close()
# Exponential backoff with jitter: base 0.1s, 0.2s, 0.4s, ... up to 1.0s
# Jitter of up to 30% prevents synchronized retries under high contention
backoff = min(0.1 * (2 ** attempt), 1.0)
jitter = random.uniform(0, backoff * 0.3)
_time.sleep(backoff + jitter)
return _feature_claim_next_internal(attempt + 1)
# Atomic claim: UPDATE only if still claimable
# This prevents race conditions even across processes
result = session.execute(
text("""
UPDATE features
SET in_progress = 1
WHERE id = :feature_id
AND in_progress = 0
AND passes = 0
"""),
{"feature_id": candidate_id}
)
session.commit()
# Fetch the claimed feature
session.expire_all() # Clear cache to get fresh data
claimed_feature = session.query(Feature).filter(Feature.id == candidate_id).first()
return json.dumps(claimed_feature.to_dict(), indent=2)
# Check if we actually claimed it
if result.rowcount == 0:
# Another process claimed it first - will retry after backoff
pass # Fall through to finally block, then retry loop
else:
# Successfully claimed - fetch and return
session.expire_all() # Clear cache to get fresh data
claimed_feature = session.query(Feature).filter(Feature.id == candidate_id).first()
return json.dumps(claimed_feature.to_dict(), indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to claim feature: {str(e)}"})
finally:
session.close()
except OperationalError:
# Transient database error (e.g., SQLITE_BUSY) - rollback and retry
session.rollback()
# Fall through to backoff and retry
except Exception as e:
# Non-transient error - fail immediately
session.rollback()
return json.dumps({"error": f"Failed to claim feature: {str(e)}"})
finally:
session.close()
# Exponential backoff with jitter before next attempt
# Base 0.1s, 0.2s, 0.4s, 0.8s, 1.0s (capped)
# Jitter of up to 30% prevents synchronized retries under high contention
backoff = min(0.1 * (2 ** attempt), 1.0)
jitter = random.uniform(0, backoff * 0.3)
_time.sleep(backoff + jitter)
# Exhausted all retries
return json.dumps({
"error": "Failed to claim feature after maximum retries",
"hint": "High contention detected - try again or reduce parallel agents"
})
@mcp.tool()
@@ -346,11 +355,12 @@ def feature_claim_next() -> str:
3. All dependency IDs actually exist (orphaned dependencies are ignored)
On success, the feature's in_progress flag is set to True.
Uses exponential backoff retry (up to 5 attempts) under contention.
Returns:
JSON with claimed feature details, or error message if no feature available.
"""
return _feature_claim_next_internal(attempt=0)
return _feature_claim_next_internal()
@mcp.tool()
@@ -389,6 +399,156 @@ def feature_get_for_regression(
session.close()
def _feature_claim_for_testing_internal() -> str:
"""Internal implementation of testing feature claim with iterative retry.
Uses an iterative loop instead of recursion to avoid double session.close()
issues and deep call stacks under contention.
Returns:
JSON with claimed feature details, or message if no features available.
"""
for attempt in range(MAX_CLAIM_RETRIES):
session = get_session()
try:
# Use lock to prevent concurrent claims within this process
with _priority_lock:
# Find a candidate feature
candidate = (
session.query(Feature)
.filter(Feature.passes == True)
.filter(Feature.in_progress == False)
.filter(Feature.testing_in_progress == False)
.order_by(func.random())
.first()
)
if not candidate:
return json.dumps({
"message": "No features available for testing",
"hint": "All passing features are either being coded or tested"
})
# Atomic claim using UPDATE with WHERE clause
# This prevents race conditions even across processes
result = session.execute(
text("""
UPDATE features
SET testing_in_progress = 1
WHERE id = :feature_id
AND passes = 1
AND in_progress = 0
AND testing_in_progress = 0
"""),
{"feature_id": candidate.id}
)
session.commit()
# Check if we actually claimed it
if result.rowcount == 0:
# Another process claimed it first - will retry after backoff
pass # Fall through to finally block, then retry loop
else:
# Successfully claimed - fetch and return
session.expire_all()
claimed = session.query(Feature).filter(Feature.id == candidate.id).first()
return json.dumps(claimed.to_dict(), indent=2)
except OperationalError:
# Transient database error (e.g., SQLITE_BUSY) - rollback and retry
session.rollback()
# Fall through to backoff and retry
except Exception as e:
# Non-transient error - fail immediately
session.rollback()
return json.dumps({"error": f"Failed to claim feature: {str(e)}"})
finally:
session.close()
# Exponential backoff with jitter before next attempt
backoff = min(0.1 * (2 ** attempt), 1.0)
jitter = random.uniform(0, backoff * 0.3)
_time.sleep(backoff + jitter)
# Exhausted all retries
return json.dumps({
"error": "Failed to claim feature after maximum retries",
"hint": "High contention detected - try again or reduce testing agents"
})
@mcp.tool()
def feature_claim_for_testing() -> str:
"""Atomically claim a passing feature for regression testing.
Returns a random passing feature that is:
- Currently passing (passes=True)
- Not being worked on by coding agents (in_progress=False)
- Not already being tested (testing_in_progress=False)
The feature's testing_in_progress flag is set to True atomically to prevent
other testing agents from claiming the same feature. Uses exponential backoff
retry (up to 5 attempts) under contention.
After testing, you MUST call feature_release_testing() to release the claim.
Returns:
JSON with feature details if available, or message if no features available.
"""
return _feature_claim_for_testing_internal()
@mcp.tool()
def feature_release_testing(
feature_id: Annotated[int, Field(description="The ID of the feature to release", ge=1)],
tested_ok: Annotated[bool, Field(description="True if the feature passed testing, False if regression found")] = True
) -> str:
"""Release a feature after regression testing completes.
Clears the testing_in_progress flag and updates last_tested_at timestamp.
This should be called after testing completes, whether the feature
passed or failed. If tested_ok=False, the feature should already have
been marked as failing via feature_mark_failing.
Args:
feature_id: The ID of the feature that was being tested
tested_ok: True if testing passed, False if a regression was found
Returns:
JSON with release confirmation or error message.
"""
session = get_session()
try:
feature = session.query(Feature).filter(Feature.id == feature_id).first()
if feature is None:
return json.dumps({"error": f"Feature with ID {feature_id} not found"})
if not feature.testing_in_progress:
return json.dumps({
"warning": f"Feature {feature_id} was not being tested",
"feature": feature.to_dict()
})
# Clear testing flag and update timestamp
feature.testing_in_progress = False
feature.last_tested_at = datetime.now(timezone.utc)
session.commit()
session.refresh(feature)
status = "passed" if tested_ok else "failed (regression detected)"
return json.dumps({
"message": f"Feature #{feature_id} testing {status}",
"feature": feature.to_dict()
}, indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to release testing claim: {str(e)}"})
finally:
session.close()
@mcp.tool()
def feature_mark_passing(
feature_id: Annotated[int, Field(description="The ID of the feature to mark as passing", ge=1)]
@@ -417,6 +577,9 @@ def feature_mark_passing(
session.refresh(feature)
return json.dumps(feature.to_dict(), indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to mark feature passing: {str(e)}"})
finally:
session.close()
@@ -459,6 +622,9 @@ def feature_mark_failing(
"message": f"Feature #{feature_id} marked as failing - regression detected",
"feature": feature.to_dict()
}, indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to mark feature failing: {str(e)}"})
finally:
session.close()
@@ -515,6 +681,9 @@ def feature_skip(
"new_priority": new_priority,
"message": f"Feature '{feature.name}' moved to end of queue"
}, indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to skip feature: {str(e)}"})
finally:
session.close()
@@ -552,6 +721,9 @@ def feature_mark_in_progress(
session.refresh(feature)
return json.dumps(feature.to_dict(), indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to mark feature in-progress: {str(e)}"})
finally:
session.close()
@@ -583,6 +755,9 @@ def feature_clear_in_progress(
session.refresh(feature)
return json.dumps(feature.to_dict(), indent=2)
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to clear in-progress status: {str(e)}"})
finally:
session.close()
@@ -807,6 +982,9 @@ def feature_add_dependency(
"feature_id": feature_id,
"dependencies": feature.dependencies
})
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to add dependency: {str(e)}"})
finally:
session.close()
@@ -844,6 +1022,9 @@ def feature_remove_dependency(
"feature_id": feature_id,
"dependencies": feature.dependencies or []
})
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to remove dependency: {str(e)}"})
finally:
session.close()
@@ -1040,6 +1221,9 @@ def feature_set_dependencies(
"feature_id": feature_id,
"dependencies": feature.dependencies or []
})
except Exception as e:
session.rollback()
return json.dumps({"error": f"Failed to set dependencies: {str(e)}"})
finally:
session.close()