refactor: optimize token usage, deduplicate code, fix bugs across agents

Token reduction (~40% per session, ~2.3M fewer tokens per 200-feature project):
- Agent-type-specific tool lists: coding 9, testing 5, init 5 (was 19 for all; see the sketch after this list)
- Right-sized max_turns: coding 300, testing 100 (was 1000 for all)
- Trimmed coding prompt template (~150 lines removed)
- Streamlined testing prompt with batch support
- YOLO mode now strips browser-testing instructions from the prompt
- Added Grep, WebFetch, WebSearch to the expand-project session
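
A rough sketch of how the per-agent-type tool lists and turn caps fit together; apart from Grep, WebFetch, and WebSearch, the tool names and constant names below are illustrative, not the actual configuration:

    # Hypothetical sketch -- constant names and most tool names are illustrative.
    AGENT_TOOLS = {
        "coding":  ["Read", "Write", "Edit", "Bash", "Grep", "Glob", "WebFetch", "WebSearch", "TodoWrite"],  # 9
        "testing": ["Read", "Bash", "Grep", "Glob", "TodoWrite"],                                            # 5
        "init":    ["Read", "Write", "Bash", "Grep", "Glob"],                                                # 5
    }
    AGENT_MAX_TURNS = {"coding": 300, "testing": 100}  # was 1000 for every agent type

    def session_options(agent_type: str) -> dict:
        """Return the reduced tool list and turn cap for one agent type."""
        return {
            "allowed_tools": AGENT_TOOLS.get(agent_type, AGENT_TOOLS["coding"]),
            "max_turns": AGENT_MAX_TURNS.get(agent_type, 300),
        }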

Performance improvements:
- Rate limit retries start at ~15s with jitter (was a fixed 60s; see the sketch after this list)
- Post-spawn delay reduced to 0.5s (was 2s)
- Orchestrator consolidated to 1 DB query per loop (was 5-7)
- Testing agents batch 3 features per session (was 1)
- Smart context compaction preserves critical state and discards noise
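
A minimal sketch of the retry delay described above, assuming a plain exponential schedule starting near 15s with random jitter (the real retry wrapper may differ in shape and cap):

    import random

    def rate_limit_delay(attempt: int, base: float = 15.0, cap: float = 120.0) -> float:
        """Exponential backoff starting near 15s with +/-20% jitter (replaces a fixed 60s sleep)."""
        delay = min(base * (2 ** attempt), cap)
        return delay * random.uniform(0.8, 1.2)  # jitter spreads concurrent agents apart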

Bug fixes:
- Removed ghost feature_release_testing MCP tool (it wasted tokens in every test session)
- Forwarded all 9 Vertex AI env vars to chat sessions (3 were previously missing)
- Fixed DetachedInstanceError risk in test-batch ORM access (pattern shown after this list)
- Prevented duplicate testing of the same features in parallel mode
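
The DetachedInstanceError fix follows the pattern visible in the diff below: copy the needed fields out of ORM rows into plain dicts before session.close(), then only touch the dicts. A condensed sketch (the helper name is illustrative):

    from api.database import Feature  # model used throughout the orchestrator diff below

    def passing_feature_data(session) -> list[dict]:
        """Snapshot passing features as plain dicts so nothing reads ORM attributes
        after the session is closed (which would raise DetachedInstanceError)."""
        try:
            rows = session.query(Feature).filter(Feature.passes == True).all()
            return [{"id": f.id, "dependencies": f.get_dependencies_safe()} for f in rows]
        finally:
            session.close()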

Code deduplication:
- _get_project_path(): 9 copies -> 1 shared utility (project_helpers.py; sketch after this list)
- validate_project_name(): 9 copies -> 2 variants in 1 file (validation.py)
- ROOT_DIR: 10 copies -> 1 definition (chat_constants.py)
- API_ENV_VARS: 4 copies -> 1 source of truth (env_constants.py)
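
A hedged sketch of the consolidated helper; the module and function names come from the list above, but the body and root location are illustrative, not the actual implementation:

    # project_helpers.py -- single shared copy replacing 9 per-module duplicates
    from pathlib import Path

    PROJECTS_ROOT = Path(__file__).resolve().parent / "projects"  # hypothetical root

    def _get_project_path(project_name: str) -> Path:
        """Resolve a project directory from its name (previously copy-pasted in 9 files)."""
        return PROJECTS_ROOT / project_name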

Security hardening:
- Unified the sensitive-directory blocklist (14 dirs; previously two divergent lists)
- Cached get_blocked_paths() for O(1) directory-listing checks (see the sketch after this list)
- Terminal security warning when ALLOW_REMOTE=1 exposes the WebSocket
- 20 new security tests for EXTRA_READ_PATHS blocking
- Extracted _validate_command_list() and _validate_pkill_processes() helpers
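
The cached blocklist check could plausibly be as small as an lru_cache over the unified set; only a few of the 14 directories are shown, so both the list and the helper bodies are illustrative:

    from functools import lru_cache
    from pathlib import Path

    SENSITIVE_DIRS = frozenset({".ssh", ".aws", ".gnupg", ".config"})  # illustrative subset of the 14

    @lru_cache(maxsize=1)
    def get_blocked_paths() -> frozenset[str]:
        """Computed once and reused, so directory-listing checks are O(1) set lookups."""
        return SENSITIVE_DIRS

    def is_blocked(path: Path) -> bool:
        return any(part in get_blocked_paths() for part in path.parts)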

Type safety:
- 87 mypy errors -> 0 across 58 source files
- Installed types-PyYAML for proper yaml stub types
- Fixed SQLAlchemy Column[T] type errors by coercing to plain Python types across all routers (illustrative example below)
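
The Column[T] errors are usually silenced by coercing ORM attribute values to plain Python types at the boundary; a representative, illustrative example:

    from api.database import Feature  # model referenced throughout the diff below

    def feature_summary(feature: Feature) -> tuple[int, bool]:
        """Coerce Column-typed attributes to plain int/bool before they reach typed code,
        the usual fix for mypy's Column[T] complaints in the routers."""
        return int(feature.id), bool(feature.passes)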

Dead code removed:
- 13 files deleted (~2,679 lines): unused UI components, debug logs, outdated docs
- 7 unused npm packages removed (Radix UI components with 0 imports)
- AgentAvatar.tsx reduced from 615 -> 119 lines (SVGs extracted to mascotData.tsx)

New CLI options:
- --testing-batch-size (1-5) for parallel-mode test batching (how the two flags interact is sketched below)
- --testing-feature-ids for direct multi-feature testing
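
How the two options relate, sketched from the orchestrator diff below (values are illustrative): --testing-batch-size is clamped to 1-5, and each spawned testing agent receives its batch as a comma-separated --testing-feature-ids value.

    batch_size = min(max(3, 1), 5)                # --testing-batch-size, clamped as in __init__
    batch = [12, 17, 23][:batch_size]             # IDs picked by _get_test_batch()
    testing_feature_ids = ",".join(str(i) for i in batch)  # forwarded as --testing-feature-ids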

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Auto
Date: 2026-02-01 13:16:24 +02:00
Parent: dc5bcc4ae9
Commit: 94e0b05cb1
57 changed files with 1974 additions and 4300 deletions


@@ -20,6 +20,7 @@ Usage:
import asyncio
import atexit
import logging
import os
import signal
import subprocess
@@ -27,7 +28,7 @@ import sys
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Literal
from typing import Any, Callable, Literal
from sqlalchemy import text
@@ -36,6 +37,8 @@ from api.dependency_resolver import are_dependencies_satisfied, compute_scheduli
from progress import has_features
from server.utils.process_utils import kill_process_tree
logger = logging.getLogger(__name__)
# Root directory of autocoder (where this script and autonomous_agent_demo.py live)
AUTOCODER_ROOT = Path(__file__).parent.resolve()
@@ -83,23 +86,25 @@ class DebugLogger:
debug_log = DebugLogger()
def _dump_database_state(session, label: str = ""):
"""Helper to dump full database state to debug log."""
from api.database import Feature
all_features = session.query(Feature).all()
def _dump_database_state(feature_dicts: list[dict], label: str = ""):
"""Helper to dump full database state to debug log.
passing = [f for f in all_features if f.passes]
in_progress = [f for f in all_features if f.in_progress and not f.passes]
pending = [f for f in all_features if not f.passes and not f.in_progress]
Args:
feature_dicts: Pre-fetched list of feature dicts.
label: Optional label for the dump entry.
"""
passing = [f for f in feature_dicts if f.get("passes")]
in_progress = [f for f in feature_dicts if f.get("in_progress") and not f.get("passes")]
pending = [f for f in feature_dicts if not f.get("passes") and not f.get("in_progress")]
debug_log.log("DB_DUMP", f"Full database state {label}",
total_features=len(all_features),
total_features=len(feature_dicts),
passing_count=len(passing),
passing_ids=[f.id for f in passing],
passing_ids=[f["id"] for f in passing],
in_progress_count=len(in_progress),
in_progress_ids=[f.id for f in in_progress],
in_progress_ids=[f["id"] for f in in_progress],
pending_count=len(pending),
pending_ids=[f.id for f in pending[:10]]) # First 10 pending only
pending_ids=[f["id"] for f in pending[:10]]) # First 10 pending only
# =============================================================================
# Process Limits
@@ -125,6 +130,7 @@ def _dump_database_state(session, label: str = ""):
MAX_PARALLEL_AGENTS = 5
MAX_TOTAL_AGENTS = 10
DEFAULT_CONCURRENCY = 3
DEFAULT_TESTING_BATCH_SIZE = 3 # Number of features per testing batch (1-5)
POLL_INTERVAL = 5 # seconds between checking for ready features
MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature
INITIALIZER_TIMEOUT = 1800 # 30 minutes timeout for initializer
@@ -146,6 +152,7 @@ class ParallelOrchestrator:
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
testing_batch_size: int = DEFAULT_TESTING_BATCH_SIZE,
on_output: Callable[[int, str], None] | None = None,
on_status: Callable[[int, str], None] | None = None,
):
@@ -159,6 +166,8 @@ class ParallelOrchestrator:
yolo_mode: Whether to run in YOLO mode (skip testing agents entirely)
testing_agent_ratio: Number of regression testing agents to maintain (0-3).
0 = disabled, 1-3 = maintain that many testing agents running independently.
testing_batch_size: Number of features to include per testing session (1-5).
Each testing agent receives this many features to regression test.
on_output: Callback for agent output (feature_id, line)
on_status: Callback for agent status changes (feature_id, status)
"""
@@ -167,6 +176,7 @@ class ParallelOrchestrator:
self.model = model
self.yolo_mode = yolo_mode
self.testing_agent_ratio = min(max(testing_agent_ratio, 0), 3) # Clamp 0-3
self.testing_batch_size = min(max(testing_batch_size, 1), 5) # Clamp 1-5
self.on_output = on_output
self.on_status = on_status
@@ -186,6 +196,10 @@ class ParallelOrchestrator:
# Track feature failures to prevent infinite retry loops
self._failure_counts: dict[int, int] = {}
# Track recently tested feature IDs to avoid redundant re-testing.
# Cleared when all passing features have been covered at least once.
self._recently_tested: set[int] = set()
# Shutdown flag for async-safe signal handling
# Signal handlers only set this flag; cleanup happens in the main loop
self._shutdown_requested = False
@@ -213,6 +227,9 @@ class ParallelOrchestrator:
This simplifies the architecture by removing unnecessary coordination.
Returns the feature ID if available, None if no passing features exist.
Note: Prefer _get_test_batch() for batch testing mode. This method is
retained for backward compatibility.
"""
from sqlalchemy.sql.expression import func
@@ -231,164 +248,282 @@ class ParallelOrchestrator:
finally:
session.close()
def get_resumable_features(self) -> list[dict]:
def _get_test_batch(self, batch_size: int = 3) -> list[int]:
"""Select a prioritized batch of passing features for regression testing.
Uses weighted scoring to prioritize features that:
1. Haven't been tested recently in this orchestrator session
2. Are depended on by many other features (higher impact if broken)
3. Have more dependencies themselves (complex integration points)
When all passing features have been recently tested, the tracking set
is cleared so the cycle starts fresh.
Args:
batch_size: Maximum number of feature IDs to return (1-5).
Returns:
List of feature IDs to test, may be shorter than batch_size if
fewer passing features are available. Empty list if none available.
"""
session = self.get_session()
try:
session.expire_all()
passing = (
session.query(Feature)
.filter(Feature.passes == True)
.filter(Feature.in_progress == False) # Don't test while coding
.all()
)
# Extract data from ORM objects before closing the session to avoid
# DetachedInstanceError when accessing attributes after session.close().
passing_data: list[dict] = []
for f in passing:
passing_data.append({
'id': f.id,
'dependencies': f.get_dependencies_safe() if hasattr(f, 'get_dependencies_safe') else [],
})
finally:
session.close()
if not passing_data:
return []
# Build a reverse dependency map: feature_id -> count of features that depend on it.
# The Feature model stores dependencies (what I depend ON), so we invert to find
# dependents (what depends ON me).
dependent_counts: dict[int, int] = {}
for fd in passing_data:
for dep_id in fd['dependencies']:
dependent_counts[dep_id] = dependent_counts.get(dep_id, 0) + 1
# Exclude features that are already being tested by running testing agents
# to avoid redundant concurrent testing of the same features.
# running_testing_agents is dict[pid, (primary_feature_id, process)]
with self._lock:
currently_testing_ids: set[int] = set()
for _pid, (feat_id, _proc) in self.running_testing_agents.items():
currently_testing_ids.add(feat_id)
# If all passing features have been recently tested, reset the tracker
# so we cycle through them again rather than returning empty batches.
passing_ids = {fd['id'] for fd in passing_data}
if passing_ids.issubset(self._recently_tested):
self._recently_tested.clear()
# Score each feature by testing priority
scored: list[tuple[int, int]] = []
for fd in passing_data:
f_id = fd['id']
# Skip features already being tested by a running testing agent
if f_id in currently_testing_ids:
continue
score = 0
# Weight 1: Features depended on by many others are higher impact
# if they regress, so test them more often
score += dependent_counts.get(f_id, 0) * 2
# Weight 2: Strongly prefer features not tested recently
if f_id not in self._recently_tested:
score += 5
# Weight 3: Features with more dependencies are integration points
# that are more likely to regress when other code changes
dep_count = len(fd['dependencies'])
score += min(dep_count, 3) # Cap at 3 to avoid over-weighting
scored.append((f_id, score))
# Sort by score descending (highest priority first)
scored.sort(key=lambda x: x[1], reverse=True)
selected = [fid for fid, _ in scored[:batch_size]]
# Track what we've tested to avoid re-testing the same features next batch
self._recently_tested.update(selected)
debug_log.log("TEST_BATCH", f"Selected {len(selected)} features for testing batch",
selected_ids=selected,
recently_tested_count=len(self._recently_tested),
total_passing=len(passing_data))
return selected
def get_resumable_features(
self,
feature_dicts: list[dict] | None = None,
scheduling_scores: dict[int, float] | None = None,
) -> list[dict]:
"""Get features that were left in_progress from a previous session.
These are features where in_progress=True but passes=False, and they're
not currently being worked on by this orchestrator. This handles the case
where a previous session was interrupted before completing the feature.
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
scheduling_scores: Pre-computed scheduling scores. If None, computed from feature_dicts.
"""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()
# Find features that are in_progress but not complete
stale = session.query(Feature).filter(
Feature.in_progress == True,
Feature.passes == False
).all()
# Snapshot running IDs once to avoid acquiring lock per feature
with self._lock:
running_ids = set(self.running_coding_agents.keys())
resumable = []
for f in stale:
# Skip if already running in this orchestrator instance
with self._lock:
if f.id in self.running_coding_agents:
continue
# Skip if feature has failed too many times
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
continue
resumable.append(f.to_dict())
resumable = []
for fd in feature_dicts:
if not fd.get("in_progress") or fd.get("passes"):
continue
# Skip if already running in this orchestrator instance
if fd["id"] in running_ids:
continue
# Skip if feature has failed too many times
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
continue
resumable.append(fd)
# Sort by scheduling score (higher = first), then priority, then id
all_dicts = [f.to_dict() for f in session.query(Feature).all()]
scores = compute_scheduling_scores(all_dicts)
resumable.sort(key=lambda f: (-scores.get(f["id"], 0), f["priority"], f["id"]))
return resumable
finally:
session.close()
# Sort by scheduling score (higher = first), then priority, then id
if scheduling_scores is None:
scheduling_scores = compute_scheduling_scores(feature_dicts)
resumable.sort(key=lambda f: (-scheduling_scores.get(f["id"], 0), f["priority"], f["id"]))
return resumable
def get_ready_features(self) -> list[dict]:
"""Get features with satisfied dependencies, not already running."""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
def get_ready_features(
self,
feature_dicts: list[dict] | None = None,
scheduling_scores: dict[int, float] | None = None,
) -> list[dict]:
"""Get features with satisfied dependencies, not already running.
all_features = session.query(Feature).all()
all_dicts = [f.to_dict() for f in all_features]
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
scheduling_scores: Pre-computed scheduling scores. If None, computed from feature_dicts.
"""
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()
# Pre-compute passing_ids once to avoid O(n^2) in the loop
passing_ids = {f.id for f in all_features if f.passes}
# Pre-compute passing_ids once to avoid O(n^2) in the loop
passing_ids = {fd["id"] for fd in feature_dicts if fd.get("passes")}
ready = []
skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
for f in all_features:
if f.passes:
skipped_reasons["passes"] += 1
continue
if f.in_progress:
skipped_reasons["in_progress"] += 1
continue
# Skip if already running in this orchestrator
with self._lock:
if f.id in self.running_coding_agents:
skipped_reasons["running"] += 1
continue
# Skip if feature has failed too many times
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
skipped_reasons["failed"] += 1
continue
# Check dependencies (pass pre-computed passing_ids)
if are_dependencies_satisfied(f.to_dict(), all_dicts, passing_ids):
ready.append(f.to_dict())
else:
skipped_reasons["deps"] += 1
# Snapshot running IDs once to avoid acquiring lock per feature
with self._lock:
running_ids = set(self.running_coding_agents.keys())
# Sort by scheduling score (higher = first), then priority, then id
scores = compute_scheduling_scores(all_dicts)
ready.sort(key=lambda f: (-scores.get(f["id"], 0), f["priority"], f["id"]))
ready = []
skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
for fd in feature_dicts:
if fd.get("passes"):
skipped_reasons["passes"] += 1
continue
if fd.get("in_progress"):
skipped_reasons["in_progress"] += 1
continue
# Skip if already running in this orchestrator
if fd["id"] in running_ids:
skipped_reasons["running"] += 1
continue
# Skip if feature has failed too many times
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
skipped_reasons["failed"] += 1
continue
# Check dependencies (pass pre-computed passing_ids)
if are_dependencies_satisfied(fd, feature_dicts, passing_ids):
ready.append(fd)
else:
skipped_reasons["deps"] += 1
# Debug logging
passing = sum(1 for f in all_features if f.passes)
in_progress = sum(1 for f in all_features if f.in_progress and not f.passes)
print(
f"[DEBUG] get_ready_features: {len(ready)} ready, "
f"{passing} passing, {in_progress} in_progress, {len(all_features)} total",
flush=True
)
print(
f"[DEBUG] Skipped: {skipped_reasons['passes']} passing, {skipped_reasons['in_progress']} in_progress, "
f"{skipped_reasons['running']} running, {skipped_reasons['failed']} failed, {skipped_reasons['deps']} blocked by deps",
flush=True
)
# Sort by scheduling score (higher = first), then priority, then id
if scheduling_scores is None:
scheduling_scores = compute_scheduling_scores(feature_dicts)
ready.sort(key=lambda f: (-scheduling_scores.get(f["id"], 0), f["priority"], f["id"]))
# Log to debug file (but not every call to avoid spam)
debug_log.log("READY", "get_ready_features() called",
ready_count=len(ready),
ready_ids=[f['id'] for f in ready[:5]], # First 5 only
passing=passing,
in_progress=in_progress,
total=len(all_features),
skipped=skipped_reasons)
# Summary counts for logging
passing = skipped_reasons["passes"]
in_progress = skipped_reasons["in_progress"]
total = len(feature_dicts)
return ready
finally:
session.close()
debug_log.log("READY", "get_ready_features() called",
ready_count=len(ready),
ready_ids=[f['id'] for f in ready[:5]], # First 5 only
passing=passing,
in_progress=in_progress,
total=total,
skipped=skipped_reasons)
def get_all_complete(self) -> bool:
return ready
def get_all_complete(self, feature_dicts: list[dict] | None = None) -> bool:
"""Check if all features are complete or permanently failed.
Returns False if there are no features (initialization needed).
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()
all_features = session.query(Feature).all()
# No features = NOT complete, need initialization
if len(feature_dicts) == 0:
return False
# No features = NOT complete, need initialization
if len(all_features) == 0:
return False
passing_count = 0
failed_count = 0
pending_count = 0
for fd in feature_dicts:
if fd.get("passes"):
passing_count += 1
continue # Completed successfully
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
failed_count += 1
continue # Permanently failed, count as "done"
pending_count += 1
passing_count = 0
failed_count = 0
pending_count = 0
for f in all_features:
if f.passes:
passing_count += 1
continue # Completed successfully
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
failed_count += 1
continue # Permanently failed, count as "done"
pending_count += 1
total = len(feature_dicts)
is_complete = pending_count == 0
debug_log.log("COMPLETE_CHECK", f"get_all_complete: {passing_count}/{total} passing, "
f"{failed_count} failed, {pending_count} pending -> {is_complete}")
return is_complete
total = len(all_features)
is_complete = pending_count == 0
print(
f"[DEBUG] get_all_complete: {passing_count}/{total} passing, "
f"{failed_count} failed, {pending_count} pending -> {is_complete}",
flush=True
)
return is_complete
finally:
session.close()
def get_passing_count(self, feature_dicts: list[dict] | None = None) -> int:
"""Get the number of passing features.
def get_passing_count(self) -> int:
"""Get the number of passing features."""
session = self.get_session()
try:
session.expire_all()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
return sum(1 for fd in feature_dicts if fd.get("passes"))
def _maintain_testing_agents(self) -> None:
def _maintain_testing_agents(self, feature_dicts: list[dict] | None = None) -> None:
"""Maintain the desired count of testing agents independently.
This runs every loop iteration and spawns testing agents as needed to maintain
@@ -402,18 +537,21 @@ class ParallelOrchestrator:
- YOLO mode is enabled
- testing_agent_ratio is 0
- No passing features exist yet
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
# Skip if testing is disabled
if self.yolo_mode or self.testing_agent_ratio == 0:
return
# No testing until there are passing features
passing_count = self.get_passing_count()
passing_count = self.get_passing_count(feature_dicts)
if passing_count == 0:
return
# Don't spawn testing agents if all features are already complete
if self.get_all_complete():
if self.get_all_complete(feature_dicts):
return
# Spawn testing agents one at a time, re-checking limits each time
@@ -439,7 +577,7 @@ class ParallelOrchestrator:
passing_count=passing_count)
# Spawn outside lock (I/O bound operation)
print(f"[DEBUG] Spawning testing agent ({spawn_index}/{desired})", flush=True)
logger.debug("Spawning testing agent (%d/%d)", spawn_index, desired)
success, msg = self._spawn_testing_agent()
if not success:
debug_log.log("TESTING", f"Spawn failed, stopping: {msg}")
@@ -521,7 +659,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -565,11 +703,14 @@ class ParallelOrchestrator:
return True, f"Started feature {feature_id}"
def _spawn_testing_agent(self) -> tuple[bool, str]:
"""Spawn a testing agent subprocess for regression testing.
"""Spawn a testing agent subprocess for batch regression testing.
Picks a random passing feature to test. Multiple testing agents can test
the same feature concurrently - this is intentional and simplifies the
architecture by removing claim coordination.
Selects a prioritized batch of passing features using weighted scoring
(via _get_test_batch) and passes them as --testing-feature-ids to the
subprocess. Falls back to single --testing-feature-id for batches of one.
Multiple testing agents can test the same feature concurrently - this is
intentional and simplifies the architecture by removing claim coordination.
"""
# Check limits first (under lock)
with self._lock:
@@ -582,13 +723,16 @@ class ParallelOrchestrator:
debug_log.log("TESTING", f"Skipped spawn - at max total agents ({total_agents}/{MAX_TOTAL_AGENTS})")
return False, f"At max total agents ({total_agents})"
# Pick a random passing feature (no claim needed - concurrent testing is fine)
feature_id = self._get_random_passing_feature()
if feature_id is None:
# Select a weighted batch of passing features for regression testing
batch = self._get_test_batch(self.testing_batch_size)
if not batch:
debug_log.log("TESTING", "No features available for testing")
return False, "No features available for testing"
debug_log.log("TESTING", f"Selected feature #{feature_id} for testing")
# Use the first feature ID as the representative for logging/tracking
primary_feature_id = batch[0]
batch_str = ",".join(str(fid) for fid in batch)
debug_log.log("TESTING", f"Selected batch for testing: [{batch_str}]")
# Spawn the testing agent
with self._lock:
@@ -604,7 +748,7 @@ class ParallelOrchestrator:
"--project-dir", str(self.project_dir),
"--max-iterations", "1",
"--agent-type", "testing",
"--testing-feature-id", str(feature_id),
"--testing-feature-ids", batch_str,
]
if self.model:
cmd.extend(["--model", self.model])
@@ -613,7 +757,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -633,22 +777,22 @@ class ParallelOrchestrator:
# Register process by PID (not feature_id) to avoid overwrites
# when multiple agents test the same feature
self.running_testing_agents[proc.pid] = (feature_id, proc)
self.running_testing_agents[proc.pid] = (primary_feature_id, proc)
testing_count = len(self.running_testing_agents)
# Start output reader thread with feature ID (same as coding agents)
# Start output reader thread with primary feature ID for log attribution
threading.Thread(
target=self._read_output,
args=(feature_id, proc, threading.Event(), "testing"),
args=(primary_feature_id, proc, threading.Event(), "testing"),
daemon=True
).start()
print(f"Started testing agent for feature #{feature_id} (PID {proc.pid})", flush=True)
debug_log.log("TESTING", f"Successfully spawned testing agent for feature #{feature_id}",
print(f"Started testing agent for features [{batch_str}] (PID {proc.pid})", flush=True)
debug_log.log("TESTING", f"Successfully spawned testing agent for batch [{batch_str}]",
pid=proc.pid,
feature_id=feature_id,
feature_ids=batch,
total_testing_agents=testing_count)
return True, f"Started testing agent for feature #{feature_id}"
return True, f"Started testing agent for features [{batch_str}]"
async def _run_initializer(self) -> bool:
"""Run initializer agent as blocking subprocess.
@@ -674,7 +818,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -1000,16 +1144,15 @@ class ParallelOrchestrator:
# newly created features.
debug_log.section("INITIALIZATION COMPLETE")
debug_log.log("INIT", "Disposing old database engine and creating fresh connection")
print("[DEBUG] Recreating database connection after initialization...", flush=True)
logger.debug("Recreating database connection after initialization")
if self._engine is not None:
self._engine.dispose()
self._engine, self._session_maker = create_database(self.project_dir)
# Debug: Show state immediately after initialization
print("[DEBUG] Post-initialization state check:", flush=True)
print(f"[DEBUG] max_concurrency={self.max_concurrency}", flush=True)
print(f"[DEBUG] yolo_mode={self.yolo_mode}", flush=True)
print(f"[DEBUG] testing_agent_ratio={self.testing_agent_ratio}", flush=True)
logger.debug("Post-initialization state check")
logger.debug("Post-initialization state: max_concurrency=%d, yolo_mode=%s, testing_agent_ratio=%d",
self.max_concurrency, self.yolo_mode, self.testing_agent_ratio)
# Verify features were created and are visible
session = self.get_session()
@@ -1017,7 +1160,7 @@ class ParallelOrchestrator:
feature_count = session.query(Feature).count()
all_features = session.query(Feature).all()
feature_names = [f"{f.id}: {f.name}" for f in all_features[:10]]
print(f"[DEBUG] features in database={feature_count}", flush=True)
logger.debug("Features in database: %d", feature_count)
debug_log.log("INIT", "Post-initialization database state",
max_concurrency=self.max_concurrency,
yolo_mode=self.yolo_mode,
@@ -1041,7 +1184,18 @@ class ParallelOrchestrator:
while self.is_running and not self._shutdown_requested:
loop_iteration += 1
if loop_iteration <= 3:
print(f"[DEBUG] === Loop iteration {loop_iteration} ===", flush=True)
logger.debug("=== Loop iteration %d ===", loop_iteration)
# Query all features ONCE per iteration and build reusable snapshot.
# Every sub-method receives this snapshot instead of re-querying the DB.
session = self.get_session()
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
session.close()
# Pre-compute scheduling scores once (BFS + reverse topo sort)
scheduling_scores = compute_scheduling_scores(feature_dicts)
# Log every iteration to debug file (first 10, then every 5th)
if loop_iteration <= 10 or loop_iteration % 5 == 0:
@@ -1055,20 +1209,16 @@ class ParallelOrchestrator:
# Full database dump every 5 iterations
if loop_iteration == 1 or loop_iteration % 5 == 0:
session = self.get_session()
try:
_dump_database_state(session, f"(iteration {loop_iteration})")
finally:
session.close()
_dump_database_state(feature_dicts, f"(iteration {loop_iteration})")
try:
# Check if all complete
if self.get_all_complete():
if self.get_all_complete(feature_dicts):
print("\nAll features complete!", flush=True)
break
# Maintain testing agents independently (runs every iteration)
self._maintain_testing_agents()
self._maintain_testing_agents(feature_dicts)
# Check capacity
with self._lock:
@@ -1089,17 +1239,17 @@ class ParallelOrchestrator:
continue
# Priority 1: Resume features from previous session
resumable = self.get_resumable_features()
resumable = self.get_resumable_features(feature_dicts, scheduling_scores)
if resumable:
slots = self.max_concurrency - current
for feature in resumable[:slots]:
print(f"Resuming feature #{feature['id']}: {feature['name']}", flush=True)
self.start_feature(feature["id"], resume=True)
await asyncio.sleep(2)
await asyncio.sleep(0.5) # Brief delay for subprocess to claim feature before re-querying
continue
# Priority 2: Start new ready features
ready = self.get_ready_features()
ready = self.get_ready_features(feature_dicts, scheduling_scores)
if not ready:
# Wait for running features to complete
if current > 0:
@@ -1112,11 +1262,12 @@ class ParallelOrchestrator:
session = self.get_session()
try:
session.expire_all()
fresh_dicts = [f.to_dict() for f in session.query(Feature).all()]
finally:
session.close()
# Recheck if all features are now complete
if self.get_all_complete():
if self.get_all_complete(fresh_dicts):
print("\nAll features complete!", flush=True)
break
@@ -1127,10 +1278,10 @@ class ParallelOrchestrator:
# Start features up to capacity
slots = self.max_concurrency - current
print(f"[DEBUG] Spawning loop: {len(ready)} ready, {slots} slots available, max_concurrency={self.max_concurrency}", flush=True)
print(f"[DEBUG] Will attempt to start {min(len(ready), slots)} features", flush=True)
logger.debug("Spawning loop: %d ready, %d slots available, max_concurrency=%d",
len(ready), slots, self.max_concurrency)
features_to_start = ready[:slots]
print(f"[DEBUG] Features to start: {[f['id'] for f in features_to_start]}", flush=True)
logger.debug("Features to start: %s", [f['id'] for f in features_to_start])
debug_log.log("SPAWN", "Starting features batch",
ready_count=len(ready),
@@ -1138,23 +1289,24 @@ class ParallelOrchestrator:
features_to_start=[f['id'] for f in features_to_start])
for i, feature in enumerate(features_to_start):
print(f"[DEBUG] Starting feature {i+1}/{len(features_to_start)}: #{feature['id']} - {feature['name']}", flush=True)
logger.debug("Starting feature %d/%d: #%d - %s",
i + 1, len(features_to_start), feature['id'], feature['name'])
success, msg = self.start_feature(feature["id"])
if not success:
print(f"[DEBUG] Failed to start feature #{feature['id']}: {msg}", flush=True)
logger.debug("Failed to start feature #%d: %s", feature['id'], msg)
debug_log.log("SPAWN", f"FAILED to start feature #{feature['id']}",
feature_name=feature['name'],
error=msg)
else:
print(f"[DEBUG] Successfully started feature #{feature['id']}", flush=True)
logger.debug("Successfully started feature #%d", feature['id'])
with self._lock:
running_count = len(self.running_coding_agents)
print(f"[DEBUG] Running coding agents after start: {running_count}", flush=True)
logger.debug("Running coding agents after start: %d", running_count)
debug_log.log("SPAWN", f"Successfully started feature #{feature['id']}",
feature_name=feature['name'],
running_coding_agents=running_count)
await asyncio.sleep(2) # Brief pause between starts
await asyncio.sleep(0.5) # Brief delay for subprocess to claim feature before re-querying
except Exception as e:
print(f"Orchestrator error: {e}", flush=True)
@@ -1223,6 +1375,7 @@ async def run_parallel_orchestrator(
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
testing_batch_size: int = DEFAULT_TESTING_BATCH_SIZE,
) -> None:
"""Run the unified orchestrator.
@@ -1232,6 +1385,7 @@ async def run_parallel_orchestrator(
model: Claude model to use
yolo_mode: Whether to run in YOLO mode (skip testing agents)
testing_agent_ratio: Number of regression agents to maintain (0-3)
testing_batch_size: Number of features per testing batch (1-5)
"""
print(f"[ORCHESTRATOR] run_parallel_orchestrator called with max_concurrency={max_concurrency}", flush=True)
orchestrator = ParallelOrchestrator(
@@ -1240,6 +1394,7 @@ async def run_parallel_orchestrator(
model=model,
yolo_mode=yolo_mode,
testing_agent_ratio=testing_agent_ratio,
testing_batch_size=testing_batch_size,
)
# Set up cleanup to run on exit (handles normal exit, exceptions)
@@ -1319,6 +1474,12 @@ def main():
default=1,
help="Number of regression testing agents (0-3, default: 1). Set to 0 to disable testing agents.",
)
parser.add_argument(
"--testing-batch-size",
type=int,
default=DEFAULT_TESTING_BATCH_SIZE,
help=f"Number of features per testing batch (1-5, default: {DEFAULT_TESTING_BATCH_SIZE})",
)
args = parser.parse_args()
@@ -1345,6 +1506,7 @@ def main():
model=args.model,
yolo_mode=args.yolo,
testing_agent_ratio=args.testing_agent_ratio,
testing_batch_size=args.testing_batch_size,
))
except KeyboardInterrupt:
print("\n\nInterrupted by user", flush=True)