refactor: optimize token usage, deduplicate code, fix bugs across agents

Token reduction (~40% per session, ~2.3M fewer tokens per 200-feature project):
- Agent-type-specific tool lists: coding 9, testing 5, init 5 (was 19 for all; see the sketch after this list)
- Right-sized max_turns: coding 300, testing 100 (was 1000 for all)
- Trimmed coding prompt template (~150 lines removed)
- Streamlined testing prompt with batch support
- YOLO mode now strips browser-testing instructions from the prompt
- Added Grep, WebFetch, WebSearch to the expand-project session
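
A rough sketch of how the per-agent-type tool lists and turn caps fit together; apart from Grep, WebFetch, and WebSearch, the tool names and constant names below are illustrative, not the actual configuration:

    # Hypothetical sketch -- constant names and most tool names are illustrative.
    AGENT_TOOLS = {
        "coding":  ["Read", "Write", "Edit", "Bash", "Grep", "Glob", "WebFetch", "WebSearch", "TodoWrite"],  # 9
        "testing": ["Read", "Bash", "Grep", "Glob", "TodoWrite"],                                            # 5
        "init":    ["Read", "Write", "Bash", "Grep", "Glob"],                                                # 5
    }
    AGENT_MAX_TURNS = {"coding": 300, "testing": 100}  # was 1000 for every agent type

    def session_options(agent_type: str) -> dict:
        """Return the reduced tool list and turn cap for one agent type."""
        return {
            "allowed_tools": AGENT_TOOLS.get(agent_type, AGENT_TOOLS["coding"]),
            "max_turns": AGENT_MAX_TURNS.get(agent_type, 300),
        }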

Performance improvements:
- Rate limit retries start at ~15s with jitter (was a fixed 60s; see the sketch after this list)
- Post-spawn delay reduced to 0.5s (was 2s)
- Orchestrator consolidated to 1 DB query per loop (was 5-7)
- Testing agents batch 3 features per session (was 1)
- Smart context compaction preserves critical state and discards noise
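
A minimal sketch of the retry delay described above, assuming a plain exponential schedule starting near 15s with random jitter (the real retry wrapper may differ in shape and cap):

    import random

    def rate_limit_delay(attempt: int, base: float = 15.0, cap: float = 120.0) -> float:
        """Exponential backoff starting near 15s with +/-20% jitter (replaces a fixed 60s sleep)."""
        delay = min(base * (2 ** attempt), cap)
        return delay * random.uniform(0.8, 1.2)  # jitter spreads concurrent agents apart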

Bug fixes:
- Removed ghost feature_release_testing MCP tool (it wasted tokens in every test session)
- Forwarded all 9 Vertex AI env vars to chat sessions (3 were previously missing)
- Fixed DetachedInstanceError risk in test-batch ORM access (pattern shown after this list)
- Prevented duplicate testing of the same features in parallel mode
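
The DetachedInstanceError fix follows the pattern visible in the diff below: copy the needed fields out of ORM rows into plain dicts before session.close(), then only touch the dicts. A condensed sketch (the helper name is illustrative):

    from api.database import Feature  # model used throughout the orchestrator diff below

    def passing_feature_data(session) -> list[dict]:
        """Snapshot passing features as plain dicts so nothing reads ORM attributes
        after the session is closed (which would raise DetachedInstanceError)."""
        try:
            rows = session.query(Feature).filter(Feature.passes == True).all()
            return [{"id": f.id, "dependencies": f.get_dependencies_safe()} for f in rows]
        finally:
            session.close()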

Code deduplication:
- _get_project_path(): 9 copies -> 1 shared utility (project_helpers.py; sketch after this list)
- validate_project_name(): 9 copies -> 2 variants in 1 file (validation.py)
- ROOT_DIR: 10 copies -> 1 definition (chat_constants.py)
- API_ENV_VARS: 4 copies -> 1 source of truth (env_constants.py)
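
A hedged sketch of the consolidated helper; the module and function names come from the list above, but the body and root location are illustrative, not the actual implementation:

    # project_helpers.py -- single shared copy replacing 9 per-module duplicates
    from pathlib import Path

    PROJECTS_ROOT = Path(__file__).resolve().parent / "projects"  # hypothetical root

    def _get_project_path(project_name: str) -> Path:
        """Resolve a project directory from its name (previously copy-pasted in 9 files)."""
        return PROJECTS_ROOT / project_name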

Security hardening:
- Unified the sensitive-directory blocklist (14 dirs; previously two divergent lists)
- Cached get_blocked_paths() for O(1) directory-listing checks (see the sketch after this list)
- Terminal security warning when ALLOW_REMOTE=1 exposes the WebSocket
- 20 new security tests for EXTRA_READ_PATHS blocking
- Extracted _validate_command_list() and _validate_pkill_processes() helpers
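
The cached blocklist check could plausibly be as small as an lru_cache over the unified set; only a few of the 14 directories are shown, so both the list and the helper bodies are illustrative:

    from functools import lru_cache
    from pathlib import Path

    SENSITIVE_DIRS = frozenset({".ssh", ".aws", ".gnupg", ".config"})  # illustrative subset of the 14

    @lru_cache(maxsize=1)
    def get_blocked_paths() -> frozenset[str]:
        """Computed once and reused, so directory-listing checks are O(1) set lookups."""
        return SENSITIVE_DIRS

    def is_blocked(path: Path) -> bool:
        return any(part in get_blocked_paths() for part in path.parts)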

Type safety:
- 87 mypy errors -> 0 across 58 source files
- Installed types-PyYAML for proper yaml stub types
- Fixed SQLAlchemy Column[T] type errors by coercing to plain Python types across all routers (illustrative example below)
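
The Column[T] errors are usually silenced by coercing ORM attribute values to plain Python types at the boundary; a representative, illustrative example:

    from api.database import Feature  # model referenced throughout the diff below

    def feature_summary(feature: Feature) -> tuple[int, bool]:
        """Coerce Column-typed attributes to plain int/bool before they reach typed code,
        the usual fix for mypy's Column[T] complaints in the routers."""
        return int(feature.id), bool(feature.passes)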

Dead code removed:
- 13 files deleted (~2,679 lines): unused UI components, debug logs, outdated docs
- 7 unused npm packages removed (Radix UI components with 0 imports)
- AgentAvatar.tsx reduced from 615 -> 119 lines (SVGs extracted to mascotData.tsx)

New CLI options:
- --testing-batch-size (1-5) for parallel-mode test batching (how the two flags interact is sketched below)
- --testing-feature-ids for direct multi-feature testing
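
How the two options relate, sketched from the orchestrator diff below (values are illustrative): --testing-batch-size is clamped to 1-5, and each spawned testing agent receives its batch as a comma-separated --testing-feature-ids value.

    batch_size = min(max(3, 1), 5)                # --testing-batch-size, clamped as in __init__
    batch = [12, 17, 23][:batch_size]             # IDs picked by _get_test_batch()
    testing_feature_ids = ",".join(str(i) for i in batch)  # forwarded as --testing-feature-ids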

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Auto
Date: 2026-02-01 13:16:24 +02:00
Parent: dc5bcc4ae9
Commit: 94e0b05cb1
57 changed files with 1974 additions and 4300 deletions


@@ -20,6 +20,7 @@ Usage:
import asyncio
import atexit
import logging
import os
import signal
import subprocess
@@ -27,7 +28,7 @@ import sys
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Literal
from typing import Any, Callable, Literal
from sqlalchemy import text
@@ -36,6 +37,8 @@ from api.dependency_resolver import are_dependencies_satisfied, compute_scheduli
from progress import has_features
from server.utils.process_utils import kill_process_tree
logger = logging.getLogger(__name__)
# Root directory of autocoder (where this script and autonomous_agent_demo.py live)
AUTOCODER_ROOT = Path(__file__).parent.resolve()
@@ -83,23 +86,25 @@ class DebugLogger:
debug_log = DebugLogger()
def _dump_database_state(session, label: str = ""):
"""Helper to dump full database state to debug log."""
from api.database import Feature
all_features = session.query(Feature).all()
def _dump_database_state(feature_dicts: list[dict], label: str = ""):
"""Helper to dump full database state to debug log.
passing = [f for f in all_features if f.passes]
in_progress = [f for f in all_features if f.in_progress and not f.passes]
pending = [f for f in all_features if not f.passes and not f.in_progress]
Args:
feature_dicts: Pre-fetched list of feature dicts.
label: Optional label for the dump entry.
"""
passing = [f for f in feature_dicts if f.get("passes")]
in_progress = [f for f in feature_dicts if f.get("in_progress") and not f.get("passes")]
pending = [f for f in feature_dicts if not f.get("passes") and not f.get("in_progress")]
debug_log.log("DB_DUMP", f"Full database state {label}",
total_features=len(all_features),
total_features=len(feature_dicts),
passing_count=len(passing),
passing_ids=[f.id for f in passing],
passing_ids=[f["id"] for f in passing],
in_progress_count=len(in_progress),
in_progress_ids=[f.id for f in in_progress],
in_progress_ids=[f["id"] for f in in_progress],
pending_count=len(pending),
pending_ids=[f.id for f in pending[:10]]) # First 10 pending only
pending_ids=[f["id"] for f in pending[:10]]) # First 10 pending only
# =============================================================================
# Process Limits
@@ -125,6 +130,7 @@ def _dump_database_state(session, label: str = ""):
MAX_PARALLEL_AGENTS = 5
MAX_TOTAL_AGENTS = 10
DEFAULT_CONCURRENCY = 3
DEFAULT_TESTING_BATCH_SIZE = 3 # Number of features per testing batch (1-5)
POLL_INTERVAL = 5 # seconds between checking for ready features
MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature
INITIALIZER_TIMEOUT = 1800 # 30 minutes timeout for initializer
@@ -146,6 +152,7 @@ class ParallelOrchestrator:
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
testing_batch_size: int = DEFAULT_TESTING_BATCH_SIZE,
on_output: Callable[[int, str], None] | None = None,
on_status: Callable[[int, str], None] | None = None,
):
@@ -159,6 +166,8 @@ class ParallelOrchestrator:
yolo_mode: Whether to run in YOLO mode (skip testing agents entirely)
testing_agent_ratio: Number of regression testing agents to maintain (0-3).
0 = disabled, 1-3 = maintain that many testing agents running independently.
testing_batch_size: Number of features to include per testing session (1-5).
Each testing agent receives this many features to regression test.
on_output: Callback for agent output (feature_id, line)
on_status: Callback for agent status changes (feature_id, status)
"""
@@ -167,6 +176,7 @@ class ParallelOrchestrator:
self.model = model
self.yolo_mode = yolo_mode
self.testing_agent_ratio = min(max(testing_agent_ratio, 0), 3) # Clamp 0-3
self.testing_batch_size = min(max(testing_batch_size, 1), 5) # Clamp 1-5
self.on_output = on_output
self.on_status = on_status
@@ -186,6 +196,10 @@ class ParallelOrchestrator:
# Track feature failures to prevent infinite retry loops
self._failure_counts: dict[int, int] = {}
# Track recently tested feature IDs to avoid redundant re-testing.
# Cleared when all passing features have been covered at least once.
self._recently_tested: set[int] = set()
# Shutdown flag for async-safe signal handling
# Signal handlers only set this flag; cleanup happens in the main loop
self._shutdown_requested = False
@@ -213,6 +227,9 @@ class ParallelOrchestrator:
This simplifies the architecture by removing unnecessary coordination.
Returns the feature ID if available, None if no passing features exist.
Note: Prefer _get_test_batch() for batch testing mode. This method is
retained for backward compatibility.
"""
from sqlalchemy.sql.expression import func
@@ -231,164 +248,282 @@ class ParallelOrchestrator:
finally:
session.close()
def get_resumable_features(self) -> list[dict]:
def _get_test_batch(self, batch_size: int = 3) -> list[int]:
"""Select a prioritized batch of passing features for regression testing.
Uses weighted scoring to prioritize features that:
1. Haven't been tested recently in this orchestrator session
2. Are depended on by many other features (higher impact if broken)
3. Have more dependencies themselves (complex integration points)
When all passing features have been recently tested, the tracking set
is cleared so the cycle starts fresh.
Args:
batch_size: Maximum number of feature IDs to return (1-5).
Returns:
List of feature IDs to test, may be shorter than batch_size if
fewer passing features are available. Empty list if none available.
"""
session = self.get_session()
try:
session.expire_all()
passing = (
session.query(Feature)
.filter(Feature.passes == True)
.filter(Feature.in_progress == False) # Don't test while coding
.all()
)
# Extract data from ORM objects before closing the session to avoid
# DetachedInstanceError when accessing attributes after session.close().
passing_data: list[dict] = []
for f in passing:
passing_data.append({
'id': f.id,
'dependencies': f.get_dependencies_safe() if hasattr(f, 'get_dependencies_safe') else [],
})
finally:
session.close()
if not passing_data:
return []
# Build a reverse dependency map: feature_id -> count of features that depend on it.
# The Feature model stores dependencies (what I depend ON), so we invert to find
# dependents (what depends ON me).
dependent_counts: dict[int, int] = {}
for fd in passing_data:
for dep_id in fd['dependencies']:
dependent_counts[dep_id] = dependent_counts.get(dep_id, 0) + 1
# Exclude features that are already being tested by running testing agents
# to avoid redundant concurrent testing of the same features.
# running_testing_agents is dict[pid, (primary_feature_id, process)]
with self._lock:
currently_testing_ids: set[int] = set()
for _pid, (feat_id, _proc) in self.running_testing_agents.items():
currently_testing_ids.add(feat_id)
# If all passing features have been recently tested, reset the tracker
# so we cycle through them again rather than returning empty batches.
passing_ids = {fd['id'] for fd in passing_data}
if passing_ids.issubset(self._recently_tested):
self._recently_tested.clear()
# Score each feature by testing priority
scored: list[tuple[int, int]] = []
for fd in passing_data:
f_id = fd['id']
# Skip features already being tested by a running testing agent
if f_id in currently_testing_ids:
continue
score = 0
# Weight 1: Features depended on by many others are higher impact
# if they regress, so test them more often
score += dependent_counts.get(f_id, 0) * 2
# Weight 2: Strongly prefer features not tested recently
if f_id not in self._recently_tested:
score += 5
# Weight 3: Features with more dependencies are integration points
# that are more likely to regress when other code changes
dep_count = len(fd['dependencies'])
score += min(dep_count, 3) # Cap at 3 to avoid over-weighting
scored.append((f_id, score))
# Sort by score descending (highest priority first)
scored.sort(key=lambda x: x[1], reverse=True)
selected = [fid for fid, _ in scored[:batch_size]]
# Track what we've tested to avoid re-testing the same features next batch
self._recently_tested.update(selected)
debug_log.log("TEST_BATCH", f"Selected {len(selected)} features for testing batch",
selected_ids=selected,
recently_tested_count=len(self._recently_tested),
total_passing=len(passing_data))
return selected
def get_resumable_features(
self,
feature_dicts: list[dict] | None = None,
scheduling_scores: dict[int, float] | None = None,
) -> list[dict]:
"""Get features that were left in_progress from a previous session.
These are features where in_progress=True but passes=False, and they're
not currently being worked on by this orchestrator. This handles the case
where a previous session was interrupted before completing the feature.
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
scheduling_scores: Pre-computed scheduling scores. If None, computed from feature_dicts.
"""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()
# Find features that are in_progress but not complete
stale = session.query(Feature).filter(
Feature.in_progress == True,
Feature.passes == False
).all()
# Snapshot running IDs once to avoid acquiring lock per feature
with self._lock:
running_ids = set(self.running_coding_agents.keys())
resumable = []
for f in stale:
# Skip if already running in this orchestrator instance
with self._lock:
if f.id in self.running_coding_agents:
continue
# Skip if feature has failed too many times
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
continue
resumable.append(f.to_dict())
resumable = []
for fd in feature_dicts:
if not fd.get("in_progress") or fd.get("passes"):
continue
# Skip if already running in this orchestrator instance
if fd["id"] in running_ids:
continue
# Skip if feature has failed too many times
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
continue
resumable.append(fd)
# Sort by scheduling score (higher = first), then priority, then id
all_dicts = [f.to_dict() for f in session.query(Feature).all()]
scores = compute_scheduling_scores(all_dicts)
resumable.sort(key=lambda f: (-scores.get(f["id"], 0), f["priority"], f["id"]))
return resumable
finally:
session.close()
# Sort by scheduling score (higher = first), then priority, then id
if scheduling_scores is None:
scheduling_scores = compute_scheduling_scores(feature_dicts)
resumable.sort(key=lambda f: (-scheduling_scores.get(f["id"], 0), f["priority"], f["id"]))
return resumable
def get_ready_features(self) -> list[dict]:
"""Get features with satisfied dependencies, not already running."""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
def get_ready_features(
self,
feature_dicts: list[dict] | None = None,
scheduling_scores: dict[int, float] | None = None,
) -> list[dict]:
"""Get features with satisfied dependencies, not already running.
all_features = session.query(Feature).all()
all_dicts = [f.to_dict() for f in all_features]
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
scheduling_scores: Pre-computed scheduling scores. If None, computed from feature_dicts.
"""
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()
# Pre-compute passing_ids once to avoid O(n^2) in the loop
passing_ids = {f.id for f in all_features if f.passes}
# Pre-compute passing_ids once to avoid O(n^2) in the loop
passing_ids = {fd["id"] for fd in feature_dicts if fd.get("passes")}
ready = []
skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
for f in all_features:
if f.passes:
skipped_reasons["passes"] += 1
continue
if f.in_progress:
skipped_reasons["in_progress"] += 1
continue
# Skip if already running in this orchestrator
with self._lock:
if f.id in self.running_coding_agents:
skipped_reasons["running"] += 1
continue
# Skip if feature has failed too many times
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
skipped_reasons["failed"] += 1
continue
# Check dependencies (pass pre-computed passing_ids)
if are_dependencies_satisfied(f.to_dict(), all_dicts, passing_ids):
ready.append(f.to_dict())
else:
skipped_reasons["deps"] += 1
# Snapshot running IDs once to avoid acquiring lock per feature
with self._lock:
running_ids = set(self.running_coding_agents.keys())
# Sort by scheduling score (higher = first), then priority, then id
scores = compute_scheduling_scores(all_dicts)
ready.sort(key=lambda f: (-scores.get(f["id"], 0), f["priority"], f["id"]))
ready = []
skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
for fd in feature_dicts:
if fd.get("passes"):
skipped_reasons["passes"] += 1
continue
if fd.get("in_progress"):
skipped_reasons["in_progress"] += 1
continue
# Skip if already running in this orchestrator
if fd["id"] in running_ids:
skipped_reasons["running"] += 1
continue
# Skip if feature has failed too many times
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
skipped_reasons["failed"] += 1
continue
# Check dependencies (pass pre-computed passing_ids)
if are_dependencies_satisfied(fd, feature_dicts, passing_ids):
ready.append(fd)
else:
skipped_reasons["deps"] += 1
# Debug logging
passing = sum(1 for f in all_features if f.passes)
in_progress = sum(1 for f in all_features if f.in_progress and not f.passes)
print(
f"[DEBUG] get_ready_features: {len(ready)} ready, "
f"{passing} passing, {in_progress} in_progress, {len(all_features)} total",
flush=True
)
print(
f"[DEBUG] Skipped: {skipped_reasons['passes']} passing, {skipped_reasons['in_progress']} in_progress, "
f"{skipped_reasons['running']} running, {skipped_reasons['failed']} failed, {skipped_reasons['deps']} blocked by deps",
flush=True
)
# Sort by scheduling score (higher = first), then priority, then id
if scheduling_scores is None:
scheduling_scores = compute_scheduling_scores(feature_dicts)
ready.sort(key=lambda f: (-scheduling_scores.get(f["id"], 0), f["priority"], f["id"]))
# Log to debug file (but not every call to avoid spam)
debug_log.log("READY", "get_ready_features() called",
ready_count=len(ready),
ready_ids=[f['id'] for f in ready[:5]], # First 5 only
passing=passing,
in_progress=in_progress,
total=len(all_features),
skipped=skipped_reasons)
# Summary counts for logging
passing = skipped_reasons["passes"]
in_progress = skipped_reasons["in_progress"]
total = len(feature_dicts)
return ready
finally:
session.close()
debug_log.log("READY", "get_ready_features() called",
ready_count=len(ready),
ready_ids=[f['id'] for f in ready[:5]], # First 5 only
passing=passing,
in_progress=in_progress,
total=total,
skipped=skipped_reasons)
def get_all_complete(self) -> bool:
return ready
def get_all_complete(self, feature_dicts: list[dict] | None = None) -> bool:
"""Check if all features are complete or permanently failed.
Returns False if there are no features (initialization needed).
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()
all_features = session.query(Feature).all()
# No features = NOT complete, need initialization
if len(feature_dicts) == 0:
return False
# No features = NOT complete, need initialization
if len(all_features) == 0:
return False
passing_count = 0
failed_count = 0
pending_count = 0
for fd in feature_dicts:
if fd.get("passes"):
passing_count += 1
continue # Completed successfully
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
failed_count += 1
continue # Permanently failed, count as "done"
pending_count += 1
passing_count = 0
failed_count = 0
pending_count = 0
for f in all_features:
if f.passes:
passing_count += 1
continue # Completed successfully
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
failed_count += 1
continue # Permanently failed, count as "done"
pending_count += 1
total = len(feature_dicts)
is_complete = pending_count == 0
debug_log.log("COMPLETE_CHECK", f"get_all_complete: {passing_count}/{total} passing, "
f"{failed_count} failed, {pending_count} pending -> {is_complete}")
return is_complete
total = len(all_features)
is_complete = pending_count == 0
print(
f"[DEBUG] get_all_complete: {passing_count}/{total} passing, "
f"{failed_count} failed, {pending_count} pending -> {is_complete}",
flush=True
)
return is_complete
finally:
session.close()
def get_passing_count(self, feature_dicts: list[dict] | None = None) -> int:
"""Get the number of passing features.
def get_passing_count(self) -> int:
"""Get the number of passing features."""
session = self.get_session()
try:
session.expire_all()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
return sum(1 for fd in feature_dicts if fd.get("passes"))
def _maintain_testing_agents(self) -> None:
def _maintain_testing_agents(self, feature_dicts: list[dict] | None = None) -> None:
"""Maintain the desired count of testing agents independently.
This runs every loop iteration and spawns testing agents as needed to maintain
@@ -402,18 +537,21 @@ class ParallelOrchestrator:
- YOLO mode is enabled
- testing_agent_ratio is 0
- No passing features exist yet
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
# Skip if testing is disabled
if self.yolo_mode or self.testing_agent_ratio == 0:
return
# No testing until there are passing features
passing_count = self.get_passing_count()
passing_count = self.get_passing_count(feature_dicts)
if passing_count == 0:
return
# Don't spawn testing agents if all features are already complete
if self.get_all_complete():
if self.get_all_complete(feature_dicts):
return
# Spawn testing agents one at a time, re-checking limits each time
@@ -439,7 +577,7 @@ class ParallelOrchestrator:
passing_count=passing_count)
# Spawn outside lock (I/O bound operation)
print(f"[DEBUG] Spawning testing agent ({spawn_index}/{desired})", flush=True)
logger.debug("Spawning testing agent (%d/%d)", spawn_index, desired)
success, msg = self._spawn_testing_agent()
if not success:
debug_log.log("TESTING", f"Spawn failed, stopping: {msg}")
@@ -521,7 +659,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -565,11 +703,14 @@ class ParallelOrchestrator:
return True, f"Started feature {feature_id}"
def _spawn_testing_agent(self) -> tuple[bool, str]:
"""Spawn a testing agent subprocess for regression testing.
"""Spawn a testing agent subprocess for batch regression testing.
Picks a random passing feature to test. Multiple testing agents can test
the same feature concurrently - this is intentional and simplifies the
architecture by removing claim coordination.
Selects a prioritized batch of passing features using weighted scoring
(via _get_test_batch) and passes them as --testing-feature-ids to the
subprocess. Falls back to single --testing-feature-id for batches of one.
Multiple testing agents can test the same feature concurrently - this is
intentional and simplifies the architecture by removing claim coordination.
"""
# Check limits first (under lock)
with self._lock:
@@ -582,13 +723,16 @@ class ParallelOrchestrator:
debug_log.log("TESTING", f"Skipped spawn - at max total agents ({total_agents}/{MAX_TOTAL_AGENTS})")
return False, f"At max total agents ({total_agents})"
# Pick a random passing feature (no claim needed - concurrent testing is fine)
feature_id = self._get_random_passing_feature()
if feature_id is None:
# Select a weighted batch of passing features for regression testing
batch = self._get_test_batch(self.testing_batch_size)
if not batch:
debug_log.log("TESTING", "No features available for testing")
return False, "No features available for testing"
debug_log.log("TESTING", f"Selected feature #{feature_id} for testing")
# Use the first feature ID as the representative for logging/tracking
primary_feature_id = batch[0]
batch_str = ",".join(str(fid) for fid in batch)
debug_log.log("TESTING", f"Selected batch for testing: [{batch_str}]")
# Spawn the testing agent
with self._lock:
@@ -604,7 +748,7 @@ class ParallelOrchestrator:
"--project-dir", str(self.project_dir),
"--max-iterations", "1",
"--agent-type", "testing",
"--testing-feature-id", str(feature_id),
"--testing-feature-ids", batch_str,
]
if self.model:
cmd.extend(["--model", self.model])
@@ -613,7 +757,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -633,22 +777,22 @@ class ParallelOrchestrator:
# Register process by PID (not feature_id) to avoid overwrites
# when multiple agents test the same feature
self.running_testing_agents[proc.pid] = (feature_id, proc)
self.running_testing_agents[proc.pid] = (primary_feature_id, proc)
testing_count = len(self.running_testing_agents)
# Start output reader thread with feature ID (same as coding agents)
# Start output reader thread with primary feature ID for log attribution
threading.Thread(
target=self._read_output,
args=(feature_id, proc, threading.Event(), "testing"),
args=(primary_feature_id, proc, threading.Event(), "testing"),
daemon=True
).start()
print(f"Started testing agent for feature #{feature_id} (PID {proc.pid})", flush=True)
debug_log.log("TESTING", f"Successfully spawned testing agent for feature #{feature_id}",
print(f"Started testing agent for features [{batch_str}] (PID {proc.pid})", flush=True)
debug_log.log("TESTING", f"Successfully spawned testing agent for batch [{batch_str}]",
pid=proc.pid,
feature_id=feature_id,
feature_ids=batch,
total_testing_agents=testing_count)
return True, f"Started testing agent for feature #{feature_id}"
return True, f"Started testing agent for features [{batch_str}]"
async def _run_initializer(self) -> bool:
"""Run initializer agent as blocking subprocess.
@@ -674,7 +818,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -1000,16 +1144,15 @@ class ParallelOrchestrator:
# newly created features.
debug_log.section("INITIALIZATION COMPLETE")
debug_log.log("INIT", "Disposing old database engine and creating fresh connection")
print("[DEBUG] Recreating database connection after initialization...", flush=True)
logger.debug("Recreating database connection after initialization")
if self._engine is not None:
self._engine.dispose()
self._engine, self._session_maker = create_database(self.project_dir)
# Debug: Show state immediately after initialization
print("[DEBUG] Post-initialization state check:", flush=True)
print(f"[DEBUG] max_concurrency={self.max_concurrency}", flush=True)
print(f"[DEBUG] yolo_mode={self.yolo_mode}", flush=True)
print(f"[DEBUG] testing_agent_ratio={self.testing_agent_ratio}", flush=True)
logger.debug("Post-initialization state check")
logger.debug("Post-initialization state: max_concurrency=%d, yolo_mode=%s, testing_agent_ratio=%d",
self.max_concurrency, self.yolo_mode, self.testing_agent_ratio)
# Verify features were created and are visible
session = self.get_session()
@@ -1017,7 +1160,7 @@ class ParallelOrchestrator:
feature_count = session.query(Feature).count()
all_features = session.query(Feature).all()
feature_names = [f"{f.id}: {f.name}" for f in all_features[:10]]
print(f"[DEBUG] features in database={feature_count}", flush=True)
logger.debug("Features in database: %d", feature_count)
debug_log.log("INIT", "Post-initialization database state",
max_concurrency=self.max_concurrency,
yolo_mode=self.yolo_mode,
@@ -1041,7 +1184,18 @@ class ParallelOrchestrator:
while self.is_running and not self._shutdown_requested:
loop_iteration += 1
if loop_iteration <= 3:
print(f"[DEBUG] === Loop iteration {loop_iteration} ===", flush=True)
logger.debug("=== Loop iteration %d ===", loop_iteration)
# Query all features ONCE per iteration and build reusable snapshot.
# Every sub-method receives this snapshot instead of re-querying the DB.
session = self.get_session()
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
session.close()
# Pre-compute scheduling scores once (BFS + reverse topo sort)
scheduling_scores = compute_scheduling_scores(feature_dicts)
# Log every iteration to debug file (first 10, then every 5th)
if loop_iteration <= 10 or loop_iteration % 5 == 0:
@@ -1055,20 +1209,16 @@ class ParallelOrchestrator:
# Full database dump every 5 iterations
if loop_iteration == 1 or loop_iteration % 5 == 0:
session = self.get_session()
try:
_dump_database_state(session, f"(iteration {loop_iteration})")
finally:
session.close()
_dump_database_state(feature_dicts, f"(iteration {loop_iteration})")
try:
# Check if all complete
if self.get_all_complete():
if self.get_all_complete(feature_dicts):
print("\nAll features complete!", flush=True)
break
# Maintain testing agents independently (runs every iteration)
self._maintain_testing_agents()
self._maintain_testing_agents(feature_dicts)
# Check capacity
with self._lock:
@@ -1089,17 +1239,17 @@ class ParallelOrchestrator:
continue
# Priority 1: Resume features from previous session
resumable = self.get_resumable_features()
resumable = self.get_resumable_features(feature_dicts, scheduling_scores)
if resumable:
slots = self.max_concurrency - current
for feature in resumable[:slots]:
print(f"Resuming feature #{feature['id']}: {feature['name']}", flush=True)
self.start_feature(feature["id"], resume=True)
await asyncio.sleep(2)
await asyncio.sleep(0.5) # Brief delay for subprocess to claim feature before re-querying
continue
# Priority 2: Start new ready features
ready = self.get_ready_features()
ready = self.get_ready_features(feature_dicts, scheduling_scores)
if not ready:
# Wait for running features to complete
if current > 0:
@@ -1112,11 +1262,12 @@ class ParallelOrchestrator:
session = self.get_session()
try:
session.expire_all()
fresh_dicts = [f.to_dict() for f in session.query(Feature).all()]
finally:
session.close()
# Recheck if all features are now complete
if self.get_all_complete():
if self.get_all_complete(fresh_dicts):
print("\nAll features complete!", flush=True)
break
@@ -1127,10 +1278,10 @@ class ParallelOrchestrator:
# Start features up to capacity
slots = self.max_concurrency - current
print(f"[DEBUG] Spawning loop: {len(ready)} ready, {slots} slots available, max_concurrency={self.max_concurrency}", flush=True)
print(f"[DEBUG] Will attempt to start {min(len(ready), slots)} features", flush=True)
logger.debug("Spawning loop: %d ready, %d slots available, max_concurrency=%d",
len(ready), slots, self.max_concurrency)
features_to_start = ready[:slots]
print(f"[DEBUG] Features to start: {[f['id'] for f in features_to_start]}", flush=True)
logger.debug("Features to start: %s", [f['id'] for f in features_to_start])
debug_log.log("SPAWN", "Starting features batch",
ready_count=len(ready),
@@ -1138,23 +1289,24 @@ class ParallelOrchestrator:
features_to_start=[f['id'] for f in features_to_start])
for i, feature in enumerate(features_to_start):
print(f"[DEBUG] Starting feature {i+1}/{len(features_to_start)}: #{feature['id']} - {feature['name']}", flush=True)
logger.debug("Starting feature %d/%d: #%d - %s",
i + 1, len(features_to_start), feature['id'], feature['name'])
success, msg = self.start_feature(feature["id"])
if not success:
print(f"[DEBUG] Failed to start feature #{feature['id']}: {msg}", flush=True)
logger.debug("Failed to start feature #%d: %s", feature['id'], msg)
debug_log.log("SPAWN", f"FAILED to start feature #{feature['id']}",
feature_name=feature['name'],
error=msg)
else:
print(f"[DEBUG] Successfully started feature #{feature['id']}", flush=True)
logger.debug("Successfully started feature #%d", feature['id'])
with self._lock:
running_count = len(self.running_coding_agents)
print(f"[DEBUG] Running coding agents after start: {running_count}", flush=True)
logger.debug("Running coding agents after start: %d", running_count)
debug_log.log("SPAWN", f"Successfully started feature #{feature['id']}",
feature_name=feature['name'],
running_coding_agents=running_count)
await asyncio.sleep(2) # Brief pause between starts
await asyncio.sleep(0.5) # Brief delay for subprocess to claim feature before re-querying
except Exception as e:
print(f"Orchestrator error: {e}", flush=True)
@@ -1223,6 +1375,7 @@ async def run_parallel_orchestrator(
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
testing_batch_size: int = DEFAULT_TESTING_BATCH_SIZE,
) -> None:
"""Run the unified orchestrator.
@@ -1232,6 +1385,7 @@ async def run_parallel_orchestrator(
model: Claude model to use
yolo_mode: Whether to run in YOLO mode (skip testing agents)
testing_agent_ratio: Number of regression agents to maintain (0-3)
testing_batch_size: Number of features per testing batch (1-5)
"""
print(f"[ORCHESTRATOR] run_parallel_orchestrator called with max_concurrency={max_concurrency}", flush=True)
orchestrator = ParallelOrchestrator(
@@ -1240,6 +1394,7 @@ async def run_parallel_orchestrator(
model=model,
yolo_mode=yolo_mode,
testing_agent_ratio=testing_agent_ratio,
testing_batch_size=testing_batch_size,
)
# Set up cleanup to run on exit (handles normal exit, exceptions)
@@ -1319,6 +1474,12 @@ def main():
default=1,
help="Number of regression testing agents (0-3, default: 1). Set to 0 to disable testing agents.",
)
parser.add_argument(
"--testing-batch-size",
type=int,
default=DEFAULT_TESTING_BATCH_SIZE,
help=f"Number of features per testing batch (1-5, default: {DEFAULT_TESTING_BATCH_SIZE})",
)
args = parser.parse_args()
@@ -1345,6 +1506,7 @@ def main():
model=args.model,
yolo_mode=args.yolo,
testing_agent_ratio=args.testing_agent_ratio,
testing_batch_size=args.testing_batch_size,
))
except KeyboardInterrupt:
print("\n\nInterrupted by user", flush=True)