Mirror of https://github.com/leonvanzyl/autocoder.git, synced 2026-02-02 23:33:35 +00:00
refactor: optimize token usage, deduplicate code, fix bugs across agents
Token reduction (~40% per session, ~2.3M fewer tokens per 200-feature project):
- Agent-type-specific tool lists: coding 9, testing 5, init 5 (was 19 for all)
- Right-sized max_turns: coding 300, testing 100 (was 1000 for all)
- Trimmed coding prompt template (~150 lines removed)
- Streamlined testing prompt with batch support
- YOLO mode now strips browser testing instructions from prompt
- Added Grep, WebFetch, WebSearch to expand project session

Performance improvements:
- Rate limit retries start at ~15s with jitter (was fixed 60s)
- Post-spawn delay reduced to 0.5s (was 2s)
- Orchestrator consolidated to 1 DB query per loop (was 5-7)
- Testing agents batch 3 features per session (was 1)
- Smart context compaction preserves critical state, discards noise

Bug fixes:
- Removed ghost feature_release_testing MCP tool (wasted tokens every test session)
- Forward all 9 Vertex AI env vars to chat sessions (was missing 3)
- Fix DetachedInstanceError risk in test batch ORM access
- Prevent duplicate testing of same features in parallel mode

Code deduplication:
- _get_project_path(): 9 copies -> 1 shared utility (project_helpers.py)
- validate_project_name(): 9 copies -> 2 variants in 1 file (validation.py)
- ROOT_DIR: 10 copies -> 1 definition (chat_constants.py)
- API_ENV_VARS: 4 copies -> 1 source of truth (env_constants.py)

Security hardening:
- Unified sensitive directory blocklist (14 dirs, was two divergent lists)
- Cached get_blocked_paths() for O(1) directory listing checks
- Terminal security warning when ALLOW_REMOTE=1 exposes WebSocket
- 20 new security tests for EXTRA_READ_PATHS blocking
- Extracted _validate_command_list() and _validate_pkill_processes() helpers

Type safety:
- 87 mypy errors -> 0 across 58 source files
- Installed types-PyYAML for proper yaml stub types
- Fixed SQLAlchemy Column[T] coercions across all routers

Dead code removed:
- 13 files deleted (~2,679 lines): unused UI components, debug logs, outdated docs
- 7 unused npm packages removed (Radix UI components with 0 imports)
- AgentAvatar.tsx reduced from 615 -> 119 lines (SVGs extracted to mascotData.tsx)

New CLI options:
- --testing-batch-size (1-5) for parallel mode test batching
- --testing-feature-ids for direct multi-feature testing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
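The rate-limit retry item above ("start at ~15s with jitter" instead of a fixed 60s) describes the common exponential-backoff-with-jitter pattern. A minimal sketch of that idea, not code from this commit; the function names, base delay, cap, and the RuntimeError stand-in are illustrative assumptions:

```python
import random
import time


def backoff_delay(attempt: int, base: float = 15.0, cap: float = 120.0) -> float:
    """Jittered exponential backoff: ~15s window on the first retry, doubling up to a cap."""
    window = min(cap, base * (2 ** attempt))
    # Jitter within the window so parallel agents don't retry in lockstep.
    return random.uniform(window / 2, window)


def call_with_retries(fn, max_attempts: int = 5):
    """Call fn(), sleeping a jittered delay between attempts on a rate-limit error."""
    for attempt in range(max_attempts):
        try:
            return fn()
        except RuntimeError:  # stand-in for the real rate-limit exception type
            if attempt == max_attempts - 1:
                raise
            time.sleep(backoff_delay(attempt))
```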
@@ -20,6 +20,7 @@ Usage:

import asyncio
import atexit
import logging
import os
import signal
import subprocess
@@ -27,7 +28,7 @@ import sys
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Literal
from typing import Any, Callable, Literal

from sqlalchemy import text

@@ -36,6 +37,8 @@ from api.dependency_resolver import are_dependencies_satisfied, compute_scheduli
from progress import has_features
from server.utils.process_utils import kill_process_tree

logger = logging.getLogger(__name__)

# Root directory of autocoder (where this script and autonomous_agent_demo.py live)
AUTOCODER_ROOT = Path(__file__).parent.resolve()
@@ -83,23 +86,25 @@ class DebugLogger:
debug_log = DebugLogger()


def _dump_database_state(session, label: str = ""):
"""Helper to dump full database state to debug log."""
from api.database import Feature
all_features = session.query(Feature).all()
def _dump_database_state(feature_dicts: list[dict], label: str = ""):
"""Helper to dump full database state to debug log.

passing = [f for f in all_features if f.passes]
in_progress = [f for f in all_features if f.in_progress and not f.passes]
pending = [f for f in all_features if not f.passes and not f.in_progress]
Args:
feature_dicts: Pre-fetched list of feature dicts.
label: Optional label for the dump entry.
"""
passing = [f for f in feature_dicts if f.get("passes")]
in_progress = [f for f in feature_dicts if f.get("in_progress") and not f.get("passes")]
pending = [f for f in feature_dicts if not f.get("passes") and not f.get("in_progress")]

debug_log.log("DB_DUMP", f"Full database state {label}",
total_features=len(all_features),
total_features=len(feature_dicts),
passing_count=len(passing),
passing_ids=[f.id for f in passing],
passing_ids=[f["id"] for f in passing],
in_progress_count=len(in_progress),
in_progress_ids=[f.id for f in in_progress],
in_progress_ids=[f["id"] for f in in_progress],
pending_count=len(pending),
pending_ids=[f.id for f in pending[:10]]) # First 10 pending only
pending_ids=[f["id"] for f in pending[:10]]) # First 10 pending only

# =============================================================================
# Process Limits
@@ -125,6 +130,7 @@ def _dump_database_state(session, label: str = ""):
MAX_PARALLEL_AGENTS = 5
MAX_TOTAL_AGENTS = 10
DEFAULT_CONCURRENCY = 3
DEFAULT_TESTING_BATCH_SIZE = 3 # Number of features per testing batch (1-5)
POLL_INTERVAL = 5 # seconds between checking for ready features
MAX_FEATURE_RETRIES = 3 # Maximum times to retry a failed feature
INITIALIZER_TIMEOUT = 1800 # 30 minutes timeout for initializer
@@ -146,6 +152,7 @@ class ParallelOrchestrator:
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
testing_batch_size: int = DEFAULT_TESTING_BATCH_SIZE,
on_output: Callable[[int, str], None] | None = None,
on_status: Callable[[int, str], None] | None = None,
):
@@ -159,6 +166,8 @@ class ParallelOrchestrator:
yolo_mode: Whether to run in YOLO mode (skip testing agents entirely)
testing_agent_ratio: Number of regression testing agents to maintain (0-3).
0 = disabled, 1-3 = maintain that many testing agents running independently.
testing_batch_size: Number of features to include per testing session (1-5).
Each testing agent receives this many features to regression test.
on_output: Callback for agent output (feature_id, line)
on_status: Callback for agent status changes (feature_id, status)
"""
@@ -167,6 +176,7 @@ class ParallelOrchestrator:
self.model = model
self.yolo_mode = yolo_mode
self.testing_agent_ratio = min(max(testing_agent_ratio, 0), 3) # Clamp 0-3
self.testing_batch_size = min(max(testing_batch_size, 1), 5) # Clamp 1-5
self.on_output = on_output
self.on_status = on_status

@@ -186,6 +196,10 @@ class ParallelOrchestrator:
# Track feature failures to prevent infinite retry loops
self._failure_counts: dict[int, int] = {}

# Track recently tested feature IDs to avoid redundant re-testing.
# Cleared when all passing features have been covered at least once.
self._recently_tested: set[int] = set()

# Shutdown flag for async-safe signal handling
# Signal handlers only set this flag; cleanup happens in the main loop
self._shutdown_requested = False
@@ -213,6 +227,9 @@ class ParallelOrchestrator:
This simplifies the architecture by removing unnecessary coordination.

Returns the feature ID if available, None if no passing features exist.

Note: Prefer _get_test_batch() for batch testing mode. This method is
retained for backward compatibility.
"""
from sqlalchemy.sql.expression import func
@@ -231,164 +248,282 @@ class ParallelOrchestrator:
finally:
session.close()

def get_resumable_features(self) -> list[dict]:
def _get_test_batch(self, batch_size: int = 3) -> list[int]:
"""Select a prioritized batch of passing features for regression testing.

Uses weighted scoring to prioritize features that:
1. Haven't been tested recently in this orchestrator session
2. Are depended on by many other features (higher impact if broken)
3. Have more dependencies themselves (complex integration points)

When all passing features have been recently tested, the tracking set
is cleared so the cycle starts fresh.

Args:
batch_size: Maximum number of feature IDs to return (1-5).

Returns:
List of feature IDs to test, may be shorter than batch_size if
fewer passing features are available. Empty list if none available.
"""
session = self.get_session()
try:
session.expire_all()
passing = (
session.query(Feature)
.filter(Feature.passes == True)
.filter(Feature.in_progress == False) # Don't test while coding
.all()
)

# Extract data from ORM objects before closing the session to avoid
# DetachedInstanceError when accessing attributes after session.close().
passing_data: list[dict] = []
for f in passing:
passing_data.append({
'id': f.id,
'dependencies': f.get_dependencies_safe() if hasattr(f, 'get_dependencies_safe') else [],
})
finally:
session.close()

if not passing_data:
return []

# Build a reverse dependency map: feature_id -> count of features that depend on it.
# The Feature model stores dependencies (what I depend ON), so we invert to find
# dependents (what depends ON me).
dependent_counts: dict[int, int] = {}
for fd in passing_data:
for dep_id in fd['dependencies']:
dependent_counts[dep_id] = dependent_counts.get(dep_id, 0) + 1

# Exclude features that are already being tested by running testing agents
# to avoid redundant concurrent testing of the same features.
# running_testing_agents is dict[pid, (primary_feature_id, process)]
with self._lock:
currently_testing_ids: set[int] = set()
for _pid, (feat_id, _proc) in self.running_testing_agents.items():
currently_testing_ids.add(feat_id)

# If all passing features have been recently tested, reset the tracker
# so we cycle through them again rather than returning empty batches.
passing_ids = {fd['id'] for fd in passing_data}
if passing_ids.issubset(self._recently_tested):
self._recently_tested.clear()

# Score each feature by testing priority
scored: list[tuple[int, int]] = []
for fd in passing_data:
f_id = fd['id']

# Skip features already being tested by a running testing agent
if f_id in currently_testing_ids:
continue

score = 0

# Weight 1: Features depended on by many others are higher impact
# if they regress, so test them more often
score += dependent_counts.get(f_id, 0) * 2

# Weight 2: Strongly prefer features not tested recently
if f_id not in self._recently_tested:
score += 5

# Weight 3: Features with more dependencies are integration points
# that are more likely to regress when other code changes
dep_count = len(fd['dependencies'])
score += min(dep_count, 3) # Cap at 3 to avoid over-weighting

scored.append((f_id, score))

# Sort by score descending (highest priority first)
scored.sort(key=lambda x: x[1], reverse=True)
selected = [fid for fid, _ in scored[:batch_size]]

# Track what we've tested to avoid re-testing the same features next batch
self._recently_tested.update(selected)

debug_log.log("TEST_BATCH", f"Selected {len(selected)} features for testing batch",
selected_ids=selected,
recently_tested_count=len(self._recently_tested),
total_passing=len(passing_data))

return selected
def get_resumable_features(
self,
feature_dicts: list[dict] | None = None,
scheduling_scores: dict[int, float] | None = None,
) -> list[dict]:
"""Get features that were left in_progress from a previous session.

These are features where in_progress=True but passes=False, and they're
not currently being worked on by this orchestrator. This handles the case
where a previous session was interrupted before completing the feature.

Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
scheduling_scores: Pre-computed scheduling scores. If None, computed from feature_dicts.
"""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()

# Find features that are in_progress but not complete
stale = session.query(Feature).filter(
Feature.in_progress == True,
Feature.passes == False
).all()
# Snapshot running IDs once to avoid acquiring lock per feature
with self._lock:
running_ids = set(self.running_coding_agents.keys())

resumable = []
for f in stale:
# Skip if already running in this orchestrator instance
with self._lock:
if f.id in self.running_coding_agents:
continue
# Skip if feature has failed too many times
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
continue
resumable.append(f.to_dict())
resumable = []
for fd in feature_dicts:
if not fd.get("in_progress") or fd.get("passes"):
continue
# Skip if already running in this orchestrator instance
if fd["id"] in running_ids:
continue
# Skip if feature has failed too many times
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
continue
resumable.append(fd)

# Sort by scheduling score (higher = first), then priority, then id
all_dicts = [f.to_dict() for f in session.query(Feature).all()]
scores = compute_scheduling_scores(all_dicts)
resumable.sort(key=lambda f: (-scores.get(f["id"], 0), f["priority"], f["id"]))
return resumable
finally:
session.close()
# Sort by scheduling score (higher = first), then priority, then id
if scheduling_scores is None:
scheduling_scores = compute_scheduling_scores(feature_dicts)
resumable.sort(key=lambda f: (-scheduling_scores.get(f["id"], 0), f["priority"], f["id"]))
return resumable
def get_ready_features(self) -> list[dict]:
"""Get features with satisfied dependencies, not already running."""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
def get_ready_features(
self,
feature_dicts: list[dict] | None = None,
scheduling_scores: dict[int, float] | None = None,
) -> list[dict]:
"""Get features with satisfied dependencies, not already running.

all_features = session.query(Feature).all()
all_dicts = [f.to_dict() for f in all_features]
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
scheduling_scores: Pre-computed scheduling scores. If None, computed from feature_dicts.
"""
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()

# Pre-compute passing_ids once to avoid O(n^2) in the loop
passing_ids = {f.id for f in all_features if f.passes}
# Pre-compute passing_ids once to avoid O(n^2) in the loop
passing_ids = {fd["id"] for fd in feature_dicts if fd.get("passes")}

ready = []
skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
for f in all_features:
if f.passes:
skipped_reasons["passes"] += 1
continue
if f.in_progress:
skipped_reasons["in_progress"] += 1
continue
# Skip if already running in this orchestrator
with self._lock:
if f.id in self.running_coding_agents:
skipped_reasons["running"] += 1
continue
# Skip if feature has failed too many times
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
skipped_reasons["failed"] += 1
continue
# Check dependencies (pass pre-computed passing_ids)
if are_dependencies_satisfied(f.to_dict(), all_dicts, passing_ids):
ready.append(f.to_dict())
else:
skipped_reasons["deps"] += 1
# Snapshot running IDs once to avoid acquiring lock per feature
with self._lock:
running_ids = set(self.running_coding_agents.keys())

# Sort by scheduling score (higher = first), then priority, then id
scores = compute_scheduling_scores(all_dicts)
ready.sort(key=lambda f: (-scores.get(f["id"], 0), f["priority"], f["id"]))
ready = []
skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
for fd in feature_dicts:
if fd.get("passes"):
skipped_reasons["passes"] += 1
continue
if fd.get("in_progress"):
skipped_reasons["in_progress"] += 1
continue
# Skip if already running in this orchestrator
if fd["id"] in running_ids:
skipped_reasons["running"] += 1
continue
# Skip if feature has failed too many times
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
skipped_reasons["failed"] += 1
continue
# Check dependencies (pass pre-computed passing_ids)
if are_dependencies_satisfied(fd, feature_dicts, passing_ids):
ready.append(fd)
else:
skipped_reasons["deps"] += 1

# Debug logging
passing = sum(1 for f in all_features if f.passes)
in_progress = sum(1 for f in all_features if f.in_progress and not f.passes)
print(
f"[DEBUG] get_ready_features: {len(ready)} ready, "
f"{passing} passing, {in_progress} in_progress, {len(all_features)} total",
flush=True
)
print(
f"[DEBUG] Skipped: {skipped_reasons['passes']} passing, {skipped_reasons['in_progress']} in_progress, "
f"{skipped_reasons['running']} running, {skipped_reasons['failed']} failed, {skipped_reasons['deps']} blocked by deps",
flush=True
)
# Sort by scheduling score (higher = first), then priority, then id
if scheduling_scores is None:
scheduling_scores = compute_scheduling_scores(feature_dicts)
ready.sort(key=lambda f: (-scheduling_scores.get(f["id"], 0), f["priority"], f["id"]))

# Log to debug file (but not every call to avoid spam)
debug_log.log("READY", "get_ready_features() called",
ready_count=len(ready),
ready_ids=[f['id'] for f in ready[:5]], # First 5 only
passing=passing,
in_progress=in_progress,
total=len(all_features),
skipped=skipped_reasons)
# Summary counts for logging
passing = skipped_reasons["passes"]
in_progress = skipped_reasons["in_progress"]
total = len(feature_dicts)

return ready
finally:
session.close()
debug_log.log("READY", "get_ready_features() called",
ready_count=len(ready),
ready_ids=[f['id'] for f in ready[:5]], # First 5 only
passing=passing,
in_progress=in_progress,
total=total,
skipped=skipped_reasons)

def get_all_complete(self) -> bool:
return ready
def get_all_complete(self, feature_dicts: list[dict] | None = None) -> bool:
"""Check if all features are complete or permanently failed.

Returns False if there are no features (initialization needed).

Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
session = self.get_session()
try:
# Force fresh read from database to avoid stale cached data
# This is critical when agent subprocesses have committed changes
session.expire_all()
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
finally:
session.close()

all_features = session.query(Feature).all()
# No features = NOT complete, need initialization
if len(feature_dicts) == 0:
return False

# No features = NOT complete, need initialization
if len(all_features) == 0:
return False
passing_count = 0
failed_count = 0
pending_count = 0
for fd in feature_dicts:
if fd.get("passes"):
passing_count += 1
continue # Completed successfully
if self._failure_counts.get(fd["id"], 0) >= MAX_FEATURE_RETRIES:
failed_count += 1
continue # Permanently failed, count as "done"
pending_count += 1

passing_count = 0
failed_count = 0
pending_count = 0
for f in all_features:
if f.passes:
passing_count += 1
continue # Completed successfully
if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
failed_count += 1
continue # Permanently failed, count as "done"
pending_count += 1
total = len(feature_dicts)
is_complete = pending_count == 0
debug_log.log("COMPLETE_CHECK", f"get_all_complete: {passing_count}/{total} passing, "
f"{failed_count} failed, {pending_count} pending -> {is_complete}")
return is_complete

total = len(all_features)
is_complete = pending_count == 0
print(
f"[DEBUG] get_all_complete: {passing_count}/{total} passing, "
f"{failed_count} failed, {pending_count} pending -> {is_complete}",
flush=True
)
return is_complete
finally:
session.close()
def get_passing_count(self, feature_dicts: list[dict] | None = None) -> int:
"""Get the number of passing features.

def get_passing_count(self) -> int:
"""Get the number of passing features."""
session = self.get_session()
try:
session.expire_all()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
if feature_dicts is None:
session = self.get_session()
try:
session.expire_all()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
return sum(1 for fd in feature_dicts if fd.get("passes"))
def _maintain_testing_agents(self) -> None:
def _maintain_testing_agents(self, feature_dicts: list[dict] | None = None) -> None:
"""Maintain the desired count of testing agents independently.

This runs every loop iteration and spawns testing agents as needed to maintain
@@ -402,18 +537,21 @@ class ParallelOrchestrator:
- YOLO mode is enabled
- testing_agent_ratio is 0
- No passing features exist yet

Args:
feature_dicts: Pre-fetched list of feature dicts. If None, queries the database.
"""
# Skip if testing is disabled
if self.yolo_mode or self.testing_agent_ratio == 0:
return

# No testing until there are passing features
passing_count = self.get_passing_count()
passing_count = self.get_passing_count(feature_dicts)
if passing_count == 0:
return

# Don't spawn testing agents if all features are already complete
if self.get_all_complete():
if self.get_all_complete(feature_dicts):
return

# Spawn testing agents one at a time, re-checking limits each time
@@ -439,7 +577,7 @@ class ParallelOrchestrator:
passing_count=passing_count)

# Spawn outside lock (I/O bound operation)
print(f"[DEBUG] Spawning testing agent ({spawn_index}/{desired})", flush=True)
logger.debug("Spawning testing agent (%d/%d)", spawn_index, desired)
success, msg = self._spawn_testing_agent()
if not success:
debug_log.log("TESTING", f"Spawn failed, stopping: {msg}")
@@ -521,7 +659,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -565,11 +703,14 @@ class ParallelOrchestrator:
return True, f"Started feature {feature_id}"
def _spawn_testing_agent(self) -> tuple[bool, str]:
"""Spawn a testing agent subprocess for regression testing.
"""Spawn a testing agent subprocess for batch regression testing.

Picks a random passing feature to test. Multiple testing agents can test
the same feature concurrently - this is intentional and simplifies the
architecture by removing claim coordination.
Selects a prioritized batch of passing features using weighted scoring
(via _get_test_batch) and passes them as --testing-feature-ids to the
subprocess. Falls back to single --testing-feature-id for batches of one.

Multiple testing agents can test the same feature concurrently - this is
intentional and simplifies the architecture by removing claim coordination.
"""
# Check limits first (under lock)
with self._lock:
@@ -582,13 +723,16 @@ class ParallelOrchestrator:
debug_log.log("TESTING", f"Skipped spawn - at max total agents ({total_agents}/{MAX_TOTAL_AGENTS})")
return False, f"At max total agents ({total_agents})"

# Pick a random passing feature (no claim needed - concurrent testing is fine)
feature_id = self._get_random_passing_feature()
if feature_id is None:
# Select a weighted batch of passing features for regression testing
batch = self._get_test_batch(self.testing_batch_size)
if not batch:
debug_log.log("TESTING", "No features available for testing")
return False, "No features available for testing"

debug_log.log("TESTING", f"Selected feature #{feature_id} for testing")
# Use the first feature ID as the representative for logging/tracking
primary_feature_id = batch[0]
batch_str = ",".join(str(fid) for fid in batch)
debug_log.log("TESTING", f"Selected batch for testing: [{batch_str}]")

# Spawn the testing agent
with self._lock:
@@ -604,7 +748,7 @@ class ParallelOrchestrator:
"--project-dir", str(self.project_dir),
"--max-iterations", "1",
"--agent-type", "testing",
"--testing-feature-id", str(feature_id),
"--testing-feature-ids", batch_str,
]
if self.model:
cmd.extend(["--model", self.model])
@@ -613,7 +757,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -633,22 +777,22 @@ class ParallelOrchestrator:

# Register process by PID (not feature_id) to avoid overwrites
# when multiple agents test the same feature
self.running_testing_agents[proc.pid] = (feature_id, proc)
self.running_testing_agents[proc.pid] = (primary_feature_id, proc)
testing_count = len(self.running_testing_agents)

# Start output reader thread with feature ID (same as coding agents)
# Start output reader thread with primary feature ID for log attribution
threading.Thread(
target=self._read_output,
args=(feature_id, proc, threading.Event(), "testing"),
args=(primary_feature_id, proc, threading.Event(), "testing"),
daemon=True
).start()

print(f"Started testing agent for feature #{feature_id} (PID {proc.pid})", flush=True)
debug_log.log("TESTING", f"Successfully spawned testing agent for feature #{feature_id}",
print(f"Started testing agent for features [{batch_str}] (PID {proc.pid})", flush=True)
debug_log.log("TESTING", f"Successfully spawned testing agent for batch [{batch_str}]",
pid=proc.pid,
feature_id=feature_id,
feature_ids=batch,
total_testing_agents=testing_count)
return True, f"Started testing agent for feature #{feature_id}"
return True, f"Started testing agent for features [{batch_str}]"
async def _run_initializer(self) -> bool:
"""Run initializer agent as blocking subprocess.
@@ -674,7 +818,7 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
@@ -1000,16 +1144,15 @@ class ParallelOrchestrator:
# newly created features.
debug_log.section("INITIALIZATION COMPLETE")
debug_log.log("INIT", "Disposing old database engine and creating fresh connection")
print("[DEBUG] Recreating database connection after initialization...", flush=True)
logger.debug("Recreating database connection after initialization")
if self._engine is not None:
self._engine.dispose()
self._engine, self._session_maker = create_database(self.project_dir)

# Debug: Show state immediately after initialization
print("[DEBUG] Post-initialization state check:", flush=True)
print(f"[DEBUG] max_concurrency={self.max_concurrency}", flush=True)
print(f"[DEBUG] yolo_mode={self.yolo_mode}", flush=True)
print(f"[DEBUG] testing_agent_ratio={self.testing_agent_ratio}", flush=True)
logger.debug("Post-initialization state check")
logger.debug("Post-initialization state: max_concurrency=%d, yolo_mode=%s, testing_agent_ratio=%d",
self.max_concurrency, self.yolo_mode, self.testing_agent_ratio)

# Verify features were created and are visible
session = self.get_session()
@@ -1017,7 +1160,7 @@ class ParallelOrchestrator:
feature_count = session.query(Feature).count()
all_features = session.query(Feature).all()
feature_names = [f"{f.id}: {f.name}" for f in all_features[:10]]
print(f"[DEBUG] features in database={feature_count}", flush=True)
logger.debug("Features in database: %d", feature_count)
debug_log.log("INIT", "Post-initialization database state",
max_concurrency=self.max_concurrency,
yolo_mode=self.yolo_mode,
@@ -1041,7 +1184,18 @@ class ParallelOrchestrator:
while self.is_running and not self._shutdown_requested:
loop_iteration += 1
if loop_iteration <= 3:
print(f"[DEBUG] === Loop iteration {loop_iteration} ===", flush=True)
logger.debug("=== Loop iteration %d ===", loop_iteration)

# Query all features ONCE per iteration and build reusable snapshot.
# Every sub-method receives this snapshot instead of re-querying the DB.
session = self.get_session()
session.expire_all()
all_features = session.query(Feature).all()
feature_dicts = [f.to_dict() for f in all_features]
session.close()

# Pre-compute scheduling scores once (BFS + reverse topo sort)
scheduling_scores = compute_scheduling_scores(feature_dicts)

# Log every iteration to debug file (first 10, then every 5th)
if loop_iteration <= 10 or loop_iteration % 5 == 0:
@@ -1055,20 +1209,16 @@ class ParallelOrchestrator:

# Full database dump every 5 iterations
if loop_iteration == 1 or loop_iteration % 5 == 0:
session = self.get_session()
try:
_dump_database_state(session, f"(iteration {loop_iteration})")
finally:
session.close()
_dump_database_state(feature_dicts, f"(iteration {loop_iteration})")

try:
# Check if all complete
if self.get_all_complete():
if self.get_all_complete(feature_dicts):
print("\nAll features complete!", flush=True)
break

# Maintain testing agents independently (runs every iteration)
self._maintain_testing_agents()
self._maintain_testing_agents(feature_dicts)

# Check capacity
with self._lock:
@@ -1089,17 +1239,17 @@ class ParallelOrchestrator:
continue

# Priority 1: Resume features from previous session
resumable = self.get_resumable_features()
resumable = self.get_resumable_features(feature_dicts, scheduling_scores)
if resumable:
slots = self.max_concurrency - current
for feature in resumable[:slots]:
print(f"Resuming feature #{feature['id']}: {feature['name']}", flush=True)
self.start_feature(feature["id"], resume=True)
await asyncio.sleep(2)
await asyncio.sleep(0.5) # Brief delay for subprocess to claim feature before re-querying
continue

# Priority 2: Start new ready features
ready = self.get_ready_features()
ready = self.get_ready_features(feature_dicts, scheduling_scores)
if not ready:
# Wait for running features to complete
if current > 0:
@@ -1112,11 +1262,12 @@ class ParallelOrchestrator:
session = self.get_session()
try:
session.expire_all()
fresh_dicts = [f.to_dict() for f in session.query(Feature).all()]
finally:
session.close()

# Recheck if all features are now complete
if self.get_all_complete():
if self.get_all_complete(fresh_dicts):
print("\nAll features complete!", flush=True)
break
@@ -1127,10 +1278,10 @@ class ParallelOrchestrator:

# Start features up to capacity
slots = self.max_concurrency - current
print(f"[DEBUG] Spawning loop: {len(ready)} ready, {slots} slots available, max_concurrency={self.max_concurrency}", flush=True)
print(f"[DEBUG] Will attempt to start {min(len(ready), slots)} features", flush=True)
logger.debug("Spawning loop: %d ready, %d slots available, max_concurrency=%d",
len(ready), slots, self.max_concurrency)
features_to_start = ready[:slots]
print(f"[DEBUG] Features to start: {[f['id'] for f in features_to_start]}", flush=True)
logger.debug("Features to start: %s", [f['id'] for f in features_to_start])

debug_log.log("SPAWN", "Starting features batch",
ready_count=len(ready),
@@ -1138,23 +1289,24 @@ class ParallelOrchestrator:
features_to_start=[f['id'] for f in features_to_start])

for i, feature in enumerate(features_to_start):
print(f"[DEBUG] Starting feature {i+1}/{len(features_to_start)}: #{feature['id']} - {feature['name']}", flush=True)
logger.debug("Starting feature %d/%d: #%d - %s",
i + 1, len(features_to_start), feature['id'], feature['name'])
success, msg = self.start_feature(feature["id"])
if not success:
print(f"[DEBUG] Failed to start feature #{feature['id']}: {msg}", flush=True)
logger.debug("Failed to start feature #%d: %s", feature['id'], msg)
debug_log.log("SPAWN", f"FAILED to start feature #{feature['id']}",
feature_name=feature['name'],
error=msg)
else:
print(f"[DEBUG] Successfully started feature #{feature['id']}", flush=True)
logger.debug("Successfully started feature #%d", feature['id'])
with self._lock:
running_count = len(self.running_coding_agents)
print(f"[DEBUG] Running coding agents after start: {running_count}", flush=True)
logger.debug("Running coding agents after start: %d", running_count)
debug_log.log("SPAWN", f"Successfully started feature #{feature['id']}",
feature_name=feature['name'],
running_coding_agents=running_count)

await asyncio.sleep(2) # Brief pause between starts
await asyncio.sleep(0.5) # Brief delay for subprocess to claim feature before re-querying

except Exception as e:
print(f"Orchestrator error: {e}", flush=True)
@@ -1223,6 +1375,7 @@ async def run_parallel_orchestrator(
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
testing_batch_size: int = DEFAULT_TESTING_BATCH_SIZE,
) -> None:
"""Run the unified orchestrator.

@@ -1232,6 +1385,7 @@ async def run_parallel_orchestrator(
model: Claude model to use
yolo_mode: Whether to run in YOLO mode (skip testing agents)
testing_agent_ratio: Number of regression agents to maintain (0-3)
testing_batch_size: Number of features per testing batch (1-5)
"""
print(f"[ORCHESTRATOR] run_parallel_orchestrator called with max_concurrency={max_concurrency}", flush=True)
orchestrator = ParallelOrchestrator(
@@ -1240,6 +1394,7 @@ async def run_parallel_orchestrator(
model=model,
yolo_mode=yolo_mode,
testing_agent_ratio=testing_agent_ratio,
testing_batch_size=testing_batch_size,
)

# Set up cleanup to run on exit (handles normal exit, exceptions)
@@ -1319,6 +1474,12 @@ def main():
default=1,
help="Number of regression testing agents (0-3, default: 1). Set to 0 to disable testing agents.",
)
parser.add_argument(
"--testing-batch-size",
type=int,
default=DEFAULT_TESTING_BATCH_SIZE,
help=f"Number of features per testing batch (1-5, default: {DEFAULT_TESTING_BATCH_SIZE})",
)

args = parser.parse_args()

@@ -1345,6 +1506,7 @@ def main():
model=args.model,
yolo_mode=args.yolo,
testing_agent_ratio=args.testing_agent_ratio,
testing_batch_size=args.testing_batch_size,
))
except KeyboardInterrupt:
print("\n\nInterrupted by user", flush=True)