feat: add API provider selection UI and fix stuck features on agent crash

API Provider Selection:
- Add provider switcher in Settings modal (Claude, Kimi, GLM, Ollama, Custom)
- Auth tokens stored locally only (registry.db), never returned by API
- get_effective_sdk_env() builds provider-specific env vars for agent subprocess
- All chat sessions (spec, expand, assistant) use provider settings
- Backward compatible: defaults to Claude, env vars still work as override

Fix Stuck Features:
- Add _cleanup_stale_features() to process_manager.py
- Reset in_progress features when agent stops, crashes, or fails healthcheck
- Prevents features from being permanently stuck after rate limit crashes
- Uses separate SQLAlchemy engine to avoid session conflicts with subprocess

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
nioasoft
2026-02-06 05:55:51 +02:00
parent 70131f2271
commit 13785325d7
13 changed files with 561 additions and 63 deletions

View File

@@ -258,15 +258,11 @@ class AssistantChatSession:
system_cli = shutil.which("claude")
# Build environment overrides for API configuration
sdk_env: dict[str, str] = {}
for var in API_ENV_VARS:
value = os.getenv(var)
if value:
sdk_env[var] = value
from registry import get_effective_sdk_env
sdk_env = get_effective_sdk_env()
# Determine model from environment or use default
# This allows using alternative APIs (e.g., GLM via z.ai) that may not support Claude model names
model = os.getenv("ANTHROPIC_DEFAULT_OPUS_MODEL", "claude-opus-4-5-20251101")
# Determine model from SDK env (provider-aware) or fallback to env/default
model = sdk_env.get("ANTHROPIC_DEFAULT_OPUS_MODEL") or os.getenv("ANTHROPIC_DEFAULT_OPUS_MODEL", "claude-opus-4-5-20251101")
try:
logger.info("Creating ClaudeSDKClient...")

View File

@@ -154,16 +154,11 @@ class ExpandChatSession:
system_prompt = skill_content.replace("$ARGUMENTS", project_path)
# Build environment overrides for API configuration
# Filter to only include vars that are actually set (non-None)
sdk_env: dict[str, str] = {}
for var in API_ENV_VARS:
value = os.getenv(var)
if value:
sdk_env[var] = value
from registry import get_effective_sdk_env
sdk_env = get_effective_sdk_env()
# Determine model from environment or use default
# This allows using alternative APIs (e.g., GLM via z.ai) that may not support Claude model names
model = os.getenv("ANTHROPIC_DEFAULT_OPUS_MODEL", "claude-opus-4-5-20251101")
# Determine model from SDK env (provider-aware) or fallback to env/default
model = sdk_env.get("ANTHROPIC_DEFAULT_OPUS_MODEL") or os.getenv("ANTHROPIC_DEFAULT_OPUS_MODEL", "claude-opus-4-5-20251101")
# Build MCP servers config for feature creation
mcp_servers = {

View File

@@ -227,6 +227,46 @@ class AgentProcessManager:
"""Remove lock file."""
self.lock_file.unlink(missing_ok=True)
def _cleanup_stale_features(self) -> None:
    """Reset features that were left in progress when the agent exited.

    Any feature row that still has ``in_progress`` set and ``passes`` unset
    gets its ``in_progress`` flag cleared, so it can be picked up on the
    next agent start. Nothing happens when the project's features database
    file does not exist.

    This is best-effort: every failure is logged as a warning and is never
    propagated to the caller.
    """
    try:
        from autoforge_paths import get_features_db_path

        db_path = get_features_db_path(self.project_dir)
        if not db_path.exists():
            # No features database for this project: nothing to reset.
            return

        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker
        from api.database import Feature

        # A dedicated engine is created for this call and disposed below.
        engine = create_engine(f"sqlite:///{db_path}")
        session = sessionmaker(bind=engine)()
        try:
            stale = (
                session.query(Feature)
                .filter(
                    Feature.in_progress == True,  # noqa: E712
                    Feature.passes == False,  # noqa: E712
                )
                .all()
            )
            for feature in stale:
                feature.in_progress = False
            # Commit and log only when at least one row was actually reset.
            if stale:
                session.commit()
                logger.info(
                    "Cleaned up %d stuck feature(s) for %s",
                    len(stale), self.project_name,
                )
        finally:
            session.close()
            engine.dispose()
    except Exception as exc:
        logger.warning("Failed to cleanup features for %s: %s", self.project_name, exc)
async def _broadcast_output(self, line: str) -> None:
"""Broadcast output line to all registered callbacks."""
with self._callbacks_lock:
@@ -288,6 +328,7 @@ class AgentProcessManager:
self.status = "crashed"
elif self.status == "running":
self.status = "stopped"
self._cleanup_stale_features()
self._remove_lock()
async def start(
@@ -359,12 +400,22 @@ class AgentProcessManager:
# stdin=DEVNULL prevents blocking if Claude CLI or child process tries to read stdin
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# PYTHONUNBUFFERED ensures output isn't delayed
# Build subprocess environment with API provider settings
from registry import get_effective_sdk_env
api_env = get_effective_sdk_env()
subprocess_env = {
**os.environ,
"PYTHONUNBUFFERED": "1",
"PLAYWRIGHT_HEADLESS": "true" if playwright_headless else "false",
**api_env,
}
popen_kwargs: dict[str, Any] = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
"cwd": str(self.project_dir),
"env": {**os.environ, "PYTHONUNBUFFERED": "1", "PLAYWRIGHT_HEADLESS": "true" if playwright_headless else "false"},
"env": subprocess_env,
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
@@ -425,6 +476,7 @@ class AgentProcessManager:
result.children_terminated, result.children_killed
)
self._cleanup_stale_features()
self._remove_lock()
self.status = "stopped"
self.process = None
@@ -502,6 +554,7 @@ class AgentProcessManager:
if poll is not None:
# Process has terminated
if self.status in ("running", "paused"):
self._cleanup_stale_features()
self.status = "crashed"
self._remove_lock()
return False

View File

@@ -140,16 +140,11 @@ class SpecChatSession:
system_cli = shutil.which("claude")
# Build environment overrides for API configuration
# Filter to only include vars that are actually set (non-None)
sdk_env: dict[str, str] = {}
for var in API_ENV_VARS:
value = os.getenv(var)
if value:
sdk_env[var] = value
from registry import get_effective_sdk_env
sdk_env = get_effective_sdk_env()
# Determine model from environment or use default
# This allows using alternative APIs (e.g., GLM via z.ai) that may not support Claude model names
model = os.getenv("ANTHROPIC_DEFAULT_OPUS_MODEL", "claude-opus-4-5-20251101")
# Determine model from SDK env (provider-aware) or fallback to env/default
model = sdk_env.get("ANTHROPIC_DEFAULT_OPUS_MODEL") or os.getenv("ANTHROPIC_DEFAULT_OPUS_MODEL", "claude-opus-4-5-20251101")
try:
self.client = ClaudeSDKClient(