fix: prevent SQLite corruption in parallel mode with atomic operations

Replace the ineffective in-process threading.Lock() with atomic SQL
operations that are safe across processes. Key changes (sketches of
the main patterns follow the list):

- Add SQLAlchemy event hooks (do_connect/do_begin) for BEGIN IMMEDIATE
  transactions in api/database.py
- Add atomic_transaction() context manager for multi-statement ops
- Convert all feature MCP write operations to atomic UPDATE...WHERE
  with compare-and-swap patterns (feature_claim, mark_passing, etc.)
- Add WHERE passes=0 state guard to feature_mark_passing
- Add WAL checkpoint on shutdown and idempotent cleanup() in
  parallel_orchestrator.py with async-safe signal handling
- Wrap SQLite connections with contextlib.closing() in progress.py
- Add thread-safe engine cache with double-checked locking in
  assistant_database.py
- Migrate to SQLAlchemy 2.0 DeclarativeBase across all modules
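
The BEGIN IMMEDIATE hooks follow SQLAlchemy's documented pysqlite
recipe. A minimal sketch (the database URL is illustrative; the real
hooks live in api/database.py):

    from sqlalchemy import create_engine, event

    engine = create_engine("sqlite:///features.db")

    @event.listens_for(engine, "connect")
    def do_connect(dbapi_connection, connection_record):
        # Stop pysqlite from emitting its own implicit BEGIN so that
        # transaction start is fully under our control.
        dbapi_connection.isolation_level = None

    @event.listens_for(engine, "begin")
    def do_begin(conn):
        # Take the SQLite write lock up front; concurrent writers then
        # wait on busy_timeout instead of failing mid-transaction.
        conn.exec_driver_sql("BEGIN IMMEDIATE")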
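
The feature MCP write paths encode the expected current state in the
UPDATE's WHERE clause, so the row count tells each process whether it
won the race. A sketch; the claimed_by column is an assumption, while
passes matches the Feature model in the diff:

    from contextlib import contextmanager
    from sqlalchemy import text

    @contextmanager
    def atomic_transaction(session):
        # Group multi-statement work into one transaction; BEGIN
        # IMMEDIATE is emitted by the engine hooks when the first
        # statement runs.
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise

    def feature_claim(session, feature_id: int, agent_id: str) -> bool:
        # Compare-and-swap: succeeds only if the feature is unclaimed.
        result = session.execute(
            text("UPDATE features SET claimed_by = :agent "
                 "WHERE id = :id AND claimed_by IS NULL"),
            {"agent": agent_id, "id": feature_id},
        )
        session.commit()
        return result.rowcount == 1

    def feature_mark_passing(session, feature_id: int) -> bool:
        # State guard: only a currently failing feature can flip.
        result = session.execute(
            text("UPDATE features SET passes = 1 "
                 "WHERE id = :id AND passes = 0"),
            {"id": feature_id},
        )
        session.commit()
        return result.rowcount == 1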
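
The progress.py change wraps raw sqlite3 connections so they are
released even when a query raises. Roughly (the query and signature
are illustrative, not the actual progress.py code):

    import sqlite3
    from contextlib import closing

    def has_features(db_path: str) -> bool:
        # closing() frees the handle even on error, so no leaked
        # connection keeps the WAL file pinned.
        with closing(sqlite3.connect(db_path)) as conn:
            (count,) = conn.execute(
                "SELECT COUNT(*) FROM features"
            ).fetchone()
            return count > 0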
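
The assistant_database.py cache avoids building two engines for the
same file from different threads. A sketch with hypothetical names
(get_engine, _engine_cache):

    import threading
    from sqlalchemy import create_engine
    from sqlalchemy.engine import Engine

    _engine_cache: dict[str, Engine] = {}
    _cache_lock = threading.Lock()

    def get_engine(db_path: str) -> Engine:
        # Fast path: unlocked read. Slow path: re-check under the lock
        # so two threads racing on a miss still share one engine.
        engine = _engine_cache.get(db_path)
        if engine is None:
            with _cache_lock:
                engine = _engine_cache.get(db_path)
                if engine is None:
                    engine = create_engine(f"sqlite:///{db_path}")
                    _engine_cache[db_path] = engine
        return engine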
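
The SQLAlchemy 2.0 migration replaces declarative_base() with the
class-based DeclarativeBase. The Feature model below is a trimmed
illustration, not the full schema:

    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

    class Base(DeclarativeBase):
        pass

    class Feature(Base):
        __tablename__ = "features"

        id: Mapped[int] = mapped_column(primary_key=True)
        passes: Mapped[bool] = mapped_column(default=False)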

Inspired by PR #108 (cabana8471-arch), with fixes for its nested
BEGIN EXCLUSIVE bug and missing state guards.

Closes #106

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Auto
2026-02-01 09:45:20 +02:00
parent 6609a0f7d6
commit d8a8c83447
6 changed files with 489 additions and 255 deletions


@@ -19,7 +19,9 @@ Usage:
"""
import asyncio
import atexit
import os
import signal
import subprocess
import sys
import threading
@@ -27,6 +29,8 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, Literal
from sqlalchemy import text
from api.database import Feature, create_database
from api.dependency_resolver import are_dependencies_satisfied, compute_scheduling_scores
from progress import has_features
@@ -139,11 +143,11 @@ class ParallelOrchestrator:
self,
project_dir: Path,
max_concurrency: int = DEFAULT_CONCURRENCY,
model: str = None,
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
on_output: Callable[[int, str], None] = None,
on_status: Callable[[int, str], None] = None,
on_output: Callable[[int, str], None] | None = None,
on_status: Callable[[int, str], None] | None = None,
):
"""Initialize the orchestrator.
@@ -182,14 +186,18 @@ class ParallelOrchestrator:
# Track feature failures to prevent infinite retry loops
self._failure_counts: dict[int, int] = {}
# Shutdown flag for async-safe signal handling
# Signal handlers only set this flag; cleanup happens in the main loop
self._shutdown_requested = False
# Session tracking for logging/debugging
self.session_start_time: datetime = None
self.session_start_time: datetime | None = None
# Event signaled when any agent completes, allowing the main loop to wake
# immediately instead of waiting for the full POLL_INTERVAL timeout.
# This reduces latency when spawning the next feature after completion.
self._agent_completed_event: asyncio.Event = None # Created in run_loop
self._event_loop: asyncio.AbstractEventLoop = None # Stored for thread-safe signaling
self._agent_completed_event: asyncio.Event | None = None # Created in run_loop
self._event_loop: asyncio.AbstractEventLoop | None = None # Stored for thread-safe signaling
# Database session for this orchestrator
self._engine, self._session_maker = create_database(project_dir)
@@ -375,7 +383,8 @@ class ParallelOrchestrator:
session = self.get_session()
try:
session.expire_all()
return session.query(Feature).filter(Feature.passes == True).count()
count: int = session.query(Feature).filter(Feature.passes == True).count()
return count
finally:
session.close()
@@ -511,11 +520,14 @@ class ParallelOrchestrator:
try:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
"text": True,
"encoding": "utf-8",
"errors": "replace",
"cwd": str(AUTOCODER_ROOT), # Run from autocoder root for proper imports
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
}
@@ -546,7 +558,7 @@ class ParallelOrchestrator:
daemon=True
).start()
if self.on_status:
if self.on_status is not None:
self.on_status(feature_id, "running")
print(f"Started coding agent for feature #{feature_id}", flush=True)
@@ -600,11 +612,14 @@ class ParallelOrchestrator:
try:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
"text": True,
"encoding": "utf-8",
"errors": "replace",
"cwd": str(AUTOCODER_ROOT),
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
}
@@ -658,11 +673,14 @@ class ParallelOrchestrator:
# CREATE_NO_WINDOW on Windows prevents console window pop-ups
# stdin=DEVNULL prevents blocking on stdin reads
# encoding="utf-8" and errors="replace" fix Windows CP1252 issues
popen_kwargs = {
"stdin": subprocess.DEVNULL,
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
"text": True,
"encoding": "utf-8",
"errors": "replace",
"cwd": str(AUTOCODER_ROOT),
"env": {**os.environ, "PYTHONUNBUFFERED": "1"},
}
@@ -682,7 +700,7 @@ class ParallelOrchestrator:
if not line:
break
print(line.rstrip(), flush=True)
if self.on_output:
if self.on_output is not None:
self.on_output(0, line.rstrip()) # Use 0 as feature_id for initializer
proc.wait()
@@ -716,11 +734,14 @@ class ParallelOrchestrator:
):
"""Read output from subprocess and emit events."""
try:
if proc.stdout is None:
proc.wait()
return
for line in proc.stdout:
if abort.is_set():
break
line = line.rstrip()
if self.on_output:
if self.on_output is not None:
self.on_output(feature_id or 0, line)
else:
# Both coding and testing agents now use [Feature #X] format
@@ -814,6 +835,9 @@ class ParallelOrchestrator:
self._signal_agent_completed()
return
# feature_id is required for coding agents (always passed from start_feature)
assert feature_id is not None, "feature_id must not be None for coding agents"
# Coding agent completion
debug_log.log("COMPLETE", f"Coding agent for feature #{feature_id} finished",
return_code=return_code,
@@ -855,7 +879,7 @@ class ParallelOrchestrator:
failure_count=failure_count)
status = "completed" if return_code == 0 else "failed"
if self.on_status:
if self.on_status is not None:
self.on_status(feature_id, status)
# CRITICAL: This print triggers the WebSocket to emit agent_update with state='error' or 'success'
print(f"Feature #{feature_id} {status}", flush=True)
@@ -1014,7 +1038,7 @@ class ParallelOrchestrator:
debug_log.section("FEATURE LOOP STARTING")
loop_iteration = 0
while self.is_running:
while self.is_running and not self._shutdown_requested:
loop_iteration += 1
if loop_iteration <= 3:
print(f"[DEBUG] === Loop iteration {loop_iteration} ===", flush=True)
@@ -1163,11 +1187,40 @@ class ParallelOrchestrator:
"yolo_mode": self.yolo_mode,
}
def cleanup(self) -> None:
"""Clean up database resources. Safe to call multiple times.
Forces WAL checkpoint to flush pending writes to main database file,
then disposes engine to close all connections. Prevents stale cache
issues when the orchestrator restarts.
"""
# Atomically grab and clear the engine reference to prevent re-entry
engine = self._engine
self._engine = None
if engine is None:
return # Already cleaned up
try:
debug_log.log("CLEANUP", "Forcing WAL checkpoint before dispose")
with engine.connect() as conn:
conn.execute(text("PRAGMA wal_checkpoint(FULL)"))
conn.commit()
debug_log.log("CLEANUP", "WAL checkpoint completed, disposing engine")
except Exception as e:
debug_log.log("CLEANUP", f"WAL checkpoint failed (non-fatal): {e}")
try:
engine.dispose()
debug_log.log("CLEANUP", "Engine disposed successfully")
except Exception as e:
debug_log.log("CLEANUP", f"Engine dispose failed: {e}")
async def run_parallel_orchestrator(
project_dir: Path,
max_concurrency: int = DEFAULT_CONCURRENCY,
model: str = None,
model: str | None = None,
yolo_mode: bool = False,
testing_agent_ratio: int = 1,
) -> None:
@@ -1189,11 +1242,37 @@ async def run_parallel_orchestrator(
testing_agent_ratio=testing_agent_ratio,
)
# Set up cleanup to run on exit (handles normal exit, exceptions)
def cleanup_handler():
debug_log.log("CLEANUP", "atexit cleanup handler invoked")
orchestrator.cleanup()
atexit.register(cleanup_handler)
# Set up async-safe signal handler for graceful shutdown
# Only sets flags - everything else is unsafe in signal context
def signal_handler(signum, frame):
orchestrator._shutdown_requested = True
orchestrator.is_running = False
# Register SIGTERM handler for process termination signals
# Note: On Windows, SIGTERM handlers only fire from os.kill() calls within Python.
# External termination (Task Manager, taskkill, Popen.terminate()) uses
# TerminateProcess() which bypasses signal handlers entirely.
signal.signal(signal.SIGTERM, signal_handler)
# Note: We intentionally do NOT register SIGINT handler
# Let Python raise KeyboardInterrupt naturally so the except block works
try:
await orchestrator.run_loop()
except KeyboardInterrupt:
print("\n\nInterrupted by user. Stopping agents...", flush=True)
orchestrator.stop_all()
finally:
# CRITICAL: Always clean up database resources on exit
# This forces WAL checkpoint and disposes connections
orchestrator.cleanup()
def main():