improve performance

2026-03-17 10:53:09 +00:00 · 2026-01-23 14:37:43 +02:00
parent 1be42cc734
commit 874359fcf6
9 changed files with 396 additions and 672 deletions
--- a/parallel_orchestrator.py
+++ b/parallel_orchestrator.py
@@ -186,6 +186,12 @@ class ParallelOrchestrator:
        # Session tracking for logging/debugging
        self.session_start_time: datetime = None

+        # Event signaled when any agent completes, allowing the main loop to wake
+        # immediately instead of waiting for the full POLL_INTERVAL timeout.
+        # This reduces latency when spawning the next feature after completion.
+        self._agent_completed_event: asyncio.Event = None  # Created in run_loop
+        self._event_loop: asyncio.AbstractEventLoop = None  # Stored for thread-safe signaling
+
        # Database session for this orchestrator
        self._engine, self._session_maker = create_database(project_dir)

@@ -311,6 +317,9 @@ class ParallelOrchestrator:
            all_features = session.query(Feature).all()
            all_dicts = [f.to_dict() for f in all_features]

+            # Pre-compute passing_ids once to avoid O(n^2) in the loop
+            passing_ids = {f.id for f in all_features if f.passes}
+
            ready = []
            skipped_reasons = {"passes": 0, "in_progress": 0, "running": 0, "failed": 0, "deps": 0}
            for f in all_features:
@@ -329,8 +338,8 @@ class ParallelOrchestrator:
                if self._failure_counts.get(f.id, 0) >= MAX_FEATURE_RETRIES:
                    skipped_reasons["failed"] += 1
                    continue
-                # Check dependencies
-                if are_dependencies_satisfied(f.to_dict(), all_dicts):
+                # Check dependencies (pass pre-computed passing_ids)
+                if are_dependencies_satisfied(f.to_dict(), all_dicts, passing_ids):
                    ready.append(f.to_dict())
                else:
                    skipped_reasons["deps"] += 1
@@ -794,6 +803,52 @@ class ParallelOrchestrator:
        finally:
            self._on_agent_complete(feature_id, proc.returncode, agent_type, proc)

+    def _signal_agent_completed(self):
+        """Wake the main loop by setting the agent-completion event.
+
+        Does nothing until both the event and the stored event loop have been
+        assigned. While that loop is running, set() is handed to it through
+        call_soon_threadsafe() instead of being called from the current thread.
+        """
+        if self._agent_completed_event is not None and self._event_loop is not None:
+            try:
+                # Go through the stored loop reference: asyncio.Event is not
+                # thread-safe, and asyncio.get_event_loop() fails in threads
+                # that have no event loop of their own.
+                if self._event_loop.is_running():
+                    self._event_loop.call_soon_threadsafe(self._agent_completed_event.set)
+                else:
+                    # Loop not running (not expected in normal operation): set the event directly.
+                    self._agent_completed_event.set()
+            except RuntimeError:
+                # Event loop closed, ignore (orchestrator may be shutting down)
+                pass
+
+    async def _wait_for_agent_completion(self, timeout: float = POLL_INTERVAL):
+        """Wait for an agent to complete or until timeout expires.
+
+        Event-based replacement for fixed `asyncio.sleep(POLL_INTERVAL)` calls.
+        Returns as soon as _signal_agent_completed() sets the event, clearing it
+        for the next wait; on timeout it returns as well and leaves the event
+        untouched. Either way the caller goes on to re-check its own state.
+
+        Args:
+            timeout: Maximum seconds to wait (default: POLL_INTERVAL)
+        """
+        if self._agent_completed_event is None:
+            # Event not created yet (not expected in normal operation): plain sleep instead
+            await asyncio.sleep(timeout)
+            return
+
+        try:
+            await asyncio.wait_for(self._agent_completed_event.wait(), timeout=timeout)
+            # Event was set - an agent completed. Clear it for the next wait cycle.
+            self._agent_completed_event.clear()
+            debug_log.log("EVENT", "Woke up immediately - agent completed")
+        except asyncio.TimeoutError:
+            # No completion within the timeout - normal; return so the caller can poll.
+            pass
+
    def _on_agent_complete(
        self,
        feature_id: int | None,
@@ -832,6 +887,8 @@ class ParallelOrchestrator:
                pid=proc.pid,
                feature_id=feature_id,
                status=status)
+            # Signal main loop that an agent slot is available
+            self._signal_agent_completed()
            return

        # Coding agent completion
@@ -843,40 +900,20 @@ class ParallelOrchestrator:
            self.running_coding_agents.pop(feature_id, None)
            self.abort_events.pop(feature_id, None)

-        # BEFORE dispose: Query database state to see if it's stale
-        session_before = self.get_session()
-        try:
-            session_before.expire_all()
-            feature_before = session_before.query(Feature).filter(Feature.id == feature_id).first()
-            all_before = session_before.query(Feature).all()
-            passing_before = sum(1 for f in all_before if f.passes)
-            debug_log.log("DB", f"BEFORE engine.dispose() - Feature #{feature_id} state",
-                passes=feature_before.passes if feature_before else None,
-                in_progress=feature_before.in_progress if feature_before else None,
-                total_passing_in_db=passing_before)
-        finally:
-            session_before.close()
-
-        # CRITICAL: Refresh database connection to see subprocess commits
+        # Refresh session cache to see subprocess commits
        # The coding agent runs as a subprocess and commits changes (e.g., passes=True).
-        # SQLAlchemy may have stale connections. Disposing the engine forces new connections
-        # that will see the subprocess's committed changes.
-        debug_log.log("DB", "Disposing database engine now...")
-        self._engine.dispose()
-
-        # AFTER dispose: Query again to compare
+        # Using session.expire_all() is lighter weight than engine.dispose() for SQLite WAL mode
+        # and is sufficient to invalidate cached data and force fresh reads.
+        # engine.dispose() is only called on orchestrator shutdown, not on every agent completion.
        session = self.get_session()
        try:
+            session.expire_all()
            feature = session.query(Feature).filter(Feature.id == feature_id).first()
-            all_after = session.query(Feature).all()
-            passing_after = sum(1 for f in all_after if f.passes)
            feature_passes = feature.passes if feature else None
            feature_in_progress = feature.in_progress if feature else None
-            debug_log.log("DB", f"AFTER engine.dispose() - Feature #{feature_id} state",
+            debug_log.log("DB", f"Feature #{feature_id} state after session.expire_all()",
                passes=feature_passes,
-                in_progress=feature_in_progress,
-                total_passing_in_db=passing_after,
-                passing_changed=(passing_after != passing_before) if 'passing_before' in dir() else "unknown")
+                in_progress=feature_in_progress)
            if feature and feature.in_progress and not feature.passes:
                feature.in_progress = False
                session.commit()
@@ -900,6 +937,9 @@ class ParallelOrchestrator:
        # CRITICAL: This print triggers the WebSocket to emit agent_update with state='error' or 'success'
        print(f"Feature #{feature_id} {status}", flush=True)

+        # Signal main loop that an agent slot is available
+        self._signal_agent_completed()
+
        # NOTE: Testing agents are now spawned in start_feature() when coding agents START,
        # not here when they complete. This ensures 1:1 ratio and proper termination.

@@ -949,6 +989,12 @@ class ParallelOrchestrator:
        """Main orchestration loop."""
        self.is_running = True

+        # Initialize the agent completion event for this run
+        # Must be created in the async context where it will be used
+        self._agent_completed_event = asyncio.Event()
+        # Store the event loop reference for thread-safe signaling from output reader threads
+        self._event_loop = asyncio.get_running_loop()
+
        # Track session start for regression testing (UTC for consistency with last_tested_at)
        self.session_start_time = datetime.now(timezone.utc)

@@ -1100,8 +1146,8 @@ class ParallelOrchestrator:
                    at_capacity=(current >= self.max_concurrency))

                if current >= self.max_concurrency:
-                    debug_log.log("CAPACITY", "At max capacity, sleeping...")
-                    await asyncio.sleep(POLL_INTERVAL)
+                    debug_log.log("CAPACITY", "At max capacity, waiting for agent completion...")
+                    await self._wait_for_agent_completion()
                    continue

                # Priority 1: Resume features from previous session
@@ -1119,7 +1165,7 @@ class ParallelOrchestrator:
                if not ready:
                    # Wait for running features to complete
                    if current > 0:
-                        await asyncio.sleep(POLL_INTERVAL)
+                        await self._wait_for_agent_completion()
                        continue
                    else:
                        # No ready features and nothing running
@@ -1138,7 +1184,7 @@ class ParallelOrchestrator:

                        # Still have pending features but all are blocked by dependencies
                        print("No ready features available. All remaining features may be blocked by dependencies.", flush=True)
-                        await asyncio.sleep(POLL_INTERVAL * 2)
+                        await self._wait_for_agent_completion(timeout=POLL_INTERVAL * 2)
                        continue

                # Start features up to capacity
@@ -1174,7 +1220,7 @@ class ParallelOrchestrator:

            except Exception as e:
                print(f"Orchestrator error: {e}", flush=True)
-                await asyncio.sleep(POLL_INTERVAL)
+                await self._wait_for_agent_completion()

        # Wait for remaining agents to complete
        print("Waiting for running agents to complete...", flush=True)
@@ -1184,7 +1230,8 @@ class ParallelOrchestrator:
                testing_done = len(self.running_testing_agents) == 0
                if coding_done and testing_done:
                    break
-            await asyncio.sleep(1)
+            # Use short timeout since we're just waiting for final agents to finish
+            await self._wait_for_agent_completion(timeout=1.0)

        print("Orchestrator finished.", flush=True)