mirror of
https://github.com/AutoMaker-Org/automaker.git
synced 2026-03-18 10:23:07 +00:00
Fix agent output validation to prevent false verified status (#807)
* Changes from fix/cursor-fix * feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port * Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ci: Update test server port from 3008 to 3108 and add environment configuration * fix: Correct typo in health endpoint URL and standardize port env vars --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
@@ -52,6 +52,12 @@ HOST=0.0.0.0
|
||||
# Port to run the server on
|
||||
PORT=3008
|
||||
|
||||
# Port to run the server on for testing
|
||||
TEST_SERVER_PORT=3108
|
||||
|
||||
# Port to run the UI on for testing
|
||||
TEST_PORT=3107
|
||||
|
||||
# Data directory for sessions and metadata
|
||||
DATA_DIR=./data
|
||||
|
||||
|
||||
@@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider {
|
||||
|
||||
case 'session.error': {
|
||||
const errorEvent = sdkEvent as SdkSessionErrorEvent;
|
||||
const enrichedError =
|
||||
errorEvent.data.message ||
|
||||
(errorEvent.data.code
|
||||
? `Copilot agent error (code: ${errorEvent.data.code})`
|
||||
: 'Copilot agent error');
|
||||
return {
|
||||
type: 'error',
|
||||
error: errorEvent.data.message || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider {
|
||||
const resultEvent = cursorEvent as CursorResultEvent;
|
||||
|
||||
if (resultEvent.is_error) {
|
||||
const errorText = resultEvent.error || resultEvent.result || '';
|
||||
const enrichedError =
|
||||
errorText ||
|
||||
`Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: resultEvent.session_id,
|
||||
error: resultEvent.error || resultEvent.result || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider {
|
||||
const resultEvent = geminiEvent as GeminiResultEvent;
|
||||
|
||||
if (resultEvent.status === 'error') {
|
||||
const enrichedError =
|
||||
resultEvent.error ||
|
||||
`Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: resultEvent.session_id,
|
||||
error: resultEvent.error || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider {
|
||||
|
||||
case 'error': {
|
||||
const errorEvent = geminiEvent as GeminiResultEvent;
|
||||
const enrichedError =
|
||||
errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: errorEvent.session_id,
|
||||
error: errorEvent.error || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -296,8 +296,28 @@ export class AgentExecutor {
|
||||
}
|
||||
}
|
||||
} else if (msg.type === 'error') {
|
||||
throw new Error(AgentExecutor.sanitizeProviderError(msg.error));
|
||||
} else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite();
|
||||
const sanitized = AgentExecutor.sanitizeProviderError(msg.error);
|
||||
logger.error(
|
||||
`[execute] Feature ${featureId} received error from provider. ` +
|
||||
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(sanitized);
|
||||
} else if (msg.type === 'result') {
|
||||
if (msg.subtype === 'success') {
|
||||
scheduleWrite();
|
||||
} else if (msg.subtype?.startsWith('error')) {
|
||||
// Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.)
|
||||
logger.error(
|
||||
`[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` +
|
||||
`session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(`Agent execution ended with: ${msg.subtype}`);
|
||||
} else {
|
||||
logger.warn(
|
||||
`[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
clearInterval(streamHeartbeat);
|
||||
@@ -447,16 +467,28 @@ export class AgentExecutor {
|
||||
});
|
||||
}
|
||||
} else if (msg.type === 'error') {
|
||||
// Clean the error: strip ANSI codes and redundant "Error: " prefix
|
||||
const cleanedError =
|
||||
(msg.error || `Error during task ${task.id}`)
|
||||
.replace(/\x1b\[[0-9;]*m/g, '')
|
||||
.replace(/^Error:\s*/i, '')
|
||||
.trim() || `Error during task ${task.id}`;
|
||||
throw new Error(cleanedError);
|
||||
} else if (msg.type === 'result' && msg.subtype === 'success') {
|
||||
taskOutput += msg.result || '';
|
||||
responseText += msg.result || '';
|
||||
const fallback = `Error during task ${task.id}`;
|
||||
const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback);
|
||||
logger.error(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` +
|
||||
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(sanitized);
|
||||
} else if (msg.type === 'result') {
|
||||
if (msg.subtype === 'success') {
|
||||
taskOutput += msg.result || '';
|
||||
responseText += msg.result || '';
|
||||
} else if (msg.subtype?.startsWith('error')) {
|
||||
logger.error(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` +
|
||||
`session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(`Agent execution ended with: ${msg.subtype}`);
|
||||
} else {
|
||||
logger.warn(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!taskCompleteDetected)
|
||||
|
||||
@@ -60,6 +60,12 @@ import type {
|
||||
|
||||
const logger = createLogger('ExecutionService');
|
||||
|
||||
/** Marker written by agent-executor for each tool invocation. */
|
||||
const TOOL_USE_MARKER = '🔧 Tool:';
|
||||
|
||||
/** Minimum trimmed output length to consider agent work meaningful. */
|
||||
const MIN_MEANINGFUL_OUTPUT_LENGTH = 200;
|
||||
|
||||
export class ExecutionService {
|
||||
constructor(
|
||||
private eventBus: TypedEventBus,
|
||||
@@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th
|
||||
}
|
||||
}
|
||||
|
||||
const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified';
|
||||
// Read agent output before determining final status.
|
||||
// CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing
|
||||
// meaningful work. Check output to avoid prematurely marking as 'verified'.
|
||||
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
|
||||
let agentOutput = '';
|
||||
try {
|
||||
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
|
||||
} catch {
|
||||
/* */
|
||||
}
|
||||
|
||||
// Determine if the agent did meaningful work by checking for tool usage
|
||||
// indicators in the output. The agent executor writes "🔧 Tool:" markers
|
||||
// each time a tool is invoked. No tool usage suggests the CLI exited
|
||||
// without performing implementation work.
|
||||
const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER);
|
||||
const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH;
|
||||
const agentDidWork = hasToolUsage && !isOutputTooShort;
|
||||
|
||||
let finalStatus: 'verified' | 'waiting_approval';
|
||||
if (feature.skipTests) {
|
||||
finalStatus = 'waiting_approval';
|
||||
} else if (!agentDidWork) {
|
||||
// Agent didn't produce meaningful output (e.g., CLI exited quickly).
|
||||
// Route to waiting_approval so the user can review and re-run.
|
||||
finalStatus = 'waiting_approval';
|
||||
logger.warn(
|
||||
`[executeFeature] Feature ${featureId}: agent produced insufficient output ` +
|
||||
`(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` +
|
||||
`Setting status to waiting_approval instead of verified.`
|
||||
);
|
||||
} else {
|
||||
finalStatus = 'verified';
|
||||
}
|
||||
|
||||
await this.updateFeatureStatusFn(projectPath, featureId, finalStatus);
|
||||
this.recordSuccessFn();
|
||||
|
||||
@@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. Use th
|
||||
const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks;
|
||||
|
||||
try {
|
||||
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
|
||||
let agentOutput = '';
|
||||
try {
|
||||
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
|
||||
} catch {
|
||||
/* */
|
||||
}
|
||||
if (agentOutput) {
|
||||
const summary = extractSummary(agentOutput);
|
||||
if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary);
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { normalizeThinkingLevelForModel } from '@automaker/types';
|
||||
|
||||
describe('normalizeThinkingLevelForModel', () => {
|
||||
it('preserves explicitly selected none for Opus models', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-opus', 'none')).toBe('none');
|
||||
});
|
||||
|
||||
it('falls back to none when Opus receives an unsupported manual thinking level', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-opus', 'medium')).toBe('none');
|
||||
});
|
||||
|
||||
it('keeps adaptive for Opus when adaptive is selected', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-opus', 'adaptive')).toBe('adaptive');
|
||||
});
|
||||
|
||||
it('preserves supported manual levels for non-Opus models', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-sonnet', 'high')).toBe('high');
|
||||
});
|
||||
});
|
||||
@@ -397,6 +397,45 @@ describe('copilot-provider.ts', () => {
|
||||
});
|
||||
});
|
||||
|
||||
it('should use error code in fallback when session.error message is empty', () => {
|
||||
const event = {
|
||||
type: 'session.error',
|
||||
data: { message: '', code: 'RATE_LIMIT_EXCEEDED' },
|
||||
};
|
||||
|
||||
const result = provider.normalizeEvent(event);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.type).toBe('error');
|
||||
expect(result!.error).toContain('RATE_LIMIT_EXCEEDED');
|
||||
expect(result!.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('should return generic "Copilot agent error" fallback when both message and code are empty', () => {
|
||||
const event = {
|
||||
type: 'session.error',
|
||||
data: { message: '', code: '' },
|
||||
};
|
||||
|
||||
const result = provider.normalizeEvent(event);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.type).toBe('error');
|
||||
expect(result!.error).toBe('Copilot agent error');
|
||||
// Must NOT be the old opaque 'Unknown error'
|
||||
expect(result!.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('should return generic "Copilot agent error" fallback when data has no code field', () => {
|
||||
const event = {
|
||||
type: 'session.error',
|
||||
data: { message: '' },
|
||||
};
|
||||
|
||||
const result = provider.normalizeEvent(event);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.type).toBe('error');
|
||||
expect(result!.error).toBe('Copilot agent error');
|
||||
});
|
||||
|
||||
it('should return null for unknown event types', () => {
|
||||
const event = { type: 'unknown.event' };
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
import { CursorProvider } from '@/providers/cursor-provider.js';
|
||||
|
||||
describe('cursor-provider.ts', () => {
|
||||
@@ -36,4 +36,122 @@ describe('cursor-provider.ts', () => {
|
||||
expect(args).not.toContain('--resume');
|
||||
});
|
||||
});
|
||||
|
||||
describe('normalizeEvent - result error handling', () => {
|
||||
let provider: CursorProvider;
|
||||
|
||||
beforeEach(() => {
|
||||
provider = Object.create(CursorProvider.prototype) as CursorProvider;
|
||||
});
|
||||
|
||||
it('returns error message from resultEvent.error when is_error=true', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: 'Rate limit exceeded',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 3000,
|
||||
session_id: 'sess-123',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
expect(msg!.error).toBe('Rate limit exceeded');
|
||||
});
|
||||
|
||||
it('falls back to resultEvent.result when error field is empty and is_error=true', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: '',
|
||||
result: 'Process terminated unexpectedly',
|
||||
subtype: 'error',
|
||||
duration_ms: 5000,
|
||||
session_id: 'sess-456',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
expect(msg!.error).toBe('Process terminated unexpectedly');
|
||||
});
|
||||
|
||||
it('builds diagnostic fallback when both error and result are empty and is_error=true', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: '',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 5000,
|
||||
session_id: 'sess-789',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
// Should contain diagnostic info rather than 'Unknown error'
|
||||
expect(msg!.error).toContain('5000ms');
|
||||
expect(msg!.error).toContain('sess-789');
|
||||
expect(msg!.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('preserves session_id in error message', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: 'Timeout occurred',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 30000,
|
||||
session_id: 'my-session-id',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg!.session_id).toBe('my-session-id');
|
||||
});
|
||||
|
||||
it('uses "none" when session_id is missing from diagnostic fallback', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: '',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 5000,
|
||||
// session_id intentionally omitted
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
expect(msg!.error).toContain('none');
|
||||
expect(msg!.error).not.toContain('undefined');
|
||||
});
|
||||
|
||||
it('returns success result when is_error=false', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: false,
|
||||
error: '',
|
||||
result: 'Completed successfully',
|
||||
subtype: 'success',
|
||||
duration_ms: 2000,
|
||||
session_id: 'sess-ok',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('result');
|
||||
expect(msg!.subtype).toBe('success');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
import { GeminiProvider } from '@/providers/gemini-provider.js';
|
||||
import type { ProviderMessage } from '@automaker/types';
|
||||
|
||||
describe('gemini-provider.ts', () => {
|
||||
let provider: GeminiProvider;
|
||||
@@ -116,4 +117,140 @@ describe('gemini-provider.ts', () => {
|
||||
expect(args[modelIndex + 1]).toBe('gemini-2.5-pro');
|
||||
});
|
||||
});
|
||||
|
||||
describe('normalizeEvent - error handling', () => {
|
||||
it('returns error from result event when status=error and error field is set', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: 'Model overloaded',
|
||||
session_id: 'sess-gemini-1',
|
||||
stats: { duration_ms: 4000, total_tokens: 0 },
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toBe('Model overloaded');
|
||||
expect(msg.session_id).toBe('sess-gemini-1');
|
||||
});
|
||||
|
||||
it('builds diagnostic fallback when result event has status=error but empty error field', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-2',
|
||||
stats: { duration_ms: 7500, total_tokens: 0 },
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
// Diagnostic info should be present instead of 'Unknown error'
|
||||
expect(msg.error).toContain('7500ms');
|
||||
expect(msg.error).toContain('sess-gemini-2');
|
||||
expect(msg.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('builds fallback with "unknown" duration when stats are missing', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-nostats',
|
||||
// no stats field
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toContain('unknown');
|
||||
});
|
||||
|
||||
it('returns error from standalone error event with error field set', () => {
|
||||
const event = {
|
||||
type: 'error',
|
||||
error: 'API key invalid',
|
||||
session_id: 'sess-gemini-3',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toBe('API key invalid');
|
||||
});
|
||||
|
||||
it('builds diagnostic fallback when standalone error event has empty error field', () => {
|
||||
const event = {
|
||||
type: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-empty',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
// Should include session_id, not just 'Unknown error'
|
||||
expect(msg.error).toContain('sess-gemini-empty');
|
||||
expect(msg.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('builds fallback mentioning "none" when session_id is missing from error event', () => {
|
||||
const event = {
|
||||
type: 'error',
|
||||
error: '',
|
||||
// no session_id
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toContain('none');
|
||||
});
|
||||
|
||||
it('uses consistent "Gemini agent failed" label for both result and error event fallbacks', () => {
|
||||
const resultEvent = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-r',
|
||||
stats: { duration_ms: 1000 },
|
||||
};
|
||||
const errorEvent = {
|
||||
type: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-e',
|
||||
};
|
||||
|
||||
const resultMsg = provider.normalizeEvent(resultEvent) as ProviderMessage;
|
||||
const errorMsg = provider.normalizeEvent(errorEvent) as ProviderMessage;
|
||||
|
||||
// Both fallback messages should use the same "Gemini agent failed" prefix
|
||||
expect(resultMsg.error).toContain('Gemini agent failed');
|
||||
expect(errorMsg.error).toContain('Gemini agent failed');
|
||||
});
|
||||
|
||||
it('returns success result when result event has status=success', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'success',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-ok',
|
||||
stats: { duration_ms: 1200, total_tokens: 500 },
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('result');
|
||||
expect(msg.subtype).toBe('success');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -685,6 +685,309 @@ describe('AgentExecutor', () => {
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow('API rate limit exceeded');
|
||||
});
|
||||
|
||||
it('should throw "Unknown error" when provider stream yields error with empty message', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-123',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow('Unknown error');
|
||||
});
|
||||
|
||||
it('should throw with sanitized error when provider yields ANSI-decorated error', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'error',
|
||||
// ANSI color codes + "Error: " prefix that should be stripped
|
||||
error: '\x1b[31mError: Connection refused\x1b[0m',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
// Should strip ANSI codes and "Error: " prefix
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow('Connection refused');
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_max_turns', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'assistant',
|
||||
message: {
|
||||
content: [{ type: 'text', text: 'Working on it...' }],
|
||||
},
|
||||
};
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_max_turns',
|
||||
session_id: 'sess-456',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_max_turns'
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_during_execution', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_during_execution',
|
||||
session_id: 'sess-789',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_during_execution'
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_max_structured_output_retries', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_max_structured_output_retries',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_max_structured_output_retries'
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_max_budget_usd', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_max_budget_usd',
|
||||
session_id: 'sess-budget',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_max_budget_usd'
|
||||
);
|
||||
});
|
||||
|
||||
it('should NOT throw when result subtype is success', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'assistant',
|
||||
message: {
|
||||
content: [{ type: 'text', text: 'Done!' }],
|
||||
},
|
||||
};
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'success',
|
||||
session_id: 'sess-ok',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
// Should resolve without throwing
|
||||
const result = await executor.execute(options, callbacks);
|
||||
expect(result.aborted).toBe(false);
|
||||
expect(result.responseText).toContain('Done!');
|
||||
});
|
||||
|
||||
it('should throw error when authentication fails in response', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
|
||||
192
apps/server/tests/unit/services/agent-output-validation.test.ts
Normal file
192
apps/server/tests/unit/services/agent-output-validation.test.ts
Normal file
@@ -0,0 +1,192 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
|
||||
/**
|
||||
* Contract tests verifying the tool marker format used by agent-executor
|
||||
* (which writes agent output) and execution-service (which reads it to
|
||||
* determine if the agent did meaningful work).
|
||||
*
|
||||
* The agent-executor writes: `\n🔧 Tool: ${block.name}\n`
|
||||
* The execution-service checks: `agentOutput.includes('🔧 Tool:')`
|
||||
*
|
||||
* These tests ensure the marker format contract stays consistent and
|
||||
* document the exact detection logic used for status determination.
|
||||
*/
|
||||
|
||||
// The exact marker prefix that execution-service searches for
|
||||
const TOOL_MARKER = '🔧 Tool:';
|
||||
|
||||
// Minimum output length threshold for "meaningful work"
|
||||
const MIN_OUTPUT_LENGTH = 200;
|
||||
|
||||
/**
|
||||
* Simulates the agent-executor's tool_use output format.
|
||||
* See: agent-executor.ts line ~293
|
||||
*/
|
||||
function formatToolUseBlock(toolName: string, input?: Record<string, unknown>): string {
|
||||
let output = `\n${TOOL_MARKER} ${toolName}\n`;
|
||||
if (input) output += `Input: ${JSON.stringify(input, null, 2)}\n`;
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simulates the execution-service's output validation logic.
|
||||
* See: execution-service.ts lines ~427-429
|
||||
*/
|
||||
function validateAgentOutput(
|
||||
agentOutput: string,
|
||||
skipTests: boolean
|
||||
): 'verified' | 'waiting_approval' {
|
||||
const hasToolUsage = agentOutput.includes(TOOL_MARKER);
|
||||
const hasMinimalOutput = agentOutput.trim().length < MIN_OUTPUT_LENGTH;
|
||||
const agentDidWork = hasToolUsage && !hasMinimalOutput;
|
||||
|
||||
if (skipTests) return 'waiting_approval';
|
||||
if (!agentDidWork) return 'waiting_approval';
|
||||
return 'verified';
|
||||
}
|
||||
|
||||
// Contract tests: the marker format written by agent-executor and the
// detection logic in execution-service must stay in lockstep. Each group
// below pins one side of that contract.
describe('Agent Output Validation - Contract Tests', () => {
  // Writer side: the exact text agent-executor emits for a tool_use block.
  describe('tool marker format contract', () => {
    it('agent-executor tool format contains the expected marker', () => {
      const toolOutput = formatToolUseBlock('Read', { file_path: '/src/index.ts' });
      expect(toolOutput).toContain(TOOL_MARKER);
    });

    it('agent-executor tool format includes tool name after marker', () => {
      const toolOutput = formatToolUseBlock('Edit', {
        file_path: '/src/app.ts',
        old_string: 'foo',
        new_string: 'bar',
      });
      expect(toolOutput).toContain('🔧 Tool: Edit');
    });

    it('agent-executor tool format includes JSON input', () => {
      const input = { file_path: '/src/index.ts' };
      const toolOutput = formatToolUseBlock('Read', input);
      expect(toolOutput).toContain('Input: ');
      expect(toolOutput).toContain('"file_path": "/src/index.ts"');
    });

    it('agent-executor tool format works without input', () => {
      const toolOutput = formatToolUseBlock('Bash');
      expect(toolOutput).toContain('🔧 Tool: Bash');
      expect(toolOutput).not.toContain('Input:');
    });

    it('marker includes colon and space to avoid false positives', () => {
      // Ensure the marker is specific enough to avoid matching other emoji patterns
      expect(TOOL_MARKER).toBe('🔧 Tool:');
      expect(TOOL_MARKER).toContain(':');
    });
  });

  // Reader side: how the marker + length threshold map onto a status.
  describe('output validation logic', () => {
    it('verified: tool usage + sufficient output', () => {
      const output =
        'Starting implementation of the new feature...\n' +
        formatToolUseBlock('Read', { file_path: '/src/index.ts' }) +
        'I can see the existing code. Let me make the needed changes.\n' +
        formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) +
        'Changes complete. The implementation adds new validation logic and tests.';
      // Sanity-check the fixture actually clears the length threshold.
      expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(output, false)).toBe('verified');
    });

    it('waiting_approval: no tool markers regardless of length', () => {
      const longOutput = 'I analyzed the codebase. '.repeat(50);
      expect(longOutput.trim().length).toBeGreaterThan(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(longOutput, false)).toBe('waiting_approval');
    });

    it('waiting_approval: tool markers but insufficient length', () => {
      const shortOutput = formatToolUseBlock('Read', { file_path: '/src/a.ts' });
      expect(shortOutput.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(shortOutput, false)).toBe('waiting_approval');
    });

    it('waiting_approval: empty output', () => {
      expect(validateAgentOutput('', false)).toBe('waiting_approval');
    });

    it('waiting_approval: skipTests always overrides', () => {
      const goodOutput =
        'Starting...\n' +
        formatToolUseBlock('Read', { file_path: '/src/index.ts' }) +
        formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) +
        'Done implementing. '.repeat(15);
      expect(goodOutput.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(goodOutput, true)).toBe('waiting_approval');
    });

    it('boundary: exactly MIN_OUTPUT_LENGTH chars with tool is verified', () => {
      // Threshold is inclusive: exactly MIN_OUTPUT_LENGTH trimmed chars passes.
      const tool = formatToolUseBlock('Read');
      const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - tool.trim().length);
      const output = tool + padding;
      expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(output, false)).toBe('verified');
    });

    it('boundary: MIN_OUTPUT_LENGTH - 1 chars with tool is waiting_approval', () => {
      const marker = `${TOOL_MARKER} Read\n`;
      const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - 1 - marker.length);
      const output = marker + padding;
      expect(output.trim().length).toBe(MIN_OUTPUT_LENGTH - 1);

      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });
  });

  // End-to-end shaped fixtures modeled on real provider transcripts.
  describe('realistic provider scenarios', () => {
    it('Claude SDK agent with multiple tools → verified', () => {
      let output = "I'll implement the feature.\n\n";
      output += formatToolUseBlock('Read', { file_path: '/src/components/App.tsx' });
      output += 'I see the component. Let me update it.\n\n';
      output += formatToolUseBlock('Edit', {
        file_path: '/src/components/App.tsx',
        old_string: 'const App = () => {',
        new_string: 'const App: React.FC = () => {',
      });
      output += 'Done. The component is now typed correctly.\n';

      expect(validateAgentOutput(output, false)).toBe('verified');
    });

    it('Cursor CLI quick exit (no tools) → waiting_approval', () => {
      const output = 'Task received. Processing...\nResult: completed successfully.';
      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });

    it('Codex CLI with brief acknowledgment → waiting_approval', () => {
      const output = 'Understood the task. Starting implementation.\nDone.';
      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });

    it('Agent that only reads but makes no edits (single Read tool, short output) → waiting_approval', () => {
      const output = formatToolUseBlock('Read', { file_path: '/src/index.ts' }) + 'File read.';
      expect(output.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH);
      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });

    it('Agent with extensive tool usage and explanation → verified', () => {
      let output = 'Analyzing the codebase for the authentication feature.\n\n';
      for (let i = 0; i < 5; i++) {
        output += formatToolUseBlock('Read', { file_path: `/src/auth/handler${i}.ts` });
        output += `Found handler ${i}. `;
      }
      output += formatToolUseBlock('Edit', {
        file_path: '/src/auth/handler0.ts',
        old_string: 'function login() {}',
        new_string: 'async function login(creds: Credentials) { ... }',
      });
      output += 'Implementation complete with all authentication changes applied.\n';

      expect(validateAgentOutput(output, false)).toBe('verified');
    });
  });
});
|
||||
@@ -211,7 +211,14 @@ describe('execution-service.ts', () => {
|
||||
});
|
||||
|
||||
// Default mocks for secureFs
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('Agent output content');
|
||||
// Include tool usage markers to simulate meaningful agent output.
|
||||
// The execution service checks for '🔧 Tool:' markers and minimum
|
||||
// output length to determine if the agent did real work.
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(
|
||||
'Starting implementation...\n\n🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\n\n' +
|
||||
'🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts", "old_string": "foo", "new_string": "bar"}\n\n' +
|
||||
'Implementation complete. Updated the code as requested.'
|
||||
);
|
||||
vi.mocked(secureFs.access).mockResolvedValue(undefined);
|
||||
|
||||
// Re-setup platform mocks
|
||||
@@ -1433,4 +1440,439 @@ describe('execution-service.ts', () => {
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('executeFeature - agent output validation', () => {
|
||||
// Helper to generate realistic agent output with tool markers
|
||||
const makeAgentOutput = (toolCount: number, extraText = ''): string => {
|
||||
let output = 'Starting implementation...\n\n';
|
||||
for (let i = 0; i < toolCount; i++) {
|
||||
output += `🔧 Tool: Edit\nInput: {"file_path": "/src/file${i}.ts", "old_string": "old${i}", "new_string": "new${i}"}\n\n`;
|
||||
}
|
||||
output += `Implementation complete. ${extraText}`;
|
||||
return output;
|
||||
};
|
||||
|
||||
const createServiceWithMocks = () => {
|
||||
return new ExecutionService(
|
||||
mockEventBus,
|
||||
mockConcurrencyManager,
|
||||
mockWorktreeResolver,
|
||||
mockSettingsService,
|
||||
mockRunAgentFn,
|
||||
mockExecutePipelineFn,
|
||||
mockUpdateFeatureStatusFn,
|
||||
mockLoadFeatureFn,
|
||||
mockGetPlanningPromptPrefixFn,
|
||||
mockSaveFeatureSummaryFn,
|
||||
mockRecordLearningsFn,
|
||||
mockContextExistsFn,
|
||||
mockResumeFeatureFn,
|
||||
mockTrackFailureFn,
|
||||
mockSignalPauseFn,
|
||||
mockRecordSuccessFn,
|
||||
mockSaveExecutionStateFn,
|
||||
mockLoadContextFilesFn
|
||||
);
|
||||
};
|
||||
|
||||
it('sets verified when agent output has tool usage and sufficient length', async () => {
|
||||
const output = makeAgentOutput(3, 'Updated authentication module with new login flow.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
await service.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output is empty', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output has no tool usage markers', async () => {
|
||||
// Long output but no tool markers - agent printed text but didn't use tools
|
||||
const longOutputNoTools = 'I analyzed the codebase and found several issues. '.repeat(20);
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(longOutputNoTools);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output has tool markers but is too short', async () => {
|
||||
// Has a tool marker but total output is under 200 chars
|
||||
const shortWithTool = '🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\nDone.';
|
||||
expect(shortWithTool.trim().length).toBeLessThan(200);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(shortWithTool);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output file is missing (ENOENT)', async () => {
|
||||
vi.mocked(secureFs.readFile).mockRejectedValue(new Error('ENOENT'));
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output is only whitespace', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(' \n\n\t \n ');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets verified when output is exactly at the 200 char threshold with tool usage', async () => {
|
||||
// Create output that's exactly 200 chars trimmed with tool markers
|
||||
const toolMarker = '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts"}\n';
|
||||
const padding = 'x'.repeat(200 - toolMarker.length);
|
||||
const output = toolMarker + padding;
|
||||
expect(output.trim().length).toBeGreaterThanOrEqual(200);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when output is 199 chars with tool usage (below threshold)', async () => {
|
||||
const toolMarker = '🔧 Tool: Read\n';
|
||||
const padding = 'x'.repeat(199 - toolMarker.length);
|
||||
const output = toolMarker + padding;
|
||||
expect(output.trim().length).toBe(199);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('skipTests always takes priority over output validation', async () => {
|
||||
// Meaningful output with tool usage - would normally be 'verified'
|
||||
const output = makeAgentOutput(5, 'All changes applied successfully.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
|
||||
const svc = createServiceWithMocks();
|
||||
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// skipTests=true always means waiting_approval regardless of output quality
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('skipTests with empty output still results in waiting_approval', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
|
||||
const svc = createServiceWithMocks();
|
||||
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('still records success even when output validation fails', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// recordSuccess should still be called - the agent ran without errors
|
||||
expect(mockRecordSuccessFn).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('still extracts summary when output has content but no tool markers', async () => {
|
||||
const outputNoTools = 'A '.repeat(150); // > 200 chars but no tool markers
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(outputNoTools);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Summary extraction still runs even though status is waiting_approval
|
||||
expect(extractSummary).toHaveBeenCalledWith(outputNoTools);
|
||||
expect(mockSaveFeatureSummaryFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'Test summary'
|
||||
);
|
||||
});
|
||||
|
||||
it('emits feature_complete with passes=true even when output validation routes to waiting_approval', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1', false, true);
|
||||
|
||||
// The agent ran without error - it's still a "pass" from the execution perspective
|
||||
expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
|
||||
'auto_mode_feature_complete',
|
||||
expect.objectContaining({ passes: true })
|
||||
);
|
||||
});
|
||||
|
||||
it('handles realistic Cursor CLI output that exits quickly', async () => {
|
||||
// Simulates a Cursor CLI that prints a brief message and exits
|
||||
const cursorQuickExit = 'Task received. Processing...\nResult: completed successfully.';
|
||||
expect(cursorQuickExit.includes('🔧 Tool:')).toBe(false);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(cursorQuickExit);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// No tool usage = waiting_approval
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('handles realistic Claude SDK output with multiple tool uses', async () => {
|
||||
// Simulates a Claude SDK agent that does real work
|
||||
const claudeOutput =
|
||||
"I'll implement the requested feature.\n\n" +
|
||||
'🔧 Tool: Read\nInput: {"file_path": "/src/components/App.tsx"}\n\n' +
|
||||
'I can see the existing component structure. Let me modify it.\n\n' +
|
||||
'🔧 Tool: Edit\nInput: {"file_path": "/src/components/App.tsx", "old_string": "const App = () => {", "new_string": "const App: React.FC = () => {"}\n\n' +
|
||||
'🔧 Tool: Write\nInput: {"file_path": "/src/components/NewFeature.tsx"}\n\n' +
|
||||
"I've created the new component and updated the existing one. The feature is now implemented with proper TypeScript types.";
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(claudeOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Real work = verified
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('reads agent output from the correct path with utf-8 encoding', async () => {
|
||||
const output = makeAgentOutput(2, 'Done with changes.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Verify readFile was called with the correct path derived from getFeatureDir
|
||||
expect(secureFs.readFile).toHaveBeenCalledWith(
|
||||
'/test/project/.automaker/features/feature-1/agent-output.md',
|
||||
'utf-8'
|
||||
);
|
||||
});
|
||||
|
||||
it('completion message includes auto-verified when status is verified', async () => {
|
||||
const output = makeAgentOutput(3, 'All changes applied.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1', false, true);
|
||||
|
||||
expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
|
||||
'auto_mode_feature_complete',
|
||||
expect.objectContaining({
|
||||
message: expect.stringContaining('auto-verified'),
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('completion message does NOT include auto-verified when status is waiting_approval', async () => {
|
||||
// Empty output → waiting_approval
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1', false, true);
|
||||
|
||||
const completeCall = vi
|
||||
.mocked(mockEventBus.emitAutoModeEvent)
|
||||
.mock.calls.find((call) => call[0] === 'auto_mode_feature_complete');
|
||||
expect(completeCall).toBeDefined();
|
||||
expect((completeCall![1] as { message: string }).message).not.toContain('auto-verified');
|
||||
});
|
||||
|
||||
it('uses same agentOutput for both status determination and summary extraction', async () => {
|
||||
// Specific output that is long enough with tool markers (verified path)
|
||||
// AND has content for summary extraction
|
||||
const specificOutput =
|
||||
'🔧 Tool: Read\nReading file...\n🔧 Tool: Edit\nEditing file...\n' +
|
||||
'The implementation is complete. Here is a detailed description of what was done. '.repeat(
|
||||
3
|
||||
);
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(specificOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Status should be verified (has tools + long enough)
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
// extractSummary should receive the exact same output
|
||||
expect(extractSummary).toHaveBeenCalledWith(specificOutput);
|
||||
// recordLearnings should also receive the same output
|
||||
expect(mockRecordLearningsFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
testFeature,
|
||||
specificOutput
|
||||
);
|
||||
});
|
||||
|
||||
it('does not call recordMemoryUsage when output is empty and memoryFiles is empty', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
const { recordMemoryUsage } = await import('@automaker/utils');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// With empty output and empty memoryFiles, recordMemoryUsage should not be called
|
||||
expect(recordMemoryUsage).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('handles output with special unicode characters correctly', async () => {
|
||||
// Output with various unicode but includes tool markers
|
||||
const unicodeOutput =
|
||||
'🔧 Tool: Read\n' +
|
||||
'🔧 Tool: Edit\n' +
|
||||
'Añadiendo función de búsqueda con caracteres especiales: ñ, ü, ö, é, 日本語テスト. ' +
|
||||
'Die Änderungen wurden erfolgreich implementiert. '.repeat(3);
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(unicodeOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Should still detect tool markers and sufficient length
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('treats output with only newlines and spaces around tool marker as insufficient', async () => {
|
||||
// Has tool marker but surrounded by whitespace, total trimmed < 200
|
||||
const sparseOutput = '\n\n 🔧 Tool: Read \n\n';
|
||||
expect(sparseOutput.trim().length).toBeLessThan(200);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(sparseOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('detects tool marker substring correctly (partial match like "🔧 Tools:" does not count)', async () => {
|
||||
// Output with a similar but not exact marker - "🔧 Tools:" instead of "🔧 Tool:"
|
||||
const wrongMarker = '🔧 Tools: Read\n🔧 Tools: Edit\n' + 'Implementation done. '.repeat(20);
|
||||
expect(wrongMarker.includes('🔧 Tool:')).toBe(false);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(wrongMarker);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// "🔧 Tools:" is not the same as "🔧 Tool:" - should be waiting_approval
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('pipeline merge_conflict status short-circuits before output validation', async () => {
|
||||
// Set up pipeline that results in merge_conflict
|
||||
vi.mocked(pipelineService.getPipelineConfig).mockResolvedValue({
|
||||
version: 1,
|
||||
steps: [{ id: 'step-1', name: 'Step 1', order: 1, instructions: 'Do step 1' }] as any,
|
||||
});
|
||||
|
||||
// After pipeline, loadFeature returns merge_conflict status
|
||||
let loadCallCount = 0;
|
||||
mockLoadFeatureFn = vi.fn().mockImplementation(() => {
|
||||
loadCallCount++;
|
||||
if (loadCallCount === 1) return testFeature; // initial load
|
||||
// All subsequent loads (task check + pipeline refresh) return merge_conflict
|
||||
return { ...testFeature, status: 'merge_conflict' };
|
||||
});
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Should NOT have called updateFeatureStatusFn with 'verified' or 'waiting_approval'
|
||||
// because pipeline merge_conflict short-circuits the method
|
||||
const statusCalls = vi
|
||||
.mocked(mockUpdateFeatureStatusFn)
|
||||
.mock.calls.filter((call) => call[2] === 'verified' || call[2] === 'waiting_approval');
|
||||
// The only non-in_progress status call should be absent since merge_conflict returns early
|
||||
expect(statusCalls.length).toBe(0);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user