Fix agent output validation to prevent false verified status (#807)

* Changes from fix/cursor-fix * feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port * Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ci: Update test server port from 3008 to 3108 and add environment configuration * fix: Correct typo in health endpoint URL and standardize port env vars --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2026-03-16 21:53:07 +00:00 · 2026-02-24 20:18:40 -08:00
parent 0330c70261
commit 51e9a23ba1
36 changed files with 1610 additions and 104 deletions
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -46,7 +46,8 @@ jobs:
          echo "SERVER_PID=$SERVER_PID" >> $GITHUB_ENV

        env:
-          PORT: 3008
+          PORT: 3108
+          TEST_SERVER_PORT: 3108
          NODE_ENV: test
          # Use a deterministic API key so Playwright can log in reliably
          AUTOMAKER_API_KEY: test-api-key-for-e2e-tests
@@ -81,13 +82,13 @@ jobs:

          # Wait for health endpoint
          for i in {1..60}; do
-            if curl -s -f http://localhost:3008/api/health > /dev/null 2>&1; then
+            if curl -s -f http://localhost:3108/api/health > /dev/null 2>&1; then
              echo "Backend server is ready!"
              echo "=== Backend logs ==="
              cat backend.log
              echo ""
              echo "Health check response:"
-              curl -s http://localhost:3008/api/health | jq . 2>/dev/null || echo "Health check: $(curl -s http://localhost:3008/api/health 2>/dev/null || echo 'No response')"
+              curl -s http://localhost:3108/api/health | jq . 2>/dev/null || echo "Health check: $(curl -s http://localhost:3108/api/health 2>/dev/null || echo 'No response')"
              exit 0
            fi
            
@@ -111,11 +112,11 @@ jobs:
          ps aux | grep -E "(node|tsx)" | grep -v grep || echo "No node processes found"
          echo ""
          echo "=== Port status ==="
-          netstat -tlnp 2>/dev/null | grep :3008 || echo "Port 3008 not listening"
-          lsof -i :3008 2>/dev/null || echo "lsof not available or port not in use"
+          netstat -tlnp 2>/dev/null | grep :3108 || echo "Port 3108 not listening"
+          lsof -i :3108 2>/dev/null || echo "lsof not available or port not in use"
          echo ""
          echo "=== Health endpoint test ==="
-          curl -v http://localhost:3008/api/health 2>&1 || echo "Health endpoint failed"
+          curl -v http://localhost:3108/api/health 2>&1 || echo "Health endpoint failed"

          # Kill the server process if it's still hanging
          if kill -0 $SERVER_PID 2>/dev/null; then
@@ -132,8 +133,8 @@ jobs:
        run: npm run test --workspace=apps/ui
        env:
          CI: true
-          VITE_SERVER_URL: http://localhost:3008
-          SERVER_URL: http://localhost:3008
+          VITE_SERVER_URL: http://localhost:3108
+          SERVER_URL: http://localhost:3108
          VITE_SKIP_SETUP: 'true'
          # Keep UI-side login/defaults consistent
          AUTOMAKER_API_KEY: test-api-key-for-e2e-tests
@@ -148,7 +149,7 @@ jobs:
          ps aux | grep -E "(node|tsx)" | grep -v grep || echo "No node processes found"
          echo ""
          echo "=== Port status ==="
-          netstat -tlnp 2>/dev/null | grep :3008 || echo "Port 3008 not listening"
+          netstat -tlnp 2>/dev/null | grep :3108 || echo "Port 3108 not listening"

      - name: Upload Playwright report
        uses: actions/upload-artifact@v4
--- a/apps/server/.env.example
+++ b/apps/server/.env.example
@@ -52,6 +52,12 @@ HOST=0.0.0.0
 # Port to run the server on
 PORT=3008

+# Port to run the server on for testing
+TEST_SERVER_PORT=3108
+
+# Port to run the UI on for testing
+TEST_PORT=3107
+
 # Data directory for sessions and metadata
 DATA_DIR=./data

--- a/apps/server/src/providers/copilot-provider.ts
+++ b/apps/server/src/providers/copilot-provider.ts
@@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider {

      case 'session.error': {
        const errorEvent = sdkEvent as SdkSessionErrorEvent;
+        const enrichedError =
+          errorEvent.data.message ||
+          (errorEvent.data.code
+            ? `Copilot agent error (code: ${errorEvent.data.code})`
+            : 'Copilot agent error');
        return {
          type: 'error',
-          error: errorEvent.data.message || 'Unknown error',
+          error: enrichedError,
        };
      }

--- a/apps/server/src/providers/cursor-provider.ts
+++ b/apps/server/src/providers/cursor-provider.ts
@@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider {
        const resultEvent = cursorEvent as CursorResultEvent;

        if (resultEvent.is_error) {
+          const errorText = resultEvent.error || resultEvent.result || '';
+          const enrichedError =
+            errorText ||
+            `Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`;
          return {
            type: 'error',
            session_id: resultEvent.session_id,
-            error: resultEvent.error || resultEvent.result || 'Unknown error',
+            error: enrichedError,
          };
        }

--- a/apps/server/src/providers/gemini-provider.ts
+++ b/apps/server/src/providers/gemini-provider.ts
@@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider {
        const resultEvent = geminiEvent as GeminiResultEvent;

        if (resultEvent.status === 'error') {
+          const enrichedError =
+            resultEvent.error ||
+            `Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`;
          return {
            type: 'error',
            session_id: resultEvent.session_id,
-            error: resultEvent.error || 'Unknown error',
+            error: enrichedError,
          };
        }

@@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider {

      case 'error': {
        const errorEvent = geminiEvent as GeminiResultEvent;
+        const enrichedError =
+          errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 'none'})`;
        return {
          type: 'error',
          session_id: errorEvent.session_id,
-          error: errorEvent.error || 'Unknown error',
+          error: enrichedError,
        };
      }

--- a/apps/server/src/services/agent-executor.ts
+++ b/apps/server/src/services/agent-executor.ts
@@ -296,8 +296,28 @@ export class AgentExecutor {
            }
          }
        } else if (msg.type === 'error') {
-          throw new Error(AgentExecutor.sanitizeProviderError(msg.error));
-        } else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite();
+          const sanitized = AgentExecutor.sanitizeProviderError(msg.error);
+          logger.error(
+            `[execute] Feature ${featureId} received error from provider. ` +
+              `raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
+          );
+          throw new Error(sanitized);
+        } else if (msg.type === 'result') {
+          if (msg.subtype === 'success') {
+            scheduleWrite();
+          } else if (msg.subtype?.startsWith('error')) {
+            // Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.)
+            logger.error(
+              `[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` +
+                `session_id=${msg.session_id ?? 'none'}`
+            );
+            throw new Error(`Agent execution ended with: ${msg.subtype}`);
+          } else {
+            logger.warn(
+              `[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}`
+            );
+          }
+        }
      }
    } finally {
      clearInterval(streamHeartbeat);
@@ -447,16 +467,28 @@ export class AgentExecutor {
              });
          }
        } else if (msg.type === 'error') {
-          // Clean the error: strip ANSI codes and redundant "Error: " prefix
-          const cleanedError =
-            (msg.error || `Error during task ${task.id}`)
-              .replace(/\x1b\[[0-9;]*m/g, '')
-              .replace(/^Error:\s*/i, '')
-              .trim() || `Error during task ${task.id}`;
-          throw new Error(cleanedError);
-        } else if (msg.type === 'result' && msg.subtype === 'success') {
-          taskOutput += msg.result || '';
-          responseText += msg.result || '';
+          const fallback = `Error during task ${task.id}`;
+          const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback);
+          logger.error(
+            `[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` +
+              `raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
+          );
+          throw new Error(sanitized);
+        } else if (msg.type === 'result') {
+          if (msg.subtype === 'success') {
+            taskOutput += msg.result || '';
+            responseText += msg.result || '';
+          } else if (msg.subtype?.startsWith('error')) {
+            logger.error(
+              `[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` +
+                `session_id=${msg.session_id ?? 'none'}`
+            );
+            throw new Error(`Agent execution ended with: ${msg.subtype}`);
+          } else {
+            logger.warn(
+              `[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}`
+            );
+          }
        }
      }
      if (!taskCompleteDetected)
--- a/apps/server/src/services/execution-service.ts
+++ b/apps/server/src/services/execution-service.ts
@@ -60,6 +60,12 @@ import type {

 const logger = createLogger('ExecutionService');

+/** Marker written by agent-executor for each tool invocation. */
+const TOOL_USE_MARKER = '🔧 Tool:';
+
+/** Minimum trimmed output length to consider agent work meaningful. */
+const MIN_MEANINGFUL_OUTPUT_LENGTH = 200;
+
 export class ExecutionService {
  constructor(
    private eventBus: TypedEventBus,
@@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th
        }
      }

-      const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified';
+      // Read agent output before determining final status.
+      // CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing
+      // meaningful work. Check output to avoid prematurely marking as 'verified'.
+      const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
+      let agentOutput = '';
+      try {
+        agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
+      } catch {
+        /* */
+      }
+
+      // Determine if the agent did meaningful work by checking for tool usage
+      // indicators in the output. The agent executor writes "🔧 Tool:" markers
+      // each time a tool is invoked. No tool usage suggests the CLI exited
+      // without performing implementation work.
+      const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER);
+      const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH;
+      const agentDidWork = hasToolUsage && !isOutputTooShort;
+
+      let finalStatus: 'verified' | 'waiting_approval';
+      if (feature.skipTests) {
+        finalStatus = 'waiting_approval';
+      } else if (!agentDidWork) {
+        // Agent didn't produce meaningful output (e.g., CLI exited quickly).
+        // Route to waiting_approval so the user can review and re-run.
+        finalStatus = 'waiting_approval';
+        logger.warn(
+          `[executeFeature] Feature ${featureId}: agent produced insufficient output ` +
+            `(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` +
+            `Setting status to waiting_approval instead of verified.`
+        );
+      } else {
+        finalStatus = 'verified';
+      }
+
      await this.updateFeatureStatusFn(projectPath, featureId, finalStatus);
      this.recordSuccessFn();

@@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. Use th
      const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks;

      try {
-        const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
-        let agentOutput = '';
-        try {
-          agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
-        } catch {
-          /* */
-        }
        if (agentOutput) {
          const summary = extractSummary(agentOutput);
          if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary);
--- a/apps/server/tests/unit/lib/thinking-level-normalization.test.ts
+++ b/apps/server/tests/unit/lib/thinking-level-normalization.test.ts
@@ -0,0 +1,20 @@
+import { describe, it, expect } from 'vitest';
+import { normalizeThinkingLevelForModel } from '@automaker/types';
+
+describe('normalizeThinkingLevelForModel', () => {
+  it('preserves explicitly selected none for Opus models', () => {
+    expect(normalizeThinkingLevelForModel('claude-opus', 'none')).toBe('none');
+  });
+
+  it('falls back to none when Opus receives an unsupported manual thinking level', () => {
+    expect(normalizeThinkingLevelForModel('claude-opus', 'medium')).toBe('none');
+  });
+
+  it('keeps adaptive for Opus when adaptive is selected', () => {
+    expect(normalizeThinkingLevelForModel('claude-opus', 'adaptive')).toBe('adaptive');
+  });
+
+  it('preserves supported manual levels for non-Opus models', () => {
+    expect(normalizeThinkingLevelForModel('claude-sonnet', 'high')).toBe('high');
+  });
+});
--- a/apps/server/tests/unit/providers/copilot-provider.test.ts
+++ b/apps/server/tests/unit/providers/copilot-provider.test.ts
@@ -397,6 +397,45 @@ describe('copilot-provider.ts', () => {
      });
    });

+    it('should use error code in fallback when session.error message is empty', () => {
+      const event = {
+        type: 'session.error',
+        data: { message: '', code: 'RATE_LIMIT_EXCEEDED' },
+      };
+
+      const result = provider.normalizeEvent(event);
+      expect(result).not.toBeNull();
+      expect(result!.type).toBe('error');
+      expect(result!.error).toContain('RATE_LIMIT_EXCEEDED');
+      expect(result!.error).not.toBe('Unknown error');
+    });
+
+    it('should return generic "Copilot agent error" fallback when both message and code are empty', () => {
+      const event = {
+        type: 'session.error',
+        data: { message: '', code: '' },
+      };
+
+      const result = provider.normalizeEvent(event);
+      expect(result).not.toBeNull();
+      expect(result!.type).toBe('error');
+      expect(result!.error).toBe('Copilot agent error');
+      // Must NOT be the old opaque 'Unknown error'
+      expect(result!.error).not.toBe('Unknown error');
+    });
+
+    it('should return generic "Copilot agent error" fallback when data has no code field', () => {
+      const event = {
+        type: 'session.error',
+        data: { message: '' },
+      };
+
+      const result = provider.normalizeEvent(event);
+      expect(result).not.toBeNull();
+      expect(result!.type).toBe('error');
+      expect(result!.error).toBe('Copilot agent error');
+    });
+
    it('should return null for unknown event types', () => {
      const event = { type: 'unknown.event' };

--- a/apps/server/tests/unit/providers/cursor-provider.test.ts
+++ b/apps/server/tests/unit/providers/cursor-provider.test.ts
@@ -1,4 +1,4 @@
-import { describe, it, expect } from 'vitest';
+import { describe, it, expect, beforeEach } from 'vitest';
 import { CursorProvider } from '@/providers/cursor-provider.js';

 describe('cursor-provider.ts', () => {
@@ -36,4 +36,122 @@ describe('cursor-provider.ts', () => {
      expect(args).not.toContain('--resume');
    });
  });
+
+  describe('normalizeEvent - result error handling', () => {
+    let provider: CursorProvider;
+
+    beforeEach(() => {
+      provider = Object.create(CursorProvider.prototype) as CursorProvider;
+    });
+
+    it('returns error message from resultEvent.error when is_error=true', () => {
+      const event = {
+        type: 'result',
+        is_error: true,
+        error: 'Rate limit exceeded',
+        result: '',
+        subtype: 'error',
+        duration_ms: 3000,
+        session_id: 'sess-123',
+      };
+
+      const msg = provider.normalizeEvent(event);
+
+      expect(msg).not.toBeNull();
+      expect(msg!.type).toBe('error');
+      expect(msg!.error).toBe('Rate limit exceeded');
+    });
+
+    it('falls back to resultEvent.result when error field is empty and is_error=true', () => {
+      const event = {
+        type: 'result',
+        is_error: true,
+        error: '',
+        result: 'Process terminated unexpectedly',
+        subtype: 'error',
+        duration_ms: 5000,
+        session_id: 'sess-456',
+      };
+
+      const msg = provider.normalizeEvent(event);
+
+      expect(msg).not.toBeNull();
+      expect(msg!.type).toBe('error');
+      expect(msg!.error).toBe('Process terminated unexpectedly');
+    });
+
+    it('builds diagnostic fallback when both error and result are empty and is_error=true', () => {
+      const event = {
+        type: 'result',
+        is_error: true,
+        error: '',
+        result: '',
+        subtype: 'error',
+        duration_ms: 5000,
+        session_id: 'sess-789',
+      };
+
+      const msg = provider.normalizeEvent(event);
+
+      expect(msg).not.toBeNull();
+      expect(msg!.type).toBe('error');
+      // Should contain diagnostic info rather than 'Unknown error'
+      expect(msg!.error).toContain('5000ms');
+      expect(msg!.error).toContain('sess-789');
+      expect(msg!.error).not.toBe('Unknown error');
+    });
+
+    it('preserves session_id in error message', () => {
+      const event = {
+        type: 'result',
+        is_error: true,
+        error: 'Timeout occurred',
+        result: '',
+        subtype: 'error',
+        duration_ms: 30000,
+        session_id: 'my-session-id',
+      };
+
+      const msg = provider.normalizeEvent(event);
+
+      expect(msg!.session_id).toBe('my-session-id');
+    });
+
+    it('uses "none" when session_id is missing from diagnostic fallback', () => {
+      const event = {
+        type: 'result',
+        is_error: true,
+        error: '',
+        result: '',
+        subtype: 'error',
+        duration_ms: 5000,
+        // session_id intentionally omitted
+      };
+
+      const msg = provider.normalizeEvent(event);
+
+      expect(msg).not.toBeNull();
+      expect(msg!.type).toBe('error');
+      expect(msg!.error).toContain('none');
+      expect(msg!.error).not.toContain('undefined');
+    });
+
+    it('returns success result when is_error=false', () => {
+      const event = {
+        type: 'result',
+        is_error: false,
+        error: '',
+        result: 'Completed successfully',
+        subtype: 'success',
+        duration_ms: 2000,
+        session_id: 'sess-ok',
+      };
+
+      const msg = provider.normalizeEvent(event);
+
+      expect(msg).not.toBeNull();
+      expect(msg!.type).toBe('result');
+      expect(msg!.subtype).toBe('success');
+    });
+  });
 });
--- a/apps/server/tests/unit/providers/gemini-provider.test.ts
+++ b/apps/server/tests/unit/providers/gemini-provider.test.ts
@@ -1,5 +1,6 @@
 import { describe, it, expect, beforeEach } from 'vitest';
 import { GeminiProvider } from '@/providers/gemini-provider.js';
+import type { ProviderMessage } from '@automaker/types';

 describe('gemini-provider.ts', () => {
  let provider: GeminiProvider;
@@ -116,4 +117,140 @@ describe('gemini-provider.ts', () => {
      expect(args[modelIndex + 1]).toBe('gemini-2.5-pro');
    });
  });
+
+  describe('normalizeEvent - error handling', () => {
+    it('returns error from result event when status=error and error field is set', () => {
+      const event = {
+        type: 'result',
+        status: 'error',
+        error: 'Model overloaded',
+        session_id: 'sess-gemini-1',
+        stats: { duration_ms: 4000, total_tokens: 0 },
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('error');
+      expect(msg.error).toBe('Model overloaded');
+      expect(msg.session_id).toBe('sess-gemini-1');
+    });
+
+    it('builds diagnostic fallback when result event has status=error but empty error field', () => {
+      const event = {
+        type: 'result',
+        status: 'error',
+        error: '',
+        session_id: 'sess-gemini-2',
+        stats: { duration_ms: 7500, total_tokens: 0 },
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('error');
+      // Diagnostic info should be present instead of 'Unknown error'
+      expect(msg.error).toContain('7500ms');
+      expect(msg.error).toContain('sess-gemini-2');
+      expect(msg.error).not.toBe('Unknown error');
+    });
+
+    it('builds fallback with "unknown" duration when stats are missing', () => {
+      const event = {
+        type: 'result',
+        status: 'error',
+        error: '',
+        session_id: 'sess-gemini-nostats',
+        // no stats field
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('error');
+      expect(msg.error).toContain('unknown');
+    });
+
+    it('returns error from standalone error event with error field set', () => {
+      const event = {
+        type: 'error',
+        error: 'API key invalid',
+        session_id: 'sess-gemini-3',
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('error');
+      expect(msg.error).toBe('API key invalid');
+    });
+
+    it('builds diagnostic fallback when standalone error event has empty error field', () => {
+      const event = {
+        type: 'error',
+        error: '',
+        session_id: 'sess-gemini-empty',
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('error');
+      // Should include session_id, not just 'Unknown error'
+      expect(msg.error).toContain('sess-gemini-empty');
+      expect(msg.error).not.toBe('Unknown error');
+    });
+
+    it('builds fallback mentioning "none" when session_id is missing from error event', () => {
+      const event = {
+        type: 'error',
+        error: '',
+        // no session_id
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('error');
+      expect(msg.error).toContain('none');
+    });
+
+    it('uses consistent "Gemini agent failed" label for both result and error event fallbacks', () => {
+      const resultEvent = {
+        type: 'result',
+        status: 'error',
+        error: '',
+        session_id: 'sess-r',
+        stats: { duration_ms: 1000 },
+      };
+      const errorEvent = {
+        type: 'error',
+        error: '',
+        session_id: 'sess-e',
+      };
+
+      const resultMsg = provider.normalizeEvent(resultEvent) as ProviderMessage;
+      const errorMsg = provider.normalizeEvent(errorEvent) as ProviderMessage;
+
+      // Both fallback messages should use the same "Gemini agent failed" prefix
+      expect(resultMsg.error).toContain('Gemini agent failed');
+      expect(errorMsg.error).toContain('Gemini agent failed');
+    });
+
+    it('returns success result when result event has status=success', () => {
+      const event = {
+        type: 'result',
+        status: 'success',
+        error: '',
+        session_id: 'sess-gemini-ok',
+        stats: { duration_ms: 1200, total_tokens: 500 },
+      };
+
+      const msg = provider.normalizeEvent(event) as ProviderMessage;
+
+      expect(msg).not.toBeNull();
+      expect(msg.type).toBe('result');
+      expect(msg.subtype).toBe('success');
+    });
+  });
 });
--- a/apps/server/tests/unit/services/agent-executor.test.ts
+++ b/apps/server/tests/unit/services/agent-executor.test.ts
@@ -685,6 +685,309 @@ describe('AgentExecutor', () => {
      await expect(executor.execute(options, callbacks)).rejects.toThrow('API rate limit exceeded');
    });

+    it('should throw "Unknown error" when provider stream yields error with empty message', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'error',
+            error: '',
+            session_id: 'sess-123',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      await expect(executor.execute(options, callbacks)).rejects.toThrow('Unknown error');
+    });
+
+    it('should throw with sanitized error when provider yields ANSI-decorated error', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'error',
+            // ANSI color codes + "Error: " prefix that should be stripped
+            error: '\x1b[31mError: Connection refused\x1b[0m',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      // Should strip ANSI codes and "Error: " prefix
+      await expect(executor.execute(options, callbacks)).rejects.toThrow('Connection refused');
+    });
+
+    it('should throw when result subtype is error_max_turns', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'assistant',
+            message: {
+              content: [{ type: 'text', text: 'Working on it...' }],
+            },
+          };
+          yield {
+            type: 'result',
+            subtype: 'error_max_turns',
+            session_id: 'sess-456',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      await expect(executor.execute(options, callbacks)).rejects.toThrow(
+        'Agent execution ended with: error_max_turns'
+      );
+    });
+
+    it('should throw when result subtype is error_during_execution', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'result',
+            subtype: 'error_during_execution',
+            session_id: 'sess-789',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      await expect(executor.execute(options, callbacks)).rejects.toThrow(
+        'Agent execution ended with: error_during_execution'
+      );
+    });
+
+    it('should throw when result subtype is error_max_structured_output_retries', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'result',
+            subtype: 'error_max_structured_output_retries',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      await expect(executor.execute(options, callbacks)).rejects.toThrow(
+        'Agent execution ended with: error_max_structured_output_retries'
+      );
+    });
+
+    it('should throw when result subtype is error_max_budget_usd', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'result',
+            subtype: 'error_max_budget_usd',
+            session_id: 'sess-budget',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      await expect(executor.execute(options, callbacks)).rejects.toThrow(
+        'Agent execution ended with: error_max_budget_usd'
+      );
+    });
+
+    it('should NOT throw when result subtype is success', async () => {
+      const executor = new AgentExecutor(
+        mockEventBus,
+        mockFeatureStateManager,
+        mockPlanApprovalService,
+        mockSettingsService
+      );
+
+      const mockProvider = {
+        getName: () => 'mock',
+        executeQuery: vi.fn().mockImplementation(function* () {
+          yield {
+            type: 'assistant',
+            message: {
+              content: [{ type: 'text', text: 'Done!' }],
+            },
+          };
+          yield {
+            type: 'result',
+            subtype: 'success',
+            session_id: 'sess-ok',
+          };
+        }),
+      } as unknown as BaseProvider;
+
+      const options: AgentExecutionOptions = {
+        workDir: '/test',
+        featureId: 'test-feature',
+        prompt: 'Test prompt',
+        projectPath: '/project',
+        abortController: new AbortController(),
+        provider: mockProvider,
+        effectiveBareModel: 'claude-sonnet-4-6',
+        planningMode: 'skip',
+      };
+
+      const callbacks = {
+        waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
+        saveFeatureSummary: vi.fn(),
+        updateFeatureSummary: vi.fn(),
+        buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
+      };
+
+      // Should resolve without throwing
+      const result = await executor.execute(options, callbacks);
+      expect(result.aborted).toBe(false);
+      expect(result.responseText).toContain('Done!');
+    });
+
    it('should throw error when authentication fails in response', async () => {
      const executor = new AgentExecutor(
        mockEventBus,
--- a/apps/server/tests/unit/services/agent-output-validation.test.ts
+++ b/apps/server/tests/unit/services/agent-output-validation.test.ts
@@ -0,0 +1,192 @@
+import { describe, it, expect } from 'vitest';
+
+/**
+ * Contract tests verifying the tool marker format used by agent-executor
+ * (which writes agent output) and execution-service (which reads it to
+ * determine if the agent did meaningful work).
+ *
+ * The agent-executor writes: `\n🔧 Tool: ${block.name}\n`
+ * The execution-service checks: `agentOutput.includes('🔧 Tool:')`
+ *
+ * These tests ensure the marker format contract stays consistent and
+ * document the exact detection logic used for status determination.
+ */
+
+// The exact marker prefix that execution-service searches for
+const TOOL_MARKER = '🔧 Tool:';
+
+// Minimum output length threshold for "meaningful work"
+const MIN_OUTPUT_LENGTH = 200;
+
+/**
+ * Simulates the agent-executor's tool_use output format.
+ * See: agent-executor.ts line ~293
+ */
+function formatToolUseBlock(toolName: string, input?: Record<string, unknown>): string {
+  let output = `\n${TOOL_MARKER} ${toolName}\n`;
+  if (input) output += `Input: ${JSON.stringify(input, null, 2)}\n`;
+  return output;
+}
+
+/**
+ * Simulates the execution-service's output validation logic.
+ * See: execution-service.ts lines ~427-429
+ */
+function validateAgentOutput(
+  agentOutput: string,
+  skipTests: boolean
+): 'verified' | 'waiting_approval' {
+  const hasToolUsage = agentOutput.includes(TOOL_MARKER);
+  const hasMinimalOutput = agentOutput.trim().length < MIN_OUTPUT_LENGTH;
+  const agentDidWork = hasToolUsage && !hasMinimalOutput;
+
+  if (skipTests) return 'waiting_approval';
+  if (!agentDidWork) return 'waiting_approval';
+  return 'verified';
+}
+
+describe('Agent Output Validation - Contract Tests', () => {
+  describe('tool marker format contract', () => {
+    it('agent-executor tool format contains the expected marker', () => {
+      const toolOutput = formatToolUseBlock('Read', { file_path: '/src/index.ts' });
+      expect(toolOutput).toContain(TOOL_MARKER);
+    });
+
+    it('agent-executor tool format includes tool name after marker', () => {
+      const toolOutput = formatToolUseBlock('Edit', {
+        file_path: '/src/app.ts',
+        old_string: 'foo',
+        new_string: 'bar',
+      });
+      expect(toolOutput).toContain('🔧 Tool: Edit');
+    });
+
+    it('agent-executor tool format includes JSON input', () => {
+      const input = { file_path: '/src/index.ts' };
+      const toolOutput = formatToolUseBlock('Read', input);
+      expect(toolOutput).toContain('Input: ');
+      expect(toolOutput).toContain('"file_path": "/src/index.ts"');
+    });
+
+    it('agent-executor tool format works without input', () => {
+      const toolOutput = formatToolUseBlock('Bash');
+      expect(toolOutput).toContain('🔧 Tool: Bash');
+      expect(toolOutput).not.toContain('Input:');
+    });
+
+    it('marker includes colon and space to avoid false positives', () => {
+      // Ensure the marker is specific enough to avoid matching other emoji patterns
+      expect(TOOL_MARKER).toBe('🔧 Tool:');
+      expect(TOOL_MARKER).toContain(':');
+    });
+  });
+
+  describe('output validation logic', () => {
+    it('verified: tool usage + sufficient output', () => {
+      const output =
+        'Starting implementation of the new feature...\n' +
+        formatToolUseBlock('Read', { file_path: '/src/index.ts' }) +
+        'I can see the existing code. Let me make the needed changes.\n' +
+        formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) +
+        'Changes complete. The implementation adds new validation logic and tests.';
+      expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);
+
+      expect(validateAgentOutput(output, false)).toBe('verified');
+    });
+
+    it('waiting_approval: no tool markers regardless of length', () => {
+      const longOutput = 'I analyzed the codebase. '.repeat(50);
+      expect(longOutput.trim().length).toBeGreaterThan(MIN_OUTPUT_LENGTH);
+
+      expect(validateAgentOutput(longOutput, false)).toBe('waiting_approval');
+    });
+
+    it('waiting_approval: tool markers but insufficient length', () => {
+      const shortOutput = formatToolUseBlock('Read', { file_path: '/src/a.ts' });
+      expect(shortOutput.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH);
+
+      expect(validateAgentOutput(shortOutput, false)).toBe('waiting_approval');
+    });
+
+    it('waiting_approval: empty output', () => {
+      expect(validateAgentOutput('', false)).toBe('waiting_approval');
+    });
+
+    it('waiting_approval: skipTests always overrides', () => {
+      const goodOutput =
+        'Starting...\n' +
+        formatToolUseBlock('Read', { file_path: '/src/index.ts' }) +
+        formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) +
+        'Done implementing. '.repeat(15);
+      expect(goodOutput.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);
+
+      expect(validateAgentOutput(goodOutput, true)).toBe('waiting_approval');
+    });
+
+    it('boundary: exactly MIN_OUTPUT_LENGTH chars with tool is verified', () => {
+      const tool = formatToolUseBlock('Read');
+      const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - tool.trim().length);
+      const output = tool + padding;
+      expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);
+
+      expect(validateAgentOutput(output, false)).toBe('verified');
+    });
+
+    it('boundary: MIN_OUTPUT_LENGTH - 1 chars with tool is waiting_approval', () => {
+      const marker = `${TOOL_MARKER} Read\n`;
+      const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - 1 - marker.length);
+      const output = marker + padding;
+      expect(output.trim().length).toBe(MIN_OUTPUT_LENGTH - 1);
+
+      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
+    });
+  });
+
+  describe('realistic provider scenarios', () => {
+    it('Claude SDK agent with multiple tools → verified', () => {
+      let output = "I'll implement the feature.\n\n";
+      output += formatToolUseBlock('Read', { file_path: '/src/components/App.tsx' });
+      output += 'I see the component. Let me update it.\n\n';
+      output += formatToolUseBlock('Edit', {
+        file_path: '/src/components/App.tsx',
+        old_string: 'const App = () => {',
+        new_string: 'const App: React.FC = () => {',
+      });
+      output += 'Done. The component is now typed correctly.\n';
+
+      expect(validateAgentOutput(output, false)).toBe('verified');
+    });
+
+    it('Cursor CLI quick exit (no tools) → waiting_approval', () => {
+      const output = 'Task received. Processing...\nResult: completed successfully.';
+      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
+    });
+
+    it('Codex CLI with brief acknowledgment → waiting_approval', () => {
+      const output = 'Understood the task. Starting implementation.\nDone.';
+      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
+    });
+
+    it('Agent that only reads but makes no edits (single Read tool, short output) → waiting_approval', () => {
+      const output = formatToolUseBlock('Read', { file_path: '/src/index.ts' }) + 'File read.';
+      expect(output.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH);
+      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
+    });
+
+    it('Agent with extensive tool usage and explanation → verified', () => {
+      let output = 'Analyzing the codebase for the authentication feature.\n\n';
+      for (let i = 0; i < 5; i++) {
+        output += formatToolUseBlock('Read', { file_path: `/src/auth/handler${i}.ts` });
+        output += `Found handler ${i}. `;
+      }
+      output += formatToolUseBlock('Edit', {
+        file_path: '/src/auth/handler0.ts',
+        old_string: 'function login() {}',
+        new_string: 'async function login(creds: Credentials) { ... }',
+      });
+      output += 'Implementation complete with all authentication changes applied.\n';
+
+      expect(validateAgentOutput(output, false)).toBe('verified');
+    });
+  });
+});
--- a/apps/server/tests/unit/services/execution-service.test.ts
+++ b/apps/server/tests/unit/services/execution-service.test.ts
@@ -211,7 +211,14 @@ describe('execution-service.ts', () => {
    });

    // Default mocks for secureFs
-    vi.mocked(secureFs.readFile).mockResolvedValue('Agent output content');
+    // Include tool usage markers to simulate meaningful agent output.
+    // The execution service checks for '🔧 Tool:' markers and minimum
+    // output length to determine if the agent did real work.
+    vi.mocked(secureFs.readFile).mockResolvedValue(
+      'Starting implementation...\n\n🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\n\n' +
+        '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts", "old_string": "foo", "new_string": "bar"}\n\n' +
+        'Implementation complete. Updated the code as requested.'
+    );
    vi.mocked(secureFs.access).mockResolvedValue(undefined);

    // Re-setup platform mocks
@@ -1433,4 +1440,439 @@ describe('execution-service.ts', () => {
      );
    });
  });
+
+  describe('executeFeature - agent output validation', () => {
+    // Helper to generate realistic agent output with tool markers
+    const makeAgentOutput = (toolCount: number, extraText = ''): string => {
+      let output = 'Starting implementation...\n\n';
+      for (let i = 0; i < toolCount; i++) {
+        output += `🔧 Tool: Edit\nInput: {"file_path": "/src/file${i}.ts", "old_string": "old${i}", "new_string": "new${i}"}\n\n`;
+      }
+      output += `Implementation complete. ${extraText}`;
+      return output;
+    };
+
+    const createServiceWithMocks = () => {
+      return new ExecutionService(
+        mockEventBus,
+        mockConcurrencyManager,
+        mockWorktreeResolver,
+        mockSettingsService,
+        mockRunAgentFn,
+        mockExecutePipelineFn,
+        mockUpdateFeatureStatusFn,
+        mockLoadFeatureFn,
+        mockGetPlanningPromptPrefixFn,
+        mockSaveFeatureSummaryFn,
+        mockRecordLearningsFn,
+        mockContextExistsFn,
+        mockResumeFeatureFn,
+        mockTrackFailureFn,
+        mockSignalPauseFn,
+        mockRecordSuccessFn,
+        mockSaveExecutionStateFn,
+        mockLoadContextFilesFn
+      );
+    };
+
+    it('sets verified when agent output has tool usage and sufficient length', async () => {
+      const output = makeAgentOutput(3, 'Updated authentication module with new login flow.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      await service.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('sets waiting_approval when agent output is empty', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output has no tool usage markers', async () => {
+      // Long output but no tool markers - agent printed text but didn't use tools
+      const longOutputNoTools = 'I analyzed the codebase and found several issues. '.repeat(20);
+      vi.mocked(secureFs.readFile).mockResolvedValue(longOutputNoTools);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output has tool markers but is too short', async () => {
+      // Has a tool marker but total output is under 200 chars
+      const shortWithTool = '🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\nDone.';
+      expect(shortWithTool.trim().length).toBeLessThan(200);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(shortWithTool);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output file is missing (ENOENT)', async () => {
+      vi.mocked(secureFs.readFile).mockRejectedValue(new Error('ENOENT'));
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output is only whitespace', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('   \n\n\t  \n  ');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets verified when output is exactly at the 200 char threshold with tool usage', async () => {
+      // Create output that's exactly 200 chars trimmed with tool markers
+      const toolMarker = '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts"}\n';
+      const padding = 'x'.repeat(200 - toolMarker.length);
+      const output = toolMarker + padding;
+      expect(output.trim().length).toBeGreaterThanOrEqual(200);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('sets waiting_approval when output is 199 chars with tool usage (below threshold)', async () => {
+      const toolMarker = '🔧 Tool: Read\n';
+      const padding = 'x'.repeat(199 - toolMarker.length);
+      const output = toolMarker + padding;
+      expect(output.trim().length).toBe(199);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('skipTests always takes priority over output validation', async () => {
+      // Meaningful output with tool usage - would normally be 'verified'
+      const output = makeAgentOutput(5, 'All changes applied successfully.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
+      const svc = createServiceWithMocks();
+
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // skipTests=true always means waiting_approval regardless of output quality
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('skipTests with empty output still results in waiting_approval', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
+      const svc = createServiceWithMocks();
+
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('still records success even when output validation fails', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // recordSuccess should still be called - the agent ran without errors
+      expect(mockRecordSuccessFn).toHaveBeenCalled();
+    });
+
+    it('still extracts summary when output has content but no tool markers', async () => {
+      const outputNoTools = 'A '.repeat(150); // > 200 chars but no tool markers
+      vi.mocked(secureFs.readFile).mockResolvedValue(outputNoTools);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Summary extraction still runs even though status is waiting_approval
+      expect(extractSummary).toHaveBeenCalledWith(outputNoTools);
+      expect(mockSaveFeatureSummaryFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'Test summary'
+      );
+    });
+
+    it('emits feature_complete with passes=true even when output validation routes to waiting_approval', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1', false, true);
+
+      // The agent ran without error - it's still a "pass" from the execution perspective
+      expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
+        'auto_mode_feature_complete',
+        expect.objectContaining({ passes: true })
+      );
+    });
+
+    it('handles realistic Cursor CLI output that exits quickly', async () => {
+      // Simulates a Cursor CLI that prints a brief message and exits
+      const cursorQuickExit = 'Task received. Processing...\nResult: completed successfully.';
+      expect(cursorQuickExit.includes('🔧 Tool:')).toBe(false);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(cursorQuickExit);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // No tool usage = waiting_approval
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('handles realistic Claude SDK output with multiple tool uses', async () => {
+      // Simulates a Claude SDK agent that does real work
+      const claudeOutput =
+        "I'll implement the requested feature.\n\n" +
+        '🔧 Tool: Read\nInput: {"file_path": "/src/components/App.tsx"}\n\n' +
+        'I can see the existing component structure. Let me modify it.\n\n' +
+        '🔧 Tool: Edit\nInput: {"file_path": "/src/components/App.tsx", "old_string": "const App = () => {", "new_string": "const App: React.FC = () => {"}\n\n' +
+        '🔧 Tool: Write\nInput: {"file_path": "/src/components/NewFeature.tsx"}\n\n' +
+        "I've created the new component and updated the existing one. The feature is now implemented with proper TypeScript types.";
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(claudeOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Real work = verified
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('reads agent output from the correct path with utf-8 encoding', async () => {
+      const output = makeAgentOutput(2, 'Done with changes.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Verify readFile was called with the correct path derived from getFeatureDir
+      expect(secureFs.readFile).toHaveBeenCalledWith(
+        '/test/project/.automaker/features/feature-1/agent-output.md',
+        'utf-8'
+      );
+    });
+
+    it('completion message includes auto-verified when status is verified', async () => {
+      const output = makeAgentOutput(3, 'All changes applied.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1', false, true);
+
+      expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
+        'auto_mode_feature_complete',
+        expect.objectContaining({
+          message: expect.stringContaining('auto-verified'),
+        })
+      );
+    });
+
+    it('completion message does NOT include auto-verified when status is waiting_approval', async () => {
+      // Empty output → waiting_approval
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1', false, true);
+
+      const completeCall = vi
+        .mocked(mockEventBus.emitAutoModeEvent)
+        .mock.calls.find((call) => call[0] === 'auto_mode_feature_complete');
+      expect(completeCall).toBeDefined();
+      expect((completeCall![1] as { message: string }).message).not.toContain('auto-verified');
+    });
+
+    it('uses same agentOutput for both status determination and summary extraction', async () => {
+      // Specific output that is long enough with tool markers (verified path)
+      // AND has content for summary extraction
+      const specificOutput =
+        '🔧 Tool: Read\nReading file...\n🔧 Tool: Edit\nEditing file...\n' +
+        'The implementation is complete. Here is a detailed description of what was done. '.repeat(
+          3
+        );
+      vi.mocked(secureFs.readFile).mockResolvedValue(specificOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Status should be verified (has tools + long enough)
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+      // extractSummary should receive the exact same output
+      expect(extractSummary).toHaveBeenCalledWith(specificOutput);
+      // recordLearnings should also receive the same output
+      expect(mockRecordLearningsFn).toHaveBeenCalledWith(
+        '/test/project',
+        testFeature,
+        specificOutput
+      );
+    });
+
+    it('does not call recordMemoryUsage when output is empty and memoryFiles is empty', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+      const { recordMemoryUsage } = await import('@automaker/utils');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // With empty output and empty memoryFiles, recordMemoryUsage should not be called
+      expect(recordMemoryUsage).not.toHaveBeenCalled();
+    });
+
+    it('handles output with special unicode characters correctly', async () => {
+      // Output with various unicode but includes tool markers
+      const unicodeOutput =
+        '🔧 Tool: Read\n' +
+        '🔧 Tool: Edit\n' +
+        'Añadiendo función de búsqueda con caracteres especiales: ñ, ü, ö, é, 日本語テスト. ' +
+        'Die Änderungen wurden erfolgreich implementiert. '.repeat(3);
+      vi.mocked(secureFs.readFile).mockResolvedValue(unicodeOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Should still detect tool markers and sufficient length
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('treats output with only newlines and spaces around tool marker as insufficient', async () => {
+      // Has tool marker but surrounded by whitespace, total trimmed < 200
+      const sparseOutput = '\n\n  🔧 Tool: Read  \n\n';
+      expect(sparseOutput.trim().length).toBeLessThan(200);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(sparseOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('detects tool marker substring correctly (partial match like "🔧 Tools:" does not count)', async () => {
+      // Output with a similar but not exact marker - "🔧 Tools:" instead of "🔧 Tool:"
+      const wrongMarker = '🔧 Tools: Read\n🔧 Tools: Edit\n' + 'Implementation done. '.repeat(20);
+      expect(wrongMarker.includes('🔧 Tool:')).toBe(false);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(wrongMarker);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // "🔧 Tools:" is not the same as "🔧 Tool:" - should be waiting_approval
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('pipeline merge_conflict status short-circuits before output validation', async () => {
+      // Set up pipeline that results in merge_conflict
+      vi.mocked(pipelineService.getPipelineConfig).mockResolvedValue({
+        version: 1,
+        steps: [{ id: 'step-1', name: 'Step 1', order: 1, instructions: 'Do step 1' }] as any,
+      });
+
+      // After pipeline, loadFeature returns merge_conflict status
+      let loadCallCount = 0;
+      mockLoadFeatureFn = vi.fn().mockImplementation(() => {
+        loadCallCount++;
+        if (loadCallCount === 1) return testFeature; // initial load
+        // All subsequent loads (task check + pipeline refresh) return merge_conflict
+        return { ...testFeature, status: 'merge_conflict' };
+      });
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Should NOT have called updateFeatureStatusFn with 'verified' or 'waiting_approval'
+      // because pipeline merge_conflict short-circuits the method
+      const statusCalls = vi
+        .mocked(mockUpdateFeatureStatusFn)
+        .mock.calls.filter((call) => call[2] === 'verified' || call[2] === 'waiting_approval');
+      // The only non-in_progress status call should be absent since merge_conflict returns early
+      expect(statusCalls.length).toBe(0);
+    });
+  });
 });
--- a/apps/ui/playwright.config.ts
+++ b/apps/ui/playwright.config.ts
@@ -1,7 +1,7 @@
 import { defineConfig, devices } from '@playwright/test';

-const port = process.env.TEST_PORT || 3007;
-const serverPort = process.env.TEST_SERVER_PORT || 3008;
+const port = process.env.TEST_PORT || 3107;
+const serverPort = process.env.TEST_SERVER_PORT || 3108;
 const reuseServer = process.env.TEST_REUSE_SERVER === 'true';
 const useExternalBackend = !!process.env.VITE_SERVER_URL;
 // Always use mock agent for tests (disables rate limiting, uses mock Claude responses)
@@ -70,6 +70,10 @@ export default defineConfig({
            timeout: 120000,
            env: {
              ...process.env,
+              // Must set AUTOMAKER_WEB_PORT to match the port Playwright waits for
+              AUTOMAKER_WEB_PORT: String(port),
+              // Must set AUTOMAKER_SERVER_PORT so Vite proxy forwards to the correct backend port
+              AUTOMAKER_SERVER_PORT: String(serverPort),
              VITE_SKIP_SETUP: 'true',
              // Always skip electron plugin during tests - prevents duplicate server spawning
              VITE_SKIP_ELECTRON: 'true',
--- a/apps/ui/scripts/kill-test-servers.mjs
+++ b/apps/ui/scripts/kill-test-servers.mjs
@@ -8,10 +8,11 @@ import { promisify } from 'util';

 const execAsync = promisify(exec);

-const SERVER_PORT = process.env.TEST_SERVER_PORT || 3008;
-const UI_PORT = process.env.TEST_PORT || 3007;
+const SERVER_PORT = process.env.TEST_SERVER_PORT || 3108;
+const UI_PORT = process.env.TEST_PORT || 3107;
 const USE_EXTERNAL_SERVER = !!process.env.VITE_SERVER_URL;
-
+console.log(`[KillTestServers] SERVER_PORT ${SERVER_PORT}`);
+console.log(`[KillTestServers] UI_PORT ${UI_PORT}`);
 async function killProcessOnPort(port) {
  try {
    const hasLsof = await execAsync('command -v lsof').then(
--- a/apps/ui/src/components/dialogs/pr-comment-resolution-dialog.tsx
+++ b/apps/ui/src/components/dialogs/pr-comment-resolution-dialog.tsx
@@ -45,7 +45,7 @@ import { toast } from 'sonner';
 import type { PRReviewComment } from '@/lib/electron';
 import type { Feature } from '@/store/app-store';
 import type { PhaseModelEntry } from '@automaker/types';
-import { supportsReasoningEffort, isAdaptiveThinkingModel } from '@automaker/types';
+import { supportsReasoningEffort, normalizeThinkingLevelForModel } from '@automaker/types';
 import { resolveModelString } from '@automaker/model-resolver';
 import { PhaseModelSelector } from '@/components/views/settings-view/model-defaults';

@@ -590,20 +590,10 @@ export function PRCommentResolutionDialog({
  const wasOpenRef = useRef(false);

  const handleModelChange = useCallback((entry: PhaseModelEntry) => {
-    // Normalize thinking level when switching between adaptive and non-adaptive models
-    const isNewModelAdaptive =
-      typeof entry.model === 'string' && isAdaptiveThinkingModel(entry.model);
-    const currentLevel = entry.thinkingLevel || 'none';
+    const modelId = typeof entry.model === 'string' ? entry.model : '';
+    const normalizedThinkingLevel = normalizeThinkingLevelForModel(modelId, entry.thinkingLevel);

-    if (isNewModelAdaptive && currentLevel !== 'none' && currentLevel !== 'adaptive') {
-      // Switching TO an adaptive model with a manual level -> auto-switch to 'adaptive'
-      setModelEntry({ ...entry, thinkingLevel: 'adaptive' });
-    } else if (!isNewModelAdaptive && currentLevel === 'adaptive') {
-      // Switching FROM an adaptive model with adaptive -> auto-switch to 'high'
-      setModelEntry({ ...entry, thinkingLevel: 'high' });
-    } else {
-      setModelEntry(entry);
-    }
+    setModelEntry({ ...entry, thinkingLevel: normalizedThinkingLevel });
  }, []);

  // Fetch PR review comments
--- a/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx
+++ b/apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx
@@ -26,11 +26,12 @@ import { useNavigate } from '@tanstack/react-router';
 import { toast } from 'sonner';
 import { cn } from '@/lib/utils';
 import { modelSupportsThinking } from '@/lib/utils';
-import { useAppStore, ThinkingLevel, FeatureImage, PlanningMode, Feature } from '@/store/app-store';
+import { useAppStore } from '@/store/app-store';
+import type { ThinkingLevel, PlanningMode, Feature, FeatureImage } from '@/store/types';
 import type { ReasoningEffort, PhaseModelEntry, AgentModel } from '@automaker/types';
 import {
  supportsReasoningEffort,
-  isAdaptiveThinkingModel,
+  normalizeThinkingLevelForModel,
  getThinkingLevelsForModel,
 } from '@automaker/types';
 import {
@@ -308,20 +309,10 @@ export function AddFeatureDialog({
  }, [planningMode]);

  const handleModelChange = (entry: PhaseModelEntry) => {
-    // Normalize thinking level when switching between adaptive and non-adaptive models
-    const isNewModelAdaptive =
-      typeof entry.model === 'string' && isAdaptiveThinkingModel(entry.model);
-    const currentLevel = entry.thinkingLevel || 'none';
+    const modelId = typeof entry.model === 'string' ? entry.model : '';
+    const normalizedThinkingLevel = normalizeThinkingLevelForModel(modelId, entry.thinkingLevel);

-    if (isNewModelAdaptive && currentLevel !== 'none' && currentLevel !== 'adaptive') {
-      // Switching TO Opus 4.6 with a manual level -> auto-switch to 'adaptive'
-      setModelEntry({ ...entry, thinkingLevel: 'adaptive' });
-    } else if (!isNewModelAdaptive && currentLevel === 'adaptive') {
-      // Switching FROM Opus 4.6 with adaptive -> auto-switch to 'high'
-      setModelEntry({ ...entry, thinkingLevel: 'high' });
-    } else {
-      setModelEntry(entry);
-    }
+    setModelEntry({ ...entry, thinkingLevel: normalizedThinkingLevel });
  };

  const buildFeatureData = (): FeatureData | null => {
--- a/apps/ui/src/components/views/settings-view/model-defaults/phase-model-selector.tsx
+++ b/apps/ui/src/components/views/settings-view/model-defaults/phase-model-selector.tsx
@@ -1017,6 +1017,7 @@ export function PhaseModelSelector({
            {/* Secondary zone: expand reasoning effort popover */}
            <Popover
              open={isExpanded}
+              modal={false}
              onOpenChange={(isOpen) => {
                if (!isOpen) {
                  setExpandedCodexModel(null);
@@ -1409,7 +1410,9 @@ export function PhaseModelSelector({
                return (
                  <button
                    key={level}
-                    onClick={() => {
+                    onClick={(e) => {
+                      e.stopPropagation();
+                      e.preventDefault();
                      onChange({
                        providerId: provider.id,
                        model: model.id,
@@ -1497,6 +1500,7 @@ export function PhaseModelSelector({
            {/* Secondary zone: expand thinking level popover */}
            <Popover
              open={isExpanded}
+              modal={false}
              onOpenChange={(isOpen) => {
                if (!isOpen) {
                  setExpandedProviderModel(null);
@@ -1549,7 +1553,9 @@ export function PhaseModelSelector({
                    return (
                      <button
                        key={level}
-                        onClick={() => {
+                        onClick={(e) => {
+                          e.stopPropagation();
+                          e.preventDefault();
                          onChange({
                            providerId: provider.id,
                            model: model.id,
@@ -1752,7 +1758,9 @@ export function PhaseModelSelector({
                return (
                  <button
                    key={level}
-                    onClick={() => {
+                    onClick={(e) => {
+                      e.stopPropagation();
+                      e.preventDefault();
                      onChange({
                        model: model.id as ModelAlias,
                        thinkingLevel: level,
@@ -1856,6 +1864,7 @@ export function PhaseModelSelector({
            {/* Secondary zone: expand thinking level popover */}
            <Popover
              open={isExpanded}
+              modal={false}
              onOpenChange={(isOpen) => {
                if (!isOpen) {
                  setExpandedClaudeModel(null);
@@ -1906,7 +1915,9 @@ export function PhaseModelSelector({
                    return (
                      <button
                        key={level}
-                        onClick={() => {
+                        onClick={(e) => {
+                          e.stopPropagation();
+                          e.preventDefault();
                          onChange({
                            model: model.id as ModelAlias,
                            thinkingLevel: level,
@@ -2054,6 +2065,7 @@ export function PhaseModelSelector({
      >
        <Popover
          open={isExpanded}
+          modal={false}
          onOpenChange={(isOpen) => {
            if (!isOpen) {
              setExpandedGroup(null);
--- a/apps/ui/src/electron/constants.ts
+++ b/apps/ui/src/electron/constants.ts
@@ -20,11 +20,11 @@ export const DEFAULT_HEIGHT = 950;
 // ============================================
 // Default ports (can be overridden via env) - will be dynamically assigned if these are in use
 // When launched via root init.mjs we pass:
-// - PORT (backend)
-// - TEST_PORT (vite dev server / static)
+// - SERVER_PORT (backend API server)
+// - PORT (Vite dev server / static file server)
 // Guard against NaN from non-numeric environment variables
-const parsedServerPort = Number.parseInt(process.env.PORT ?? '', 10);
-const parsedStaticPort = Number.parseInt(process.env.TEST_PORT ?? '', 10);
+const parsedServerPort = Number.parseInt(process.env.SERVER_PORT ?? '', 10);
+const parsedStaticPort = Number.parseInt(process.env.PORT ?? '', 10);
 export const DEFAULT_SERVER_PORT = Number.isFinite(parsedServerPort) ? parsedServerPort : 3008;
 export const DEFAULT_STATIC_PORT = Number.isFinite(parsedStaticPort) ? parsedStaticPort : 3007;

--- a/apps/ui/src/styles/themes/paper.css
+++ b/apps/ui/src/styles/themes/paper.css
@@ -33,11 +33,11 @@
  --input: oklch(0.98 0 0);
  --ring: oklch(0.3 0 0);

-  --chart-1: oklch(0.3 0 0);
-  --chart-2: oklch(0.5 0 0);
-  --chart-3: oklch(0.4 0 0);
-  --chart-4: oklch(0.6 0 0);
-  --chart-5: oklch(0.35 0 0);
+  --chart-1: oklch(0.5 0.14 25); /* Warm red - strings, regex */
+  --chart-2: oklch(0.5 0.13 250); /* Blue - properties, variables */
+  --chart-3: oklch(0.55 0.13 145); /* Green - numbers */
+  --chart-4: oklch(0.45 0.14 300); /* Purple - keywords, booleans, tags */
+  --chart-5: oklch(0.5 0.12 180); /* Teal - types, classes */

  --sidebar: oklch(0.98 0 0);
  --sidebar-foreground: oklch(0.15 0 0);
--- a/apps/ui/tests/e2e-testing-guide.md
+++ b/apps/ui/tests/e2e-testing-guide.md
@@ -297,10 +297,10 @@ await expect(page.locator('[data-testid="my-element"]')).toBeVisible();

 ### Port conflicts

-If you see "Port 3008 is already in use", kill the process:
+If you see "Port 3108 is already in use", kill the process:

 ```bash
-lsof -ti:3008 | xargs kill -9
+lsof -ti:3108 | xargs kill -9
 ```

 ## Available Test Utilities
--- a/apps/ui/tests/features/feature-manual-review-flow.spec.ts
+++ b/apps/ui/tests/features/feature-manual-review-flow.spec.ts
@@ -21,6 +21,7 @@ import {
  getKanbanColumn,
  authenticateForTests,
  handleLoginScreenIfPresent,
+  API_BASE_URL,
 } from '../utils';

 const TEST_TEMP_DIR = createTempDirPath('manual-review-test');
@@ -155,7 +156,6 @@ test.describe('Feature Manual Review Flow', () => {
      priority: 2,
    };

-    const API_BASE_URL = process.env.SERVER_URL || 'http://localhost:3008';
    const createResponse = await page.request.post(`${API_BASE_URL}/api/features/create`, {
      data: { projectPath, feature },
      headers: { 'Content-Type': 'application/json' },
--- a/apps/ui/tests/features/opus-thinking-level-none.spec.ts
+++ b/apps/ui/tests/features/opus-thinking-level-none.spec.ts
@@ -0,0 +1,110 @@
+import { test, expect } from '@playwright/test';
+import * as fs from 'fs';
+import * as path from 'path';
+import {
+  createTempDirPath,
+  cleanupTempDir,
+  setupRealProject,
+  waitForNetworkIdle,
+  clickAddFeature,
+  fillAddFeatureDialog,
+  confirmAddFeature,
+  authenticateForTests,
+  handleLoginScreenIfPresent,
+} from '../utils';
+
+const TEST_TEMP_DIR = createTempDirPath('opus-thinking-level-none');
+
+test.describe('Opus thinking level', () => {
+  let projectPath: string;
+  const projectName = `test-project-${Date.now()}`;
+
+  test.beforeAll(async () => {
+    if (!fs.existsSync(TEST_TEMP_DIR)) {
+      fs.mkdirSync(TEST_TEMP_DIR, { recursive: true });
+    }
+
+    projectPath = path.join(TEST_TEMP_DIR, projectName);
+    fs.mkdirSync(projectPath, { recursive: true });
+
+    fs.writeFileSync(
+      path.join(projectPath, 'package.json'),
+      JSON.stringify({ name: projectName, version: '1.0.0' }, null, 2)
+    );
+
+    const automakerDir = path.join(projectPath, '.automaker');
+    fs.mkdirSync(automakerDir, { recursive: true });
+    fs.mkdirSync(path.join(automakerDir, 'features'), { recursive: true });
+    fs.mkdirSync(path.join(automakerDir, 'context'), { recursive: true });
+
+    fs.writeFileSync(
+      path.join(automakerDir, 'categories.json'),
+      JSON.stringify({ categories: [] }, null, 2)
+    );
+
+    fs.writeFileSync(
+      path.join(automakerDir, 'app_spec.txt'),
+      `# ${projectName}\n\nA test project for Opus thinking level e2e coverage.`
+    );
+  });
+
+  test.afterAll(async () => {
+    cleanupTempDir(TEST_TEMP_DIR);
+  });
+
+  test('persists thinkingLevel none when selected for Claude Opus', async ({ page }) => {
+    const featureDescription = `Opus none thinking ${Date.now()}`;
+
+    await setupRealProject(page, projectPath, projectName, { setAsCurrent: true });
+    await authenticateForTests(page);
+    await page.goto('/board');
+    await page.waitForLoadState('load');
+    await handleLoginScreenIfPresent(page);
+    await waitForNetworkIdle(page);
+
+    await clickAddFeature(page);
+    await fillAddFeatureDialog(page, featureDescription);
+
+    await page.locator('[data-testid="model-selector"]').click();
+    await page.locator('[cmdk-input]').fill('opus');
+
+    const opusItem = page.locator('[cmdk-item]').filter({ hasText: 'Claude Opus' }).first();
+    await expect(opusItem).toBeVisible({ timeout: 10000 });
+    await opusItem.locator('button[title="Adjust thinking level"]').click();
+
+    // Wait for the thinking level popover to appear
+    // The nested popover contains "Thinking Level" text and "None" option
+    // Radix UI popovers need a brief delay for the animation to complete
+    const POPOVER_OPEN_DELAY_MS = 500;
+    await page.waitForTimeout(POPOVER_OPEN_DELAY_MS);
+
+    // Find and click the None button
+    // The button's accessible name includes description: "None No extended thinking"
+    const noneOption = page.getByRole('button', { name: /None.*No extended thinking/i }).first();
+    await expect(noneOption).toBeVisible({ timeout: 5000 });
+    await noneOption.click();
+
+    // Wait for the popover to close and the state to update
+    // The React state update needs a brief moment to propagate to the badge
+    const STATE_UPDATE_DELAY_MS = 300;
+    await page.waitForTimeout(STATE_UPDATE_DELAY_MS);
+
+    // When "None" is selected, the badge should NOT show "Adaptive"
+    await expect(page.locator('[data-testid="model-selector"]')).not.toContainText('Adaptive');
+
+    await confirmAddFeature(page);
+
+    const featuresDir = path.join(projectPath, '.automaker', 'features');
+    await expect.poll(() => fs.readdirSync(featuresDir).length).toBe(1);
+
+    const featureDir = fs.readdirSync(featuresDir)[0];
+    const featureJsonPath = path.join(featuresDir, featureDir, 'feature.json');
+    const featureJson = JSON.parse(fs.readFileSync(featureJsonPath, 'utf-8')) as {
+      description: string;
+      thinkingLevel: string;
+    };
+
+    expect(featureJson.description).toBe(featureDescription);
+    expect(featureJson.thinkingLevel).toBe('none');
+  });
+});
--- a/apps/ui/tests/features/planning-mode-fix-verification.spec.ts
+++ b/apps/ui/tests/features/planning-mode-fix-verification.spec.ts
@@ -18,6 +18,7 @@ import {
  authenticateForTests,
  handleLoginScreenIfPresent,
 } from '../utils';
+import { DEFAULT_ELEMENT_TIMEOUT_MS } from '../utils/core/waiting';

 const TEST_TEMP_DIR = createTempDirPath('planning-mode-verification-test');

@@ -69,7 +70,9 @@ test.describe('Planning Mode Fix Verification (GitHub #671)', () => {
    await handleLoginScreenIfPresent(page);
    await waitForNetworkIdle(page);

-    await expect(page.locator('[data-testid="board-view"]')).toBeVisible({ timeout: 10000 });
+    await expect(page.locator('[data-testid="board-view"]')).toBeVisible({
+      timeout: DEFAULT_ELEMENT_TIMEOUT_MS,
+    });
    await expect(page.locator('[data-testid="kanban-column-backlog"]')).toBeVisible({
      timeout: 5000,
    });
@@ -77,9 +80,9 @@ test.describe('Planning Mode Fix Verification (GitHub #671)', () => {
    // Open the add feature dialog
    await clickAddFeature(page);

-    // Wait for dialog to be visible
+    // Wait for dialog to be visible (clickAddFeature already waits, but this adds an extra check)
    await expect(page.locator('[data-testid="add-feature-dialog"]')).toBeVisible({
-      timeout: 5000,
+      timeout: DEFAULT_ELEMENT_TIMEOUT_MS,
    });

    // Find the planning mode select trigger
--- a/apps/ui/tests/features/running-task-card-display.spec.ts
+++ b/apps/ui/tests/features/running-task-card-display.spec.ts
@@ -22,6 +22,7 @@ import {
  getKanbanColumn,
  authenticateForTests,
  handleLoginScreenIfPresent,
+  API_BASE_URL,
 } from '../utils';

 const TEST_TEMP_DIR = createTempDirPath('running-task-display-test');
@@ -142,8 +143,6 @@ test.describe('Running Task Card Display', () => {
      priority: 2,
    };

-    const API_BASE_URL = process.env.SERVER_URL || 'http://localhost:3008';
-
    // Create both features via HTTP API
    const createInProgress = await page.request.post(`${API_BASE_URL}/api/features/create`, {
      data: { projectPath, feature: inProgressFeature },
--- a/apps/ui/tests/utils/api/client.ts
+++ b/apps/ui/tests/utils/api/client.ts
@@ -4,7 +4,7 @@
 */

 import { Page, APIResponse } from '@playwright/test';
-import { API_BASE_URL, API_ENDPOINTS } from '../core/constants';
+import { API_BASE_URL, API_ENDPOINTS, WEB_BASE_URL } from '../core/constants';

 // ============================================================================
 // Types
@@ -300,7 +300,7 @@ export async function authenticateWithApiKey(page: Page, apiKey: string): Promis
    // Ensure we're on a page (needed for cookies to work)
    const currentUrl = page.url();
    if (!currentUrl || currentUrl === 'about:blank') {
-      await page.goto('http://localhost:3007', { waitUntil: 'domcontentloaded' });
+      await page.goto(WEB_BASE_URL, { waitUntil: 'domcontentloaded' });
    }

    // Use Playwright request API (tied to this browser context) to avoid flakiness
--- a/apps/ui/tests/utils/core/constants.ts
+++ b/apps/ui/tests/utils/core/constants.ts
@@ -9,8 +9,19 @@

 /**
 * Base URL for the API server
+ * Uses TEST_SERVER_PORT env var (default 3108) for test runs
 */
-export const API_BASE_URL = 'http://localhost:3008';
+export const API_BASE_URL = process.env.TEST_SERVER_PORT
+  ? `http://localhost:${process.env.TEST_SERVER_PORT}`
+  : 'http://localhost:3108';
+
+/**
+ * Base URL for the frontend web server
+ * Uses TEST_PORT env var (default 3107) for test runs
+ */
+export const WEB_BASE_URL = process.env.TEST_PORT
+  ? `http://localhost:${process.env.TEST_PORT}`
+  : 'http://localhost:3107';

 /**
 * API endpoints for worktree operations
--- a/apps/ui/tests/utils/core/waiting.ts
+++ b/apps/ui/tests/utils/core/waiting.ts
@@ -1,5 +1,12 @@
 import { Page, Locator } from '@playwright/test';

+/**
+ * Default timeout for element waiting operations in E2E tests.
+ * Increased from 5000ms to 10000ms to accommodate CI environments
+ * where dialog rendering may take longer due to React Query data fetching.
+ */
+export const DEFAULT_ELEMENT_TIMEOUT_MS = 10000;
+
 /**
 * Wait for the page to load
 * Uses 'load' state instead of 'networkidle' because the app has persistent
@@ -20,7 +27,7 @@ export async function waitForElement(
 ): Promise<Locator> {
  const element = page.locator(`[data-testid="${testId}"]`);
  await element.waitFor({
-    timeout: options?.timeout ?? 5000,
+    timeout: options?.timeout ?? DEFAULT_ELEMENT_TIMEOUT_MS,
    state: options?.state ?? 'visible',
  });
  return element;
@@ -36,7 +43,7 @@ export async function waitForElementHidden(
 ): Promise<void> {
  const element = page.locator(`[data-testid="${testId}"]`);
  await element.waitFor({
-    timeout: options?.timeout ?? 5000,
+    timeout: options?.timeout ?? DEFAULT_ELEMENT_TIMEOUT_MS,
    state: 'hidden',
  });
 }
--- a/apps/ui/tests/utils/views/board.ts
+++ b/apps/ui/tests/utils/views/board.ts
@@ -1,4 +1,5 @@
 import { Page, Locator } from '@playwright/test';
+import { DEFAULT_ELEMENT_TIMEOUT_MS } from '../core/waiting';

 /**
 * Get a kanban card by feature ID
@@ -96,9 +97,16 @@ export async function getDragHandleForFeature(page: Page, featureId: string): Pr
 * Click the add feature button
 */
 export async function clickAddFeature(page: Page): Promise<void> {
-  await page.click('[data-testid="add-feature-button"]');
+  // There may be multiple add-feature buttons on the page (header, empty state).
+  // Use .first() to click the first visible one.
+  const addButton = page.locator('[data-testid="add-feature-button"]').first();
+  await addButton.waitFor({ state: 'visible', timeout: DEFAULT_ELEMENT_TIMEOUT_MS });
+  await addButton.click({ timeout: 5000 });
+
+  // Wait for dialog to be visible
  await page.waitForSelector('[data-testid="add-feature-dialog"]', {
-    timeout: 5000,
+    state: 'visible',
+    timeout: DEFAULT_ELEMENT_TIMEOUT_MS,
  });
 }

@@ -159,7 +167,7 @@ export async function confirmAddFeature(page: Page): Promise<void> {
  await page.click('[data-testid="confirm-add-feature"]');
  // Wait for dialog to close
  await page.waitForFunction(() => !document.querySelector('[data-testid="add-feature-dialog"]'), {
-    timeout: 5000,
+    timeout: DEFAULT_ELEMENT_TIMEOUT_MS,
  });
 }

--- a/apps/ui/vite.config.mts
+++ b/apps/ui/vite.config.mts
@@ -249,11 +249,11 @@ export default defineConfig(({ command }) => {
    },
    server: {
      host: process.env.HOST || '0.0.0.0',
-      port: parseInt(process.env.TEST_PORT || process.env.AUTOMAKER_WEB_PORT || '3007', 10),
+      port: parseInt(process.env.AUTOMAKER_WEB_PORT || '3007', 10),
      allowedHosts: true,
      proxy: {
        '/api': {
-          target: 'http://localhost:' + (process.env.AUTOMAKER_SERVER_PORT ?? '5008'),
+          target: 'http://localhost:' + (process.env.AUTOMAKER_SERVER_PORT ?? '3008'),
          changeOrigin: true,
          ws: true,
        },
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -109,7 +109,7 @@ services:
    environment:
      - NODE_ENV=development
      - VITE_SERVER_URL=http://localhost:3008
-      - TEST_PORT=3007
+      - TEST_PORT=3107
      - VITE_SKIP_ELECTRON=true
      - VITE_APP_MODE=3
      - HUSKY=0
--- a/libs/types/src/index.ts
+++ b/libs/types/src/index.ts
@@ -202,6 +202,7 @@ export {
  getThinkingTokenBudget,
  isAdaptiveThinkingModel,
  getThinkingLevelsForModel,
+  normalizeThinkingLevelForModel,
  getDefaultThinkingLevel,
  // Event hook constants
  EVENT_HOOK_TRIGGER_LABELS,
--- a/libs/types/src/provider.ts
+++ b/libs/types/src/provider.ts
@@ -260,7 +260,13 @@ export interface ContentBlock {
 */
 export interface ProviderMessage {
  type: 'assistant' | 'user' | 'error' | 'result';
-  subtype?: 'success' | 'error' | 'error_max_turns' | 'error_max_structured_output_retries';
+  subtype?:
+    | 'success'
+    | 'error'
+    | 'error_max_turns'
+    | 'error_max_structured_output_retries'
+    | 'error_during_execution'
+    | 'error_max_budget_usd';
  session_id?: string;
  message?: {
    role: 'user' | 'assistant';
--- a/libs/types/src/settings.ts
+++ b/libs/types/src/settings.ts
@@ -349,6 +349,28 @@ export function getThinkingLevelsForModel(model: string): ThinkingLevel[] {
  return ['none', 'low', 'medium', 'high', 'ultrathink'];
 }

+/**
+ * Normalize a selected thinking level to a value supported by the target model.
+ * Prefers preserving the selected level, falls back to 'none' when available.
+ */
+export function normalizeThinkingLevelForModel(
+  model: string,
+  thinkingLevel: ThinkingLevel | undefined
+): ThinkingLevel {
+  const availableLevels = getThinkingLevelsForModel(model);
+  const currentLevel = thinkingLevel || 'none';
+
+  if (availableLevels.includes(currentLevel)) {
+    return currentLevel;
+  }
+
+  if (availableLevels.includes('none')) {
+    return 'none';
+  }
+
+  return availableLevels[0];
+}
+
 /**
 * Get the default thinking level for a given model.
 * Used when selecting a model via the primary button in the two-stage selector.
--- a/start-automaker.sh
+++ b/start-automaker.sh
@@ -57,6 +57,8 @@ DEFAULT_SERVER_PORT=${AUTOMAKER_SERVER_PORT:-3008}
 PORT_SEARCH_MAX_ATTEMPTS=100
 WEB_PORT=$DEFAULT_WEB_PORT
 SERVER_PORT=$DEFAULT_SERVER_PORT
+TEST_WEB_PORT=${TEST_PORT:-3107}
+TEST_SERVER_PORT=${TEST_SERVER_PORT:-3108}

 # Port validation function
 # Returns 0 if valid, 1 if invalid (with error message printed)
@@ -1184,7 +1186,8 @@ case $MODE in
        if [ -f .env ]; then
            export $(grep -v '^#' .env | xargs)
        fi
-        export TEST_PORT="$WEB_PORT"
+        export TEST_PORT="$TEST_WEB_PORT"
+        export TEST_SERVER_PORT="$TEST_SERVER_PORT"
        export VITE_SERVER_URL="http://${APP_HOST}:$SERVER_PORT"
        export PORT="$SERVER_PORT"
        export DATA_DIR="$SCRIPT_DIR/data"
@@ -1275,7 +1278,8 @@ case $MODE in
        ;;
    electron)
        # Set environment variables for Electron (it starts its own server)
-        export TEST_PORT="$WEB_PORT"
+        export TEST_PORT="$TEST_WEB_PORT"
+        export TEST_SERVER_PORT="$TEST_SERVER_PORT"
        export PORT="$SERVER_PORT"
        export VITE_SERVER_URL="http://localhost:$SERVER_PORT"
        export CORS_ORIGIN="http://localhost:$WEB_PORT,http://127.0.0.1:$WEB_PORT"