Fix agent output validation to prevent false verified status (#807)

* Changes from fix/cursor-fix * feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port * Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ci: Update test server port from 3008 to 3108 and add environment configuration * fix: Correct typo in health endpoint URL and standardize port env vars --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2026-03-17 10:03:08 +00:00 · 2026-02-24 20:18:40 -08:00
parent 0330c70261
commit 51e9a23ba1
36 changed files with 1610 additions and 104 deletions
--- a/apps/server/src/providers/copilot-provider.ts
+++ b/apps/server/src/providers/copilot-provider.ts
@@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider {

      case 'session.error': {
        const errorEvent = sdkEvent as SdkSessionErrorEvent;
+        const enrichedError =
+          errorEvent.data.message ||
+          (errorEvent.data.code
+            ? `Copilot agent error (code: ${errorEvent.data.code})`
+            : 'Copilot agent error');
        return {
          type: 'error',
-          error: errorEvent.data.message || 'Unknown error',
+          error: enrichedError,
        };
      }

--- a/apps/server/src/providers/cursor-provider.ts
+++ b/apps/server/src/providers/cursor-provider.ts
@@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider {
        const resultEvent = cursorEvent as CursorResultEvent;

        if (resultEvent.is_error) {
+          const errorText = resultEvent.error || resultEvent.result || '';
+          const enrichedError =
+            errorText ||
+            `Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`;
          return {
            type: 'error',
            session_id: resultEvent.session_id,
-            error: resultEvent.error || resultEvent.result || 'Unknown error',
+            error: enrichedError,
          };
        }

--- a/apps/server/src/providers/gemini-provider.ts
+++ b/apps/server/src/providers/gemini-provider.ts
@@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider {
        const resultEvent = geminiEvent as GeminiResultEvent;

        if (resultEvent.status === 'error') {
+          const enrichedError =
+            resultEvent.error ||
+            `Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`;
          return {
            type: 'error',
            session_id: resultEvent.session_id,
-            error: resultEvent.error || 'Unknown error',
+            error: enrichedError,
          };
        }

@@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider {

      case 'error': {
        const errorEvent = geminiEvent as GeminiResultEvent;
+        const enrichedError =
+          errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 'none'})`;
        return {
          type: 'error',
          session_id: errorEvent.session_id,
-          error: errorEvent.error || 'Unknown error',
+          error: enrichedError,
        };
      }

--- a/apps/server/src/services/agent-executor.ts
+++ b/apps/server/src/services/agent-executor.ts
@@ -296,8 +296,28 @@ export class AgentExecutor {
            }
          }
        } else if (msg.type === 'error') {
-          throw new Error(AgentExecutor.sanitizeProviderError(msg.error));
-        } else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite();
+          const sanitized = AgentExecutor.sanitizeProviderError(msg.error);
+          logger.error(
+            `[execute] Feature ${featureId} received error from provider. ` +
+              `raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
+          );
+          throw new Error(sanitized);
+        } else if (msg.type === 'result') {
+          if (msg.subtype === 'success') {
+            scheduleWrite();
+          } else if (msg.subtype?.startsWith('error')) {
+            // Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.)
+            logger.error(
+              `[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` +
+                `session_id=${msg.session_id ?? 'none'}`
+            );
+            throw new Error(`Agent execution ended with: ${msg.subtype}`);
+          } else {
+            logger.warn(
+              `[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}`
+            );
+          }
+        }
      }
    } finally {
      clearInterval(streamHeartbeat);
@@ -447,16 +467,28 @@ export class AgentExecutor {
              });
          }
        } else if (msg.type === 'error') {
-          // Clean the error: strip ANSI codes and redundant "Error: " prefix
-          const cleanedError =
-            (msg.error || `Error during task ${task.id}`)
-              .replace(/\x1b\[[0-9;]*m/g, '')
-              .replace(/^Error:\s*/i, '')
-              .trim() || `Error during task ${task.id}`;
-          throw new Error(cleanedError);
-        } else if (msg.type === 'result' && msg.subtype === 'success') {
-          taskOutput += msg.result || '';
-          responseText += msg.result || '';
+          const fallback = `Error during task ${task.id}`;
+          const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback);
+          logger.error(
+            `[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` +
+              `raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
+          );
+          throw new Error(sanitized);
+        } else if (msg.type === 'result') {
+          if (msg.subtype === 'success') {
+            taskOutput += msg.result || '';
+            responseText += msg.result || '';
+          } else if (msg.subtype?.startsWith('error')) {
+            logger.error(
+              `[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` +
+                `session_id=${msg.session_id ?? 'none'}`
+            );
+            throw new Error(`Agent execution ended with: ${msg.subtype}`);
+          } else {
+            logger.warn(
+              `[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}`
+            );
+          }
        }
      }
      if (!taskCompleteDetected)
--- a/apps/server/src/services/execution-service.ts
+++ b/apps/server/src/services/execution-service.ts
@@ -60,6 +60,12 @@ import type {

 const logger = createLogger('ExecutionService');

+/** Marker written by agent-executor for each tool invocation. */
+const TOOL_USE_MARKER = '🔧 Tool:';
+
+/** Minimum trimmed output length to consider agent work meaningful. */
+const MIN_MEANINGFUL_OUTPUT_LENGTH = 200;
+
 export class ExecutionService {
  constructor(
    private eventBus: TypedEventBus,
@@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th
        }
      }

-      const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified';
+      // Read agent output before determining final status.
+      // CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing
+      // meaningful work. Check output to avoid prematurely marking as 'verified'.
+      const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
+      let agentOutput = '';
+      try {
+        agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
+      } catch {
+        /* */
+      }
+
+      // Determine if the agent did meaningful work by checking for tool usage
+      // indicators in the output. The agent executor writes "🔧 Tool:" markers
+      // each time a tool is invoked. No tool usage suggests the CLI exited
+      // without performing implementation work.
+      const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER);
+      const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH;
+      const agentDidWork = hasToolUsage && !isOutputTooShort;
+
+      let finalStatus: 'verified' | 'waiting_approval';
+      if (feature.skipTests) {
+        finalStatus = 'waiting_approval';
+      } else if (!agentDidWork) {
+        // Agent didn't produce meaningful output (e.g., CLI exited quickly).
+        // Route to waiting_approval so the user can review and re-run.
+        finalStatus = 'waiting_approval';
+        logger.warn(
+          `[executeFeature] Feature ${featureId}: agent produced insufficient output ` +
+            `(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` +
+            `Setting status to waiting_approval instead of verified.`
+        );
+      } else {
+        finalStatus = 'verified';
+      }
+
      await this.updateFeatureStatusFn(projectPath, featureId, finalStatus);
      this.recordSuccessFn();

@@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. Use th
      const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks;

      try {
-        const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
-        let agentOutput = '';
-        try {
-          agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
-        } catch {
-          /* */
-        }
        if (agentOutput) {
          const summary = extractSummary(agentOutput);
          if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary);