Fix agent output validation to prevent false verified status (#807)

* Changes from fix/cursor-fix * feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port * Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ci: Update test server port from 3008 to 3108 and add environment configuration * fix: Correct typo in health endpoint URL and standardize port env vars --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2026-03-18 22:33:08 +00:00 · 2026-02-24 20:18:40 -08:00
parent 0330c70261
commit 51e9a23ba1
36 changed files with 1610 additions and 104 deletions
--- a/apps/server/tests/unit/services/execution-service.test.ts
+++ b/apps/server/tests/unit/services/execution-service.test.ts
@@ -211,7 +211,14 @@ describe('execution-service.ts', () => {
    });

    // Default mocks for secureFs
-    vi.mocked(secureFs.readFile).mockResolvedValue('Agent output content');
+    // Include tool usage markers to simulate meaningful agent output.
+    // The execution service checks for '🔧 Tool:' markers and minimum
+    // output length to determine if the agent did real work.
+    vi.mocked(secureFs.readFile).mockResolvedValue(
+      'Starting implementation...\n\n🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\n\n' +
+        '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts", "old_string": "foo", "new_string": "bar"}\n\n' +
+        'Implementation complete. Updated the code as requested.'
+    );
    vi.mocked(secureFs.access).mockResolvedValue(undefined);

    // Re-setup platform mocks
@@ -1433,4 +1440,439 @@ describe('execution-service.ts', () => {
      );
    });
  });
+
+  describe('executeFeature - agent output validation', () => {
+    // Helper to generate realistic agent output with tool markers
+    const makeAgentOutput = (toolCount: number, extraText = ''): string => {
+      let output = 'Starting implementation...\n\n';
+      for (let i = 0; i < toolCount; i++) {
+        output += `🔧 Tool: Edit\nInput: {"file_path": "/src/file${i}.ts", "old_string": "old${i}", "new_string": "new${i}"}\n\n`;
+      }
+      output += `Implementation complete. ${extraText}`;
+      return output;
+    };
+
+    const createServiceWithMocks = () => {
+      return new ExecutionService(
+        mockEventBus,
+        mockConcurrencyManager,
+        mockWorktreeResolver,
+        mockSettingsService,
+        mockRunAgentFn,
+        mockExecutePipelineFn,
+        mockUpdateFeatureStatusFn,
+        mockLoadFeatureFn,
+        mockGetPlanningPromptPrefixFn,
+        mockSaveFeatureSummaryFn,
+        mockRecordLearningsFn,
+        mockContextExistsFn,
+        mockResumeFeatureFn,
+        mockTrackFailureFn,
+        mockSignalPauseFn,
+        mockRecordSuccessFn,
+        mockSaveExecutionStateFn,
+        mockLoadContextFilesFn
+      );
+    };
+
+    it('sets verified when agent output has tool usage and sufficient length', async () => {
+      const output = makeAgentOutput(3, 'Updated authentication module with new login flow.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      await service.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('sets waiting_approval when agent output is empty', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output has no tool usage markers', async () => {
+      // Long output but no tool markers - agent printed text but didn't use tools
+      const longOutputNoTools = 'I analyzed the codebase and found several issues. '.repeat(20);
+      vi.mocked(secureFs.readFile).mockResolvedValue(longOutputNoTools);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output has tool markers but is too short', async () => {
+      // Has a tool marker but total output is under 200 chars
+      const shortWithTool = '🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\nDone.';
+      expect(shortWithTool.trim().length).toBeLessThan(200);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(shortWithTool);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output file is missing (ENOENT)', async () => {
+      vi.mocked(secureFs.readFile).mockRejectedValue(new Error('ENOENT'));
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets waiting_approval when agent output is only whitespace', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('   \n\n\t  \n  ');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('sets verified when output is exactly at the 200 char threshold with tool usage', async () => {
+      // Create output that's exactly 200 chars trimmed with tool markers
+      const toolMarker = '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts"}\n';
+      const padding = 'x'.repeat(200 - toolMarker.length);
+      const output = toolMarker + padding;
+      expect(output.trim().length).toBeGreaterThanOrEqual(200);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('sets waiting_approval when output is 199 chars with tool usage (below threshold)', async () => {
+      const toolMarker = '🔧 Tool: Read\n';
+      const padding = 'x'.repeat(199 - toolMarker.length);
+      const output = toolMarker + padding;
+      expect(output.trim().length).toBe(199);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('skipTests always takes priority over output validation', async () => {
+      // Meaningful output with tool usage - would normally be 'verified'
+      const output = makeAgentOutput(5, 'All changes applied successfully.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
+      const svc = createServiceWithMocks();
+
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // skipTests=true always means waiting_approval regardless of output quality
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('skipTests with empty output still results in waiting_approval', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
+      const svc = createServiceWithMocks();
+
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('still records success even when output validation fails', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // recordSuccess should still be called - the agent ran without errors
+      expect(mockRecordSuccessFn).toHaveBeenCalled();
+    });
+
+    it('still extracts summary when output has content but no tool markers', async () => {
+      const outputNoTools = 'A '.repeat(150); // > 200 chars but no tool markers
+      vi.mocked(secureFs.readFile).mockResolvedValue(outputNoTools);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Summary extraction still runs even though status is waiting_approval
+      expect(extractSummary).toHaveBeenCalledWith(outputNoTools);
+      expect(mockSaveFeatureSummaryFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'Test summary'
+      );
+    });
+
+    it('emits feature_complete with passes=true even when output validation routes to waiting_approval', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1', false, true);
+
+      // The agent ran without error - it's still a "pass" from the execution perspective
+      expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
+        'auto_mode_feature_complete',
+        expect.objectContaining({ passes: true })
+      );
+    });
+
+    it('handles realistic Cursor CLI output that exits quickly', async () => {
+      // Simulates a Cursor CLI that prints a brief message and exits
+      const cursorQuickExit = 'Task received. Processing...\nResult: completed successfully.';
+      expect(cursorQuickExit.includes('🔧 Tool:')).toBe(false);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(cursorQuickExit);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // No tool usage = waiting_approval
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('handles realistic Claude SDK output with multiple tool uses', async () => {
+      // Simulates a Claude SDK agent that does real work
+      const claudeOutput =
+        "I'll implement the requested feature.\n\n" +
+        '🔧 Tool: Read\nInput: {"file_path": "/src/components/App.tsx"}\n\n' +
+        'I can see the existing component structure. Let me modify it.\n\n' +
+        '🔧 Tool: Edit\nInput: {"file_path": "/src/components/App.tsx", "old_string": "const App = () => {", "new_string": "const App: React.FC = () => {"}\n\n' +
+        '🔧 Tool: Write\nInput: {"file_path": "/src/components/NewFeature.tsx"}\n\n' +
+        "I've created the new component and updated the existing one. The feature is now implemented with proper TypeScript types.";
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(claudeOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Real work = verified
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('reads agent output from the correct path with utf-8 encoding', async () => {
+      const output = makeAgentOutput(2, 'Done with changes.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Verify readFile was called with the correct path derived from getFeatureDir
+      expect(secureFs.readFile).toHaveBeenCalledWith(
+        '/test/project/.automaker/features/feature-1/agent-output.md',
+        'utf-8'
+      );
+    });
+
+    it('completion message includes auto-verified when status is verified', async () => {
+      const output = makeAgentOutput(3, 'All changes applied.');
+      vi.mocked(secureFs.readFile).mockResolvedValue(output);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1', false, true);
+
+      expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
+        'auto_mode_feature_complete',
+        expect.objectContaining({
+          message: expect.stringContaining('auto-verified'),
+        })
+      );
+    });
+
+    it('completion message does NOT include auto-verified when status is waiting_approval', async () => {
+      // Empty output → waiting_approval
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1', false, true);
+
+      const completeCall = vi
+        .mocked(mockEventBus.emitAutoModeEvent)
+        .mock.calls.find((call) => call[0] === 'auto_mode_feature_complete');
+      expect(completeCall).toBeDefined();
+      expect((completeCall![1] as { message: string }).message).not.toContain('auto-verified');
+    });
+
+    it('uses same agentOutput for both status determination and summary extraction', async () => {
+      // Specific output that is long enough with tool markers (verified path)
+      // AND has content for summary extraction
+      const specificOutput =
+        '🔧 Tool: Read\nReading file...\n🔧 Tool: Edit\nEditing file...\n' +
+        'The implementation is complete. Here is a detailed description of what was done. '.repeat(
+          3
+        );
+      vi.mocked(secureFs.readFile).mockResolvedValue(specificOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Status should be verified (has tools + long enough)
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+      // extractSummary should receive the exact same output
+      expect(extractSummary).toHaveBeenCalledWith(specificOutput);
+      // recordLearnings should also receive the same output
+      expect(mockRecordLearningsFn).toHaveBeenCalledWith(
+        '/test/project',
+        testFeature,
+        specificOutput
+      );
+    });
+
+    it('does not call recordMemoryUsage when output is empty and memoryFiles is empty', async () => {
+      vi.mocked(secureFs.readFile).mockResolvedValue('');
+      const { recordMemoryUsage } = await import('@automaker/utils');
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // With empty output and empty memoryFiles, recordMemoryUsage should not be called
+      expect(recordMemoryUsage).not.toHaveBeenCalled();
+    });
+
+    it('handles output with special unicode characters correctly', async () => {
+      // Output with various unicode but includes tool markers
+      const unicodeOutput =
+        '🔧 Tool: Read\n' +
+        '🔧 Tool: Edit\n' +
+        'Añadiendo función de búsqueda con caracteres especiales: ñ, ü, ö, é, 日本語テスト. ' +
+        'Die Änderungen wurden erfolgreich implementiert. '.repeat(3);
+      vi.mocked(secureFs.readFile).mockResolvedValue(unicodeOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Should still detect tool markers and sufficient length
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'verified'
+      );
+    });
+
+    it('treats output with only newlines and spaces around tool marker as insufficient', async () => {
+      // Has tool marker but surrounded by whitespace, total trimmed < 200
+      const sparseOutput = '\n\n  🔧 Tool: Read  \n\n';
+      expect(sparseOutput.trim().length).toBeLessThan(200);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(sparseOutput);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('detects tool marker substring correctly (partial match like "🔧 Tools:" does not count)', async () => {
+      // Output with a similar but not exact marker - "🔧 Tools:" instead of "🔧 Tool:"
+      const wrongMarker = '🔧 Tools: Read\n🔧 Tools: Edit\n' + 'Implementation done. '.repeat(20);
+      expect(wrongMarker.includes('🔧 Tool:')).toBe(false);
+
+      vi.mocked(secureFs.readFile).mockResolvedValue(wrongMarker);
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // "🔧 Tools:" is not the same as "🔧 Tool:" - should be waiting_approval
+      expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
+        '/test/project',
+        'feature-1',
+        'waiting_approval'
+      );
+    });
+
+    it('pipeline merge_conflict status short-circuits before output validation', async () => {
+      // Set up pipeline that results in merge_conflict
+      vi.mocked(pipelineService.getPipelineConfig).mockResolvedValue({
+        version: 1,
+        steps: [{ id: 'step-1', name: 'Step 1', order: 1, instructions: 'Do step 1' }] as any,
+      });
+
+      // After pipeline, loadFeature returns merge_conflict status
+      let loadCallCount = 0;
+      mockLoadFeatureFn = vi.fn().mockImplementation(() => {
+        loadCallCount++;
+        if (loadCallCount === 1) return testFeature; // initial load
+        // All subsequent loads (task check + pipeline refresh) return merge_conflict
+        return { ...testFeature, status: 'merge_conflict' };
+      });
+
+      const svc = createServiceWithMocks();
+      await svc.executeFeature('/test/project', 'feature-1');
+
+      // Should NOT have called updateFeatureStatusFn with 'verified' or 'waiting_approval'
+      // because pipeline merge_conflict short-circuits the method
+      const statusCalls = vi
+        .mocked(mockUpdateFeatureStatusFn)
+        .mock.calls.filter((call) => call[2] === 'verified' || call[2] === 'waiting_approval');
+      // The only non-in_progress status call should be absent since merge_conflict returns early
+      expect(statusCalls.length).toBe(0);
+    });
+  });
 });