Files
automaker/apps/server/tests/unit/ui/agent-output-summary-e2e.test.ts
gsxdsm 9747faf1b9 Fix agent output summary for pipeline steps (#812)
* Changes from fix/agent-output-summary-for-pipeline-steps

* feat: Optimize pipeline summary extraction and fix regex vulnerability

* fix: Use fallback summary for pipeline steps when extraction fails

* fix: Strip follow-up session scaffold from pipeline step fallback summaries
2026-02-25 22:13:38 -08:00

564 lines
20 KiB
TypeScript

/**
* End-to-end integration tests for agent output summary display flow.
*
* These tests validate the complete flow from:
* 1. Server-side summary accumulation (FeatureStateManager.saveFeatureSummary)
* 2. Event emission with accumulated summary (auto_mode_summary event)
* 3. UI-side summary retrieval (feature.summary via API)
* 4. UI-side summary parsing and display (parsePhaseSummaries, extractSummary)
*
* The tests simulate what happens when:
* - A feature goes through multiple pipeline steps
* - Each step produces a summary
* - The server accumulates all summaries
* - The UI displays the accumulated summary
*/
import { describe, it, expect, beforeEach, vi, type Mock } from 'vitest';
import { FeatureStateManager } from '@/services/feature-state-manager.js';
import type { Feature } from '@automaker/types';
import type { EventEmitter } from '@/lib/events.js';
import type { FeatureLoader } from '@/services/feature-loader.js';
import { atomicWriteJson, readJsonWithRecovery } from '@automaker/utils';
import { getFeatureDir } from '@automaker/platform';
import { pipelineService } from '@/services/pipeline-service.js';
// Mock dependencies
// Every module below is replaced with a stub so the tests never touch the
// real filesystem, notification service or pipeline configuration.

// secure-fs: only readFile and readdir are stubbed.
vi.mock('@/lib/secure-fs.js', () => ({
readFile: vi.fn(),
readdir: vi.fn(),
}));
// @automaker/utils: keep every real export, but replace the JSON persistence
// helpers (and the recovery-warning logger) with mocks the tests can program
// and inspect.
vi.mock('@automaker/utils', async (importOriginal) => {
const actual = await importOriginal<typeof import('@automaker/utils')>();
return {
...actual,
atomicWriteJson: vi.fn(),
readJsonWithRecovery: vi.fn(),
logRecoveryWarning: vi.fn(),
};
});
// @automaker/platform: path helpers are stubbed; getFeatureDir gets its return
// value assigned in the test setup.
vi.mock('@automaker/platform', () => ({
getFeatureDir: vi.fn(),
getFeaturesDir: vi.fn(),
}));
// Notification service: the factory returns an object whose createNotification
// is a no-op mock.
vi.mock('@/services/notification-service.js', () => ({
getNotificationService: vi.fn(() => ({
createNotification: vi.fn(),
})),
}));
// Pipeline service: getStepIdFromStatus strips the first 'pipeline_' occurrence
// from statuses that start with that prefix and returns null for anything
// else; getStep is a bare mock that each test programs with the step to return.
vi.mock('@/services/pipeline-service.js', () => ({
pipelineService: {
getStepIdFromStatus: vi.fn((status: string) => {
if (status.startsWith('pipeline_')) return status.replace('pipeline_', '');
return null;
}),
getStep: vi.fn(),
},
}));
// ============================================================================
// UI-side parsing functions (mirrored from apps/ui/src/lib/log-parser.ts)
// ============================================================================
/**
 * Breaks an accumulated summary into its per-phase bodies.
 *
 * Sections are separated by a blank-line-delimited `---` rule. A section is
 * kept only when it begins with a `### <name>` header; the lower-cased,
 * trimmed name becomes the map key and the trimmed remainder of the section
 * becomes the value. A later section with the same name overwrites an
 * earlier one.
 *
 * @param summary - Accumulated summary text, possibly missing or blank.
 * @returns Phase name to phase body, in order of first appearance; empty when
 *   the input is missing, blank, or contains no headed section.
 */
function parsePhaseSummaries(summary: string | undefined): Map<string, string> {
  const byPhase = new Map<string, string>();
  if (!summary || !summary.trim()) {
    return byPhase;
  }

  summary.split(/\n\n---\n\n/).forEach((chunk) => {
    // The header must be the very first line of the section.
    const header = /^###\s+(.+?)(?:\n|$)/.exec(chunk);
    if (header === null) {
      return;
    }
    const key = header[1].trim().toLowerCase();
    const body = chunk.slice(header[0].length).trim();
    byPhase.set(key, body);
  });

  return byPhase;
}
/**
 * Pulls a summary out of raw agent output.
 *
 * Two patterns are tried in order: a `<summary>…</summary>` tag pair
 * (case-insensitive), then a markdown `## Summary` heading. For the first
 * pattern that matches anywhere, the LAST match in the text is used and its
 * captured group is returned trimmed.
 *
 * @param rawOutput - Raw agent output text.
 * @returns The trimmed summary text, or null when the input is blank or
 *   neither pattern matches.
 */
function extractSummary(rawOutput: string): string | null {
  if (!rawOutput || !rawOutput.trim()) {
    return null;
  }

  const strategies: Array<{
    regex: RegExp;
    processor: (m: RegExpMatchArray) => string;
  }> = [
    { regex: /<summary>([\s\S]*?)<\/summary>/gi, processor: (m) => m[1] },
    { regex: /^##\s+Summary[^\n]*\n([\s\S]*?)(?=\n##\s+[^#]|\n🔧|$)/gm, processor: (m) => m[1] },
  ];

  for (const strategy of strategies) {
    // Walk every match and remember only the final one.
    let latest: RegExpMatchArray | undefined;
    for (const hit of rawOutput.matchAll(strategy.regex)) {
      latest = hit;
    }
    if (latest !== undefined) {
      return strategy.processor(latest).trim();
    }
  }

  return null;
}
/**
 * Tells whether a summary looks like a multi-step accumulated summary.
 *
 * @param summary - Summary text, possibly missing or blank.
 * @returns true only when the text contains both the `---` section separator
 *   (surrounded by blank lines) and at least one `### <name>` header.
 */
function isAccumulatedSummary(summary: string | undefined): boolean {
  if (!summary) return false;
  if (summary.trim().length === 0) return false;

  const hasSeparator = summary.includes('\n\n---\n\n');
  const hasPhaseHeader = /###\s+.+/.test(summary);
  return hasSeparator && hasPhaseHeader;
}
/**
 * Picks the first candidate that is a string with at least one
 * non-whitespace character. The winning candidate is returned untouched
 * (it is not trimmed).
 * Mirrors getFirstNonEmptySummary from apps/ui/src/lib/summary-selection.ts
 *
 * @param candidates - Summary candidates in priority order.
 * @returns The first usable candidate, or null when there is none.
 */
function getFirstNonEmptySummary(...candidates: (string | null | undefined)[]): string | null {
  const winner = candidates.find(
    (entry): entry is string => typeof entry === 'string' && entry.trim().length > 0
  );
  return winner ?? null;
}
// ============================================================================
// Unit tests for helper functions
// ============================================================================
describe('getFirstNonEmptySummary', () => {
  type Candidate = string | null | undefined;

  // One row per test: title, the candidates passed in, and the expected pick
  // (null means "no usable candidate").
  const cases: Array<{ title: string; input: Candidate[]; expected: string | null }> = [
    {
      title: 'should return the first non-empty string',
      input: [null, undefined, 'first', 'second'],
      expected: 'first',
    },
    {
      title: 'should skip null and undefined candidates',
      input: [null, undefined, 'valid'],
      expected: 'valid',
    },
    {
      title: 'should skip whitespace-only strings',
      input: [' ', '\n\t', 'actual content'],
      expected: 'actual content',
    },
    {
      title: 'should return null when all candidates are empty',
      input: [null, undefined, '', ' '],
      expected: null,
    },
    {
      title: 'should return null when no candidates provided',
      input: [],
      expected: null,
    },
    {
      title: 'should handle empty string as invalid',
      input: ['', 'valid'],
      expected: 'valid',
    },
    {
      title: 'should prefer first valid candidate',
      input: ['first', 'second', 'third'],
      expected: 'first',
    },
    {
      title: 'should handle strings with only spaces as invalid',
      input: [' ', ' \n ', 'valid'],
      expected: 'valid',
    },
    {
      title: 'should accept strings with content surrounded by whitespace',
      input: [' content with spaces '],
      expected: ' content with spaces ',
    },
  ];

  for (const { title, input, expected } of cases) {
    it(title, () => {
      const picked = getFirstNonEmptySummary(...input);
      if (expected === null) {
        expect(picked).toBeNull();
      } else {
        expect(picked).toBe(expected);
      }
    });
  }
});
describe('Agent Output Summary E2E Flow', () => {
// Re-created before every test.
let manager: FeatureStateManager;
let mockEvents: EventEmitter;

/** Baseline feature record; tests spread it and override `status` / `summary`. */
const baseFeature: Feature = {
  id: 'e2e-feature-1',
  name: 'E2E Feature',
  title: 'E2E Feature Title',
  description: 'A feature going through complete pipeline',
  status: 'pipeline_implementation',
  createdAt: '2024-01-01T00:00:00Z',
  updatedAt: '2024-01-01T00:00:00Z',
};

// Clear mock history, then build a manager wired to a stub event emitter and
// a stub feature loader, and point getFeatureDir at the test feature's folder.
beforeEach(() => {
  vi.clearAllMocks();

  mockEvents = { emit: vi.fn(), subscribe: vi.fn(() => vi.fn()) };
  const loaderStub = { syncFeatureToAppSpec: vi.fn() } as unknown as FeatureLoader;

  manager = new FeatureStateManager(mockEvents, loaderStub);
  (getFeatureDir as Mock).mockReturnValue('/project/.automaker/features/e2e-feature-1');
});
describe('complete pipeline flow: server accumulation → UI display', () => {
it('should maintain complete summary across all pipeline steps', async () => {
// Walks one feature through three consecutive steps. Each step stubs the
// stored feature (readJsonWithRecovery) with the summary written by the
// previous step, saves a new step summary, and then inspects what was handed
// to atomicWriteJson.
// ===== STEP 1: Implementation =====
(pipelineService.getStep as Mock).mockResolvedValue({
name: 'Implementation',
id: 'implementation',
});
// Stored feature has no summary yet.
(readJsonWithRecovery as Mock).mockResolvedValue({
data: { ...baseFeature, status: 'pipeline_implementation', summary: undefined },
recovered: false,
source: 'main',
});
await manager.saveFeatureSummary(
'/project',
'e2e-feature-1',
'## Changes\n- Created auth module\n- Added user service'
);
// Second argument of the first atomicWriteJson call is the feature being persisted.
const step1Feature = (atomicWriteJson as Mock).mock.calls[0][1] as Feature;
const step1Summary = step1Feature.summary;
// Verify server-side accumulation format
expect(step1Summary).toBe(
'### Implementation\n\n## Changes\n- Created auth module\n- Added user service'
);
// Verify UI can parse this summary
const phases1 = parsePhaseSummaries(step1Summary);
expect(phases1.size).toBe(1);
expect(phases1.get('implementation')).toContain('Created auth module');
// ===== STEP 2: Code Review =====
// Reset recorded mock calls so mock.calls[0] below refers to this step's write.
vi.clearAllMocks();
(getFeatureDir as Mock).mockReturnValue('/project/.automaker/features/e2e-feature-1');
(pipelineService.getStep as Mock).mockResolvedValue({
name: 'Code Review',
id: 'code_review',
});
// Stored feature now carries the summary produced by step 1.
(readJsonWithRecovery as Mock).mockResolvedValue({
data: { ...baseFeature, status: 'pipeline_code_review', summary: step1Summary },
recovered: false,
source: 'main',
});
await manager.saveFeatureSummary(
'/project',
'e2e-feature-1',
'## Review Results\n- Approved with minor suggestions'
);
const step2Feature = (atomicWriteJson as Mock).mock.calls[0][1] as Feature;
const step2Summary = step2Feature.summary;
// Verify accumulation now has both steps
expect(step2Summary).toContain('### Implementation');
expect(step2Summary).toContain('Created auth module');
expect(step2Summary).toContain('### Code Review');
expect(step2Summary).toContain('Approved with minor suggestions');
expect(step2Summary).toContain('\n\n---\n\n'); // Separator
// Verify UI can parse accumulated summary
expect(isAccumulatedSummary(step2Summary)).toBe(true);
const phases2 = parsePhaseSummaries(step2Summary);
expect(phases2.size).toBe(2);
expect(phases2.get('implementation')).toContain('Created auth module');
// Phase keys are lower-cased by parsePhaseSummaries, hence 'code review'.
expect(phases2.get('code review')).toContain('Approved with minor suggestions');
// ===== STEP 3: Testing =====
// Same reset as before step 2.
vi.clearAllMocks();
(getFeatureDir as Mock).mockReturnValue('/project/.automaker/features/e2e-feature-1');
(pipelineService.getStep as Mock).mockResolvedValue({ name: 'Testing', id: 'testing' });
// Stored feature carries the two-step summary produced by step 2.
(readJsonWithRecovery as Mock).mockResolvedValue({
data: { ...baseFeature, status: 'pipeline_testing', summary: step2Summary },
recovered: false,
source: 'main',
});
await manager.saveFeatureSummary(
'/project',
'e2e-feature-1',
'## Test Results\n- 42 tests pass\n- 98% coverage'
);
const finalFeature = (atomicWriteJson as Mock).mock.calls[0][1] as Feature;
const finalSummary = finalFeature.summary;
// Verify final accumulation has all three steps
expect(finalSummary).toContain('### Implementation');
expect(finalSummary).toContain('Created auth module');
expect(finalSummary).toContain('### Code Review');
expect(finalSummary).toContain('Approved with minor suggestions');
expect(finalSummary).toContain('### Testing');
expect(finalSummary).toContain('42 tests pass');
// Verify UI-side parsing of complete pipeline
expect(isAccumulatedSummary(finalSummary)).toBe(true);
const finalPhases = parsePhaseSummaries(finalSummary);
expect(finalPhases.size).toBe(3);
// Verify chronological order (implementation before testing)
// Compares the line index of each header; the toContain checks above already
// ensure every header is present, so none of these indexes is -1.
const summaryLines = finalSummary!.split('\n');
const implIndex = summaryLines.findIndex((l) => l.includes('### Implementation'));
const reviewIndex = summaryLines.findIndex((l) => l.includes('### Code Review'));
const testIndex = summaryLines.findIndex((l) => l.includes('### Testing'));
expect(implIndex).toBeLessThan(reviewIndex);
expect(reviewIndex).toBeLessThan(testIndex);
});
it('should emit events with accumulated summaries for real-time UI updates', async () => {
  // What step 1 is expected to produce, and what step 2 then reads back as
  // the stored summary.
  const stepOneSummary = '### Implementation\n\nStep 1 output';

  // ----- Step 1: nothing stored yet -----
  (readJsonWithRecovery as Mock).mockResolvedValue({
    data: { ...baseFeature, status: 'pipeline_implementation', summary: undefined },
    recovered: false,
    source: 'main',
  });
  (pipelineService.getStep as Mock).mockResolvedValue({
    name: 'Implementation',
    id: 'implementation',
  });

  await manager.saveFeatureSummary('/project', 'e2e-feature-1', 'Step 1 output');

  const firstEvent = {
    type: 'auto_mode_summary',
    featureId: 'e2e-feature-1',
    projectPath: '/project',
    summary: stepOneSummary,
  };
  expect(mockEvents.emit).toHaveBeenCalledWith('auto-mode:event', firstEvent);

  // ----- Step 2: stored feature already holds step 1's summary -----
  vi.clearAllMocks();
  (getFeatureDir as Mock).mockReturnValue('/project/.automaker/features/e2e-feature-1');
  (readJsonWithRecovery as Mock).mockResolvedValue({
    data: {
      ...baseFeature,
      status: 'pipeline_testing',
      summary: stepOneSummary,
    },
    recovered: false,
    source: 'main',
  });
  (pipelineService.getStep as Mock).mockResolvedValue({ name: 'Testing', id: 'testing' });

  await manager.saveFeatureSummary('/project', 'e2e-feature-1', 'Step 2 output');

  // The event must carry the FULL accumulated summary, not just step 2.
  const secondEvent = {
    type: 'auto_mode_summary',
    featureId: 'e2e-feature-1',
    projectPath: '/project',
    summary: '### Implementation\n\nStep 1 output\n\n---\n\n### Testing\n\nStep 2 output',
  };
  expect(mockEvents.emit).toHaveBeenCalledWith('auto-mode:event', secondEvent);
});
});
describe('UI display logic: feature.summary vs extractSummary()', () => {
it('should prefer feature.summary (server-accumulated) over extractSummary() (last only)', () => {
  // Server-side accumulated summary: two phases joined by the separator.
  const featureSummary =
    '### Implementation\n\n## Changes\n- Created feature' +
    '\n\n---\n\n' +
    '### Testing\n\n## Results\n- All tests pass';

  // Raw agent output only carries the most recent summary.
  const rawOutput = `
Working on tests...
<summary>
## Results
- All tests pass
</summary>
`;

  // Client-side extraction, used by the UI only as a fallback.
  const extracted = extractSummary(rawOutput);

  // UI logic: getFirstNonEmptySummary(feature?.summary, extractSummary(output))
  const shown = getFirstNonEmptySummary(featureSummary, extracted);

  // The server-accumulated summary wins.
  expect(shown).toBe(featureSummary);
  expect(shown).toContain('### Implementation');
  expect(shown).toContain('### Testing');

  // Without the server summary only the last step would be visible.
  expect(extracted).not.toContain('Implementation');
  expect(extracted).toContain('All tests pass');
});
it('should handle legacy features without server accumulation', () => {
  // A legacy feature carries no server-side summary at all.
  const serverSummary = undefined;

  // The only summary available lives inside the raw agent output.
  const rawOutput = `
<summary>
## Implementation Complete
- Created the feature
- All tests pass
</summary>
`;

  // UI logic: getFirstNonEmptySummary(feature?.summary, extractSummary(output))
  const extracted = extractSummary(rawOutput);
  const shown = getFirstNonEmptySummary(serverSummary, extracted);

  // Falls back to the client-side extraction.
  expect(shown).toContain('Implementation Complete');
  expect(shown).toContain('All tests pass');
});
});
describe('error recovery and edge cases', () => {
it('should gracefully handle pipeline interruption', async () => {
  // Only the first step ever completes.
  (readJsonWithRecovery as Mock).mockResolvedValue({
    data: { ...baseFeature, status: 'pipeline_implementation', summary: undefined },
    recovered: false,
    source: 'main',
  });
  (pipelineService.getStep as Mock).mockResolvedValue({
    name: 'Implementation',
    id: 'implementation',
  });

  await manager.saveFeatureSummary('/project', 'e2e-feature-1', 'Implementation done');

  // Second argument of the first atomicWriteJson call is the persisted feature.
  const [, persisted] = (atomicWriteJson as Mock).mock.calls[0];
  const savedSummary = (persisted as Feature).summary;

  // Pipeline gets interrupted (status changes but summary is preserved);
  // when the user views the feature later the summary should still be there.
  expect(savedSummary).toBe('### Implementation\n\nImplementation done');

  // The partial pipeline is still parseable by the UI.
  const parsed = parsePhaseSummaries(savedSummary);
  expect(parsed.size).toBe(1);
  expect(parsed.get('implementation')).toBe('Implementation done');
});
it('should handle very large accumulated summaries', async () => {
  // Pipeline steps to simulate, in execution order. The loop bound and the
  // final phase count are both derived from this list.
  const stepNames = ['Planning', 'Implementation', 'Code Review', 'Testing', 'Refinement'];
  const changesPerStep = 100;

  // Builds a heading followed by `changesPerStep` bullet lines for one step.
  const generateLargeContent = (stepNum: number): string =>
    [
      `## Step ${stepNum} Changes`,
      ...Array.from(
        { length: changesPerStep },
        (_, i) =>
          `- Change ${i}: This is a detailed description of the change made during step ${stepNum}`
      ),
    ].join('\n');

  // Summary carried from one step to the next, as the stored feature would hold it.
  let currentSummary: string | undefined;

  for (const [index, stepName] of stepNames.entries()) {
    // Derive the step id once; the global regex replaces every space, not just the first.
    const stepId = stepName.toLowerCase().replace(/ /g, '_');

    // Reset recorded calls so mock.calls[0] below is this iteration's write.
    vi.clearAllMocks();
    (getFeatureDir as Mock).mockReturnValue('/project/.automaker/features/e2e-feature-1');
    (pipelineService.getStep as Mock).mockResolvedValue({
      name: stepName,
      id: stepId,
    });
    (readJsonWithRecovery as Mock).mockResolvedValue({
      data: {
        ...baseFeature,
        status: `pipeline_${stepId}`,
        summary: currentSummary,
      },
      recovered: false,
      source: 'main',
    });

    // Step numbers in the generated content are 1-based.
    await manager.saveFeatureSummary('/project', 'e2e-feature-1', generateLargeContent(index + 1));
    currentSummary = ((atomicWriteJson as Mock).mock.calls[0][1] as Feature).summary;
  }

  // Final summary should be large but still parseable.
  expect(currentSummary).toBeDefined();
  expect((currentSummary ?? '').length).toBeGreaterThan(5000);
  expect(isAccumulatedSummary(currentSummary)).toBe(true);

  const phases = parsePhaseSummaries(currentSummary);
  expect(phases.size).toBe(stepNames.length);

  // Every step must be present under its lower-cased name.
  for (const stepName of stepNames) {
    expect(phases.has(stepName.toLowerCase())).toBe(true);
  }
});
});
describe('query invalidation simulation', () => {
it('should trigger UI refetch on auto_mode_summary event', async () => {
  // Documents the expected behavior:
  //   - saveFeatureSummary emits an auto_mode_summary event
  //   - the UI's use-query-invalidation.ts invalidates the feature query
  //   - the feature is refetched and carries the updated summary
  // Only the event emission is asserted here.
  (readJsonWithRecovery as Mock).mockResolvedValue({
    data: { ...baseFeature, status: 'pipeline_implementation', summary: undefined },
    recovered: false,
    source: 'main',
  });
  (pipelineService.getStep as Mock).mockResolvedValue({
    name: 'Implementation',
    id: 'implementation',
  });

  await manager.saveFeatureSummary('/project', 'e2e-feature-1', 'Summary content');

  // The emitted event is what triggers React Query invalidation in the UI.
  const expectedPayload = expect.objectContaining({
    type: 'auto_mode_summary',
    featureId: 'e2e-feature-1',
    summary: expect.any(String),
  });
  expect(mockEvents.emit).toHaveBeenCalledWith('auto-mode:event', expectedPayload);

  // The UI would then:
  // 1. Receive the event via WebSocket
  // 2. Invalidate the feature query
  // 3. Refetch the feature (GET /api/features/:id)
  // 4. Display the updated feature.summary
});
});
});
/**
* KEY E2E FLOW SUMMARY:
*
* 1. PIPELINE EXECUTION:
* - Feature starts with status='pipeline_implementation'
* - Agent runs and produces summary
* - FeatureStateManager.saveFeatureSummary() accumulates with step header
* - Status advances to 'pipeline_testing'
* - Process repeats for each step
*
* 2. SERVER-SIDE ACCUMULATION:
* - First step: `### Implementation\n\n<content>`
* - Second step: `### Implementation\n\n<content>\n\n---\n\n### Testing\n\n<content>`
* - Pattern continues with each step
*
* 3. EVENT EMISSION:
* - auto_mode_summary event contains FULL accumulated summary
* - UI receives event via WebSocket
* - React Query invalidates feature query
* - Feature is refetched with updated summary
*
* 4. UI DISPLAY:
* - AgentOutputModal uses: getFirstNonEmptySummary(feature?.summary, extractSummary(output))
* - feature.summary is preferred (contains all steps)
* - extractSummary() is fallback (last summary only)
* - parsePhaseSummaries() can split into individual phases for UI
*
* 5. FALLBACK FOR LEGACY:
* - Old features may not have feature.summary
* - UI falls back to extracting from raw output
* - Only last summary is available in this case
*/