mirror of
https://github.com/AutoMaker-Org/automaker.git
synced 2026-03-18 10:23:07 +00:00
Fix agent output validation to prevent false verified status (#807)
* Changes from fix/cursor-fix * feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port * Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ci: Update test server port from 3008 to 3108 and add environment configuration * fix: Correct typo in health endpoint URL and standardize port env vars --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
@@ -52,6 +52,12 @@ HOST=0.0.0.0
|
||||
# Port to run the server on
|
||||
PORT=3008
|
||||
|
||||
# Port to run the server on for testing
|
||||
TEST_SERVER_PORT=3108
|
||||
|
||||
# Port to run the UI on for testing
|
||||
TEST_PORT=3107
|
||||
|
||||
# Data directory for sessions and metadata
|
||||
DATA_DIR=./data
|
||||
|
||||
|
||||
@@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider {
|
||||
|
||||
case 'session.error': {
|
||||
const errorEvent = sdkEvent as SdkSessionErrorEvent;
|
||||
const enrichedError =
|
||||
errorEvent.data.message ||
|
||||
(errorEvent.data.code
|
||||
? `Copilot agent error (code: ${errorEvent.data.code})`
|
||||
: 'Copilot agent error');
|
||||
return {
|
||||
type: 'error',
|
||||
error: errorEvent.data.message || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider {
|
||||
const resultEvent = cursorEvent as CursorResultEvent;
|
||||
|
||||
if (resultEvent.is_error) {
|
||||
const errorText = resultEvent.error || resultEvent.result || '';
|
||||
const enrichedError =
|
||||
errorText ||
|
||||
`Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: resultEvent.session_id,
|
||||
error: resultEvent.error || resultEvent.result || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider {
|
||||
const resultEvent = geminiEvent as GeminiResultEvent;
|
||||
|
||||
if (resultEvent.status === 'error') {
|
||||
const enrichedError =
|
||||
resultEvent.error ||
|
||||
`Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: resultEvent.session_id,
|
||||
error: resultEvent.error || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider {
|
||||
|
||||
case 'error': {
|
||||
const errorEvent = geminiEvent as GeminiResultEvent;
|
||||
const enrichedError =
|
||||
errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: errorEvent.session_id,
|
||||
error: errorEvent.error || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -296,8 +296,28 @@ export class AgentExecutor {
|
||||
}
|
||||
}
|
||||
} else if (msg.type === 'error') {
|
||||
throw new Error(AgentExecutor.sanitizeProviderError(msg.error));
|
||||
} else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite();
|
||||
const sanitized = AgentExecutor.sanitizeProviderError(msg.error);
|
||||
logger.error(
|
||||
`[execute] Feature ${featureId} received error from provider. ` +
|
||||
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(sanitized);
|
||||
} else if (msg.type === 'result') {
|
||||
if (msg.subtype === 'success') {
|
||||
scheduleWrite();
|
||||
} else if (msg.subtype?.startsWith('error')) {
|
||||
// Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.)
|
||||
logger.error(
|
||||
`[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` +
|
||||
`session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(`Agent execution ended with: ${msg.subtype}`);
|
||||
} else {
|
||||
logger.warn(
|
||||
`[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
clearInterval(streamHeartbeat);
|
||||
@@ -447,16 +467,28 @@ export class AgentExecutor {
|
||||
});
|
||||
}
|
||||
} else if (msg.type === 'error') {
|
||||
// Clean the error: strip ANSI codes and redundant "Error: " prefix
|
||||
const cleanedError =
|
||||
(msg.error || `Error during task ${task.id}`)
|
||||
.replace(/\x1b\[[0-9;]*m/g, '')
|
||||
.replace(/^Error:\s*/i, '')
|
||||
.trim() || `Error during task ${task.id}`;
|
||||
throw new Error(cleanedError);
|
||||
} else if (msg.type === 'result' && msg.subtype === 'success') {
|
||||
taskOutput += msg.result || '';
|
||||
responseText += msg.result || '';
|
||||
const fallback = `Error during task ${task.id}`;
|
||||
const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback);
|
||||
logger.error(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` +
|
||||
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(sanitized);
|
||||
} else if (msg.type === 'result') {
|
||||
if (msg.subtype === 'success') {
|
||||
taskOutput += msg.result || '';
|
||||
responseText += msg.result || '';
|
||||
} else if (msg.subtype?.startsWith('error')) {
|
||||
logger.error(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` +
|
||||
`session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(`Agent execution ended with: ${msg.subtype}`);
|
||||
} else {
|
||||
logger.warn(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!taskCompleteDetected)
|
||||
|
||||
@@ -60,6 +60,12 @@ import type {
|
||||
|
||||
const logger = createLogger('ExecutionService');
|
||||
|
||||
/** Marker written by agent-executor for each tool invocation. */
|
||||
const TOOL_USE_MARKER = '🔧 Tool:';
|
||||
|
||||
/** Minimum trimmed output length to consider agent work meaningful. */
|
||||
const MIN_MEANINGFUL_OUTPUT_LENGTH = 200;
|
||||
|
||||
export class ExecutionService {
|
||||
constructor(
|
||||
private eventBus: TypedEventBus,
|
||||
@@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th
|
||||
}
|
||||
}
|
||||
|
||||
const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified';
|
||||
// Read agent output before determining final status.
|
||||
// CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing
|
||||
// meaningful work. Check output to avoid prematurely marking as 'verified'.
|
||||
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
|
||||
let agentOutput = '';
|
||||
try {
|
||||
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
|
||||
} catch {
|
||||
/* */
|
||||
}
|
||||
|
||||
// Determine if the agent did meaningful work by checking for tool usage
|
||||
// indicators in the output. The agent executor writes "🔧 Tool:" markers
|
||||
// each time a tool is invoked. No tool usage suggests the CLI exited
|
||||
// without performing implementation work.
|
||||
const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER);
|
||||
const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH;
|
||||
const agentDidWork = hasToolUsage && !isOutputTooShort;
|
||||
|
||||
let finalStatus: 'verified' | 'waiting_approval';
|
||||
if (feature.skipTests) {
|
||||
finalStatus = 'waiting_approval';
|
||||
} else if (!agentDidWork) {
|
||||
// Agent didn't produce meaningful output (e.g., CLI exited quickly).
|
||||
// Route to waiting_approval so the user can review and re-run.
|
||||
finalStatus = 'waiting_approval';
|
||||
logger.warn(
|
||||
`[executeFeature] Feature ${featureId}: agent produced insufficient output ` +
|
||||
`(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` +
|
||||
`Setting status to waiting_approval instead of verified.`
|
||||
);
|
||||
} else {
|
||||
finalStatus = 'verified';
|
||||
}
|
||||
|
||||
await this.updateFeatureStatusFn(projectPath, featureId, finalStatus);
|
||||
this.recordSuccessFn();
|
||||
|
||||
@@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. Use th
|
||||
const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks;
|
||||
|
||||
try {
|
||||
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
|
||||
let agentOutput = '';
|
||||
try {
|
||||
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
|
||||
} catch {
|
||||
/* */
|
||||
}
|
||||
if (agentOutput) {
|
||||
const summary = extractSummary(agentOutput);
|
||||
if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary);
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { normalizeThinkingLevelForModel } from '@automaker/types';
|
||||
|
||||
describe('normalizeThinkingLevelForModel', () => {
|
||||
it('preserves explicitly selected none for Opus models', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-opus', 'none')).toBe('none');
|
||||
});
|
||||
|
||||
it('falls back to none when Opus receives an unsupported manual thinking level', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-opus', 'medium')).toBe('none');
|
||||
});
|
||||
|
||||
it('keeps adaptive for Opus when adaptive is selected', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-opus', 'adaptive')).toBe('adaptive');
|
||||
});
|
||||
|
||||
it('preserves supported manual levels for non-Opus models', () => {
|
||||
expect(normalizeThinkingLevelForModel('claude-sonnet', 'high')).toBe('high');
|
||||
});
|
||||
});
|
||||
@@ -397,6 +397,45 @@ describe('copilot-provider.ts', () => {
|
||||
});
|
||||
});
|
||||
|
||||
it('should use error code in fallback when session.error message is empty', () => {
|
||||
const event = {
|
||||
type: 'session.error',
|
||||
data: { message: '', code: 'RATE_LIMIT_EXCEEDED' },
|
||||
};
|
||||
|
||||
const result = provider.normalizeEvent(event);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.type).toBe('error');
|
||||
expect(result!.error).toContain('RATE_LIMIT_EXCEEDED');
|
||||
expect(result!.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('should return generic "Copilot agent error" fallback when both message and code are empty', () => {
|
||||
const event = {
|
||||
type: 'session.error',
|
||||
data: { message: '', code: '' },
|
||||
};
|
||||
|
||||
const result = provider.normalizeEvent(event);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.type).toBe('error');
|
||||
expect(result!.error).toBe('Copilot agent error');
|
||||
// Must NOT be the old opaque 'Unknown error'
|
||||
expect(result!.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('should return generic "Copilot agent error" fallback when data has no code field', () => {
|
||||
const event = {
|
||||
type: 'session.error',
|
||||
data: { message: '' },
|
||||
};
|
||||
|
||||
const result = provider.normalizeEvent(event);
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.type).toBe('error');
|
||||
expect(result!.error).toBe('Copilot agent error');
|
||||
});
|
||||
|
||||
it('should return null for unknown event types', () => {
|
||||
const event = { type: 'unknown.event' };
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
import { CursorProvider } from '@/providers/cursor-provider.js';
|
||||
|
||||
describe('cursor-provider.ts', () => {
|
||||
@@ -36,4 +36,122 @@ describe('cursor-provider.ts', () => {
|
||||
expect(args).not.toContain('--resume');
|
||||
});
|
||||
});
|
||||
|
||||
describe('normalizeEvent - result error handling', () => {
|
||||
let provider: CursorProvider;
|
||||
|
||||
beforeEach(() => {
|
||||
provider = Object.create(CursorProvider.prototype) as CursorProvider;
|
||||
});
|
||||
|
||||
it('returns error message from resultEvent.error when is_error=true', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: 'Rate limit exceeded',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 3000,
|
||||
session_id: 'sess-123',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
expect(msg!.error).toBe('Rate limit exceeded');
|
||||
});
|
||||
|
||||
it('falls back to resultEvent.result when error field is empty and is_error=true', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: '',
|
||||
result: 'Process terminated unexpectedly',
|
||||
subtype: 'error',
|
||||
duration_ms: 5000,
|
||||
session_id: 'sess-456',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
expect(msg!.error).toBe('Process terminated unexpectedly');
|
||||
});
|
||||
|
||||
it('builds diagnostic fallback when both error and result are empty and is_error=true', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: '',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 5000,
|
||||
session_id: 'sess-789',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
// Should contain diagnostic info rather than 'Unknown error'
|
||||
expect(msg!.error).toContain('5000ms');
|
||||
expect(msg!.error).toContain('sess-789');
|
||||
expect(msg!.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('preserves session_id in error message', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: 'Timeout occurred',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 30000,
|
||||
session_id: 'my-session-id',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg!.session_id).toBe('my-session-id');
|
||||
});
|
||||
|
||||
it('uses "none" when session_id is missing from diagnostic fallback', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: true,
|
||||
error: '',
|
||||
result: '',
|
||||
subtype: 'error',
|
||||
duration_ms: 5000,
|
||||
// session_id intentionally omitted
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('error');
|
||||
expect(msg!.error).toContain('none');
|
||||
expect(msg!.error).not.toContain('undefined');
|
||||
});
|
||||
|
||||
it('returns success result when is_error=false', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
is_error: false,
|
||||
error: '',
|
||||
result: 'Completed successfully',
|
||||
subtype: 'success',
|
||||
duration_ms: 2000,
|
||||
session_id: 'sess-ok',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event);
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg!.type).toBe('result');
|
||||
expect(msg!.subtype).toBe('success');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
import { GeminiProvider } from '@/providers/gemini-provider.js';
|
||||
import type { ProviderMessage } from '@automaker/types';
|
||||
|
||||
describe('gemini-provider.ts', () => {
|
||||
let provider: GeminiProvider;
|
||||
@@ -116,4 +117,140 @@ describe('gemini-provider.ts', () => {
|
||||
expect(args[modelIndex + 1]).toBe('gemini-2.5-pro');
|
||||
});
|
||||
});
|
||||
|
||||
describe('normalizeEvent - error handling', () => {
|
||||
it('returns error from result event when status=error and error field is set', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: 'Model overloaded',
|
||||
session_id: 'sess-gemini-1',
|
||||
stats: { duration_ms: 4000, total_tokens: 0 },
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toBe('Model overloaded');
|
||||
expect(msg.session_id).toBe('sess-gemini-1');
|
||||
});
|
||||
|
||||
it('builds diagnostic fallback when result event has status=error but empty error field', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-2',
|
||||
stats: { duration_ms: 7500, total_tokens: 0 },
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
// Diagnostic info should be present instead of 'Unknown error'
|
||||
expect(msg.error).toContain('7500ms');
|
||||
expect(msg.error).toContain('sess-gemini-2');
|
||||
expect(msg.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('builds fallback with "unknown" duration when stats are missing', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-nostats',
|
||||
// no stats field
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toContain('unknown');
|
||||
});
|
||||
|
||||
it('returns error from standalone error event with error field set', () => {
|
||||
const event = {
|
||||
type: 'error',
|
||||
error: 'API key invalid',
|
||||
session_id: 'sess-gemini-3',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toBe('API key invalid');
|
||||
});
|
||||
|
||||
it('builds diagnostic fallback when standalone error event has empty error field', () => {
|
||||
const event = {
|
||||
type: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-empty',
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
// Should include session_id, not just 'Unknown error'
|
||||
expect(msg.error).toContain('sess-gemini-empty');
|
||||
expect(msg.error).not.toBe('Unknown error');
|
||||
});
|
||||
|
||||
it('builds fallback mentioning "none" when session_id is missing from error event', () => {
|
||||
const event = {
|
||||
type: 'error',
|
||||
error: '',
|
||||
// no session_id
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('error');
|
||||
expect(msg.error).toContain('none');
|
||||
});
|
||||
|
||||
it('uses consistent "Gemini agent failed" label for both result and error event fallbacks', () => {
|
||||
const resultEvent = {
|
||||
type: 'result',
|
||||
status: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-r',
|
||||
stats: { duration_ms: 1000 },
|
||||
};
|
||||
const errorEvent = {
|
||||
type: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-e',
|
||||
};
|
||||
|
||||
const resultMsg = provider.normalizeEvent(resultEvent) as ProviderMessage;
|
||||
const errorMsg = provider.normalizeEvent(errorEvent) as ProviderMessage;
|
||||
|
||||
// Both fallback messages should use the same "Gemini agent failed" prefix
|
||||
expect(resultMsg.error).toContain('Gemini agent failed');
|
||||
expect(errorMsg.error).toContain('Gemini agent failed');
|
||||
});
|
||||
|
||||
it('returns success result when result event has status=success', () => {
|
||||
const event = {
|
||||
type: 'result',
|
||||
status: 'success',
|
||||
error: '',
|
||||
session_id: 'sess-gemini-ok',
|
||||
stats: { duration_ms: 1200, total_tokens: 500 },
|
||||
};
|
||||
|
||||
const msg = provider.normalizeEvent(event) as ProviderMessage;
|
||||
|
||||
expect(msg).not.toBeNull();
|
||||
expect(msg.type).toBe('result');
|
||||
expect(msg.subtype).toBe('success');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -685,6 +685,309 @@ describe('AgentExecutor', () => {
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow('API rate limit exceeded');
|
||||
});
|
||||
|
||||
it('should throw "Unknown error" when provider stream yields error with empty message', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'error',
|
||||
error: '',
|
||||
session_id: 'sess-123',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow('Unknown error');
|
||||
});
|
||||
|
||||
it('should throw with sanitized error when provider yields ANSI-decorated error', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'error',
|
||||
// ANSI color codes + "Error: " prefix that should be stripped
|
||||
error: '\x1b[31mError: Connection refused\x1b[0m',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
// Should strip ANSI codes and "Error: " prefix
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow('Connection refused');
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_max_turns', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'assistant',
|
||||
message: {
|
||||
content: [{ type: 'text', text: 'Working on it...' }],
|
||||
},
|
||||
};
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_max_turns',
|
||||
session_id: 'sess-456',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_max_turns'
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_during_execution', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_during_execution',
|
||||
session_id: 'sess-789',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_during_execution'
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_max_structured_output_retries', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_max_structured_output_retries',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_max_structured_output_retries'
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw when result subtype is error_max_budget_usd', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'error_max_budget_usd',
|
||||
session_id: 'sess-budget',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
await expect(executor.execute(options, callbacks)).rejects.toThrow(
|
||||
'Agent execution ended with: error_max_budget_usd'
|
||||
);
|
||||
});
|
||||
|
||||
it('should NOT throw when result subtype is success', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
mockFeatureStateManager,
|
||||
mockPlanApprovalService,
|
||||
mockSettingsService
|
||||
);
|
||||
|
||||
const mockProvider = {
|
||||
getName: () => 'mock',
|
||||
executeQuery: vi.fn().mockImplementation(function* () {
|
||||
yield {
|
||||
type: 'assistant',
|
||||
message: {
|
||||
content: [{ type: 'text', text: 'Done!' }],
|
||||
},
|
||||
};
|
||||
yield {
|
||||
type: 'result',
|
||||
subtype: 'success',
|
||||
session_id: 'sess-ok',
|
||||
};
|
||||
}),
|
||||
} as unknown as BaseProvider;
|
||||
|
||||
const options: AgentExecutionOptions = {
|
||||
workDir: '/test',
|
||||
featureId: 'test-feature',
|
||||
prompt: 'Test prompt',
|
||||
projectPath: '/project',
|
||||
abortController: new AbortController(),
|
||||
provider: mockProvider,
|
||||
effectiveBareModel: 'claude-sonnet-4-6',
|
||||
planningMode: 'skip',
|
||||
};
|
||||
|
||||
const callbacks = {
|
||||
waitForApproval: vi.fn().mockResolvedValue({ approved: true }),
|
||||
saveFeatureSummary: vi.fn(),
|
||||
updateFeatureSummary: vi.fn(),
|
||||
buildTaskPrompt: vi.fn().mockReturnValue('task prompt'),
|
||||
};
|
||||
|
||||
// Should resolve without throwing
|
||||
const result = await executor.execute(options, callbacks);
|
||||
expect(result.aborted).toBe(false);
|
||||
expect(result.responseText).toContain('Done!');
|
||||
});
|
||||
|
||||
it('should throw error when authentication fails in response', async () => {
|
||||
const executor = new AgentExecutor(
|
||||
mockEventBus,
|
||||
|
||||
192
apps/server/tests/unit/services/agent-output-validation.test.ts
Normal file
192
apps/server/tests/unit/services/agent-output-validation.test.ts
Normal file
@@ -0,0 +1,192 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
|
||||
/**
|
||||
* Contract tests verifying the tool marker format used by agent-executor
|
||||
* (which writes agent output) and execution-service (which reads it to
|
||||
* determine if the agent did meaningful work).
|
||||
*
|
||||
* The agent-executor writes: `\n🔧 Tool: ${block.name}\n`
|
||||
* The execution-service checks: `agentOutput.includes('🔧 Tool:')`
|
||||
*
|
||||
* These tests ensure the marker format contract stays consistent and
|
||||
* document the exact detection logic used for status determination.
|
||||
*/
|
||||
|
||||
// The exact marker prefix that execution-service searches for
|
||||
const TOOL_MARKER = '🔧 Tool:';
|
||||
|
||||
// Minimum output length threshold for "meaningful work"
|
||||
const MIN_OUTPUT_LENGTH = 200;
|
||||
|
||||
/**
|
||||
* Simulates the agent-executor's tool_use output format.
|
||||
* See: agent-executor.ts line ~293
|
||||
*/
|
||||
function formatToolUseBlock(toolName: string, input?: Record<string, unknown>): string {
|
||||
let output = `\n${TOOL_MARKER} ${toolName}\n`;
|
||||
if (input) output += `Input: ${JSON.stringify(input, null, 2)}\n`;
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simulates the execution-service's output validation logic.
|
||||
* See: execution-service.ts lines ~427-429
|
||||
*/
|
||||
function validateAgentOutput(
|
||||
agentOutput: string,
|
||||
skipTests: boolean
|
||||
): 'verified' | 'waiting_approval' {
|
||||
const hasToolUsage = agentOutput.includes(TOOL_MARKER);
|
||||
const hasMinimalOutput = agentOutput.trim().length < MIN_OUTPUT_LENGTH;
|
||||
const agentDidWork = hasToolUsage && !hasMinimalOutput;
|
||||
|
||||
if (skipTests) return 'waiting_approval';
|
||||
if (!agentDidWork) return 'waiting_approval';
|
||||
return 'verified';
|
||||
}
|
||||
|
||||
// Contract tests: the marker format written by agent-executor and the
// detection logic in execution-service must stay in lockstep. Each group
// below pins one side of that contract.
describe('Agent Output Validation - Contract Tests', () => {
  // Writer side: the exact text agent-executor emits for a tool_use block.
  describe('tool marker format contract', () => {
    it('agent-executor tool format contains the expected marker', () => {
      const toolOutput = formatToolUseBlock('Read', { file_path: '/src/index.ts' });
      expect(toolOutput).toContain(TOOL_MARKER);
    });

    it('agent-executor tool format includes tool name after marker', () => {
      const toolOutput = formatToolUseBlock('Edit', {
        file_path: '/src/app.ts',
        old_string: 'foo',
        new_string: 'bar',
      });
      expect(toolOutput).toContain('🔧 Tool: Edit');
    });

    it('agent-executor tool format includes JSON input', () => {
      const input = { file_path: '/src/index.ts' };
      const toolOutput = formatToolUseBlock('Read', input);
      expect(toolOutput).toContain('Input: ');
      expect(toolOutput).toContain('"file_path": "/src/index.ts"');
    });

    it('agent-executor tool format works without input', () => {
      const toolOutput = formatToolUseBlock('Bash');
      expect(toolOutput).toContain('🔧 Tool: Bash');
      expect(toolOutput).not.toContain('Input:');
    });

    it('marker includes colon and space to avoid false positives', () => {
      // Ensure the marker is specific enough to avoid matching other emoji patterns
      expect(TOOL_MARKER).toBe('🔧 Tool:');
      expect(TOOL_MARKER).toContain(':');
    });
  });

  // Reader side: how the marker + length threshold map onto a status.
  describe('output validation logic', () => {
    it('verified: tool usage + sufficient output', () => {
      const output =
        'Starting implementation of the new feature...\n' +
        formatToolUseBlock('Read', { file_path: '/src/index.ts' }) +
        'I can see the existing code. Let me make the needed changes.\n' +
        formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) +
        'Changes complete. The implementation adds new validation logic and tests.';
      // Sanity-check the fixture actually clears the length threshold.
      expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(output, false)).toBe('verified');
    });

    it('waiting_approval: no tool markers regardless of length', () => {
      const longOutput = 'I analyzed the codebase. '.repeat(50);
      expect(longOutput.trim().length).toBeGreaterThan(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(longOutput, false)).toBe('waiting_approval');
    });

    it('waiting_approval: tool markers but insufficient length', () => {
      const shortOutput = formatToolUseBlock('Read', { file_path: '/src/a.ts' });
      expect(shortOutput.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(shortOutput, false)).toBe('waiting_approval');
    });

    it('waiting_approval: empty output', () => {
      expect(validateAgentOutput('', false)).toBe('waiting_approval');
    });

    it('waiting_approval: skipTests always overrides', () => {
      const goodOutput =
        'Starting...\n' +
        formatToolUseBlock('Read', { file_path: '/src/index.ts' }) +
        formatToolUseBlock('Edit', { file_path: '/src/index.ts' }) +
        'Done implementing. '.repeat(15);
      expect(goodOutput.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(goodOutput, true)).toBe('waiting_approval');
    });

    it('boundary: exactly MIN_OUTPUT_LENGTH chars with tool is verified', () => {
      // Threshold is inclusive: exactly MIN_OUTPUT_LENGTH trimmed chars passes.
      const tool = formatToolUseBlock('Read');
      const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - tool.trim().length);
      const output = tool + padding;
      expect(output.trim().length).toBeGreaterThanOrEqual(MIN_OUTPUT_LENGTH);

      expect(validateAgentOutput(output, false)).toBe('verified');
    });

    it('boundary: MIN_OUTPUT_LENGTH - 1 chars with tool is waiting_approval', () => {
      const marker = `${TOOL_MARKER} Read\n`;
      const padding = 'x'.repeat(MIN_OUTPUT_LENGTH - 1 - marker.length);
      const output = marker + padding;
      expect(output.trim().length).toBe(MIN_OUTPUT_LENGTH - 1);

      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });
  });

  // End-to-end shaped fixtures modeled on real provider transcripts.
  describe('realistic provider scenarios', () => {
    it('Claude SDK agent with multiple tools → verified', () => {
      let output = "I'll implement the feature.\n\n";
      output += formatToolUseBlock('Read', { file_path: '/src/components/App.tsx' });
      output += 'I see the component. Let me update it.\n\n';
      output += formatToolUseBlock('Edit', {
        file_path: '/src/components/App.tsx',
        old_string: 'const App = () => {',
        new_string: 'const App: React.FC = () => {',
      });
      output += 'Done. The component is now typed correctly.\n';

      expect(validateAgentOutput(output, false)).toBe('verified');
    });

    it('Cursor CLI quick exit (no tools) → waiting_approval', () => {
      const output = 'Task received. Processing...\nResult: completed successfully.';
      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });

    it('Codex CLI with brief acknowledgment → waiting_approval', () => {
      const output = 'Understood the task. Starting implementation.\nDone.';
      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });

    it('Agent that only reads but makes no edits (single Read tool, short output) → waiting_approval', () => {
      const output = formatToolUseBlock('Read', { file_path: '/src/index.ts' }) + 'File read.';
      expect(output.trim().length).toBeLessThan(MIN_OUTPUT_LENGTH);
      expect(validateAgentOutput(output, false)).toBe('waiting_approval');
    });

    it('Agent with extensive tool usage and explanation → verified', () => {
      let output = 'Analyzing the codebase for the authentication feature.\n\n';
      for (let i = 0; i < 5; i++) {
        output += formatToolUseBlock('Read', { file_path: `/src/auth/handler${i}.ts` });
        output += `Found handler ${i}. `;
      }
      output += formatToolUseBlock('Edit', {
        file_path: '/src/auth/handler0.ts',
        old_string: 'function login() {}',
        new_string: 'async function login(creds: Credentials) { ... }',
      });
      output += 'Implementation complete with all authentication changes applied.\n';

      expect(validateAgentOutput(output, false)).toBe('verified');
    });
  });
});
|
||||
@@ -211,7 +211,14 @@ describe('execution-service.ts', () => {
|
||||
});
|
||||
|
||||
// Default mocks for secureFs
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('Agent output content');
|
||||
// Include tool usage markers to simulate meaningful agent output.
|
||||
// The execution service checks for '🔧 Tool:' markers and minimum
|
||||
// output length to determine if the agent did real work.
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(
|
||||
'Starting implementation...\n\n🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\n\n' +
|
||||
'🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts", "old_string": "foo", "new_string": "bar"}\n\n' +
|
||||
'Implementation complete. Updated the code as requested.'
|
||||
);
|
||||
vi.mocked(secureFs.access).mockResolvedValue(undefined);
|
||||
|
||||
// Re-setup platform mocks
|
||||
@@ -1433,4 +1440,439 @@ describe('execution-service.ts', () => {
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('executeFeature - agent output validation', () => {
|
||||
// Helper to generate realistic agent output with tool markers
|
||||
const makeAgentOutput = (toolCount: number, extraText = ''): string => {
|
||||
let output = 'Starting implementation...\n\n';
|
||||
for (let i = 0; i < toolCount; i++) {
|
||||
output += `🔧 Tool: Edit\nInput: {"file_path": "/src/file${i}.ts", "old_string": "old${i}", "new_string": "new${i}"}\n\n`;
|
||||
}
|
||||
output += `Implementation complete. ${extraText}`;
|
||||
return output;
|
||||
};
|
||||
|
||||
const createServiceWithMocks = () => {
|
||||
return new ExecutionService(
|
||||
mockEventBus,
|
||||
mockConcurrencyManager,
|
||||
mockWorktreeResolver,
|
||||
mockSettingsService,
|
||||
mockRunAgentFn,
|
||||
mockExecutePipelineFn,
|
||||
mockUpdateFeatureStatusFn,
|
||||
mockLoadFeatureFn,
|
||||
mockGetPlanningPromptPrefixFn,
|
||||
mockSaveFeatureSummaryFn,
|
||||
mockRecordLearningsFn,
|
||||
mockContextExistsFn,
|
||||
mockResumeFeatureFn,
|
||||
mockTrackFailureFn,
|
||||
mockSignalPauseFn,
|
||||
mockRecordSuccessFn,
|
||||
mockSaveExecutionStateFn,
|
||||
mockLoadContextFilesFn
|
||||
);
|
||||
};
|
||||
|
||||
it('sets verified when agent output has tool usage and sufficient length', async () => {
|
||||
const output = makeAgentOutput(3, 'Updated authentication module with new login flow.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
await service.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output is empty', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output has no tool usage markers', async () => {
|
||||
// Long output but no tool markers - agent printed text but didn't use tools
|
||||
const longOutputNoTools = 'I analyzed the codebase and found several issues. '.repeat(20);
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(longOutputNoTools);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output has tool markers but is too short', async () => {
|
||||
// Has a tool marker but total output is under 200 chars
|
||||
const shortWithTool = '🔧 Tool: Read\nInput: {"file_path": "/src/index.ts"}\nDone.';
|
||||
expect(shortWithTool.trim().length).toBeLessThan(200);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(shortWithTool);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output file is missing (ENOENT)', async () => {
|
||||
vi.mocked(secureFs.readFile).mockRejectedValue(new Error('ENOENT'));
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when agent output is only whitespace', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(' \n\n\t \n ');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets verified when output is exactly at the 200 char threshold with tool usage', async () => {
|
||||
// Create output that's exactly 200 chars trimmed with tool markers
|
||||
const toolMarker = '🔧 Tool: Edit\nInput: {"file_path": "/src/index.ts"}\n';
|
||||
const padding = 'x'.repeat(200 - toolMarker.length);
|
||||
const output = toolMarker + padding;
|
||||
expect(output.trim().length).toBeGreaterThanOrEqual(200);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('sets waiting_approval when output is 199 chars with tool usage (below threshold)', async () => {
|
||||
const toolMarker = '🔧 Tool: Read\n';
|
||||
const padding = 'x'.repeat(199 - toolMarker.length);
|
||||
const output = toolMarker + padding;
|
||||
expect(output.trim().length).toBe(199);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('skipTests always takes priority over output validation', async () => {
|
||||
// Meaningful output with tool usage - would normally be 'verified'
|
||||
const output = makeAgentOutput(5, 'All changes applied successfully.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
|
||||
const svc = createServiceWithMocks();
|
||||
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// skipTests=true always means waiting_approval regardless of output quality
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('skipTests with empty output still results in waiting_approval', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
mockLoadFeatureFn = vi.fn().mockResolvedValue({ ...testFeature, skipTests: true });
|
||||
const svc = createServiceWithMocks();
|
||||
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('still records success even when output validation fails', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// recordSuccess should still be called - the agent ran without errors
|
||||
expect(mockRecordSuccessFn).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('still extracts summary when output has content but no tool markers', async () => {
|
||||
const outputNoTools = 'A '.repeat(150); // > 200 chars but no tool markers
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(outputNoTools);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Summary extraction still runs even though status is waiting_approval
|
||||
expect(extractSummary).toHaveBeenCalledWith(outputNoTools);
|
||||
expect(mockSaveFeatureSummaryFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'Test summary'
|
||||
);
|
||||
});
|
||||
|
||||
it('emits feature_complete with passes=true even when output validation routes to waiting_approval', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1', false, true);
|
||||
|
||||
// The agent ran without error - it's still a "pass" from the execution perspective
|
||||
expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
|
||||
'auto_mode_feature_complete',
|
||||
expect.objectContaining({ passes: true })
|
||||
);
|
||||
});
|
||||
|
||||
it('handles realistic Cursor CLI output that exits quickly', async () => {
|
||||
// Simulates a Cursor CLI that prints a brief message and exits
|
||||
const cursorQuickExit = 'Task received. Processing...\nResult: completed successfully.';
|
||||
expect(cursorQuickExit.includes('🔧 Tool:')).toBe(false);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(cursorQuickExit);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// No tool usage = waiting_approval
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('handles realistic Claude SDK output with multiple tool uses', async () => {
|
||||
// Simulates a Claude SDK agent that does real work
|
||||
const claudeOutput =
|
||||
"I'll implement the requested feature.\n\n" +
|
||||
'🔧 Tool: Read\nInput: {"file_path": "/src/components/App.tsx"}\n\n' +
|
||||
'I can see the existing component structure. Let me modify it.\n\n' +
|
||||
'🔧 Tool: Edit\nInput: {"file_path": "/src/components/App.tsx", "old_string": "const App = () => {", "new_string": "const App: React.FC = () => {"}\n\n' +
|
||||
'🔧 Tool: Write\nInput: {"file_path": "/src/components/NewFeature.tsx"}\n\n' +
|
||||
"I've created the new component and updated the existing one. The feature is now implemented with proper TypeScript types.";
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(claudeOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Real work = verified
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('reads agent output from the correct path with utf-8 encoding', async () => {
|
||||
const output = makeAgentOutput(2, 'Done with changes.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Verify readFile was called with the correct path derived from getFeatureDir
|
||||
expect(secureFs.readFile).toHaveBeenCalledWith(
|
||||
'/test/project/.automaker/features/feature-1/agent-output.md',
|
||||
'utf-8'
|
||||
);
|
||||
});
|
||||
|
||||
it('completion message includes auto-verified when status is verified', async () => {
|
||||
const output = makeAgentOutput(3, 'All changes applied.');
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(output);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1', false, true);
|
||||
|
||||
expect(mockEventBus.emitAutoModeEvent).toHaveBeenCalledWith(
|
||||
'auto_mode_feature_complete',
|
||||
expect.objectContaining({
|
||||
message: expect.stringContaining('auto-verified'),
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it('completion message does NOT include auto-verified when status is waiting_approval', async () => {
|
||||
// Empty output → waiting_approval
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1', false, true);
|
||||
|
||||
const completeCall = vi
|
||||
.mocked(mockEventBus.emitAutoModeEvent)
|
||||
.mock.calls.find((call) => call[0] === 'auto_mode_feature_complete');
|
||||
expect(completeCall).toBeDefined();
|
||||
expect((completeCall![1] as { message: string }).message).not.toContain('auto-verified');
|
||||
});
|
||||
|
||||
it('uses same agentOutput for both status determination and summary extraction', async () => {
|
||||
// Specific output that is long enough with tool markers (verified path)
|
||||
// AND has content for summary extraction
|
||||
const specificOutput =
|
||||
'🔧 Tool: Read\nReading file...\n🔧 Tool: Edit\nEditing file...\n' +
|
||||
'The implementation is complete. Here is a detailed description of what was done. '.repeat(
|
||||
3
|
||||
);
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(specificOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Status should be verified (has tools + long enough)
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
// extractSummary should receive the exact same output
|
||||
expect(extractSummary).toHaveBeenCalledWith(specificOutput);
|
||||
// recordLearnings should also receive the same output
|
||||
expect(mockRecordLearningsFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
testFeature,
|
||||
specificOutput
|
||||
);
|
||||
});
|
||||
|
||||
it('does not call recordMemoryUsage when output is empty and memoryFiles is empty', async () => {
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue('');
|
||||
const { recordMemoryUsage } = await import('@automaker/utils');
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// With empty output and empty memoryFiles, recordMemoryUsage should not be called
|
||||
expect(recordMemoryUsage).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('handles output with special unicode characters correctly', async () => {
|
||||
// Output with various unicode but includes tool markers
|
||||
const unicodeOutput =
|
||||
'🔧 Tool: Read\n' +
|
||||
'🔧 Tool: Edit\n' +
|
||||
'Añadiendo función de búsqueda con caracteres especiales: ñ, ü, ö, é, 日本語テスト. ' +
|
||||
'Die Änderungen wurden erfolgreich implementiert. '.repeat(3);
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(unicodeOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Should still detect tool markers and sufficient length
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'verified'
|
||||
);
|
||||
});
|
||||
|
||||
it('treats output with only newlines and spaces around tool marker as insufficient', async () => {
|
||||
// Has tool marker but surrounded by whitespace, total trimmed < 200
|
||||
const sparseOutput = '\n\n 🔧 Tool: Read \n\n';
|
||||
expect(sparseOutput.trim().length).toBeLessThan(200);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(sparseOutput);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('detects tool marker substring correctly (partial match like "🔧 Tools:" does not count)', async () => {
|
||||
// Output with a similar but not exact marker - "🔧 Tools:" instead of "🔧 Tool:"
|
||||
const wrongMarker = '🔧 Tools: Read\n🔧 Tools: Edit\n' + 'Implementation done. '.repeat(20);
|
||||
expect(wrongMarker.includes('🔧 Tool:')).toBe(false);
|
||||
|
||||
vi.mocked(secureFs.readFile).mockResolvedValue(wrongMarker);
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// "🔧 Tools:" is not the same as "🔧 Tool:" - should be waiting_approval
|
||||
expect(mockUpdateFeatureStatusFn).toHaveBeenCalledWith(
|
||||
'/test/project',
|
||||
'feature-1',
|
||||
'waiting_approval'
|
||||
);
|
||||
});
|
||||
|
||||
it('pipeline merge_conflict status short-circuits before output validation', async () => {
|
||||
// Set up pipeline that results in merge_conflict
|
||||
vi.mocked(pipelineService.getPipelineConfig).mockResolvedValue({
|
||||
version: 1,
|
||||
steps: [{ id: 'step-1', name: 'Step 1', order: 1, instructions: 'Do step 1' }] as any,
|
||||
});
|
||||
|
||||
// After pipeline, loadFeature returns merge_conflict status
|
||||
let loadCallCount = 0;
|
||||
mockLoadFeatureFn = vi.fn().mockImplementation(() => {
|
||||
loadCallCount++;
|
||||
if (loadCallCount === 1) return testFeature; // initial load
|
||||
// All subsequent loads (task check + pipeline refresh) return merge_conflict
|
||||
return { ...testFeature, status: 'merge_conflict' };
|
||||
});
|
||||
|
||||
const svc = createServiceWithMocks();
|
||||
await svc.executeFeature('/test/project', 'feature-1');
|
||||
|
||||
// Should NOT have called updateFeatureStatusFn with 'verified' or 'waiting_approval'
|
||||
// because pipeline merge_conflict short-circuits the method
|
||||
const statusCalls = vi
|
||||
.mocked(mockUpdateFeatureStatusFn)
|
||||
.mock.calls.filter((call) => call[2] === 'verified' || call[2] === 'waiting_approval');
|
||||
// The only non-in_progress status call should be absent since merge_conflict returns early
|
||||
expect(statusCalls.length).toBe(0);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user