mirror of
https://github.com/AutoMaker-Org/automaker.git
synced 2026-03-17 10:03:08 +00:00
Fix agent output validation to prevent false verified status (#807)
* Changes from fix/cursor-fix * feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port * Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ci: Update test server port from 3008 to 3108 and add environment configuration * fix: Correct typo in health endpoint URL and standardize port env vars --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
@@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider {
|
||||
|
||||
case 'session.error': {
|
||||
const errorEvent = sdkEvent as SdkSessionErrorEvent;
|
||||
const enrichedError =
|
||||
errorEvent.data.message ||
|
||||
(errorEvent.data.code
|
||||
? `Copilot agent error (code: ${errorEvent.data.code})`
|
||||
: 'Copilot agent error');
|
||||
return {
|
||||
type: 'error',
|
||||
error: errorEvent.data.message || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider {
|
||||
const resultEvent = cursorEvent as CursorResultEvent;
|
||||
|
||||
if (resultEvent.is_error) {
|
||||
const errorText = resultEvent.error || resultEvent.result || '';
|
||||
const enrichedError =
|
||||
errorText ||
|
||||
`Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: resultEvent.session_id,
|
||||
error: resultEvent.error || resultEvent.result || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider {
|
||||
const resultEvent = geminiEvent as GeminiResultEvent;
|
||||
|
||||
if (resultEvent.status === 'error') {
|
||||
const enrichedError =
|
||||
resultEvent.error ||
|
||||
`Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: resultEvent.session_id,
|
||||
error: resultEvent.error || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider {
|
||||
|
||||
case 'error': {
|
||||
const errorEvent = geminiEvent as GeminiResultEvent;
|
||||
const enrichedError =
|
||||
errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 'none'})`;
|
||||
return {
|
||||
type: 'error',
|
||||
session_id: errorEvent.session_id,
|
||||
error: errorEvent.error || 'Unknown error',
|
||||
error: enrichedError,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -296,8 +296,28 @@ export class AgentExecutor {
|
||||
}
|
||||
}
|
||||
} else if (msg.type === 'error') {
|
||||
throw new Error(AgentExecutor.sanitizeProviderError(msg.error));
|
||||
} else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite();
|
||||
const sanitized = AgentExecutor.sanitizeProviderError(msg.error);
|
||||
logger.error(
|
||||
`[execute] Feature ${featureId} received error from provider. ` +
|
||||
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(sanitized);
|
||||
} else if (msg.type === 'result') {
|
||||
if (msg.subtype === 'success') {
|
||||
scheduleWrite();
|
||||
} else if (msg.subtype?.startsWith('error')) {
|
||||
// Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.)
|
||||
logger.error(
|
||||
`[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` +
|
||||
`session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(`Agent execution ended with: ${msg.subtype}`);
|
||||
} else {
|
||||
logger.warn(
|
||||
`[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
clearInterval(streamHeartbeat);
|
||||
@@ -447,16 +467,28 @@ export class AgentExecutor {
|
||||
});
|
||||
}
|
||||
} else if (msg.type === 'error') {
|
||||
// Clean the error: strip ANSI codes and redundant "Error: " prefix
|
||||
const cleanedError =
|
||||
(msg.error || `Error during task ${task.id}`)
|
||||
.replace(/\x1b\[[0-9;]*m/g, '')
|
||||
.replace(/^Error:\s*/i, '')
|
||||
.trim() || `Error during task ${task.id}`;
|
||||
throw new Error(cleanedError);
|
||||
} else if (msg.type === 'result' && msg.subtype === 'success') {
|
||||
taskOutput += msg.result || '';
|
||||
responseText += msg.result || '';
|
||||
const fallback = `Error during task ${task.id}`;
|
||||
const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback);
|
||||
logger.error(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` +
|
||||
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(sanitized);
|
||||
} else if (msg.type === 'result') {
|
||||
if (msg.subtype === 'success') {
|
||||
taskOutput += msg.result || '';
|
||||
responseText += msg.result || '';
|
||||
} else if (msg.subtype?.startsWith('error')) {
|
||||
logger.error(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` +
|
||||
`session_id=${msg.session_id ?? 'none'}`
|
||||
);
|
||||
throw new Error(`Agent execution ended with: ${msg.subtype}`);
|
||||
} else {
|
||||
logger.warn(
|
||||
`[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!taskCompleteDetected)
|
||||
|
||||
@@ -60,6 +60,12 @@ import type {
|
||||
|
||||
const logger = createLogger('ExecutionService');
|
||||
|
||||
/** Marker written by agent-executor for each tool invocation. */
|
||||
const TOOL_USE_MARKER = '🔧 Tool:';
|
||||
|
||||
/** Minimum trimmed output length to consider agent work meaningful. */
|
||||
const MIN_MEANINGFUL_OUTPUT_LENGTH = 200;
|
||||
|
||||
export class ExecutionService {
|
||||
constructor(
|
||||
private eventBus: TypedEventBus,
|
||||
@@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th
|
||||
}
|
||||
}
|
||||
|
||||
const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified';
|
||||
// Read agent output before determining final status.
|
||||
// CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing
|
||||
// meaningful work. Check output to avoid prematurely marking as 'verified'.
|
||||
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
|
||||
let agentOutput = '';
|
||||
try {
|
||||
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
|
||||
} catch {
|
||||
/* */
|
||||
}
|
||||
|
||||
// Determine if the agent did meaningful work by checking for tool usage
|
||||
// indicators in the output. The agent executor writes "🔧 Tool:" markers
|
||||
// each time a tool is invoked. No tool usage suggests the CLI exited
|
||||
// without performing implementation work.
|
||||
const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER);
|
||||
const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH;
|
||||
const agentDidWork = hasToolUsage && !isOutputTooShort;
|
||||
|
||||
let finalStatus: 'verified' | 'waiting_approval';
|
||||
if (feature.skipTests) {
|
||||
finalStatus = 'waiting_approval';
|
||||
} else if (!agentDidWork) {
|
||||
// Agent didn't produce meaningful output (e.g., CLI exited quickly).
|
||||
// Route to waiting_approval so the user can review and re-run.
|
||||
finalStatus = 'waiting_approval';
|
||||
logger.warn(
|
||||
`[executeFeature] Feature ${featureId}: agent produced insufficient output ` +
|
||||
`(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` +
|
||||
`Setting status to waiting_approval instead of verified.`
|
||||
);
|
||||
} else {
|
||||
finalStatus = 'verified';
|
||||
}
|
||||
|
||||
await this.updateFeatureStatusFn(projectPath, featureId, finalStatus);
|
||||
this.recordSuccessFn();
|
||||
|
||||
@@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. Use th
|
||||
const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks;
|
||||
|
||||
try {
|
||||
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
|
||||
let agentOutput = '';
|
||||
try {
|
||||
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
|
||||
} catch {
|
||||
/* */
|
||||
}
|
||||
if (agentOutput) {
|
||||
const summary = extractSummary(agentOutput);
|
||||
if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary);
|
||||
|
||||
Reference in New Issue
Block a user