Fix agent output validation to prevent false verified status (#807)

* Changes from fix/cursor-fix

* feat: Enhance provider error messages with diagnostic context, address test failure, fix port change, move playwright tests to different port

* Update apps/ui/src/components/views/board-view/dialogs/add-feature-dialog.tsx

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* ci: Update test server port from 3008 to 3108 and add environment configuration

* fix: Correct typo in health endpoint URL and standardize port env vars

---------

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
gsxdsm
2026-02-24 20:18:40 -08:00
committed by GitHub
parent 0330c70261
commit 51e9a23ba1
36 changed files with 1610 additions and 104 deletions

View File

@@ -389,9 +389,14 @@ export class CopilotProvider extends CliProvider {
case 'session.error': {
const errorEvent = sdkEvent as SdkSessionErrorEvent;
const enrichedError =
errorEvent.data.message ||
(errorEvent.data.code
? `Copilot agent error (code: ${errorEvent.data.code})`
: 'Copilot agent error');
return {
type: 'error',
error: errorEvent.data.message || 'Unknown error',
error: enrichedError,
};
}

View File

@@ -562,10 +562,14 @@ export class CursorProvider extends CliProvider {
const resultEvent = cursorEvent as CursorResultEvent;
if (resultEvent.is_error) {
const errorText = resultEvent.error || resultEvent.result || '';
const enrichedError =
errorText ||
`Cursor agent failed (duration: ${resultEvent.duration_ms}ms, subtype: ${resultEvent.subtype}, session: ${resultEvent.session_id ?? 'none'})`;
return {
type: 'error',
session_id: resultEvent.session_id,
error: resultEvent.error || resultEvent.result || 'Unknown error',
error: enrichedError,
};
}

View File

@@ -381,10 +381,13 @@ export class GeminiProvider extends CliProvider {
const resultEvent = geminiEvent as GeminiResultEvent;
if (resultEvent.status === 'error') {
const enrichedError =
resultEvent.error ||
`Gemini agent failed (duration: ${resultEvent.stats?.duration_ms ?? 'unknown'}ms, session: ${resultEvent.session_id ?? 'none'})`;
return {
type: 'error',
session_id: resultEvent.session_id,
error: resultEvent.error || 'Unknown error',
error: enrichedError,
};
}
@@ -401,10 +404,12 @@ export class GeminiProvider extends CliProvider {
case 'error': {
const errorEvent = geminiEvent as GeminiResultEvent;
const enrichedError =
errorEvent.error || `Gemini agent failed (session: ${errorEvent.session_id ?? 'none'})`;
return {
type: 'error',
session_id: errorEvent.session_id,
error: errorEvent.error || 'Unknown error',
error: enrichedError,
};
}

View File

@@ -296,8 +296,28 @@ export class AgentExecutor {
}
}
} else if (msg.type === 'error') {
throw new Error(AgentExecutor.sanitizeProviderError(msg.error));
} else if (msg.type === 'result' && msg.subtype === 'success') scheduleWrite();
const sanitized = AgentExecutor.sanitizeProviderError(msg.error);
logger.error(
`[execute] Feature ${featureId} received error from provider. ` +
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
);
throw new Error(sanitized);
} else if (msg.type === 'result') {
if (msg.subtype === 'success') {
scheduleWrite();
} else if (msg.subtype?.startsWith('error')) {
// Non-success result subtypes from the SDK (error_max_turns, error_during_execution, etc.)
logger.error(
`[execute] Feature ${featureId} ended with error subtype: ${msg.subtype}. ` +
`session_id=${msg.session_id ?? 'none'}`
);
throw new Error(`Agent execution ended with: ${msg.subtype}`);
} else {
logger.warn(
`[execute] Feature ${featureId} received unhandled result subtype: ${msg.subtype}`
);
}
}
}
} finally {
clearInterval(streamHeartbeat);
@@ -447,16 +467,28 @@ export class AgentExecutor {
});
}
} else if (msg.type === 'error') {
// Clean the error: strip ANSI codes and redundant "Error: " prefix
const cleanedError =
(msg.error || `Error during task ${task.id}`)
.replace(/\x1b\[[0-9;]*m/g, '')
.replace(/^Error:\s*/i, '')
.trim() || `Error during task ${task.id}`;
throw new Error(cleanedError);
} else if (msg.type === 'result' && msg.subtype === 'success') {
taskOutput += msg.result || '';
responseText += msg.result || '';
const fallback = `Error during task ${task.id}`;
const sanitized = AgentExecutor.sanitizeProviderError(msg.error || fallback);
logger.error(
`[executeTasksLoop] Feature ${featureId} task ${task.id} received error from provider. ` +
`raw="${msg.error}", sanitized="${sanitized}", session_id=${msg.session_id ?? 'none'}`
);
throw new Error(sanitized);
} else if (msg.type === 'result') {
if (msg.subtype === 'success') {
taskOutput += msg.result || '';
responseText += msg.result || '';
} else if (msg.subtype?.startsWith('error')) {
logger.error(
`[executeTasksLoop] Feature ${featureId} task ${task.id} ended with error subtype: ${msg.subtype}. ` +
`session_id=${msg.session_id ?? 'none'}`
);
throw new Error(`Agent execution ended with: ${msg.subtype}`);
} else {
logger.warn(
`[executeTasksLoop] Feature ${featureId} task ${task.id} received unhandled result subtype: ${msg.subtype}`
);
}
}
}
if (!taskCompleteDetected)

View File

@@ -60,6 +60,12 @@ import type {
const logger = createLogger('ExecutionService');
/** Marker written by agent-executor for each tool invocation. */
const TOOL_USE_MARKER = '🔧 Tool:';
/** Minimum trimmed output length to consider agent work meaningful. */
const MIN_MEANINGFUL_OUTPUT_LENGTH = 200;
export class ExecutionService {
constructor(
private eventBus: TypedEventBus,
@@ -409,7 +415,41 @@ Please continue from where you left off and complete all remaining tasks. Use th
}
}
const finalStatus = feature.skipTests ? 'waiting_approval' : 'verified';
// Read agent output before determining final status.
// CLI-based providers (Cursor, Codex, etc.) may exit quickly without doing
// meaningful work. Check output to avoid prematurely marking as 'verified'.
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
let agentOutput = '';
try {
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
} catch {
/* */
}
// Determine if the agent did meaningful work by checking for tool usage
// indicators in the output. The agent executor writes "🔧 Tool:" markers
// each time a tool is invoked. No tool usage suggests the CLI exited
// without performing implementation work.
const hasToolUsage = agentOutput.includes(TOOL_USE_MARKER);
const isOutputTooShort = agentOutput.trim().length < MIN_MEANINGFUL_OUTPUT_LENGTH;
const agentDidWork = hasToolUsage && !isOutputTooShort;
let finalStatus: 'verified' | 'waiting_approval';
if (feature.skipTests) {
finalStatus = 'waiting_approval';
} else if (!agentDidWork) {
// Agent didn't produce meaningful output (e.g., CLI exited quickly).
// Route to waiting_approval so the user can review and re-run.
finalStatus = 'waiting_approval';
logger.warn(
`[executeFeature] Feature ${featureId}: agent produced insufficient output ` +
`(${agentOutput.trim().length}/${MIN_MEANINGFUL_OUTPUT_LENGTH} chars, toolUsage=${hasToolUsage}). ` +
`Setting status to waiting_approval instead of verified.`
);
} else {
finalStatus = 'verified';
}
await this.updateFeatureStatusFn(projectPath, featureId, finalStatus);
this.recordSuccessFn();
@@ -421,13 +461,6 @@ Please continue from where you left off and complete all remaining tasks. Use th
const hasIncompleteTasks = totalTasks > 0 && completedTasks < totalTasks;
try {
const outputPath = path.join(getFeatureDir(projectPath, featureId), 'agent-output.md');
let agentOutput = '';
try {
agentOutput = (await secureFs.readFile(outputPath, 'utf-8')) as string;
} catch {
/* */
}
if (agentOutput) {
const summary = extractSummary(agentOutput);
if (summary) await this.saveFeatureSummaryFn(projectPath, featureId, summary);