feat(gemini-cli): upgrade to native structured output support (#1437)

This commit is contained in:
Ben Vargas
2025-11-24 10:56:18 -07:00
committed by GitHub
parent c1df63d722
commit 783398ecdf
17 changed files with 688 additions and 1329 deletions

View File

@@ -3,35 +3,38 @@
*
* Implementation for interacting with Gemini models via Gemini CLI
* using the ai-sdk-provider-gemini-cli package.
*
* As of v1.4.0, the SDK provides native structured output support via:
* - supportsStructuredOutputs = true
* - defaultObjectGenerationMode = 'json'
* - responseJsonSchema passed directly to Gemini API
*
* This eliminates the need for JSON extraction workarounds.
* System messages are automatically handled by the SDK's mapPromptToGeminiFormat
* which extracts them to Gemini's systemInstruction field.
*/
import { generateObject, generateText, streamText } from 'ai';
import { parse } from 'jsonc-parser';
import { BaseAIProvider } from './base-provider.js';
import { log } from '../../scripts/modules/utils.js';
import { createGeminiProvider } from 'ai-sdk-provider-gemini-cli';
import { BaseAIProvider } from './base-provider.js';
export class GeminiCliProvider extends BaseAIProvider {
constructor() {
super();
this.name = 'Gemini CLI';
// Gemini CLI requires explicit JSON schema mode
this.needsExplicitJsonSchema = true;
// Gemini CLI does not support temperature parameter
this.supportsTemperature = false;
}
/**
* Override validateAuth to handle Gemini CLI authentication options
* Override validateAuth to handle Gemini CLI authentication options.
* Gemini CLI is designed to use pre-configured OAuth authentication.
* Users choose gemini-cli specifically to leverage their existing
* gemini auth login credentials, not to use API keys.
* @param {object} params - Parameters to validate
*/
validateAuth(params) {
// Gemini CLI is designed to use pre-configured OAuth authentication
// Users choose gemini-cli specifically to leverage their existing
// gemini auth login credentials, not to use API keys.
// We support API keys for compatibility, but the expected usage
// is through CLI authentication (no API key required).
// No validation needed - the SDK will handle auth internally
// via OAuth (primary) or API key (optional fallback)
}
/**
@@ -44,7 +47,7 @@ export class GeminiCliProvider extends BaseAIProvider {
*/
async getClient(params) {
try {
// Primary use case: Use existing gemini CLI authentication
// Primary use case: Use existing gemini CLI authentication via OAuth
// Secondary use case: Direct API key (for compatibility)
let authOptions = {};
@@ -74,577 +77,18 @@ export class GeminiCliProvider extends BaseAIProvider {
}
/**
* Extracts system messages from the messages array and returns them separately.
* This is needed because ai-sdk-provider-gemini-cli expects system prompts as a separate parameter.
* @param {Array} messages - Array of message objects
* @param {Object} options - Options for system prompt enhancement
* @param {boolean} options.enforceJsonOutput - Whether to add JSON enforcement to system prompt
* @returns {Object} - {systemPrompt: string|undefined, messages: Array}
* Returns the name of the API key environment variable.
* @returns {string} API key environment variable name
*/
_extractSystemMessage(messages, options = {}) {
if (!messages || !Array.isArray(messages)) {
return { systemPrompt: undefined, messages: messages || [] };
}
const systemMessages = messages.filter((msg) => msg.role === 'system');
const nonSystemMessages = messages.filter((msg) => msg.role !== 'system');
// Combine multiple system messages if present
let systemPrompt =
systemMessages.length > 0
? systemMessages.map((msg) => msg.content).join('\n\n')
: undefined;
// Add Gemini CLI specific JSON enforcement if requested
if (options.enforceJsonOutput) {
const jsonEnforcement = this._getJsonEnforcementPrompt();
systemPrompt = systemPrompt
? `${systemPrompt}\n\n${jsonEnforcement}`
: jsonEnforcement;
}
return { systemPrompt, messages: nonSystemMessages };
}
/**
* Gets a Gemini CLI specific system prompt to enforce strict JSON output
* @returns {string} JSON enforcement system prompt
*/
_getJsonEnforcementPrompt() {
return `CRITICAL: You MUST respond with ONLY valid JSON. Do not include any explanatory text, markdown formatting, code block markers, or conversational phrases like "Here is" or "Of course". Your entire response must be parseable JSON that starts with { or [ and ends with } or ]. No exceptions.`;
}
/**
* Checks if a string is valid JSON
* @param {string} text - Text to validate
* @returns {boolean} True if valid JSON
*/
_isValidJson(text) {
if (!text || typeof text !== 'string') {
return false;
}
try {
JSON.parse(text.trim());
return true;
} catch {
return false;
}
}
/**
* Detects if the user prompt is requesting JSON output
* @param {Array} messages - Array of message objects
* @returns {boolean} True if JSON output is likely expected
*/
_detectJsonRequest(messages) {
const userMessages = messages.filter((msg) => msg.role === 'user');
const combinedText = userMessages
.map((msg) => msg.content)
.join(' ')
.toLowerCase();
// Look for indicators that JSON output is expected
const jsonIndicators = [
'json',
'respond only with',
'return only',
'output only',
'format:',
'structure:',
'schema:',
'{"',
'[{',
'subtasks',
'array',
'object'
];
return jsonIndicators.some((indicator) => combinedText.includes(indicator));
}
/**
* Simplifies complex prompts for gemini-cli to improve JSON output compliance
* @param {Array} messages - Array of message objects
* @returns {Array} Simplified messages array
*/
_simplifyJsonPrompts(messages) {
// First, check if this is an expand-task operation by looking at the system message
const systemMsg = messages.find((m) => m.role === 'system');
const isExpandTask =
systemMsg &&
systemMsg.content.includes(
'You are an AI assistant helping with task breakdown. Generate exactly'
);
if (!isExpandTask) {
return messages; // Not an expand task, return unchanged
}
// Extract subtask count from system message
const subtaskCountMatch = systemMsg.content.match(
/Generate exactly (\d+) subtasks/
);
const subtaskCount = subtaskCountMatch ? subtaskCountMatch[1] : '10';
log(
'debug',
`${this.name} detected expand-task operation, simplifying for ${subtaskCount} subtasks`
);
return messages.map((msg) => {
if (msg.role !== 'user') {
return msg;
}
// For expand-task user messages, create a much simpler, more direct prompt
// that doesn't depend on specific task content
const simplifiedPrompt = `Generate exactly ${subtaskCount} subtasks in the following JSON format.
CRITICAL INSTRUCTION: You must respond with ONLY valid JSON. No explanatory text, no "Here is", no "Of course", no markdown - just the JSON object.
Required JSON structure:
{
"subtasks": [
{
"id": 1,
"title": "Specific actionable task title",
"description": "Clear task description",
"dependencies": [],
"details": "Implementation details and guidance",
"testStrategy": "Testing approach"
}
]
}
Generate ${subtaskCount} subtasks based on the original task context. Return ONLY the JSON object.`;
log(
'debug',
`${this.name} simplified user prompt for better JSON compliance`
);
return { ...msg, content: simplifiedPrompt };
});
}
/**
* Extract JSON from Gemini's response using a tolerant parser.
*
* Optimized approach that progressively tries different parsing strategies:
* 1. Direct parsing after cleanup
* 2. Smart boundary detection with single-pass analysis
* 3. Limited character-by-character fallback for edge cases
*
* @param {string} text - Raw text which may contain JSON
* @returns {string} A valid JSON string if extraction succeeds, otherwise the original text
*/
extractJson(text) {
if (!text || typeof text !== 'string') {
return text;
}
let content = text.trim();
// Early exit for very short content
if (content.length < 2) {
return text;
}
// Strip common wrappers in a single pass
content = content
// Remove markdown fences
.replace(/^.*?```(?:json)?\s*([\s\S]*?)\s*```.*$/i, '$1')
// Remove variable declarations
.replace(/^\s*(?:const|let|var)\s+\w+\s*=\s*([\s\S]*?)(?:;|\s*)$/i, '$1')
// Remove common prefixes
.replace(/^(?:Here's|The)\s+(?:the\s+)?JSON.*?[:]\s*/i, '')
.trim();
// Find the first JSON-like structure
const firstObj = content.indexOf('{');
const firstArr = content.indexOf('[');
if (firstObj === -1 && firstArr === -1) {
return text;
}
const start =
firstArr === -1
? firstObj
: firstObj === -1
? firstArr
: Math.min(firstObj, firstArr);
content = content.slice(start);
// Optimized parsing function with error collection
const tryParse = (value) => {
if (!value || value.length < 2) return undefined;
const errors = [];
try {
const result = parse(value, errors, {
allowTrailingComma: true,
allowEmptyContent: false
});
if (errors.length === 0 && result !== undefined) {
return JSON.stringify(result, null, 2);
}
} catch {
// Parsing failed completely
}
return undefined;
};
// Try parsing the full content first
const fullParse = tryParse(content);
if (fullParse !== undefined) {
return fullParse;
}
// Smart boundary detection - single pass with optimizations
const openChar = content[0];
const closeChar = openChar === '{' ? '}' : ']';
let depth = 0;
let inString = false;
let escapeNext = false;
let lastValidEnd = -1;
// Single-pass boundary detection with early termination
for (let i = 0; i < content.length && i < 10000; i++) {
// Limit scan for performance
const char = content[i];
if (escapeNext) {
escapeNext = false;
continue;
}
if (char === '\\') {
escapeNext = true;
continue;
}
if (char === '"') {
inString = !inString;
continue;
}
if (inString) continue;
if (char === openChar) {
depth++;
} else if (char === closeChar) {
depth--;
if (depth === 0) {
lastValidEnd = i + 1;
// Try parsing immediately on first valid boundary
const candidate = content.slice(0, lastValidEnd);
const parsed = tryParse(candidate);
if (parsed !== undefined) {
return parsed;
}
}
}
}
// If we found valid boundaries but parsing failed, try limited fallback
if (lastValidEnd > 0) {
const maxAttempts = Math.min(5, Math.floor(lastValidEnd / 100)); // Limit attempts
for (let i = 0; i < maxAttempts; i++) {
const testEnd = Math.max(
lastValidEnd - i * 50,
Math.floor(lastValidEnd * 0.8)
);
const candidate = content.slice(0, testEnd);
const parsed = tryParse(candidate);
if (parsed !== undefined) {
return parsed;
}
}
}
return text;
}
/**
* Generates text using Gemini CLI model
* Overrides base implementation to properly handle system messages and enforce JSON output when needed
*/
async generateText(params) {
try {
this.validateParams(params);
this.validateMessages(params.messages);
log(
'debug',
`Generating ${this.name} text with model: ${params.modelId}`
);
// Detect if JSON output is expected and enforce it for better gemini-cli compatibility
const enforceJsonOutput = this._detectJsonRequest(params.messages);
// Debug logging to understand what's happening
log('debug', `${this.name} JSON detection analysis:`, {
enforceJsonOutput,
messageCount: params.messages.length,
messages: params.messages.map((msg) => ({
role: msg.role,
contentPreview: msg.content
? msg.content.substring(0, 200) + '...'
: 'empty'
}))
});
if (enforceJsonOutput) {
log(
'debug',
`${this.name} detected JSON request - applying strict JSON enforcement system prompt`
);
}
// For gemini-cli, simplify complex prompts before processing
let processedMessages = params.messages;
if (enforceJsonOutput) {
processedMessages = this._simplifyJsonPrompts(params.messages);
}
// Extract system messages for separate handling with optional JSON enforcement
const { systemPrompt, messages } = this._extractSystemMessage(
processedMessages,
{ enforceJsonOutput }
);
// Debug the final system prompt being sent
log('debug', `${this.name} final system prompt:`, {
systemPromptLength: systemPrompt ? systemPrompt.length : 0,
systemPromptPreview: systemPrompt
? systemPrompt.substring(0, 300) + '...'
: 'none',
finalMessageCount: messages.length
});
const client = await this.getClient(params);
const result = await generateText({
model: client(params.modelId),
system: systemPrompt,
messages: messages,
maxOutputTokens: params.maxTokens,
temperature: params.temperature
});
// If we detected a JSON request and gemini-cli returned conversational text,
// attempt to extract JSON from the response
let finalText = result.text;
if (enforceJsonOutput && result.text && !this._isValidJson(result.text)) {
log(
'debug',
`${this.name} response appears conversational, attempting JSON extraction`
);
// Log first 1000 chars of the response to see what Gemini actually returned
log('debug', `${this.name} raw response preview:`, {
responseLength: result.text.length,
responseStart: result.text.substring(0, 1000)
});
const extractedJson = this.extractJson(result.text);
if (this._isValidJson(extractedJson)) {
log(
'debug',
`${this.name} successfully extracted JSON from conversational response`
);
finalText = extractedJson;
} else {
log(
'debug',
`${this.name} JSON extraction failed, returning original response`
);
// Log what extraction returned to debug why it failed
log('debug', `${this.name} extraction result preview:`, {
extractedLength: extractedJson ? extractedJson.length : 0,
extractedStart: extractedJson
? extractedJson.substring(0, 500)
: 'null'
});
}
}
log(
'debug',
`${this.name} generateText completed successfully for model: ${params.modelId}`
);
return {
text: finalText,
usage: {
inputTokens: result.usage?.promptTokens,
outputTokens: result.usage?.completionTokens,
totalTokens: result.usage?.totalTokens
}
};
} catch (error) {
this.handleError('text generation', error);
}
}
/**
* Streams text using Gemini CLI model
* Overrides base implementation to properly handle system messages and enforce JSON output when needed
*/
async streamText(params) {
try {
this.validateParams(params);
this.validateMessages(params.messages);
log('debug', `Streaming ${this.name} text with model: ${params.modelId}`);
// Detect if JSON output is expected and enforce it for better gemini-cli compatibility
const enforceJsonOutput = this._detectJsonRequest(params.messages);
// Debug logging to understand what's happening
log('debug', `${this.name} JSON detection analysis:`, {
enforceJsonOutput,
messageCount: params.messages.length,
messages: params.messages.map((msg) => ({
role: msg.role,
contentPreview: msg.content
? msg.content.substring(0, 200) + '...'
: 'empty'
}))
});
if (enforceJsonOutput) {
log(
'debug',
`${this.name} detected JSON request - applying strict JSON enforcement system prompt`
);
}
// Extract system messages for separate handling with optional JSON enforcement
const { systemPrompt, messages } = this._extractSystemMessage(
params.messages,
{ enforceJsonOutput }
);
const client = await this.getClient(params);
const stream = await streamText({
model: client(params.modelId),
system: systemPrompt,
messages: messages,
maxOutputTokens: params.maxTokens,
temperature: params.temperature
});
log(
'debug',
`${this.name} streamText initiated successfully for model: ${params.modelId}`
);
// Note: For streaming, we can't intercept and modify the response in real-time
// The JSON extraction would need to happen on the consuming side
return stream;
} catch (error) {
this.handleError('text streaming', error);
}
}
/**
* Generates a structured object using Gemini CLI model
* Overrides base implementation to handle Gemini-specific JSON formatting issues and system messages
*/
async generateObject(params) {
try {
// First try the standard generateObject from base class
return await super.generateObject(params);
} catch (error) {
// If it's a JSON parsing error, try to extract and parse JSON manually
if (error.message?.includes('JSON') || error.message?.includes('parse')) {
log(
'debug',
`Gemini CLI generateObject failed with parsing error, attempting manual extraction`
);
try {
// Validate params first
this.validateParams(params);
this.validateMessages(params.messages);
if (!params.schema) {
throw new Error('Schema is required for object generation');
}
if (!params.objectName) {
throw new Error('Object name is required for object generation');
}
// Extract system messages for separate handling with JSON enforcement
const { systemPrompt, messages } = this._extractSystemMessage(
params.messages,
{ enforceJsonOutput: true }
);
// Call generateObject directly with our client
const client = await this.getClient(params);
const result = await generateObject({
model: client(params.modelId),
system: systemPrompt,
messages: messages,
schema: params.schema,
mode: this.needsExplicitJsonSchema ? 'json' : 'auto',
maxOutputTokens: params.maxTokens,
temperature: params.temperature
});
// If we get rawResponse text, try to extract JSON from it
if (result.rawResponse?.text && !result.object) {
const extractedJson = this.extractJson(result.rawResponse.text);
try {
result.object = JSON.parse(extractedJson);
} catch (parseError) {
log(
'error',
`Failed to parse extracted JSON: ${parseError.message}`
);
log(
'debug',
`Extracted JSON: ${extractedJson.substring(0, 500)}...`
);
throw new Error(
`Gemini CLI returned invalid JSON that could not be parsed: ${parseError.message}`
);
}
}
return {
object: result.object,
usage: {
inputTokens: result.usage?.promptTokens,
outputTokens: result.usage?.completionTokens,
totalTokens: result.usage?.totalTokens
}
};
} catch (retryError) {
log(
'error',
`Gemini CLI manual JSON extraction failed: ${retryError.message}`
);
// Re-throw the original error with more context
throw new Error(
`${this.name} failed to generate valid JSON object: ${error.message}`
);
}
}
// For non-parsing errors, just re-throw
throw error;
}
}
getRequiredApiKeyName() {
return 'GEMINI_API_KEY';
}
/**
* Indicates whether an API key is required.
* Gemini CLI primarily uses OAuth, so API key is optional.
* @returns {boolean} False - API key is not required
*/
isRequiredApiKey() {
return false;
}