feat: Add structured template metadata generation with OpenAI

- Implement OpenAI batch API integration for metadata generation
- Add metadata columns to database schema (metadata_json, metadata_generated_at)
- Create MetadataGenerator service with structured output schemas
- Create BatchProcessor for handling OpenAI batch jobs
- Add --generate-metadata flag to fetch-templates script
- Update template repository with metadata management methods
- Add OpenAI configuration to environment variables
- Include comprehensive tests for metadata generation
- Use gpt-4o-mini model with 50% cost savings via batch API

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-10 07:13:07 +00:00 · 2025-09-14 20:00:39 +02:00
parent d49416fc58
commit 6e24da722b
9 changed files with 1011 additions and 13 deletions
--- a/src/templates/batch-processor.ts
+++ b/src/templates/batch-processor.ts
@@ -0,0 +1,282 @@
+import * as fs from 'fs';
+import * as path from 'path';
+import OpenAI from 'openai';
+import { logger } from '../utils/logger';
+import { MetadataGenerator, MetadataRequest, MetadataResult } from './metadata-generator';
+
+/**
+ * Construction options for `BatchProcessor`.
+ */
+export interface BatchProcessorOptions {
+  /** OpenAI API key; passed to both the OpenAI client and the MetadataGenerator. */
+  apiKey: string;
+  /** Model name forwarded to MetadataGenerator; when omitted, MetadataGenerator applies its own default. */
+  model?: string;
+  /** Number of templates per batch job; BatchProcessor uses 100 when this is omitted or 0. */
+  batchSize?: number;
+  /** Directory for the temporary JSONL input files; BatchProcessor uses './temp' when omitted. */
+  outputDir?: string;
+}
+
+/**
+ * Shape of a batch job record.
+ *
+ * NOTE(review): the field names look like those of an OpenAI batch object, and
+ * the status values include the ones BatchProcessor checks while polling
+ * ('completed', 'failed', 'expired', 'cancelled') — confirm against the SDK
+ * types. Nothing in the visible code references this interface; the
+ * BatchProcessor methods are typed `any`.
+ */
+export interface BatchJob {
+  id: string;
+  status: 'validating' | 'in_progress' | 'finalizing' | 'completed' | 'failed' | 'expired' | 'cancelled';
+  // presumably epoch timestamps — TODO confirm the unit (seconds vs. milliseconds)
+  created_at: number;
+  completed_at?: number;
+  input_file_id: string;
+  // optional: BatchProcessor.retrieveResults throws when a job has no output file id
+  output_file_id?: string;
+  error?: any;
+}
+
+/**
+ * Generates template metadata through the OpenAI Batch API.
+ *
+ * Templates are split into fixed-size batches. Each batch is written to a
+ * local JSONL file, uploaded, submitted as a batch job, polled until it
+ * reaches a terminal status, and its output file is parsed into
+ * `MetadataResult`s. Batches are processed one after another.
+ */
+export class BatchProcessor {
+  private client: OpenAI;
+  private generator: MetadataGenerator;
+  private batchSize: number;
+  private outputDir: string;
+
+  /**
+   * @param options API key, optional model name, batch size (default 100) and
+   *   directory for temporary JSONL files (default './temp'). The directory is
+   *   created if it does not exist.
+   */
+  constructor(options: BatchProcessorOptions) {
+    this.client = new OpenAI({ apiKey: options.apiKey });
+    this.generator = new MetadataGenerator(options.apiKey, options.model);
+
+    // A negative size would make createBatches() loop forever and a size below
+    // one would produce empty batches, so anything that is not a finite number
+    // >= 1 falls back to the default. Fractions are rounded down.
+    const requestedSize = options.batchSize;
+    this.batchSize =
+      typeof requestedSize === 'number' && Number.isFinite(requestedSize) && requestedSize >= 1
+        ? Math.floor(requestedSize)
+        : 100;
+    this.outputDir = options.outputDir || './temp';
+
+    // Ensure output directory exists
+    if (!fs.existsSync(this.outputDir)) {
+      fs.mkdirSync(this.outputDir, { recursive: true });
+    }
+  }
+
+  /**
+   * Process templates in batches.
+   *
+   * A batch that fails is logged and skipped, so the returned map can contain
+   * fewer entries than `templates`.
+   *
+   * @param templates Requests to generate metadata for.
+   * @param progressCallback Optional; called before and after each batch with a
+   *   message, the number of templates handled so far and the total.
+   * @returns Results keyed by `templateId`.
+   */
+  async processTemplates(
+    templates: MetadataRequest[],
+    progressCallback?: (message: string, current: number, total: number) => void
+  ): Promise<Map<number, MetadataResult>> {
+    const results = new Map<number, MetadataResult>();
+    const batches = this.createBatches(templates);
+
+    logger.info(`Processing ${templates.length} templates in ${batches.length} batches`);
+
+    for (let i = 0; i < batches.length; i++) {
+      const batch = batches[i];
+      const batchNum = i + 1;
+
+      try {
+        progressCallback?.(`Processing batch ${batchNum}/${batches.length}`, i * this.batchSize, templates.length);
+
+        // Process this batch
+        const batchResults = await this.processBatch(batch, `batch_${batchNum}`);
+
+        // Merge results
+        for (const result of batchResults) {
+          results.set(result.templateId, result);
+        }
+
+        logger.info(`Completed batch ${batchNum}/${batches.length}: ${batchResults.length} results`);
+        // The last batch may be short, so clamp the running count to the total
+        progressCallback?.(`Completed batch ${batchNum}/${batches.length}`, Math.min((i + 1) * this.batchSize, templates.length), templates.length);
+      } catch (error) {
+        logger.error(`Error processing batch ${batchNum}:`, error);
+        // Continue with next batch
+      }
+    }
+
+    logger.info(`Batch processing complete: ${results.size} results`);
+    return results;
+  }
+
+  /**
+   * Process a single batch: write the JSONL file, upload it, run the batch job
+   * and parse its output.
+   *
+   * The local file and any files already created on the OpenAI side are removed
+   * whether the batch succeeds or fails, so a failure after the upload no
+   * longer leaves the uploaded input file behind.
+   *
+   * @throws Whatever the upload, job creation, monitoring or retrieval step throws.
+   */
+  private async processBatch(templates: MetadataRequest[], batchName: string): Promise<MetadataResult[]> {
+    // Create JSONL file
+    const inputFile = await this.createBatchFile(templates, batchName);
+    let inputFileId: string | undefined;
+    let outputFileId: string | undefined;
+
+    try {
+      // Upload file to OpenAI
+      const uploadedFile = await this.uploadFile(inputFile);
+      inputFileId = uploadedFile.id;
+
+      // Create batch job
+      const batchJob = await this.createBatchJob(uploadedFile.id);
+
+      // Monitor job until completion
+      const completedJob = await this.monitorBatchJob(batchJob.id);
+      outputFileId = completedJob.output_file_id ?? undefined;
+
+      // Retrieve and parse results
+      return await this.retrieveResults(completedJob);
+    } finally {
+      // cleanup() catches its own errors, so it cannot mask a failure from above
+      await this.cleanup(inputFile, inputFileId, outputFileId);
+    }
+  }
+
+  /**
+   * Split templates into consecutive slices of at most `batchSize` items.
+   */
+  private createBatches(templates: MetadataRequest[]): MetadataRequest[][] {
+    const batches: MetadataRequest[][] = [];
+
+    for (let i = 0; i < templates.length; i += this.batchSize) {
+      batches.push(templates.slice(i, i + this.batchSize));
+    }
+
+    return batches;
+  }
+
+  /**
+   * Write one JSON request per line to a timestamped `.jsonl` file in the
+   * output directory.
+   *
+   * @returns Path of the file that was written.
+   */
+  private async createBatchFile(templates: MetadataRequest[], batchName: string): Promise<string> {
+    const filename = path.join(this.outputDir, `${batchName}_${Date.now()}.jsonl`);
+
+    const payload = templates
+      .map(template => JSON.stringify(this.generator.createBatchRequest(template)) + '\n')
+      .join('');
+
+    // The promise settles only once the data has been written or the write failed
+    await fs.promises.writeFile(filename, payload);
+
+    logger.debug(`Created batch file: ${filename} with ${templates.length} requests`);
+    return filename;
+  }
+
+  /**
+   * Upload a local file to OpenAI with purpose 'batch'.
+   *
+   * @returns The file object returned by the client; callers read its `id`.
+   */
+  private async uploadFile(filepath: string): Promise<any> {
+    const file = fs.createReadStream(filepath);
+    const uploadedFile = await this.client.files.create({
+      file,
+      purpose: 'batch'
+    });
+
+    logger.debug(`Uploaded file: ${uploadedFile.id}`);
+    return uploadedFile;
+  }
+
+  /**
+   * Create a batch job for an uploaded input file against the chat completions
+   * endpoint with a 24h completion window.
+   *
+   * @returns The batch object returned by the client; callers read its `id`.
+   */
+  private async createBatchJob(fileId: string): Promise<any> {
+    const batchJob = await this.client.batches.create({
+      input_file_id: fileId,
+      endpoint: '/v1/chat/completions',
+      completion_window: '24h'
+    });
+
+    logger.info(`Created batch job: ${batchJob.id}`);
+    return batchJob;
+  }
+
+  /**
+   * Poll a batch job until it completes.
+   *
+   * Waits between polls follow a fixed, increasing schedule (60s, 120s, 300s,
+   * 600s, 900s, then 1800s for every later poll).
+   *
+   * @returns The batch object once its status is 'completed'.
+   * @throws If the job ends as 'failed', 'expired' or 'cancelled', or is still
+   *   not complete after the maximum number of polls.
+   */
+  private async monitorBatchJob(batchId: string): Promise<any> {
+    const waitTimes = [60, 120, 300, 600, 900, 1800]; // Progressive wait times in seconds
+    const maxAttempts = 100; // Safety limit
+
+    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+      const batchJob = await this.client.batches.retrieve(batchId);
+
+      logger.debug(`Batch ${batchId} status: ${batchJob.status} (attempt ${attempt})`);
+
+      if (batchJob.status === 'completed') {
+        logger.info(`Batch job ${batchId} completed successfully`);
+        return batchJob;
+      }
+
+      if (['failed', 'expired', 'cancelled'].includes(batchJob.status)) {
+        throw new Error(`Batch job failed with status: ${batchJob.status}`);
+      }
+
+      // No further poll follows the last attempt, so do not wait before timing out
+      if (attempt === maxAttempts) {
+        break;
+      }
+
+      // Wait before next check; the schedule's last entry repeats once exhausted
+      const waitTime = waitTimes[Math.min(attempt - 1, waitTimes.length - 1)];
+      logger.debug(`Waiting ${waitTime} seconds before next check...`);
+      await this.sleep(waitTime * 1000);
+    }
+
+    throw new Error(`Batch job monitoring timed out after ${maxAttempts} attempts`);
+  }
+
+  /**
+   * Download the job's output file and parse it line by line.
+   *
+   * Lines that cannot be parsed are logged and skipped. Only `output_file_id`
+   * is read here.
+   *
+   * @throws If the job has no `output_file_id`.
+   */
+  private async retrieveResults(batchJob: any): Promise<MetadataResult[]> {
+    if (!batchJob.output_file_id) {
+      throw new Error('No output file available for batch job');
+    }
+
+    // Download result file
+    const fileResponse = await this.client.files.content(batchJob.output_file_id);
+    const fileContent = await fileResponse.text();
+
+    // Parse JSONL results
+    const results: MetadataResult[] = [];
+    const lines = fileContent.trim().split('\n');
+
+    for (const line of lines) {
+      if (!line) continue;
+
+      try {
+        const result = JSON.parse(line);
+        const parsed = this.generator.parseResult(result);
+        results.push(parsed);
+      } catch (error) {
+        logger.error('Error parsing result line:', error);
+      }
+    }
+
+    logger.info(`Retrieved ${results.length} results from batch job`);
+    return results;
+  }
+
+  /**
+   * Best-effort removal of the local JSONL file and of the files created on the
+   * OpenAI side. Every step catches and logs its own failure, so this method
+   * does not reject.
+   *
+   * @param localFile Path of the local JSONL file.
+   * @param inputFileId Uploaded input file id; skipped when the upload never happened.
+   * @param outputFileId Output file id; skipped when the job produced none.
+   */
+  private async cleanup(localFile: string, inputFileId?: string, outputFileId?: string): Promise<void> {
+    // Delete local file
+    try {
+      fs.unlinkSync(localFile);
+      logger.debug(`Deleted local file: ${localFile}`);
+    } catch (error) {
+      logger.warn(`Failed to delete local file: ${localFile}`, error);
+    }
+
+    // Delete uploaded files from OpenAI
+    if (inputFileId) {
+      try {
+        await this.client.files.del(inputFileId);
+        logger.debug(`Deleted input file from OpenAI: ${inputFileId}`);
+      } catch (error) {
+        logger.warn(`Failed to delete input file from OpenAI: ${inputFileId}`, error);
+      }
+    }
+
+    if (outputFileId) {
+      try {
+        await this.client.files.del(outputFileId);
+        logger.debug(`Deleted output file from OpenAI: ${outputFileId}`);
+      } catch (error) {
+        logger.warn(`Failed to delete output file from OpenAI: ${outputFileId}`, error);
+      }
+    }
+  }
+
+  /**
+   * Resolve after `ms` milliseconds.
+   */
+  private sleep(ms: number): Promise<void> {
+    return new Promise(resolve => setTimeout(resolve, ms));
+  }
+}
--- a/src/templates/metadata-generator.ts
+++ b/src/templates/metadata-generator.ts
@@ -0,0 +1,287 @@
+import OpenAI from 'openai';
+import { z } from 'zod';
+import { logger } from '../utils/logger';
+import { TemplateWorkflow, TemplateDetail } from './template-fetcher';
+
+// Metadata schema using Zod for validation.
+// The limits here (max 5 / max 3 array items, 5-480 minutes, three complexity
+// levels) match the JSON schema that MetadataGenerator.getJsonSchema() sends to
+// OpenAI; model output is parsed with this schema before it is accepted.
+export const TemplateMetadataSchema = z.object({
+  categories: z.array(z.string()).max(5).describe('Main categories (max 5)'),
+  complexity: z.enum(['simple', 'medium', 'complex']).describe('Implementation complexity'),
+  use_cases: z.array(z.string()).max(5).describe('Primary use cases'),
+  estimated_setup_minutes: z.number().min(5).max(480).describe('Setup time in minutes'),
+  required_services: z.array(z.string()).describe('External services needed'),
+  key_features: z.array(z.string()).max(5).describe('Main capabilities'),
+  target_audience: z.array(z.string()).max(3).describe('Target users')
+});
+
+/** Validated metadata object; the type is inferred from `TemplateMetadataSchema`. */
+export type TemplateMetadata = z.infer<typeof TemplateMetadataSchema>;
+
+/**
+ * Input describing one template for which metadata should be generated.
+ */
+export interface MetadataRequest {
+  /** Template id; embedded in the batch request's custom_id as `template-<id>`. */
+  templateId: number;
+  /** Template name; included in the prompt. */
+  name: string;
+  /** Optional description; included in the prompt when present. */
+  description?: string;
+  /** Node names; the text after the last '.' is used as the node's base name when summarizing. */
+  nodes: string[];
+  /** Optional workflow object; only its `nodes` length and the key count of `connections` are read. */
+  workflow?: any;
+}
+
+/**
+ * Outcome of metadata generation for one template.
+ */
+export interface MetadataResult {
+  /** Template id recovered from the batch result's custom_id. */
+  templateId: number;
+  /** Validated metadata, or MetadataGenerator's default metadata when `error` is set. */
+  metadata: TemplateMetadata;
+  /** Error message; set by MetadataGenerator.parseResult when the result carried an error or could not be parsed. */
+  error?: string;
+}
+
+/**
+ * Builds OpenAI chat-completion requests that ask for structured metadata
+ * about an n8n workflow template, and converts the responses into validated
+ * `TemplateMetadata`.
+ */
+export class MetadataGenerator {
+  private client: OpenAI;
+  private model: string;
+
+  /**
+   * @param apiKey OpenAI API key for the client used by `generateSingle`.
+   * @param model Model name placed in every request body.
+   */
+  constructor(apiKey: string, model: string = 'gpt-4o-mini') {
+    this.client = new OpenAI({ apiKey });
+    this.model = model;
+  }
+
+  /**
+   * Generate the JSON schema for OpenAI structured outputs.
+   *
+   * Its constraints match `TemplateMetadataSchema`, which validates the
+   * response afterwards.
+   */
+  private getJsonSchema() {
+    return {
+      name: 'template_metadata',
+      strict: true,
+      schema: {
+        type: 'object',
+        properties: {
+          categories: {
+            type: 'array',
+            items: { type: 'string' },
+            maxItems: 5,
+            description: 'Main categories like automation, integration, data processing'
+          },
+          complexity: {
+            type: 'string',
+            enum: ['simple', 'medium', 'complex'],
+            description: 'Implementation complexity level'
+          },
+          use_cases: {
+            type: 'array',
+            items: { type: 'string' },
+            maxItems: 5,
+            description: 'Primary use cases for this template'
+          },
+          estimated_setup_minutes: {
+            type: 'number',
+            minimum: 5,
+            maximum: 480,
+            description: 'Estimated setup time in minutes'
+          },
+          required_services: {
+            type: 'array',
+            items: { type: 'string' },
+            description: 'External services or APIs required'
+          },
+          key_features: {
+            type: 'array',
+            items: { type: 'string' },
+            maxItems: 5,
+            description: 'Main capabilities or features'
+          },
+          target_audience: {
+            type: 'array',
+            items: { type: 'string' },
+            maxItems: 3,
+            description: 'Target users like developers, marketers, analysts'
+          }
+        },
+        required: [
+          'categories',
+          'complexity',
+          'use_cases',
+          'estimated_setup_minutes',
+          'required_services',
+          'key_features',
+          'target_audience'
+        ],
+        additionalProperties: false
+      }
+    };
+  }
+
+  /**
+   * Create a batch request for a single template.
+   *
+   * @returns One JSONL entry for the Batch API: `custom_id` is
+   *   `template-<templateId>` (decoded again by `parseResult`), and `body` is a
+   *   chat completion request using the structured-output schema.
+   */
+  createBatchRequest(template: MetadataRequest): any {
+    // Extract node information for analysis
+    const nodesSummary = this.summarizeNodes(template.nodes);
+
+    // Build context for the AI; empty entries are dropped before joining
+    const context = [
+      `Template: ${template.name}`,
+      template.description ? `Description: ${template.description}` : '',
+      `Nodes Used (${template.nodes.length}): ${nodesSummary}`,
+      template.workflow ? `Workflow has ${template.workflow.nodes?.length || 0} nodes with ${Object.keys(template.workflow.connections || {}).length} connections` : ''
+    ].filter(Boolean).join('\n');
+
+    return {
+      custom_id: `template-${template.templateId}`,
+      method: 'POST',
+      url: '/v1/chat/completions',
+      body: {
+        model: this.model,
+        temperature: 0.1,
+        max_tokens: 500,
+        response_format: {
+          type: 'json_schema',
+          json_schema: this.getJsonSchema()
+        },
+        messages: [
+          {
+            role: 'system',
+            content: `You are an n8n workflow expert analyzing templates to extract structured metadata.
+            
+            Analyze the provided template information and extract:
+            - Categories: Classify into relevant categories (automation, integration, data, communication, etc.)
+            - Complexity: Assess as simple (1-3 nodes), medium (4-8 nodes), or complex (9+ nodes or advanced logic)
+            - Use cases: Identify primary business use cases
+            - Setup time: Estimate realistic setup time based on complexity and required configurations
+            - Required services: List any external services, APIs, or accounts needed
+            - Key features: Highlight main capabilities or benefits
+            - Target audience: Identify who would benefit most (developers, marketers, ops teams, etc.)
+            
+            Be concise and practical in your analysis.`
+          },
+          {
+            role: 'user',
+            content: context
+          }
+        ]
+      }
+    };
+  }
+
+  /**
+   * Summarize nodes for better context.
+   *
+   * Each node name is reduced to the text after its last '.' and counted under
+   * a coarse group (HTTP/Webhooks, Database, Communication, AI/ML,
+   * Spreadsheets) or, failing that, under its own name with a trailing
+   * `Trigger`/`Node` removed. Matching is case-insensitive so camelCase names
+   * such as `mySql`, `googleSheets` or `openAi` land in their group.
+   *
+   * @returns Up to ten groups, most frequent first, formatted as `name (count)`
+   *   (the count is omitted when it is 1) and joined with ', '.
+   */
+  private summarizeNodes(nodes: string[]): string {
+    // Map rather than a plain object so a node called e.g. 'constructor'
+    // cannot collide with an inherited property
+    const nodeGroups = new Map<string, number>();
+    const count = (group: string): void => {
+      nodeGroups.set(group, (nodeGroups.get(group) ?? 0) + 1);
+    };
+
+    for (const node of nodes) {
+      // Extract base node name (remove package prefix)
+      const baseName = node.split('.').pop() || node;
+      const lowerBase = baseName.toLowerCase();
+      const mentions = (...needles: string[]): boolean =>
+        needles.some(needle => lowerBase.includes(needle));
+
+      // "ai" must be a whole camelCase word: a plain substring test would also
+      // hit names like 'wait' or 'airtable'
+      const words = (baseName.match(/[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+/g) ?? [])
+        .map(word => word.toLowerCase());
+      // 'langchain' appears in the package prefix, which baseName no longer has
+      const isAi =
+        words.includes('ai') || mentions('openai') || node.toLowerCase().includes('langchain');
+
+      // Group by category
+      if (mentions('webhook', 'http')) {
+        count('HTTP/Webhooks');
+      } else if (mentions('database', 'postgres', 'mysql')) {
+        count('Database');
+      } else if (mentions('slack', 'email', 'gmail')) {
+        count('Communication');
+      } else if (isAi) {
+        count('AI/ML');
+      } else if (mentions('sheet', 'csv', 'excel')) {
+        count('Spreadsheets');
+      } else {
+        count(baseName.replace(/Trigger$/, '').replace(/Node$/, ''));
+      }
+    }
+
+    // Format summary; the sort is stable, so ties keep first-seen order
+    return [...nodeGroups.entries()]
+      .sort((a, b) => b[1] - a[1])
+      .slice(0, 10) // Top 10 groups
+      .map(([name, total]) => (total > 1 ? `${name} (${total})` : name))
+      .join(', ');
+  }
+
+  /**
+   * Recover the numeric template id from a `template-<id>` custom id.
+   *
+   * @throws If the custom id is not a string or carries no number.
+   */
+  private extractTemplateId(customId: unknown): number {
+    if (typeof customId !== 'string') {
+      throw new Error('Batch result is missing custom_id');
+    }
+
+    const templateId = Number.parseInt(customId.replace('template-', ''), 10);
+    if (Number.isNaN(templateId)) {
+      throw new Error(`Batch result has malformed custom_id: ${customId}`);
+    }
+
+    return templateId;
+  }
+
+  /**
+   * Parse a batch result.
+   *
+   * @param result One parsed line of the batch output file.
+   * @returns The validated metadata, or the default metadata plus an `error`
+   *   message when the result carries an error, lacks message content, or its
+   *   content fails JSON parsing or schema validation.
+   * @throws If the result has no usable `custom_id`: without a template id the
+   *   result cannot be attributed to any template.
+   */
+  parseResult(result: any): MetadataResult {
+    const templateId = this.extractTemplateId(result?.custom_id);
+
+    try {
+      if (result.error) {
+        return {
+          templateId,
+          metadata: this.getDefaultMetadata(),
+          error: result.error.message
+        };
+      }
+
+      const content = result.response?.body?.choices?.[0]?.message?.content;
+      if (!content) {
+        throw new Error('Invalid response structure');
+      }
+
+      // Validate with Zod
+      const validated = TemplateMetadataSchema.parse(JSON.parse(content));
+
+      return { templateId, metadata: validated };
+    } catch (error) {
+      logger.error(`Error parsing result for ${result.custom_id}:`, error);
+      return {
+        templateId,
+        metadata: this.getDefaultMetadata(),
+        error: error instanceof Error ? error.message : 'Unknown error'
+      };
+    }
+  }
+
+  /**
+   * Get default metadata for fallback.
+   */
+  private getDefaultMetadata(): TemplateMetadata {
+    return {
+      categories: ['automation'],
+      complexity: 'medium',
+      use_cases: ['Process automation'],
+      estimated_setup_minutes: 30,
+      required_services: [],
+      key_features: ['Workflow automation'],
+      target_audience: ['developers']
+    };
+  }
+
+  /**
+   * Generate metadata for a single template (for testing).
+   *
+   * Sends one direct chat completion with a shorter prompt than the batch
+   * request: only the template name and the raw node list.
+   *
+   * @returns The validated metadata, or the default metadata if the request
+   *   fails or the response cannot be parsed or validated (the error is logged).
+   */
+  async generateSingle(template: MetadataRequest): Promise<TemplateMetadata> {
+    try {
+      const completion = await this.client.chat.completions.create({
+        model: this.model,
+        temperature: 0.1,
+        max_tokens: 500,
+        response_format: {
+          type: 'json_schema',
+          json_schema: this.getJsonSchema()
+        } as any,
+        messages: [
+          {
+            role: 'system',
+            content: `You are an n8n workflow expert analyzing templates to extract structured metadata.`
+          },
+          {
+            role: 'user',
+            content: `Analyze this template: ${template.name}\nNodes: ${template.nodes.join(', ')}`
+          }
+        ]
+      });
+
+      const content = completion.choices[0]?.message?.content;
+      if (!content) {
+        throw new Error('No content in response');
+      }
+
+      const metadata = JSON.parse(content);
+      return TemplateMetadataSchema.parse(metadata);
+    } catch (error) {
+      logger.error('Error generating single metadata:', error);
+      return this.getDefaultMetadata();
+    }
+  }
+}
--- a/src/templates/template-repository.ts
+++ b/src/templates/template-repository.ts
@@ -22,6 +22,8 @@ export interface StoredTemplate {
  updated_at: string;
  url: string;
  scraped_at: string;
+  metadata_json?: string; // Structured metadata from OpenAI (JSON string)
+  metadata_generated_at?: string; // When metadata was generated
 }

 export class TemplateRepository {
@@ -536,4 +538,91 @@ export class TemplateRepository {
      // Non-critical error - search will fallback to LIKE
    }
  }
+  
+  /**
+   * Store generated metadata on one template row.
+   *
+   * `metadata` is serialized with JSON.stringify into `metadata_json`, and
+   * `metadata_generated_at` is set to the database's CURRENT_TIMESTAMP.
+   *
+   * @param templateId Value matched against the `id` column.
+   * @param metadata Any JSON-serializable value.
+   */
+  updateTemplateMetadata(templateId: number, metadata: any): void {
+    this.db
+      .prepare(`
+      UPDATE templates 
+      SET metadata_json = ?, metadata_generated_at = CURRENT_TIMESTAMP
+      WHERE id = ?
+    `)
+      .run(JSON.stringify(metadata), templateId);
+
+    logger.debug(`Updated metadata for template ${templateId}`);
+  }
+  
+  /**
+   * Store metadata for several templates in one call.
+   *
+   * One prepared UPDATE is reused for every entry of the map, in the map's
+   * iteration order. Each row receives the JSON-serialized metadata and
+   * CURRENT_TIMESTAMP as `metadata_generated_at`.
+   *
+   * @param metadataMap Metadata keyed by template id.
+   */
+  batchUpdateMetadata(metadataMap: Map<number, any>): void {
+    const updateRow = this.db.prepare(`
+      UPDATE templates 
+      SET metadata_json = ?, metadata_generated_at = CURRENT_TIMESTAMP
+      WHERE id = ?
+    `);
+
+    // Each update runs on its own; no explicit transaction is opened here
+    metadataMap.forEach((metadata, templateId) => {
+      updateRow.run(JSON.stringify(metadata), templateId);
+    });
+
+    logger.info(`Updated metadata for ${metadataMap.size} templates`);
+  }
+  
+  /**
+   * List templates that still need metadata.
+   *
+   * A template qualifies when `metadata_json` or `metadata_generated_at` is
+   * NULL. Rows are ordered by `views`, highest first.
+   *
+   * @param limit Maximum number of rows to return.
+   */
+  getTemplatesWithoutMetadata(limit: number = 100): StoredTemplate[] {
+    const query = `
+      SELECT * FROM templates 
+      WHERE metadata_json IS NULL OR metadata_generated_at IS NULL
+      ORDER BY views DESC
+      LIMIT ?
+    `;
+
+    const rows = this.db.prepare(query).all(limit);
+    return rows as StoredTemplate[];
+  }
+  
+  /**
+   * List templates whose metadata was generated more than `daysOld` days ago.
+   *
+   * Compares `metadata_generated_at` with SQLite's `datetime('now', '-N days')`.
+   * Rows are ordered by `views`, highest first.
+   *
+   * @param daysOld Age threshold in days.
+   * @param limit Maximum number of rows to return.
+   */
+  getTemplatesWithOutdatedMetadata(daysOld: number = 30, limit: number = 100): StoredTemplate[] {
+    const query = `
+      SELECT * FROM templates 
+      WHERE metadata_generated_at < datetime('now', '-' || ? || ' days')
+      ORDER BY views DESC
+      LIMIT ?
+    `;
+
+    const rows = this.db.prepare(query).all(daysOld, limit);
+    return rows as StoredTemplate[];
+  }
+  
+  /**
+   * Get template metadata stats.
+   *
+   * "With metadata" uses the same definition as getTemplatesWithoutMetadata()
+   * in reverse: both `metadata_json` and `metadata_generated_at` must be set,
+   * so `withoutMetadata` counts exactly the rows that method would select.
+   *
+   * @param daysOld Age in days beyond which metadata counts as outdated;
+   *   defaults to 30, the same default as getTemplatesWithOutdatedMetadata().
+   * @returns Total template count, counts with and without metadata, and the
+   *   number of templates whose metadata is older than `daysOld` days.
+   */
+  getMetadataStats(daysOld: number = 30): { 
+    total: number; 
+    withMetadata: number; 
+    withoutMetadata: number;
+    outdated: number;
+  } {
+    const total = this.getTemplateCount();
+
+    const withMetadata = (this.db.prepare(`
+      SELECT COUNT(*) as count FROM templates 
+      WHERE metadata_json IS NOT NULL AND metadata_generated_at IS NOT NULL
+    `).get() as { count: number }).count;
+
+    const withoutMetadata = total - withMetadata;
+
+    // Same bound-parameter comparison as getTemplatesWithOutdatedMetadata()
+    const outdated = (this.db.prepare(`
+      SELECT COUNT(*) as count FROM templates 
+      WHERE metadata_generated_at < datetime('now', '-' || ? || ' days')
+    `).get(daysOld) as { count: number }).count;
+
+    return { total, withMetadata, withoutMetadata, outdated };
+  }
 }