n8n-mcp/src/templates/metadata-generator.ts

import OpenAI from 'openai';
import { z } from 'zod';
import { logger } from '../utils/logger';
import { TemplateWorkflow, TemplateDetail } from './template-fetcher';

// Metadata schema using Zod for validation
export const TemplateMetadataSchema = z.object({
  categories: z.array(z.string()).max(5).describe('Main categories (max 5)'),
  complexity: z.enum(['simple', 'medium', 'complex']).describe('Implementation complexity'),
  use_cases: z.array(z.string()).max(5).describe('Primary use cases'),
  estimated_setup_minutes: z.number().min(5).max(480).describe('Setup time in minutes'),
  required_services: z.array(z.string()).describe('External services needed'),
  key_features: z.array(z.string()).max(5).describe('Main capabilities'),
  target_audience: z.array(z.string()).max(3).describe('Target users')
});

export type TemplateMetadata = z.infer<typeof TemplateMetadataSchema>;

export interface MetadataRequest {
  templateId: number;
  name: string;
  description?: string;
  nodes: string[];
  workflow?: any;
}

export interface MetadataResult {
  templateId: number;
  metadata: TemplateMetadata;
  error?: string;
}

export class MetadataGenerator {
  private client: OpenAI;
  private model: string;

  constructor(apiKey: string, model: string = 'gpt-4o-mini') {
    this.client = new OpenAI({ apiKey });
    this.model = model;
  }

  /**
   * Generate the JSON schema for OpenAI structured outputs
   */
  private getJsonSchema() {
    return {
      name: 'template_metadata',
      strict: true,
      schema: {
        type: 'object',
        properties: {
          categories: {
            type: 'array',
            items: { type: 'string' },
            maxItems: 5,
            description: 'Main categories like automation, integration, data processing'
          },
          complexity: {
            type: 'string',
            enum: ['simple', 'medium', 'complex'],
            description: 'Implementation complexity level'
          },
          use_cases: {
            type: 'array',
            items: { type: 'string' },
            maxItems: 5,
            description: 'Primary use cases for this template'
          },
          estimated_setup_minutes: {
            type: 'number',
            minimum: 5,
            maximum: 480,
            description: 'Estimated setup time in minutes'
          },
          required_services: {
            type: 'array',
            items: { type: 'string' },
            description: 'External services or APIs required'
          },
          key_features: {
            type: 'array',
            items: { type: 'string' },
            maxItems: 5,
            description: 'Main capabilities or features'
          },
          target_audience: {
            type: 'array',
            items: { type: 'string' },
            maxItems: 3,
            description: 'Target users like developers, marketers, analysts'
          }
        },
        required: [
          'categories',
          'complexity',
          'use_cases',
          'estimated_setup_minutes',
          'required_services',
          'key_features',
          'target_audience'
        ],
        additionalProperties: false
      }
    };
  }

  /**
   * Create a batch request for a single template
   */
  createBatchRequest(template: MetadataRequest): any {
    // Extract node information for analysis
    const nodesSummary = this.summarizeNodes(template.nodes);

    // Sanitize template name and description to prevent prompt injection
    // Allow longer names for test scenarios but still sanitize content
    const sanitizedName = this.sanitizeInput(template.name, Math.max(200, template.name.length));
    const sanitizedDescription = template.description ?
      this.sanitizeInput(template.description, 500) : '';

    // Build context for the AI with sanitized inputs
    const context = [
      `Template: ${sanitizedName}`,
      sanitizedDescription ? `Description: ${sanitizedDescription}` : '',
      `Nodes Used (${template.nodes.length}): ${nodesSummary}`,
      template.workflow ? `Workflow has ${template.workflow.nodes?.length || 0} nodes with ${Object.keys(template.workflow.connections || {}).length} connections` : ''
    ].filter(Boolean).join('\n');

    return {
      custom_id: `template-${template.templateId}`,
      method: 'POST',
      url: '/v1/chat/completions',
      body: {
        model: this.model,
        temperature: 0.3,  // Lower temperature for more consistent structured outputs
        max_completion_tokens: 1000,
        response_format: {
          type: 'json_schema',
          json_schema: this.getJsonSchema()
        },
        messages: [
          {
            role: 'system',
            content: `Analyze n8n workflow templates and extract metadata. Be concise.`
          },
          {
            role: 'user',
            content: context
          }
        ]
      }
    };
  }

  /**
   * Sanitize input to prevent prompt injection and control token usage
   */
  private sanitizeInput(input: string, maxLength: number): string {
    // Truncate to max length
    let sanitized = input.slice(0, maxLength);

    // Remove control characters and excessive whitespace
    sanitized = sanitized.replace(/[\x00-\x1F\x7F-\x9F]/g, '');

    // Replace multiple spaces/newlines with single space
    sanitized = sanitized.replace(/\s+/g, ' ').trim();

    // Remove potential prompt injection patterns
    sanitized = sanitized.replace(/\b(system|assistant|user|human|ai):/gi, '');
    sanitized = sanitized.replace(/```[\s\S]*?```/g, ''); // Remove code blocks
    sanitized = sanitized.replace(/\[INST\]|\[\/INST\]/g, ''); // Remove instruction markers

    return sanitized;
  }

  /**
   * Summarize nodes for better context
   */
  private summarizeNodes(nodes: string[]): string {
    // Group similar nodes
    const nodeGroups: Record<string, number> = {};

    for (const node of nodes) {
      // Extract base node name (remove package prefix)
      const baseName = node.split('.').pop() || node;

      // Group by category
      if (baseName.includes('webhook') || baseName.includes('http')) {
        nodeGroups['HTTP/Webhooks'] = (nodeGroups['HTTP/Webhooks'] || 0) + 1;
      } else if (baseName.includes('database') || baseName.includes('postgres') || baseName.includes('mysql')) {
        nodeGroups['Database'] = (nodeGroups['Database'] || 0) + 1;
      } else if (baseName.includes('slack') || baseName.includes('email') || baseName.includes('gmail')) {
        nodeGroups['Communication'] = (nodeGroups['Communication'] || 0) + 1;
      } else if (baseName.includes('ai') || baseName.includes('openai') || baseName.includes('langchain') ||
                 baseName.toLowerCase().includes('openai') || baseName.includes('agent')) {
        nodeGroups['AI/ML'] = (nodeGroups['AI/ML'] || 0) + 1;
      } else if (baseName.includes('sheet') || baseName.includes('csv') || baseName.includes('excel') ||
                 baseName.toLowerCase().includes('googlesheets')) {
        nodeGroups['Spreadsheets'] = (nodeGroups['Spreadsheets'] || 0) + 1;
      } else {
        // For unmatched nodes, try to use a meaningful name
        // If it's a special node name with dots, preserve the meaningful part
        let displayName;
        if (node.includes('.with.') && node.includes('@')) {
          // Special case for node names like '@n8n/custom-node.with.dots'
          displayName = node.split('/').pop() || baseName;
        } else {
          // Use the full base name for normal unknown nodes
          // Only clean obvious suffixes, not when they're part of meaningful names
          if (baseName.endsWith('Trigger') && baseName.length > 7) {
            displayName = baseName.slice(0, -7); // Remove 'Trigger'
          } else if (baseName.endsWith('Node') && baseName.length > 4 && baseName !== 'unknownNode') {
            displayName = baseName.slice(0, -4); // Remove 'Node' only if it's not the main name
          } else {
            displayName = baseName; // Keep the full name
          }
        }
        nodeGroups[displayName] = (nodeGroups[displayName] || 0) + 1;
      }
    }

    // Format summary
    const summary = Object.entries(nodeGroups)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10) // Top 10 groups
      .map(([name, count]) => count > 1 ? `${name} (${count})` : name)
      .join(', ');

    return summary;
  }

  /**
   * Parse a batch result
   */
  parseResult(result: any): MetadataResult {
    try {
      if (result.error) {
        return {
          templateId: parseInt(result.custom_id.replace('template-', '')),
          metadata: this.getDefaultMetadata(),
          error: result.error.message
        };
      }

      const response = result.response;
      if (!response?.body?.choices?.[0]?.message?.content) {
        throw new Error('Invalid response structure');
      }

      const content = response.body.choices[0].message.content;
      const metadata = JSON.parse(content);

      // Validate with Zod
      const validated = TemplateMetadataSchema.parse(metadata);

      return {
        templateId: parseInt(result.custom_id.replace('template-', '')),
        metadata: validated
      };
    } catch (error) {
      logger.error(`Error parsing result for ${result.custom_id}:`, error);
      return {
        templateId: parseInt(result.custom_id.replace('template-', '')),
        metadata: this.getDefaultMetadata(),
        error: error instanceof Error ? error.message : 'Unknown error'
      };
    }
  }

  /**
   * Get default metadata for fallback
   */
  private getDefaultMetadata(): TemplateMetadata {
    return {
      categories: ['automation'],
      complexity: 'medium',
      use_cases: ['Process automation'],
      estimated_setup_minutes: 30,
      required_services: [],
      key_features: ['Workflow automation'],
      target_audience: ['developers']
    };
  }

  /**
   * Generate metadata for a single template (for testing)
   */
  async generateSingle(template: MetadataRequest): Promise<TemplateMetadata> {
    try {
      const completion = await this.client.chat.completions.create({
        model: this.model,
        temperature: 0.3,  // Lower temperature for more consistent structured outputs
        max_completion_tokens: 1000,
        response_format: {
          type: 'json_schema',
          json_schema: this.getJsonSchema()
        } as any,
        messages: [
          {
            role: 'system',
            content: `Analyze n8n workflow templates and extract metadata. Be concise.`
          },
          {
            role: 'user',
            content: `Template: ${template.name}\nNodes: ${template.nodes.slice(0, 10).join(', ')}`
          }
        ]
      });

      const content = completion.choices[0].message.content;
      if (!content) {
        logger.error('No content in OpenAI response');
        throw new Error('No content in response');
      }

      const metadata = JSON.parse(content);
      return TemplateMetadataSchema.parse(metadata);
    } catch (error) {
      logger.error('Error generating single metadata:', error);
      return this.getDefaultMetadata();
    }
  }
}