n8n-mcp/src/community/documentation-batch-processor.ts

/**
 * Batch processor for community node documentation generation.
 *
 * Orchestrates the full workflow:
 * 1. Fetch READMEs from npm registry
 * 2. Generate AI documentation summaries
 * 3. Store results in database
 */

import { NodeRepository } from '../database/node-repository';
import { CommunityNodeFetcher } from './community-node-fetcher';
import {
  DocumentationGenerator,
  DocumentationInput,
  DocumentationResult,
  createDocumentationGenerator,
} from './documentation-generator';
import { logger } from '../utils/logger';

/**
 * Options for batch processing
 */
export interface BatchProcessorOptions {
  /** Skip nodes that already have READMEs (default: false) */
  skipExistingReadme?: boolean;
  /** Skip nodes that already have AI summaries (default: false) */
  skipExistingSummary?: boolean;
  /** Only fetch READMEs, skip AI generation (default: false) */
  readmeOnly?: boolean;
  /** Only generate AI summaries, skip README fetch (default: false) */
  summaryOnly?: boolean;
  /** Max nodes to process (default: unlimited) */
  limit?: number;
  /** Concurrency for npm README fetches (default: 5) */
  readmeConcurrency?: number;
  /** Concurrency for LLM API calls (default: 3) */
  llmConcurrency?: number;
  /** Progress callback */
  progressCallback?: (message: string, current: number, total: number) => void;
}

/**
 * Result of batch processing
 */
export interface BatchProcessorResult {
  /** Number of READMEs fetched */
  readmesFetched: number;
  /** Number of READMEs that failed to fetch */
  readmesFailed: number;
  /** Number of AI summaries generated */
  summariesGenerated: number;
  /** Number of AI summaries that failed */
  summariesFailed: number;
  /** Nodes that were skipped (already had data) */
  skipped: number;
  /** Total duration in seconds */
  durationSeconds: number;
  /** Errors encountered */
  errors: string[];
}

/**
 * Batch processor for generating documentation for community nodes
 */
export class DocumentationBatchProcessor {
  private repository: NodeRepository;
  private fetcher: CommunityNodeFetcher;
  private generator: DocumentationGenerator;

  constructor(
    repository: NodeRepository,
    fetcher?: CommunityNodeFetcher,
    generator?: DocumentationGenerator
  ) {
    this.repository = repository;
    this.fetcher = fetcher || new CommunityNodeFetcher();
    this.generator = generator || createDocumentationGenerator();
  }

  /**
   * Process all community nodes to generate documentation
   */
  async processAll(options: BatchProcessorOptions = {}): Promise<BatchProcessorResult> {
    const startTime = Date.now();
    const result: BatchProcessorResult = {
      readmesFetched: 0,
      readmesFailed: 0,
      summariesGenerated: 0,
      summariesFailed: 0,
      skipped: 0,
      durationSeconds: 0,
      errors: [],
    };

    const {
      skipExistingReadme = false,
      skipExistingSummary = false,
      readmeOnly = false,
      summaryOnly = false,
      limit,
      readmeConcurrency = 5,
      llmConcurrency = 3,
      progressCallback,
    } = options;

    try {
      // Step 1: Fetch READMEs (unless summaryOnly)
      if (!summaryOnly) {
        const readmeResult = await this.fetchReadmes({
          skipExisting: skipExistingReadme,
          limit,
          concurrency: readmeConcurrency,
          progressCallback,
        });
        result.readmesFetched = readmeResult.fetched;
        result.readmesFailed = readmeResult.failed;
        result.skipped += readmeResult.skipped;
        result.errors.push(...readmeResult.errors);
      }

      // Step 2: Generate AI summaries (unless readmeOnly)
      if (!readmeOnly) {
        const summaryResult = await this.generateSummaries({
          skipExisting: skipExistingSummary,
          limit,
          concurrency: llmConcurrency,
          progressCallback,
        });
        result.summariesGenerated = summaryResult.generated;
        result.summariesFailed = summaryResult.failed;
        result.skipped += summaryResult.skipped;
        result.errors.push(...summaryResult.errors);
      }

      result.durationSeconds = (Date.now() - startTime) / 1000;
      return result;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      result.errors.push(`Batch processing failed: ${errorMessage}`);
      result.durationSeconds = (Date.now() - startTime) / 1000;
      return result;
    }
  }

  /**
   * Fetch READMEs for community nodes
   */
  private async fetchReadmes(options: {
    skipExisting?: boolean;
    limit?: number;
    concurrency?: number;
    progressCallback?: (message: string, current: number, total: number) => void;
  }): Promise<{ fetched: number; failed: number; skipped: number; errors: string[] }> {
    const { skipExisting = false, limit, concurrency = 5, progressCallback } = options;

    // Get nodes that need READMEs
    let nodes = skipExisting
      ? this.repository.getCommunityNodesWithoutReadme()
      : this.repository.getCommunityNodes({ orderBy: 'downloads' });

    if (limit) {
      nodes = nodes.slice(0, limit);
    }

    logger.info(`Fetching READMEs for ${nodes.length} community nodes...`);

    if (nodes.length === 0) {
      return { fetched: 0, failed: 0, skipped: 0, errors: [] };
    }

    // Get package names
    const packageNames = nodes
      .map((n) => n.npmPackageName)
      .filter((name): name is string => !!name);

    // Fetch READMEs in batches
    const readmeMap = await this.fetcher.fetchReadmesBatch(
      packageNames,
      progressCallback,
      concurrency
    );

    // Store READMEs in database
    let fetched = 0;
    let failed = 0;
    const errors: string[] = [];

    for (const node of nodes) {
      if (!node.npmPackageName) continue;

      const readme = readmeMap.get(node.npmPackageName);
      if (readme) {
        try {
          this.repository.updateNodeReadme(node.nodeType, readme);
          fetched++;
        } catch (error) {
          const msg = `Failed to save README for ${node.nodeType}: ${error}`;
          errors.push(msg);
          failed++;
        }
      } else {
        failed++;
      }
    }

    logger.info(`README fetch complete: ${fetched} fetched, ${failed} failed`);
    return { fetched, failed, skipped: 0, errors };
  }

  /**
   * Generate AI documentation summaries
   */
  private async generateSummaries(options: {
    skipExisting?: boolean;
    limit?: number;
    concurrency?: number;
    progressCallback?: (message: string, current: number, total: number) => void;
  }): Promise<{ generated: number; failed: number; skipped: number; errors: string[] }> {
    const { skipExisting = false, limit, concurrency = 3, progressCallback } = options;

    // Get nodes that need summaries (must have READMEs first)
    let nodes = skipExisting
      ? this.repository.getCommunityNodesWithoutAISummary()
      : this.repository.getCommunityNodes({ orderBy: 'downloads' }).filter(
          (n) => n.npmReadme && n.npmReadme.length > 0
        );

    if (limit) {
      nodes = nodes.slice(0, limit);
    }

    logger.info(`Generating AI summaries for ${nodes.length} nodes...`);

    if (nodes.length === 0) {
      return { generated: 0, failed: 0, skipped: 0, errors: [] };
    }

    // Test LLM connection first
    const connectionTest = await this.generator.testConnection();
    if (!connectionTest.success) {
      const error = `LLM connection failed: ${connectionTest.message}`;
      logger.error(error);
      return { generated: 0, failed: nodes.length, skipped: 0, errors: [error] };
    }

    logger.info(`LLM connection successful: ${connectionTest.message}`);

    // Prepare inputs for batch generation
    const inputs: DocumentationInput[] = nodes.map((node) => ({
      nodeType: node.nodeType,
      displayName: node.displayName,
      description: node.description,
      readme: node.npmReadme || '',
      npmPackageName: node.npmPackageName,
    }));

    // Generate summaries in parallel
    const results = await this.generator.generateBatch(inputs, concurrency, progressCallback);

    // Store summaries in database
    let generated = 0;
    let failed = 0;
    const errors: string[] = [];

    for (const result of results) {
      if (result.error) {
        errors.push(`${result.nodeType}: ${result.error}`);
        failed++;
      } else {
        try {
          this.repository.updateNodeAISummary(result.nodeType, result.summary);
          generated++;
        } catch (error) {
          const msg = `Failed to save summary for ${result.nodeType}: ${error}`;
          errors.push(msg);
          failed++;
        }
      }
    }

    logger.info(`AI summary generation complete: ${generated} generated, ${failed} failed`);
    return { generated, failed, skipped: 0, errors };
  }

  /**
   * Get current documentation statistics
   */
  getStats(): ReturnType<NodeRepository['getDocumentationStats']> {
    return this.repository.getDocumentationStats();
  }
}