mirror of
https://github.com/czlonkowski/n8n-mcp.git
synced 2026-02-09 06:43:08 +00:00
fix: refactor telemetry system with critical improvements (v2.14.1)
Major improvements to telemetry system addressing code review findings: Architecture & Modularization: - Split 636-line TelemetryManager into 7 focused modules - Separated concerns: event tracking, batch processing, validation, rate limiting - Lazy initialization pattern to avoid early singleton creation - Clean separation of responsibilities Security & Privacy: - Added comprehensive input validation with Zod schemas - Sanitization of sensitive data (URLs, API keys, emails) - Expanded sensitive key detection patterns (25+ patterns) - Row Level Security on Supabase backend - Added data deletion contact info (romuald@n8n-mcp.com) Performance & Reliability: - Sliding window rate limiter (100 events/minute) - Circuit breaker pattern for network failures - Dead letter queue for failed events - Exponential backoff with jitter for retries - Performance monitoring with overhead tracking (<5%) - Memory-safe array limits in rate limiter Testing: - Comprehensive test coverage (87%+ for core modules) - Unit tests for all new modules - Integration tests for MCP telemetry - Fixed test isolation issues Data Management: - Clear user consent in welcome message - Batch processing with deduplication - Automatic workflow flushing BREAKING CHANGE: TelemetryManager constructor is now private, use getInstance() 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
387
src/telemetry/batch-processor.ts
Normal file
387
src/telemetry/batch-processor.ts
Normal file
@@ -0,0 +1,387 @@
|
||||
/**
|
||||
* Batch Processor for Telemetry
|
||||
* Handles batching, queuing, and sending telemetry data to Supabase
|
||||
*/
|
||||
|
||||
import { SupabaseClient } from '@supabase/supabase-js';
|
||||
import { TelemetryEvent, WorkflowTelemetry, TELEMETRY_CONFIG, TelemetryMetrics } from './telemetry-types';
|
||||
import { TelemetryError, TelemetryErrorType, TelemetryCircuitBreaker } from './telemetry-error';
|
||||
import { logger } from '../utils/logger';
|
||||
|
||||
export class TelemetryBatchProcessor {
|
||||
private flushTimer?: NodeJS.Timeout;
|
||||
private isFlushingEvents: boolean = false;
|
||||
private isFlushingWorkflows: boolean = false;
|
||||
private circuitBreaker: TelemetryCircuitBreaker;
|
||||
private metrics: TelemetryMetrics = {
|
||||
eventsTracked: 0,
|
||||
eventsDropped: 0,
|
||||
eventsFailed: 0,
|
||||
batchesSent: 0,
|
||||
batchesFailed: 0,
|
||||
averageFlushTime: 0,
|
||||
rateLimitHits: 0
|
||||
};
|
||||
private flushTimes: number[] = [];
|
||||
private deadLetterQueue: (TelemetryEvent | WorkflowTelemetry)[] = [];
|
||||
private readonly maxDeadLetterSize = 100;
|
||||
|
||||
constructor(
|
||||
private supabase: SupabaseClient | null,
|
||||
private isEnabled: () => boolean
|
||||
) {
|
||||
this.circuitBreaker = new TelemetryCircuitBreaker();
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the batch processor
|
||||
*/
|
||||
start(): void {
|
||||
if (!this.isEnabled() || !this.supabase) return;
|
||||
|
||||
// Set up periodic flushing
|
||||
this.flushTimer = setInterval(() => {
|
||||
this.flush();
|
||||
}, TELEMETRY_CONFIG.BATCH_FLUSH_INTERVAL);
|
||||
|
||||
// Prevent timer from keeping process alive
|
||||
this.flushTimer.unref();
|
||||
|
||||
// Set up process exit handlers
|
||||
process.on('beforeExit', () => this.flush());
|
||||
process.on('SIGINT', () => {
|
||||
this.flush();
|
||||
process.exit(0);
|
||||
});
|
||||
process.on('SIGTERM', () => {
|
||||
this.flush();
|
||||
process.exit(0);
|
||||
});
|
||||
|
||||
logger.debug('Telemetry batch processor started');
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop the batch processor
|
||||
*/
|
||||
stop(): void {
|
||||
if (this.flushTimer) {
|
||||
clearInterval(this.flushTimer);
|
||||
this.flushTimer = undefined;
|
||||
}
|
||||
logger.debug('Telemetry batch processor stopped');
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush events and workflows to Supabase
|
||||
*/
|
||||
async flush(events?: TelemetryEvent[], workflows?: WorkflowTelemetry[]): Promise<void> {
|
||||
if (!this.isEnabled() || !this.supabase) return;
|
||||
|
||||
// Check circuit breaker
|
||||
if (!this.circuitBreaker.shouldAllow()) {
|
||||
logger.debug('Circuit breaker open - skipping flush');
|
||||
this.metrics.eventsDropped += (events?.length || 0) + (workflows?.length || 0);
|
||||
return;
|
||||
}
|
||||
|
||||
const startTime = Date.now();
|
||||
let hasErrors = false;
|
||||
|
||||
// Flush events if provided
|
||||
if (events && events.length > 0) {
|
||||
hasErrors = !(await this.flushEvents(events)) || hasErrors;
|
||||
}
|
||||
|
||||
// Flush workflows if provided
|
||||
if (workflows && workflows.length > 0) {
|
||||
hasErrors = !(await this.flushWorkflows(workflows)) || hasErrors;
|
||||
}
|
||||
|
||||
// Record flush time
|
||||
const flushTime = Date.now() - startTime;
|
||||
this.recordFlushTime(flushTime);
|
||||
|
||||
// Update circuit breaker
|
||||
if (hasErrors) {
|
||||
this.circuitBreaker.recordFailure();
|
||||
} else {
|
||||
this.circuitBreaker.recordSuccess();
|
||||
}
|
||||
|
||||
// Process dead letter queue if circuit is healthy
|
||||
if (!hasErrors && this.deadLetterQueue.length > 0) {
|
||||
await this.processDeadLetterQueue();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush events with batching
|
||||
*/
|
||||
private async flushEvents(events: TelemetryEvent[]): Promise<boolean> {
|
||||
if (this.isFlushingEvents || events.length === 0) return true;
|
||||
|
||||
this.isFlushingEvents = true;
|
||||
|
||||
try {
|
||||
// Batch events
|
||||
const batches = this.createBatches(events, TELEMETRY_CONFIG.MAX_BATCH_SIZE);
|
||||
|
||||
for (const batch of batches) {
|
||||
const result = await this.executeWithRetry(async () => {
|
||||
const { error } = await this.supabase!
|
||||
.from('telemetry_events')
|
||||
.insert(batch);
|
||||
|
||||
if (error) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
logger.debug(`Flushed batch of ${batch.length} telemetry events`);
|
||||
return true;
|
||||
}, 'Flush telemetry events');
|
||||
|
||||
if (result) {
|
||||
this.metrics.eventsTracked += batch.length;
|
||||
this.metrics.batchesSent++;
|
||||
} else {
|
||||
this.metrics.eventsFailed += batch.length;
|
||||
this.metrics.batchesFailed++;
|
||||
this.addToDeadLetterQueue(batch);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.debug('Failed to flush events:', error);
|
||||
throw new TelemetryError(
|
||||
TelemetryErrorType.NETWORK_ERROR,
|
||||
'Failed to flush events',
|
||||
{ error: error instanceof Error ? error.message : String(error) },
|
||||
true
|
||||
);
|
||||
} finally {
|
||||
this.isFlushingEvents = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush workflows with deduplication
|
||||
*/
|
||||
private async flushWorkflows(workflows: WorkflowTelemetry[]): Promise<boolean> {
|
||||
if (this.isFlushingWorkflows || workflows.length === 0) return true;
|
||||
|
||||
this.isFlushingWorkflows = true;
|
||||
|
||||
try {
|
||||
// Deduplicate workflows by hash
|
||||
const uniqueWorkflows = this.deduplicateWorkflows(workflows);
|
||||
logger.debug(`Deduplicating workflows: ${workflows.length} -> ${uniqueWorkflows.length}`);
|
||||
|
||||
// Batch workflows
|
||||
const batches = this.createBatches(uniqueWorkflows, TELEMETRY_CONFIG.MAX_BATCH_SIZE);
|
||||
|
||||
for (const batch of batches) {
|
||||
const result = await this.executeWithRetry(async () => {
|
||||
const { error } = await this.supabase!
|
||||
.from('telemetry_workflows')
|
||||
.insert(batch);
|
||||
|
||||
if (error) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
logger.debug(`Flushed batch of ${batch.length} telemetry workflows`);
|
||||
return true;
|
||||
}, 'Flush telemetry workflows');
|
||||
|
||||
if (result) {
|
||||
this.metrics.eventsTracked += batch.length;
|
||||
this.metrics.batchesSent++;
|
||||
} else {
|
||||
this.metrics.eventsFailed += batch.length;
|
||||
this.metrics.batchesFailed++;
|
||||
this.addToDeadLetterQueue(batch);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.debug('Failed to flush workflows:', error);
|
||||
throw new TelemetryError(
|
||||
TelemetryErrorType.NETWORK_ERROR,
|
||||
'Failed to flush workflows',
|
||||
{ error: error instanceof Error ? error.message : String(error) },
|
||||
true
|
||||
);
|
||||
} finally {
|
||||
this.isFlushingWorkflows = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute operation with exponential backoff retry
|
||||
*/
|
||||
private async executeWithRetry<T>(
|
||||
operation: () => Promise<T>,
|
||||
operationName: string
|
||||
): Promise<T | null> {
|
||||
let lastError: Error | null = null;
|
||||
let delay = TELEMETRY_CONFIG.RETRY_DELAY;
|
||||
|
||||
for (let attempt = 1; attempt <= TELEMETRY_CONFIG.MAX_RETRIES; attempt++) {
|
||||
try {
|
||||
// Create a timeout promise
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
setTimeout(() => reject(new Error('Operation timed out')), TELEMETRY_CONFIG.OPERATION_TIMEOUT);
|
||||
});
|
||||
|
||||
// Race between operation and timeout
|
||||
const result = await Promise.race([operation(), timeoutPromise]) as T;
|
||||
return result;
|
||||
} catch (error) {
|
||||
lastError = error as Error;
|
||||
logger.debug(`${operationName} attempt ${attempt} failed:`, error);
|
||||
|
||||
if (attempt < TELEMETRY_CONFIG.MAX_RETRIES) {
|
||||
// Exponential backoff with jitter
|
||||
const jitter = Math.random() * 0.3 * delay; // 30% jitter
|
||||
const waitTime = delay + jitter;
|
||||
await new Promise(resolve => setTimeout(resolve, waitTime));
|
||||
delay *= 2; // Double the delay for next attempt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug(`${operationName} failed after ${TELEMETRY_CONFIG.MAX_RETRIES} attempts:`, lastError);
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create batches from array
|
||||
*/
|
||||
private createBatches<T>(items: T[], batchSize: number): T[][] {
|
||||
const batches: T[][] = [];
|
||||
|
||||
for (let i = 0; i < items.length; i += batchSize) {
|
||||
batches.push(items.slice(i, i + batchSize));
|
||||
}
|
||||
|
||||
return batches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Deduplicate workflows by hash
|
||||
*/
|
||||
private deduplicateWorkflows(workflows: WorkflowTelemetry[]): WorkflowTelemetry[] {
|
||||
const seen = new Set<string>();
|
||||
const unique: WorkflowTelemetry[] = [];
|
||||
|
||||
for (const workflow of workflows) {
|
||||
if (!seen.has(workflow.workflow_hash)) {
|
||||
seen.add(workflow.workflow_hash);
|
||||
unique.push(workflow);
|
||||
}
|
||||
}
|
||||
|
||||
return unique;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add failed items to dead letter queue
|
||||
*/
|
||||
private addToDeadLetterQueue(items: (TelemetryEvent | WorkflowTelemetry)[]): void {
|
||||
for (const item of items) {
|
||||
this.deadLetterQueue.push(item);
|
||||
|
||||
// Maintain max size
|
||||
if (this.deadLetterQueue.length > this.maxDeadLetterSize) {
|
||||
const dropped = this.deadLetterQueue.shift();
|
||||
if (dropped) {
|
||||
this.metrics.eventsDropped++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug(`Added ${items.length} items to dead letter queue`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process dead letter queue when circuit is healthy
|
||||
*/
|
||||
private async processDeadLetterQueue(): Promise<void> {
|
||||
if (this.deadLetterQueue.length === 0) return;
|
||||
|
||||
logger.debug(`Processing ${this.deadLetterQueue.length} items from dead letter queue`);
|
||||
|
||||
const events: TelemetryEvent[] = [];
|
||||
const workflows: WorkflowTelemetry[] = [];
|
||||
|
||||
// Separate events and workflows
|
||||
for (const item of this.deadLetterQueue) {
|
||||
if ('workflow_hash' in item) {
|
||||
workflows.push(item as WorkflowTelemetry);
|
||||
} else {
|
||||
events.push(item as TelemetryEvent);
|
||||
}
|
||||
}
|
||||
|
||||
// Clear dead letter queue
|
||||
this.deadLetterQueue = [];
|
||||
|
||||
// Try to flush
|
||||
if (events.length > 0) {
|
||||
await this.flushEvents(events);
|
||||
}
|
||||
if (workflows.length > 0) {
|
||||
await this.flushWorkflows(workflows);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Record flush time for metrics
|
||||
*/
|
||||
private recordFlushTime(time: number): void {
|
||||
this.flushTimes.push(time);
|
||||
|
||||
// Keep last 100 flush times
|
||||
if (this.flushTimes.length > 100) {
|
||||
this.flushTimes.shift();
|
||||
}
|
||||
|
||||
// Update average
|
||||
const sum = this.flushTimes.reduce((a, b) => a + b, 0);
|
||||
this.metrics.averageFlushTime = Math.round(sum / this.flushTimes.length);
|
||||
this.metrics.lastFlushTime = time;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get processor metrics
|
||||
*/
|
||||
getMetrics(): TelemetryMetrics & { circuitBreakerState: any; deadLetterQueueSize: number } {
|
||||
return {
|
||||
...this.metrics,
|
||||
circuitBreakerState: this.circuitBreaker.getState(),
|
||||
deadLetterQueueSize: this.deadLetterQueue.length
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset metrics
|
||||
*/
|
||||
resetMetrics(): void {
|
||||
this.metrics = {
|
||||
eventsTracked: 0,
|
||||
eventsDropped: 0,
|
||||
eventsFailed: 0,
|
||||
batchesSent: 0,
|
||||
batchesFailed: 0,
|
||||
averageFlushTime: 0,
|
||||
rateLimitHits: 0
|
||||
};
|
||||
this.flushTimes = [];
|
||||
this.circuitBreaker.reset();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user