feat: implement template compression and view count filtering

- Add gzip compression for workflow JSONs (89% size reduction)
- Filter templates with ≤10 views to remove low-quality content
- Reduce template count from 4,505 to 2,596 high-quality templates
- Compress template data from ~75MB to 12.10MB
- Total database reduced from 117MB to 48MB
- Add on-the-fly decompression for template retrieval
- Update schema to support compressed workflow storage

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
czlonkowski
2025-09-14 14:49:45 +02:00
parent 10c29dd585
commit f35097ed46
9 changed files with 15684 additions and 57 deletions

Binary file not shown.

15435
fetch_log.txt Normal file

File diff suppressed because one or more lines are too long

32
monitor_fetch.sh Normal file
View File

@@ -0,0 +1,32 @@
#!/bin/bash
echo "Monitoring template fetch progress..."
echo "=================================="
while true; do
# Check if process is still running
if ! pgrep -f "fetch-templates" > /dev/null; then
echo "Fetch process completed!"
break
fi
# Get database size
DB_SIZE=$(ls -lh data/nodes.db 2>/dev/null | awk '{print $5}')
# Get template count
TEMPLATE_COUNT=$(sqlite3 data/nodes.db "SELECT COUNT(*) FROM templates" 2>/dev/null || echo "0")
# Get last log entry
LAST_LOG=$(tail -n 1 fetch_log.txt 2>/dev/null | grep "Fetching template details" | tail -1)
# Display status
echo -ne "\rDB Size: $DB_SIZE | Templates: $TEMPLATE_COUNT | $LAST_LOG"
sleep 5
done
echo ""
echo "Final statistics:"
echo "-----------------"
ls -lh data/nodes.db
sqlite3 data/nodes.db "SELECT COUNT(*) as count, printf('%.1f MB', SUM(LENGTH(workflow_json_compressed))/1024.0/1024.0) as compressed_size FROM templates"

View File

@@ -35,7 +35,8 @@ CREATE TABLE IF NOT EXISTS templates (
author_username TEXT,
author_verified INTEGER DEFAULT 0,
nodes_used TEXT, -- JSON array of node types
workflow_json TEXT NOT NULL, -- Complete workflow JSON
workflow_json TEXT, -- Complete workflow JSON (deprecated, use workflow_json_compressed)
workflow_json_compressed TEXT, -- Compressed workflow JSON (base64 encoded gzip)
categories TEXT, -- JSON array of categories
views INTEGER DEFAULT 0,
created_at DATETIME,

View File

@@ -4,8 +4,10 @@ import { TemplateService } from '../templates/template-service';
import * as fs from 'fs';
import * as path from 'path';
async function fetchTemplates() {
console.log('🌐 Fetching n8n workflow templates...\n');
async function fetchTemplates(mode: 'rebuild' | 'update' = 'rebuild') {
const modeEmoji = mode === 'rebuild' ? '🔄' : '⬆️';
const modeText = mode === 'rebuild' ? 'Rebuilding' : 'Updating';
console.log(`${modeEmoji} ${modeText} n8n workflow templates...\n`);
// Ensure data directory exists
const dataDir = './data';
@@ -16,13 +18,17 @@ async function fetchTemplates() {
// Initialize database
const db = await createDatabaseAdapter('./data/nodes.db');
// Drop existing templates table to ensure clean schema
try {
db.exec('DROP TABLE IF EXISTS templates');
db.exec('DROP TABLE IF EXISTS templates_fts');
console.log('🗑️ Dropped existing templates tables\n');
} catch (error) {
// Ignore errors if tables don't exist
// Only drop tables in rebuild mode
if (mode === 'rebuild') {
try {
db.exec('DROP TABLE IF EXISTS templates');
db.exec('DROP TABLE IF EXISTS templates_fts');
console.log('🗑️ Dropped existing templates tables (rebuild mode)\n');
} catch (error) {
// Ignore errors if tables don't exist
}
} else {
console.log('📊 Update mode: Keeping existing templates\n');
}
// Apply schema with updated constraint
@@ -86,10 +92,10 @@ async function fetchTemplates() {
process.stdout.write('\r' + ' '.repeat(lastMessage.length) + '\r');
}
const progress = Math.round((current / total) * 100);
const progress = total > 0 ? Math.round((current / total) * 100) : 0;
lastMessage = `📊 ${message}: ${current}/${total} (${progress}%)`;
process.stdout.write(lastMessage);
});
}, mode);
console.log('\n'); // New line after progress
@@ -119,9 +125,34 @@ async function fetchTemplates() {
}
}
// Parse command line arguments
function parseArgs(): 'rebuild' | 'update' {
const args = process.argv.slice(2);
// Check for --mode flag
const modeIndex = args.findIndex(arg => arg.startsWith('--mode'));
if (modeIndex !== -1) {
const modeArg = args[modeIndex];
const mode = modeArg.includes('=') ? modeArg.split('=')[1] : args[modeIndex + 1];
if (mode === 'update') {
return 'update';
}
}
// Check for --update flag as shorthand
if (args.includes('--update')) {
return 'update';
}
// Default to rebuild
return 'rebuild';
}
// Run if called directly
if (require.main === module) {
fetchTemplates().catch(console.error);
const mode = parseArgs();
fetchTemplates(mode).catch(console.error);
}
export { fetchTemplates };

View File

@@ -52,7 +52,7 @@ async function runValidationSummary() {
for (const template of templates) {
try {
const workflow = JSON.parse(template.workflow_json);
const workflow = JSON.parse(template.workflow_json || '{}');
const validationResult = await validator.validateWorkflow(workflow, {
profile: 'minimal' // Use minimal profile to focus on critical errors
});

View File

@@ -39,58 +39,75 @@ export interface TemplateDetail {
export class TemplateFetcher {
private readonly baseUrl = 'https://api.n8n.io/api/templates';
private readonly pageSize = 100;
private readonly pageSize = 250; // Maximum allowed by API
/**
* Fetch all templates and filter to last 12 months
* This fetches ALL pages first, then applies date filter locally
*/
async fetchTemplates(progressCallback?: (current: number, total: number) => void): Promise<TemplateWorkflow[]> {
const allTemplates = await this.fetchAllTemplates(progressCallback);
// Apply date filter locally after fetching all
const oneYearAgo = new Date();
oneYearAgo.setMonth(oneYearAgo.getMonth() - 12);
const recentTemplates = allTemplates.filter((w: TemplateWorkflow) => {
const createdDate = new Date(w.createdAt);
return createdDate >= oneYearAgo;
});
logger.info(`Filtered to ${recentTemplates.length} templates from last 12 months (out of ${allTemplates.length} total)`);
return recentTemplates;
}
/**
* Fetch ALL templates from the API without date filtering
* Used internally and can be used for other filtering strategies
*/
async fetchAllTemplates(progressCallback?: (current: number, total: number) => void): Promise<TemplateWorkflow[]> {
const allTemplates: TemplateWorkflow[] = [];
let page = 1;
let hasMore = true;
let totalWorkflows = 0;
logger.info('Starting template fetch from n8n.io API');
logger.info('Starting complete template fetch from n8n.io API');
while (hasMore) {
try {
const response = await axios.get(`${this.baseUrl}/search`, {
params: {
page,
rows: this.pageSize,
sort_by: 'last-updated'
rows: this.pageSize
// Note: sort_by parameter doesn't work, templates come in popularity order
}
});
const { workflows, totalWorkflows } = response.data;
const { workflows } = response.data;
totalWorkflows = response.data.totalWorkflows || totalWorkflows;
// Filter templates by date
const recentTemplates = workflows.filter((w: TemplateWorkflow) => {
const createdDate = new Date(w.createdAt);
return createdDate >= oneYearAgo;
});
allTemplates.push(...workflows);
// If we hit templates older than 1 year, stop fetching
if (recentTemplates.length < workflows.length) {
hasMore = false;
logger.info(`Reached templates older than 1 year at page ${page}`);
}
allTemplates.push(...recentTemplates);
// Calculate total pages for better progress reporting
const totalPages = Math.ceil(totalWorkflows / this.pageSize);
if (progressCallback) {
progressCallback(allTemplates.length, Math.min(totalWorkflows, allTemplates.length + 500));
// Enhanced progress with page information
progressCallback(allTemplates.length, totalWorkflows);
}
logger.debug(`Fetched page ${page}/${totalPages}: ${workflows.length} templates (total so far: ${allTemplates.length}/${totalWorkflows})`);
// Check if there are more pages
if (workflows.length < this.pageSize || allTemplates.length >= totalWorkflows) {
if (workflows.length < this.pageSize) {
hasMore = false;
}
page++;
// Rate limiting - be nice to the API
// Rate limiting - be nice to the API (slightly faster with 250 rows/page)
if (hasMore) {
await this.sleep(500); // 500ms between requests
await this.sleep(300); // 300ms between requests (was 500ms with 100 rows)
}
} catch (error) {
logger.error(`Error fetching templates page ${page}:`, error);
@@ -98,7 +115,7 @@ export class TemplateFetcher {
}
}
logger.info(`Fetched ${allTemplates.length} templates from last year`);
logger.info(`Fetched all ${allTemplates.length} templates from n8n.io`);
return allTemplates;
}
@@ -131,8 +148,8 @@ export class TemplateFetcher {
progressCallback(i + 1, workflows.length);
}
// Rate limiting
await this.sleep(200); // 200ms between requests
// Rate limiting (conservative to avoid API throttling)
await this.sleep(150); // 150ms between requests
} catch (error) {
logger.error(`Failed to fetch details for workflow ${workflow.id}:`, error);
// Continue with other templates

View File

@@ -2,6 +2,7 @@ import { DatabaseAdapter } from '../database/database-adapter';
import { TemplateWorkflow, TemplateDetail } from './template-fetcher';
import { logger } from '../utils/logger';
import { TemplateSanitizer } from '../utils/template-sanitizer';
import * as zlib from 'zlib';
export interface StoredTemplate {
id: number;
@@ -12,7 +13,8 @@ export interface StoredTemplate {
author_username: string;
author_verified: number;
nodes_used: string; // JSON string
workflow_json: string; // JSON string
workflow_json?: string; // JSON string (deprecated)
workflow_json_compressed?: string; // Base64 encoded gzip
categories: string; // JSON string
views: number;
created_at: string;
@@ -105,10 +107,16 @@ export class TemplateRepository {
* Save a template to the database
*/
saveTemplate(workflow: TemplateWorkflow, detail: TemplateDetail, categories: string[] = []): void {
// Filter out templates with 10 or fewer views
if ((workflow.totalViews || 0) <= 10) {
logger.debug(`Skipping template ${workflow.id}: ${workflow.name} (only ${workflow.totalViews} views)`);
return;
}
const stmt = this.db.prepare(`
INSERT OR REPLACE INTO templates (
id, workflow_id, name, description, author_name, author_username,
author_verified, nodes_used, workflow_json, categories, views,
author_verified, nodes_used, workflow_json_compressed, categories, views,
created_at, updated_at, url
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
@@ -133,6 +141,17 @@ export class TemplateRepository {
});
}
// Compress the workflow JSON
const workflowJsonStr = JSON.stringify(sanitizedWorkflow);
const compressed = zlib.gzipSync(workflowJsonStr);
const compressedBase64 = compressed.toString('base64');
// Log compression ratio
const originalSize = Buffer.byteLength(workflowJsonStr);
const compressedSize = compressed.length;
const ratio = Math.round((1 - compressedSize / originalSize) * 100);
logger.debug(`Template ${workflow.id} compression: ${originalSize}${compressedSize} bytes (${ratio}% reduction)`);
stmt.run(
workflow.id,
workflow.id,
@@ -142,7 +161,7 @@ export class TemplateRepository {
workflow.user.username,
workflow.user.verified ? 1 : 0,
JSON.stringify(nodeTypes),
JSON.stringify(sanitizedWorkflow),
compressedBase64,
JSON.stringify(categories),
workflow.totalViews || 0,
workflow.createdAt,
@@ -165,7 +184,8 @@ export class TemplateRepository {
`;
const params = [...nodeTypes.map(n => `%"${n}"%`), limit];
return this.db.prepare(query).all(...params) as StoredTemplate[];
const results = this.db.prepare(query).all(...params) as StoredTemplate[];
return results.map(t => this.decompressWorkflow(t));
}
/**
@@ -176,7 +196,37 @@ export class TemplateRepository {
SELECT * FROM templates WHERE id = ?
`).get(templateId) as StoredTemplate | undefined;
return row || null;
if (!row) return null;
// Decompress workflow JSON if compressed
if (row.workflow_json_compressed && !row.workflow_json) {
try {
const compressed = Buffer.from(row.workflow_json_compressed, 'base64');
const decompressed = zlib.gunzipSync(compressed);
row.workflow_json = decompressed.toString();
} catch (error) {
logger.error(`Failed to decompress workflow for template ${templateId}:`, error);
return null;
}
}
return row;
}
/**
* Decompress workflow JSON for a template
*/
private decompressWorkflow(template: StoredTemplate): StoredTemplate {
if (template.workflow_json_compressed && !template.workflow_json) {
try {
const compressed = Buffer.from(template.workflow_json_compressed, 'base64');
const decompressed = zlib.gunzipSync(compressed);
template.workflow_json = decompressed.toString();
} catch (error) {
logger.error(`Failed to decompress workflow for template ${template.id}:`, error);
}
}
return template;
}
/**
@@ -209,7 +259,7 @@ export class TemplateRepository {
`).all(ftsQuery, limit) as StoredTemplate[];
logger.debug(`FTS5 search returned ${results.length} results`);
return results;
return results.map(t => this.decompressWorkflow(t));
} catch (error: any) {
// If FTS5 query fails, fallback to LIKE search
logger.warn('FTS5 template search failed, using LIKE fallback:', {
@@ -236,7 +286,7 @@ export class TemplateRepository {
`).all(likeQuery, likeQuery, limit) as StoredTemplate[];
logger.debug(`LIKE search returned ${results.length} results`);
return results;
return results.map(t => this.decompressWorkflow(t));
}
/**
@@ -269,11 +319,12 @@ export class TemplateRepository {
* Get all templates with limit
*/
getAllTemplates(limit: number = 10): StoredTemplate[] {
return this.db.prepare(`
const results = this.db.prepare(`
SELECT * FROM templates
ORDER BY views DESC, created_at DESC
LIMIT ?
`).all(limit) as StoredTemplate[];
return results.map(t => this.decompressWorkflow(t));
}
/**
@@ -284,6 +335,41 @@ export class TemplateRepository {
return result.count;
}
/**
* Get all existing template IDs for comparison
* Used in update mode to skip already fetched templates
*/
getExistingTemplateIds(): Set<number> {
const rows = this.db.prepare('SELECT id FROM templates').all() as { id: number }[];
return new Set(rows.map(r => r.id));
}
/**
* Check if a template exists in the database
*/
hasTemplate(templateId: number): boolean {
const result = this.db.prepare('SELECT 1 FROM templates WHERE id = ?').get(templateId) as { 1: number } | undefined;
return result !== undefined;
}
/**
* Get template metadata (id, name, updated_at) for all templates
* Used for comparison in update scenarios
*/
getTemplateMetadata(): Map<number, { name: string; updated_at: string }> {
const rows = this.db.prepare('SELECT id, name, updated_at FROM templates').all() as {
id: number;
name: string;
updated_at: string;
}[];
const metadata = new Map<number, { name: string; updated_at: string }>();
for (const row of rows) {
metadata.set(row.id, { name: row.name, updated_at: row.updated_at });
}
return metadata;
}
/**
* Get template statistics
*/

View File

@@ -47,7 +47,7 @@ export class TemplateService {
return {
...this.formatTemplateInfo(template),
workflow: JSON.parse(template.workflow_json)
workflow: JSON.parse(template.workflow_json || '{}')
};
}
@@ -94,36 +94,59 @@ export class TemplateService {
/**
* Fetch and update templates from n8n.io
* @param mode - 'rebuild' to clear and rebuild, 'update' to add only new templates
*/
async fetchAndUpdateTemplates(
progressCallback?: (message: string, current: number, total: number) => void
progressCallback?: (message: string, current: number, total: number) => void,
mode: 'rebuild' | 'update' = 'rebuild'
): Promise<void> {
try {
// Dynamically import fetcher only when needed (requires axios)
const { TemplateFetcher } = await import('./template-fetcher');
const fetcher = new TemplateFetcher();
// Clear existing templates
this.repository.clearTemplates();
// Get existing template IDs if in update mode
let existingIds: Set<number> = new Set();
if (mode === 'update') {
existingIds = this.repository.getExistingTemplateIds();
logger.info(`Update mode: Found ${existingIds.size} existing templates in database`);
} else {
// Clear existing templates in rebuild mode
this.repository.clearTemplates();
logger.info('Rebuild mode: Cleared existing templates');
}
// Fetch template list
logger.info('Fetching template list from n8n.io');
logger.info(`Fetching template list from n8n.io (mode: ${mode})`);
const templates = await fetcher.fetchTemplates((current, total) => {
progressCallback?.('Fetching template list', current, total);
});
logger.info(`Found ${templates.length} templates from last year`);
logger.info(`Found ${templates.length} templates from last 12 months`);
// Filter to only new templates if in update mode
let templatesToFetch = templates;
if (mode === 'update') {
templatesToFetch = templates.filter(t => !existingIds.has(t.id));
logger.info(`Update mode: ${templatesToFetch.length} new templates to fetch (skipping ${templates.length - templatesToFetch.length} existing)`);
if (templatesToFetch.length === 0) {
logger.info('No new templates to fetch');
progressCallback?.('No new templates', 0, 0);
return;
}
}
// Fetch details for each template
logger.info('Fetching template details');
const details = await fetcher.fetchAllTemplateDetails(templates, (current, total) => {
logger.info(`Fetching details for ${templatesToFetch.length} templates`);
const details = await fetcher.fetchAllTemplateDetails(templatesToFetch, (current, total) => {
progressCallback?.('Fetching template details', current, total);
});
// Save to database
logger.info('Saving templates to database');
let saved = 0;
for (const template of templates) {
for (const template of templatesToFetch) {
const detail = details.get(template.id);
if (detail) {
this.repository.saveTemplate(template, detail);
@@ -134,8 +157,10 @@ export class TemplateService {
logger.info(`Successfully saved ${saved} templates to database`);
// Rebuild FTS5 index after bulk import
logger.info('Rebuilding FTS5 index for templates');
this.repository.rebuildTemplateFTS();
if (saved > 0) {
logger.info('Rebuilding FTS5 index for templates');
this.repository.rebuildTemplateFTS();
}
progressCallback?.('Complete', saved, saved);
} catch (error) {