const { Command } = require('commander'); const fs = require('fs-extra'); const path = require('node:path'); const process = require('node:process'); // Modularized components const { findProjectRoot } = require('./projectRoot.js'); const { promptYesNo, promptPath } = require('./prompts.js'); const { discoverFiles, filterFiles, aggregateFileContents } = require('./files.js'); const { generateXMLOutput } = require('./xml.js'); const { calculateStatistics } = require('./stats.js'); /** * Recursively discover all files in a directory * @param {string} rootDir - The root directory to scan * @returns {Promise} Array of file paths */ /** * Parse .gitignore file and return ignore patterns * @param {string} gitignorePath - Path to .gitignore file * @returns {Promise} Array of ignore patterns */ /** * Check if a file is binary using file command and heuristics * @param {string} filePath - Path to the file * @returns {Promise} True if file is binary */ /** * Read and aggregate content from text files * @param {string[]} files - Array of file paths * @param {string} rootDir - The root directory * @param {Object} spinner - Optional spinner instance for progress display * @returns {Promise} Object containing file contents and metadata */ /** * Generate XML output with aggregated file contents using streaming * @param {Object} aggregatedContent - The aggregated content object * @param {string} outputPath - The output file path * @returns {Promise} Promise that resolves when writing is complete */ /** * Calculate statistics for the processed files * @param {Object} aggregatedContent - The aggregated content object * @param {number} xmlFileSize - The size of the generated XML file in bytes * @returns {Object} Statistics object */ /** * Filter files based on .gitignore patterns * @param {string[]} files - Array of file paths * @param {string} rootDir - The root directory * @returns {Promise} Filtered array of file paths */ /** * Attempt to find the project root by walking up from startDir * Looks for common project markers like .git, package.json, pyproject.toml, etc. * @param {string} startDir * @returns {Promise} project root directory or null if not found */ const program = new Command(); program .name('bmad-flatten') .description('BMAD-METHODโ„ข codebase flattener tool') .version('1.0.0') .option('-i, --input ', 'Input directory to flatten', process.cwd()) .option('-o, --output ', 'Output file path', 'flattened-codebase.xml') .action(async (options) => { let inputDir = path.resolve(options.input); let outputPath = path.resolve(options.output); // Detect if user explicitly provided -i/--input or -o/--output const argv = process.argv.slice(2); const userSpecifiedInput = argv.some( (a) => a === '-i' || a === '--input' || a.startsWith('--input='), ); const userSpecifiedOutput = argv.some( (a) => a === '-o' || a === '--output' || a.startsWith('--output='), ); const noPathArguments = !userSpecifiedInput && !userSpecifiedOutput; if (noPathArguments) { const detectedRoot = await findProjectRoot(process.cwd()); const suggestedOutput = detectedRoot ? path.join(detectedRoot, 'flattened-codebase.xml') : path.resolve('flattened-codebase.xml'); if (detectedRoot) { const useDefaults = await promptYesNo( `Detected project root at "${detectedRoot}". Use it as input and write output to "${suggestedOutput}"?`, true, ); if (useDefaults) { inputDir = detectedRoot; outputPath = suggestedOutput; } else { inputDir = await promptPath('Enter input directory path', process.cwd()); outputPath = await promptPath( 'Enter output file path', path.join(inputDir, 'flattened-codebase.xml'), ); } } else { console.log('Could not auto-detect a project root.'); inputDir = await promptPath('Enter input directory path', process.cwd()); outputPath = await promptPath( 'Enter output file path', path.join(inputDir, 'flattened-codebase.xml'), ); } } // Ensure output directory exists await fs.ensureDir(path.dirname(outputPath)); try { // Verify input directory exists if (!(await fs.pathExists(inputDir))) { console.error(`โŒ Error: Input directory does not exist: ${inputDir}`); process.exit(1); } // Import ora dynamically const { default: ora } = await import('ora'); // Start file discovery with spinner const discoverySpinner = ora('๐Ÿ” Discovering files...').start(); const files = await discoverFiles(inputDir); const filteredFiles = await filterFiles(files, inputDir); discoverySpinner.succeed(`๐Ÿ“ Found ${filteredFiles.length} files to include`); // Process files with progress tracking console.log('Reading file contents'); const processingSpinner = ora('๐Ÿ“„ Processing files...').start(); const aggregatedContent = await aggregateFileContents( filteredFiles, inputDir, processingSpinner, ); processingSpinner.succeed( `โœ… Processed ${aggregatedContent.processedFiles}/${filteredFiles.length} files`, ); if (aggregatedContent.errors.length > 0) { console.log(`Errors: ${aggregatedContent.errors.length}`); } // Generate XML output using streaming const xmlSpinner = ora('๐Ÿ”ง Generating XML output...').start(); await generateXMLOutput(aggregatedContent, outputPath); xmlSpinner.succeed('๐Ÿ“ XML generation completed'); // Calculate and display statistics const outputStats = await fs.stat(outputPath); const stats = await calculateStatistics(aggregatedContent, outputStats.size, inputDir); // Display completion summary console.log('\n๐Ÿ“Š Completion Summary:'); console.log( `โœ… Successfully processed ${filteredFiles.length} files into ${path.basename(outputPath)}`, ); console.log(`๐Ÿ“ Output file: ${outputPath}`); console.log(`๐Ÿ“ Total source size: ${stats.totalSize}`); console.log(`๐Ÿ“„ Generated XML size: ${stats.xmlSize}`); console.log(`๐Ÿ“ Total lines of code: ${stats.totalLines.toLocaleString()}`); console.log(`๐Ÿ”ข Estimated tokens: ${stats.estimatedTokens}`); console.log( `๐Ÿ“Š File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors\n`, ); // Ask user if they want detailed stats + markdown report const generateDetailed = await promptYesNo( 'Generate detailed stats (console + markdown) now?', true, ); if (generateDetailed) { // Additional detailed stats console.log('\n๐Ÿ“ˆ Size Percentiles:'); console.log( ` Avg: ${Math.round(stats.avgFileSize).toLocaleString()} B, Median: ${Math.round( stats.medianFileSize, ).toLocaleString()} B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`, ); if (Array.isArray(stats.histogram) && stats.histogram.length > 0) { console.log('\n๐Ÿงฎ Size Histogram:'); for (const b of stats.histogram.slice(0, 2)) { console.log(` ${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`); } if (stats.histogram.length > 2) { console.log(` โ€ฆ and ${stats.histogram.length - 2} more buckets`); } } if (Array.isArray(stats.byExtension) && stats.byExtension.length > 0) { const topExt = stats.byExtension.slice(0, 2); console.log('\n๐Ÿ“ฆ Top Extensions:'); for (const e of topExt) { const pct = stats.totalBytes ? (e.bytes / stats.totalBytes) * 100 : 0; console.log( ` ${e.ext}: ${e.count} files, ${e.bytes.toLocaleString()} bytes (${pct.toFixed( 2, )}%)`, ); } if (stats.byExtension.length > 2) { console.log(` โ€ฆ and ${stats.byExtension.length - 2} more extensions`); } } if (Array.isArray(stats.byDirectory) && stats.byDirectory.length > 0) { const topDir = stats.byDirectory.slice(0, 2); console.log('\n๐Ÿ“‚ Top Directories:'); for (const d of topDir) { const pct = stats.totalBytes ? (d.bytes / stats.totalBytes) * 100 : 0; console.log( ` ${d.dir}: ${d.count} files, ${d.bytes.toLocaleString()} bytes (${pct.toFixed( 2, )}%)`, ); } if (stats.byDirectory.length > 2) { console.log(` โ€ฆ and ${stats.byDirectory.length - 2} more directories`); } } if (Array.isArray(stats.depthDistribution) && stats.depthDistribution.length > 0) { console.log('\n๐ŸŒณ Depth Distribution:'); const dd = stats.depthDistribution.slice(0, 2); let line = ' ' + dd.map((d) => `${d.depth}:${d.count}`).join(' '); if (stats.depthDistribution.length > 2) { line += ` โ€ฆ +${stats.depthDistribution.length - 2} more`; } console.log(line); } if (Array.isArray(stats.longestPaths) && stats.longestPaths.length > 0) { console.log('\n๐Ÿงต Longest Paths:'); for (const p of stats.longestPaths.slice(0, 2)) { console.log(` ${p.path} (${p.length} chars, ${p.size.toLocaleString()} bytes)`); } if (stats.longestPaths.length > 2) { console.log(` โ€ฆ and ${stats.longestPaths.length - 2} more paths`); } } if (stats.temporal) { console.log('\nโฑ๏ธ Temporal:'); if (stats.temporal.oldest) { console.log( ` Oldest: ${stats.temporal.oldest.path} (${stats.temporal.oldest.mtime})`, ); } if (stats.temporal.newest) { console.log( ` Newest: ${stats.temporal.newest.path} (${stats.temporal.newest.mtime})`, ); } if (Array.isArray(stats.temporal.ageBuckets)) { console.log(' Age buckets:'); for (const b of stats.temporal.ageBuckets.slice(0, 2)) { console.log(` ${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`); } if (stats.temporal.ageBuckets.length > 2) { console.log(` โ€ฆ and ${stats.temporal.ageBuckets.length - 2} more buckets`); } } } if (stats.quality) { console.log('\nโœ… Quality Signals:'); console.log(` Zero-byte files: ${stats.quality.zeroByteFiles}`); console.log(` Empty text files: ${stats.quality.emptyTextFiles}`); console.log(` Hidden files: ${stats.quality.hiddenFiles}`); console.log(` Symlinks: ${stats.quality.symlinks}`); console.log( ` Large files (>= ${(stats.quality.largeThreshold / (1024 * 1024)).toFixed( 0, )} MB): ${stats.quality.largeFilesCount}`, ); console.log( ` Suspiciously large files (>= 100 MB): ${stats.quality.suspiciousLargeFilesCount}`, ); } if (Array.isArray(stats.duplicateCandidates) && stats.duplicateCandidates.length > 0) { console.log('\n๐Ÿงฌ Duplicate Candidates:'); for (const d of stats.duplicateCandidates.slice(0, 2)) { console.log(` ${d.reason}: ${d.count} files @ ${d.size.toLocaleString()} bytes`); } if (stats.duplicateCandidates.length > 2) { console.log(` โ€ฆ and ${stats.duplicateCandidates.length - 2} more groups`); } } if (typeof stats.compressibilityRatio === 'number') { console.log( `\n๐Ÿ—œ๏ธ Compressibility ratio (sampled): ${(stats.compressibilityRatio * 100).toFixed( 2, )}%`, ); } if (stats.git && stats.git.isRepo) { console.log('\n๐Ÿ”ง Git:'); console.log( ` Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`, ); console.log( ` Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`, ); if (Array.isArray(stats.git.lfsCandidates) && stats.git.lfsCandidates.length > 0) { console.log(' LFS candidates (top 2):'); for (const f of stats.git.lfsCandidates.slice(0, 2)) { console.log(` ${f.path} (${f.size.toLocaleString()} bytes)`); } if (stats.git.lfsCandidates.length > 2) { console.log(` โ€ฆ and ${stats.git.lfsCandidates.length - 2} more`); } } } if (Array.isArray(stats.largestFiles) && stats.largestFiles.length > 0) { console.log('\n๐Ÿ“š Largest Files (top 2):'); for (const f of stats.largestFiles.slice(0, 2)) { // Show LOC for text files when available; omit ext and mtime let locStr = ''; if (!f.isBinary && Array.isArray(aggregatedContent?.textFiles)) { const tf = aggregatedContent.textFiles.find((t) => t.path === f.path); if (tf && typeof tf.lines === 'number') { locStr = `, LOC: ${tf.lines.toLocaleString()}`; } } console.log( ` ${f.path} โ€“ ${f.sizeFormatted} (${f.percentOfTotal.toFixed(2)}%)${locStr}`, ); } if (stats.largestFiles.length > 2) { console.log(` โ€ฆ and ${stats.largestFiles.length - 2} more files`); } } // Write a comprehensive markdown report next to the XML { const mdPath = outputPath.endsWith('.xml') ? outputPath.replace(/\.xml$/i, '.stats.md') : outputPath + '.stats.md'; try { const pct = (num, den) => (den ? (num / den) * 100 : 0); const md = []; md.push( `# ๐Ÿงพ Flatten Stats for ${path.basename(outputPath)}`, '', '## ๐Ÿ“Š Summary', `- Total source size: ${stats.totalSize}`, `- Generated XML size: ${stats.xmlSize}`, `- Total lines of code: ${stats.totalLines.toLocaleString()}`, `- Estimated tokens: ${stats.estimatedTokens}`, `- File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`, '', '## ๐Ÿ“ˆ Size Percentiles', `Avg: ${Math.round(stats.avgFileSize).toLocaleString()} B, Median: ${Math.round( stats.medianFileSize, ).toLocaleString()} B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`, '', ); // Histogram if (Array.isArray(stats.histogram) && stats.histogram.length > 0) { md.push( '## ๐Ÿงฎ Size Histogram', '| Bucket | Files | Bytes |', '| --- | ---: | ---: |', ); for (const b of stats.histogram) { md.push(`| ${b.label} | ${b.count} | ${b.bytes.toLocaleString()} |`); } md.push(''); } // Top Extensions if (Array.isArray(stats.byExtension) && stats.byExtension.length > 0) { md.push( '## ๐Ÿ“ฆ Top Extensions by Bytes (Top 20)', '| Ext | Files | Bytes | % of total |', '| --- | ---: | ---: | ---: |', ); for (const e of stats.byExtension.slice(0, 20)) { const p = pct(e.bytes, stats.totalBytes); md.push( `| ${e.ext} | ${e.count} | ${e.bytes.toLocaleString()} | ${p.toFixed(2)}% |`, ); } md.push(''); } // Top Directories if (Array.isArray(stats.byDirectory) && stats.byDirectory.length > 0) { md.push( '## ๐Ÿ“‚ Top Directories by Bytes (Top 20)', '| Directory | Files | Bytes | % of total |', '| --- | ---: | ---: | ---: |', ); for (const d of stats.byDirectory.slice(0, 20)) { const p = pct(d.bytes, stats.totalBytes); md.push( `| ${d.dir} | ${d.count} | ${d.bytes.toLocaleString()} | ${p.toFixed(2)}% |`, ); } md.push(''); } // Depth distribution if (Array.isArray(stats.depthDistribution) && stats.depthDistribution.length > 0) { md.push('## ๐ŸŒณ Depth Distribution', '| Depth | Count |', '| ---: | ---: |'); for (const d of stats.depthDistribution) { md.push(`| ${d.depth} | ${d.count} |`); } md.push(''); } // Longest paths if (Array.isArray(stats.longestPaths) && stats.longestPaths.length > 0) { md.push( '## ๐Ÿงต Longest Paths (Top 25)', '| Path | Length | Bytes |', '| --- | ---: | ---: |', ); for (const pth of stats.longestPaths) { md.push(`| ${pth.path} | ${pth.length} | ${pth.size.toLocaleString()} |`); } md.push(''); } // Temporal if (stats.temporal) { md.push('## โฑ๏ธ Temporal'); if (stats.temporal.oldest) { md.push(`- Oldest: ${stats.temporal.oldest.path} (${stats.temporal.oldest.mtime})`); } if (stats.temporal.newest) { md.push(`- Newest: ${stats.temporal.newest.path} (${stats.temporal.newest.mtime})`); } if (Array.isArray(stats.temporal.ageBuckets)) { md.push('', '| Age | Files | Bytes |', '| --- | ---: | ---: |'); for (const b of stats.temporal.ageBuckets) { md.push(`| ${b.label} | ${b.count} | ${b.bytes.toLocaleString()} |`); } } md.push(''); } // Quality signals if (stats.quality) { md.push( '## โœ… Quality Signals', `- Zero-byte files: ${stats.quality.zeroByteFiles}`, `- Empty text files: ${stats.quality.emptyTextFiles}`, `- Hidden files: ${stats.quality.hiddenFiles}`, `- Symlinks: ${stats.quality.symlinks}`, `- Large files (>= ${(stats.quality.largeThreshold / (1024 * 1024)).toFixed(0)} MB): ${stats.quality.largeFilesCount}`, `- Suspiciously large files (>= 100 MB): ${stats.quality.suspiciousLargeFilesCount}`, '', ); } // Duplicates if (Array.isArray(stats.duplicateCandidates) && stats.duplicateCandidates.length > 0) { md.push( '## ๐Ÿงฌ Duplicate Candidates', '| Reason | Files | Size (bytes) |', '| --- | ---: | ---: |', ); for (const d of stats.duplicateCandidates) { md.push(`| ${d.reason} | ${d.count} | ${d.size.toLocaleString()} |`); } md.push('', '### ๐Ÿงฌ Duplicate Groups Details'); let dupIndex = 1; for (const d of stats.duplicateCandidates) { md.push( `#### Group ${dupIndex}: ${d.count} files @ ${d.size.toLocaleString()} bytes (${d.reason})`, ); if (Array.isArray(d.files) && d.files.length > 0) { for (const fp of d.files) { md.push(`- ${fp}`); } } else { md.push('- (file list unavailable)'); } md.push(''); dupIndex++; } md.push(''); } // Compressibility if (typeof stats.compressibilityRatio === 'number') { md.push( '## ๐Ÿ—œ๏ธ Compressibility', `Sampled compressibility ratio: ${(stats.compressibilityRatio * 100).toFixed(2)}%`, '', ); } // Git if (stats.git && stats.git.isRepo) { md.push( '## ๐Ÿ”ง Git', `- Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`, `- Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`, ); if (Array.isArray(stats.git.lfsCandidates) && stats.git.lfsCandidates.length > 0) { md.push('', '### ๐Ÿ“ฆ LFS Candidates (Top 20)', '| Path | Bytes |', '| --- | ---: |'); for (const f of stats.git.lfsCandidates.slice(0, 20)) { md.push(`| ${f.path} | ${f.size.toLocaleString()} |`); } } md.push(''); } // Largest Files if (Array.isArray(stats.largestFiles) && stats.largestFiles.length > 0) { md.push( '## ๐Ÿ“š Largest Files (Top 50)', '| Path | Size | % of total | LOC |', '| --- | ---: | ---: | ---: |', ); for (const f of stats.largestFiles) { let loc = ''; if (!f.isBinary && Array.isArray(aggregatedContent?.textFiles)) { const tf = aggregatedContent.textFiles.find((t) => t.path === f.path); if (tf && typeof tf.lines === 'number') { loc = tf.lines.toLocaleString(); } } md.push( `| ${f.path} | ${f.sizeFormatted} | ${f.percentOfTotal.toFixed(2)}% | ${loc} |`, ); } md.push(''); } await fs.writeFile(mdPath, md.join('\n')); console.log(`\n๐Ÿงพ Detailed stats report written to: ${mdPath}`); } catch (error) { console.warn(`โš ๏ธ Failed to write stats markdown: ${error.message}`); } } } } catch (error) { console.error('โŒ Critical error:', error.message); console.error('An unexpected error occurred.'); process.exit(1); } }); if (require.main === module) { program.parse(); } module.exports = program;