Files
BMAD-METHOD/tools/flattener/main.js
manjaroblack fab9d5e1f5 feat(flattener): prompt for detailed stats; polish .stats.md with emojis (#422)
* feat: add detailed statistics and markdown report generation to flattener tool

* fix: remove redundant error handling for project root detection
2025-08-16 08:03:28 -05:00

677 lines
24 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
const { Command } = require("commander");
const fs = require("fs-extra");
const path = require("node:path");
const process = require("node:process");
// Modularized components
const { findProjectRoot } = require("./projectRoot.js");
const { promptYesNo, promptPath } = require("./prompts.js");
const {
discoverFiles,
filterFiles,
aggregateFileContents,
} = require("./files.js");
const { generateXMLOutput } = require("./xml.js");
const { calculateStatistics } = require("./stats.js");
/**
* Recursively discover all files in a directory
* @param {string} rootDir - The root directory to scan
* @returns {Promise<string[]>} Array of file paths
*/
/**
* Parse .gitignore file and return ignore patterns
* @param {string} gitignorePath - Path to .gitignore file
* @returns {Promise<string[]>} Array of ignore patterns
*/
/**
* Check if a file is binary using file command and heuristics
* @param {string} filePath - Path to the file
* @returns {Promise<boolean>} True if file is binary
*/
/**
* Read and aggregate content from text files
* @param {string[]} files - Array of file paths
* @param {string} rootDir - The root directory
* @param {Object} spinner - Optional spinner instance for progress display
* @returns {Promise<Object>} Object containing file contents and metadata
*/
/**
* Generate XML output with aggregated file contents using streaming
* @param {Object} aggregatedContent - The aggregated content object
* @param {string} outputPath - The output file path
* @returns {Promise<void>} Promise that resolves when writing is complete
*/
/**
* Calculate statistics for the processed files
* @param {Object} aggregatedContent - The aggregated content object
* @param {number} xmlFileSize - The size of the generated XML file in bytes
* @returns {Object} Statistics object
*/
/**
* Filter files based on .gitignore patterns
* @param {string[]} files - Array of file paths
* @param {string} rootDir - The root directory
* @returns {Promise<string[]>} Filtered array of file paths
*/
/**
* Attempt to find the project root by walking up from startDir
* Looks for common project markers like .git, package.json, pyproject.toml, etc.
* @param {string} startDir
* @returns {Promise<string|null>} project root directory or null if not found
*/
// ---------------------------------------------------------------------------
// CLI helpers (private to this module)
// ---------------------------------------------------------------------------

/** File name used for the flattened output when the user does not pick one. */
const DEFAULT_OUTPUT_NAME = "flattened-codebase.xml";

/** Number of entries from each list shown in the console preview. */
const CONSOLE_PREVIEW_LIMIT = 2;

/** Matches a trailing `.xml` extension regardless of letter case. */
const XML_EXTENSION = /\.xml$/i;

/**
 * True when `value` is an array with at least one element.
 * @param {unknown} value
 * @returns {boolean}
 */
const hasItems = (value) => Array.isArray(value) && value.length > 0;

/**
 * Percentage of `total` represented by `part`; 0 when `total` is falsy.
 * @param {number} part
 * @param {number} total
 * @returns {number}
 */
const percentOf = (part, total) => (total ? (part / total) * 100 : 0);

/**
 * Escape pipe characters so a value cannot break out of a markdown table cell.
 * @param {unknown} value
 * @returns {string}
 */
const escapeCell = (value) => String(value).replace(/\|/g, "\\|");

/**
 * Detect whether an option was given explicitly on the command line.
 * Recognizes `-i value`, `-ivalue`, `--input value` and `--input=value`.
 * @param {string[]} argv - Command-line arguments (without node/script)
 * @param {string} shortFlag - Short flag, e.g. "-i"
 * @param {string} longFlag - Long flag, e.g. "--input"
 * @returns {boolean}
 */
function wasOptionProvided(argv, shortFlag, longFlag) {
  return argv.some((arg) =>
    arg === longFlag ||
    arg.startsWith(`${longFlag}=`) ||
    // Covers both the bare short flag and a value attached to it (`-isrc`).
    arg.startsWith(shortFlag)
  );
}

/**
 * Ask the user for the input directory and then the output file path.
 * @returns {Promise<{inputDir: string, outputPath: string}>}
 */
async function promptForPaths() {
  const inputDir = await promptPath(
    "Enter input directory path",
    process.cwd(),
  );
  const outputPath = await promptPath(
    "Enter output file path",
    path.join(inputDir, DEFAULT_OUTPUT_NAME),
  );
  return { inputDir, outputPath };
}

/**
 * Determine input/output paths interactively: offer the detected project root
 * as a default and fall back to manual prompts when it is declined or missing.
 * @returns {Promise<{inputDir: string, outputPath: string}>}
 */
async function resolvePathsInteractively() {
  const detectedRoot = await findProjectRoot(process.cwd());
  if (!detectedRoot) {
    console.log("Could not auto-detect a project root.");
    return promptForPaths();
  }
  const suggestedOutput = path.join(detectedRoot, DEFAULT_OUTPUT_NAME);
  const useDefaults = await promptYesNo(
    `Detected project root at "${detectedRoot}". Use it as input and write output to "${suggestedOutput}"?`,
    true,
  );
  return useDefaults
    ? { inputDir: detectedRoot, outputPath: suggestedOutput }
    : promptForPaths();
}

/**
 * Path of the markdown stats report written next to the output file.
 * A trailing `.xml` (any letter case) is replaced; otherwise the suffix is appended.
 * @param {string} outputPath
 * @returns {string}
 */
function statsReportPath(outputPath) {
  return XML_EXTENSION.test(outputPath)
    ? outputPath.replace(XML_EXTENSION, ".stats.md")
    : `${outputPath}.stats.md`;
}

/**
 * Index the aggregated text files by path so LOC lookups do not rescan the
 * list for every row. The first entry for a path wins.
 * @param {Object} aggregatedContent
 * @returns {Map<string, Object>}
 */
function indexTextFilesByPath(aggregatedContent) {
  const byPath = new Map();
  const textFiles = aggregatedContent?.textFiles;
  if (!Array.isArray(textFiles)) {
    return byPath;
  }
  for (const textFile of textFiles) {
    if (textFile && !byPath.has(textFile.path)) {
      byPath.set(textFile.path, textFile);
    }
  }
  return byPath;
}

/**
 * Line count for a "largest files" entry, when it is a text file with a
 * numeric `lines` value.
 * @param {Object} file - Entry from stats.largestFiles
 * @param {Map<string, Object>} textFilesByPath
 * @returns {number|undefined}
 */
function lineCountFor(file, textFilesByPath) {
  if (file.isBinary) {
    return undefined;
  }
  const lines = textFilesByPath.get(file.path)?.lines;
  return typeof lines === "number" ? lines : undefined;
}

/**
 * One-line summary of file-size percentiles, shared by console and markdown.
 * @param {Object} stats
 * @returns {string}
 */
function formatPercentiles(stats) {
  const avg = Math.round(stats.avgFileSize).toLocaleString();
  const median = Math.round(stats.medianFileSize).toLocaleString();
  return `Avg: ${avg} B, Median: ${median} B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`;
}

/**
 * Quality-signal lines (without any list prefix), shared by console and markdown.
 * @param {Object} quality - stats.quality
 * @returns {string[]}
 */
function qualityLines(quality) {
  const largeMb = (quality.largeThreshold / (1024 * 1024)).toFixed(0);
  return [
    `Zero-byte files: ${quality.zeroByteFiles}`,
    `Empty text files: ${quality.emptyTextFiles}`,
    `Hidden files: ${quality.hiddenFiles}`,
    `Symlinks: ${quality.symlinks}`,
    `Large files (>= ${largeMb} MB): ${quality.largeFilesCount}`,
    `Suspiciously large files (>= 100 MB): ${quality.suspiciousLargeFilesCount}`,
  ];
}

/**
 * Print the first CONSOLE_PREVIEW_LIMIT items and a "… and N more" line when
 * the list is longer.
 * @param {Array} items
 * @param {(item: any) => string} formatItem - Text for one item (no indent)
 * @param {string} noun - Word appended after "more"; may be empty
 */
function printPreview(items, formatItem, noun) {
  for (const item of items.slice(0, CONSOLE_PREVIEW_LIMIT)) {
    console.log(` ${formatItem(item)}`);
  }
  const hidden = items.length - CONSOLE_PREVIEW_LIMIT;
  if (hidden > 0) {
    console.log(` … and ${hidden} more${noun ? ` ${noun}` : ""}`);
  }
}

/**
 * Print the abbreviated detailed statistics to the console.
 * @param {Object} stats - Result of calculateStatistics
 * @param {Map<string, Object>} textFilesByPath - From indexTextFilesByPath
 */
function printDetailedStats(stats, textFilesByPath) {
  console.log("\n📈 Size Percentiles:");
  console.log(` ${formatPercentiles(stats)}`);

  if (hasItems(stats.histogram)) {
    console.log("\n🧮 Size Histogram:");
    printPreview(
      stats.histogram,
      (b) => `${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`,
      "buckets",
    );
  }

  if (hasItems(stats.byExtension)) {
    console.log("\n📦 Top Extensions:");
    printPreview(
      stats.byExtension,
      (e) =>
        `${e.ext}: ${e.count} files, ${e.bytes.toLocaleString()} bytes (${
          percentOf(e.bytes, stats.totalBytes).toFixed(2)
        }%)`,
      "extensions",
    );
  }

  if (hasItems(stats.byDirectory)) {
    console.log("\n📂 Top Directories:");
    printPreview(
      stats.byDirectory,
      (d) =>
        `${d.dir}: ${d.count} files, ${d.bytes.toLocaleString()} bytes (${
          percentOf(d.bytes, stats.totalBytes).toFixed(2)
        }%)`,
      "directories",
    );
  }

  if (hasItems(stats.depthDistribution)) {
    console.log("\n🌳 Depth Distribution:");
    const shown = stats.depthDistribution
      .slice(0, CONSOLE_PREVIEW_LIMIT)
      .map((d) => `${d.depth}:${d.count}`)
      .join(" ");
    const hidden = stats.depthDistribution.length - CONSOLE_PREVIEW_LIMIT;
    console.log(hidden > 0 ? ` ${shown} … +${hidden} more` : ` ${shown}`);
  }

  if (hasItems(stats.longestPaths)) {
    console.log("\n🧵 Longest Paths:");
    printPreview(
      stats.longestPaths,
      (p) => `${p.path} (${p.length} chars, ${p.size.toLocaleString()} bytes)`,
      "paths",
    );
  }

  if (stats.temporal) {
    const { oldest, newest, ageBuckets } = stats.temporal;
    console.log("\n⏱ Temporal:");
    if (oldest) {
      console.log(` Oldest: ${oldest.path} (${oldest.mtime})`);
    }
    if (newest) {
      console.log(` Newest: ${newest.path} (${newest.mtime})`);
    }
    if (Array.isArray(ageBuckets)) {
      console.log(" Age buckets:");
      printPreview(
        ageBuckets,
        (b) => `${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`,
        "buckets",
      );
    }
  }

  if (stats.quality) {
    console.log("\n✅ Quality Signals:");
    for (const line of qualityLines(stats.quality)) {
      console.log(` ${line}`);
    }
  }

  if (hasItems(stats.duplicateCandidates)) {
    console.log("\n🧬 Duplicate Candidates:");
    printPreview(
      stats.duplicateCandidates,
      (d) => `${d.reason}: ${d.count} files @ ${d.size.toLocaleString()} bytes`,
      "groups",
    );
  }

  if (typeof stats.compressibilityRatio === "number") {
    console.log(
      `\n🗜️ Compressibility ratio (sampled): ${
        (stats.compressibilityRatio * 100).toFixed(2)
      }%`,
    );
  }

  if (stats.git && stats.git.isRepo) {
    console.log("\n🔧 Git:");
    console.log(
      ` Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`,
    );
    console.log(
      ` Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`,
    );
    if (hasItems(stats.git.lfsCandidates)) {
      console.log(` LFS candidates (top ${CONSOLE_PREVIEW_LIMIT}):`);
      printPreview(
        stats.git.lfsCandidates,
        (f) => `${f.path} (${f.size.toLocaleString()} bytes)`,
        "",
      );
    }
  }

  if (hasItems(stats.largestFiles)) {
    console.log(`\n📚 Largest Files (top ${CONSOLE_PREVIEW_LIMIT}):`);
    printPreview(
      stats.largestFiles,
      (f) => {
        // Show LOC for text files when available; omit ext and mtime
        const lines = lineCountFor(f, textFilesByPath);
        const locStr = lines === undefined
          ? ""
          : `, LOC: ${lines.toLocaleString()}`;
        return `${f.path} ${f.sizeFormatted} (${
          f.percentOfTotal.toFixed(2)
        }%)${locStr}`;
      },
      "files",
    );
  }
}

/**
 * Build the full markdown statistics report.
 * @param {Object} stats - Result of calculateStatistics
 * @param {string} outputPath - Path of the generated XML file (used in the title)
 * @param {Map<string, Object>} textFilesByPath - From indexTextFilesByPath
 * @returns {string} Markdown document
 */
function buildStatsMarkdown(stats, outputPath, textFilesByPath) {
  const md = [];

  md.push(`# 🧾 Flatten Stats for ${path.basename(outputPath)}`);
  md.push("");
  md.push("## 📊 Summary");
  md.push(`- Total source size: ${stats.totalSize}`);
  md.push(`- Generated XML size: ${stats.xmlSize}`);
  md.push(`- Total lines of code: ${stats.totalLines.toLocaleString()}`);
  md.push(`- Estimated tokens: ${stats.estimatedTokens}`);
  md.push(
    `- File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`,
  );
  md.push("");

  // Percentiles
  md.push("## 📈 Size Percentiles");
  md.push(formatPercentiles(stats));
  md.push("");

  // Histogram
  if (hasItems(stats.histogram)) {
    md.push("## 🧮 Size Histogram");
    md.push("| Bucket | Files | Bytes |");
    md.push("| --- | ---: | ---: |");
    for (const b of stats.histogram) {
      md.push(
        `| ${escapeCell(b.label)} | ${b.count} | ${b.bytes.toLocaleString()} |`,
      );
    }
    md.push("");
  }

  // Top Extensions
  if (hasItems(stats.byExtension)) {
    md.push("## 📦 Top Extensions by Bytes (Top 20)");
    md.push("| Ext | Files | Bytes | % of total |");
    md.push("| --- | ---: | ---: | ---: |");
    for (const e of stats.byExtension.slice(0, 20)) {
      md.push(
        `| ${escapeCell(e.ext)} | ${e.count} | ${e.bytes.toLocaleString()} | ${
          percentOf(e.bytes, stats.totalBytes).toFixed(2)
        }% |`,
      );
    }
    md.push("");
  }

  // Top Directories
  if (hasItems(stats.byDirectory)) {
    md.push("## 📂 Top Directories by Bytes (Top 20)");
    md.push("| Directory | Files | Bytes | % of total |");
    md.push("| --- | ---: | ---: | ---: |");
    for (const d of stats.byDirectory.slice(0, 20)) {
      md.push(
        `| ${escapeCell(d.dir)} | ${d.count} | ${d.bytes.toLocaleString()} | ${
          percentOf(d.bytes, stats.totalBytes).toFixed(2)
        }% |`,
      );
    }
    md.push("");
  }

  // Depth distribution
  if (hasItems(stats.depthDistribution)) {
    md.push("## 🌳 Depth Distribution");
    md.push("| Depth | Count |");
    md.push("| ---: | ---: |");
    for (const d of stats.depthDistribution) {
      md.push(`| ${d.depth} | ${d.count} |`);
    }
    md.push("");
  }

  // Longest paths (capped so the list matches the heading)
  if (hasItems(stats.longestPaths)) {
    md.push("## 🧵 Longest Paths (Top 25)");
    md.push("| Path | Length | Bytes |");
    md.push("| --- | ---: | ---: |");
    for (const pth of stats.longestPaths.slice(0, 25)) {
      md.push(
        `| ${escapeCell(pth.path)} | ${pth.length} | ${pth.size.toLocaleString()} |`,
      );
    }
    md.push("");
  }

  // Temporal
  if (stats.temporal) {
    const { oldest, newest, ageBuckets } = stats.temporal;
    md.push("## ⏱️ Temporal");
    if (oldest) {
      md.push(`- Oldest: ${oldest.path} (${oldest.mtime})`);
    }
    if (newest) {
      md.push(`- Newest: ${newest.path} (${newest.mtime})`);
    }
    if (Array.isArray(ageBuckets)) {
      md.push("");
      md.push("| Age | Files | Bytes |");
      md.push("| --- | ---: | ---: |");
      for (const b of ageBuckets) {
        md.push(
          `| ${escapeCell(b.label)} | ${b.count} | ${b.bytes.toLocaleString()} |`,
        );
      }
    }
    md.push("");
  }

  // Quality signals
  if (stats.quality) {
    md.push("## ✅ Quality Signals");
    for (const line of qualityLines(stats.quality)) {
      md.push(`- ${line}`);
    }
    md.push("");
  }

  // Duplicates
  if (hasItems(stats.duplicateCandidates)) {
    md.push("## 🧬 Duplicate Candidates");
    md.push("| Reason | Files | Size (bytes) |");
    md.push("| --- | ---: | ---: |");
    for (const d of stats.duplicateCandidates) {
      md.push(
        `| ${escapeCell(d.reason)} | ${d.count} | ${d.size.toLocaleString()} |`,
      );
    }
    md.push("");
    // Detailed listing of duplicate file names and locations
    md.push("### 🧬 Duplicate Groups Details");
    stats.duplicateCandidates.forEach((d, index) => {
      md.push(
        `#### Group ${index + 1}: ${d.count} files @ ${d.size.toLocaleString()} bytes (${d.reason})`,
      );
      if (hasItems(d.files)) {
        for (const fp of d.files) {
          md.push(`- ${fp}`);
        }
      } else {
        md.push("- (file list unavailable)");
      }
      md.push("");
    });
    md.push("");
  }

  // Compressibility
  if (typeof stats.compressibilityRatio === "number") {
    md.push("## 🗜️ Compressibility");
    md.push(
      `Sampled compressibility ratio: ${
        (stats.compressibilityRatio * 100).toFixed(2)
      }%`,
    );
    md.push("");
  }

  // Git
  if (stats.git && stats.git.isRepo) {
    md.push("## 🔧 Git");
    md.push(
      `- Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`,
    );
    md.push(
      `- Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`,
    );
    if (hasItems(stats.git.lfsCandidates)) {
      md.push("");
      md.push("### 📦 LFS Candidates (Top 20)");
      md.push("| Path | Bytes |");
      md.push("| --- | ---: |");
      for (const f of stats.git.lfsCandidates.slice(0, 20)) {
        md.push(`| ${escapeCell(f.path)} | ${f.size.toLocaleString()} |`);
      }
    }
    md.push("");
  }

  // Largest Files (capped so the list matches the heading)
  if (hasItems(stats.largestFiles)) {
    md.push("## 📚 Largest Files (Top 50)");
    md.push("| Path | Size | % of total | LOC |");
    md.push("| --- | ---: | ---: | ---: |");
    for (const f of stats.largestFiles.slice(0, 50)) {
      const lines = lineCountFor(f, textFilesByPath);
      const loc = lines === undefined ? "" : lines.toLocaleString();
      md.push(
        `| ${escapeCell(f.path)} | ${f.sizeFormatted} | ${
          f.percentOfTotal.toFixed(2)
        }% | ${loc} |`,
      );
    }
    md.push("");
  }

  return md.join("\n");
}

/**
 * Commander program for the `bmad-flatten` CLI.
 *
 * The action resolves the input/output paths (prompting when neither was
 * given on the command line), flattens the input directory into an XML file,
 * prints a summary and optionally prints detailed statistics and writes a
 * markdown report next to the XML. Any failure is reported and the process
 * exits with code 1.
 */
const program = new Command();
program
  .name("bmad-flatten")
  .description("BMad-Method codebase flattener tool")
  .version("1.0.0")
  .option("-i, --input <path>", "Input directory to flatten", process.cwd())
  .option("-o, --output <path>", "Output file path", DEFAULT_OUTPUT_NAME)
  .action(async (options) => {
    let inputDir = path.resolve(options.input);
    let outputPath = path.resolve(options.output);

    // Detect if user explicitly provided -i/--input or -o/--output
    const argv = process.argv.slice(2);
    const userSpecifiedInput = wasOptionProvided(argv, "-i", "--input");
    const userSpecifiedOutput = wasOptionProvided(argv, "-o", "--output");

    try {
      // Prompts live inside the try so a failing prompt is reported below
      // instead of surfacing as an unhandled rejection.
      if (!userSpecifiedInput && !userSpecifiedOutput) {
        ({ inputDir, outputPath } = await resolvePathsInteractively());
      }

      // Verify input directory exists
      if (!await fs.pathExists(inputDir)) {
        console.error(`❌ Error: Input directory does not exist: ${inputDir}`);
        process.exit(1);
      }

      // Ensure output directory exists (only after the input was validated,
      // so a bad input path does not leave stray directories behind)
      await fs.ensureDir(path.dirname(outputPath));

      // Import ora dynamically
      const { default: ora } = await import("ora");

      // Start file discovery with spinner
      const discoverySpinner = ora("🔍 Discovering files...").start();
      const files = await discoverFiles(inputDir);
      const filteredFiles = await filterFiles(files, inputDir);
      discoverySpinner.succeed(
        `📁 Found ${filteredFiles.length} files to include`,
      );

      // Process files with progress tracking
      const processingSpinner = ora("📄 Processing files...").start();
      const aggregatedContent = await aggregateFileContents(
        filteredFiles,
        inputDir,
        processingSpinner,
      );
      processingSpinner.succeed(
        `✅ Processed ${aggregatedContent.processedFiles}/${filteredFiles.length} files`,
      );
      if (aggregatedContent.errors.length > 0) {
        console.log(`Errors: ${aggregatedContent.errors.length}`);
      }

      // Generate XML output using streaming
      const xmlSpinner = ora("🔧 Generating XML output...").start();
      await generateXMLOutput(aggregatedContent, outputPath);
      xmlSpinner.succeed("📝 XML generation completed");

      // Calculate and display statistics
      const outputStats = await fs.stat(outputPath);
      const stats = await calculateStatistics(
        aggregatedContent,
        outputStats.size,
        inputDir,
      );

      // Display completion summary
      console.log("\n📊 Completion Summary:");
      console.log(
        `✅ Successfully processed ${filteredFiles.length} files into ${
          path.basename(outputPath)
        }`,
      );
      console.log(`📁 Output file: ${outputPath}`);
      console.log(`📏 Total source size: ${stats.totalSize}`);
      console.log(`📄 Generated XML size: ${stats.xmlSize}`);
      console.log(
        `📝 Total lines of code: ${stats.totalLines.toLocaleString()}`,
      );
      console.log(`🔢 Estimated tokens: ${stats.estimatedTokens}`);
      console.log(
        `📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors\n`,
      );

      // Ask user if they want detailed stats + markdown report
      const generateDetailed = await promptYesNo(
        "Generate detailed stats (console + markdown) now?",
        true,
      );
      if (generateDetailed) {
        const textFilesByPath = indexTextFilesByPath(aggregatedContent);
        printDetailedStats(stats, textFilesByPath);

        // Write a comprehensive markdown report next to the XML.
        // Best effort: a failed report must not fail the whole run.
        const mdPath = statsReportPath(outputPath);
        try {
          const markdown = buildStatsMarkdown(
            stats,
            outputPath,
            textFilesByPath,
          );
          await fs.writeFile(mdPath, markdown);
          console.log(`\n🧾 Detailed stats report written to: ${mdPath}`);
        } catch (e) {
          console.warn(`⚠️ Failed to write stats markdown: ${e.message}`);
        }
      }
    } catch (error) {
      console.error("❌ Critical error:", error.message);
      console.error("An unexpected error occurred.");
      process.exit(1);
    }
  });
// Run the CLI only when this file is executed directly (not when required).
// The action handler is async, so parseAsync() is used: it returns a promise
// that settles once the handler finishes, letting us report any rejection the
// handler itself did not catch instead of leaving it unhandled.
if (require.main === module) {
  program.parseAsync().catch((error) => {
    console.error(
      "❌ Critical error:",
      error instanceof Error ? error.message : error,
    );
    process.exit(1);
  });
}

// Exported so other modules can embed or invoke the command programmatically.
module.exports = program;