feat(flattener): prompt for detailed stats; polish .stats.md with emojis (#422)

* feat: add detailed statistics and markdown report generation to flattener tool

* fix: remove redundant error handling for project root detection
This commit is contained in:
manjaroblack
2025-08-16 08:03:28 -05:00
committed by GitHub
parent 93426c2d2f
commit fab9d5e1f5
6 changed files with 1458 additions and 54 deletions

View File

@@ -127,19 +127,11 @@ program
path.join(inputDir, "flattened-codebase.xml"),
);
}
} else {
console.error(
"Could not auto-detect a project root and no arguments were provided. Please specify -i/--input and -o/--output.",
);
process.exit(1);
}
// Ensure output directory exists
await fs.ensureDir(path.dirname(outputPath));
console.log(`Flattening codebase from: ${inputDir}`);
console.log(`Output file: ${outputPath}`);
try {
// Verify input directory exists
if (!await fs.pathExists(inputDir)) {
@@ -159,7 +151,6 @@ program
);
// Process files with progress tracking
console.log("Reading file contents");
const processingSpinner = ora("📄 Processing files...").start();
const aggregatedContent = await aggregateFileContents(
filteredFiles,
@@ -172,10 +163,6 @@ program
if (aggregatedContent.errors.length > 0) {
console.log(`Errors: ${aggregatedContent.errors.length}`);
}
console.log(`Text files: ${aggregatedContent.textFiles.length}`);
if (aggregatedContent.binaryFiles.length > 0) {
console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`);
}
// Generate XML output using streaming
const xmlSpinner = ora("🔧 Generating XML output...").start();
@@ -184,7 +171,11 @@ program
// Calculate and display statistics
const outputStats = await fs.stat(outputPath);
const stats = calculateStatistics(aggregatedContent, outputStats.size);
const stats = await calculateStatistics(
aggregatedContent,
outputStats.size,
inputDir,
);
// Display completion summary
console.log("\n📊 Completion Summary:");
@@ -201,8 +192,476 @@ program
);
console.log(`🔢 Estimated tokens: ${stats.estimatedTokens}`);
console.log(
`📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`,
`📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors\n`,
);
// Ask user if they want detailed stats + markdown report
const generateDetailed = await promptYesNo(
"Generate detailed stats (console + markdown) now?",
true,
);
if (generateDetailed) {
// Additional detailed stats
console.log("\n📈 Size Percentiles:");
console.log(
` Avg: ${
Math.round(stats.avgFileSize).toLocaleString()
} B, Median: ${
Math.round(stats.medianFileSize).toLocaleString()
} B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`,
);
if (Array.isArray(stats.histogram) && stats.histogram.length) {
console.log("\n🧮 Size Histogram:");
for (const b of stats.histogram.slice(0, 2)) {
console.log(
` ${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`,
);
}
if (stats.histogram.length > 2) {
console.log(` … and ${stats.histogram.length - 2} more buckets`);
}
}
if (Array.isArray(stats.byExtension) && stats.byExtension.length) {
const topExt = stats.byExtension.slice(0, 2);
console.log("\n📦 Top Extensions:");
for (const e of topExt) {
const pct = stats.totalBytes
? ((e.bytes / stats.totalBytes) * 100)
: 0;
console.log(
` ${e.ext}: ${e.count} files, ${e.bytes.toLocaleString()} bytes (${
pct.toFixed(2)
}%)`,
);
}
if (stats.byExtension.length > 2) {
console.log(
` … and ${stats.byExtension.length - 2} more extensions`,
);
}
}
if (Array.isArray(stats.byDirectory) && stats.byDirectory.length) {
const topDir = stats.byDirectory.slice(0, 2);
console.log("\n📂 Top Directories:");
for (const d of topDir) {
const pct = stats.totalBytes
? ((d.bytes / stats.totalBytes) * 100)
: 0;
console.log(
` ${d.dir}: ${d.count} files, ${d.bytes.toLocaleString()} bytes (${
pct.toFixed(2)
}%)`,
);
}
if (stats.byDirectory.length > 2) {
console.log(
` … and ${stats.byDirectory.length - 2} more directories`,
);
}
}
if (
Array.isArray(stats.depthDistribution) &&
stats.depthDistribution.length
) {
console.log("\n🌳 Depth Distribution:");
const dd = stats.depthDistribution.slice(0, 2);
let line = " " + dd.map((d) => `${d.depth}:${d.count}`).join(" ");
if (stats.depthDistribution.length > 2) {
line += ` … +${stats.depthDistribution.length - 2} more`;
}
console.log(line);
}
if (Array.isArray(stats.longestPaths) && stats.longestPaths.length) {
console.log("\n🧵 Longest Paths:");
for (const p of stats.longestPaths.slice(0, 2)) {
console.log(
` ${p.path} (${p.length} chars, ${p.size.toLocaleString()} bytes)`,
);
}
if (stats.longestPaths.length > 2) {
console.log(` … and ${stats.longestPaths.length - 2} more paths`);
}
}
if (stats.temporal) {
console.log("\n⏱ Temporal:");
if (stats.temporal.oldest) {
console.log(
` Oldest: ${stats.temporal.oldest.path} (${stats.temporal.oldest.mtime})`,
);
}
if (stats.temporal.newest) {
console.log(
` Newest: ${stats.temporal.newest.path} (${stats.temporal.newest.mtime})`,
);
}
if (Array.isArray(stats.temporal.ageBuckets)) {
console.log(" Age buckets:");
for (const b of stats.temporal.ageBuckets.slice(0, 2)) {
console.log(
` ${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`,
);
}
if (stats.temporal.ageBuckets.length > 2) {
console.log(
` … and ${
stats.temporal.ageBuckets.length - 2
} more buckets`,
);
}
}
}
if (stats.quality) {
console.log("\n✅ Quality Signals:");
console.log(` Zero-byte files: ${stats.quality.zeroByteFiles}`);
console.log(` Empty text files: ${stats.quality.emptyTextFiles}`);
console.log(` Hidden files: ${stats.quality.hiddenFiles}`);
console.log(` Symlinks: ${stats.quality.symlinks}`);
console.log(
` Large files (>= ${
(stats.quality.largeThreshold / (1024 * 1024)).toFixed(0)
} MB): ${stats.quality.largeFilesCount}`,
);
console.log(
` Suspiciously large files (>= 100 MB): ${stats.quality.suspiciousLargeFilesCount}`,
);
}
if (
Array.isArray(stats.duplicateCandidates) &&
stats.duplicateCandidates.length
) {
console.log("\n🧬 Duplicate Candidates:");
for (const d of stats.duplicateCandidates.slice(0, 2)) {
console.log(
` ${d.reason}: ${d.count} files @ ${d.size.toLocaleString()} bytes`,
);
}
if (stats.duplicateCandidates.length > 2) {
console.log(
` … and ${stats.duplicateCandidates.length - 2} more groups`,
);
}
}
if (typeof stats.compressibilityRatio === "number") {
console.log(
`\n🗜️ Compressibility ratio (sampled): ${
(stats.compressibilityRatio * 100).toFixed(2)
}%`,
);
}
if (stats.git && stats.git.isRepo) {
console.log("\n🔧 Git:");
console.log(
` Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`,
);
console.log(
` Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`,
);
if (
Array.isArray(stats.git.lfsCandidates) &&
stats.git.lfsCandidates.length
) {
console.log(" LFS candidates (top 2):");
for (const f of stats.git.lfsCandidates.slice(0, 2)) {
console.log(` ${f.path} (${f.size.toLocaleString()} bytes)`);
}
if (stats.git.lfsCandidates.length > 2) {
console.log(
` … and ${stats.git.lfsCandidates.length - 2} more`,
);
}
}
}
if (Array.isArray(stats.largestFiles) && stats.largestFiles.length) {
console.log("\n📚 Largest Files (top 2):");
for (const f of stats.largestFiles.slice(0, 2)) {
// Show LOC for text files when available; omit ext and mtime
let locStr = "";
if (!f.isBinary && Array.isArray(aggregatedContent?.textFiles)) {
const tf = aggregatedContent.textFiles.find((t) =>
t.path === f.path
);
if (tf && typeof tf.lines === "number") {
locStr = `, LOC: ${tf.lines.toLocaleString()}`;
}
}
console.log(
` ${f.path} ${f.sizeFormatted} (${
f.percentOfTotal.toFixed(2)
}%)${locStr}`,
);
}
if (stats.largestFiles.length > 2) {
console.log(` … and ${stats.largestFiles.length - 2} more files`);
}
}
// Write a comprehensive markdown report next to the XML
{
const mdPath = outputPath.endsWith(".xml")
? outputPath.replace(/\.xml$/i, ".stats.md")
: outputPath + ".stats.md";
try {
const pct = (num, den) => (den ? ((num / den) * 100) : 0);
const md = [];
md.push(`# 🧾 Flatten Stats for ${path.basename(outputPath)}`);
md.push("");
md.push("## 📊 Summary");
md.push(`- Total source size: ${stats.totalSize}`);
md.push(`- Generated XML size: ${stats.xmlSize}`);
md.push(
`- Total lines of code: ${stats.totalLines.toLocaleString()}`,
);
md.push(`- Estimated tokens: ${stats.estimatedTokens}`);
md.push(
`- File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`,
);
md.push("");
// Percentiles
md.push("## 📈 Size Percentiles");
md.push(
`Avg: ${
Math.round(stats.avgFileSize).toLocaleString()
} B, Median: ${
Math.round(stats.medianFileSize).toLocaleString()
} B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`,
);
md.push("");
// Histogram
if (Array.isArray(stats.histogram) && stats.histogram.length) {
md.push("## 🧮 Size Histogram");
md.push("| Bucket | Files | Bytes |");
md.push("| --- | ---: | ---: |");
for (const b of stats.histogram) {
md.push(
`| ${b.label} | ${b.count} | ${b.bytes.toLocaleString()} |`,
);
}
md.push("");
}
// Top Extensions
if (Array.isArray(stats.byExtension) && stats.byExtension.length) {
md.push("## 📦 Top Extensions by Bytes (Top 20)");
md.push("| Ext | Files | Bytes | % of total |");
md.push("| --- | ---: | ---: | ---: |");
for (const e of stats.byExtension.slice(0, 20)) {
const p = pct(e.bytes, stats.totalBytes);
md.push(
`| ${e.ext} | ${e.count} | ${e.bytes.toLocaleString()} | ${
p.toFixed(2)
}% |`,
);
}
md.push("");
}
// Top Directories
if (Array.isArray(stats.byDirectory) && stats.byDirectory.length) {
md.push("## 📂 Top Directories by Bytes (Top 20)");
md.push("| Directory | Files | Bytes | % of total |");
md.push("| --- | ---: | ---: | ---: |");
for (const d of stats.byDirectory.slice(0, 20)) {
const p = pct(d.bytes, stats.totalBytes);
md.push(
`| ${d.dir} | ${d.count} | ${d.bytes.toLocaleString()} | ${
p.toFixed(2)
}% |`,
);
}
md.push("");
}
// Depth distribution
if (
Array.isArray(stats.depthDistribution) &&
stats.depthDistribution.length
) {
md.push("## 🌳 Depth Distribution");
md.push("| Depth | Count |");
md.push("| ---: | ---: |");
for (const d of stats.depthDistribution) {
md.push(`| ${d.depth} | ${d.count} |`);
}
md.push("");
}
// Longest paths
if (
Array.isArray(stats.longestPaths) && stats.longestPaths.length
) {
md.push("## 🧵 Longest Paths (Top 25)");
md.push("| Path | Length | Bytes |");
md.push("| --- | ---: | ---: |");
for (const pth of stats.longestPaths) {
md.push(
`| ${pth.path} | ${pth.length} | ${pth.size.toLocaleString()} |`,
);
}
md.push("");
}
// Temporal
if (stats.temporal) {
md.push("## ⏱️ Temporal");
if (stats.temporal.oldest) {
md.push(
`- Oldest: ${stats.temporal.oldest.path} (${stats.temporal.oldest.mtime})`,
);
}
if (stats.temporal.newest) {
md.push(
`- Newest: ${stats.temporal.newest.path} (${stats.temporal.newest.mtime})`,
);
}
if (Array.isArray(stats.temporal.ageBuckets)) {
md.push("");
md.push("| Age | Files | Bytes |");
md.push("| --- | ---: | ---: |");
for (const b of stats.temporal.ageBuckets) {
md.push(
`| ${b.label} | ${b.count} | ${b.bytes.toLocaleString()} |`,
);
}
}
md.push("");
}
// Quality signals
if (stats.quality) {
md.push("## ✅ Quality Signals");
md.push(`- Zero-byte files: ${stats.quality.zeroByteFiles}`);
md.push(`- Empty text files: ${stats.quality.emptyTextFiles}`);
md.push(`- Hidden files: ${stats.quality.hiddenFiles}`);
md.push(`- Symlinks: ${stats.quality.symlinks}`);
md.push(
`- Large files (>= ${
(stats.quality.largeThreshold / (1024 * 1024)).toFixed(0)
} MB): ${stats.quality.largeFilesCount}`,
);
md.push(
`- Suspiciously large files (>= 100 MB): ${stats.quality.suspiciousLargeFilesCount}`,
);
md.push("");
}
// Duplicates
if (
Array.isArray(stats.duplicateCandidates) &&
stats.duplicateCandidates.length
) {
md.push("## 🧬 Duplicate Candidates");
md.push("| Reason | Files | Size (bytes) |");
md.push("| --- | ---: | ---: |");
for (const d of stats.duplicateCandidates) {
md.push(
`| ${d.reason} | ${d.count} | ${d.size.toLocaleString()} |`,
);
}
md.push("");
// Detailed listing of duplicate file names and locations
md.push("### 🧬 Duplicate Groups Details");
let dupIndex = 1;
for (const d of stats.duplicateCandidates) {
md.push(
`#### Group ${dupIndex}: ${d.count} files @ ${d.size.toLocaleString()} bytes (${d.reason})`,
);
if (Array.isArray(d.files) && d.files.length) {
for (const fp of d.files) {
md.push(`- ${fp}`);
}
} else {
md.push("- (file list unavailable)");
}
md.push("");
dupIndex++;
}
md.push("");
}
// Compressibility
if (typeof stats.compressibilityRatio === "number") {
md.push("## 🗜️ Compressibility");
md.push(
`Sampled compressibility ratio: ${
(stats.compressibilityRatio * 100).toFixed(2)
}%`,
);
md.push("");
}
// Git
if (stats.git && stats.git.isRepo) {
md.push("## 🔧 Git");
md.push(
`- Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`,
);
md.push(
`- Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`,
);
if (
Array.isArray(stats.git.lfsCandidates) &&
stats.git.lfsCandidates.length
) {
md.push("");
md.push("### 📦 LFS Candidates (Top 20)");
md.push("| Path | Bytes |");
md.push("| --- | ---: |");
for (const f of stats.git.lfsCandidates.slice(0, 20)) {
md.push(`| ${f.path} | ${f.size.toLocaleString()} |`);
}
}
md.push("");
}
// Largest Files
if (
Array.isArray(stats.largestFiles) && stats.largestFiles.length
) {
md.push("## 📚 Largest Files (Top 50)");
md.push("| Path | Size | % of total | LOC |");
md.push("| --- | ---: | ---: | ---: |");
for (const f of stats.largestFiles) {
let loc = "";
if (
!f.isBinary && Array.isArray(aggregatedContent?.textFiles)
) {
const tf = aggregatedContent.textFiles.find((t) =>
t.path === f.path
);
if (tf && typeof tf.lines === "number") {
loc = tf.lines.toLocaleString();
}
}
md.push(
`| ${f.path} | ${f.sizeFormatted} | ${
f.percentOfTotal.toFixed(2)
}% | ${loc} |`,
);
}
md.push("");
}
await fs.writeFile(mdPath, md.join("\n"));
console.log(`\n🧾 Detailed stats report written to: ${mdPath}`);
} catch (e) {
console.warn(`⚠️ Failed to write stats markdown: ${e.message}`);
}
}
}
} catch (error) {
console.error("❌ Critical error:", error.message);
console.error("An unexpected error occurred.");