feat(flattener): prompt for detailed stats; polish .stats.md with emojis (#422)

* feat: add detailed statistics and markdown report generation to flattener tool
* fix: remove redundant error handling for project root detection
2025-08-16 08:03:28 -05:00
parent 93426c2d2f
commit fab9d5e1f5
6 changed files with 1458 additions and 54 deletions
--- a/tools/flattener/main.js
+++ b/tools/flattener/main.js
@@ -127,19 +127,11 @@ program
          path.join(inputDir, "flattened-codebase.xml"),
        );
      }
-    } else {
-      console.error(
-        "Could not auto-detect a project root and no arguments were provided. Please specify -i/--input and -o/--output.",
-      );
-      process.exit(1);
    }

    // Ensure output directory exists
    await fs.ensureDir(path.dirname(outputPath));

-    console.log(`Flattening codebase from: ${inputDir}`);
-    console.log(`Output file: ${outputPath}`);
-
    try {
      // Verify input directory exists
      if (!await fs.pathExists(inputDir)) {
@@ -159,7 +151,6 @@ program
      );

      // Process files with progress tracking
-      console.log("Reading file contents");
      const processingSpinner = ora("📄 Processing files...").start();
      const aggregatedContent = await aggregateFileContents(
        filteredFiles,
@@ -172,10 +163,6 @@ program
      if (aggregatedContent.errors.length > 0) {
        console.log(`Errors: ${aggregatedContent.errors.length}`);
      }
-      console.log(`Text files: ${aggregatedContent.textFiles.length}`);
-      if (aggregatedContent.binaryFiles.length > 0) {
-        console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`);
-      }

      // Generate XML output using streaming
      const xmlSpinner = ora("🔧 Generating XML output...").start();
@@ -184,7 +171,11 @@ program

      // Calculate and display statistics
      const outputStats = await fs.stat(outputPath);
-      const stats = calculateStatistics(aggregatedContent, outputStats.size);
+      const stats = await calculateStatistics(
+        aggregatedContent,
+        outputStats.size,
+        inputDir,
+      );

      // Display completion summary
      console.log("\n📊 Completion Summary:");
@@ -201,8 +192,476 @@ program
      );
      console.log(`🔢 Estimated tokens: ${stats.estimatedTokens}`);
      console.log(
-        `📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`,
+        `📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors\n`,
      );
+
+      // Ask user if they want detailed stats + markdown report
+      const generateDetailed = await promptYesNo(
+        "Generate detailed stats (console + markdown) now?",
+        true,
+      );
+
+      if (generateDetailed) {
+        // Additional detailed stats
+        console.log("\n📈 Size Percentiles:");
+        console.log(
+          `   Avg: ${
+            Math.round(stats.avgFileSize).toLocaleString()
+          } B, Median: ${
+            Math.round(stats.medianFileSize).toLocaleString()
+          } B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`,
+        );
+
+        if (Array.isArray(stats.histogram) && stats.histogram.length) {
+          console.log("\n🧮 Size Histogram:");
+          for (const b of stats.histogram.slice(0, 2)) {
+            console.log(
+              `   ${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`,
+            );
+          }
+          if (stats.histogram.length > 2) {
+            console.log(`   … and ${stats.histogram.length - 2} more buckets`);
+          }
+        }
+
+        if (Array.isArray(stats.byExtension) && stats.byExtension.length) {
+          const topExt = stats.byExtension.slice(0, 2);
+          console.log("\n📦 Top Extensions:");
+          for (const e of topExt) {
+            const pct = stats.totalBytes
+              ? ((e.bytes / stats.totalBytes) * 100)
+              : 0;
+            console.log(
+              `   ${e.ext}: ${e.count} files, ${e.bytes.toLocaleString()} bytes (${
+                pct.toFixed(2)
+              }%)`,
+            );
+          }
+          if (stats.byExtension.length > 2) {
+            console.log(
+              `   … and ${stats.byExtension.length - 2} more extensions`,
+            );
+          }
+        }
+
+        if (Array.isArray(stats.byDirectory) && stats.byDirectory.length) {
+          const topDir = stats.byDirectory.slice(0, 2);
+          console.log("\n📂 Top Directories:");
+          for (const d of topDir) {
+            const pct = stats.totalBytes
+              ? ((d.bytes / stats.totalBytes) * 100)
+              : 0;
+            console.log(
+              `   ${d.dir}: ${d.count} files, ${d.bytes.toLocaleString()} bytes (${
+                pct.toFixed(2)
+              }%)`,
+            );
+          }
+          if (stats.byDirectory.length > 2) {
+            console.log(
+              `   … and ${stats.byDirectory.length - 2} more directories`,
+            );
+          }
+        }
+
+        if (
+          Array.isArray(stats.depthDistribution) &&
+          stats.depthDistribution.length
+        ) {
+          console.log("\n🌳 Depth Distribution:");
+          const dd = stats.depthDistribution.slice(0, 2);
+          let line = "   " + dd.map((d) => `${d.depth}:${d.count}`).join("  ");
+          if (stats.depthDistribution.length > 2) {
+            line += `  … +${stats.depthDistribution.length - 2} more`;
+          }
+          console.log(line);
+        }
+
+        if (Array.isArray(stats.longestPaths) && stats.longestPaths.length) {
+          console.log("\n🧵 Longest Paths:");
+          for (const p of stats.longestPaths.slice(0, 2)) {
+            console.log(
+              `   ${p.path} (${p.length} chars, ${p.size.toLocaleString()} bytes)`,
+            );
+          }
+          if (stats.longestPaths.length > 2) {
+            console.log(`   … and ${stats.longestPaths.length - 2} more paths`);
+          }
+        }
+
+        if (stats.temporal) {
+          console.log("\n⏱️ Temporal:");
+          if (stats.temporal.oldest) {
+            console.log(
+              `   Oldest: ${stats.temporal.oldest.path} (${stats.temporal.oldest.mtime})`,
+            );
+          }
+          if (stats.temporal.newest) {
+            console.log(
+              `   Newest: ${stats.temporal.newest.path} (${stats.temporal.newest.mtime})`,
+            );
+          }
+          if (Array.isArray(stats.temporal.ageBuckets)) {
+            console.log("   Age buckets:");
+            for (const b of stats.temporal.ageBuckets.slice(0, 2)) {
+              console.log(
+                `     ${b.label}: ${b.count} files, ${b.bytes.toLocaleString()} bytes`,
+              );
+            }
+            if (stats.temporal.ageBuckets.length > 2) {
+              console.log(
+                `     … and ${
+                  stats.temporal.ageBuckets.length - 2
+                } more buckets`,
+              );
+            }
+          }
+        }
+
+        if (stats.quality) {
+          console.log("\n✅ Quality Signals:");
+          console.log(`   Zero-byte files: ${stats.quality.zeroByteFiles}`);
+          console.log(`   Empty text files: ${stats.quality.emptyTextFiles}`);
+          console.log(`   Hidden files: ${stats.quality.hiddenFiles}`);
+          console.log(`   Symlinks: ${stats.quality.symlinks}`);
+          console.log(
+            `   Large files (>= ${
+              (stats.quality.largeThreshold / (1024 * 1024)).toFixed(0)
+            } MB): ${stats.quality.largeFilesCount}`,
+          );
+          console.log(
+            `   Suspiciously large files (>= 100 MB): ${stats.quality.suspiciousLargeFilesCount}`,
+          );
+        }
+
+        if (
+          Array.isArray(stats.duplicateCandidates) &&
+          stats.duplicateCandidates.length
+        ) {
+          console.log("\n🧬 Duplicate Candidates:");
+          for (const d of stats.duplicateCandidates.slice(0, 2)) {
+            console.log(
+              `   ${d.reason}: ${d.count} files @ ${d.size.toLocaleString()} bytes`,
+            );
+          }
+          if (stats.duplicateCandidates.length > 2) {
+            console.log(
+              `   … and ${stats.duplicateCandidates.length - 2} more groups`,
+            );
+          }
+        }
+
+        if (typeof stats.compressibilityRatio === "number") {
+          console.log(
+            `\n🗜️ Compressibility ratio (sampled): ${
+              (stats.compressibilityRatio * 100).toFixed(2)
+            }%`,
+          );
+        }
+
+        if (stats.git && stats.git.isRepo) {
+          console.log("\n🔧 Git:");
+          console.log(
+            `   Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`,
+          );
+          console.log(
+            `   Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`,
+          );
+          if (
+            Array.isArray(stats.git.lfsCandidates) &&
+            stats.git.lfsCandidates.length
+          ) {
+            console.log("   LFS candidates (top 2):");
+            for (const f of stats.git.lfsCandidates.slice(0, 2)) {
+              console.log(`     ${f.path} (${f.size.toLocaleString()} bytes)`);
+            }
+            if (stats.git.lfsCandidates.length > 2) {
+              console.log(
+                `     … and ${stats.git.lfsCandidates.length - 2} more`,
+              );
+            }
+          }
+        }
+
+        if (Array.isArray(stats.largestFiles) && stats.largestFiles.length) {
+          console.log("\n📚 Largest Files (top 2):");
+          for (const f of stats.largestFiles.slice(0, 2)) {
+            // Show LOC for text files when available; omit ext and mtime
+            let locStr = "";
+            if (!f.isBinary && Array.isArray(aggregatedContent?.textFiles)) {
+              const tf = aggregatedContent.textFiles.find((t) =>
+                t.path === f.path
+              );
+              if (tf && typeof tf.lines === "number") {
+                locStr = `, LOC: ${tf.lines.toLocaleString()}`;
+              }
+            }
+            console.log(
+              `   ${f.path} – ${f.sizeFormatted} (${
+                f.percentOfTotal.toFixed(2)
+              }%)${locStr}`,
+            );
+          }
+          if (stats.largestFiles.length > 2) {
+            console.log(`   … and ${stats.largestFiles.length - 2} more files`);
+          }
+        }
+
+        // Write a comprehensive markdown report next to the XML
+        {
+          const mdPath = outputPath.endsWith(".xml")
+            ? outputPath.replace(/\.xml$/i, ".stats.md")
+            : outputPath + ".stats.md";
+          try {
+            const pct = (num, den) => (den ? ((num / den) * 100) : 0);
+            const md = [];
+            md.push(`# 🧾 Flatten Stats for ${path.basename(outputPath)}`);
+            md.push("");
+            md.push("## 📊 Summary");
+            md.push(`- Total source size: ${stats.totalSize}`);
+            md.push(`- Generated XML size: ${stats.xmlSize}`);
+            md.push(
+              `- Total lines of code: ${stats.totalLines.toLocaleString()}`,
+            );
+            md.push(`- Estimated tokens: ${stats.estimatedTokens}`);
+            md.push(
+              `- File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`,
+            );
+            md.push("");
+
+            // Percentiles
+            md.push("## 📈 Size Percentiles");
+            md.push(
+              `Avg: ${
+                Math.round(stats.avgFileSize).toLocaleString()
+              } B, Median: ${
+                Math.round(stats.medianFileSize).toLocaleString()
+              } B, p90: ${stats.p90.toLocaleString()} B, p95: ${stats.p95.toLocaleString()} B, p99: ${stats.p99.toLocaleString()} B`,
+            );
+            md.push("");
+
+            // Histogram
+            if (Array.isArray(stats.histogram) && stats.histogram.length) {
+              md.push("## 🧮 Size Histogram");
+              md.push("| Bucket | Files | Bytes |");
+              md.push("| --- | ---: | ---: |");
+              for (const b of stats.histogram) {
+                md.push(
+                  `| ${b.label} | ${b.count} | ${b.bytes.toLocaleString()} |`,
+                );
+              }
+              md.push("");
+            }
+
+            // Top Extensions
+            if (Array.isArray(stats.byExtension) && stats.byExtension.length) {
+              md.push("## 📦 Top Extensions by Bytes (Top 20)");
+              md.push("| Ext | Files | Bytes | % of total |");
+              md.push("| --- | ---: | ---: | ---: |");
+              for (const e of stats.byExtension.slice(0, 20)) {
+                const p = pct(e.bytes, stats.totalBytes);
+                md.push(
+                  `| ${e.ext} | ${e.count} | ${e.bytes.toLocaleString()} | ${
+                    p.toFixed(2)
+                  }% |`,
+                );
+              }
+              md.push("");
+            }
+
+            // Top Directories
+            if (Array.isArray(stats.byDirectory) && stats.byDirectory.length) {
+              md.push("## 📂 Top Directories by Bytes (Top 20)");
+              md.push("| Directory | Files | Bytes | % of total |");
+              md.push("| --- | ---: | ---: | ---: |");
+              for (const d of stats.byDirectory.slice(0, 20)) {
+                const p = pct(d.bytes, stats.totalBytes);
+                md.push(
+                  `| ${d.dir} | ${d.count} | ${d.bytes.toLocaleString()} | ${
+                    p.toFixed(2)
+                  }% |`,
+                );
+              }
+              md.push("");
+            }
+
+            // Depth distribution
+            if (
+              Array.isArray(stats.depthDistribution) &&
+              stats.depthDistribution.length
+            ) {
+              md.push("## 🌳 Depth Distribution");
+              md.push("| Depth | Count |");
+              md.push("| ---: | ---: |");
+              for (const d of stats.depthDistribution) {
+                md.push(`| ${d.depth} | ${d.count} |`);
+              }
+              md.push("");
+            }
+
+            // Longest paths
+            if (
+              Array.isArray(stats.longestPaths) && stats.longestPaths.length
+            ) {
+              md.push("## 🧵 Longest Paths (Top 25)");
+              md.push("| Path | Length | Bytes |");
+              md.push("| --- | ---: | ---: |");
+              for (const pth of stats.longestPaths) {
+                md.push(
+                  `| ${pth.path} | ${pth.length} | ${pth.size.toLocaleString()} |`,
+                );
+              }
+              md.push("");
+            }
+
+            // Temporal
+            if (stats.temporal) {
+              md.push("## ⏱️ Temporal");
+              if (stats.temporal.oldest) {
+                md.push(
+                  `- Oldest: ${stats.temporal.oldest.path} (${stats.temporal.oldest.mtime})`,
+                );
+              }
+              if (stats.temporal.newest) {
+                md.push(
+                  `- Newest: ${stats.temporal.newest.path} (${stats.temporal.newest.mtime})`,
+                );
+              }
+              if (Array.isArray(stats.temporal.ageBuckets)) {
+                md.push("");
+                md.push("| Age | Files | Bytes |");
+                md.push("| --- | ---: | ---: |");
+                for (const b of stats.temporal.ageBuckets) {
+                  md.push(
+                    `| ${b.label} | ${b.count} | ${b.bytes.toLocaleString()} |`,
+                  );
+                }
+              }
+              md.push("");
+            }
+
+            // Quality signals
+            if (stats.quality) {
+              md.push("## ✅ Quality Signals");
+              md.push(`- Zero-byte files: ${stats.quality.zeroByteFiles}`);
+              md.push(`- Empty text files: ${stats.quality.emptyTextFiles}`);
+              md.push(`- Hidden files: ${stats.quality.hiddenFiles}`);
+              md.push(`- Symlinks: ${stats.quality.symlinks}`);
+              md.push(
+                `- Large files (>= ${
+                  (stats.quality.largeThreshold / (1024 * 1024)).toFixed(0)
+                } MB): ${stats.quality.largeFilesCount}`,
+              );
+              md.push(
+                `- Suspiciously large files (>= 100 MB): ${stats.quality.suspiciousLargeFilesCount}`,
+              );
+              md.push("");
+            }
+
+            // Duplicates
+            if (
+              Array.isArray(stats.duplicateCandidates) &&
+              stats.duplicateCandidates.length
+            ) {
+              md.push("## 🧬 Duplicate Candidates");
+              md.push("| Reason | Files | Size (bytes) |");
+              md.push("| --- | ---: | ---: |");
+              for (const d of stats.duplicateCandidates) {
+                md.push(
+                  `| ${d.reason} | ${d.count} | ${d.size.toLocaleString()} |`,
+                );
+              }
+              md.push("");
+              // Detailed listing of duplicate file names and locations
+              md.push("### 🧬 Duplicate Groups Details");
+              let dupIndex = 1;
+              for (const d of stats.duplicateCandidates) {
+                md.push(
+                  `#### Group ${dupIndex}: ${d.count} files @ ${d.size.toLocaleString()} bytes (${d.reason})`,
+                );
+                if (Array.isArray(d.files) && d.files.length) {
+                  for (const fp of d.files) {
+                    md.push(`- ${fp}`);
+                  }
+                } else {
+                  md.push("- (file list unavailable)");
+                }
+                md.push("");
+                dupIndex++;
+              }
+              md.push("");
+            }
+
+            // Compressibility
+            if (typeof stats.compressibilityRatio === "number") {
+              md.push("## 🗜️ Compressibility");
+              md.push(
+                `Sampled compressibility ratio: ${
+                  (stats.compressibilityRatio * 100).toFixed(2)
+                }%`,
+              );
+              md.push("");
+            }
+
+            // Git
+            if (stats.git && stats.git.isRepo) {
+              md.push("## 🔧 Git");
+              md.push(
+                `- Tracked: ${stats.git.trackedCount} files, ${stats.git.trackedBytes.toLocaleString()} bytes`,
+              );
+              md.push(
+                `- Untracked: ${stats.git.untrackedCount} files, ${stats.git.untrackedBytes.toLocaleString()} bytes`,
+              );
+              if (
+                Array.isArray(stats.git.lfsCandidates) &&
+                stats.git.lfsCandidates.length
+              ) {
+                md.push("");
+                md.push("### 📦 LFS Candidates (Top 20)");
+                md.push("| Path | Bytes |");
+                md.push("| --- | ---: |");
+                for (const f of stats.git.lfsCandidates.slice(0, 20)) {
+                  md.push(`| ${f.path} | ${f.size.toLocaleString()} |`);
+                }
+              }
+              md.push("");
+            }
+
+            // Largest Files
+            if (
+              Array.isArray(stats.largestFiles) && stats.largestFiles.length
+            ) {
+              md.push("## 📚 Largest Files (Top 50)");
+              md.push("| Path | Size | % of total | LOC |");
+              md.push("| --- | ---: | ---: | ---: |");
+              for (const f of stats.largestFiles) {
+                let loc = "";
+                if (
+                  !f.isBinary && Array.isArray(aggregatedContent?.textFiles)
+                ) {
+                  const tf = aggregatedContent.textFiles.find((t) =>
+                    t.path === f.path
+                  );
+                  if (tf && typeof tf.lines === "number") {
+                    loc = tf.lines.toLocaleString();
+                  }
+                }
+                md.push(
+                  `| ${f.path} | ${f.sizeFormatted} | ${
+                    f.percentOfTotal.toFixed(2)
+                  }% | ${loc} |`,
+                );
+              }
+              md.push("");
+            }
+
+            await fs.writeFile(mdPath, md.join("\n"));
+            console.log(`\n🧾 Detailed stats report written to: ${mdPath}`);
+          } catch (e) {
+            console.warn(`⚠️ Failed to write stats markdown: ${e.message}`);
+          }
+        }
+      }
    } catch (error) {
      console.error("❌ Critical error:", error.message);
      console.error("An unexpected error occurred.");