chore(skill-creator): update to latest skill-creator

This commit is contained in:
Kenshiro Nakagawa
2026-02-24 17:10:46 -08:00
parent 99e11d9592
commit e05013d229
23 changed files with 3634 additions and 2847 deletions

View File

@@ -12,18 +12,26 @@ Usage:
Example:
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
The script expects this directory structure:
The script supports two directory layouts:
Workspace layout (from skill-creator iterations):
<benchmark_dir>/
└── eval-N/
├── with_skill/
│ ├── run-1/grading.json
│ └── run-2/grading.json
└── without_skill/
├── run-1/grading.json
└── run-2/grading.json
Legacy layout (with runs/ subdirectory):
<benchmark_dir>/
└── runs/
    └── eval-N/
        ├── with_skill/
        │   ├── run-1/grading.json
        │   └── run-2/grading.json
        └── without_skill/
            ├── run-1/grading.json
            └── run-2/grading.json
"""
import argparse
@@ -60,31 +68,45 @@ def load_run_results(benchmark_dir: Path) -> dict:
"""
Load all run results from a benchmark directory.
Returns dict with structure:
{
"with_skill": [
{"eval_id": 1, "run_number": 1, "pass_rate": 0.85, ...},
...
],
"without_skill": [...]
}
Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
or "new_skill"/"old_skill"), each containing a list of run results.
"""
# Support both layouts: eval dirs directly under benchmark_dir, or under runs/
runs_dir = benchmark_dir / "runs"
if runs_dir.exists():
search_dir = runs_dir
elif list(benchmark_dir.glob("eval-*")):
search_dir = benchmark_dir
else:
print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
return {}
if not runs_dir.exists():
print(f"Runs directory not found: {runs_dir}")
return {"with_skill": [], "without_skill": []}
results: dict[str, list] = {}
results = {"with_skill": [], "without_skill": []}
for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
metadata_path = eval_dir / "eval_metadata.json"
if metadata_path.exists():
try:
with open(metadata_path) as mf:
eval_id = json.load(mf).get("eval_id", eval_idx)
except (json.JSONDecodeError, OSError):
eval_id = eval_idx
else:
try:
eval_id = int(eval_dir.name.split("-")[1])
except ValueError:
eval_id = eval_idx
for eval_dir in sorted(runs_dir.glob("eval-*")):
eval_id = int(eval_dir.name.split("-")[1])
for config in ["with_skill", "without_skill"]:
config_dir = eval_dir / config
if not config_dir.exists():
# Discover config directories dynamically rather than hardcoding names
for config_dir in sorted(eval_dir.iterdir()):
if not config_dir.is_dir():
continue
# Skip non-config directories (inputs, outputs, etc.)
if not list(config_dir.glob("run-*")):
continue
config = config_dir.name
if config not in results:
results[config] = []
for run_dir in sorted(config_dir.glob("run-*")):
run_number = int(run_dir.name.split("-")[1])
@@ -111,18 +133,32 @@ def load_run_results(benchmark_dir: Path) -> dict:
"total": grading.get("summary", {}).get("total", 0),
}
# Extract timing if available
# Extract timing — check grading.json first, then sibling timing.json
timing = grading.get("timing", {})
result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
timing_file = run_dir / "timing.json"
if result["time_seconds"] == 0.0 and timing_file.exists():
try:
with open(timing_file) as tf:
timing_data = json.load(tf)
result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
result["tokens"] = timing_data.get("total_tokens", 0)
except json.JSONDecodeError:
pass
# Extract metrics if available
metrics = grading.get("execution_metrics", {})
result["tool_calls"] = metrics.get("total_tool_calls", 0)
result["tokens"] = metrics.get("output_chars", 0) # Placeholder
if not result.get("tokens"):
result["tokens"] = metrics.get("output_chars", 0)
result["errors"] = metrics.get("errors_encountered", 0)
# Extract expectations
result["expectations"] = grading.get("expectations", [])
# Extract expectations — viewer requires fields: text, passed, evidence
raw_expectations = grading.get("expectations", [])
for exp in raw_expectations:
if "text" not in exp or "passed" not in exp:
print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
result["expectations"] = raw_expectations
# Extract notes from user_notes_summary
notes_summary = grading.get("user_notes_summary", {})
@@ -144,8 +180,9 @@ def aggregate_results(results: dict) -> dict:
Returns run_summary with stats for each configuration and delta.
"""
run_summary = {}
configs = list(results.keys())
for config in ["with_skill", "without_skill"]:
for config in configs:
runs = results.get(config, [])
if not runs:
@@ -166,13 +203,17 @@ def aggregate_results(results: dict) -> dict:
"tokens": calculate_stats(tokens)
}
# Calculate delta
with_skill = run_summary.get("with_skill", {})
without_skill = run_summary.get("without_skill", {})
# Calculate delta between the first two configs (if two exist)
if len(configs) >= 2:
primary = run_summary.get(configs[0], {})
baseline = run_summary.get(configs[1], {})
else:
primary = run_summary.get(configs[0], {}) if configs else {}
baseline = {}
delta_pass_rate = with_skill.get("pass_rate", {}).get("mean", 0) - without_skill.get("pass_rate", {}).get("mean", 0)
delta_time = with_skill.get("time_seconds", {}).get("mean", 0) - without_skill.get("time_seconds", {}).get("mean", 0)
delta_tokens = with_skill.get("tokens", {}).get("mean", 0) - without_skill.get("tokens", {}).get("mean", 0)
delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)
run_summary["delta"] = {
"pass_rate": f"{delta_pass_rate:+.2f}",
@@ -192,8 +233,8 @@ def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: st
# Build runs array for benchmark.json
runs = []
for config in ["with_skill", "without_skill"]:
for result in results.get(config, []):
for config in results:
for result in results[config]:
runs.append({
"eval_id": result["eval_id"],
"configuration": config,
@@ -242,6 +283,13 @@ def generate_markdown(benchmark: dict) -> str:
metadata = benchmark["metadata"]
run_summary = benchmark["run_summary"]
# Determine config names (excluding "delta")
configs = [k for k in run_summary if k != "delta"]
config_a = configs[0] if len(configs) >= 1 else "config_a"
config_b = configs[1] if len(configs) >= 2 else "config_b"
label_a = config_a.replace("_", " ").title()
label_b = config_b.replace("_", " ").title()
lines = [
f"# Skill Benchmark: {metadata['skill_name']}",
"",
@@ -251,27 +299,28 @@ def generate_markdown(benchmark: dict) -> str:
"",
"## Summary",
"",
"| Metric | With Skill | Without Skill | Delta |",
f"| Metric | {label_a} | {label_b} | Delta |",
"|--------|------------|---------------|-------|",
]
a_summary = run_summary.get(config_a, {})
b_summary = run_summary.get(config_b, {})
delta = run_summary.get("delta", {})
# Format pass rate
with_pr = run_summary["with_skill"]["pass_rate"]
without_pr = run_summary["without_skill"]["pass_rate"]
delta_pr = run_summary["delta"]["pass_rate"]
lines.append(f"| Pass Rate | {with_pr['mean']*100:.0f}% ± {with_pr['stddev']*100:.0f}% | {without_pr['mean']*100:.0f}% ± {without_pr['stddev']*100:.0f}% | {delta_pr} |")
a_pr = a_summary.get("pass_rate", {})
b_pr = b_summary.get("pass_rate", {})
lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '')} |")
# Format time
with_time = run_summary["with_skill"]["time_seconds"]
without_time = run_summary["without_skill"]["time_seconds"]
delta_time = run_summary["delta"]["time_seconds"]
lines.append(f"| Time | {with_time['mean']:.1f}s ± {with_time['stddev']:.1f}s | {without_time['mean']:.1f}s ± {without_time['stddev']:.1f}s | {delta_time}s |")
a_time = a_summary.get("time_seconds", {})
b_time = b_summary.get("time_seconds", {})
lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '')}s |")
# Format tokens
with_tokens = run_summary["with_skill"]["tokens"]
without_tokens = run_summary["without_skill"]["tokens"]
delta_tokens = run_summary["delta"]["tokens"]
lines.append(f"| Tokens | {with_tokens['mean']:.0f} ± {with_tokens['stddev']:.0f} | {without_tokens['mean']:.0f} ± {without_tokens['stddev']:.0f} | {delta_tokens} |")
a_tokens = a_summary.get("tokens", {})
b_tokens = b_summary.get("tokens", {})
lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '')} |")
# Notes section
if benchmark.get("notes"):
@@ -337,14 +386,15 @@ def main():
# Print summary
run_summary = benchmark["run_summary"]
with_pr = run_summary["with_skill"]["pass_rate"]["mean"]
without_pr = run_summary["without_skill"]["pass_rate"]["mean"]
delta = run_summary["delta"]["pass_rate"]
configs = [k for k in run_summary if k != "delta"]
delta = run_summary.get("delta", {})
print(f"\nSummary:")
print(f" With skill: {with_pr*100:.1f}% pass rate")
print(f" Without skill: {without_pr*100:.1f}% pass rate")
print(f" Delta: {delta}")
for config in configs:
pr = run_summary[config]["pass_rate"]["mean"]
label = config.replace("_", " ").title()
print(f" {label}: {pr*100:.1f}% pass rate")
print(f" Delta: {delta.get('pass_rate', '')}")
if __name__ == "__main__":

View File

@@ -1,209 +0,0 @@
#!/usr/bin/env python3
"""
Copy a skill directory with metadata tracking.
Creates a copy of a skill directory and adds a META.yaml file
to track lineage, changes, and performance metrics.
"""
import argparse
import shutil
from datetime import datetime, timezone
from pathlib import Path
def create_meta_yaml(
dest: Path,
parent: str | None,
changes: str | None,
score: float | None,
iteration: int | None,
) -> None:
"""Create META.yaml file in the destination directory."""
created_at = datetime.now(timezone.utc).isoformat()
# Build YAML content manually to avoid external dependencies
lines = ["# Skill iteration metadata", ""]
# Helper to format YAML values
def yaml_value(val):
if val is None:
return "null"
if isinstance(val, bool):
return "true" if val else "false"
if isinstance(val, (int, float)):
return str(val)
if isinstance(val, str):
# Quote strings that might be ambiguous
if val in ("null", "true", "false") or val.startswith(("'", '"', "[", "{")):
return f'"{val}"'
# Quote strings with special characters
if any(c in val for c in (":", "#", "\n", '"', "'")):
escaped = val.replace("\\", "\\\\").replace('"', '\\"')
return f'"{escaped}"'
return val
return str(val)
lines.append(f"parent: {yaml_value(parent)}")
lines.append(f"changes: {yaml_value(changes)}")
lines.append(f"score: {yaml_value(score)}")
lines.append(f"iteration: {yaml_value(iteration)}")
lines.append(f"created_at: {yaml_value(created_at)}")
lines.append("")
meta_path = dest / "META.yaml"
meta_path.write_text("\n".join(lines))
def copy_skill(
    source: Path,
    dest: Path,
    parent: str | None = None,
    changes: str | None = None,
    score: float | None = None,
    iteration: int | None = None,
) -> None:
    """Copy a skill into a fresh version directory and record its metadata.

    The destination becomes a version directory containing:
      - skill/        : copy of the source skill files
      - runs/         : empty; filled later by the executor
        (runs/run-1/transcript.md, outputs/, evaluation.json; run-2/; run-3/)
      - improvements/ : created only for iterations > 0
      - META.yaml     : lineage/score metadata at the version root

    Args:
        source: Existing skill directory to copy (or <version>/skill/ when
            copying from another version).
        dest: Version directory to create (e.g. workspace/v1); must not exist.
        parent: Name/path of the parent skill iteration.
        changes: Description of changes from the parent.
        score: Evaluation score for this iteration.
        iteration: Iteration number.

    Raises:
        FileNotFoundError: If source does not exist.
        ValueError: If source is not a directory.
        FileExistsError: If dest already exists.
    """
    src_path = Path(source).resolve()
    version_dir = Path(dest).resolve()

    # Validate everything up front, before touching the filesystem.
    if not src_path.exists():
        raise FileNotFoundError(f"Source directory does not exist: {src_path}")
    if not src_path.is_dir():
        raise ValueError(f"Source must be a directory: {src_path}")
    if version_dir.exists():
        raise FileExistsError(f"Destination already exists: {version_dir}")

    # Lay out the version directory skeleton.
    version_dir.mkdir(parents=True)
    skill_dir = version_dir / "skill"
    (version_dir / "runs").mkdir()
    has_improvements = iteration is not None and iteration > 0
    if has_improvements:
        (version_dir / "improvements").mkdir()

    # Populate it: skill files first, then the metadata file at the root.
    shutil.copytree(src_path, skill_dir)
    create_meta_yaml(version_dir, parent, changes, score, iteration)

    print(f"Copied skill from {src_path} to {skill_dir}")
    print(f"Created version directory structure at {version_dir}")
    print(f"  - skill/ : Skill files")
    print(f"  - runs/ : For execution runs (run-1/, run-2/, run-3/)")
    if has_improvements:
        print(f"  - improvements/ : Improvement suggestions")
    print(f"  - META.yaml : Version metadata")
def main():
    # CLI entry point: parse arguments and delegate to copy_skill(),
    # converting expected failures into clean argparse errors (exit code 2).
    parser = argparse.ArgumentParser(
        description="Copy a skill directory with metadata tracking and version structure.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Create baseline v0 from an existing skill
python copy_skill.py ./skills/public/pdf ./skill_iterations/v0 --iteration 0
# Create v1 from v0's skill directory
python copy_skill.py ./skill_iterations/v0/skill ./skill_iterations/v1 \\
--parent v0 \\
--changes "Added error handling for empty cells" \\
--iteration 1
# Create v2 with score from evaluation
python copy_skill.py ./skill_iterations/v1/skill ./skill_iterations/v2 \\
--parent v1 \\
--changes "Improved coordinate guidance" \\
--score 7.5 \\
--iteration 2
Output structure:
dest/
├── META.yaml # Version metadata
├── skill/ # The actual skill files
├── runs/ # Execution runs (created by executor)
│ ├── run-1/
│ │ ├── transcript.md
│ │ ├── outputs/
│ │ └── evaluation.json
│ ├── run-2/
│ └── run-3/
└── improvements/ # Improvement suggestions (v1+)
""",
    )
    # Positional arguments: both are converted to Path by argparse.
    parser.add_argument("source", type=Path, help="Source skill directory to copy")
    parser.add_argument("dest", type=Path, help="Destination path for the copy")
    # Optional metadata recorded into META.yaml; all default to None.
    parser.add_argument(
        "--parent",
        type=str,
        default=None,
        help="Name or path of the parent skill iteration",
    )
    parser.add_argument(
        "--changes",
        type=str,
        default=None,
        help="Description of changes from the parent version",
    )
    parser.add_argument(
        "--score",
        type=float,
        default=None,
        help="Evaluation score for this iteration (e.g., 7.5)",
    )
    parser.add_argument(
        "--iteration",
        type=int,
        default=None,
        help="Iteration number (e.g., 1, 2, 3)",
    )
    args = parser.parse_args()
    try:
        copy_skill(
            source=args.source,
            dest=args.dest,
            parent=args.parent,
            changes=args.changes,
            score=args.score,
            iteration=args.iteration,
        )
    except (FileNotFoundError, FileExistsError, ValueError) as e:
        # Surface expected copy failures as CLI usage errors, not tracebacks.
        parser.error(str(e))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,326 @@
#!/usr/bin/env python3
"""Generate an HTML report from run_loop.py output.
Takes the JSON output from run_loop.py and generates a visual HTML report
showing each description attempt with check/x for each test case.
Distinguishes between train and test queries.
"""
import argparse
import html
import json
import sys
from pathlib import Path
def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
    """Render the optimization-loop results as a standalone HTML page.

    Args:
        data: JSON output of run_loop.py. Reads "history" (list of iteration
            dicts), plus "original_description", "best_description",
            "best_score", "best_test_score", "iterations_run", "train_size",
            and "test_size" for the summary box.
        auto_refresh: If True, adds a <meta refresh> tag so the page reloads
            every 5 seconds while the loop is still running.
        skill_name: Optional skill name prepended to the page title/heading.

    Returns:
        The complete HTML document as a string.
    """
    history = data.get("history", [])
    title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""
    # Column layout is taken from the FIRST iteration: every iteration is
    # assumed to evaluate the same train/test queries. "results" is the
    # legacy key for "train_results".
    train_queries: list[dict] = []
    test_queries: list[dict] = []
    if history:
        for r in history[0].get("train_results", history[0].get("results", [])):
            train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
        if history[0].get("test_results"):
            for r in history[0].get("test_results", []):
                test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
    refresh_tag = '    <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""
    html_parts = ["""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
""" + refresh_tag + """    <title>""" + title_prefix + """Skill Description Optimization</title>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Lora', Georgia, serif;
            max-width: 100%;
            margin: 0 auto;
            padding: 20px;
            background: #faf9f5;
            color: #141413;
        }
        h1 { font-family: 'Poppins', sans-serif; color: #141413; }
        .explainer {
            background: white;
            padding: 15px;
            border-radius: 6px;
            margin-bottom: 20px;
            border: 1px solid #e8e6dc;
            color: #b0aea5;
            font-size: 0.875rem;
            line-height: 1.6;
        }
        .summary {
            background: white;
            padding: 15px;
            border-radius: 6px;
            margin-bottom: 20px;
            border: 1px solid #e8e6dc;
        }
        .summary p { margin: 5px 0; }
        .best { color: #788c5d; font-weight: bold; }
        .table-container {
            overflow-x: auto;
            width: 100%;
        }
        table {
            border-collapse: collapse;
            background: white;
            border: 1px solid #e8e6dc;
            border-radius: 6px;
            font-size: 12px;
            min-width: 100%;
        }
        th, td {
            padding: 8px;
            text-align: left;
            border: 1px solid #e8e6dc;
            white-space: normal;
            word-wrap: break-word;
        }
        th {
            font-family: 'Poppins', sans-serif;
            background: #141413;
            color: #faf9f5;
            font-weight: 500;
        }
        th.test-col {
            background: #6a9bcc;
        }
        th.query-col { min-width: 200px; }
        td.description {
            font-family: monospace;
            font-size: 11px;
            word-wrap: break-word;
            max-width: 400px;
        }
        td.result {
            text-align: center;
            font-size: 16px;
            min-width: 40px;
        }
        td.test-result {
            background: #f0f6fc;
        }
        .pass { color: #788c5d; }
        .fail { color: #c44; }
        .rate {
            font-size: 9px;
            color: #b0aea5;
            display: block;
        }
        tr:hover { background: #faf9f5; }
        .score {
            display: inline-block;
            padding: 2px 6px;
            border-radius: 4px;
            font-weight: bold;
            font-size: 11px;
        }
        .score-good { background: #eef2e8; color: #788c5d; }
        .score-ok { background: #fef3c7; color: #d97706; }
        .score-bad { background: #fceaea; color: #c44; }
        .train-label { color: #b0aea5; font-size: 10px; }
        .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
        .best-row { background: #f5f8f2; }
        th.positive-col { border-bottom: 3px solid #788c5d; }
        th.negative-col { border-bottom: 3px solid #c44; }
        th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
        th.test-col.negative-col { border-bottom: 3px solid #c44; }
        .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
        .legend-item { display: flex; align-items: center; gap: 6px; }
        .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
        .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
        .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
        .swatch-test { background: #6a9bcc; }
        .swatch-train { background: #141413; }
    </style>
</head>
<body>
    <h1>""" + title_prefix + """Skill Description Optimization</h1>
    <div class="explainer">
        <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
    </div>
"""]
    # Summary section. Fix: compare against None rather than truthiness so a
    # legitimate best_test_score of 0 is still labeled "(test)".
    best_test_score = data.get('best_test_score')
    html_parts.append(f"""
    <div class="summary">
        <p><strong>Original:</strong> {html.escape(data.get('original_description', 'N/A'))}</p>
        <p class="best"><strong>Best:</strong> {html.escape(data.get('best_description', 'N/A'))}</p>
        <p><strong>Best Score:</strong> {data.get('best_score', 'N/A')} {'(test)' if best_test_score is not None else '(train)'}</p>
        <p><strong>Iterations:</strong> {data.get('iterations_run', 0)} | <strong>Train:</strong> {data.get('train_size', '?')} | <strong>Test:</strong> {data.get('test_size', '?')}</p>
    </div>
""")
    # Legend
    html_parts.append("""
    <div class="legend">
        <span style="font-weight:600">Query columns:</span>
        <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
        <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
        <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
        <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
    </div>
""")
    # Table header
    html_parts.append("""
    <div class="table-container">
    <table>
        <thead>
        <tr>
            <th>Iter</th>
            <th>Train</th>
            <th>Test</th>
            <th class="query-col">Description</th>
""")
    # Add column headers for train queries
    for qinfo in train_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'            <th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')
    # Add column headers for test queries (different color)
    for qinfo in test_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'            <th class="test-col {polarity}">{html.escape(qinfo["query"])}</th>\n')
    html_parts.append("""        </tr>
        </thead>
        <tbody>
""")

    # Helpers hoisted out of the row loop (they were previously redefined on
    # every iteration of `for h in history`).
    def aggregate_runs(results: list[dict]) -> tuple[int, int]:
        # Count correct trigger decisions vs total runs across all retries:
        # a "correct" run triggered when it should have, or stayed quiet
        # when it should not have.
        correct = 0
        total = 0
        for r in results:
            runs = r.get("runs", 0)
            triggers = r.get("triggers", 0)
            total += runs
            if r.get("should_trigger", True):
                correct += triggers
            else:
                correct += runs - triggers
        return correct, total

    def score_class(correct: int, total: int) -> str:
        # Map a correct/total ratio to a CSS badge class ("bad" when no runs).
        if total > 0:
            ratio = correct / total
            if ratio >= 0.8:
                return "score-good"
            elif ratio >= 0.5:
                return "score-ok"
        return "score-bad"

    # Find best iteration for highlighting. Fix: guard against empty history —
    # max() on an empty sequence raises ValueError, which crashed the report
    # when run before the first iteration finished.
    if not history:
        best_iter = None
    elif test_queries:
        best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
    else:
        best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration")
    # Add rows for each iteration
    for h in history:
        iteration = h.get("iteration", "?")
        description = h.get("description", "")
        train_results = h.get("train_results", h.get("results", []))
        test_results = h.get("test_results", [])
        # Create lookups for results by query
        train_by_query = {r["query"]: r for r in train_results}
        test_by_query = {r["query"]: r for r in test_results} if test_results else {}
        train_correct, train_runs = aggregate_runs(train_results)
        test_correct, test_runs = aggregate_runs(test_results)
        train_class = score_class(train_correct, train_runs)
        test_class = score_class(test_correct, test_runs)
        row_class = "best-row" if iteration == best_iter else ""
        html_parts.append(f"""        <tr class="{row_class}">
            <td>{iteration}</td>
            <td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
            <td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
            <td class="description">{html.escape(description)}</td>
""")
        # Add result for each train query. Fix: restore the ✓/✗ glyphs —
        # the icon literals had become empty strings, leaving only the
        # tiny N/M rate visible in each cell.
        for qinfo in train_queries:
            r = train_by_query.get(qinfo["query"], {})
            did_pass = r.get("pass", False)
            triggers = r.get("triggers", 0)
            runs = r.get("runs", 0)
            icon = "\u2713" if did_pass else "\u2717"
            css_class = "pass" if did_pass else "fail"
            html_parts.append(f'            <td class="result {css_class}">{icon}<span class="rate">{triggers}/{runs}</span></td>\n')
        # Add result for each test query (with different background)
        for qinfo in test_queries:
            r = test_by_query.get(qinfo["query"], {})
            did_pass = r.get("pass", False)
            triggers = r.get("triggers", 0)
            runs = r.get("runs", 0)
            icon = "\u2713" if did_pass else "\u2717"
            css_class = "pass" if did_pass else "fail"
            html_parts.append(f'            <td class="result test-result {css_class}">{icon}<span class="rate">{triggers}/{runs}</span></td>\n')
        html_parts.append("        </tr>\n")
    html_parts.append("""        </tbody>
    </table>
    </div>
""")
    html_parts.append("""
</body>
</html>
""")
    return "".join(html_parts)
def main():
    """Parse CLI arguments, read loop output JSON, and emit the HTML report."""
    parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
    parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
    parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
    parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
    args = parser.parse_args()

    # "-" means read the JSON document from stdin instead of a file.
    data = (
        json.load(sys.stdin)
        if args.input == "-"
        else json.loads(Path(args.input).read_text())
    )
    report = generate_html(data, skill_name=args.skill_name)

    if args.output:
        # Write to the requested file; the status line goes to stderr so
        # stdout stays clean for piping.
        Path(args.output).write_text(report)
        print(f"Report written to {args.output}", file=sys.stderr)
    else:
        print(report)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""Improve a skill description based on eval results.
Takes eval results (from run_eval.py) and generates an improved description
using Claude with extended thinking.
"""
import argparse
import json
import re
import sys
from pathlib import Path
import anthropic
from scripts.utils import parse_skill_md
def improve_description(
    client: anthropic.Anthropic,
    skill_name: str,
    skill_content: str,
    current_description: str,
    eval_results: dict,
    history: list[dict],
    model: str,
    test_results: dict | None = None,
    log_dir: Path | None = None,
    iteration: int | None = None,
) -> str:
    """Call Claude to improve the description based on eval results.

    Args:
        client: Anthropic client used for the completion calls.
        skill_name: Name of the skill whose description is being optimized.
        skill_content: Full skill body, included in the prompt for context.
        current_description: The description that produced eval_results.
        eval_results: Eval output with a "results" list and a "summary"
            (passed/total) — presumably from run_eval.py per the module
            docstring.
        history: Previous attempts (description + scores) shown to the model
            so it does not repeat them.
        model: Model ID used for both the improve and the shorten calls.
        test_results: Optional held-out eval results; only used for the
            scores line in the prompt.
        log_dir: If given, a JSON transcript of the exchange is written here.
        iteration: Iteration number, used for the transcript filename.

    Returns:
        The new description text (re-shortened once if the first draft
        exceeded the 1024-character limit).
    """
    # Queries that should have triggered the skill but did not.
    failed_triggers = [
        r for r in eval_results["results"]
        if r["should_trigger"] and not r["pass"]
    ]
    # Queries that triggered the skill even though they should not have.
    false_triggers = [
        r for r in eval_results["results"]
        if not r["should_trigger"] and not r["pass"]
    ]
    # Build scores summary
    train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
    if test_results:
        test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
        scores_summary = f"Train: {train_score}, Test: {test_score}"
    else:
        scores_summary = f"Train: {train_score}"
    prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.
The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.
Here's the current description:
<current_description>
"{current_description}"
</current_description>
Current scores ({scores_summary}):
<scores_summary>
"""
    # Failure details: each failing query plus how often it (mis)triggered.
    if failed_triggers:
        prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n"
        for r in failed_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"
    if false_triggers:
        prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n"
        for r in false_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"
    # Prior attempts, with per-query train results when available, so the
    # model can try something structurally different.
    if history:
        prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n"
        for h in history:
            train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}"
            test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None
            score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
            prompt += f'<attempt {score_str}>\n'
            prompt += f'Description: "{h["description"]}"\n'
            if "results" in h:
                prompt += "Train results:\n"
                for r in h["results"]:
                    status = "PASS" if r["pass"] else "FAIL"
                    prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n'
            if h.get("note"):
                prompt += f'Note: {h["note"]}\n'
            prompt += "</attempt>\n\n"
    prompt += f"""</scores_summary>
Skill content (for context on what the skill does):
<skill_content>
{skill_content}
</skill_content>
Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold:
1. Avoid overfitting
2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description.
Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy.
Here are some tips that we've found to work well in writing these descriptions:
- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable.
- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.
I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.
Please respond with only the new description text in <new_description> tags, nothing else."""
    # Single completion with extended thinking enabled.
    response = client.messages.create(
        model=model,
        max_tokens=16000,
        thinking={
            "type": "enabled",
            "budget_tokens": 10000,
        },
        messages=[{"role": "user", "content": prompt}],
    )
    # Extract thinking and text from response
    thinking_text = ""
    text = ""
    for block in response.content:
        if block.type == "thinking":
            thinking_text = block.thinking
        elif block.type == "text":
            text = block.text
    # Parse out the <new_description> tags; fall back to the whole reply if
    # the model omitted them. Surrounding quotes are stripped either way.
    match = re.search(r"<new_description>(.*?)</new_description>", text, re.DOTALL)
    description = match.group(1).strip().strip('"') if match else text.strip().strip('"')
    # Log the transcript
    transcript: dict = {
        "iteration": iteration,
        "prompt": prompt,
        "thinking": thinking_text,
        "response": text,
        "parsed_description": description,
        "char_count": len(description),
        "over_limit": len(description) > 1024,
    }
    # If over 1024 chars, ask the model to shorten it
    # (1024 is the hard per-skill description limit; the rewrite is
    # attempted once and its result is used as-is).
    if len(description) > 1024:
        shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in <new_description> tags."
        shorten_response = client.messages.create(
            model=model,
            max_tokens=16000,
            thinking={
                "type": "enabled",
                "budget_tokens": 10000,
            },
            # Continue the same conversation so the model shortens its own draft.
            messages=[
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": text},
                {"role": "user", "content": shorten_prompt},
            ],
        )
        shorten_thinking = ""
        shorten_text = ""
        for block in shorten_response.content:
            if block.type == "thinking":
                shorten_thinking = block.thinking
            elif block.type == "text":
                shorten_text = block.text
        match = re.search(r"<new_description>(.*?)</new_description>", shorten_text, re.DOTALL)
        shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')
        transcript["rewrite_prompt"] = shorten_prompt
        transcript["rewrite_thinking"] = shorten_thinking
        transcript["rewrite_response"] = shorten_text
        transcript["rewrite_description"] = shortened
        transcript["rewrite_char_count"] = len(shortened)
        description = shortened
    transcript["final_description"] = description
    if log_dir:
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json"
        log_file.write_text(json.dumps(transcript, indent=2))
    return description
def main():
    """CLI entry point: read eval results, generate an improved skill
    description, and emit the result (plus updated history) as JSON on stdout.
    """
    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
    opts = parser.parse_args()

    skill_dir = Path(opts.skill_path)
    if not (skill_dir / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_dir}", file=sys.stderr)
        sys.exit(1)

    eval_results = json.loads(Path(opts.eval_results).read_text())
    # History of previous attempts is optional; default to a fresh list.
    history = json.loads(Path(opts.history).read_text()) if opts.history else []

    name, _, content = parse_skill_md(skill_dir)
    current_description = eval_results["description"]
    if opts.verbose:
        print(f"Current: {current_description}", file=sys.stderr)
        print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr)

    client = anthropic.Anthropic()
    new_description = improve_description(
        client=client,
        skill_name=name,
        skill_content=content,
        current_description=current_description,
        eval_results=eval_results,
        history=history,
        model=opts.model,
    )
    if opts.verbose:
        print(f"Improved: {new_description}", file=sys.stderr)

    # Output as JSON: the new description plus history extended with the
    # attempt that was just evaluated.
    summary = eval_results["summary"]
    attempt = {
        "description": current_description,
        "passed": summary["passed"],
        "failed": summary["failed"],
        "total": summary["total"],
        "results": eval_results["results"],
    }
    output = {"description": new_description, "history": history + [attempt]}
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -1,311 +0,0 @@
#!/usr/bin/env python3
"""
Initialize JSON files with the correct structure for skill-creator-edge.
Creates template JSON files that can be filled in.
Usage:
python init_json.py <type> <output_path>
Examples:
python init_json.py evals evals/evals.json
python init_json.py grading run-1/grading.json
python init_json.py benchmark benchmarks/2026-01-15/benchmark.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
# Template JSON structures, keyed by the `type` CLI argument.  Each value is
# dumped verbatim with json.dump() as a starting point for the user to fill in.
# NOTE: the datetime.now() calls below are evaluated once, at import time.
TEMPLATES: dict = {
    # Eval-set definition for a skill: task prompts with expected outcomes.
    "evals": {
        "skill_name": "<skill-name>",
        "evals": [
            {
                "id": 1,
                "prompt": "Example task prompt",
                "expected_output": "Description of expected result",
                "files": [],
                "expectations": [
                    "The output includes X",
                    "The skill correctly handles Y"
                ]
            }
        ]
    },
    # Per-run grading output: expectation verdicts plus execution/timing metrics.
    "grading": {
        "expectations": [
            {
                "text": "Example expectation",
                "passed": True,
                "evidence": "Found in transcript: ..."
            }
        ],
        "summary": {
            "passed": 1,
            "failed": 0,
            "total": 1,
            "pass_rate": 1.0
        },
        "execution_metrics": {
            "tool_calls": {
                "Read": 0,
                "Write": 0,
                "Bash": 0,
                "Edit": 0,
                "Glob": 0,
                "Grep": 0
            },
            "total_tool_calls": 0,
            "total_steps": 0,
            "errors_encountered": 0,
            "output_chars": 0,
            "transcript_chars": 0
        },
        "timing": {
            "executor_duration_seconds": 0.0,
            "grader_duration_seconds": 0.0,
            "total_duration_seconds": 0.0
        },
        "claims": [],
        "user_notes_summary": {
            "uncertainties": [],
            "needs_review": [],
            "workarounds": []
        }
    },
    # Aggregated benchmark results comparing with-skill vs without-skill runs.
    "benchmark": {
        "metadata": {
            "skill_name": "<skill-name>",
            "skill_path": "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": [1],
            "runs_per_configuration": 3
        },
        "runs": [
            {
                "eval_id": 1,
                "configuration": "with_skill",
                "run_number": 1,
                "result": {
                    "pass_rate": 0.0,
                    "passed": 0,
                    "failed": 0,
                    "total": 0,
                    "time_seconds": 0.0,
                    "tokens": 0,
                    "tool_calls": 0,
                    "errors": 0
                },
                "expectations": [],
                "notes": []
            }
        ],
        "run_summary": {
            "with_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            "without_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            # Deltas are formatted strings with an explicit sign (e.g. "+0.2").
            "delta": {
                "pass_rate": "+0.0",
                "time_seconds": "+0.0",
                "tokens": "+0"
            }
        },
        "notes": []
    },
    # Standalone execution metrics (same shape as grading.execution_metrics,
    # plus the list of files created during the run).
    "metrics": {
        "tool_calls": {
            "Read": 0,
            "Write": 0,
            "Bash": 0,
            "Edit": 0,
            "Glob": 0,
            "Grep": 0
        },
        "total_tool_calls": 0,
        "total_steps": 0,
        "files_created": [],
        "errors_encountered": 0,
        "output_chars": 0,
        "transcript_chars": 0
    },
    # Timing breakdown for a single executor + grader run (ISO-8601 UTC stamps).
    "timing": {
        "executor_start": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "executor_end": "",
        "executor_duration_seconds": 0.0,
        "grader_start": "",
        "grader_end": "",
        "grader_duration_seconds": 0.0,
        "total_duration_seconds": 0.0
    },
    # Iteration history across skill versions; v0 is the baseline.
    "history": {
        "started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "skill_name": "<skill-name>",
        "current_best": "v0",
        "iterations": [
            {
                "version": "v0",
                "parent": None,
                "expectation_pass_rate": 0.0,
                "grading_result": "baseline",
                "is_current_best": True
            }
        ]
    },
    # Head-to-head comparison of two skill versions ("A" vs "B") with a rubric.
    "comparison": {
        "winner": "A",
        "reasoning": "Explanation of why the winner was chosen",
        "rubric": {
            "A": {
                "content": {
                    "correctness": 5,
                    "completeness": 5,
                    "accuracy": 5
                },
                "structure": {
                    "organization": 5,
                    "formatting": 5,
                    "usability": 5
                },
                "content_score": 5.0,
                "structure_score": 5.0,
                "overall_score": 10.0
            },
            "B": {
                "content": {
                    "correctness": 3,
                    "completeness": 3,
                    "accuracy": 3
                },
                "structure": {
                    "organization": 3,
                    "formatting": 3,
                    "usability": 3
                },
                "content_score": 3.0,
                "structure_score": 3.0,
                "overall_score": 6.0
            }
        },
        "output_quality": {
            "A": {
                "score": 10,
                "strengths": [],
                "weaknesses": []
            },
            "B": {
                "score": 6,
                "strengths": [],
                "weaknesses": []
            }
        }
    },
    # Post-comparison analysis: winner/loser breakdown plus suggestions.
    "analysis": {
        "comparison_summary": {
            "winner": "A",
            "winner_skill": "<path/to/winner>",
            "loser_skill": "<path/to/loser>",
            "comparator_reasoning": "Summary of comparison"
        },
        "winner_strengths": [],
        "loser_weaknesses": [],
        "instruction_following": {
            "winner": {
                "score": 10,
                "issues": []
            },
            "loser": {
                "score": 5,
                "issues": []
            }
        },
        "improvement_suggestions": [
            {
                "priority": "high",
                "category": "instructions",
                "suggestion": "Specific improvement suggestion",
                "expected_impact": "Why this would help"
            }
        ],
        "transcript_insights": {
            "winner_execution_pattern": "Description of how winner executed",
            "loser_execution_pattern": "Description of how loser executed"
        }
    }
}
def init_json(json_type: str, output_path: Path, force: bool = False) -> bool:
    """Write the template for *json_type* to *output_path*.

    Refuses to overwrite an existing file unless *force* is set.
    Returns True on success, False on failure.
    """
    template = TEMPLATES.get(json_type)
    if template is None:
        print(f"Unknown type: {json_type}. Valid types: {list(TEMPLATES.keys())}")
        return False

    if output_path.exists() and not force:
        print(f"File already exists: {output_path}")
        print("Use --force to overwrite")
        return False

    # Make sure the destination directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(template, f, indent=2)
    print(f"Created {json_type} template: {output_path}")
    return True
def main():
    """Parse CLI arguments, create the requested template file, and exit 0/1."""
    parser = argparse.ArgumentParser(description="Initialize JSON files with correct structure")
    parser.add_argument("type", choices=list(TEMPLATES.keys()), help="Type of JSON file to create")
    parser.add_argument("output", type=Path, help="Output path for the JSON file")
    parser.add_argument("--force", "-f", action="store_true", help="Overwrite existing file")
    ns = parser.parse_args()
    # Exit code mirrors init_json's boolean result: 0 on success, 1 on failure.
    sys.exit(0 if init_json(ns.type, ns.output, ns.force) else 1)


if __name__ == "__main__":
    main()

View File

@@ -1,303 +0,0 @@
#!/usr/bin/env python3
"""
Skill Initializer - Creates a new skill from template
Usage:
init_skill.py <skill-name> --path <path>
Examples:
init_skill.py my-new-skill --path skills/public
init_skill.py my-api-helper --path skills/private
init_skill.py custom-skill --path /custom/location
"""
import sys
from pathlib import Path
SKILL_TEMPLATE = """---
name: {skill_name}
description: [TODO: Complete and informative explanation of what the skill does and when to use it. Include WHEN to use this skill - specific scenarios, file types, or tasks that trigger it.]
---
# {skill_title}
## Overview
[TODO: 1-2 sentences explaining what this skill enables]
## Structuring This Skill
[TODO: Choose the structure that best fits this skill's purpose. Common patterns:
**1. Workflow-Based** (best for sequential processes)
- Works well when there are clear step-by-step procedures
- Example: DOCX skill with "Workflow Decision Tree""Reading""Creating""Editing"
- Structure: ## Overview → ## Workflow Decision Tree → ## Step 1 → ## Step 2...
**2. Task-Based** (best for tool collections)
- Works well when the skill offers different operations/capabilities
- Example: PDF skill with "Quick Start""Merge PDFs""Split PDFs""Extract Text"
- Structure: ## Overview → ## Quick Start → ## Task Category 1 → ## Task Category 2...
**3. Reference/Guidelines** (best for standards or specifications)
- Works well for brand guidelines, coding standards, or requirements
- Example: Brand styling with "Brand Guidelines""Colors""Typography""Features"
- Structure: ## Overview → ## Guidelines → ## Specifications → ## Usage...
**4. Capabilities-Based** (best for integrated systems)
- Works well when the skill provides multiple interrelated features
- Example: Product Management with "Core Capabilities" → numbered capability list
- Structure: ## Overview → ## Core Capabilities → ### 1. Feature → ### 2. Feature...
Patterns can be mixed and matched as needed. Most skills combine patterns (e.g., start with task-based, add workflow for complex operations).
Delete this entire "Structuring This Skill" section when done - it's just guidance.]
## [TODO: Replace with the first main section based on chosen structure]
[TODO: Add content here. See examples in existing skills:
- Code samples for technical skills
- Decision trees for complex workflows
- Concrete examples with realistic user requests
- References to scripts/templates/references as needed]
## Resources
This skill includes example resource directories that demonstrate how to organize different types of bundled resources:
### scripts/
Executable code (Python/Bash/etc.) that can be run directly to perform specific operations.
**Examples from other skills:**
- PDF skill: `fill_fillable_fields.py`, `extract_form_field_info.py` - utilities for PDF manipulation
- DOCX skill: `document.py`, `utilities.py` - Python modules for document processing
**Appropriate for:** Python scripts, shell scripts, or any executable code that performs automation, data processing, or specific operations.
**Note:** Scripts may be executed without loading into context, but can still be read by Claude for patching or environment adjustments.
### references/
Documentation and reference material intended to be loaded into context to inform Claude's process and thinking.
**Examples from other skills:**
- Product management: `communication.md`, `context_building.md` - detailed workflow guides
- BigQuery: API reference documentation and query examples
- Finance: Schema documentation, company policies
**Appropriate for:** In-depth documentation, API references, database schemas, comprehensive guides, or any detailed information that Claude should reference while working.
### assets/
Files not intended to be loaded into context, but rather used within the output Claude produces.
**Examples from other skills:**
- Brand styling: PowerPoint template files (.pptx), logo files
- Frontend builder: HTML/React boilerplate project directories
- Typography: Font files (.ttf, .woff2)
**Appropriate for:** Templates, boilerplate code, document templates, images, icons, fonts, or any files meant to be copied or used in the final output.
---
**Any unneeded directories can be deleted.** Not every skill requires all three types of resources.
"""
EXAMPLE_SCRIPT = '''#!/usr/bin/env python3
"""
Example helper script for {skill_name}
This is a placeholder script that can be executed directly.
Replace with actual implementation or delete if not needed.
Example real scripts from other skills:
- pdf/scripts/fill_fillable_fields.py - Fills PDF form fields
- pdf/scripts/convert_pdf_to_images.py - Converts PDF pages to images
"""
def main():
print("This is an example script for {skill_name}")
# TODO: Add actual script logic here
# This could be data processing, file conversion, API calls, etc.
if __name__ == "__main__":
main()
'''
EXAMPLE_REFERENCE = """# Reference Documentation for {skill_title}
This is a placeholder for detailed reference documentation.
Replace with actual reference content or delete if not needed.
Example real reference docs from other skills:
- product-management/references/communication.md - Comprehensive guide for status updates
- product-management/references/context_building.md - Deep-dive on gathering context
- bigquery/references/ - API references and query examples
## When Reference Docs Are Useful
Reference docs are ideal for:
- Comprehensive API documentation
- Detailed workflow guides
- Complex multi-step processes
- Information too lengthy for main SKILL.md
- Content that's only needed for specific use cases
## Structure Suggestions
### API Reference Example
- Overview
- Authentication
- Endpoints with examples
- Error codes
- Rate limits
### Workflow Guide Example
- Prerequisites
- Step-by-step instructions
- Common patterns
- Troubleshooting
- Best practices
"""
EXAMPLE_ASSET = """# Example Asset File
This placeholder represents where asset files would be stored.
Replace with actual asset files (templates, images, fonts, etc.) or delete if not needed.
Asset files are NOT intended to be loaded into context, but rather used within
the output Claude produces.
Example asset files from other skills:
- Brand guidelines: logo.png, slides_template.pptx
- Frontend builder: hello-world/ directory with HTML/React boilerplate
- Typography: custom-font.ttf, font-family.woff2
- Data: sample_data.csv, test_dataset.json
## Common Asset Types
- Templates: .pptx, .docx, boilerplate directories
- Images: .png, .jpg, .svg, .gif
- Fonts: .ttf, .otf, .woff, .woff2
- Boilerplate code: Project directories, starter files
- Icons: .ico, .svg
- Data files: .csv, .json, .xml, .yaml
Note: This is a text placeholder. Actual assets can be any file type.
"""
def title_case_skill_name(skill_name):
    """Turn a kebab-case skill name into a space-separated Title Case label."""
    parts = skill_name.split('-')
    return ' '.join(part.capitalize() for part in parts)
def init_skill(skill_name, path):
    """
    Initialize a new skill directory with template SKILL.md.

    Creates <path>/<skill_name>/ containing SKILL.md plus example
    scripts/, references/, and assets/ resources, printing progress
    to stdout as it goes.

    Args:
        skill_name: Name of the skill (used verbatim as the directory name
            and substituted into the bundled templates)
        path: Path where the skill directory should be created

    Returns:
        Path to created skill directory, or None if error
    """
    # Determine skill directory path
    skill_dir = Path(path).resolve() / skill_name
    # Check if directory already exists; never clobber an existing skill.
    if skill_dir.exists():
        print(f"❌ Error: Skill directory already exists: {skill_dir}")
        return None
    # Create skill directory
    try:
        skill_dir.mkdir(parents=True, exist_ok=False)
        print(f"✅ Created skill directory: {skill_dir}")
    except Exception as e:
        print(f"❌ Error creating directory: {e}")
        return None
    # Create SKILL.md from template, filling in the name/title placeholders.
    skill_title = title_case_skill_name(skill_name)
    skill_content = SKILL_TEMPLATE.format(
        skill_name=skill_name,
        skill_title=skill_title
    )
    skill_md_path = skill_dir / 'SKILL.md'
    try:
        skill_md_path.write_text(skill_content)
        print("✅ Created SKILL.md")
    except Exception as e:
        print(f"❌ Error creating SKILL.md: {e}")
        return None
    # Create resource directories with example files
    try:
        # Create scripts/ directory with example script
        scripts_dir = skill_dir / 'scripts'
        scripts_dir.mkdir(exist_ok=True)
        example_script = scripts_dir / 'example.py'
        example_script.write_text(EXAMPLE_SCRIPT.format(skill_name=skill_name))
        # Mark the example script executable so it can be run directly.
        example_script.chmod(0o755)
        print("✅ Created scripts/example.py")
        # Create references/ directory with example reference doc
        references_dir = skill_dir / 'references'
        references_dir.mkdir(exist_ok=True)
        example_reference = references_dir / 'api_reference.md'
        example_reference.write_text(EXAMPLE_REFERENCE.format(skill_title=skill_title))
        print("✅ Created references/api_reference.md")
        # Create assets/ directory with example asset placeholder.
        # EXAMPLE_ASSET has no placeholders, so it is written verbatim.
        assets_dir = skill_dir / 'assets'
        assets_dir.mkdir(exist_ok=True)
        example_asset = assets_dir / 'example_asset.txt'
        example_asset.write_text(EXAMPLE_ASSET)
        print("✅ Created assets/example_asset.txt")
    except Exception as e:
        print(f"❌ Error creating resource directories: {e}")
        return None
    # Print next steps for the user to finish authoring the skill.
    print(f"\n✅ Skill '{skill_name}' initialized successfully at {skill_dir}")
    print("\nNext steps:")
    print("1. Edit SKILL.md to complete the TODO items and update the description")
    print("2. Customize or delete the example files in scripts/, references/, and assets/")
    print("3. Run the validator when ready to check the skill structure")
    return skill_dir
def main():
    """CLI entry point: validate argv by hand, then create the skill."""
    argv = sys.argv
    if len(argv) < 4 or argv[2] != '--path':
        print("Usage: init_skill.py <skill-name> --path <path>")
        print("\nSkill name requirements:")
        print(" - Kebab-case identifier (e.g., 'my-data-analyzer')")
        print(" - Lowercase letters, digits, and hyphens only")
        print(" - Max 64 characters")
        print(" - Must match directory name exactly")
        print("\nExamples:")
        print(" init_skill.py my-new-skill --path skills/public")
        print(" init_skill.py my-api-helper --path skills/private")
        print(" init_skill.py custom-skill --path /custom/location")
        sys.exit(1)

    skill_name, path = argv[1], argv[3]
    print(f"🚀 Initializing skill: {skill_name}")
    print(f" Location: {path}")
    print()
    # init_skill returns the created Path (truthy) or None on failure.
    sys.exit(0 if init_skill(skill_name, path) else 1)


if __name__ == "__main__":
    main()

View File

@@ -14,7 +14,7 @@ import fnmatch
import sys
import zipfile
from pathlib import Path
from quick_validate import validate_skill
from scripts.quick_validate import validate_skill
# Patterns to exclude when packaging skills.
EXCLUDE_DIRS = {"__pycache__", "node_modules"}

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env python3
"""
Prepare environment for running a skill eval.
Usage:
prepare_eval.py <skill-path> <eval-id> --output-dir <dir> [--no-skill]
Examples:
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/with-skill
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/without-skill --no-skill
Options:
<skill-path> Path to the skill directory
<eval-id> Index of the eval in evals/evals.json (0-based)
--output-dir Directory to prepare for the eval run
--no-skill If set, do not copy the skill (for baseline comparison)
"""
import json
import os
import shutil
import sys
from pathlib import Path
def is_writable(path: Path) -> bool:
    """Return True if *path* is a directory we can create files in.

    Probes by actually creating (then removing) a temporary file, which is
    more reliable than permission-bit checks on network mounts or under
    containerized users.
    """
    probe = path / ".write_test"
    try:
        probe.touch()
    except OSError:
        # Covers PermissionError too (it is an OSError subclass); the old
        # explicit (OSError, PermissionError) tuple was redundant.
        return False
    try:
        probe.unlink()
    except OSError:
        # Best-effort cleanup: writability was already proven by touch(),
        # so a failed unlink should not flip the answer to False.
        pass
    return True
def load_evals(skill_path: Path) -> list:
    """Return the eval list from <skill>/evals/evals.json.

    Accepts either a bare JSON list or an object wrapping the list under
    an "evals" key.  Raises FileNotFoundError when the file is missing and
    ValueError for any other shape.
    """
    evals_file = skill_path / "evals" / "evals.json"
    if not evals_file.exists():
        raise FileNotFoundError(f"Evals file not found: {evals_file}")

    data = json.loads(evals_file.read_text())
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and "evals" in data:
        return data["evals"]
    raise ValueError(
        f"Expected evals.json to contain a list or object with 'evals' key, "
        f"got {type(data).__name__}"
    )
def get_eval(evals: list, eval_id: int) -> dict:
    """Fetch the eval at 0-based index *eval_id*, raising IndexError when out of range."""
    if 0 <= eval_id < len(evals):
        return evals[eval_id]
    raise IndexError(f"Eval ID {eval_id} out of range (0-{len(evals)-1})")
def normalize_eval(eval_data: dict) -> dict:
    """Return {prompt, files, assertions} from either supported eval schema.

    Accepts the design-doc format (prompt, files, assertions) as well as the
    gym format (query, files, expected_behavior).  Structured
    expected_behavior entries are flattened into human-readable assertion
    strings; entries that are neither str nor dict are ignored.
    """
    # The prompt may live under "prompt" or "query".
    prompt = eval_data.get("prompt") or eval_data.get("query")
    if not prompt:
        raise ValueError("Eval must have either 'prompt' or 'query' field")

    assertions = eval_data.get("assertions")
    if assertions is None:
        assertions = []
        for entry in eval_data.get("expected_behavior", []):
            if isinstance(entry, str):
                assertions.append(entry)
            elif isinstance(entry, dict):
                # Flatten a structured assertion into a readable string:
                # "Assertion type: <kind> - key=value - ..."
                kind = entry.get("assertion", "unknown")
                details = [f"{k}={v}" for k, v in entry.items() if k != "assertion"]
                assertions.append(" - ".join([f"Assertion type: {kind}", *details]))

    return {
        "prompt": prompt,
        "files": eval_data.get("files", []),
        "assertions": assertions,
    }
def prepare_eval(skill_path: Path, eval_id: int, output_dir: Path, no_skill: bool = False) -> dict:
    """
    Prepare the environment for running an eval.

    Stages the eval's input files into <output_dir>/inputs/, creates an
    empty <output_dir>/outputs/, optionally copies the skill into
    <output_dir>/skill/, and writes eval_metadata.json describing the run.

    Args:
        skill_path: Path to the skill directory
        eval_id: Index of the eval in evals.json
        output_dir: Directory to prepare for the eval run
        no_skill: If True, do not copy the skill (for baseline comparison)

    Returns:
        Dictionary with eval metadata (also written to eval_metadata.json)

    Raises:
        FileNotFoundError: if the skill directory or its SKILL.md is missing
            (load_evals may also raise for a missing evals.json).
    """
    skill_path = Path(skill_path).resolve()
    output_dir = Path(output_dir).resolve()
    # Validate skill path before doing any filesystem work.
    if not skill_path.exists():
        raise FileNotFoundError(f"Skill directory not found: {skill_path}")
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        raise FileNotFoundError(f"SKILL.md not found in {skill_path}")
    # Load and get the specific eval, normalized to {prompt, files, assertions}.
    evals = load_evals(skill_path)
    eval_data = get_eval(evals, eval_id)
    normalized = normalize_eval(eval_data)
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create inputs directory and stage input files
    inputs_dir = output_dir / "inputs"
    inputs_dir.mkdir(exist_ok=True)
    staged_files = []
    for file_ref in normalized["files"]:
        # Resolve each file reference by trying three locations in order:
        # evals/files/, then evals/, then the skill root.
        source = skill_path / "evals" / "files" / file_ref
        if not source.exists():
            # Try relative to evals/ directly
            source = skill_path / "evals" / file_ref
        if not source.exists():
            # Try relative to skill root
            source = skill_path / file_ref
        if source.exists():
            # Stage under the bare filename, flattening any subdirectories
            # in the reference.
            dest = inputs_dir / Path(file_ref).name
            if source.is_file():
                shutil.copy2(source, dest)
            else:
                shutil.copytree(source, dest, dirs_exist_ok=True)
            staged_files.append(str(dest))
            print(f" Staged: {file_ref} -> {dest}")
        else:
            # Missing inputs are a warning, not an error: the eval still runs.
            print(f" Warning: File not found: {file_ref}")
    # Create outputs directory
    outputs_dir = output_dir / "outputs"
    outputs_dir.mkdir(exist_ok=True)
    # Copy skill if not --no-skill; a stale copy from a prior run is replaced.
    skill_copy_path = None
    if not no_skill:
        skill_copy_path = output_dir / "skill"
        if skill_copy_path.exists():
            shutil.rmtree(skill_copy_path)
        shutil.copytree(skill_path, skill_copy_path, dirs_exist_ok=True)
        skill_copy_path = str(skill_copy_path)
        print(f" Copied skill to: {skill_copy_path}")
    # Build metadata describing everything the eval runner needs.
    metadata = {
        "eval_id": eval_id,
        "prompt": normalized["prompt"],
        "assertions": normalized["assertions"],
        "input_files": staged_files,
        "skill_path": skill_copy_path,
        "output_dir": str(output_dir),
        "inputs_dir": str(inputs_dir),
        "outputs_dir": str(outputs_dir),
        "no_skill": no_skill,
        "original_skill_path": str(skill_path)
    }
    # Write metadata file alongside the staged inputs/outputs.
    metadata_path = output_dir / "eval_metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)
    print(f" Wrote: {metadata_path}")
    return metadata
def determine_workspace(skill_path: Path) -> Path:
    """Pick a workspace directory for eval runs.

    Writable skill directories get <skill>/workspace/.  Otherwise fall back
    to <project-root>/<skill-name>-workspace/, where the project root is the
    nearest ancestor containing .git (or the skill's parent when no .git is
    found before the filesystem root).
    """
    resolved = Path(skill_path).resolve()
    if is_writable(resolved):
        return resolved / "workspace"

    # Walk ancestors looking for a .git directory to treat as project root.
    candidate = resolved
    while candidate != candidate.parent and not (candidate / ".git").exists():
        candidate = candidate.parent
    if candidate == candidate.parent:
        # Reached the filesystem root without finding .git.
        candidate = resolved.parent
    return candidate / f"{resolved.name}-workspace"
def main():
    """Hand-rolled CLI parsing; see the module docstring for usage."""
    args = sys.argv[1:]
    if len(args) < 4 or "--output-dir" not in args:
        print(__doc__)
        sys.exit(1)

    # Positional arguments come first; flags may appear anywhere after them.
    skill_path = args[0]
    eval_id = int(args[1])
    no_skill = "--no-skill" in args

    flag_idx = args.index("--output-dir")
    if flag_idx + 1 >= len(args):
        print("Error: --output-dir requires a value")
        sys.exit(1)
    output_dir = args[flag_idx + 1]

    print(f"Preparing eval {eval_id} for skill: {skill_path}")
    print(f"Output directory: {output_dir}")
    print("Mode: without skill (baseline)" if no_skill else "Mode: with skill")
    print()

    try:
        metadata = prepare_eval(
            skill_path=Path(skill_path),
            eval_id=eval_id,
            output_dir=Path(output_dir),
            no_skill=no_skill,
        )
        print()
        print("Eval prepared successfully!")
        prompt = metadata["prompt"]
        if len(prompt) > 60:
            print(f" Prompt: {prompt[:60]}...")
        else:
            print(f" Prompt: {prompt}")
        print(f" Assertions: {len(metadata['assertions'])}")
        print(f" Input files: {len(metadata['input_files'])}")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,310 @@
#!/usr/bin/env python3
"""Run trigger evaluation for a skill description.
Tests whether a skill's description causes Claude to trigger (read the skill)
for a set of queries. Outputs results as JSON.
"""
import argparse
import json
import os
import select
import subprocess
import sys
import time
import uuid
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from scripts.utils import parse_skill_md
def find_project_root() -> Path:
    """Locate the nearest ancestor of cwd (inclusive) containing .claude/.

    Mirrors how Claude Code discovers its project root, so the command file
    we create ends up where `claude -p` will look for it.  Falls back to the
    current directory when no ancestor qualifies.
    """
    cwd = Path.cwd()
    candidates = (cwd, *cwd.parents)
    matches = (p for p in candidates if (p / ".claude").is_dir())
    return next(matches, cwd)
def run_single_query(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run a single query and return whether the skill was triggered.

    Creates a command file in .claude/commands/ so it appears in Claude's
    available_skills list, then runs `claude -p` with the raw query.
    Uses --include-partial-messages to detect triggering early from
    stream events (content_block_start) rather than waiting for the
    full assistant message, which only arrives after tool execution.

    Args:
        query: The raw user query to send to `claude -p`.
        skill_name: Base name of the skill under test.
        skill_description: Description text placed in the command file.
        timeout: Seconds to wait for a verdict before killing the subprocess.
        project_root: Directory used as cwd and as the .claude/ location.
        model: Optional model override passed via --model.

    Returns:
        True if the (uniquely renamed) skill was invoked, False otherwise.
    """
    # A unique suffix prevents collisions between concurrent workers and
    # ensures matches in the stream are attributable to this run.
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"
    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        # Use YAML block scalar to avoid breaking on quotes in description
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        command_file.write_text(command_content)
        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])
        # Remove CLAUDECODE env var to allow nesting claude -p inside a
        # Claude Code session. The guard is for interactive terminal conflicts;
        # programmatic subprocess usage is safe.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )
        triggered = False
        start_time = time.time()
        buffer = ""
        # Track state for stream event detection: the name of a tool_use
        # block currently being streamed, and its partial JSON input so far.
        pending_tool_name = None
        accumulated_json = ""
        try:
            # Poll-and-read loop: read stdout incrementally until we can
            # decide, the process exits, or the timeout elapses.
            while time.time() - start_time < timeout:
                if process.poll() is not None:
                    # Process exited; drain whatever output remains.
                    remaining = process.stdout.read()
                    if remaining:
                        buffer += remaining.decode("utf-8", errors="replace")
                    break
                # 1-second select keeps the timeout check responsive.
                ready, _, _ = select.select([process.stdout], [], [], 1.0)
                if not ready:
                    continue
                chunk = os.read(process.stdout.fileno(), 8192)
                if not chunk:
                    break
                buffer += chunk.decode("utf-8", errors="replace")
                # Process complete newline-delimited JSON events.
                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        # Ignore partial/non-JSON lines.
                        continue
                    # Early detection via stream events
                    if event.get("type") == "stream_event":
                        se = event.get("event", {})
                        se_type = se.get("type", "")
                        if se_type == "content_block_start":
                            cb = se.get("content_block", {})
                            if cb.get("type") == "tool_use":
                                tool_name = cb.get("name", "")
                                if tool_name in ("Skill", "Read"):
                                    # Candidate trigger: start accumulating
                                    # its streamed JSON input.
                                    pending_tool_name = tool_name
                                    accumulated_json = ""
                                else:
                                    # Any other tool first means no trigger.
                                    return False
                        elif se_type == "content_block_delta" and pending_tool_name:
                            delta = se.get("delta", {})
                            if delta.get("type") == "input_json_delta":
                                accumulated_json += delta.get("partial_json", "")
                                # Decide as soon as our unique name appears.
                                if clean_name in accumulated_json:
                                    return True
                        elif se_type in ("content_block_stop", "message_stop"):
                            if pending_tool_name:
                                # Tool input finished streaming; final verdict.
                                return clean_name in accumulated_json
                            if se_type == "message_stop":
                                # Message ended without any candidate tool use.
                                return False
                    # Fallback: full assistant message
                    elif event.get("type") == "assistant":
                        message = event.get("message", {})
                        for content_item in message.get("content", []):
                            if content_item.get("type") != "tool_use":
                                continue
                            tool_name = content_item.get("name", "")
                            tool_input = content_item.get("input", {})
                            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                                triggered = True
                            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                                triggered = True
                        return triggered
                    elif event.get("type") == "result":
                        return triggered
        finally:
            # Clean up process on any exit path (return, exception, timeout)
            if process.poll() is None:
                process.kill()
                process.wait()
        # Loop ended without an explicit verdict (timeout or EOF).
        return triggered
    finally:
        # Always remove the temporary command file, even on failure.
        if command_file.exists():
            command_file.unlink()
def run_eval(
eval_set: list[dict],
skill_name: str,
description: str,
num_workers: int,
timeout: int,
project_root: Path,
runs_per_query: int = 1,
trigger_threshold: float = 0.5,
model: str | None = None,
) -> dict:
"""Run the full eval set and return results."""
results = []
with ProcessPoolExecutor(max_workers=num_workers) as executor:
future_to_info = {}
for item in eval_set:
for run_idx in range(runs_per_query):
future = executor.submit(
run_single_query,
item["query"],
skill_name,
description,
timeout,
str(project_root),
model,
)
future_to_info[future] = (item, run_idx)
query_triggers: dict[str, list[bool]] = {}
query_items: dict[str, dict] = {}
for future in as_completed(future_to_info):
item, _ = future_to_info[future]
query = item["query"]
query_items[query] = item
if query not in query_triggers:
query_triggers[query] = []
try:
query_triggers[query].append(future.result())
except Exception as e:
print(f"Warning: query failed: {e}", file=sys.stderr)
query_triggers[query].append(False)
for query, triggers in query_triggers.items():
item = query_items[query]
trigger_rate = sum(triggers) / len(triggers)
should_trigger = item["should_trigger"]
if should_trigger:
did_pass = trigger_rate >= trigger_threshold
else:
did_pass = trigger_rate < trigger_threshold
results.append({
"query": query,
"should_trigger": should_trigger,
"trigger_rate": trigger_rate,
"triggers": sum(triggers),
"runs": len(triggers),
"pass": did_pass,
})
passed = sum(1 for r in results if r["pass"])
total = len(results)
return {
"skill_name": skill_name,
"description": description,
"results": results,
"summary": {
"total": total,
"passed": passed,
"failed": total - passed,
},
}
def main():
    """CLI entry point: evaluate one skill description against an eval set."""
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    queries = json.loads(Path(args.eval_set).read_text())
    skill_dir = Path(args.skill_path)
    if not (skill_dir / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_dir}", file=sys.stderr)
        sys.exit(1)

    name, original_description, _ = parse_skill_md(skill_dir)
    # A --description override takes precedence over the one in SKILL.md.
    description = args.description or original_description
    project_root = find_project_root()
    if args.verbose:
        print(f"Evaluating: {description}", file=sys.stderr)

    report = run_eval(
        eval_set=queries,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = report["summary"]
        print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr)
        for entry in report["results"]:
            status = "PASS" if entry["pass"] else "FAIL"
            rate_str = f"{entry['triggers']}/{entry['runs']}"
            print(f"  [{status}] rate={rate_str} expected={entry['should_trigger']}: {entry['query'][:70]}", file=sys.stderr)

    # Machine-readable output goes to stdout; all progress went to stderr.
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,332 @@
#!/usr/bin/env python3
"""Run the eval + improve loop until all pass or max iterations reached.
Combines run_eval.py and improve_description.py in a loop, tracking history
and returning the best description found. Supports train/test split to prevent
overfitting.
"""
import argparse
import json
import random
import sys
import tempfile
import time
import webbrowser
from pathlib import Path
import anthropic
from scripts.generate_report import generate_html
from scripts.improve_description import improve_description
from scripts.run_eval import find_project_root, run_eval
from scripts.utils import parse_skill_md
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split eval set into train and test sets, stratified by should_trigger.

    Args:
        eval_set: Eval entries, each carrying a boolean "should_trigger" key.
        holdout: Fraction of each stratum to reserve for the test set.
        seed: RNG seed; the split is deterministic for a given seed.

    Returns:
        (train_set, test_set). Each non-empty stratum contributes at least
        one entry to the test set.
    """
    # Use a private Random instance instead of random.seed()/random.shuffle():
    # reseeding the module-global RNG is a hidden side effect that would make
    # unrelated callers of the random module deterministic too.
    rng = random.Random(seed)
    # Stratify by should_trigger so both splits see positives and negatives.
    trigger = [e for e in eval_set if e["should_trigger"]]
    no_trigger = [e for e in eval_set if not e["should_trigger"]]
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)
    # Reserve at least one entry per (non-empty) stratum for the test set.
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
    return train_set, test_set
def run_loop(
    eval_set: list[dict],
    skill_path: Path,
    description_override: str | None,
    num_workers: int,
    timeout: int,
    max_iterations: int,
    runs_per_query: int,
    trigger_threshold: float,
    holdout: float,
    model: str,
    verbose: bool,
    live_report_path: Path | None = None,
    log_dir: Path | None = None,
) -> dict:
    """Run the eval + improvement loop.

    Each iteration evaluates the current description against the full eval
    set (train + test in one batch), records the scores in ``history``,
    optionally rewrites a live HTML report, and — unless all train queries
    passed or the iteration budget is exhausted — asks the improvement model
    for a new description based on the train results only (test results are
    stripped from the history it sees, to prevent leakage).

    Args:
        eval_set: Eval entries with "query" and "should_trigger" keys.
        skill_path: Directory containing SKILL.md.
        description_override: Starting description; None uses SKILL.md's.
        num_workers: Parallel workers passed through to run_eval.
        timeout: Per-query timeout in seconds.
        max_iterations: Maximum eval/improve iterations.
        runs_per_query: Repeats per query when measuring trigger rate.
        trigger_threshold: Trigger-rate cutoff for pass/fail.
        holdout: Fraction held out as a test split (0 disables the split).
        model: Model used for both eval and the improvement step.
        verbose: When True, prints per-iteration progress to stderr.
        live_report_path: If set, rewritten with an auto-refreshing HTML
            report after every iteration.
        log_dir: If set, forwarded to improve_description for logging.

    Returns:
        Dict with exit reason, original/best/final descriptions, scores,
        split sizes, and the full per-iteration history.
    """
    project_root = find_project_root()
    name, original_description, content = parse_skill_md(skill_path)
    current_description = description_override or original_description
    # Split into train/test if holdout > 0
    if holdout > 0:
        train_set, test_set = split_eval_set(eval_set, holdout)
        if verbose:
            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
    else:
        train_set = eval_set
        test_set = []
    client = anthropic.Anthropic()
    history: list[dict] = []
    exit_reason = "unknown"
    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
            print(f"Description: {current_description}", file=sys.stderr)
            print(f"{'='*60}", file=sys.stderr)
        # Evaluate train + test together in one batch for parallelism
        all_queries = train_set + test_set
        t0 = time.time()
        all_results = run_eval(
            eval_set=all_queries,
            skill_name=name,
            description=current_description,
            num_workers=num_workers,
            timeout=timeout,
            project_root=project_root,
            runs_per_query=runs_per_query,
            trigger_threshold=trigger_threshold,
            model=model,
        )
        eval_elapsed = time.time() - t0
        # Split results back into train/test by matching queries
        # NOTE(review): this assumes a query's text appears in only one of the
        # two splits — a query duplicated across train and test would have all
        # its results bucketed as train. Confirm eval sets never repeat queries.
        train_queries_set = {q["query"] for q in train_set}
        train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
        test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
        train_passed = sum(1 for r in train_result_list if r["pass"])
        train_total = len(train_result_list)
        train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
        train_results = {"results": train_result_list, "summary": train_summary}
        if test_set:
            test_passed = sum(1 for r in test_result_list if r["pass"])
            test_total = len(test_result_list)
            test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
            test_results = {"results": test_result_list, "summary": test_summary}
        else:
            test_results = None
            test_summary = None
        history.append({
            "iteration": iteration,
            "description": current_description,
            "train_passed": train_summary["passed"],
            "train_failed": train_summary["failed"],
            "train_total": train_summary["total"],
            "train_results": train_results["results"],
            "test_passed": test_summary["passed"] if test_summary else None,
            "test_failed": test_summary["failed"] if test_summary else None,
            "test_total": test_summary["total"] if test_summary else None,
            "test_results": test_results["results"] if test_results else None,
            # For backward compat with report generator
            "passed": train_summary["passed"],
            "failed": train_summary["failed"],
            "total": train_summary["total"],
            "results": train_results["results"],
        })
        # Write live report if path provided
        if live_report_path:
            partial_output = {
                "original_description": original_description,
                "best_description": current_description,
                "best_score": "in progress",
                "iterations_run": len(history),
                "holdout": holdout,
                "train_size": len(train_set),
                "test_size": len(test_set),
                "history": history,
            }
            # auto_refresh=True so the browser tab keeps reloading mid-run.
            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
        if verbose:
            def print_eval_stats(label, results, elapsed):
                """Print a confusion-matrix summary plus per-query lines to stderr."""
                pos = [r for r in results if r["should_trigger"]]
                neg = [r for r in results if not r["should_trigger"]]
                # Counts are over individual runs, not queries.
                tp = sum(r["triggers"] for r in pos)
                pos_runs = sum(r["runs"] for r in pos)
                fn = pos_runs - tp
                fp = sum(r["triggers"] for r in neg)
                neg_runs = sum(r["runs"] for r in neg)
                tn = neg_runs - fp
                total = tp + tn + fp + fn
                precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
                accuracy = (tp + tn) / total if total > 0 else 0.0
                print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
                for r in results:
                    status = "PASS" if r["pass"] else "FAIL"
                    rate_str = f"{r['triggers']}/{r['runs']}"
                    print(f"  [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)
            print_eval_stats("Train", train_results["results"], eval_elapsed)
            if test_summary:
                print_eval_stats("Test ", test_results["results"], 0)
        if train_summary["failed"] == 0:
            exit_reason = f"all_passed (iteration {iteration})"
            if verbose:
                print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
            break
        if iteration == max_iterations:
            # Stop before paying for an improvement step we would never evaluate.
            exit_reason = f"max_iterations ({max_iterations})"
            if verbose:
                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
            break
        # Improve the description based on train results
        if verbose:
            print(f"\nImproving description...", file=sys.stderr)
        t0 = time.time()
        # Strip test scores from history so improvement model can't see them
        blinded_history = [
            {k: v for k, v in h.items() if not k.startswith("test_")}
            for h in history
        ]
        new_description = improve_description(
            client=client,
            skill_name=name,
            skill_content=content,
            current_description=current_description,
            eval_results=train_results,
            history=blinded_history,
            model=model,
            log_dir=log_dir,
            iteration=iteration,
        )
        improve_elapsed = time.time() - t0
        if verbose:
            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
        current_description = new_description
    # Find the best iteration by TEST score (or train if no test set)
    if test_set:
        best = max(history, key=lambda h: h["test_passed"] or 0)
        best_score = f"{best['test_passed']}/{best['test_total']}"
    else:
        best = max(history, key=lambda h: h["train_passed"])
        best_score = f"{best['train_passed']}/{best['train_total']}"
    if verbose:
        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)
    return {
        "exit_reason": exit_reason,
        "original_description": original_description,
        "best_description": best["description"],
        "best_score": best_score,
        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
        "final_description": current_description,
        "iterations_run": len(history),
        "holdout": holdout,
        "train_size": len(train_set),
        "test_size": len(test_set),
        "history": history,
    }
def main():
    """CLI entry point: run the eval + improve loop and emit reports."""
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
    args = parser.parse_args()

    eval_set = json.loads(Path(args.eval_set).read_text())
    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)
    name, _, _ = parse_skill_md(skill_path)

    # Set up the live report path and open it in a browser right away so the
    # user can watch progress while the loop runs.
    live_report_path = None
    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        webbrowser.open(str(live_report_path))

    # Create the results directory before run_loop so logs can be written.
    results_dir = None
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    log_dir = results_dir / "logs" if results_dir else None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # Save JSON output (stdout plus results.json when a results dir is used).
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output)
    # Write final HTML report (without auto-refresh)
    if live_report_path:
        live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
        print(f"\nReport: {live_report_path}", file=sys.stderr)
        if results_dir:
            (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
    if results_dir:
        print(f"Results saved to: {results_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,47 @@
"""Shared utilities for skill-creator scripts."""
from pathlib import Path
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a SKILL.md file, returning (name, description, full_content).

    Reads ``skill_path / "SKILL.md"``, extracts the ``name:`` and
    ``description:`` keys from the YAML frontmatter (the section delimited by
    ``---`` lines), and returns them along with the untouched file content.

    Raises:
        ValueError: if the opening or closing ``---`` delimiter is missing.
    """
    content = (skill_path / "SKILL.md").read_text()
    lines = content.split("\n")
    if lines[0].strip() != "---":
        raise ValueError("SKILL.md missing frontmatter (no opening ---)")
    end_idx = None
    for idx, line in enumerate(lines[1:], start=1):
        if line.strip() == "---":
            end_idx = idx
            break
    if end_idx is None:
        raise ValueError("SKILL.md missing frontmatter (no closing ---)")

    frontmatter = lines[1:end_idx]
    name = ""
    description = ""
    pos = 0
    while pos < len(frontmatter):
        current = frontmatter[pos]
        if current.startswith("name:"):
            name = current[len("name:"):].strip().strip('"').strip("'")
        elif current.startswith("description:"):
            remainder = current[len("description:"):].strip()
            if remainder in (">", "|", ">-", "|-"):
                # Block scalar (folded/literal): collect the indented
                # continuation lines and join them into one line.
                pos += 1
                collected: list[str] = []
                while pos < len(frontmatter) and frontmatter[pos][:1] in (" ", "\t"):
                    collected.append(frontmatter[pos].strip())
                    pos += 1
                description = " ".join(collected)
                continue  # pos already points at the next unconsumed line
            description = remainder.strip('"').strip("'")
        pos += 1
    return name, description, content

View File

@@ -1,411 +0,0 @@
#!/usr/bin/env python3
"""
Validate JSON files produced by skill-creator-edge.
Supports validation for:
- evals.json: Skill evaluation definitions
- grading.json: Grader output
- benchmark.json: Benchmark results
- metrics.json: Executor metrics
- timing.json: Timing data
- history.json: Improve mode version history
- comparison.json: Blind comparator output
- analysis.json: Post-hoc analyzer output
Usage:
python validate_json.py <file_path> [--type <type>]
Examples:
python validate_json.py workspace/benchmark.json
python validate_json.py evals/evals.json --type evals
python validate_json.py run-1/grading.json --type grading
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Any
# Schema definitions as validation rules.
# Each top-level key names one JSON artifact type. Within a type:
#   - "required" lists keys that must exist at the top level of the document;
#   - "optional" (where present) documents recognized-but-optional keys;
#   - nested entries (e.g. "evals_item", "summary") list required keys of
#     list elements or sub-objects, enforced by the matching validate_* function.
SCHEMAS = {
    "evals": {
        "required": ["skill_name", "evals"],
        "evals_item": {
            "required": ["id", "prompt"],
            "optional": ["expected_output", "files", "expectations"]
        }
    },
    "grading": {
        "required": ["expectations", "summary"],
        "summary": {
            "required": ["passed", "failed", "total", "pass_rate"]
        },
        "expectations_item": {
            "required": ["text", "passed", "evidence"]
        }
    },
    "benchmark": {
        "required": ["metadata", "runs", "run_summary"],
        "metadata": {
            "required": ["skill_name", "timestamp", "runs_per_configuration"]
        },
        "runs_item": {
            "required": ["eval_id", "configuration", "run_number", "result"]
        },
        "run_summary_config": {
            "required": ["pass_rate", "time_seconds", "tokens"]
        }
    },
    "metrics": {
        "required": ["tool_calls", "total_tool_calls"],
        "optional": ["total_steps", "files_created", "errors_encountered",
                     "output_chars", "transcript_chars"]
    },
    "timing": {
        "required": [],  # All fields optional but should have at least one
        "optional": ["executor_start", "executor_end", "executor_duration_seconds",
                     "grader_start", "grader_end", "grader_duration_seconds",
                     "total_duration_seconds"]
    },
    "history": {
        "required": ["started_at", "skill_name", "current_best", "iterations"],
        "iterations_item": {
            "required": ["version", "expectation_pass_rate", "grading_result", "is_current_best"]
        }
    },
    "comparison": {
        "required": ["winner", "reasoning", "rubric", "output_quality"],
        "rubric_side": {
            "required": ["content", "structure", "overall_score"]
        }
    },
    "analysis": {
        "required": ["comparison_summary", "winner_strengths", "loser_weaknesses",
                     "improvement_suggestions"],
        "improvement_item": {
            "required": ["priority", "category", "suggestion"]
        }
    }
}
def infer_type(file_path: Path) -> str | None:
"""Infer JSON type from filename."""
name = file_path.name.lower()
if name == "evals.json":
return "evals"
elif name == "grading.json":
return "grading"
elif name == "benchmark.json":
return "benchmark"
elif name == "metrics.json":
return "metrics"
elif name == "timing.json":
return "timing"
elif name == "history.json":
return "history"
elif name.startswith("comparison"):
return "comparison"
elif name == "analysis.json":
return "analysis"
return None
def validate_required_fields(data: dict, required: list[str], path: str = "") -> list[str]:
    """Return one error message per required field missing from *data*.

    *path* is prepended to each message to indicate where in the document
    the check happened (e.g. "evals[3]: ").
    """
    return [
        f"{path}Missing required field: {field}"
        for field in required
        if field not in data
    ]
def validate_evals(data: dict) -> list[str]:
    """Validate evals.json structure."""
    problems = validate_required_fields(data, SCHEMAS["evals"]["required"])
    if "evals" not in data:
        return problems
    if not isinstance(data["evals"], list):
        problems.append("'evals' must be a list")
        return problems
    for idx, entry in enumerate(data["evals"]):
        problems.extend(validate_required_fields(
            entry,
            SCHEMAS["evals"]["evals_item"]["required"],
            f"evals[{idx}]: ",
        ))
        # Validate id is an integer
        if "id" in entry and not isinstance(entry["id"], int):
            problems.append(f"evals[{idx}]: 'id' must be an integer")
        # Validate expectations is a list of strings
        if "expectations" in entry:
            expectations = entry["expectations"]
            if not isinstance(expectations, list):
                problems.append(f"evals[{idx}]: 'expectations' must be a list")
            else:
                for j, expectation in enumerate(expectations):
                    if not isinstance(expectation, str):
                        problems.append(f"evals[{idx}].expectations[{j}]: must be a string")
    return problems
def validate_grading(data: dict) -> list[str]:
    """Validate grading.json structure."""
    problems = validate_required_fields(data, SCHEMAS["grading"]["required"])
    if "summary" in data:
        summary = data["summary"]
        problems.extend(validate_required_fields(
            summary,
            SCHEMAS["grading"]["summary"]["required"],
            "summary: ",
        ))
        # Validate pass_rate is between 0 and 1
        if "pass_rate" in summary:
            rate = summary["pass_rate"]
            if not isinstance(rate, (int, float)) or not (0 <= rate <= 1):
                problems.append("summary.pass_rate must be a number between 0 and 1")
    if "expectations" in data:
        if not isinstance(data["expectations"], list):
            problems.append("'expectations' must be a list")
        else:
            for idx, item in enumerate(data["expectations"]):
                problems.extend(validate_required_fields(
                    item,
                    SCHEMAS["grading"]["expectations_item"]["required"],
                    f"expectations[{idx}]: ",
                ))
                if "passed" in item and not isinstance(item["passed"], bool):
                    problems.append(f"expectations[{idx}].passed must be a boolean")
    return problems
def validate_benchmark(data: dict) -> list[str]:
    """Validate benchmark.json structure."""
    problems = validate_required_fields(data, SCHEMAS["benchmark"]["required"])
    if "metadata" in data:
        problems.extend(validate_required_fields(
            data["metadata"],
            SCHEMAS["benchmark"]["metadata"]["required"],
            "metadata: ",
        ))
    if "runs" in data:
        if not isinstance(data["runs"], list):
            problems.append("'runs' must be a list")
        else:
            for idx, run in enumerate(data["runs"]):
                problems.extend(validate_required_fields(
                    run,
                    SCHEMAS["benchmark"]["runs_item"]["required"],
                    f"runs[{idx}]: ",
                ))
                # Validate configuration is one of the two known values
                if "configuration" in run and run["configuration"] not in ("with_skill", "without_skill"):
                    problems.append(f"runs[{idx}].configuration must be 'with_skill' or 'without_skill'")
    if "run_summary" in data:
        for config in ("with_skill", "without_skill"):
            if config not in data["run_summary"]:
                continue
            problems.extend(validate_required_fields(
                data["run_summary"][config],
                SCHEMAS["benchmark"]["run_summary_config"]["required"],
                f"run_summary.{config}: ",
            ))
    return problems
def validate_metrics(data: dict) -> list[str]:
    """Validate metrics.json structure."""
    problems = validate_required_fields(data, SCHEMAS["metrics"]["required"])
    # Type checks for the fields that carry structured counters.
    type_checks = (
        ("tool_calls", dict, "'tool_calls' must be an object"),
        ("total_tool_calls", int, "'total_tool_calls' must be an integer"),
    )
    for field, expected_type, message in type_checks:
        if field in data and not isinstance(data[field], expected_type):
            problems.append(message)
    return problems
def validate_timing(data: dict) -> list[str]:
    """Validate timing.json structure."""
    problems: list[str] = []
    # At least one of the recognized timing fields should be present.
    if not any(field in data for field in SCHEMAS["timing"]["optional"]):
        problems.append("timing.json should have at least one timing field")
    # Duration fields, when present, must be numeric.
    duration_fields = ("executor_duration_seconds", "grader_duration_seconds", "total_duration_seconds")
    problems.extend(
        f"'{field}' must be a number"
        for field in duration_fields
        if field in data and not isinstance(data[field], (int, float))
    )
    return problems
def validate_history(data: dict) -> list[str]:
    """Validate history.json structure."""
    problems = validate_required_fields(data, SCHEMAS["history"]["required"])
    if "iterations" not in data:
        return problems
    if not isinstance(data["iterations"], list):
        problems.append("'iterations' must be a list")
        return problems
    for idx, entry in enumerate(data["iterations"]):
        problems.extend(validate_required_fields(
            entry,
            SCHEMAS["history"]["iterations_item"]["required"],
            f"iterations[{idx}]: ",
        ))
        if "grading_result" in entry:
            valid_results = ["baseline", "won", "lost", "tie"]
            if entry["grading_result"] not in valid_results:
                problems.append(f"iterations[{idx}].grading_result must be one of: {valid_results}")
    return problems
def validate_comparison(data: dict) -> list[str]:
    """Validate comparison.json structure."""
    problems = validate_required_fields(data, SCHEMAS["comparison"]["required"])
    if "winner" in data and data["winner"] not in ("A", "B", "TIE"):
        problems.append("'winner' must be 'A', 'B', or 'TIE'")
    if "rubric" in data:
        # Each side of the rubric, when present, must carry the scored keys.
        for side in ("A", "B"):
            if side not in data["rubric"]:
                continue
            problems.extend(validate_required_fields(
                data["rubric"][side],
                SCHEMAS["comparison"]["rubric_side"]["required"],
                f"rubric.{side}: ",
            ))
    return problems
def validate_analysis(data: dict) -> list[str]:
    """Validate analysis.json structure."""
    problems = validate_required_fields(data, SCHEMAS["analysis"]["required"])
    if "improvement_suggestions" not in data:
        return problems
    if not isinstance(data["improvement_suggestions"], list):
        problems.append("'improvement_suggestions' must be a list")
        return problems
    for idx, suggestion in enumerate(data["improvement_suggestions"]):
        problems.extend(validate_required_fields(
            suggestion,
            SCHEMAS["analysis"]["improvement_item"]["required"],
            f"improvement_suggestions[{idx}]: ",
        ))
        if "priority" in suggestion and suggestion["priority"] not in ("high", "medium", "low"):
            problems.append(f"improvement_suggestions[{idx}].priority must be 'high', 'medium', or 'low'")
    return problems
# Dispatch table mapping each JSON artifact type to its validator function.
# validate_file() looks validators up here, and main() derives the --type
# CLI choices from its keys.
VALIDATORS = {
    "evals": validate_evals,
    "grading": validate_grading,
    "benchmark": validate_benchmark,
    "metrics": validate_metrics,
    "timing": validate_timing,
    "history": validate_history,
    "comparison": validate_comparison,
    "analysis": validate_analysis,
}
def validate_file(file_path: Path, json_type: str | None = None) -> tuple[bool, list[str]]:
    """
    Validate a JSON file.

    When *json_type* is None, the type is inferred from the filename.
    Returns (is_valid, errors) tuple.
    """
    if not file_path.exists():
        return False, [f"File not found: {file_path}"]

    try:
        with open(file_path) as handle:
            data = json.load(handle)
    except json.JSONDecodeError as exc:
        return False, [f"Invalid JSON: {exc}"]

    if json_type is None:
        json_type = infer_type(file_path)
        if json_type is None:
            return False, ["Could not infer JSON type from filename. Use --type to specify."]
    if json_type not in VALIDATORS:
        return False, [f"Unknown JSON type: {json_type}. Valid types: {list(VALIDATORS.keys())}"]

    problems = VALIDATORS[json_type](data)
    return not problems, problems
def main():
    """CLI entry point: validate one JSON artifact and exit 0/1 accordingly."""
    parser = argparse.ArgumentParser(
        description="Validate JSON files produced by skill-creator-edge"
    )
    parser.add_argument("file", type=Path, help="Path to the JSON file to validate")
    parser.add_argument(
        "--type", "-t",
        choices=list(VALIDATORS.keys()),
        help="JSON type (inferred from filename if not specified)"
    )
    args = parser.parse_args()

    ok, problems = validate_file(args.file, args.type)
    if ok:
        print(f"{args.file} is valid")
        sys.exit(0)
    print(f"{args.file} has {len(problems)} error(s):")
    for problem in problems:
        print(f"  - {problem}")
    sys.exit(1)


if __name__ == "__main__":
    main()