Add skill-creator plugin

This commit is contained in:
Kenshiro Nakagawa
2026-02-17 17:02:51 -08:00
parent 261ce4fba4
commit 30975e61e3
20 changed files with 4879 additions and 0 deletions

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
Aggregate individual run results into benchmark summary statistics.
Reads grading.json files from run directories and produces:
- run_summary with mean, stddev, min, max for each metric
- delta between with_skill and without_skill configurations
Usage:
python aggregate_benchmark.py <benchmark_dir>
Example:
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
The script expects this directory structure:
<benchmark_dir>/
└── runs/
└── eval-N/
├── with_skill/
│ ├── run-1/grading.json
│ ├── run-2/grading.json
│ └── run-3/grading.json
└── without_skill/
├── run-1/grading.json
├── run-2/grading.json
└── run-3/grading.json
"""
import argparse
import json
import math
import sys
from datetime import datetime, timezone
from pathlib import Path
def calculate_stats(values: list[float]) -> dict:
"""Calculate mean, stddev, min, max for a list of values."""
if not values:
return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
n = len(values)
mean = sum(values) / n
if n > 1:
variance = sum((x - mean) ** 2 for x in values) / (n - 1)
stddev = math.sqrt(variance)
else:
stddev = 0.0
return {
"mean": round(mean, 4),
"stddev": round(stddev, 4),
"min": round(min(values), 4),
"max": round(max(values), 4)
}
def _dir_index(dir_path: Path):
    """Return the integer after the first '-' in a name like 'eval-3' or
    'run-1', or None when the suffix is missing or non-numeric (e.g. a
    stray 'eval-backup' or 'eval-' directory)."""
    parts = dir_path.name.split("-")
    try:
        return int(parts[1])
    except (IndexError, ValueError):
        return None


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Walks <benchmark_dir>/runs/eval-N/{with_skill,without_skill}/run-M/
    and reads each grading.json. Missing or malformed grading files, and
    eval-*/run-* directories whose numeric suffix cannot be parsed, are
    reported on stdout and skipped instead of aborting the aggregation
    (previously a stray directory like 'eval-backup' raised ValueError).

    Returns dict with structure:
    {
        "with_skill": [
            {"eval_id": 1, "run_number": 1, "pass_rate": 0.85, ...},
            ...
        ],
        "without_skill": [...]
    }
    """
    runs_dir = benchmark_dir / "runs"
    results = {"with_skill": [], "without_skill": []}
    if not runs_dir.exists():
        print(f"Runs directory not found: {runs_dir}")
        return results
    for eval_dir in sorted(runs_dir.glob("eval-*")):
        eval_id = _dir_index(eval_dir)
        if eval_id is None:
            print(f"Warning: skipping unrecognized directory {eval_dir}")
            continue
        for config in ("with_skill", "without_skill"):
            config_dir = eval_dir / config
            if not config_dir.exists():
                continue
            for run_dir in sorted(config_dir.glob("run-*")):
                run_number = _dir_index(run_dir)
                if run_number is None:
                    print(f"Warning: skipping unrecognized directory {run_dir}")
                    continue
                grading_file = run_dir / "grading.json"
                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue
                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue
                summary = grading.get("summary", {})
                metrics = grading.get("execution_metrics", {})
                notes_summary = grading.get("user_notes_summary", {})
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                    # Wall-clock duration for the whole run, if recorded.
                    "time_seconds": grading.get("timing", {}).get("total_duration_seconds", 0.0),
                    "tool_calls": metrics.get("total_tool_calls", 0),
                    # Placeholder: no true token count is recorded, so the
                    # output character count stands in for token usage.
                    "tokens": metrics.get("output_chars", 0),
                    "errors": metrics.get("errors_encountered", 0),
                    "expectations": grading.get("expectations", []),
                    # Flatten the reviewer-facing note categories into one list.
                    "notes": (
                        notes_summary.get("uncertainties", [])
                        + notes_summary.get("needs_review", [])
                        + notes_summary.get("workarounds", [])
                    ),
                }
                results[config].append(result)
    return results
def aggregate_results(results: dict) -> dict:
    """Aggregate per-run results into per-configuration summary statistics.

    Args:
        results: Mapping with "with_skill"/"without_skill" run lists, as
            produced by load_run_results().

    Returns:
        run_summary dict holding pass_rate/time_seconds/tokens statistics
        for each configuration, plus a "delta" entry of signed, formatted
        strings (with_skill mean minus without_skill mean).
    """
    summary: dict = {}
    for config in ("with_skill", "without_skill"):
        runs = results.get(config, [])
        if runs:
            summary[config] = {
                "pass_rate": calculate_stats([r["pass_rate"] for r in runs]),
                "time_seconds": calculate_stats([r["time_seconds"] for r in runs]),
                "tokens": calculate_stats([r.get("tokens", 0) for r in runs]),
            }
        else:
            # No runs for this configuration: all-zero placeholders.
            summary[config] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }

    def mean_of(config: str, metric: str) -> float:
        # Defensive lookup mirroring the original chained .get() calls.
        return summary.get(config, {}).get(metric, {}).get("mean", 0)

    summary["delta"] = {
        "pass_rate": f"{mean_of('with_skill', 'pass_rate') - mean_of('without_skill', 'pass_rate'):+.2f}",
        "time_seconds": f"{mean_of('with_skill', 'time_seconds') - mean_of('without_skill', 'time_seconds'):+.1f}",
        "tokens": f"{mean_of('with_skill', 'tokens') - mean_of('without_skill', 'tokens'):+.0f}",
    }
    return summary
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """Assemble the complete benchmark.json payload for a benchmark dir.

    Loads every run's grading results, flattens them into a runs array,
    aggregates summary statistics, and wraps everything with metadata.
    Model names are emitted as placeholders for the analyzer to fill in.
    """
    results = load_run_results(benchmark_dir)

    # Flatten per-config result dicts into the benchmark.json runs array.
    flat_runs = []
    for config in ("with_skill", "without_skill"):
        for r in results.get(config, []):
            flat_runs.append({
                "eval_id": r["eval_id"],
                "configuration": config,
                "run_number": r["run_number"],
                "result": {
                    "pass_rate": r["pass_rate"],
                    "passed": r["passed"],
                    "failed": r["failed"],
                    "total": r["total"],
                    "time_seconds": r["time_seconds"],
                    "tokens": r.get("tokens", 0),
                    "tool_calls": r.get("tool_calls", 0),
                    "errors": r.get("errors", 0),
                },
                "expectations": r["expectations"],
                "notes": r["notes"],
            })

    # Distinct eval IDs actually observed across both configurations.
    eval_ids = sorted({r["eval_id"] for config_runs in results.values() for r in config_runs})

    return {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": 3
        },
        "runs": flat_runs,
        "run_summary": aggregate_results(results),
        "notes": []  # To be filled by analyzer
    }
def generate_markdown(benchmark: dict) -> str:
    """Render benchmark data as a human-readable Markdown report.

    Produces a header, a summary table comparing with/without-skill runs
    on pass rate, time, and tokens, and an optional trailing Notes section.
    """
    meta = benchmark["metadata"]
    summary = benchmark["run_summary"]
    delta = summary["delta"]
    pr_a, pr_b = summary["with_skill"]["pass_rate"], summary["without_skill"]["pass_rate"]
    t_a, t_b = summary["with_skill"]["time_seconds"], summary["without_skill"]["time_seconds"]
    tok_a, tok_b = summary["with_skill"]["tokens"], summary["without_skill"]["tokens"]

    out = [
        f"# Skill Benchmark: {meta['skill_name']}",
        "",
        f"**Model**: {meta['executor_model']}",
        f"**Date**: {meta['timestamp']}",
        f"**Evals**: {', '.join(map(str, meta['evals_run']))} ({meta['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        "| Metric | With Skill | Without Skill | Delta |",
        "|--------|------------|---------------|-------|",
        # Pass rate is stored as a fraction; render as mean ± stddev percent.
        f"| Pass Rate | {pr_a['mean']*100:.0f}% ± {pr_a['stddev']*100:.0f}% | {pr_b['mean']*100:.0f}% ± {pr_b['stddev']*100:.0f}% | {delta['pass_rate']} |",
        f"| Time | {t_a['mean']:.1f}s ± {t_a['stddev']:.1f}s | {t_b['mean']:.1f}s ± {t_b['stddev']:.1f}s | {delta['time_seconds']}s |",
        f"| Tokens | {tok_a['mean']:.0f} ± {tok_a['stddev']:.0f} | {tok_b['mean']:.0f} ± {tok_b['stddev']:.0f} | {delta['tokens']} |",
    ]

    if benchmark.get("notes"):
        out += ["", "## Notes", ""]
        out += [f"- {note}" for note in benchmark["notes"]]
    return "\n".join(out)
def main():
    """CLI entry point: aggregate run gradings into benchmark.json/.md."""
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument("benchmark_dir", type=Path, help="Path to the benchmark directory")
    parser.add_argument("--skill-name", default="", help="Name of the skill being benchmarked")
    parser.add_argument("--skill-path", default="", help="Path to the skill being benchmarked")
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)",
    )
    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Build the aggregate payload, then write the machine-readable JSON and
    # the human-readable Markdown side by side.
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)
    json_path = args.output or (args.benchmark_dir / "benchmark.json")
    md_path = json_path.with_suffix(".md")

    with open(json_path, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {json_path}")

    md_path.write_text(generate_markdown(benchmark))
    print(f"Generated: {md_path}")

    # Console recap of the headline numbers.
    stats = benchmark["run_summary"]
    print(f"\nSummary:")
    print(f" With skill: {stats['with_skill']['pass_rate']['mean'] * 100:.1f}% pass rate")
    print(f" Without skill: {stats['without_skill']['pass_rate']['mean'] * 100:.1f}% pass rate")
    print(f" Delta: {stats['delta']['pass_rate']}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Copy a skill directory with metadata tracking.
Creates a copy of a skill directory and adds a META.yaml file
to track lineage, changes, and performance metrics.
"""
import argparse
import shutil
from datetime import datetime, timezone
from pathlib import Path
def create_meta_yaml(
dest: Path,
parent: str | None,
changes: str | None,
score: float | None,
iteration: int | None,
) -> None:
"""Create META.yaml file in the destination directory."""
created_at = datetime.now(timezone.utc).isoformat()
# Build YAML content manually to avoid external dependencies
lines = ["# Skill iteration metadata", ""]
# Helper to format YAML values
def yaml_value(val):
if val is None:
return "null"
if isinstance(val, bool):
return "true" if val else "false"
if isinstance(val, (int, float)):
return str(val)
if isinstance(val, str):
# Quote strings that might be ambiguous
if val in ("null", "true", "false") or val.startswith(("'", '"', "[", "{")):
return f'"{val}"'
# Quote strings with special characters
if any(c in val for c in (":", "#", "\n", '"', "'")):
escaped = val.replace("\\", "\\\\").replace('"', '\\"')
return f'"{escaped}"'
return val
return str(val)
lines.append(f"parent: {yaml_value(parent)}")
lines.append(f"changes: {yaml_value(changes)}")
lines.append(f"score: {yaml_value(score)}")
lines.append(f"iteration: {yaml_value(iteration)}")
lines.append(f"created_at: {yaml_value(created_at)}")
lines.append("")
meta_path = dest / "META.yaml"
meta_path.write_text("\n".join(lines))
def copy_skill(
    source: Path,
    dest: Path,
    parent: str | None = None,
    changes: str | None = None,
    score: float | None = None,
    iteration: int | None = None,
) -> None:
    """Copy a skill into a fresh version directory with metadata.

    Creates <dest>/skill/ (copied from *source*), an empty <dest>/runs/
    for the executor to populate, an improvements/ directory when
    iteration > 0, and a META.yaml recording lineage at the version root.

    Args:
        source: Source skill directory (or <version>/skill/ when copying
            from another version).
        dest: Destination version directory (e.g. workspace/v1).
        parent: Name/path of the parent skill iteration.
        changes: Description of changes from the parent.
        score: Evaluation score for this iteration.
        iteration: Iteration number.

    Raises:
        FileNotFoundError: *source* does not exist.
        ValueError: *source* is not a directory.
        FileExistsError: *dest* already exists.
    """
    src = Path(source).resolve()
    target = Path(dest).resolve()

    if not src.exists():
        raise FileNotFoundError(f"Source directory does not exist: {src}")
    if not src.is_dir():
        raise ValueError(f"Source must be a directory: {src}")
    if target.exists():
        raise FileExistsError(f"Destination already exists: {target}")

    # Version directory skeleton.
    target.mkdir(parents=True)
    (target / "runs").mkdir()
    has_improvements = iteration is not None and iteration > 0
    if has_improvements:
        # Baseline (v0) versions carry no improvement suggestions.
        (target / "improvements").mkdir()

    skill_dest = target / "skill"
    shutil.copytree(src, skill_dest)
    create_meta_yaml(target, parent, changes, score, iteration)

    print(f"Copied skill from {src} to {skill_dest}")
    print(f"Created version directory structure at {target}")
    print(f" - skill/ : Skill files")
    print(f" - runs/ : For execution runs (run-1/, run-2/, run-3/)")
    if has_improvements:
        print(f" - improvements/ : Improvement suggestions")
    print(f" - META.yaml : Version metadata")
def main():
    """CLI entry point for copying a skill into a version directory."""
    parser = argparse.ArgumentParser(
        description="Copy a skill directory with metadata tracking and version structure.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Create baseline v0 from an existing skill
python copy_skill.py ./skills/public/pdf ./skill_iterations/v0 --iteration 0
# Create v1 from v0's skill directory
python copy_skill.py ./skill_iterations/v0/skill ./skill_iterations/v1 \\
--parent v0 \\
--changes "Added error handling for empty cells" \\
--iteration 1
# Create v2 with score from evaluation
python copy_skill.py ./skill_iterations/v1/skill ./skill_iterations/v2 \\
--parent v1 \\
--changes "Improved coordinate guidance" \\
--score 7.5 \\
--iteration 2
Output structure:
dest/
├── META.yaml # Version metadata
├── skill/ # The actual skill files
├── runs/ # Execution runs (created by executor)
│ ├── run-1/
│ │ ├── transcript.md
│ │ ├── outputs/
│ │ └── evaluation.json
│ ├── run-2/
│ └── run-3/
└── improvements/ # Improvement suggestions (v1+)
""",
    )
    parser.add_argument("source", type=Path, help="Source skill directory to copy")
    parser.add_argument("dest", type=Path, help="Destination path for the copy")
    # Optional metadata flags, all defaulting to None.
    for flag, value_type, help_text in (
        ("--parent", str, "Name or path of the parent skill iteration"),
        ("--changes", str, "Description of changes from the parent version"),
        ("--score", float, "Evaluation score for this iteration (e.g., 7.5)"),
        ("--iteration", int, "Iteration number (e.g., 1, 2, 3)"),
    ):
        parser.add_argument(flag, type=value_type, default=None, help=help_text)

    args = parser.parse_args()
    try:
        copy_skill(
            source=args.source,
            dest=args.dest,
            parent=args.parent,
            changes=args.changes,
            score=args.score,
            iteration=args.iteration,
        )
    except (FileNotFoundError, FileExistsError, ValueError) as exc:
        # Surface expected failures as standard argparse usage errors.
        parser.error(str(exc))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Initialize JSON files with the correct structure for skill-creator-edge.
Creates template JSON files that can be filled in.
Usage:
python init_json.py <type> <output_path>
Examples:
python init_json.py evals evals/evals.json
python init_json.py grading run-1/grading.json
python init_json.py benchmark benchmarks/2026-01-15/benchmark.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
# Template skeletons keyed by artifact type. Values are deep-copied nowhere:
# init_json() dumps them directly, which is safe because they are never mutated.
# NOTE(review): the datetime.now(...) values below are evaluated once, at module
# import time, so every template written by a single CLI invocation shares the
# same instant — fine for this one-shot script, but worth confirming if this
# module is ever imported by a long-lived process.
TEMPLATES = {
    # evals.json: eval definitions for a skill.
    "evals": {
        "skill_name": "<skill-name>",
        "evals": [
            {
                "id": 1,
                "prompt": "Example task prompt",
                "expected_output": "Description of expected result",
                "files": [],
                "expectations": [
                    "The output includes X",
                    "The skill correctly handles Y"
                ]
            }
        ]
    },
    # grading.json: per-run grading output produced by the grader.
    "grading": {
        "expectations": [
            {
                "text": "Example expectation",
                "passed": True,
                "evidence": "Found in transcript: ..."
            }
        ],
        "summary": {
            "passed": 1,
            "failed": 0,
            "total": 1,
            "pass_rate": 1.0
        },
        "execution_metrics": {
            "tool_calls": {
                "Read": 0,
                "Write": 0,
                "Bash": 0,
                "Edit": 0,
                "Glob": 0,
                "Grep": 0
            },
            "total_tool_calls": 0,
            "total_steps": 0,
            "errors_encountered": 0,
            "output_chars": 0,
            "transcript_chars": 0
        },
        "timing": {
            "executor_duration_seconds": 0.0,
            "grader_duration_seconds": 0.0,
            "total_duration_seconds": 0.0
        },
        "claims": [],
        "user_notes_summary": {
            "uncertainties": [],
            "needs_review": [],
            "workarounds": []
        }
    },
    # benchmark.json: aggregated results across runs/configurations.
    "benchmark": {
        "metadata": {
            "skill_name": "<skill-name>",
            "skill_path": "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": [1],
            "runs_per_configuration": 3
        },
        "runs": [
            {
                "eval_id": 1,
                "configuration": "with_skill",
                "run_number": 1,
                "result": {
                    "pass_rate": 0.0,
                    "passed": 0,
                    "failed": 0,
                    "total": 0,
                    "time_seconds": 0.0,
                    "tokens": 0,
                    "tool_calls": 0,
                    "errors": 0
                },
                "expectations": [],
                "notes": []
            }
        ],
        "run_summary": {
            "with_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            "without_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            "delta": {
                "pass_rate": "+0.0",
                "time_seconds": "+0.0",
                "tokens": "+0"
            }
        },
        "notes": []
    },
    # metrics.json: raw execution metrics for a single run.
    "metrics": {
        "tool_calls": {
            "Read": 0,
            "Write": 0,
            "Bash": 0,
            "Edit": 0,
            "Glob": 0,
            "Grep": 0
        },
        "total_tool_calls": 0,
        "total_steps": 0,
        "files_created": [],
        "errors_encountered": 0,
        "output_chars": 0,
        "transcript_chars": 0
    },
    # timing.json: executor/grader wall-clock bookkeeping.
    "timing": {
        "executor_start": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "executor_end": "",
        "executor_duration_seconds": 0.0,
        "grader_start": "",
        "grader_end": "",
        "grader_duration_seconds": 0.0,
        "total_duration_seconds": 0.0
    },
    # history.json: iteration lineage across skill versions.
    "history": {
        "started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "skill_name": "<skill-name>",
        "current_best": "v0",
        "iterations": [
            {
                "version": "v0",
                "parent": None,
                "expectation_pass_rate": 0.0,
                "grading_result": "baseline",
                "is_current_best": True
            }
        ]
    },
    # comparison.json: head-to-head A/B comparison with scoring rubric.
    "comparison": {
        "winner": "A",
        "reasoning": "Explanation of why the winner was chosen",
        "rubric": {
            "A": {
                "content": {
                    "correctness": 5,
                    "completeness": 5,
                    "accuracy": 5
                },
                "structure": {
                    "organization": 5,
                    "formatting": 5,
                    "usability": 5
                },
                "content_score": 5.0,
                "structure_score": 5.0,
                "overall_score": 10.0
            },
            "B": {
                "content": {
                    "correctness": 3,
                    "completeness": 3,
                    "accuracy": 3
                },
                "structure": {
                    "organization": 3,
                    "formatting": 3,
                    "usability": 3
                },
                "content_score": 3.0,
                "structure_score": 3.0,
                "overall_score": 6.0
            }
        },
        "output_quality": {
            "A": {
                "score": 10,
                "strengths": [],
                "weaknesses": []
            },
            "B": {
                "score": 6,
                "strengths": [],
                "weaknesses": []
            }
        }
    },
    # analysis.json: analyzer output explaining the comparison outcome.
    "analysis": {
        "comparison_summary": {
            "winner": "A",
            "winner_skill": "<path/to/winner>",
            "loser_skill": "<path/to/loser>",
            "comparator_reasoning": "Summary of comparison"
        },
        "winner_strengths": [],
        "loser_weaknesses": [],
        "instruction_following": {
            "winner": {
                "score": 10,
                "issues": []
            },
            "loser": {
                "score": 5,
                "issues": []
            }
        },
        "improvement_suggestions": [
            {
                "priority": "high",
                "category": "instructions",
                "suggestion": "Specific improvement suggestion",
                "expected_impact": "Why this would help"
            }
        ],
        "transcript_insights": {
            "winner_execution_pattern": "Description of how winner executed",
            "loser_execution_pattern": "Description of how loser executed"
        }
    }
}
def init_json(json_type: str, output_path: Path, force: bool = False) -> bool:
    """Write the template for *json_type* to *output_path* as indented JSON.

    Refuses to overwrite an existing file unless *force* is set. Parent
    directories are created as needed.

    Args:
        json_type: Key into TEMPLATES selecting the skeleton to write.
        output_path: Destination file path.
        force: Overwrite an existing file when True.

    Returns:
        True on success; False for an unknown type or an existing file
        without --force (a message is printed in both failure cases).
    """
    if json_type not in TEMPLATES:
        print(f"Unknown type: {json_type}. Valid types: {list(TEMPLATES.keys())}")
        return False

    if output_path.exists() and not force:
        print(f"File already exists: {output_path}")
        print("Use --force to overwrite")
        return False

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(TEMPLATES[json_type], f, indent=2)
    print(f"Created {json_type} template: {output_path}")
    return True
def main():
    """CLI entry point: create one JSON template file and exit 0/1."""
    parser = argparse.ArgumentParser(
        description="Initialize JSON files with correct structure"
    )
    parser.add_argument(
        "type",
        choices=list(TEMPLATES.keys()),
        help="Type of JSON file to create",
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Output path for the JSON file",
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Overwrite existing file",
    )
    args = parser.parse_args()
    # Exit status mirrors init_json's boolean result.
    sys.exit(0 if init_json(args.type, args.output, args.force) else 1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Skill Initializer - Creates a new skill from template
Usage:
init_skill.py <skill-name> --path <path>
Examples:
init_skill.py my-new-skill --path skills/public
init_skill.py my-api-helper --path skills/private
init_skill.py custom-skill --path /custom/location
"""
import sys
from pathlib import Path
SKILL_TEMPLATE = """---
name: {skill_name}
description: [TODO: Complete and informative explanation of what the skill does and when to use it. Include WHEN to use this skill - specific scenarios, file types, or tasks that trigger it.]
---
# {skill_title}
## Overview
[TODO: 1-2 sentences explaining what this skill enables]
## Structuring This Skill
[TODO: Choose the structure that best fits this skill's purpose. Common patterns:
**1. Workflow-Based** (best for sequential processes)
- Works well when there are clear step-by-step procedures
- Example: DOCX skill with "Workflow Decision Tree""Reading""Creating""Editing"
- Structure: ## Overview → ## Workflow Decision Tree → ## Step 1 → ## Step 2...
**2. Task-Based** (best for tool collections)
- Works well when the skill offers different operations/capabilities
- Example: PDF skill with "Quick Start""Merge PDFs""Split PDFs""Extract Text"
- Structure: ## Overview → ## Quick Start → ## Task Category 1 → ## Task Category 2...
**3. Reference/Guidelines** (best for standards or specifications)
- Works well for brand guidelines, coding standards, or requirements
- Example: Brand styling with "Brand Guidelines""Colors""Typography""Features"
- Structure: ## Overview → ## Guidelines → ## Specifications → ## Usage...
**4. Capabilities-Based** (best for integrated systems)
- Works well when the skill provides multiple interrelated features
- Example: Product Management with "Core Capabilities" → numbered capability list
- Structure: ## Overview → ## Core Capabilities → ### 1. Feature → ### 2. Feature...
Patterns can be mixed and matched as needed. Most skills combine patterns (e.g., start with task-based, add workflow for complex operations).
Delete this entire "Structuring This Skill" section when done - it's just guidance.]
## [TODO: Replace with the first main section based on chosen structure]
[TODO: Add content here. See examples in existing skills:
- Code samples for technical skills
- Decision trees for complex workflows
- Concrete examples with realistic user requests
- References to scripts/templates/references as needed]
## Resources
This skill includes example resource directories that demonstrate how to organize different types of bundled resources:
### scripts/
Executable code (Python/Bash/etc.) that can be run directly to perform specific operations.
**Examples from other skills:**
- PDF skill: `fill_fillable_fields.py`, `extract_form_field_info.py` - utilities for PDF manipulation
- DOCX skill: `document.py`, `utilities.py` - Python modules for document processing
**Appropriate for:** Python scripts, shell scripts, or any executable code that performs automation, data processing, or specific operations.
**Note:** Scripts may be executed without loading into context, but can still be read by Claude for patching or environment adjustments.
### references/
Documentation and reference material intended to be loaded into context to inform Claude's process and thinking.
**Examples from other skills:**
- Product management: `communication.md`, `context_building.md` - detailed workflow guides
- BigQuery: API reference documentation and query examples
- Finance: Schema documentation, company policies
**Appropriate for:** In-depth documentation, API references, database schemas, comprehensive guides, or any detailed information that Claude should reference while working.
### assets/
Files not intended to be loaded into context, but rather used within the output Claude produces.
**Examples from other skills:**
- Brand styling: PowerPoint template files (.pptx), logo files
- Frontend builder: HTML/React boilerplate project directories
- Typography: Font files (.ttf, .woff2)
**Appropriate for:** Templates, boilerplate code, document templates, images, icons, fonts, or any files meant to be copied or used in the final output.
---
**Any unneeded directories can be deleted.** Not every skill requires all three types of resources.
"""
EXAMPLE_SCRIPT = '''#!/usr/bin/env python3
"""
Example helper script for {skill_name}
This is a placeholder script that can be executed directly.
Replace with actual implementation or delete if not needed.
Example real scripts from other skills:
- pdf/scripts/fill_fillable_fields.py - Fills PDF form fields
- pdf/scripts/convert_pdf_to_images.py - Converts PDF pages to images
"""
def main():
print("This is an example script for {skill_name}")
# TODO: Add actual script logic here
# This could be data processing, file conversion, API calls, etc.
if __name__ == "__main__":
main()
'''
EXAMPLE_REFERENCE = """# Reference Documentation for {skill_title}
This is a placeholder for detailed reference documentation.
Replace with actual reference content or delete if not needed.
Example real reference docs from other skills:
- product-management/references/communication.md - Comprehensive guide for status updates
- product-management/references/context_building.md - Deep-dive on gathering context
- bigquery/references/ - API references and query examples
## When Reference Docs Are Useful
Reference docs are ideal for:
- Comprehensive API documentation
- Detailed workflow guides
- Complex multi-step processes
- Information too lengthy for main SKILL.md
- Content that's only needed for specific use cases
## Structure Suggestions
### API Reference Example
- Overview
- Authentication
- Endpoints with examples
- Error codes
- Rate limits
### Workflow Guide Example
- Prerequisites
- Step-by-step instructions
- Common patterns
- Troubleshooting
- Best practices
"""
EXAMPLE_ASSET = """# Example Asset File
This placeholder represents where asset files would be stored.
Replace with actual asset files (templates, images, fonts, etc.) or delete if not needed.
Asset files are NOT intended to be loaded into context, but rather used within
the output Claude produces.
Example asset files from other skills:
- Brand guidelines: logo.png, slides_template.pptx
- Frontend builder: hello-world/ directory with HTML/React boilerplate
- Typography: custom-font.ttf, font-family.woff2
- Data: sample_data.csv, test_dataset.json
## Common Asset Types
- Templates: .pptx, .docx, boilerplate directories
- Images: .png, .jpg, .svg, .gif
- Fonts: .ttf, .otf, .woff, .woff2
- Boilerplate code: Project directories, starter files
- Icons: .ico, .svg
- Data files: .csv, .json, .xml, .yaml
Note: This is a text placeholder. Actual assets can be any file type.
"""
def title_case_skill_name(skill_name):
    """Convert a kebab-case skill name to Title Case for display.

    Uses str.capitalize per hyphen-separated word (NOT str.title, which
    would also uppercase letters following digits, e.g. "2d" -> "2D").
    """
    return ' '.join(map(str.capitalize, skill_name.split('-')))
def init_skill(skill_name, path):
    """Create a new skill directory seeded with template files.

    Builds <path>/<skill_name>/ containing a templated SKILL.md plus
    example scripts/, references/, and assets/ resources, then prints
    next steps for the author.

    Args:
        skill_name: Kebab-case skill identifier.
        path: Parent directory in which to create the skill.

    Returns:
        Path to the created skill directory, or None on any error
        (errors are printed, not raised).
    """
    skill_dir = Path(path).resolve() / skill_name

    if skill_dir.exists():
        print(f"❌ Error: Skill directory already exists: {skill_dir}")
        return None
    try:
        skill_dir.mkdir(parents=True, exist_ok=False)
        print(f"✅ Created skill directory: {skill_dir}")
    except Exception as e:
        print(f"❌ Error creating directory: {e}")
        return None

    # SKILL.md from the module-level template.
    skill_title = title_case_skill_name(skill_name)
    try:
        (skill_dir / 'SKILL.md').write_text(
            SKILL_TEMPLATE.format(skill_name=skill_name, skill_title=skill_title)
        )
        print("✅ Created SKILL.md")
    except Exception as e:
        print(f"❌ Error creating SKILL.md: {e}")
        return None

    # Example resource directories: scripts/, references/, assets/.
    try:
        scripts_dir = skill_dir / 'scripts'
        scripts_dir.mkdir(exist_ok=True)
        script_file = scripts_dir / 'example.py'
        script_file.write_text(EXAMPLE_SCRIPT.format(skill_name=skill_name))
        script_file.chmod(0o755)  # example scripts ship executable
        print("✅ Created scripts/example.py")

        references_dir = skill_dir / 'references'
        references_dir.mkdir(exist_ok=True)
        (references_dir / 'api_reference.md').write_text(
            EXAMPLE_REFERENCE.format(skill_title=skill_title)
        )
        print("✅ Created references/api_reference.md")

        assets_dir = skill_dir / 'assets'
        assets_dir.mkdir(exist_ok=True)
        # Asset placeholder is written verbatim — no format placeholders.
        (assets_dir / 'example_asset.txt').write_text(EXAMPLE_ASSET)
        print("✅ Created assets/example_asset.txt")
    except Exception as e:
        print(f"❌ Error creating resource directories: {e}")
        return None

    print(f"\n✅ Skill '{skill_name}' initialized successfully at {skill_dir}")
    print("\nNext steps:")
    print("1. Edit SKILL.md to complete the TODO items and update the description")
    print("2. Customize or delete the example files in scripts/, references/, and assets/")
    print("3. Run the validator when ready to check the skill structure")
    return skill_dir
def main():
    """Hand-rolled CLI: init_skill.py <skill-name> --path <path>.

    argparse is deliberately not used; the fixed positional form keeps the
    usage identical to the documented invocation.
    """
    if len(sys.argv) < 4 or sys.argv[2] != '--path':
        for line in (
            "Usage: init_skill.py <skill-name> --path <path>",
            "\nSkill name requirements:",
            " - Kebab-case identifier (e.g., 'my-data-analyzer')",
            " - Lowercase letters, digits, and hyphens only",
            " - Max 64 characters",
            " - Must match directory name exactly",
            "\nExamples:",
            " init_skill.py my-new-skill --path skills/public",
            " init_skill.py my-api-helper --path skills/private",
            " init_skill.py custom-skill --path /custom/location",
        ):
            print(line)
        sys.exit(1)

    skill_name, path = sys.argv[1], sys.argv[3]
    print(f"🚀 Initializing skill: {skill_name}")
    print(f" Location: {path}")
    print()
    # init_skill returns the created Path (truthy) on success, None on failure.
    sys.exit(0 if init_skill(skill_name, path) else 1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Skill Packager - Creates a distributable .skill file of a skill folder
Usage:
python utils/package_skill.py <path/to/skill-folder> [output-directory]
Example:
python utils/package_skill.py skills/public/my-skill
python utils/package_skill.py skills/public/my-skill ./dist
"""
import fnmatch
import sys
import zipfile
from pathlib import Path
from quick_validate import validate_skill
# Patterns to exclude when packaging skills.
EXCLUDE_DIRS = {"__pycache__", "node_modules"}  # directory names excluded at any depth
EXCLUDE_GLOBS = {"*.pyc"}  # fnmatch-style filename patterns
EXCLUDE_FILES = {".DS_Store"}  # exact filenames excluded anywhere
# Directories excluded only at the skill root (not when nested deeper).
ROOT_EXCLUDE_DIRS = {"evals"}
def should_exclude(rel_path: Path) -> bool:
"""Check if a path should be excluded from packaging."""
parts = rel_path.parts
if any(part in EXCLUDE_DIRS for part in parts):
return True
# rel_path is relative to skill_path.parent, so parts[0] is the skill
# folder name and parts[1] (if present) is the first subdir.
if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS:
return True
name = rel_path.name
if name in EXCLUDE_FILES:
return True
return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS)
def package_skill(skill_path, output_dir=None):
    """
    Package a skill folder into a .skill file (zip format).

    Args:
        skill_path: Path to the skill folder
        output_dir: Optional output directory for the .skill file (defaults to current directory)

    Returns:
        Path to the created .skill file, or None if error
    """
    skill_path = Path(skill_path).resolve()
    # Validate skill folder exists
    if not skill_path.exists():
        print(f"❌ Error: Skill folder not found: {skill_path}")
        return None
    if not skill_path.is_dir():
        print(f"❌ Error: Path is not a directory: {skill_path}")
        return None
    # Validate SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"❌ Error: SKILL.md not found in {skill_path}")
        return None
    # Run validation before packaging
    print("🔍 Validating skill...")
    valid, message = validate_skill(skill_path)
    if not valid:
        print(f"❌ Validation failed: {message}")
        print(" Please fix the validation errors before packaging.")
        return None
    print(f"{message}\n")
    # Determine output location (default: current working directory)
    skill_name = skill_path.name
    if output_dir:
        output_path = Path(output_dir).resolve()
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()
    skill_filename = output_path / f"{skill_name}.skill"
    # Create the .skill file (zip format)
    try:
        with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk through the skill directory, excluding build artifacts
            for file_path in skill_path.rglob('*'):
                if not file_path.is_file():
                    continue
                # Never add the archive to itself; rglob can pick it up when
                # the output directory lives inside the skill folder (e.g.
                # packaging from the skill dir with the default output).
                if file_path == skill_filename:
                    continue
                arcname = file_path.relative_to(skill_path.parent)
                if should_exclude(arcname):
                    print(f" Skipped: {arcname}")
                    continue
                zipf.write(file_path, arcname)
                print(f" Added: {arcname}")
        print(f"\n✅ Successfully packaged skill to: {skill_filename}")
        return skill_filename
    except Exception as e:
        print(f"❌ Error creating .skill file: {e}")
        # Don't leave a half-written archive behind on failure.
        try:
            skill_filename.unlink()
        except OSError:
            pass
        return None
def main():
    """CLI entry point: package the skill folder given on the command line."""
    if len(sys.argv) < 2:
        print("Usage: python utils/package_skill.py <path/to/skill-folder> [output-directory]")
        print("\nExample:")
        print(" python utils/package_skill.py skills/public/my-skill")
        print(" python utils/package_skill.py skills/public/my-skill ./dist")
        sys.exit(1)
    skill_path = sys.argv[1]
    # Output directory is optional; None means "current working directory".
    output_dir = None
    if len(sys.argv) > 2:
        output_dir = sys.argv[2]
    print(f"📦 Packaging skill: {skill_path}")
    if output_dir:
        print(f" Output directory: {output_dir}")
    print()
    sys.exit(0 if package_skill(skill_path, output_dir) else 1)
# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Prepare environment for running a skill eval.
Usage:
prepare_eval.py <skill-path> <eval-id> --output-dir <dir> [--no-skill]
Examples:
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/with-skill
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/without-skill --no-skill
Options:
<skill-path> Path to the skill directory
<eval-id> Index of the eval in evals/evals.json (0-based)
--output-dir Directory to prepare for the eval run
--no-skill If set, do not copy the skill (for baseline comparison)
"""
import json
import os
import shutil
import sys
from pathlib import Path
def is_writable(path: Path) -> bool:
    """Probe *path* for write access by touching and removing a marker file."""
    probe = path / ".write_test"
    try:
        probe.touch()
        probe.unlink()
    except (OSError, PermissionError):
        # Includes the case where *path* itself does not exist.
        return False
    return True
def load_evals(skill_path: Path) -> list:
    """Read and return the eval list from <skill>/evals/evals.json.

    Accepts either a bare JSON list or an object wrapping the list under an
    "evals" key. Raises FileNotFoundError when the file is missing and
    ValueError when its top-level structure is neither of the two shapes.
    """
    evals_file = skill_path / "evals" / "evals.json"
    if not evals_file.exists():
        raise FileNotFoundError(f"Evals file not found: {evals_file}")
    with open(evals_file, "r") as f:
        data = json.load(f)
    # Handle both formats: plain list or wrapped in object with "evals" key.
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and "evals" in data:
        return data["evals"]
    raise ValueError(
        f"Expected evals.json to contain a list or object with 'evals' key, "
        f"got {type(data).__name__}"
    )
def get_eval(evals: list, eval_id: int) -> dict:
    """Return evals[eval_id], raising IndexError with the valid range."""
    if 0 <= eval_id < len(evals):
        return evals[eval_id]
    raise IndexError(f"Eval ID {eval_id} out of range (0-{len(evals)-1})")
def normalize_eval(eval_data: dict) -> dict:
    """
    Normalize eval data to a consistent format.

    Handles both the design doc format (prompt, files, assertions)
    and the gym format (query, files, expected_behavior).

    Returns a dict with exactly the keys "prompt", "files", "assertions".
    Raises ValueError when no non-empty prompt/query is present.
    """
    # The prompt may live under "prompt" or "query".
    prompt = eval_data.get("prompt") or eval_data.get("query")
    if not prompt:
        raise ValueError("Eval must have either 'prompt' or 'query' field")
    files = eval_data.get("files", [])
    assertions = eval_data.get("assertions")
    if assertions is None:
        # Fall back to "expected_behavior", coercing each entry to a string.
        assertions = []
        for entry in eval_data.get("expected_behavior", []):
            if isinstance(entry, str):
                assertions.append(entry)
            elif isinstance(entry, dict):
                # Flatten a structured assertion into a readable description.
                kind = entry.get("assertion", "unknown")
                pieces = [f"Assertion type: {kind}"]
                pieces.extend(
                    f"{key}={value}"
                    for key, value in entry.items()
                    if key != "assertion"
                )
                assertions.append(" - ".join(pieces))
    return {
        "prompt": prompt,
        "files": files,
        "assertions": assertions
    }
def prepare_eval(skill_path: Path, eval_id: int, output_dir: Path, no_skill: bool = False) -> dict:
    """
    Prepare the environment for running an eval.

    Args:
        skill_path: Path to the skill directory
        eval_id: Index of the eval in evals.json
        output_dir: Directory to prepare for the eval run
        no_skill: If True, do not copy the skill (for baseline comparison)

    Returns:
        Dictionary with eval metadata (also written to eval_metadata.json
        inside output_dir)

    Raises:
        FileNotFoundError: if the skill directory or its SKILL.md is missing
        (IndexError/ValueError propagate from load_evals/get_eval/normalize_eval)
    """
    skill_path = Path(skill_path).resolve()
    output_dir = Path(output_dir).resolve()
    # Validate skill path
    if not skill_path.exists():
        raise FileNotFoundError(f"Skill directory not found: {skill_path}")
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        raise FileNotFoundError(f"SKILL.md not found in {skill_path}")
    # Load and get the specific eval
    evals = load_evals(skill_path)
    eval_data = get_eval(evals, eval_id)
    normalized = normalize_eval(eval_data)
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create inputs directory and stage input files
    inputs_dir = output_dir / "inputs"
    inputs_dir.mkdir(exist_ok=True)
    staged_files = []
    for file_ref in normalized["files"]:
        # Resolve each file reference against three locations, most specific
        # first: evals/files/, then evals/, then the skill root.
        source = skill_path / "evals" / "files" / file_ref
        if not source.exists():
            # Try relative to evals/ directly
            source = skill_path / "evals" / file_ref
        if not source.exists():
            # Try relative to skill root
            source = skill_path / file_ref
        if source.exists():
            # Staged under the bare filename only — NOTE(review): two file_refs
            # with the same basename would overwrite each other here.
            dest = inputs_dir / Path(file_ref).name
            if source.is_file():
                shutil.copy2(source, dest)
            else:
                # Directory references are copied recursively.
                shutil.copytree(source, dest, dirs_exist_ok=True)
            staged_files.append(str(dest))
            print(f" Staged: {file_ref} -> {dest}")
        else:
            # Missing inputs are reported but deliberately not fatal.
            print(f" Warning: File not found: {file_ref}")
    # Create outputs directory
    outputs_dir = output_dir / "outputs"
    outputs_dir.mkdir(exist_ok=True)
    # Copy skill if not --no-skill
    skill_copy_path = None
    if not no_skill:
        skill_copy_path = output_dir / "skill"
        # Remove any copy left over from a previous run before re-copying.
        if skill_copy_path.exists():
            shutil.rmtree(skill_copy_path)
        shutil.copytree(skill_path, skill_copy_path, dirs_exist_ok=True)
        skill_copy_path = str(skill_copy_path)
        print(f" Copied skill to: {skill_copy_path}")
    # Build metadata ("skill_path" stays None in --no-skill mode)
    metadata = {
        "eval_id": eval_id,
        "prompt": normalized["prompt"],
        "assertions": normalized["assertions"],
        "input_files": staged_files,
        "skill_path": skill_copy_path,
        "output_dir": str(output_dir),
        "inputs_dir": str(inputs_dir),
        "outputs_dir": str(outputs_dir),
        "no_skill": no_skill,
        "original_skill_path": str(skill_path)
    }
    # Write metadata file
    metadata_path = output_dir / "eval_metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)
    print(f" Wrote: {metadata_path}")
    return metadata
def determine_workspace(skill_path: Path) -> Path:
    """
    Pick a workspace directory for eval runs.

    Prefers <skill>/workspace/ when the skill directory is writable;
    otherwise falls back to <project-root>/<skill-name>-workspace/.
    """
    skill_path = Path(skill_path).resolve()
    if is_writable(skill_path):
        return skill_path / "workspace"
    # Walk upward looking for a .git directory to identify the project root.
    candidate = skill_path
    while candidate != candidate.parent:
        if (candidate / ".git").exists():
            break
        candidate = candidate.parent
    if candidate == candidate.parent:
        # Reached the filesystem root without finding .git; use the
        # skill's parent directory instead.
        candidate = skill_path.parent
    return candidate / f"{skill_path.name}-workspace"
def main():
    """CLI entry point: parse arguments and prepare a single eval run.

    Exits 1 on usage errors or any failure inside prepare_eval.
    """
    args = sys.argv[1:]
    if len(args) < 4 or "--output-dir" not in args:
        print(__doc__)
        sys.exit(1)
    # Positional arguments come first: <skill-path> <eval-id>
    skill_path = args[0]
    try:
        eval_id = int(args[1])
    except ValueError:
        # Report a clean error instead of an uncaught-exception traceback.
        print(f"Error: <eval-id> must be an integer, got {args[1]!r}")
        sys.exit(1)
    no_skill = "--no-skill" in args
    # Find --output-dir value
    output_dir_idx = args.index("--output-dir")
    if output_dir_idx + 1 >= len(args):
        print("Error: --output-dir requires a value")
        sys.exit(1)
    output_dir = args[output_dir_idx + 1]
    print(f"Preparing eval {eval_id} for skill: {skill_path}")
    print(f"Output directory: {output_dir}")
    if no_skill:
        print("Mode: without skill (baseline)")
    else:
        print("Mode: with skill")
    print()
    try:
        metadata = prepare_eval(
            skill_path=Path(skill_path),
            eval_id=eval_id,
            output_dir=Path(output_dir),
            no_skill=no_skill
        )
        print()
        print("Eval prepared successfully!")
        # Truncate long prompts in the summary line.
        prompt = metadata['prompt']
        if len(prompt) > 60:
            print(f" Prompt: {prompt[:60]}...")
        else:
            print(f" Prompt: {prompt}")
        print(f" Assertions: {len(metadata['assertions'])}")
        print(f" Input files: {len(metadata['input_files'])}")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Quick validation script for skills - minimal version
"""
import sys
import os
import re
import yaml
from pathlib import Path
def validate_skill(skill_path):
    """Basic validation of a skill.

    Checks that SKILL.md exists, has parseable YAML frontmatter, uses only
    the allowed frontmatter properties, and that the name/description
    fields satisfy the naming and length rules.

    Args:
        skill_path: Path (or str) to the skill directory.

    Returns:
        Tuple of (is_valid: bool, message: str).
    """
    skill_path = Path(skill_path)
    # Check SKILL.md exists
    skill_md = skill_path / 'SKILL.md'
    if not skill_md.exists():
        return False, "SKILL.md not found"
    # Read explicitly as UTF-8 so the result doesn't depend on the locale's
    # default encoding.
    content = skill_md.read_text(encoding='utf-8')
    if not content.startswith('---'):
        return False, "No YAML frontmatter found"
    # Extract frontmatter; accept both LF and CRLF line endings so skills
    # authored on Windows don't fail with "Invalid frontmatter format".
    match = re.match(r'^---\r?\n(.*?)\r?\n---', content, re.DOTALL)
    if not match:
        return False, "Invalid frontmatter format"
    frontmatter_text = match.group(1)
    # Parse YAML frontmatter
    try:
        frontmatter = yaml.safe_load(frontmatter_text)
        if not isinstance(frontmatter, dict):
            return False, "Frontmatter must be a YAML dictionary"
    except yaml.YAMLError as e:
        return False, f"Invalid YAML in frontmatter: {e}"
    # Define allowed properties
    ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'}
    # Check for unexpected properties (excluding nested keys under metadata)
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )
    # Check required fields
    if 'name' not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if 'description' not in frontmatter:
        return False, "Missing 'description' in frontmatter"
    # Extract name for validation
    name = frontmatter.get('name', '')
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    if name:
        # Check naming convention (kebab-case: lowercase with hyphens)
        if not re.match(r'^[a-z0-9-]+$', name):
            return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
        if name.startswith('-') or name.endswith('-') or '--' in name:
            return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
        # Check name length (max 64 characters per spec)
        if len(name) > 64:
            return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."
    # Extract and validate description
    description = frontmatter.get('description', '')
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if description:
        # Check for angle brackets
        if '<' in description or '>' in description:
            return False, "Description cannot contain angle brackets (< or >)"
        # Check description length (max 1024 characters per spec)
        if len(description) > 1024:
            return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."
    # Validate compatibility field if present (optional)
    compatibility = frontmatter.get('compatibility', '')
    if compatibility:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."
    return True, "Skill is valid!"
# Script entry point: validate the skill directory given on the command line
# and exit 0 on success, 1 on failure (so shell callers can branch on $?).
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)
    valid, message = validate_skill(sys.argv[1])
    print(message)
    sys.exit(0 if valid else 1)

View File

@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Validate JSON files produced by skill-creator-edge.
Supports validation for:
- evals.json: Skill evaluation definitions
- grading.json: Grader output
- benchmark.json: Benchmark results
- metrics.json: Executor metrics
- timing.json: Timing data
- history.json: Improve mode version history
- comparison.json: Blind comparator output
- analysis.json: Post-hoc analyzer output
Usage:
python validate_json.py <file_path> [--type <type>]
Examples:
python validate_json.py workspace/benchmark.json
python validate_json.py evals/evals.json --type evals
python validate_json.py run-1/grading.json --type grading
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Any
# Schema definitions as validation rules.
# Each entry names the required top-level keys for one JSON artifact type,
# plus nested rule sets (e.g. "evals_item", "summary") that the per-type
# validator functions below apply to list elements or sub-objects.
SCHEMAS = {
    # Skill evaluation definitions (evals/evals.json)
    "evals": {
        "required": ["skill_name", "evals"],
        "evals_item": {
            "required": ["id", "prompt"],
            "optional": ["expected_output", "files", "expectations"]
        }
    },
    # Grader output for a single run (grading.json)
    "grading": {
        "required": ["expectations", "summary"],
        "summary": {
            "required": ["passed", "failed", "total", "pass_rate"]
        },
        "expectations_item": {
            "required": ["text", "passed", "evidence"]
        }
    },
    # Aggregated benchmark results (benchmark.json)
    "benchmark": {
        "required": ["metadata", "runs", "run_summary"],
        "metadata": {
            "required": ["skill_name", "timestamp", "runs_per_configuration"]
        },
        "runs_item": {
            "required": ["eval_id", "configuration", "run_number", "result"]
        },
        "run_summary_config": {
            "required": ["pass_rate", "time_seconds", "tokens"]
        }
    },
    # Executor metrics (metrics.json)
    "metrics": {
        "required": ["tool_calls", "total_tool_calls"],
        "optional": ["total_steps", "files_created", "errors_encountered",
                     "output_chars", "transcript_chars"]
    },
    # Timing data (timing.json)
    "timing": {
        "required": [],  # All fields optional but should have at least one
        "optional": ["executor_start", "executor_end", "executor_duration_seconds",
                     "grader_start", "grader_end", "grader_duration_seconds",
                     "total_duration_seconds"]
    },
    # Improve-mode version history (history.json)
    "history": {
        "required": ["started_at", "skill_name", "current_best", "iterations"],
        "iterations_item": {
            "required": ["version", "expectation_pass_rate", "grading_result", "is_current_best"]
        }
    },
    # Blind comparator output (comparison*.json)
    "comparison": {
        "required": ["winner", "reasoning", "rubric", "output_quality"],
        "rubric_side": {
            "required": ["content", "structure", "overall_score"]
        }
    },
    # Post-hoc analyzer output (analysis.json)
    "analysis": {
        "required": ["comparison_summary", "winner_strengths", "loser_weaknesses",
                     "improvement_suggestions"],
        "improvement_item": {
            "required": ["priority", "category", "suggestion"]
        }
    }
}
def infer_type(file_path: Path) -> str | None:
"""Infer JSON type from filename."""
name = file_path.name.lower()
if name == "evals.json":
return "evals"
elif name == "grading.json":
return "grading"
elif name == "benchmark.json":
return "benchmark"
elif name == "metrics.json":
return "metrics"
elif name == "timing.json":
return "timing"
elif name == "history.json":
return "history"
elif name.startswith("comparison"):
return "comparison"
elif name == "analysis.json":
return "analysis"
return None
def validate_required_fields(data: dict, required: list[str], path: str = "") -> list[str]:
    """Return one error string (prefixed with *path*) per missing field."""
    return [
        f"{path}Missing required field: {field}"
        for field in required
        if field not in data
    ]
def validate_evals(data: dict) -> list[str]:
    """Validate evals.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["evals"]["required"])
    if "evals" not in data:
        return errors
    evals = data["evals"]
    if not isinstance(evals, list):
        errors.append("'evals' must be a list")
        return errors
    item_required = SCHEMAS["evals"]["evals_item"]["required"]
    for idx, item in enumerate(evals):
        errors.extend(
            validate_required_fields(item, item_required, f"evals[{idx}]: ")
        )
        # "id" must be an integer index.
        if "id" in item and not isinstance(item["id"], int):
            errors.append(f"evals[{idx}]: 'id' must be an integer")
        # "expectations", when present, must be a list of strings.
        if "expectations" in item:
            expectations = item["expectations"]
            if not isinstance(expectations, list):
                errors.append(f"evals[{idx}]: 'expectations' must be a list")
            else:
                for j, expectation in enumerate(expectations):
                    if not isinstance(expectation, str):
                        errors.append(f"evals[{idx}].expectations[{j}]: must be a string")
    return errors
def validate_grading(data: dict) -> list[str]:
    """Validate grading.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["grading"]["required"])
    if "summary" in data:
        summary = data["summary"]
        errors.extend(
            validate_required_fields(
                summary, SCHEMAS["grading"]["summary"]["required"], "summary: "
            )
        )
        # pass_rate must be a numeric fraction in [0, 1].
        if "pass_rate" in summary:
            rate = summary["pass_rate"]
            if not isinstance(rate, (int, float)) or rate < 0 or rate > 1:
                errors.append("summary.pass_rate must be a number between 0 and 1")
    if "expectations" in data:
        expectations = data["expectations"]
        if not isinstance(expectations, list):
            errors.append("'expectations' must be a list")
        else:
            item_required = SCHEMAS["grading"]["expectations_item"]["required"]
            for idx, item in enumerate(expectations):
                errors.extend(
                    validate_required_fields(item, item_required, f"expectations[{idx}]: ")
                )
                if "passed" in item and not isinstance(item["passed"], bool):
                    errors.append(f"expectations[{idx}].passed must be a boolean")
    return errors
def validate_benchmark(data: dict) -> list[str]:
    """Validate benchmark.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["benchmark"]["required"])
    if "metadata" in data:
        errors.extend(
            validate_required_fields(
                data["metadata"],
                SCHEMAS["benchmark"]["metadata"]["required"],
                "metadata: ",
            )
        )
    if "runs" in data:
        runs = data["runs"]
        if not isinstance(runs, list):
            errors.append("'runs' must be a list")
        else:
            run_required = SCHEMAS["benchmark"]["runs_item"]["required"]
            for idx, run in enumerate(runs):
                errors.extend(
                    validate_required_fields(run, run_required, f"runs[{idx}]: ")
                )
                # Only the two known configurations are allowed.
                if "configuration" in run and run["configuration"] not in ["with_skill", "without_skill"]:
                    errors.append(f"runs[{idx}].configuration must be 'with_skill' or 'without_skill'")
    if "run_summary" in data:
        summary_required = SCHEMAS["benchmark"]["run_summary_config"]["required"]
        for config in ["with_skill", "without_skill"]:
            if config in data["run_summary"]:
                errors.extend(
                    validate_required_fields(
                        data["run_summary"][config],
                        summary_required,
                        f"run_summary.{config}: ",
                    )
                )
    return errors
def validate_metrics(data: dict) -> list[str]:
    """Validate metrics.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["metrics"]["required"])
    # Type checks for the two required fields.
    type_checks = (
        ("tool_calls", dict, "'tool_calls' must be an object"),
        ("total_tool_calls", int, "'total_tool_calls' must be an integer"),
    )
    for field, expected_type, message in type_checks:
        if field in data and not isinstance(data[field], expected_type):
            errors.append(message)
    return errors
def validate_timing(data: dict) -> list[str]:
    """Validate timing.json structure; return a list of error strings."""
    errors = []
    # Require at least one of the known timing fields to be present.
    if not any(field in data for field in SCHEMAS["timing"]["optional"]):
        errors.append("timing.json should have at least one timing field")
    # Duration fields, when present, must be numeric.
    duration_fields = (
        "executor_duration_seconds",
        "grader_duration_seconds",
        "total_duration_seconds",
    )
    for field in duration_fields:
        if field in data and not isinstance(data[field], (int, float)):
            errors.append(f"'{field}' must be a number")
    return errors
def validate_history(data: dict) -> list[str]:
    """Validate history.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["history"]["required"])
    if "iterations" not in data:
        return errors
    iterations = data["iterations"]
    if not isinstance(iterations, list):
        errors.append("'iterations' must be a list")
        return errors
    item_required = SCHEMAS["history"]["iterations_item"]["required"]
    valid_results = ["baseline", "won", "lost", "tie"]
    for idx, iteration in enumerate(iterations):
        errors.extend(
            validate_required_fields(iteration, item_required, f"iterations[{idx}]: ")
        )
        if "grading_result" in iteration and iteration["grading_result"] not in valid_results:
            errors.append(f"iterations[{idx}].grading_result must be one of: {valid_results}")
    return errors
def validate_comparison(data: dict) -> list[str]:
    """Validate comparison.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["comparison"]["required"])
    # Winner is a forced choice between the two sides, or a tie.
    if "winner" in data and data["winner"] not in ["A", "B", "TIE"]:
        errors.append("'winner' must be 'A', 'B', or 'TIE'")
    if "rubric" in data:
        side_required = SCHEMAS["comparison"]["rubric_side"]["required"]
        for side in ["A", "B"]:
            if side in data["rubric"]:
                errors.extend(
                    validate_required_fields(
                        data["rubric"][side], side_required, f"rubric.{side}: "
                    )
                )
    return errors
def validate_analysis(data: dict) -> list[str]:
    """Validate analysis.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["analysis"]["required"])
    if "improvement_suggestions" not in data:
        return errors
    suggestions = data["improvement_suggestions"]
    if not isinstance(suggestions, list):
        errors.append("'improvement_suggestions' must be a list")
        return errors
    item_required = SCHEMAS["analysis"]["improvement_item"]["required"]
    for idx, suggestion in enumerate(suggestions):
        errors.extend(
            validate_required_fields(
                suggestion, item_required, f"improvement_suggestions[{idx}]: "
            )
        )
        if "priority" in suggestion and suggestion["priority"] not in ["high", "medium", "low"]:
            errors.append(f"improvement_suggestions[{idx}].priority must be 'high', 'medium', or 'low'")
    return errors
# Dispatch table: JSON type name -> validator function. The keys double as
# the accepted values for the CLI --type flag and for infer_type() results.
VALIDATORS = {
    "evals": validate_evals,
    "grading": validate_grading,
    "benchmark": validate_benchmark,
    "metrics": validate_metrics,
    "timing": validate_timing,
    "history": validate_history,
    "comparison": validate_comparison,
    "analysis": validate_analysis,
}
def validate_file(file_path: Path, json_type: str | None = None) -> tuple[bool, list[str]]:
    """
    Validate a JSON file.

    Args:
        file_path: Path to the JSON document.
        json_type: Schema key from VALIDATORS; inferred from the filename
            when omitted.

    Returns (is_valid, errors) tuple.
    """
    # Check file exists
    if not file_path.exists():
        return False, [f"File not found: {file_path}"]
    # Load JSON
    try:
        with open(file_path) as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return False, [f"Invalid JSON: {e}"]
    # Infer type if not provided
    if json_type is None:
        json_type = infer_type(file_path)
        if json_type is None:
            return False, ["Could not infer JSON type from filename. Use --type to specify."]
    if json_type not in VALIDATORS:
        return False, [f"Unknown JSON type: {json_type}. Valid types: {list(VALIDATORS.keys())}"]
    # Run the schema-specific validator.
    errors = VALIDATORS[json_type](data)
    return len(errors) == 0, errors
def main():
    """CLI entry point: validate one JSON file, exit 0 if valid else 1."""
    parser = argparse.ArgumentParser(
        description="Validate JSON files produced by skill-creator-edge"
    )
    parser.add_argument("file", type=Path, help="Path to the JSON file to validate")
    parser.add_argument(
        "--type", "-t",
        choices=list(VALIDATORS.keys()),
        help="JSON type (inferred from filename if not specified)"
    )
    args = parser.parse_args()
    is_valid, errors = validate_file(args.file, args.type)
    if not is_valid:
        print(f"{args.file} has {len(errors)} error(s):")
        for error in errors:
            print(f" - {error}")
        sys.exit(1)
    print(f"{args.file} is valid")
    sys.exit(0)
# Script entry point.
if __name__ == "__main__":
    main()