mirror of
https://github.com/anthropics/claude-plugins-official.git
synced 2026-03-18 23:03:07 +00:00
Add skill-creator plugin
This commit is contained in:
351
plugins/skill-creator/skills/skill-creator/scripts/aggregate_benchmark.py
Executable file
351
plugins/skill-creator/skills/skill-creator/scripts/aggregate_benchmark.py
Executable file
@@ -0,0 +1,351 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Aggregate individual run results into benchmark summary statistics.
|
||||
|
||||
Reads grading.json files from run directories and produces:
|
||||
- run_summary with mean, stddev, min, max for each metric
|
||||
- delta between with_skill and without_skill configurations
|
||||
|
||||
Usage:
|
||||
python aggregate_benchmark.py <benchmark_dir>
|
||||
|
||||
Example:
|
||||
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
|
||||
|
||||
The script expects this directory structure:
|
||||
<benchmark_dir>/
|
||||
└── runs/
|
||||
└── eval-N/
|
||||
├── with_skill/
|
||||
│ ├── run-1/grading.json
|
||||
│ ├── run-2/grading.json
|
||||
│ └── run-3/grading.json
|
||||
└── without_skill/
|
||||
├── run-1/grading.json
|
||||
├── run-2/grading.json
|
||||
└── run-3/grading.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def calculate_stats(values: list[float]) -> dict:
    """Summarize a sample: mean, sample stddev, min, and max, rounded to 4 dp.

    An empty sample yields all-zero statistics rather than raising.
    """
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    avg = sum(values) / count

    # Sample (n-1) standard deviation; a single observation has no spread.
    if count > 1:
        spread = math.sqrt(sum((v - avg) ** 2 for v in values) / (count - 1))
    else:
        spread = 0.0

    return {
        "mean": round(avg, 4),
        "stddev": round(spread, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4),
    }
|
||||
|
||||
|
||||
def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all per-run grading results from a benchmark directory.

    Expects <benchmark_dir>/runs/eval-N/{with_skill,without_skill}/run-M/grading.json.

    Args:
        benchmark_dir: Root benchmark directory containing a "runs" subdirectory.

    Returns:
        Dict with structure:
        {
            "with_skill": [
                {"eval_id": 1, "run_number": 1, "pass_rate": 0.85, ...},
                ...
            ],
            "without_skill": [...]
        }
        Missing or unparsable grading.json files are skipped with a warning;
        directories whose name lacks a numeric suffix are skipped silently.
    """
    results = {"with_skill": [], "without_skill": []}
    runs_dir = benchmark_dir / "runs"

    if not runs_dir.exists():
        print(f"Runs directory not found: {runs_dir}")
        return results

    def dir_index(path: Path) -> int:
        """Numeric suffix of an eval-N / run-N directory name, or -1 if malformed."""
        try:
            return int(path.name.split("-")[1])
        except (IndexError, ValueError):
            return -1

    # Sort numerically so eval-10 follows eval-9 (plain sorted() is lexicographic
    # and would put "eval-10" before "eval-2").
    for eval_dir in sorted(runs_dir.glob("eval-*"), key=dir_index):
        eval_id = dir_index(eval_dir)
        if eval_id < 0:
            # Malformed directory name (e.g. "eval-foo"); the original crashed here.
            continue

        for config in ("with_skill", "without_skill"):
            config_dir = eval_dir / config
            if not config_dir.exists():
                continue

            for run_dir in sorted(config_dir.glob("run-*"), key=dir_index):
                run_number = dir_index(run_dir)
                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue

                # Hoist the nested sub-dicts once instead of re-fetching per field.
                summary = grading.get("summary", {})
                timing = grading.get("timing", {})
                metrics = grading.get("execution_metrics", {})
                notes_summary = grading.get("user_notes_summary", {})

                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                    "time_seconds": timing.get("total_duration_seconds", 0.0),
                    "tool_calls": metrics.get("total_tool_calls", 0),
                    "tokens": metrics.get("output_chars", 0),  # Placeholder: chars, not real tokens
                    "errors": metrics.get("errors_encountered", 0),
                    "expectations": grading.get("expectations", []),
                    # Notes aggregate all three reviewer-facing categories, in order.
                    "notes": (
                        notes_summary.get("uncertainties", [])
                        + notes_summary.get("needs_review", [])
                        + notes_summary.get("workarounds", [])
                    ),
                }

                results[config].append(result)

    return results
|
||||
|
||||
|
||||
def aggregate_results(results: dict) -> dict:
    """
    Aggregate per-run results into summary statistics.

    Args:
        results: Mapping of configuration name -> list of run result dicts
            (as produced by load_run_results).

    Returns:
        run_summary dict with per-configuration stats for pass_rate,
        time_seconds and tokens, plus a "delta" entry containing the
        signed, pre-formatted with_skill-minus-without_skill mean differences.
    """
    summary = {}

    for config in ("with_skill", "without_skill"):
        config_runs = results.get(config, [])

        if not config_runs:
            # No data for this configuration: emit all-zero placeholder stats.
            summary[config] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }
        else:
            summary[config] = {
                "pass_rate": calculate_stats([run["pass_rate"] for run in config_runs]),
                "time_seconds": calculate_stats([run["time_seconds"] for run in config_runs]),
                "tokens": calculate_stats([run.get("tokens", 0) for run in config_runs]),
            }

    def mean_of(config: str, metric: str):
        """Mean of one metric for one configuration, defaulting to 0."""
        return summary.get(config, {}).get(metric, {}).get("mean", 0)

    # Positive pass-rate delta means the skill helped; formats match the report.
    summary["delta"] = {
        "pass_rate": f"{mean_of('with_skill', 'pass_rate') - mean_of('without_skill', 'pass_rate'):+.2f}",
        "time_seconds": f"{mean_of('with_skill', 'time_seconds') - mean_of('without_skill', 'time_seconds'):+.1f}",
        "tokens": f"{mean_of('with_skill', 'tokens') - mean_of('without_skill', 'tokens'):+.0f}",
    }

    return summary
|
||||
|
||||
|
||||
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Assemble the complete benchmark.json payload from on-disk run results.

    Args:
        benchmark_dir: Directory containing the runs/ tree of grading results.
        skill_name: Human-readable skill name (placeholder text when empty).
        skill_path: Path to the skill (placeholder text when empty).

    Returns:
        Dict with "metadata", "runs", "run_summary" and an empty "notes" list
        (the notes are filled in later by the analyzer).
    """
    results = load_run_results(benchmark_dir)

    # Flatten per-configuration run records into the benchmark.json "runs" array.
    runs = [
        {
            "eval_id": entry["eval_id"],
            "configuration": config,
            "run_number": entry["run_number"],
            "result": {
                "pass_rate": entry["pass_rate"],
                "passed": entry["passed"],
                "failed": entry["failed"],
                "total": entry["total"],
                "time_seconds": entry["time_seconds"],
                "tokens": entry.get("tokens", 0),
                "tool_calls": entry.get("tool_calls", 0),
                "errors": entry.get("errors", 0),
            },
            "expectations": entry["expectations"],
            "notes": entry["notes"],
        }
        for config in ("with_skill", "without_skill")
        for entry in results.get(config, [])
    ]

    # Unique eval IDs seen across both configurations, ascending.
    eval_ids = sorted({entry["eval_id"] for entries in results.values() for entry in entries})

    return {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": 3,
        },
        "runs": runs,
        "run_summary": aggregate_results(results),
        "notes": [],  # To be filled by analyzer
    }
|
||||
|
||||
|
||||
def generate_markdown(benchmark: dict) -> str:
    """Render a human-readable benchmark.md report from benchmark data."""
    meta = benchmark["metadata"]
    stats = benchmark["run_summary"]

    # One formatter per metric: pass rates as percentages, times with an "s"
    # suffix, token counts as plain integers.
    def pct(cell: dict) -> str:
        return f"{cell['mean'] * 100:.0f}% ± {cell['stddev'] * 100:.0f}%"

    def secs(cell: dict) -> str:
        return f"{cell['mean']:.1f}s ± {cell['stddev']:.1f}s"

    def toks(cell: dict) -> str:
        return f"{cell['mean']:.0f} ± {cell['stddev']:.0f}"

    delta = stats["delta"]
    with_cfg = stats["with_skill"]
    without_cfg = stats["without_skill"]

    lines = [
        f"# Skill Benchmark: {meta['skill_name']}",
        "",
        f"**Model**: {meta['executor_model']}",
        f"**Date**: {meta['timestamp']}",
        f"**Evals**: {', '.join(map(str, meta['evals_run']))} ({meta['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        "| Metric | With Skill | Without Skill | Delta |",
        "|--------|------------|---------------|-------|",
        f"| Pass Rate | {pct(with_cfg['pass_rate'])} | {pct(without_cfg['pass_rate'])} | {delta['pass_rate']} |",
        f"| Time | {secs(with_cfg['time_seconds'])} | {secs(without_cfg['time_seconds'])} | {delta['time_seconds']}s |",
        f"| Tokens | {toks(with_cfg['tokens'])} | {toks(without_cfg['tokens'])} | {delta['tokens']} |",
    ]

    # Optional free-form notes appended by the analyzer.
    if benchmark.get("notes"):
        lines += ["", "## Notes", ""]
        lines += [f"- {note}" for note in benchmark["notes"]]

    return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: aggregate run results and write benchmark.json / .md."""
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument("benchmark_dir", type=Path, help="Path to the benchmark directory")
    parser.add_argument("--skill-name", default="", help="Name of the skill being benchmarked")
    parser.add_argument("--skill-path", default="", help="Path to the skill being benchmarked")
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)",
    )
    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # The markdown report lives next to the JSON, same stem.
    json_path = args.output or (args.benchmark_dir / "benchmark.json")
    md_path = json_path.with_suffix(".md")

    with open(json_path, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {json_path}")

    with open(md_path, "w") as f:
        f.write(generate_markdown(benchmark))
    print(f"Generated: {md_path}")

    # Short console recap of the headline numbers.
    stats = benchmark["run_summary"]
    print("\nSummary:")
    print(f"  With skill: {stats['with_skill']['pass_rate']['mean'] * 100:.1f}% pass rate")
    print(f"  Without skill: {stats['without_skill']['pass_rate']['mean'] * 100:.1f}% pass rate")
    print(f"  Delta: {stats['delta']['pass_rate']}")
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not on import.
    main()
|
||||
209
plugins/skill-creator/skills/skill-creator/scripts/copy_skill.py
Executable file
209
plugins/skill-creator/skills/skill-creator/scripts/copy_skill.py
Executable file
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Copy a skill directory with metadata tracking.
|
||||
|
||||
Creates a copy of a skill directory and adds a META.yaml file
|
||||
to track lineage, changes, and performance metrics.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def create_meta_yaml(
|
||||
dest: Path,
|
||||
parent: str | None,
|
||||
changes: str | None,
|
||||
score: float | None,
|
||||
iteration: int | None,
|
||||
) -> None:
|
||||
"""Create META.yaml file in the destination directory."""
|
||||
created_at = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
# Build YAML content manually to avoid external dependencies
|
||||
lines = ["# Skill iteration metadata", ""]
|
||||
|
||||
# Helper to format YAML values
|
||||
def yaml_value(val):
|
||||
if val is None:
|
||||
return "null"
|
||||
if isinstance(val, bool):
|
||||
return "true" if val else "false"
|
||||
if isinstance(val, (int, float)):
|
||||
return str(val)
|
||||
if isinstance(val, str):
|
||||
# Quote strings that might be ambiguous
|
||||
if val in ("null", "true", "false") or val.startswith(("'", '"', "[", "{")):
|
||||
return f'"{val}"'
|
||||
# Quote strings with special characters
|
||||
if any(c in val for c in (":", "#", "\n", '"', "'")):
|
||||
escaped = val.replace("\\", "\\\\").replace('"', '\\"')
|
||||
return f'"{escaped}"'
|
||||
return val
|
||||
return str(val)
|
||||
|
||||
lines.append(f"parent: {yaml_value(parent)}")
|
||||
lines.append(f"changes: {yaml_value(changes)}")
|
||||
lines.append(f"score: {yaml_value(score)}")
|
||||
lines.append(f"iteration: {yaml_value(iteration)}")
|
||||
lines.append(f"created_at: {yaml_value(created_at)}")
|
||||
lines.append("")
|
||||
|
||||
meta_path = dest / "META.yaml"
|
||||
meta_path.write_text("\n".join(lines))
|
||||
|
||||
|
||||
def copy_skill(
    source: Path,
    dest: Path,
    parent: str | None = None,
    changes: str | None = None,
    score: float | None = None,
    iteration: int | None = None,
) -> None:
    """
    Copy a skill directory into a new version directory structure.

    Creates <dest>/ containing:
      - skill/        : the skill files, copied from source
      - runs/         : empty; the executor populates run-1/, run-2/, run-3/
      - improvements/ : only for iteration >= 1
      - META.yaml     : version metadata

    Args:
        source: Existing skill directory to copy (or <version>/skill/ when
            copying from another version).
        dest: Version directory to create (e.g. workspace/v1); must not exist.
        parent: Name/path of the parent skill iteration.
        changes: Description of changes from the parent.
        score: Evaluation score for this iteration.
        iteration: Iteration number (0 = baseline).

    Raises:
        FileNotFoundError: source does not exist.
        ValueError: source is not a directory.
        FileExistsError: dest already exists.
    """
    source = Path(source).resolve()
    dest = Path(dest).resolve()

    # Validate everything up front, before touching the filesystem.
    if not source.exists():
        raise FileNotFoundError(f"Source directory does not exist: {source}")
    if not source.is_dir():
        raise ValueError(f"Source must be a directory: {source}")
    if dest.exists():
        raise FileExistsError(f"Destination already exists: {dest}")

    wants_improvements = iteration is not None and iteration > 0

    dest.mkdir(parents=True)
    skill_dest = dest / "skill"
    (dest / "runs").mkdir()
    if wants_improvements:
        (dest / "improvements").mkdir()

    shutil.copytree(source, skill_dest)
    create_meta_yaml(dest, parent, changes, score, iteration)

    print(f"Copied skill from {source} to {skill_dest}")
    print(f"Created version directory structure at {dest}")
    print(f"  - skill/ : Skill files")
    print(f"  - runs/ : For execution runs (run-1/, run-2/, run-3/)")
    if wants_improvements:
        print(f"  - improvements/ : Improvement suggestions")
    print(f"  - META.yaml : Version metadata")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and delegate to copy_skill()."""
    parser = argparse.ArgumentParser(
        description="Copy a skill directory with metadata tracking and version structure.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Create baseline v0 from an existing skill
  python copy_skill.py ./skills/public/pdf ./skill_iterations/v0 --iteration 0

  # Create v1 from v0's skill directory
  python copy_skill.py ./skill_iterations/v0/skill ./skill_iterations/v1 \\
      --parent v0 \\
      --changes "Added error handling for empty cells" \\
      --iteration 1

  # Create v2 with score from evaluation
  python copy_skill.py ./skill_iterations/v1/skill ./skill_iterations/v2 \\
      --parent v1 \\
      --changes "Improved coordinate guidance" \\
      --score 7.5 \\
      --iteration 2

Output structure:
  dest/
  ├── META.yaml          # Version metadata
  ├── skill/             # The actual skill files
  ├── runs/              # Execution runs (created by executor)
  │   ├── run-1/
  │   │   ├── transcript.md
  │   │   ├── outputs/
  │   │   └── evaluation.json
  │   ├── run-2/
  │   └── run-3/
  └── improvements/      # Improvement suggestions (v1+)
""",
    )

    parser.add_argument("source", type=Path, help="Source skill directory to copy")
    parser.add_argument("dest", type=Path, help="Destination path for the copy")
    parser.add_argument(
        "--parent", type=str, default=None,
        help="Name or path of the parent skill iteration",
    )
    parser.add_argument(
        "--changes", type=str, default=None,
        help="Description of changes from the parent version",
    )
    parser.add_argument(
        "--score", type=float, default=None,
        help="Evaluation score for this iteration (e.g., 7.5)",
    )
    parser.add_argument(
        "--iteration", type=int, default=None,
        help="Iteration number (e.g., 1, 2, 3)",
    )

    args = parser.parse_args()

    try:
        copy_skill(
            source=args.source,
            dest=args.dest,
            parent=args.parent,
            changes=args.changes,
            score=args.score,
            iteration=args.iteration,
        )
    except (FileNotFoundError, FileExistsError, ValueError) as exc:
        # parser.error prints usage plus the message and exits with status 2.
        parser.error(str(exc))
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not on import.
    main()
|
||||
311
plugins/skill-creator/skills/skill-creator/scripts/init_json.py
Executable file
311
plugins/skill-creator/skills/skill-creator/scripts/init_json.py
Executable file
@@ -0,0 +1,311 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Initialize JSON files with the correct structure for skill-creator-edge.
|
||||
|
||||
Creates template JSON files that can be filled in.
|
||||
|
||||
Usage:
|
||||
python init_json.py <type> <output_path>
|
||||
|
||||
Examples:
|
||||
python init_json.py evals evals/evals.json
|
||||
python init_json.py grading run-1/grading.json
|
||||
python init_json.py benchmark benchmarks/2026-01-15/benchmark.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Template skeletons, keyed by the CLI "type" argument.  Each value is written
# verbatim via json.dump for the user (or a later tool) to fill in.
# NOTE(review): every datetime.now(...) below is evaluated once at module
# import time, so generated files carry the import timestamp rather than the
# moment init_json() writes them — confirm this is acceptable for callers.
TEMPLATES = {
    # Eval-suite definition: the tasks a skill is benchmarked against.
    "evals": {
        "skill_name": "<skill-name>",
        "evals": [
            {
                "id": 1,
                "prompt": "Example task prompt",
                "expected_output": "Description of expected result",
                "files": [],
                "expectations": [
                    "The output includes X",
                    "The skill correctly handles Y"
                ]
            }
        ]
    },

    # Per-run grading output (one grading.json per run directory).
    "grading": {
        "expectations": [
            {
                "text": "Example expectation",
                "passed": True,
                "evidence": "Found in transcript: ..."
            }
        ],
        "summary": {
            "passed": 1,
            "failed": 0,
            "total": 1,
            "pass_rate": 1.0
        },
        "execution_metrics": {
            "tool_calls": {
                "Read": 0,
                "Write": 0,
                "Bash": 0,
                "Edit": 0,
                "Glob": 0,
                "Grep": 0
            },
            "total_tool_calls": 0,
            "total_steps": 0,
            "errors_encountered": 0,
            "output_chars": 0,
            "transcript_chars": 0
        },
        "timing": {
            "executor_duration_seconds": 0.0,
            "grader_duration_seconds": 0.0,
            "total_duration_seconds": 0.0
        },
        "claims": [],
        "user_notes_summary": {
            "uncertainties": [],
            "needs_review": [],
            "workarounds": []
        }
    },

    # Aggregated benchmark summary across runs and configurations.
    "benchmark": {
        "metadata": {
            "skill_name": "<skill-name>",
            "skill_path": "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": [1],
            "runs_per_configuration": 3
        },
        "runs": [
            {
                "eval_id": 1,
                "configuration": "with_skill",
                "run_number": 1,
                "result": {
                    "pass_rate": 0.0,
                    "passed": 0,
                    "failed": 0,
                    "total": 0,
                    "time_seconds": 0.0,
                    "tokens": 0,
                    "tool_calls": 0,
                    "errors": 0
                },
                "expectations": [],
                "notes": []
            }
        ],
        "run_summary": {
            "with_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            "without_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            # Deltas are pre-formatted signed strings, matching the aggregator.
            "delta": {
                "pass_rate": "+0.0",
                "time_seconds": "+0.0",
                "tokens": "+0"
            }
        },
        "notes": []
    },

    # Raw execution metrics for a single run.
    "metrics": {
        "tool_calls": {
            "Read": 0,
            "Write": 0,
            "Bash": 0,
            "Edit": 0,
            "Glob": 0,
            "Grep": 0
        },
        "total_tool_calls": 0,
        "total_steps": 0,
        "files_created": [],
        "errors_encountered": 0,
        "output_chars": 0,
        "transcript_chars": 0
    },

    # Executor/grader timing for a single run.
    "timing": {
        "executor_start": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "executor_end": "",
        "executor_duration_seconds": 0.0,
        "grader_start": "",
        "grader_end": "",
        "grader_duration_seconds": 0.0,
        "total_duration_seconds": 0.0
    },

    # Iteration history for a skill-improvement loop.
    "history": {
        "started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "skill_name": "<skill-name>",
        "current_best": "v0",
        "iterations": [
            {
                "version": "v0",
                "parent": None,
                "expectation_pass_rate": 0.0,
                "grading_result": "baseline",
                "is_current_best": True
            }
        ]
    },

    # Head-to-head A/B comparison of two skill versions.
    "comparison": {
        "winner": "A",
        "reasoning": "Explanation of why the winner was chosen",
        "rubric": {
            "A": {
                "content": {
                    "correctness": 5,
                    "completeness": 5,
                    "accuracy": 5
                },
                "structure": {
                    "organization": 5,
                    "formatting": 5,
                    "usability": 5
                },
                "content_score": 5.0,
                "structure_score": 5.0,
                "overall_score": 10.0
            },
            "B": {
                "content": {
                    "correctness": 3,
                    "completeness": 3,
                    "accuracy": 3
                },
                "structure": {
                    "organization": 3,
                    "formatting": 3,
                    "usability": 3
                },
                "content_score": 3.0,
                "structure_score": 3.0,
                "overall_score": 6.0
            }
        },
        "output_quality": {
            "A": {
                "score": 10,
                "strengths": [],
                "weaknesses": []
            },
            "B": {
                "score": 6,
                "strengths": [],
                "weaknesses": []
            }
        }
    },

    # Analyzer's interpretation of a comparison, with improvement suggestions.
    "analysis": {
        "comparison_summary": {
            "winner": "A",
            "winner_skill": "<path/to/winner>",
            "loser_skill": "<path/to/loser>",
            "comparator_reasoning": "Summary of comparison"
        },
        "winner_strengths": [],
        "loser_weaknesses": [],
        "instruction_following": {
            "winner": {
                "score": 10,
                "issues": []
            },
            "loser": {
                "score": 5,
                "issues": []
            }
        },
        "improvement_suggestions": [
            {
                "priority": "high",
                "category": "instructions",
                "suggestion": "Specific improvement suggestion",
                "expected_impact": "Why this would help"
            }
        ],
        "transcript_insights": {
            "winner_execution_pattern": "Description of how winner executed",
            "loser_execution_pattern": "Description of how loser executed"
        }
    }
}
|
||||
|
||||
|
||||
def init_json(json_type: str, output_path: Path, force: bool = False) -> bool:
    """
    Write the template for *json_type* to *output_path*.

    Args:
        json_type: Key into TEMPLATES selecting which skeleton to write.
        output_path: Destination file; parent directories are created.
        force: When True, overwrite an existing file.

    Returns:
        True on success; False when the type is unknown or the file already
        exists and force is not set.
    """
    if json_type not in TEMPLATES:
        print(f"Unknown type: {json_type}. Valid types: {list(TEMPLATES.keys())}")
        return False

    # Refuse to clobber an existing file unless explicitly forced.
    if output_path.exists() and not force:
        print(f"File already exists: {output_path}")
        print("Use --force to overwrite")
        return False

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w") as f:
        json.dump(TEMPLATES[json_type], f, indent=2)

    print(f"Created {json_type} template: {output_path}")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: create one JSON template and exit 0/1 accordingly."""
    parser = argparse.ArgumentParser(
        description="Initialize JSON files with correct structure"
    )
    parser.add_argument(
        "type",
        choices=list(TEMPLATES.keys()),
        help="Type of JSON file to create",
    )
    parser.add_argument("output", type=Path, help="Output path for the JSON file")
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Overwrite existing file",
    )

    args = parser.parse_args()

    ok = init_json(args.type, args.output, args.force)
    sys.exit(0 if ok else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not on import.
    main()
|
||||
303
plugins/skill-creator/skills/skill-creator/scripts/init_skill.py
Executable file
303
plugins/skill-creator/skills/skill-creator/scripts/init_skill.py
Executable file
@@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Skill Initializer - Creates a new skill from template
|
||||
|
||||
Usage:
|
||||
init_skill.py <skill-name> --path <path>
|
||||
|
||||
Examples:
|
||||
init_skill.py my-new-skill --path skills/public
|
||||
init_skill.py my-api-helper --path skills/private
|
||||
init_skill.py custom-skill --path /custom/location
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
SKILL_TEMPLATE = """---
|
||||
name: {skill_name}
|
||||
description: [TODO: Complete and informative explanation of what the skill does and when to use it. Include WHEN to use this skill - specific scenarios, file types, or tasks that trigger it.]
|
||||
---
|
||||
|
||||
# {skill_title}
|
||||
|
||||
## Overview
|
||||
|
||||
[TODO: 1-2 sentences explaining what this skill enables]
|
||||
|
||||
## Structuring This Skill
|
||||
|
||||
[TODO: Choose the structure that best fits this skill's purpose. Common patterns:
|
||||
|
||||
**1. Workflow-Based** (best for sequential processes)
|
||||
- Works well when there are clear step-by-step procedures
|
||||
- Example: DOCX skill with "Workflow Decision Tree" → "Reading" → "Creating" → "Editing"
|
||||
- Structure: ## Overview → ## Workflow Decision Tree → ## Step 1 → ## Step 2...
|
||||
|
||||
**2. Task-Based** (best for tool collections)
|
||||
- Works well when the skill offers different operations/capabilities
|
||||
- Example: PDF skill with "Quick Start" → "Merge PDFs" → "Split PDFs" → "Extract Text"
|
||||
- Structure: ## Overview → ## Quick Start → ## Task Category 1 → ## Task Category 2...
|
||||
|
||||
**3. Reference/Guidelines** (best for standards or specifications)
|
||||
- Works well for brand guidelines, coding standards, or requirements
|
||||
- Example: Brand styling with "Brand Guidelines" → "Colors" → "Typography" → "Features"
|
||||
- Structure: ## Overview → ## Guidelines → ## Specifications → ## Usage...
|
||||
|
||||
**4. Capabilities-Based** (best for integrated systems)
|
||||
- Works well when the skill provides multiple interrelated features
|
||||
- Example: Product Management with "Core Capabilities" → numbered capability list
|
||||
- Structure: ## Overview → ## Core Capabilities → ### 1. Feature → ### 2. Feature...
|
||||
|
||||
Patterns can be mixed and matched as needed. Most skills combine patterns (e.g., start with task-based, add workflow for complex operations).
|
||||
|
||||
Delete this entire "Structuring This Skill" section when done - it's just guidance.]
|
||||
|
||||
## [TODO: Replace with the first main section based on chosen structure]
|
||||
|
||||
[TODO: Add content here. See examples in existing skills:
|
||||
- Code samples for technical skills
|
||||
- Decision trees for complex workflows
|
||||
- Concrete examples with realistic user requests
|
||||
- References to scripts/templates/references as needed]
|
||||
|
||||
## Resources
|
||||
|
||||
This skill includes example resource directories that demonstrate how to organize different types of bundled resources:
|
||||
|
||||
### scripts/
|
||||
Executable code (Python/Bash/etc.) that can be run directly to perform specific operations.
|
||||
|
||||
**Examples from other skills:**
|
||||
- PDF skill: `fill_fillable_fields.py`, `extract_form_field_info.py` - utilities for PDF manipulation
|
||||
- DOCX skill: `document.py`, `utilities.py` - Python modules for document processing
|
||||
|
||||
**Appropriate for:** Python scripts, shell scripts, or any executable code that performs automation, data processing, or specific operations.
|
||||
|
||||
**Note:** Scripts may be executed without loading into context, but can still be read by Claude for patching or environment adjustments.
|
||||
|
||||
### references/
|
||||
Documentation and reference material intended to be loaded into context to inform Claude's process and thinking.
|
||||
|
||||
**Examples from other skills:**
|
||||
- Product management: `communication.md`, `context_building.md` - detailed workflow guides
|
||||
- BigQuery: API reference documentation and query examples
|
||||
- Finance: Schema documentation, company policies
|
||||
|
||||
**Appropriate for:** In-depth documentation, API references, database schemas, comprehensive guides, or any detailed information that Claude should reference while working.
|
||||
|
||||
### assets/
|
||||
Files not intended to be loaded into context, but rather used within the output Claude produces.
|
||||
|
||||
**Examples from other skills:**
|
||||
- Brand styling: PowerPoint template files (.pptx), logo files
|
||||
- Frontend builder: HTML/React boilerplate project directories
|
||||
- Typography: Font files (.ttf, .woff2)
|
||||
|
||||
**Appropriate for:** Templates, boilerplate code, document templates, images, icons, fonts, or any files meant to be copied or used in the final output.
|
||||
|
||||
---
|
||||
|
||||
**Any unneeded directories can be deleted.** Not every skill requires all three types of resources.
|
||||
"""
|
||||
|
||||
EXAMPLE_SCRIPT = '''#!/usr/bin/env python3
|
||||
"""
|
||||
Example helper script for {skill_name}
|
||||
|
||||
This is a placeholder script that can be executed directly.
|
||||
Replace with actual implementation or delete if not needed.
|
||||
|
||||
Example real scripts from other skills:
|
||||
- pdf/scripts/fill_fillable_fields.py - Fills PDF form fields
|
||||
- pdf/scripts/convert_pdf_to_images.py - Converts PDF pages to images
|
||||
"""
|
||||
|
||||
def main():
|
||||
print("This is an example script for {skill_name}")
|
||||
# TODO: Add actual script logic here
|
||||
# This could be data processing, file conversion, API calls, etc.
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
'''
|
||||
|
||||
EXAMPLE_REFERENCE = """# Reference Documentation for {skill_title}
|
||||
|
||||
This is a placeholder for detailed reference documentation.
|
||||
Replace with actual reference content or delete if not needed.
|
||||
|
||||
Example real reference docs from other skills:
|
||||
- product-management/references/communication.md - Comprehensive guide for status updates
|
||||
- product-management/references/context_building.md - Deep-dive on gathering context
|
||||
- bigquery/references/ - API references and query examples
|
||||
|
||||
## When Reference Docs Are Useful
|
||||
|
||||
Reference docs are ideal for:
|
||||
- Comprehensive API documentation
|
||||
- Detailed workflow guides
|
||||
- Complex multi-step processes
|
||||
- Information too lengthy for main SKILL.md
|
||||
- Content that's only needed for specific use cases
|
||||
|
||||
## Structure Suggestions
|
||||
|
||||
### API Reference Example
|
||||
- Overview
|
||||
- Authentication
|
||||
- Endpoints with examples
|
||||
- Error codes
|
||||
- Rate limits
|
||||
|
||||
### Workflow Guide Example
|
||||
- Prerequisites
|
||||
- Step-by-step instructions
|
||||
- Common patterns
|
||||
- Troubleshooting
|
||||
- Best practices
|
||||
"""
|
||||
|
||||
EXAMPLE_ASSET = """# Example Asset File
|
||||
|
||||
This placeholder represents where asset files would be stored.
|
||||
Replace with actual asset files (templates, images, fonts, etc.) or delete if not needed.
|
||||
|
||||
Asset files are NOT intended to be loaded into context, but rather used within
|
||||
the output Claude produces.
|
||||
|
||||
Example asset files from other skills:
|
||||
- Brand guidelines: logo.png, slides_template.pptx
|
||||
- Frontend builder: hello-world/ directory with HTML/React boilerplate
|
||||
- Typography: custom-font.ttf, font-family.woff2
|
||||
- Data: sample_data.csv, test_dataset.json
|
||||
|
||||
## Common Asset Types
|
||||
|
||||
- Templates: .pptx, .docx, boilerplate directories
|
||||
- Images: .png, .jpg, .svg, .gif
|
||||
- Fonts: .ttf, .otf, .woff, .woff2
|
||||
- Boilerplate code: Project directories, starter files
|
||||
- Icons: .ico, .svg
|
||||
- Data files: .csv, .json, .xml, .yaml
|
||||
|
||||
Note: This is a text placeholder. Actual assets can be any file type.
|
||||
"""
|
||||
|
||||
|
||||
def title_case_skill_name(skill_name):
    """Turn a kebab-case skill name (e.g. 'my-data-analyzer') into Title Case."""
    return ' '.join(map(str.capitalize, skill_name.split('-')))
|
||||
|
||||
|
||||
def init_skill(skill_name, path):
    """
    Initialize a new skill directory with template SKILL.md.

    Creates <path>/<skill_name>/ containing SKILL.md plus the three standard
    resource directories (scripts/, references/, assets/), each seeded with a
    placeholder example file. Errors are reported to stdout rather than raised.

    Args:
        skill_name: Name of the skill
        path: Path where the skill directory should be created

    Returns:
        Path to created skill directory, or None if error
    """
    # Determine skill directory path
    skill_dir = Path(path).resolve() / skill_name

    # Check if directory already exists; never overwrite an existing skill.
    if skill_dir.exists():
        print(f"❌ Error: Skill directory already exists: {skill_dir}")
        return None

    # Create skill directory
    try:
        skill_dir.mkdir(parents=True, exist_ok=False)
        print(f"✅ Created skill directory: {skill_dir}")
    except Exception as e:
        print(f"❌ Error creating directory: {e}")
        return None

    # Create SKILL.md from template (SKILL_TEMPLATE is a module-level constant
    # with {skill_name}/{skill_title} placeholders).
    skill_title = title_case_skill_name(skill_name)
    skill_content = SKILL_TEMPLATE.format(
        skill_name=skill_name,
        skill_title=skill_title
    )

    skill_md_path = skill_dir / 'SKILL.md'
    try:
        skill_md_path.write_text(skill_content)
        print("✅ Created SKILL.md")
    except Exception as e:
        print(f"❌ Error creating SKILL.md: {e}")
        return None

    # Create resource directories with example files
    try:
        # Create scripts/ directory with example script
        scripts_dir = skill_dir / 'scripts'
        scripts_dir.mkdir(exist_ok=True)
        example_script = scripts_dir / 'example.py'
        example_script.write_text(EXAMPLE_SCRIPT.format(skill_name=skill_name))
        # Make the example script directly executable (rwxr-xr-x).
        example_script.chmod(0o755)
        print("✅ Created scripts/example.py")

        # Create references/ directory with example reference doc
        references_dir = skill_dir / 'references'
        references_dir.mkdir(exist_ok=True)
        example_reference = references_dir / 'api_reference.md'
        example_reference.write_text(EXAMPLE_REFERENCE.format(skill_title=skill_title))
        print("✅ Created references/api_reference.md")

        # Create assets/ directory with example asset placeholder
        assets_dir = skill_dir / 'assets'
        assets_dir.mkdir(exist_ok=True)
        example_asset = assets_dir / 'example_asset.txt'
        example_asset.write_text(EXAMPLE_ASSET)
        print("✅ Created assets/example_asset.txt")
    except Exception as e:
        # NOTE(review): a failure here leaves a partially-created skill_dir
        # behind; callers should clean up manually if they care.
        print(f"❌ Error creating resource directories: {e}")
        return None

    # Print next steps
    print(f"\n✅ Skill '{skill_name}' initialized successfully at {skill_dir}")
    print("\nNext steps:")
    print("1. Edit SKILL.md to complete the TODO items and update the description")
    print("2. Customize or delete the example files in scripts/, references/, and assets/")
    print("3. Run the validator when ready to check the skill structure")

    return skill_dir
|
||||
|
||||
|
||||
def main():
    """CLI entry point: `init_skill.py <skill-name> --path <path>`.

    Exits 0 on success, 1 on usage error or initialization failure.
    """
    # Strict positional parsing: argv[2] must literally be '--path'.
    if len(sys.argv) < 4 or sys.argv[2] != '--path':
        print("Usage: init_skill.py <skill-name> --path <path>")
        print("\nSkill name requirements:")
        print("  - Kebab-case identifier (e.g., 'my-data-analyzer')")
        print("  - Lowercase letters, digits, and hyphens only")
        print("  - Max 64 characters")
        print("  - Must match directory name exactly")
        print("\nExamples:")
        print("  init_skill.py my-new-skill --path skills/public")
        print("  init_skill.py my-api-helper --path skills/private")
        print("  init_skill.py custom-skill --path /custom/location")
        sys.exit(1)

    skill_name = sys.argv[1]
    path = sys.argv[3]

    print(f"🚀 Initializing skill: {skill_name}")
    print(f"   Location: {path}")
    print()

    result = init_skill(skill_name, path)

    # init_skill returns the created Path on success, None on any error.
    if result:
        sys.exit(0)
    else:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
136
plugins/skill-creator/skills/skill-creator/scripts/package_skill.py
Executable file
136
plugins/skill-creator/skills/skill-creator/scripts/package_skill.py
Executable file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Skill Packager - Creates a distributable .skill file of a skill folder
|
||||
|
||||
Usage:
|
||||
python utils/package_skill.py <path/to/skill-folder> [output-directory]
|
||||
|
||||
Example:
|
||||
python utils/package_skill.py skills/public/my-skill
|
||||
python utils/package_skill.py skills/public/my-skill ./dist
|
||||
"""
|
||||
|
||||
import fnmatch
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from quick_validate import validate_skill
|
||||
|
||||
# Patterns to exclude when packaging skills.
|
||||
EXCLUDE_DIRS = {"__pycache__", "node_modules"}
|
||||
EXCLUDE_GLOBS = {"*.pyc"}
|
||||
EXCLUDE_FILES = {".DS_Store"}
|
||||
# Directories excluded only at the skill root (not when nested deeper).
|
||||
ROOT_EXCLUDE_DIRS = {"evals"}
|
||||
|
||||
|
||||
def should_exclude(rel_path: Path) -> bool:
|
||||
"""Check if a path should be excluded from packaging."""
|
||||
parts = rel_path.parts
|
||||
if any(part in EXCLUDE_DIRS for part in parts):
|
||||
return True
|
||||
# rel_path is relative to skill_path.parent, so parts[0] is the skill
|
||||
# folder name and parts[1] (if present) is the first subdir.
|
||||
if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS:
|
||||
return True
|
||||
name = rel_path.name
|
||||
if name in EXCLUDE_FILES:
|
||||
return True
|
||||
return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS)
|
||||
|
||||
|
||||
def package_skill(skill_path, output_dir=None):
    """
    Package a skill folder into a .skill file.

    Runs quick validation first, then zips the folder (excluding build
    artifacts via should_exclude). Errors are printed and signalled by a
    None return, not raised.

    Args:
        skill_path: Path to the skill folder
        output_dir: Optional output directory for the .skill file (defaults to current directory)

    Returns:
        Path to the created .skill file, or None if error
    """
    skill_path = Path(skill_path).resolve()

    # Validate skill folder exists
    if not skill_path.exists():
        print(f"❌ Error: Skill folder not found: {skill_path}")
        return None

    if not skill_path.is_dir():
        print(f"❌ Error: Path is not a directory: {skill_path}")
        return None

    # Validate SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"❌ Error: SKILL.md not found in {skill_path}")
        return None

    # Run validation before packaging (validate_skill comes from quick_validate).
    print("🔍 Validating skill...")
    valid, message = validate_skill(skill_path)
    if not valid:
        print(f"❌ Validation failed: {message}")
        print("   Please fix the validation errors before packaging.")
        return None
    print(f"✅ {message}\n")

    # Determine output location
    skill_name = skill_path.name
    if output_dir:
        output_path = Path(output_dir).resolve()
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()

    skill_filename = output_path / f"{skill_name}.skill"

    # Create the .skill file (zip format)
    try:
        with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk through the skill directory, excluding build artifacts
            for file_path in skill_path.rglob('*'):
                if not file_path.is_file():
                    continue
                # Archive names are relative to the skill's parent so the zip
                # unpacks into a single "<skill-name>/" top-level directory.
                arcname = file_path.relative_to(skill_path.parent)
                if should_exclude(arcname):
                    print(f"  Skipped: {arcname}")
                    continue
                zipf.write(file_path, arcname)
                print(f"  Added: {arcname}")

        print(f"\n✅ Successfully packaged skill to: {skill_filename}")
        return skill_filename

    except Exception as e:
        print(f"❌ Error creating .skill file: {e}")
        return None
|
||||
|
||||
|
||||
def main():
    """CLI entry point: package a skill folder into a .skill archive.

    Exits 0 on success, 1 on usage error or packaging failure.
    """
    if len(sys.argv) < 2:
        print("Usage: python utils/package_skill.py <path/to/skill-folder> [output-directory]")
        print("\nExample:")
        print("  python utils/package_skill.py skills/public/my-skill")
        print("  python utils/package_skill.py skills/public/my-skill ./dist")
        sys.exit(1)

    skill_path = sys.argv[1]
    # Second positional argument is the optional output directory.
    output_dir = sys.argv[2] if len(sys.argv) > 2 else None

    print(f"📦 Packaging skill: {skill_path}")
    if output_dir:
        print(f"   Output directory: {output_dir}")
    print()

    result = package_skill(skill_path, output_dir)

    # package_skill returns the archive path on success, None on failure.
    if result:
        sys.exit(0)
    else:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
279
plugins/skill-creator/skills/skill-creator/scripts/prepare_eval.py
Executable file
279
plugins/skill-creator/skills/skill-creator/scripts/prepare_eval.py
Executable file
@@ -0,0 +1,279 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Prepare environment for running a skill eval.
|
||||
|
||||
Usage:
|
||||
prepare_eval.py <skill-path> <eval-id> --output-dir <dir> [--no-skill]
|
||||
|
||||
Examples:
|
||||
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/with-skill
|
||||
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/without-skill --no-skill
|
||||
|
||||
Options:
|
||||
<skill-path> Path to the skill directory
|
||||
<eval-id> Index of the eval in evals/evals.json (0-based)
|
||||
--output-dir Directory to prepare for the eval run
|
||||
--no-skill If set, do not copy the skill (for baseline comparison)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def is_writable(path: Path) -> bool:
    """Return True if a file can be created (and removed) inside *path*.

    Probes by touching and deleting a throwaway ".write_test" file; any
    OS-level failure is interpreted as "not writable".
    """
    probe = path / ".write_test"
    try:
        probe.touch()
        probe.unlink()
    except (OSError, PermissionError):
        return False
    return True
|
||||
|
||||
|
||||
def load_evals(skill_path: Path) -> list:
    """Load the eval definitions from <skill>/evals/evals.json.

    Accepts either a bare JSON list or an object wrapping the list under an
    "evals" key. Raises FileNotFoundError if the file is missing and
    ValueError if the JSON has an unexpected shape.
    """
    evals_file = skill_path / "evals" / "evals.json"
    if not evals_file.exists():
        raise FileNotFoundError(f"Evals file not found: {evals_file}")

    data = json.loads(evals_file.read_text())

    # Handle both formats: plain list or wrapped in object with "evals" key
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and "evals" in data:
        return data["evals"]
    raise ValueError(
        f"Expected evals.json to contain a list or object with 'evals' key, "
        f"got {type(data).__name__}"
    )
|
||||
|
||||
|
||||
def get_eval(evals: list, eval_id: int) -> dict:
    """Return the eval at *eval_id* (0-based index).

    Raises:
        IndexError: if *evals* is empty or *eval_id* is outside [0, len-1].
    """
    # Fix: with an empty list the old message rendered as "(0--1)", which is
    # confusing; report the empty case explicitly instead.
    if not evals:
        raise IndexError(f"Eval ID {eval_id} out of range: no evals are defined")
    if eval_id < 0 or eval_id >= len(evals):
        raise IndexError(f"Eval ID {eval_id} out of range (0-{len(evals) - 1})")
    return evals[eval_id]
|
||||
|
||||
|
||||
def normalize_eval(eval_data: dict) -> dict:
    """Normalize a raw eval record into {"prompt", "files", "assertions"}.

    Handles both the design doc format (prompt, files, assertions) and the
    gym format (query, files, expected_behavior). Raises ValueError when
    neither a prompt nor a query is present.
    """
    # Prompt may live under "prompt" or "query"; a falsy prompt falls through.
    prompt = eval_data.get("prompt") or eval_data.get("query")
    if not prompt:
        raise ValueError("Eval must have either 'prompt' or 'query' field")

    files = eval_data.get("files", [])

    # Assertions may be given directly, or derived from "expected_behavior"
    # entries (plain strings, or dicts describing a structured assertion).
    assertions = eval_data.get("assertions")
    if assertions is None:
        assertions = []
        for raw in eval_data.get("expected_behavior", []):
            if isinstance(raw, str):
                assertions.append(raw)
            elif isinstance(raw, dict):
                # Flatten a structured assertion into a readable string.
                kind = raw.get("assertion", "unknown")
                details = [f"{k}={v}" for k, v in raw.items() if k != "assertion"]
                assertions.append(" - ".join([f"Assertion type: {kind}", *details]))

    return {
        "prompt": prompt,
        "files": files,
        "assertions": assertions
    }
|
||||
|
||||
|
||||
def prepare_eval(skill_path: Path, eval_id: int, output_dir: Path, no_skill: bool = False) -> dict:
    """
    Prepare the environment for running an eval.

    Stages the eval's input files into <output_dir>/inputs/, creates an
    empty <output_dir>/outputs/, optionally copies the skill into
    <output_dir>/skill/, and writes eval_metadata.json describing the run.

    Args:
        skill_path: Path to the skill directory
        eval_id: Index of the eval in evals.json
        output_dir: Directory to prepare for the eval run
        no_skill: If True, do not copy the skill (for baseline comparison)

    Returns:
        Dictionary with eval metadata

    Raises:
        FileNotFoundError: if the skill directory or SKILL.md is missing.
        IndexError / ValueError: propagated from get_eval / normalize_eval.
    """
    skill_path = Path(skill_path).resolve()
    output_dir = Path(output_dir).resolve()

    # Validate skill path
    if not skill_path.exists():
        raise FileNotFoundError(f"Skill directory not found: {skill_path}")

    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        raise FileNotFoundError(f"SKILL.md not found in {skill_path}")

    # Load and get the specific eval
    evals = load_evals(skill_path)
    eval_data = get_eval(evals, eval_id)
    normalized = normalize_eval(eval_data)

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create inputs directory and stage input files
    inputs_dir = output_dir / "inputs"
    inputs_dir.mkdir(exist_ok=True)

    staged_files = []
    for file_ref in normalized["files"]:
        # Resolution order: evals/files/, then evals/, then the skill root.
        source = skill_path / "evals" / "files" / file_ref
        if not source.exists():
            # Try relative to evals/ directly
            source = skill_path / "evals" / file_ref
            if not source.exists():
                # Try relative to skill root
                source = skill_path / file_ref

        if source.exists():
            # NOTE(review): only the basename of file_ref is kept, so two
            # refs with the same basename would collide in inputs/.
            dest = inputs_dir / Path(file_ref).name
            if source.is_file():
                shutil.copy2(source, dest)
            else:
                shutil.copytree(source, dest, dirs_exist_ok=True)
            staged_files.append(str(dest))
            print(f"  Staged: {file_ref} -> {dest}")
        else:
            # Missing inputs are a warning, not an error: the run proceeds.
            print(f"  Warning: File not found: {file_ref}")

    # Create outputs directory
    outputs_dir = output_dir / "outputs"
    outputs_dir.mkdir(exist_ok=True)

    # Copy skill if not --no-skill
    skill_copy_path = None
    if not no_skill:
        skill_copy_path = output_dir / "skill"
        # Replace any stale copy from a previous run.
        if skill_copy_path.exists():
            shutil.rmtree(skill_copy_path)
        shutil.copytree(skill_path, skill_copy_path, dirs_exist_ok=True)
        skill_copy_path = str(skill_copy_path)
        print(f"  Copied skill to: {skill_copy_path}")

    # Build metadata (skill_path stays None in --no-skill mode)
    metadata = {
        "eval_id": eval_id,
        "prompt": normalized["prompt"],
        "assertions": normalized["assertions"],
        "input_files": staged_files,
        "skill_path": skill_copy_path,
        "output_dir": str(output_dir),
        "inputs_dir": str(inputs_dir),
        "outputs_dir": str(outputs_dir),
        "no_skill": no_skill,
        "original_skill_path": str(skill_path)
    }

    # Write metadata file
    metadata_path = output_dir / "eval_metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)
    print(f"  Wrote: {metadata_path}")

    return metadata
|
||||
|
||||
|
||||
def determine_workspace(skill_path: Path) -> Path:
    """
    Determine the appropriate workspace location.

    If skill directory is writable, use <skill>/workspace/
    Otherwise, use <project-root>/<skill-name>-workspace/
    """
    skill_path = Path(skill_path).resolve()

    # Writable skills keep the workspace in-tree.
    if is_writable(skill_path):
        return skill_path / "workspace"

    # Walk upward looking for a git repository root. The filesystem root
    # itself is never tested (it is the last entry of the candidate chain).
    candidates = [skill_path, *skill_path.parents]
    project_root = next(
        (ancestor for ancestor in candidates[:-1] if (ancestor / ".git").exists()),
        None,
    )
    if project_root is None:
        # No repository found anywhere: fall back to the skill's parent.
        project_root = skill_path.parent

    return project_root / f"{skill_path.name}-workspace"
|
||||
|
||||
|
||||
def main():
    """CLI entry point: stage an eval run per the module docstring's usage.

    Exits 1 on usage errors or any preparation failure.
    """
    # Parse arguments (manual parsing; argparse is deliberately not used here)
    args = sys.argv[1:]

    if len(args) < 4 or "--output-dir" not in args:
        print(__doc__)
        sys.exit(1)

    # Find positional arguments and flags
    skill_path = args[0]
    # NOTE(review): a non-integer eval id raises ValueError here and is
    # reported via the generic handler below.
    eval_id = int(args[1])
    no_skill = "--no-skill" in args

    # Find --output-dir value
    output_dir_idx = args.index("--output-dir")
    if output_dir_idx + 1 >= len(args):
        print("Error: --output-dir requires a value")
        sys.exit(1)
    output_dir = args[output_dir_idx + 1]

    print(f"Preparing eval {eval_id} for skill: {skill_path}")
    print(f"Output directory: {output_dir}")
    if no_skill:
        print("Mode: without skill (baseline)")
    else:
        print("Mode: with skill")
    print()

    try:
        metadata = prepare_eval(
            skill_path=Path(skill_path),
            eval_id=eval_id,
            output_dir=Path(output_dir),
            no_skill=no_skill
        )

        print()
        print("Eval prepared successfully!")
        # Truncate long prompts to keep the summary line readable.
        print(f"  Prompt: {metadata['prompt'][:60]}..." if len(metadata['prompt']) > 60 else f"  Prompt: {metadata['prompt']}")
        print(f"  Assertions: {len(metadata['assertions'])}")
        print(f"  Input files: {len(metadata['input_files'])}")

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
103
plugins/skill-creator/skills/skill-creator/scripts/quick_validate.py
Executable file
103
plugins/skill-creator/skills/skill-creator/scripts/quick_validate.py
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick validation script for skills - minimal version
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
|
||||
def validate_skill(skill_path):
    """Basic validation of a skill.

    Checks that SKILL.md exists, starts with parseable YAML frontmatter,
    uses only allowed frontmatter keys, and that name / description /
    compatibility obey the spec's format and length limits.

    Args:
        skill_path: Path (or string) to the skill directory.

    Returns:
        Tuple of (valid: bool, message: str); message is the first failure
        found, or a success message.
    """
    skill_path = Path(skill_path)

    # Check SKILL.md exists
    skill_md = skill_path / 'SKILL.md'
    if not skill_md.exists():
        return False, "SKILL.md not found"

    # Read and validate frontmatter
    content = skill_md.read_text()
    if not content.startswith('---'):
        return False, "No YAML frontmatter found"

    # Extract frontmatter: everything between the opening "---" line and the
    # next "---" (non-greedy, so only the first block is captured).
    match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
    if not match:
        return False, "Invalid frontmatter format"

    frontmatter_text = match.group(1)

    # Parse YAML frontmatter
    try:
        frontmatter = yaml.safe_load(frontmatter_text)
        if not isinstance(frontmatter, dict):
            return False, "Frontmatter must be a YAML dictionary"
    except yaml.YAMLError as e:
        return False, f"Invalid YAML in frontmatter: {e}"

    # Define allowed properties
    ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'}

    # Check for unexpected properties (excluding nested keys under metadata)
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )

    # Check required fields
    if 'name' not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if 'description' not in frontmatter:
        return False, "Missing 'description' in frontmatter"

    # Extract name for validation. NOTE: an empty/whitespace-only name passes
    # these format checks (only the presence of the key is required above).
    name = frontmatter.get('name', '')
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    if name:
        # Check naming convention (kebab-case: lowercase with hyphens)
        if not re.match(r'^[a-z0-9-]+$', name):
            return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
        if name.startswith('-') or name.endswith('-') or '--' in name:
            return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
        # Check name length (max 64 characters per spec)
        if len(name) > 64:
            return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."

    # Extract and validate description
    description = frontmatter.get('description', '')
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if description:
        # Check for angle brackets
        if '<' in description or '>' in description:
            return False, "Description cannot contain angle brackets (< or >)"
        # Check description length (max 1024 characters per spec)
        if len(description) > 1024:
            return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."

    # Validate compatibility field if present (optional)
    compatibility = frontmatter.get('compatibility', '')
    if compatibility:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."

    return True, "Skill is valid!"


if __name__ == "__main__":
    # CLI usage: python quick_validate.py <skill_directory>
    # Exit status mirrors the validation result (0 = valid, 1 = invalid).
    if len(sys.argv) != 2:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)

    valid, message = validate_skill(sys.argv[1])
    print(message)
    sys.exit(0 if valid else 1)
|
||||
411
plugins/skill-creator/skills/skill-creator/scripts/validate_json.py
Executable file
411
plugins/skill-creator/skills/skill-creator/scripts/validate_json.py
Executable file
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate JSON files produced by skill-creator-edge.
|
||||
|
||||
Supports validation for:
|
||||
- evals.json: Skill evaluation definitions
|
||||
- grading.json: Grader output
|
||||
- benchmark.json: Benchmark results
|
||||
- metrics.json: Executor metrics
|
||||
- timing.json: Timing data
|
||||
- history.json: Improve mode version history
|
||||
- comparison.json: Blind comparator output
|
||||
- analysis.json: Post-hoc analyzer output
|
||||
|
||||
Usage:
|
||||
python validate_json.py <file_path> [--type <type>]
|
||||
|
||||
Examples:
|
||||
python validate_json.py workspace/benchmark.json
|
||||
python validate_json.py evals/evals.json --type evals
|
||||
python validate_json.py run-1/grading.json --type grading
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Schema definitions as validation rules. Each top-level key names a JSON
# artifact type; "required" lists mandatory top-level fields, and the other
# nested keys describe sub-objects checked by the validate_* functions below.
SCHEMAS = {
    # evals.json: skill evaluation definitions
    "evals": {
        "required": ["skill_name", "evals"],
        "evals_item": {
            "required": ["id", "prompt"],
            "optional": ["expected_output", "files", "expectations"]
        }
    },
    # grading.json: grader output
    "grading": {
        "required": ["expectations", "summary"],
        "summary": {
            "required": ["passed", "failed", "total", "pass_rate"]
        },
        "expectations_item": {
            "required": ["text", "passed", "evidence"]
        }
    },
    # benchmark.json: aggregated benchmark results
    "benchmark": {
        "required": ["metadata", "runs", "run_summary"],
        "metadata": {
            "required": ["skill_name", "timestamp", "runs_per_configuration"]
        },
        "runs_item": {
            "required": ["eval_id", "configuration", "run_number", "result"]
        },
        "run_summary_config": {
            "required": ["pass_rate", "time_seconds", "tokens"]
        }
    },
    # metrics.json: executor metrics
    "metrics": {
        "required": ["tool_calls", "total_tool_calls"],
        "optional": ["total_steps", "files_created", "errors_encountered",
                     "output_chars", "transcript_chars"]
    },
    # timing.json: timing data
    "timing": {
        "required": [],  # All fields optional but should have at least one
        "optional": ["executor_start", "executor_end", "executor_duration_seconds",
                     "grader_start", "grader_end", "grader_duration_seconds",
                     "total_duration_seconds"]
    },
    # history.json: improve-mode version history
    "history": {
        "required": ["started_at", "skill_name", "current_best", "iterations"],
        "iterations_item": {
            "required": ["version", "expectation_pass_rate", "grading_result", "is_current_best"]
        }
    },
    # comparison.json: blind comparator output
    "comparison": {
        "required": ["winner", "reasoning", "rubric", "output_quality"],
        "rubric_side": {
            "required": ["content", "structure", "overall_score"]
        }
    },
    # analysis.json: post-hoc analyzer output
    "analysis": {
        "required": ["comparison_summary", "winner_strengths", "loser_weaknesses",
                     "improvement_suggestions"],
        "improvement_item": {
            "required": ["priority", "category", "suggestion"]
        }
    }
}
|
||||
|
||||
|
||||
def infer_type(file_path: Path) -> str | None:
|
||||
"""Infer JSON type from filename."""
|
||||
name = file_path.name.lower()
|
||||
|
||||
if name == "evals.json":
|
||||
return "evals"
|
||||
elif name == "grading.json":
|
||||
return "grading"
|
||||
elif name == "benchmark.json":
|
||||
return "benchmark"
|
||||
elif name == "metrics.json":
|
||||
return "metrics"
|
||||
elif name == "timing.json":
|
||||
return "timing"
|
||||
elif name == "history.json":
|
||||
return "history"
|
||||
elif name.startswith("comparison"):
|
||||
return "comparison"
|
||||
elif name == "analysis.json":
|
||||
return "analysis"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def validate_required_fields(data: dict, required: list[str], path: str = "") -> list[str]:
    """Return one error string per required field absent from *data*.

    *path* is a prefix (e.g. "summary: ") identifying where in the document
    the check is being performed.
    """
    return [
        f"{path}Missing required field: {field}"
        for field in required
        if field not in data
    ]
|
||||
|
||||
|
||||
def validate_evals(data: dict) -> list[str]:
    """Validate evals.json structure against SCHEMAS["evals"].

    Returns a list of error strings; an empty list means the data is valid.
    """
    errors = validate_required_fields(data, SCHEMAS["evals"]["required"])

    if "evals" in data:
        if not isinstance(data["evals"], list):
            errors.append("'evals' must be a list")
        else:
            for i, eval_item in enumerate(data["evals"]):
                # Fix: guard against non-object entries. Previously a string
                # item passed the `in` checks via substring semantics and a
                # non-container item raised TypeError.
                if not isinstance(eval_item, dict):
                    errors.append(f"evals[{i}]: must be an object")
                    continue

                item_errors = validate_required_fields(
                    eval_item,
                    SCHEMAS["evals"]["evals_item"]["required"],
                    f"evals[{i}]: "
                )
                errors.extend(item_errors)

                # Validate id is an integer
                if "id" in eval_item and not isinstance(eval_item["id"], int):
                    errors.append(f"evals[{i}]: 'id' must be an integer")

                # Validate expectations is a list of strings
                if "expectations" in eval_item:
                    if not isinstance(eval_item["expectations"], list):
                        errors.append(f"evals[{i}]: 'expectations' must be a list")
                    else:
                        for j, exp in enumerate(eval_item["expectations"]):
                            if not isinstance(exp, str):
                                errors.append(f"evals[{i}].expectations[{j}]: must be a string")

    return errors
|
||||
|
||||
|
||||
def validate_grading(data: dict) -> list[str]:
    """Validate grading.json structure against SCHEMAS["grading"].

    Returns a list of error strings; an empty list means the data is valid.
    """
    errors = validate_required_fields(data, SCHEMAS["grading"]["required"])

    if "summary" in data:
        # Fix: guard against a non-object summary; previously a non-dict
        # value produced substring-based false checks or a TypeError.
        if not isinstance(data["summary"], dict):
            errors.append("'summary' must be an object")
        else:
            summary_errors = validate_required_fields(
                data["summary"],
                SCHEMAS["grading"]["summary"]["required"],
                "summary: "
            )
            errors.extend(summary_errors)

            # Validate pass_rate is between 0 and 1
            if "pass_rate" in data["summary"]:
                pr = data["summary"]["pass_rate"]
                if not isinstance(pr, (int, float)) or pr < 0 or pr > 1:
                    errors.append("summary.pass_rate must be a number between 0 and 1")

    if "expectations" in data:
        if not isinstance(data["expectations"], list):
            errors.append("'expectations' must be a list")
        else:
            for i, exp in enumerate(data["expectations"]):
                # Fix: same non-object guard for individual expectation items.
                if not isinstance(exp, dict):
                    errors.append(f"expectations[{i}]: must be an object")
                    continue

                exp_errors = validate_required_fields(
                    exp,
                    SCHEMAS["grading"]["expectations_item"]["required"],
                    f"expectations[{i}]: "
                )
                errors.extend(exp_errors)

                if "passed" in exp and not isinstance(exp["passed"], bool):
                    errors.append(f"expectations[{i}].passed must be a boolean")

    return errors
|
||||
|
||||
|
||||
def validate_benchmark(data: dict) -> list[str]:
    """Validate benchmark.json structure.

    Checks the top-level required fields, the ``metadata`` object, every
    entry of ``runs`` (including that ``configuration`` names one of the
    two benchmark arms), and the per-configuration ``run_summary`` blocks.

    Returns a list of human-readable error strings; empty means valid.
    """
    benchmark_schema = SCHEMAS["benchmark"]
    findings = validate_required_fields(data, benchmark_schema["required"])

    if "metadata" in data:
        findings.extend(
            validate_required_fields(
                data["metadata"], benchmark_schema["metadata"]["required"], "metadata: "
            )
        )

    if "runs" in data:
        runs = data["runs"]
        if not isinstance(runs, list):
            findings.append("'runs' must be a list")
        else:
            run_required = benchmark_schema["runs_item"]["required"]
            for idx, run in enumerate(runs):
                findings.extend(
                    validate_required_fields(run, run_required, f"runs[{idx}]: ")
                )
                # Each run must belong to one of the two benchmark arms.
                if "configuration" in run and run["configuration"] not in (
                    "with_skill",
                    "without_skill",
                ):
                    findings.append(
                        f"runs[{idx}].configuration must be 'with_skill' or 'without_skill'"
                    )

    if "run_summary" in data:
        run_summary = data["run_summary"]
        for config in ("with_skill", "without_skill"):
            if config not in run_summary:
                continue
            findings.extend(
                validate_required_fields(
                    run_summary[config],
                    benchmark_schema["run_summary_config"]["required"],
                    f"run_summary.{config}: ",
                )
            )

    return findings
|
||||
|
||||
|
||||
def validate_metrics(data: dict) -> list[str]:
    """Validate metrics.json structure.

    Checks the top-level required fields and that, when present,
    ``tool_calls`` is an object and ``total_tool_calls`` is an integer.

    Returns a list of human-readable error strings; empty means valid.
    """
    issues = validate_required_fields(data, SCHEMAS["metrics"]["required"])

    # (field name, expected Python type, error message) triples.
    typed_fields = (
        ("tool_calls", dict, "'tool_calls' must be an object"),
        ("total_tool_calls", int, "'total_tool_calls' must be an integer"),
    )
    for field, expected_type, message in typed_fields:
        if field in data and not isinstance(data[field], expected_type):
            issues.append(message)

    return issues
|
||||
|
||||
|
||||
def validate_timing(data: dict) -> list[str]:
    """Validate timing.json structure.

    timing.json has no strictly required keys, but an entirely empty file
    is flagged; any duration field that is present must be numeric.

    Returns a list of human-readable error strings; empty means valid.
    """
    problems: list[str] = []

    # Require at least one of the schema's optional timing fields.
    optional_fields = SCHEMAS["timing"]["optional"]
    if not any(name in data for name in optional_fields):
        problems.append("timing.json should have at least one timing field")

    duration_keys = (
        "executor_duration_seconds",
        "grader_duration_seconds",
        "total_duration_seconds",
    )
    for key in duration_keys:
        if key in data and not isinstance(data[key], (int, float)):
            problems.append(f"'{key}' must be a number")

    return problems
|
||||
|
||||
|
||||
def validate_history(data: dict) -> list[str]:
    """Validate history.json structure.

    Checks the top-level required fields and, when present, that
    ``iterations`` is a list whose entries carry their required fields and
    a recognized ``grading_result`` value.

    Returns a list of human-readable error strings; empty means valid.
    """
    findings = validate_required_fields(data, SCHEMAS["history"]["required"])

    if "iterations" in data:
        iterations = data["iterations"]
        if not isinstance(iterations, list):
            findings.append("'iterations' must be a list")
        else:
            # NOTE: valid_results is interpolated into the error message
            # below, so its list form and ordering are part of the output.
            valid_results = ["baseline", "won", "lost", "tie"]
            item_required = SCHEMAS["history"]["iterations_item"]["required"]
            for idx, entry in enumerate(iterations):
                findings.extend(
                    validate_required_fields(entry, item_required, f"iterations[{idx}]: ")
                )
                if "grading_result" in entry and entry["grading_result"] not in valid_results:
                    findings.append(
                        f"iterations[{idx}].grading_result must be one of: {valid_results}"
                    )

    return findings
|
||||
|
||||
|
||||
def validate_comparison(data: dict) -> list[str]:
    """Validate comparison.json structure.

    Checks the top-level required fields, that ``winner`` is one of the
    three recognized verdicts, and that each present rubric side carries
    its required fields.

    Returns a list of human-readable error strings; empty means valid.
    """
    findings = validate_required_fields(data, SCHEMAS["comparison"]["required"])

    if "winner" in data and data["winner"] not in ("A", "B", "TIE"):
        findings.append("'winner' must be 'A', 'B', or 'TIE'")

    if "rubric" in data:
        rubric = data["rubric"]
        side_required = SCHEMAS["comparison"]["rubric_side"]["required"]
        for side in ("A", "B"):
            if side not in rubric:
                continue
            findings.extend(
                validate_required_fields(rubric[side], side_required, f"rubric.{side}: ")
            )

    return findings
|
||||
|
||||
|
||||
def validate_analysis(data: dict) -> list[str]:
    """Validate analysis.json structure.

    Checks the top-level required fields and, when present, that
    ``improvement_suggestions`` is a list whose entries carry their
    required fields and a recognized ``priority`` level.

    Returns a list of human-readable error strings; empty means valid.
    """
    report = validate_required_fields(data, SCHEMAS["analysis"]["required"])

    if "improvement_suggestions" in data:
        suggestions = data["improvement_suggestions"]
        if not isinstance(suggestions, list):
            report.append("'improvement_suggestions' must be a list")
        else:
            item_required = SCHEMAS["analysis"]["improvement_item"]["required"]
            for idx, suggestion in enumerate(suggestions):
                report.extend(
                    validate_required_fields(
                        suggestion, item_required, f"improvement_suggestions[{idx}]: "
                    )
                )
                if "priority" in suggestion and suggestion["priority"] not in (
                    "high",
                    "medium",
                    "low",
                ):
                    report.append(
                        f"improvement_suggestions[{idx}].priority must be 'high', 'medium', or 'low'"
                    )

    return report
|
||||
|
||||
|
||||
# Dispatch table mapping each supported JSON artifact type (also the
# --type CLI choices) to its validator function. Each validator takes the
# parsed JSON dict and returns a list of error strings.
VALIDATORS = {
    "evals": validate_evals,
    "grading": validate_grading,
    "benchmark": validate_benchmark,
    "metrics": validate_metrics,
    "timing": validate_timing,
    "history": validate_history,
    "comparison": validate_comparison,
    "analysis": validate_analysis,
}
|
||||
|
||||
|
||||
def validate_file(file_path: Path, json_type: str | None = None) -> tuple[bool, list[str]]:
    """
    Validate a JSON file against its expected schema.

    Args:
        file_path: Path to the JSON file to check.
        json_type: One of the VALIDATORS keys; when None, the type is
            inferred from the filename via infer_type().

    Returns:
        (is_valid, errors) tuple; errors is empty when the file is valid.
    """
    # Check file exists
    if not file_path.exists():
        return False, [f"File not found: {file_path}"]

    # Load JSON. The file is read as UTF-8 (JSON's specified encoding,
    # RFC 8259) rather than the platform default. OSError covers the race
    # where the file disappears between the exists() check and open().
    try:
        with open(file_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return False, [f"Invalid JSON: {e}"]
    except OSError as e:
        return False, [f"Could not read file: {e}"]

    # Infer type if not provided
    if json_type is None:
        json_type = infer_type(file_path)

    if json_type is None:
        return False, ["Could not infer JSON type from filename. Use --type to specify."]

    if json_type not in VALIDATORS:
        return False, [f"Unknown JSON type: {json_type}. Valid types: {list(VALIDATORS.keys())}"]

    # Run the type-specific validator.
    errors = VALIDATORS[json_type](data)
    return len(errors) == 0, errors
|
||||
|
||||
|
||||
def main():
    """CLI entry point: validate one JSON file and exit 0 (valid) or 1."""
    arg_parser = argparse.ArgumentParser(
        description="Validate JSON files produced by skill-creator-edge"
    )
    arg_parser.add_argument("file", type=Path, help="Path to the JSON file to validate")
    arg_parser.add_argument(
        "--type", "-t",
        choices=list(VALIDATORS.keys()),
        help="JSON type (inferred from filename if not specified)"
    )
    opts = arg_parser.parse_args()

    ok, problems = validate_file(opts.file, opts.type)

    if not ok:
        print(f"✗ {opts.file} has {len(problems)} error(s):")
        for problem in problems:
            print(f"  - {problem}")
        sys.exit(1)

    print(f"✓ {opts.file} is valid")
    sys.exit(0)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user