Add skill-creator plugin

This commit is contained in:
Kenshiro Nakagawa
2026-02-17 17:02:51 -08:00
parent 261ce4fba4
commit 30975e61e3
20 changed files with 4879 additions and 0 deletions

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
Aggregate individual run results into benchmark summary statistics.
Reads grading.json files from run directories and produces:
- run_summary with mean, stddev, min, max for each metric
- delta between with_skill and without_skill configurations
Usage:
python aggregate_benchmark.py <benchmark_dir>
Example:
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
The script expects this directory structure:
<benchmark_dir>/
└── runs/
└── eval-N/
├── with_skill/
│ ├── run-1/grading.json
│ ├── run-2/grading.json
│ └── run-3/grading.json
└── without_skill/
├── run-1/grading.json
├── run-2/grading.json
└── run-3/grading.json
"""
import argparse
import json
import math
import sys
from datetime import datetime, timezone
from pathlib import Path
def calculate_stats(values: list[float]) -> dict:
"""Calculate mean, stddev, min, max for a list of values."""
if not values:
return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
n = len(values)
mean = sum(values) / n
if n > 1:
variance = sum((x - mean) ** 2 for x in values) / (n - 1)
stddev = math.sqrt(variance)
else:
stddev = 0.0
return {
"mean": round(mean, 4),
"stddev": round(stddev, 4),
"min": round(min(values), 4),
"max": round(max(values), 4)
}
def _dir_index(dir_path: Path):
    """Return the integer after the first '-' in a name like 'eval-3' or
    'run-1', or None when the suffix is missing or non-numeric (e.g. a
    stray 'eval-backup' or 'eval-' directory)."""
    parts = dir_path.name.split("-")
    try:
        return int(parts[1])
    except (IndexError, ValueError):
        return None


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Walks <benchmark_dir>/runs/eval-N/{with_skill,without_skill}/run-M/
    and reads each grading.json. Missing or malformed grading files, and
    eval-*/run-* directories whose numeric suffix cannot be parsed, are
    reported on stdout and skipped instead of aborting the aggregation
    (previously a stray directory like 'eval-backup' raised ValueError).

    Returns dict with structure:
    {
        "with_skill": [
            {"eval_id": 1, "run_number": 1, "pass_rate": 0.85, ...},
            ...
        ],
        "without_skill": [...]
    }
    """
    runs_dir = benchmark_dir / "runs"
    results = {"with_skill": [], "without_skill": []}
    if not runs_dir.exists():
        print(f"Runs directory not found: {runs_dir}")
        return results
    for eval_dir in sorted(runs_dir.glob("eval-*")):
        eval_id = _dir_index(eval_dir)
        if eval_id is None:
            print(f"Warning: skipping unrecognized directory {eval_dir}")
            continue
        for config in ("with_skill", "without_skill"):
            config_dir = eval_dir / config
            if not config_dir.exists():
                continue
            for run_dir in sorted(config_dir.glob("run-*")):
                run_number = _dir_index(run_dir)
                if run_number is None:
                    print(f"Warning: skipping unrecognized directory {run_dir}")
                    continue
                grading_file = run_dir / "grading.json"
                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue
                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue
                summary = grading.get("summary", {})
                metrics = grading.get("execution_metrics", {})
                notes_summary = grading.get("user_notes_summary", {})
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                    # Wall-clock duration for the whole run, if recorded.
                    "time_seconds": grading.get("timing", {}).get("total_duration_seconds", 0.0),
                    "tool_calls": metrics.get("total_tool_calls", 0),
                    # Placeholder: no true token count is recorded, so the
                    # output character count stands in for token usage.
                    "tokens": metrics.get("output_chars", 0),
                    "errors": metrics.get("errors_encountered", 0),
                    "expectations": grading.get("expectations", []),
                    # Flatten the reviewer-facing note categories into one list.
                    "notes": (
                        notes_summary.get("uncertainties", [])
                        + notes_summary.get("needs_review", [])
                        + notes_summary.get("workarounds", [])
                    ),
                }
                results[config].append(result)
    return results
def aggregate_results(results: dict) -> dict:
    """Aggregate per-run results into per-configuration summary statistics.

    Args:
        results: Mapping with "with_skill"/"without_skill" run lists, as
            produced by load_run_results().

    Returns:
        run_summary dict holding pass_rate/time_seconds/tokens statistics
        for each configuration, plus a "delta" entry of signed, formatted
        strings (with_skill mean minus without_skill mean).
    """
    summary: dict = {}
    for config in ("with_skill", "without_skill"):
        runs = results.get(config, [])
        if runs:
            summary[config] = {
                "pass_rate": calculate_stats([r["pass_rate"] for r in runs]),
                "time_seconds": calculate_stats([r["time_seconds"] for r in runs]),
                "tokens": calculate_stats([r.get("tokens", 0) for r in runs]),
            }
        else:
            # No runs for this configuration: all-zero placeholders.
            summary[config] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }

    def mean_of(config: str, metric: str) -> float:
        # Defensive lookup mirroring the original chained .get() calls.
        return summary.get(config, {}).get(metric, {}).get("mean", 0)

    summary["delta"] = {
        "pass_rate": f"{mean_of('with_skill', 'pass_rate') - mean_of('without_skill', 'pass_rate'):+.2f}",
        "time_seconds": f"{mean_of('with_skill', 'time_seconds') - mean_of('without_skill', 'time_seconds'):+.1f}",
        "tokens": f"{mean_of('with_skill', 'tokens') - mean_of('without_skill', 'tokens'):+.0f}",
    }
    return summary
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """Assemble the complete benchmark.json payload for a benchmark dir.

    Loads every run's grading results, flattens them into a runs array,
    aggregates summary statistics, and wraps everything with metadata.
    Model names are emitted as placeholders for the analyzer to fill in.
    """
    results = load_run_results(benchmark_dir)

    # Flatten per-config result dicts into the benchmark.json runs array.
    flat_runs = []
    for config in ("with_skill", "without_skill"):
        for r in results.get(config, []):
            flat_runs.append({
                "eval_id": r["eval_id"],
                "configuration": config,
                "run_number": r["run_number"],
                "result": {
                    "pass_rate": r["pass_rate"],
                    "passed": r["passed"],
                    "failed": r["failed"],
                    "total": r["total"],
                    "time_seconds": r["time_seconds"],
                    "tokens": r.get("tokens", 0),
                    "tool_calls": r.get("tool_calls", 0),
                    "errors": r.get("errors", 0),
                },
                "expectations": r["expectations"],
                "notes": r["notes"],
            })

    # Distinct eval IDs actually observed across both configurations.
    eval_ids = sorted({r["eval_id"] for config_runs in results.values() for r in config_runs})

    return {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": 3
        },
        "runs": flat_runs,
        "run_summary": aggregate_results(results),
        "notes": []  # To be filled by analyzer
    }
def generate_markdown(benchmark: dict) -> str:
    """Render benchmark data as a human-readable Markdown report.

    Produces a header, a summary table comparing with/without-skill runs
    on pass rate, time, and tokens, and an optional trailing Notes section.
    """
    meta = benchmark["metadata"]
    summary = benchmark["run_summary"]
    delta = summary["delta"]
    pr_a, pr_b = summary["with_skill"]["pass_rate"], summary["without_skill"]["pass_rate"]
    t_a, t_b = summary["with_skill"]["time_seconds"], summary["without_skill"]["time_seconds"]
    tok_a, tok_b = summary["with_skill"]["tokens"], summary["without_skill"]["tokens"]

    out = [
        f"# Skill Benchmark: {meta['skill_name']}",
        "",
        f"**Model**: {meta['executor_model']}",
        f"**Date**: {meta['timestamp']}",
        f"**Evals**: {', '.join(map(str, meta['evals_run']))} ({meta['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        "| Metric | With Skill | Without Skill | Delta |",
        "|--------|------------|---------------|-------|",
        # Pass rate is stored as a fraction; render as mean ± stddev percent.
        f"| Pass Rate | {pr_a['mean']*100:.0f}% ± {pr_a['stddev']*100:.0f}% | {pr_b['mean']*100:.0f}% ± {pr_b['stddev']*100:.0f}% | {delta['pass_rate']} |",
        f"| Time | {t_a['mean']:.1f}s ± {t_a['stddev']:.1f}s | {t_b['mean']:.1f}s ± {t_b['stddev']:.1f}s | {delta['time_seconds']}s |",
        f"| Tokens | {tok_a['mean']:.0f} ± {tok_a['stddev']:.0f} | {tok_b['mean']:.0f} ± {tok_b['stddev']:.0f} | {delta['tokens']} |",
    ]

    if benchmark.get("notes"):
        out += ["", "## Notes", ""]
        out += [f"- {note}" for note in benchmark["notes"]]
    return "\n".join(out)
def main():
    """CLI entry point: aggregate run gradings into benchmark.json/.md."""
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument("benchmark_dir", type=Path, help="Path to the benchmark directory")
    parser.add_argument("--skill-name", default="", help="Name of the skill being benchmarked")
    parser.add_argument("--skill-path", default="", help="Path to the skill being benchmarked")
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)",
    )
    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Build the aggregate payload, then write the machine-readable JSON and
    # the human-readable Markdown side by side.
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)
    json_path = args.output or (args.benchmark_dir / "benchmark.json")
    md_path = json_path.with_suffix(".md")

    with open(json_path, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {json_path}")

    md_path.write_text(generate_markdown(benchmark))
    print(f"Generated: {md_path}")

    # Console recap of the headline numbers.
    stats = benchmark["run_summary"]
    print(f"\nSummary:")
    print(f" With skill: {stats['with_skill']['pass_rate']['mean'] * 100:.1f}% pass rate")
    print(f" Without skill: {stats['without_skill']['pass_rate']['mean'] * 100:.1f}% pass rate")
    print(f" Delta: {stats['delta']['pass_rate']}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Copy a skill directory with metadata tracking.
Creates a copy of a skill directory and adds a META.yaml file
to track lineage, changes, and performance metrics.
"""
import argparse
import shutil
from datetime import datetime, timezone
from pathlib import Path
def create_meta_yaml(
dest: Path,
parent: str | None,
changes: str | None,
score: float | None,
iteration: int | None,
) -> None:
"""Create META.yaml file in the destination directory."""
created_at = datetime.now(timezone.utc).isoformat()
# Build YAML content manually to avoid external dependencies
lines = ["# Skill iteration metadata", ""]
# Helper to format YAML values
def yaml_value(val):
if val is None:
return "null"
if isinstance(val, bool):
return "true" if val else "false"
if isinstance(val, (int, float)):
return str(val)
if isinstance(val, str):
# Quote strings that might be ambiguous
if val in ("null", "true", "false") or val.startswith(("'", '"', "[", "{")):
return f'"{val}"'
# Quote strings with special characters
if any(c in val for c in (":", "#", "\n", '"', "'")):
escaped = val.replace("\\", "\\\\").replace('"', '\\"')
return f'"{escaped}"'
return val
return str(val)
lines.append(f"parent: {yaml_value(parent)}")
lines.append(f"changes: {yaml_value(changes)}")
lines.append(f"score: {yaml_value(score)}")
lines.append(f"iteration: {yaml_value(iteration)}")
lines.append(f"created_at: {yaml_value(created_at)}")
lines.append("")
meta_path = dest / "META.yaml"
meta_path.write_text("\n".join(lines))
def copy_skill(
    source: Path,
    dest: Path,
    parent: str | None = None,
    changes: str | None = None,
    score: float | None = None,
    iteration: int | None = None,
) -> None:
    """Copy a skill into a fresh version directory with metadata.

    Creates <dest>/skill/ (copied from *source*), an empty <dest>/runs/
    for the executor to populate, an improvements/ directory when
    iteration > 0, and a META.yaml recording lineage at the version root.

    Args:
        source: Source skill directory (or <version>/skill/ when copying
            from another version).
        dest: Destination version directory (e.g. workspace/v1).
        parent: Name/path of the parent skill iteration.
        changes: Description of changes from the parent.
        score: Evaluation score for this iteration.
        iteration: Iteration number.

    Raises:
        FileNotFoundError: *source* does not exist.
        ValueError: *source* is not a directory.
        FileExistsError: *dest* already exists.
    """
    src = Path(source).resolve()
    target = Path(dest).resolve()

    if not src.exists():
        raise FileNotFoundError(f"Source directory does not exist: {src}")
    if not src.is_dir():
        raise ValueError(f"Source must be a directory: {src}")
    if target.exists():
        raise FileExistsError(f"Destination already exists: {target}")

    # Version directory skeleton.
    target.mkdir(parents=True)
    (target / "runs").mkdir()
    has_improvements = iteration is not None and iteration > 0
    if has_improvements:
        # Baseline (v0) versions carry no improvement suggestions.
        (target / "improvements").mkdir()

    skill_dest = target / "skill"
    shutil.copytree(src, skill_dest)
    create_meta_yaml(target, parent, changes, score, iteration)

    print(f"Copied skill from {src} to {skill_dest}")
    print(f"Created version directory structure at {target}")
    print(f" - skill/ : Skill files")
    print(f" - runs/ : For execution runs (run-1/, run-2/, run-3/)")
    if has_improvements:
        print(f" - improvements/ : Improvement suggestions")
    print(f" - META.yaml : Version metadata")
def main():
    """CLI entry point for copying a skill into a version directory."""
    parser = argparse.ArgumentParser(
        description="Copy a skill directory with metadata tracking and version structure.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Create baseline v0 from an existing skill
python copy_skill.py ./skills/public/pdf ./skill_iterations/v0 --iteration 0
# Create v1 from v0's skill directory
python copy_skill.py ./skill_iterations/v0/skill ./skill_iterations/v1 \\
--parent v0 \\
--changes "Added error handling for empty cells" \\
--iteration 1
# Create v2 with score from evaluation
python copy_skill.py ./skill_iterations/v1/skill ./skill_iterations/v2 \\
--parent v1 \\
--changes "Improved coordinate guidance" \\
--score 7.5 \\
--iteration 2
Output structure:
dest/
├── META.yaml # Version metadata
├── skill/ # The actual skill files
├── runs/ # Execution runs (created by executor)
│ ├── run-1/
│ │ ├── transcript.md
│ │ ├── outputs/
│ │ └── evaluation.json
│ ├── run-2/
│ └── run-3/
└── improvements/ # Improvement suggestions (v1+)
""",
    )
    parser.add_argument("source", type=Path, help="Source skill directory to copy")
    parser.add_argument("dest", type=Path, help="Destination path for the copy")
    # Optional metadata flags, all defaulting to None.
    for flag, value_type, help_text in (
        ("--parent", str, "Name or path of the parent skill iteration"),
        ("--changes", str, "Description of changes from the parent version"),
        ("--score", float, "Evaluation score for this iteration (e.g., 7.5)"),
        ("--iteration", int, "Iteration number (e.g., 1, 2, 3)"),
    ):
        parser.add_argument(flag, type=value_type, default=None, help=help_text)

    args = parser.parse_args()
    try:
        copy_skill(
            source=args.source,
            dest=args.dest,
            parent=args.parent,
            changes=args.changes,
            score=args.score,
            iteration=args.iteration,
        )
    except (FileNotFoundError, FileExistsError, ValueError) as exc:
        # Surface expected failures as standard argparse usage errors.
        parser.error(str(exc))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Initialize JSON files with the correct structure for skill-creator-edge.
Creates template JSON files that can be filled in.
Usage:
python init_json.py <type> <output_path>
Examples:
python init_json.py evals evals/evals.json
python init_json.py grading run-1/grading.json
python init_json.py benchmark benchmarks/2026-01-15/benchmark.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
# Template skeletons keyed by artifact type. Values are deep-copied nowhere:
# init_json() dumps them directly, which is safe because they are never mutated.
# NOTE(review): the datetime.now(...) values below are evaluated once, at module
# import time, so every template written by a single CLI invocation shares the
# same instant — fine for this one-shot script, but worth confirming if this
# module is ever imported by a long-lived process.
TEMPLATES = {
    # evals.json: eval definitions for a skill.
    "evals": {
        "skill_name": "<skill-name>",
        "evals": [
            {
                "id": 1,
                "prompt": "Example task prompt",
                "expected_output": "Description of expected result",
                "files": [],
                "expectations": [
                    "The output includes X",
                    "The skill correctly handles Y"
                ]
            }
        ]
    },
    # grading.json: per-run grading output produced by the grader.
    "grading": {
        "expectations": [
            {
                "text": "Example expectation",
                "passed": True,
                "evidence": "Found in transcript: ..."
            }
        ],
        "summary": {
            "passed": 1,
            "failed": 0,
            "total": 1,
            "pass_rate": 1.0
        },
        "execution_metrics": {
            "tool_calls": {
                "Read": 0,
                "Write": 0,
                "Bash": 0,
                "Edit": 0,
                "Glob": 0,
                "Grep": 0
            },
            "total_tool_calls": 0,
            "total_steps": 0,
            "errors_encountered": 0,
            "output_chars": 0,
            "transcript_chars": 0
        },
        "timing": {
            "executor_duration_seconds": 0.0,
            "grader_duration_seconds": 0.0,
            "total_duration_seconds": 0.0
        },
        "claims": [],
        "user_notes_summary": {
            "uncertainties": [],
            "needs_review": [],
            "workarounds": []
        }
    },
    # benchmark.json: aggregated results across runs/configurations.
    "benchmark": {
        "metadata": {
            "skill_name": "<skill-name>",
            "skill_path": "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": [1],
            "runs_per_configuration": 3
        },
        "runs": [
            {
                "eval_id": 1,
                "configuration": "with_skill",
                "run_number": 1,
                "result": {
                    "pass_rate": 0.0,
                    "passed": 0,
                    "failed": 0,
                    "total": 0,
                    "time_seconds": 0.0,
                    "tokens": 0,
                    "tool_calls": 0,
                    "errors": 0
                },
                "expectations": [],
                "notes": []
            }
        ],
        "run_summary": {
            "with_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            "without_skill": {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            },
            "delta": {
                "pass_rate": "+0.0",
                "time_seconds": "+0.0",
                "tokens": "+0"
            }
        },
        "notes": []
    },
    # metrics.json: raw execution metrics for a single run.
    "metrics": {
        "tool_calls": {
            "Read": 0,
            "Write": 0,
            "Bash": 0,
            "Edit": 0,
            "Glob": 0,
            "Grep": 0
        },
        "total_tool_calls": 0,
        "total_steps": 0,
        "files_created": [],
        "errors_encountered": 0,
        "output_chars": 0,
        "transcript_chars": 0
    },
    # timing.json: executor/grader wall-clock bookkeeping.
    "timing": {
        "executor_start": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "executor_end": "",
        "executor_duration_seconds": 0.0,
        "grader_start": "",
        "grader_end": "",
        "grader_duration_seconds": 0.0,
        "total_duration_seconds": 0.0
    },
    # history.json: iteration lineage across skill versions.
    "history": {
        "started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "skill_name": "<skill-name>",
        "current_best": "v0",
        "iterations": [
            {
                "version": "v0",
                "parent": None,
                "expectation_pass_rate": 0.0,
                "grading_result": "baseline",
                "is_current_best": True
            }
        ]
    },
    # comparison.json: head-to-head A/B comparison with scoring rubric.
    "comparison": {
        "winner": "A",
        "reasoning": "Explanation of why the winner was chosen",
        "rubric": {
            "A": {
                "content": {
                    "correctness": 5,
                    "completeness": 5,
                    "accuracy": 5
                },
                "structure": {
                    "organization": 5,
                    "formatting": 5,
                    "usability": 5
                },
                "content_score": 5.0,
                "structure_score": 5.0,
                "overall_score": 10.0
            },
            "B": {
                "content": {
                    "correctness": 3,
                    "completeness": 3,
                    "accuracy": 3
                },
                "structure": {
                    "organization": 3,
                    "formatting": 3,
                    "usability": 3
                },
                "content_score": 3.0,
                "structure_score": 3.0,
                "overall_score": 6.0
            }
        },
        "output_quality": {
            "A": {
                "score": 10,
                "strengths": [],
                "weaknesses": []
            },
            "B": {
                "score": 6,
                "strengths": [],
                "weaknesses": []
            }
        }
    },
    # analysis.json: analyzer output explaining the comparison outcome.
    "analysis": {
        "comparison_summary": {
            "winner": "A",
            "winner_skill": "<path/to/winner>",
            "loser_skill": "<path/to/loser>",
            "comparator_reasoning": "Summary of comparison"
        },
        "winner_strengths": [],
        "loser_weaknesses": [],
        "instruction_following": {
            "winner": {
                "score": 10,
                "issues": []
            },
            "loser": {
                "score": 5,
                "issues": []
            }
        },
        "improvement_suggestions": [
            {
                "priority": "high",
                "category": "instructions",
                "suggestion": "Specific improvement suggestion",
                "expected_impact": "Why this would help"
            }
        ],
        "transcript_insights": {
            "winner_execution_pattern": "Description of how winner executed",
            "loser_execution_pattern": "Description of how loser executed"
        }
    }
}
def init_json(json_type: str, output_path: Path, force: bool = False) -> bool:
    """Write the template for *json_type* to *output_path* as indented JSON.

    Refuses to overwrite an existing file unless *force* is set. Parent
    directories are created as needed.

    Args:
        json_type: Key into TEMPLATES selecting the skeleton to write.
        output_path: Destination file path.
        force: Overwrite an existing file when True.

    Returns:
        True on success; False for an unknown type or an existing file
        without --force (a message is printed in both failure cases).
    """
    if json_type not in TEMPLATES:
        print(f"Unknown type: {json_type}. Valid types: {list(TEMPLATES.keys())}")
        return False

    if output_path.exists() and not force:
        print(f"File already exists: {output_path}")
        print("Use --force to overwrite")
        return False

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(TEMPLATES[json_type], f, indent=2)
    print(f"Created {json_type} template: {output_path}")
    return True
def main():
    """CLI entry point: create one JSON template file and exit 0/1."""
    parser = argparse.ArgumentParser(
        description="Initialize JSON files with correct structure"
    )
    parser.add_argument(
        "type",
        choices=list(TEMPLATES.keys()),
        help="Type of JSON file to create",
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Output path for the JSON file",
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Overwrite existing file",
    )
    args = parser.parse_args()
    # Exit status mirrors init_json's boolean result.
    sys.exit(0 if init_json(args.type, args.output, args.force) else 1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Skill Initializer - Creates a new skill from template
Usage:
init_skill.py <skill-name> --path <path>
Examples:
init_skill.py my-new-skill --path skills/public
init_skill.py my-api-helper --path skills/private
init_skill.py custom-skill --path /custom/location
"""
import sys
from pathlib import Path
SKILL_TEMPLATE = """---
name: {skill_name}
description: [TODO: Complete and informative explanation of what the skill does and when to use it. Include WHEN to use this skill - specific scenarios, file types, or tasks that trigger it.]
---
# {skill_title}
## Overview
[TODO: 1-2 sentences explaining what this skill enables]
## Structuring This Skill
[TODO: Choose the structure that best fits this skill's purpose. Common patterns:
**1. Workflow-Based** (best for sequential processes)
- Works well when there are clear step-by-step procedures
- Example: DOCX skill with "Workflow Decision Tree""Reading""Creating""Editing"
- Structure: ## Overview → ## Workflow Decision Tree → ## Step 1 → ## Step 2...
**2. Task-Based** (best for tool collections)
- Works well when the skill offers different operations/capabilities
- Example: PDF skill with "Quick Start""Merge PDFs""Split PDFs""Extract Text"
- Structure: ## Overview → ## Quick Start → ## Task Category 1 → ## Task Category 2...
**3. Reference/Guidelines** (best for standards or specifications)
- Works well for brand guidelines, coding standards, or requirements
- Example: Brand styling with "Brand Guidelines""Colors""Typography""Features"
- Structure: ## Overview → ## Guidelines → ## Specifications → ## Usage...
**4. Capabilities-Based** (best for integrated systems)
- Works well when the skill provides multiple interrelated features
- Example: Product Management with "Core Capabilities" → numbered capability list
- Structure: ## Overview → ## Core Capabilities → ### 1. Feature → ### 2. Feature...
Patterns can be mixed and matched as needed. Most skills combine patterns (e.g., start with task-based, add workflow for complex operations).
Delete this entire "Structuring This Skill" section when done - it's just guidance.]
## [TODO: Replace with the first main section based on chosen structure]
[TODO: Add content here. See examples in existing skills:
- Code samples for technical skills
- Decision trees for complex workflows
- Concrete examples with realistic user requests
- References to scripts/templates/references as needed]
## Resources
This skill includes example resource directories that demonstrate how to organize different types of bundled resources:
### scripts/
Executable code (Python/Bash/etc.) that can be run directly to perform specific operations.
**Examples from other skills:**
- PDF skill: `fill_fillable_fields.py`, `extract_form_field_info.py` - utilities for PDF manipulation
- DOCX skill: `document.py`, `utilities.py` - Python modules for document processing
**Appropriate for:** Python scripts, shell scripts, or any executable code that performs automation, data processing, or specific operations.
**Note:** Scripts may be executed without loading into context, but can still be read by Claude for patching or environment adjustments.
### references/
Documentation and reference material intended to be loaded into context to inform Claude's process and thinking.
**Examples from other skills:**
- Product management: `communication.md`, `context_building.md` - detailed workflow guides
- BigQuery: API reference documentation and query examples
- Finance: Schema documentation, company policies
**Appropriate for:** In-depth documentation, API references, database schemas, comprehensive guides, or any detailed information that Claude should reference while working.
### assets/
Files not intended to be loaded into context, but rather used within the output Claude produces.
**Examples from other skills:**
- Brand styling: PowerPoint template files (.pptx), logo files
- Frontend builder: HTML/React boilerplate project directories
- Typography: Font files (.ttf, .woff2)
**Appropriate for:** Templates, boilerplate code, document templates, images, icons, fonts, or any files meant to be copied or used in the final output.
---
**Any unneeded directories can be deleted.** Not every skill requires all three types of resources.
"""
EXAMPLE_SCRIPT = '''#!/usr/bin/env python3
"""
Example helper script for {skill_name}
This is a placeholder script that can be executed directly.
Replace with actual implementation or delete if not needed.
Example real scripts from other skills:
- pdf/scripts/fill_fillable_fields.py - Fills PDF form fields
- pdf/scripts/convert_pdf_to_images.py - Converts PDF pages to images
"""
def main():
print("This is an example script for {skill_name}")
# TODO: Add actual script logic here
# This could be data processing, file conversion, API calls, etc.
if __name__ == "__main__":
main()
'''
EXAMPLE_REFERENCE = """# Reference Documentation for {skill_title}
This is a placeholder for detailed reference documentation.
Replace with actual reference content or delete if not needed.
Example real reference docs from other skills:
- product-management/references/communication.md - Comprehensive guide for status updates
- product-management/references/context_building.md - Deep-dive on gathering context
- bigquery/references/ - API references and query examples
## When Reference Docs Are Useful
Reference docs are ideal for:
- Comprehensive API documentation
- Detailed workflow guides
- Complex multi-step processes
- Information too lengthy for main SKILL.md
- Content that's only needed for specific use cases
## Structure Suggestions
### API Reference Example
- Overview
- Authentication
- Endpoints with examples
- Error codes
- Rate limits
### Workflow Guide Example
- Prerequisites
- Step-by-step instructions
- Common patterns
- Troubleshooting
- Best practices
"""
EXAMPLE_ASSET = """# Example Asset File
This placeholder represents where asset files would be stored.
Replace with actual asset files (templates, images, fonts, etc.) or delete if not needed.
Asset files are NOT intended to be loaded into context, but rather used within
the output Claude produces.
Example asset files from other skills:
- Brand guidelines: logo.png, slides_template.pptx
- Frontend builder: hello-world/ directory with HTML/React boilerplate
- Typography: custom-font.ttf, font-family.woff2
- Data: sample_data.csv, test_dataset.json
## Common Asset Types
- Templates: .pptx, .docx, boilerplate directories
- Images: .png, .jpg, .svg, .gif
- Fonts: .ttf, .otf, .woff, .woff2
- Boilerplate code: Project directories, starter files
- Icons: .ico, .svg
- Data files: .csv, .json, .xml, .yaml
Note: This is a text placeholder. Actual assets can be any file type.
"""
def title_case_skill_name(skill_name):
    """Convert a kebab-case skill name to Title Case for display.

    Uses str.capitalize per hyphen-separated word (NOT str.title, which
    would also uppercase letters following digits, e.g. "2d" -> "2D").
    """
    return ' '.join(map(str.capitalize, skill_name.split('-')))
def init_skill(skill_name, path):
    """Create a new skill directory seeded with template files.

    Builds <path>/<skill_name>/ containing a templated SKILL.md plus
    example scripts/, references/, and assets/ resources, then prints
    next steps for the author.

    Args:
        skill_name: Kebab-case skill identifier.
        path: Parent directory in which to create the skill.

    Returns:
        Path to the created skill directory, or None on any error
        (errors are printed, not raised).
    """
    skill_dir = Path(path).resolve() / skill_name

    if skill_dir.exists():
        print(f"❌ Error: Skill directory already exists: {skill_dir}")
        return None
    try:
        skill_dir.mkdir(parents=True, exist_ok=False)
        print(f"✅ Created skill directory: {skill_dir}")
    except Exception as e:
        print(f"❌ Error creating directory: {e}")
        return None

    # SKILL.md from the module-level template.
    skill_title = title_case_skill_name(skill_name)
    try:
        (skill_dir / 'SKILL.md').write_text(
            SKILL_TEMPLATE.format(skill_name=skill_name, skill_title=skill_title)
        )
        print("✅ Created SKILL.md")
    except Exception as e:
        print(f"❌ Error creating SKILL.md: {e}")
        return None

    # Example resource directories: scripts/, references/, assets/.
    try:
        scripts_dir = skill_dir / 'scripts'
        scripts_dir.mkdir(exist_ok=True)
        script_file = scripts_dir / 'example.py'
        script_file.write_text(EXAMPLE_SCRIPT.format(skill_name=skill_name))
        script_file.chmod(0o755)  # example scripts ship executable
        print("✅ Created scripts/example.py")

        references_dir = skill_dir / 'references'
        references_dir.mkdir(exist_ok=True)
        (references_dir / 'api_reference.md').write_text(
            EXAMPLE_REFERENCE.format(skill_title=skill_title)
        )
        print("✅ Created references/api_reference.md")

        assets_dir = skill_dir / 'assets'
        assets_dir.mkdir(exist_ok=True)
        # Asset placeholder is written verbatim — no format placeholders.
        (assets_dir / 'example_asset.txt').write_text(EXAMPLE_ASSET)
        print("✅ Created assets/example_asset.txt")
    except Exception as e:
        print(f"❌ Error creating resource directories: {e}")
        return None

    print(f"\n✅ Skill '{skill_name}' initialized successfully at {skill_dir}")
    print("\nNext steps:")
    print("1. Edit SKILL.md to complete the TODO items and update the description")
    print("2. Customize or delete the example files in scripts/, references/, and assets/")
    print("3. Run the validator when ready to check the skill structure")
    return skill_dir
def main():
    """Hand-rolled CLI: init_skill.py <skill-name> --path <path>.

    argparse is deliberately not used; the fixed positional form keeps the
    usage identical to the documented invocation.
    """
    if len(sys.argv) < 4 or sys.argv[2] != '--path':
        for line in (
            "Usage: init_skill.py <skill-name> --path <path>",
            "\nSkill name requirements:",
            " - Kebab-case identifier (e.g., 'my-data-analyzer')",
            " - Lowercase letters, digits, and hyphens only",
            " - Max 64 characters",
            " - Must match directory name exactly",
            "\nExamples:",
            " init_skill.py my-new-skill --path skills/public",
            " init_skill.py my-api-helper --path skills/private",
            " init_skill.py custom-skill --path /custom/location",
        ):
            print(line)
        sys.exit(1)

    skill_name, path = sys.argv[1], sys.argv[3]
    print(f"🚀 Initializing skill: {skill_name}")
    print(f" Location: {path}")
    print()
    # init_skill returns the created Path (truthy) on success, None on failure.
    sys.exit(0 if init_skill(skill_name, path) else 1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Skill Packager - Creates a distributable .skill file of a skill folder
Usage:
python utils/package_skill.py <path/to/skill-folder> [output-directory]
Example:
python utils/package_skill.py skills/public/my-skill
python utils/package_skill.py skills/public/my-skill ./dist
"""
import fnmatch
import sys
import zipfile
from pathlib import Path
from quick_validate import validate_skill
# Patterns to exclude when packaging skills.
EXCLUDE_DIRS = {"__pycache__", "node_modules"}  # directory names excluded at any depth
EXCLUDE_GLOBS = {"*.pyc"}  # fnmatch-style filename patterns
EXCLUDE_FILES = {".DS_Store"}  # exact filenames excluded anywhere
# Directories excluded only at the skill root (not when nested deeper).
ROOT_EXCLUDE_DIRS = {"evals"}
def should_exclude(rel_path: Path) -> bool:
"""Check if a path should be excluded from packaging."""
parts = rel_path.parts
if any(part in EXCLUDE_DIRS for part in parts):
return True
# rel_path is relative to skill_path.parent, so parts[0] is the skill
# folder name and parts[1] (if present) is the first subdir.
if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS:
return True
name = rel_path.name
if name in EXCLUDE_FILES:
return True
return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS)
def package_skill(skill_path, output_dir=None):
    """
    Package a skill folder into a .skill file (zip format).

    Args:
        skill_path: Path to the skill folder
        output_dir: Optional output directory for the .skill file (defaults to current directory)

    Returns:
        Path to the created .skill file, or None if error
    """
    skill_path = Path(skill_path).resolve()
    # Validate skill folder exists
    if not skill_path.exists():
        print(f"❌ Error: Skill folder not found: {skill_path}")
        return None
    if not skill_path.is_dir():
        print(f"❌ Error: Path is not a directory: {skill_path}")
        return None
    # Validate SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"❌ Error: SKILL.md not found in {skill_path}")
        return None
    # Run validation before packaging
    print("🔍 Validating skill...")
    valid, message = validate_skill(skill_path)
    if not valid:
        print(f"❌ Validation failed: {message}")
        print(" Please fix the validation errors before packaging.")
        return None
    print(f"{message}\n")
    # Determine output location (default: current working directory)
    skill_name = skill_path.name
    if output_dir:
        output_path = Path(output_dir).resolve()
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()
    skill_filename = output_path / f"{skill_name}.skill"
    # Create the .skill file (zip format)
    try:
        with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk through the skill directory, excluding build artifacts
            for file_path in skill_path.rglob('*'):
                if not file_path.is_file():
                    continue
                # Never add the archive to itself; rglob can pick it up when
                # the output directory lives inside the skill folder (e.g.
                # packaging from the skill dir with the default output).
                if file_path == skill_filename:
                    continue
                arcname = file_path.relative_to(skill_path.parent)
                if should_exclude(arcname):
                    print(f" Skipped: {arcname}")
                    continue
                zipf.write(file_path, arcname)
                print(f" Added: {arcname}")
        print(f"\n✅ Successfully packaged skill to: {skill_filename}")
        return skill_filename
    except Exception as e:
        print(f"❌ Error creating .skill file: {e}")
        # Don't leave a half-written archive behind on failure.
        try:
            skill_filename.unlink()
        except OSError:
            pass
        return None
def main():
    """CLI entry point: package the skill folder given on the command line."""
    if len(sys.argv) < 2:
        print("Usage: python utils/package_skill.py <path/to/skill-folder> [output-directory]")
        print("\nExample:")
        print(" python utils/package_skill.py skills/public/my-skill")
        print(" python utils/package_skill.py skills/public/my-skill ./dist")
        sys.exit(1)
    skill_path = sys.argv[1]
    # Output directory is optional; None means "current working directory".
    output_dir = None
    if len(sys.argv) > 2:
        output_dir = sys.argv[2]
    print(f"📦 Packaging skill: {skill_path}")
    if output_dir:
        print(f" Output directory: {output_dir}")
    print()
    sys.exit(0 if package_skill(skill_path, output_dir) else 1)
# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
Prepare environment for running a skill eval.
Usage:
prepare_eval.py <skill-path> <eval-id> --output-dir <dir> [--no-skill]
Examples:
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/with-skill
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/without-skill --no-skill
Options:
<skill-path> Path to the skill directory
<eval-id> Index of the eval in evals/evals.json (0-based)
--output-dir Directory to prepare for the eval run
--no-skill If set, do not copy the skill (for baseline comparison)
"""
import json
import os
import shutil
import sys
from pathlib import Path
def is_writable(path: Path) -> bool:
    """Probe *path* for write access by touching and removing a marker file."""
    probe = path / ".write_test"
    try:
        probe.touch()
        probe.unlink()
    except (OSError, PermissionError):
        # Includes the case where *path* itself does not exist.
        return False
    return True
def load_evals(skill_path: Path) -> list:
    """Read and return the eval list from <skill>/evals/evals.json.

    Accepts either a bare JSON list or an object wrapping the list under an
    "evals" key. Raises FileNotFoundError when the file is missing and
    ValueError when its top-level structure is neither of the two shapes.
    """
    evals_file = skill_path / "evals" / "evals.json"
    if not evals_file.exists():
        raise FileNotFoundError(f"Evals file not found: {evals_file}")
    with open(evals_file, "r") as f:
        data = json.load(f)
    # Handle both formats: plain list or wrapped in object with "evals" key.
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and "evals" in data:
        return data["evals"]
    raise ValueError(
        f"Expected evals.json to contain a list or object with 'evals' key, "
        f"got {type(data).__name__}"
    )
def get_eval(evals: list, eval_id: int) -> dict:
    """Return evals[eval_id], raising IndexError with the valid range."""
    if 0 <= eval_id < len(evals):
        return evals[eval_id]
    raise IndexError(f"Eval ID {eval_id} out of range (0-{len(evals)-1})")
def normalize_eval(eval_data: dict) -> dict:
    """
    Normalize eval data to a consistent format.

    Handles both the design doc format (prompt, files, assertions)
    and the gym format (query, files, expected_behavior).

    Returns a dict with exactly the keys "prompt", "files", "assertions".
    Raises ValueError when no non-empty prompt/query is present.
    """
    # The prompt may live under "prompt" or "query".
    prompt = eval_data.get("prompt") or eval_data.get("query")
    if not prompt:
        raise ValueError("Eval must have either 'prompt' or 'query' field")
    files = eval_data.get("files", [])
    assertions = eval_data.get("assertions")
    if assertions is None:
        # Fall back to "expected_behavior", coercing each entry to a string.
        assertions = []
        for entry in eval_data.get("expected_behavior", []):
            if isinstance(entry, str):
                assertions.append(entry)
            elif isinstance(entry, dict):
                # Flatten a structured assertion into a readable description.
                kind = entry.get("assertion", "unknown")
                pieces = [f"Assertion type: {kind}"]
                pieces.extend(
                    f"{key}={value}"
                    for key, value in entry.items()
                    if key != "assertion"
                )
                assertions.append(" - ".join(pieces))
    return {
        "prompt": prompt,
        "files": files,
        "assertions": assertions
    }
def prepare_eval(skill_path: Path, eval_id: int, output_dir: Path, no_skill: bool = False) -> dict:
    """
    Prepare the environment for running an eval.

    Args:
        skill_path: Path to the skill directory
        eval_id: Index of the eval in evals.json
        output_dir: Directory to prepare for the eval run
        no_skill: If True, do not copy the skill (for baseline comparison)

    Returns:
        Dictionary with eval metadata (also written to eval_metadata.json
        inside output_dir)

    Raises:
        FileNotFoundError: if the skill directory or its SKILL.md is missing
        (IndexError/ValueError propagate from load_evals/get_eval/normalize_eval)
    """
    skill_path = Path(skill_path).resolve()
    output_dir = Path(output_dir).resolve()
    # Validate skill path
    if not skill_path.exists():
        raise FileNotFoundError(f"Skill directory not found: {skill_path}")
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        raise FileNotFoundError(f"SKILL.md not found in {skill_path}")
    # Load and get the specific eval
    evals = load_evals(skill_path)
    eval_data = get_eval(evals, eval_id)
    normalized = normalize_eval(eval_data)
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Create inputs directory and stage input files
    inputs_dir = output_dir / "inputs"
    inputs_dir.mkdir(exist_ok=True)
    staged_files = []
    for file_ref in normalized["files"]:
        # Resolve each file reference against three locations, most specific
        # first: evals/files/, then evals/, then the skill root.
        source = skill_path / "evals" / "files" / file_ref
        if not source.exists():
            # Try relative to evals/ directly
            source = skill_path / "evals" / file_ref
        if not source.exists():
            # Try relative to skill root
            source = skill_path / file_ref
        if source.exists():
            # Staged under the bare filename only — NOTE(review): two file_refs
            # with the same basename would overwrite each other here.
            dest = inputs_dir / Path(file_ref).name
            if source.is_file():
                shutil.copy2(source, dest)
            else:
                # Directory references are copied recursively.
                shutil.copytree(source, dest, dirs_exist_ok=True)
            staged_files.append(str(dest))
            print(f" Staged: {file_ref} -> {dest}")
        else:
            # Missing inputs are reported but deliberately not fatal.
            print(f" Warning: File not found: {file_ref}")
    # Create outputs directory
    outputs_dir = output_dir / "outputs"
    outputs_dir.mkdir(exist_ok=True)
    # Copy skill if not --no-skill
    skill_copy_path = None
    if not no_skill:
        skill_copy_path = output_dir / "skill"
        # Remove any copy left over from a previous run before re-copying.
        if skill_copy_path.exists():
            shutil.rmtree(skill_copy_path)
        shutil.copytree(skill_path, skill_copy_path, dirs_exist_ok=True)
        skill_copy_path = str(skill_copy_path)
        print(f" Copied skill to: {skill_copy_path}")
    # Build metadata ("skill_path" stays None in --no-skill mode)
    metadata = {
        "eval_id": eval_id,
        "prompt": normalized["prompt"],
        "assertions": normalized["assertions"],
        "input_files": staged_files,
        "skill_path": skill_copy_path,
        "output_dir": str(output_dir),
        "inputs_dir": str(inputs_dir),
        "outputs_dir": str(outputs_dir),
        "no_skill": no_skill,
        "original_skill_path": str(skill_path)
    }
    # Write metadata file
    metadata_path = output_dir / "eval_metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)
    print(f" Wrote: {metadata_path}")
    return metadata
def determine_workspace(skill_path: Path) -> Path:
    """
    Pick a workspace directory for eval runs.

    Prefers <skill>/workspace/ when the skill directory is writable;
    otherwise falls back to <project-root>/<skill-name>-workspace/.
    """
    skill_path = Path(skill_path).resolve()
    if is_writable(skill_path):
        return skill_path / "workspace"
    # Walk upward looking for a .git directory to identify the project root.
    candidate = skill_path
    while candidate != candidate.parent:
        if (candidate / ".git").exists():
            break
        candidate = candidate.parent
    if candidate == candidate.parent:
        # Reached the filesystem root without finding .git; use the
        # skill's parent directory instead.
        candidate = skill_path.parent
    return candidate / f"{skill_path.name}-workspace"
def main():
    """CLI entry point: parse arguments and prepare a single eval run.

    Exits 1 on usage errors or any failure inside prepare_eval.
    """
    args = sys.argv[1:]
    if len(args) < 4 or "--output-dir" not in args:
        print(__doc__)
        sys.exit(1)
    # Positional arguments come first: <skill-path> <eval-id>
    skill_path = args[0]
    try:
        eval_id = int(args[1])
    except ValueError:
        # Report a clean error instead of an uncaught-exception traceback.
        print(f"Error: <eval-id> must be an integer, got {args[1]!r}")
        sys.exit(1)
    no_skill = "--no-skill" in args
    # Find --output-dir value
    output_dir_idx = args.index("--output-dir")
    if output_dir_idx + 1 >= len(args):
        print("Error: --output-dir requires a value")
        sys.exit(1)
    output_dir = args[output_dir_idx + 1]
    print(f"Preparing eval {eval_id} for skill: {skill_path}")
    print(f"Output directory: {output_dir}")
    if no_skill:
        print("Mode: without skill (baseline)")
    else:
        print("Mode: with skill")
    print()
    try:
        metadata = prepare_eval(
            skill_path=Path(skill_path),
            eval_id=eval_id,
            output_dir=Path(output_dir),
            no_skill=no_skill
        )
        print()
        print("Eval prepared successfully!")
        # Truncate long prompts in the summary line.
        prompt = metadata['prompt']
        if len(prompt) > 60:
            print(f" Prompt: {prompt[:60]}...")
        else:
            print(f" Prompt: {prompt}")
        print(f" Assertions: {len(metadata['assertions'])}")
        print(f" Input files: {len(metadata['input_files'])}")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
# Script entry point.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Quick validation script for skills - minimal version
"""
import sys
import os
import re
import yaml
from pathlib import Path
def validate_skill(skill_path):
    """Basic validation of a skill.

    Checks that SKILL.md exists, has parseable YAML frontmatter, uses only
    the allowed frontmatter properties, and that the name/description
    fields satisfy the naming and length rules.

    Args:
        skill_path: Path (or str) to the skill directory.

    Returns:
        Tuple of (is_valid: bool, message: str).
    """
    skill_path = Path(skill_path)
    # Check SKILL.md exists
    skill_md = skill_path / 'SKILL.md'
    if not skill_md.exists():
        return False, "SKILL.md not found"
    # Read explicitly as UTF-8 so the result doesn't depend on the locale's
    # default encoding.
    content = skill_md.read_text(encoding='utf-8')
    if not content.startswith('---'):
        return False, "No YAML frontmatter found"
    # Extract frontmatter; accept both LF and CRLF line endings so skills
    # authored on Windows don't fail with "Invalid frontmatter format".
    match = re.match(r'^---\r?\n(.*?)\r?\n---', content, re.DOTALL)
    if not match:
        return False, "Invalid frontmatter format"
    frontmatter_text = match.group(1)
    # Parse YAML frontmatter
    try:
        frontmatter = yaml.safe_load(frontmatter_text)
        if not isinstance(frontmatter, dict):
            return False, "Frontmatter must be a YAML dictionary"
    except yaml.YAMLError as e:
        return False, f"Invalid YAML in frontmatter: {e}"
    # Define allowed properties
    ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'}
    # Check for unexpected properties (excluding nested keys under metadata)
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )
    # Check required fields
    if 'name' not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if 'description' not in frontmatter:
        return False, "Missing 'description' in frontmatter"
    # Extract name for validation
    name = frontmatter.get('name', '')
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    if name:
        # Check naming convention (kebab-case: lowercase with hyphens)
        if not re.match(r'^[a-z0-9-]+$', name):
            return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
        if name.startswith('-') or name.endswith('-') or '--' in name:
            return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
        # Check name length (max 64 characters per spec)
        if len(name) > 64:
            return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."
    # Extract and validate description
    description = frontmatter.get('description', '')
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if description:
        # Check for angle brackets
        if '<' in description or '>' in description:
            return False, "Description cannot contain angle brackets (< or >)"
        # Check description length (max 1024 characters per spec)
        if len(description) > 1024:
            return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."
    # Validate compatibility field if present (optional)
    compatibility = frontmatter.get('compatibility', '')
    if compatibility:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."
    return True, "Skill is valid!"
# Script entry point: validate the skill directory given on the command line
# and exit 0 on success, 1 on failure (so shell callers can branch on $?).
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)
    valid, message = validate_skill(sys.argv[1])
    print(message)
    sys.exit(0 if valid else 1)

View File

@@ -0,0 +1,411 @@
#!/usr/bin/env python3
"""
Validate JSON files produced by skill-creator-edge.
Supports validation for:
- evals.json: Skill evaluation definitions
- grading.json: Grader output
- benchmark.json: Benchmark results
- metrics.json: Executor metrics
- timing.json: Timing data
- history.json: Improve mode version history
- comparison.json: Blind comparator output
- analysis.json: Post-hoc analyzer output
Usage:
python validate_json.py <file_path> [--type <type>]
Examples:
python validate_json.py workspace/benchmark.json
python validate_json.py evals/evals.json --type evals
python validate_json.py run-1/grading.json --type grading
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Any
# Schema definitions as validation rules.
# Each entry names the required top-level keys for one JSON artifact type,
# plus nested rule sets (e.g. "evals_item", "summary") that the per-type
# validator functions below apply to list elements or sub-objects.
SCHEMAS = {
    # Skill evaluation definitions (evals/evals.json)
    "evals": {
        "required": ["skill_name", "evals"],
        "evals_item": {
            "required": ["id", "prompt"],
            "optional": ["expected_output", "files", "expectations"]
        }
    },
    # Grader output for a single run (grading.json)
    "grading": {
        "required": ["expectations", "summary"],
        "summary": {
            "required": ["passed", "failed", "total", "pass_rate"]
        },
        "expectations_item": {
            "required": ["text", "passed", "evidence"]
        }
    },
    # Aggregated benchmark results (benchmark.json)
    "benchmark": {
        "required": ["metadata", "runs", "run_summary"],
        "metadata": {
            "required": ["skill_name", "timestamp", "runs_per_configuration"]
        },
        "runs_item": {
            "required": ["eval_id", "configuration", "run_number", "result"]
        },
        "run_summary_config": {
            "required": ["pass_rate", "time_seconds", "tokens"]
        }
    },
    # Executor metrics (metrics.json)
    "metrics": {
        "required": ["tool_calls", "total_tool_calls"],
        "optional": ["total_steps", "files_created", "errors_encountered",
                     "output_chars", "transcript_chars"]
    },
    # Timing data (timing.json)
    "timing": {
        "required": [],  # All fields optional but should have at least one
        "optional": ["executor_start", "executor_end", "executor_duration_seconds",
                     "grader_start", "grader_end", "grader_duration_seconds",
                     "total_duration_seconds"]
    },
    # Improve-mode version history (history.json)
    "history": {
        "required": ["started_at", "skill_name", "current_best", "iterations"],
        "iterations_item": {
            "required": ["version", "expectation_pass_rate", "grading_result", "is_current_best"]
        }
    },
    # Blind comparator output (comparison*.json)
    "comparison": {
        "required": ["winner", "reasoning", "rubric", "output_quality"],
        "rubric_side": {
            "required": ["content", "structure", "overall_score"]
        }
    },
    # Post-hoc analyzer output (analysis.json)
    "analysis": {
        "required": ["comparison_summary", "winner_strengths", "loser_weaknesses",
                     "improvement_suggestions"],
        "improvement_item": {
            "required": ["priority", "category", "suggestion"]
        }
    }
}
def infer_type(file_path: Path) -> str | None:
"""Infer JSON type from filename."""
name = file_path.name.lower()
if name == "evals.json":
return "evals"
elif name == "grading.json":
return "grading"
elif name == "benchmark.json":
return "benchmark"
elif name == "metrics.json":
return "metrics"
elif name == "timing.json":
return "timing"
elif name == "history.json":
return "history"
elif name.startswith("comparison"):
return "comparison"
elif name == "analysis.json":
return "analysis"
return None
def validate_required_fields(data: dict, required: list[str], path: str = "") -> list[str]:
    """Return one error string (prefixed with *path*) per missing field."""
    return [
        f"{path}Missing required field: {field}"
        for field in required
        if field not in data
    ]
def validate_evals(data: dict) -> list[str]:
    """Validate evals.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["evals"]["required"])
    if "evals" not in data:
        return errors
    evals = data["evals"]
    if not isinstance(evals, list):
        errors.append("'evals' must be a list")
        return errors
    item_required = SCHEMAS["evals"]["evals_item"]["required"]
    for idx, item in enumerate(evals):
        errors.extend(
            validate_required_fields(item, item_required, f"evals[{idx}]: ")
        )
        # "id" must be an integer index.
        if "id" in item and not isinstance(item["id"], int):
            errors.append(f"evals[{idx}]: 'id' must be an integer")
        # "expectations", when present, must be a list of strings.
        if "expectations" in item:
            expectations = item["expectations"]
            if not isinstance(expectations, list):
                errors.append(f"evals[{idx}]: 'expectations' must be a list")
            else:
                for j, expectation in enumerate(expectations):
                    if not isinstance(expectation, str):
                        errors.append(f"evals[{idx}].expectations[{j}]: must be a string")
    return errors
def validate_grading(data: dict) -> list[str]:
    """Validate grading.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["grading"]["required"])
    if "summary" in data:
        summary = data["summary"]
        errors.extend(
            validate_required_fields(
                summary, SCHEMAS["grading"]["summary"]["required"], "summary: "
            )
        )
        # pass_rate must be a numeric fraction in [0, 1].
        if "pass_rate" in summary:
            rate = summary["pass_rate"]
            if not isinstance(rate, (int, float)) or rate < 0 or rate > 1:
                errors.append("summary.pass_rate must be a number between 0 and 1")
    if "expectations" in data:
        expectations = data["expectations"]
        if not isinstance(expectations, list):
            errors.append("'expectations' must be a list")
        else:
            item_required = SCHEMAS["grading"]["expectations_item"]["required"]
            for idx, item in enumerate(expectations):
                errors.extend(
                    validate_required_fields(item, item_required, f"expectations[{idx}]: ")
                )
                if "passed" in item and not isinstance(item["passed"], bool):
                    errors.append(f"expectations[{idx}].passed must be a boolean")
    return errors
def validate_benchmark(data: dict) -> list[str]:
    """Validate benchmark.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["benchmark"]["required"])
    if "metadata" in data:
        errors.extend(
            validate_required_fields(
                data["metadata"],
                SCHEMAS["benchmark"]["metadata"]["required"],
                "metadata: ",
            )
        )
    if "runs" in data:
        runs = data["runs"]
        if not isinstance(runs, list):
            errors.append("'runs' must be a list")
        else:
            run_required = SCHEMAS["benchmark"]["runs_item"]["required"]
            for idx, run in enumerate(runs):
                errors.extend(
                    validate_required_fields(run, run_required, f"runs[{idx}]: ")
                )
                # Only the two known configurations are allowed.
                if "configuration" in run and run["configuration"] not in ["with_skill", "without_skill"]:
                    errors.append(f"runs[{idx}].configuration must be 'with_skill' or 'without_skill'")
    if "run_summary" in data:
        summary_required = SCHEMAS["benchmark"]["run_summary_config"]["required"]
        for config in ["with_skill", "without_skill"]:
            if config in data["run_summary"]:
                errors.extend(
                    validate_required_fields(
                        data["run_summary"][config],
                        summary_required,
                        f"run_summary.{config}: ",
                    )
                )
    return errors
def validate_metrics(data: dict) -> list[str]:
    """Validate metrics.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["metrics"]["required"])
    # Type checks for the two required fields.
    type_checks = (
        ("tool_calls", dict, "'tool_calls' must be an object"),
        ("total_tool_calls", int, "'total_tool_calls' must be an integer"),
    )
    for field, expected_type, message in type_checks:
        if field in data and not isinstance(data[field], expected_type):
            errors.append(message)
    return errors
def validate_timing(data: dict) -> list[str]:
    """Validate timing.json structure; return a list of error strings."""
    errors = []
    # Require at least one of the known timing fields to be present.
    if not any(field in data for field in SCHEMAS["timing"]["optional"]):
        errors.append("timing.json should have at least one timing field")
    # Duration fields, when present, must be numeric.
    duration_fields = (
        "executor_duration_seconds",
        "grader_duration_seconds",
        "total_duration_seconds",
    )
    for field in duration_fields:
        if field in data and not isinstance(data[field], (int, float)):
            errors.append(f"'{field}' must be a number")
    return errors
def validate_history(data: dict) -> list[str]:
    """Validate history.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["history"]["required"])
    if "iterations" not in data:
        return errors
    iterations = data["iterations"]
    if not isinstance(iterations, list):
        errors.append("'iterations' must be a list")
        return errors
    item_required = SCHEMAS["history"]["iterations_item"]["required"]
    valid_results = ["baseline", "won", "lost", "tie"]
    for idx, iteration in enumerate(iterations):
        errors.extend(
            validate_required_fields(iteration, item_required, f"iterations[{idx}]: ")
        )
        if "grading_result" in iteration and iteration["grading_result"] not in valid_results:
            errors.append(f"iterations[{idx}].grading_result must be one of: {valid_results}")
    return errors
def validate_comparison(data: dict) -> list[str]:
    """Validate comparison.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["comparison"]["required"])
    # Winner is a forced choice between the two sides, or a tie.
    if "winner" in data and data["winner"] not in ["A", "B", "TIE"]:
        errors.append("'winner' must be 'A', 'B', or 'TIE'")
    if "rubric" in data:
        side_required = SCHEMAS["comparison"]["rubric_side"]["required"]
        for side in ["A", "B"]:
            if side in data["rubric"]:
                errors.extend(
                    validate_required_fields(
                        data["rubric"][side], side_required, f"rubric.{side}: "
                    )
                )
    return errors
def validate_analysis(data: dict) -> list[str]:
    """Validate analysis.json structure; return a list of error strings."""
    errors = validate_required_fields(data, SCHEMAS["analysis"]["required"])
    if "improvement_suggestions" not in data:
        return errors
    suggestions = data["improvement_suggestions"]
    if not isinstance(suggestions, list):
        errors.append("'improvement_suggestions' must be a list")
        return errors
    item_required = SCHEMAS["analysis"]["improvement_item"]["required"]
    for idx, suggestion in enumerate(suggestions):
        errors.extend(
            validate_required_fields(
                suggestion, item_required, f"improvement_suggestions[{idx}]: "
            )
        )
        if "priority" in suggestion and suggestion["priority"] not in ["high", "medium", "low"]:
            errors.append(f"improvement_suggestions[{idx}].priority must be 'high', 'medium', or 'low'")
    return errors
# Dispatch table: JSON type name -> validator function. The keys double as
# the accepted values for the CLI --type flag and for infer_type() results.
VALIDATORS = {
    "evals": validate_evals,
    "grading": validate_grading,
    "benchmark": validate_benchmark,
    "metrics": validate_metrics,
    "timing": validate_timing,
    "history": validate_history,
    "comparison": validate_comparison,
    "analysis": validate_analysis,
}
def validate_file(file_path: Path, json_type: str | None = None) -> tuple[bool, list[str]]:
    """
    Validate a JSON file.

    Args:
        file_path: Path to the JSON document.
        json_type: Schema key from VALIDATORS; inferred from the filename
            when omitted.

    Returns (is_valid, errors) tuple.
    """
    # Check file exists
    if not file_path.exists():
        return False, [f"File not found: {file_path}"]
    # Load JSON
    try:
        with open(file_path) as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return False, [f"Invalid JSON: {e}"]
    # Infer type if not provided
    if json_type is None:
        json_type = infer_type(file_path)
        if json_type is None:
            return False, ["Could not infer JSON type from filename. Use --type to specify."]
    if json_type not in VALIDATORS:
        return False, [f"Unknown JSON type: {json_type}. Valid types: {list(VALIDATORS.keys())}"]
    # Run the schema-specific validator.
    errors = VALIDATORS[json_type](data)
    return len(errors) == 0, errors
def main():
    """CLI entry point: validate one JSON file, exit 0 if valid else 1."""
    parser = argparse.ArgumentParser(
        description="Validate JSON files produced by skill-creator-edge"
    )
    parser.add_argument("file", type=Path, help="Path to the JSON file to validate")
    parser.add_argument(
        "--type", "-t",
        choices=list(VALIDATORS.keys()),
        help="JSON type (inferred from filename if not specified)"
    )
    args = parser.parse_args()
    is_valid, errors = validate_file(args.file, args.type)
    if not is_valid:
        print(f"{args.file} has {len(errors)} error(s):")
        for error in errors:
            print(f" - {error}")
        sys.exit(1)
    print(f"{args.file} is valid")
    sys.exit(0)
# Script entry point.
if __name__ == "__main__":
    main()