Files
claude-plugins-official/plugins/skill-creator/skills/skill-creator/scripts/prepare_eval.py
2026-02-17 17:07:09 -08:00

280 lines
8.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Prepare environment for running a skill eval.
Usage:
prepare_eval.py <skill-path> <eval-id> --output-dir <dir> [--no-skill]
Examples:
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/with-skill
prepare_eval.py skills/public/pdf 0 --output-dir workspace/eval-001/without-skill --no-skill
Options:
<skill-path> Path to the skill directory
<eval-id> Index of the eval in evals/evals.json (0-based)
--output-dir Directory to prepare for the eval run
--no-skill If set, do not copy the skill (for baseline comparison)
"""
import json
import os
import shutil
import sys
from pathlib import Path
def is_writable(path: Path) -> bool:
    """Check whether files can be created inside the directory *path*.

    The probe is a uniquely named temporary file, so an existing entry in
    the directory (for example a real ``.write_test`` file) is never touched
    or removed, and concurrent probes cannot collide with each other.

    Args:
        path: Directory to probe.

    Returns:
        True if a file could be created in ``path``; False if the attempt
        raised ``OSError`` (missing directory, permission denied, ...).
    """
    # Imported locally so this function does not rely on the module's
    # top-level import list.
    import tempfile

    try:
        # The temporary file is removed as soon as the handle is closed.
        with tempfile.TemporaryFile(dir=path):
            pass
    except OSError:
        # PermissionError is a subclass of OSError; one clause covers both.
        return False
    return True
def load_evals(skill_path: Path) -> list:
    """Load the eval definitions from ``<skill_path>/evals/evals.json``.

    Two layouts are accepted: a bare JSON list of evals, or a JSON object
    whose ``"evals"`` key holds that list.

    Args:
        skill_path: Path to the skill directory.

    Returns:
        The list of eval entries as stored in the file.

    Raises:
        FileNotFoundError: If ``evals/evals.json`` does not exist.
        ValueError: If the top-level JSON value is neither a list nor an
            object with an ``"evals"`` key, if ``"evals"`` is not a list,
            or if the file is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    evals_file = skill_path / "evals" / "evals.json"
    if not evals_file.exists():
        raise FileNotFoundError(f"Evals file not found: {evals_file}")

    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding, which differs between platforms.
    with open(evals_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Handle both formats: plain list or wrapped in object with "evals" key
    if isinstance(data, dict) and "evals" in data:
        evals = data["evals"]
    elif isinstance(data, list):
        evals = data
    else:
        raise ValueError(
            f"Expected evals.json to contain a list or object with 'evals' key, "
            f"got {type(data).__name__}"
        )

    # Fail here with a clear message rather than later when the value is
    # indexed as if it were a list.
    if not isinstance(evals, list):
        raise ValueError(
            f"Expected 'evals' in evals.json to be a list, "
            f"got {type(evals).__name__}"
        )
    return evals
def get_eval(evals: list, eval_id: int) -> dict:
    """Return the eval at position *eval_id* (0-based index).

    Negative indices are rejected rather than wrapping around from the end
    of the list.

    Args:
        evals: List of eval entries.
        eval_id: 0-based index of the wanted entry.

    Returns:
        The entry stored at ``evals[eval_id]``.

    Raises:
        IndexError: If ``eval_id`` is negative or not less than
            ``len(evals)``.
    """
    if not 0 <= eval_id < len(evals):
        if not evals:
            # Avoid the nonsensical "(0--1)" range for an empty list.
            raise IndexError(f"Eval ID {eval_id} out of range (no evals defined)")
        raise IndexError(f"Eval ID {eval_id} out of range (0-{len(evals)-1})")
    return evals[eval_id]
def normalize_eval(eval_data: dict) -> dict:
    """Normalize one eval entry to ``{"prompt", "files", "assertions"}``.

    Handles both the design doc format (prompt, files, assertions)
    and the gym format (query, files, expected_behavior).

    Args:
        eval_data: Raw eval entry.

    Returns:
        A dict with three keys:
        ``"prompt"`` - the value of ``"prompt"``, or of ``"query"`` when
        ``"prompt"`` is missing or empty;
        ``"files"`` - list of file references (empty when absent or null;
        a single string is wrapped in a one-element list);
        ``"assertions"`` - the ``"assertions"`` value when present,
        otherwise strings derived from ``"expected_behavior"``.

    Raises:
        ValueError: If neither ``"prompt"`` nor ``"query"`` has a non-empty
            value.
    """
    # Get the prompt (can be "prompt" or "query")
    prompt = eval_data.get("prompt") or eval_data.get("query")
    if not prompt:
        raise ValueError("Eval must have either 'prompt' or 'query' field")

    # An explicit JSON null must behave like a missing key; otherwise the
    # caller would try to iterate over None.
    files = eval_data.get("files") or []
    if isinstance(files, str):
        # A bare string would otherwise be iterated character by character.
        files = [files]

    # Get assertions - can be "assertions" (list of strings)
    # or "expected_behavior" (list of strings or objects)
    assertions = eval_data.get("assertions")
    if assertions is None:
        assertions = []
        for item in eval_data.get("expected_behavior") or []:
            if isinstance(item, str):
                assertions.append(item)
            elif isinstance(item, dict):
                # Flatten a structured assertion into one readable line:
                # "Assertion type: <type> - key=value - key=value".
                assertion_type = item.get("assertion", "unknown")
                parts = [f"Assertion type: {assertion_type}"]
                parts.extend(
                    f"{key}={value}"
                    for key, value in item.items()
                    if key != "assertion"
                )
                assertions.append(" - ".join(parts))
            # Items of any other type are skipped.

    return {
        "prompt": prompt,
        "files": files,
        "assertions": assertions,
    }
def _find_input_file(skill_path: Path, file_ref: str):
    """Return the first existing location for *file_ref*, or None if none exists."""
    candidates = (
        # Files can be relative to skill's evals/files/ directory
        skill_path / "evals" / "files" / file_ref,
        # ... or relative to evals/ directly
        skill_path / "evals" / file_ref,
        # ... or relative to the skill root
        skill_path / file_ref,
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)


def _stage_input_files(skill_path: Path, file_refs: list, inputs_dir: Path) -> list:
    """Copy each referenced input into *inputs_dir*; return the staged paths as strings."""
    staged_files = []
    for file_ref in file_refs:
        source = _find_input_file(skill_path, file_ref)
        if source is None:
            # Best effort: a missing input is reported but does not abort.
            print(f" Warning: File not found: {file_ref}")
            continue
        # Only the final path component is kept for the destination name.
        dest = inputs_dir / Path(file_ref).name
        if source.is_file():
            shutil.copy2(source, dest)
        else:
            shutil.copytree(source, dest, dirs_exist_ok=True)
        staged_files.append(str(dest))
        print(f" Staged: {file_ref} -> {dest}")
    return staged_files


def _copy_skill(skill_path: Path, skill_copy_path: Path, output_dir: Path) -> None:
    """Replace *skill_copy_path* with a fresh copy of the skill directory."""
    if skill_copy_path.exists():
        shutil.rmtree(skill_copy_path)

    # When the output directory lives inside the skill directory (for
    # example <skill>/workspace/...), copying the skill would descend into
    # the copy being written and recurse until the path becomes too long.
    # Skip the output directory and the copy destination while walking.
    excluded = {output_dir, skill_copy_path}

    def _ignore(directory, names):
        base = Path(directory)
        return [name for name in names if base / name in excluded]

    shutil.copytree(skill_path, skill_copy_path, ignore=_ignore, dirs_exist_ok=True)


def prepare_eval(skill_path: Path, eval_id: int, output_dir: Path, no_skill: bool = False) -> dict:
    """
    Prepare the environment for running an eval.

    Creates ``output_dir`` with ``inputs/`` and ``outputs/`` subdirectories,
    stages the eval's input files into ``inputs/``, copies the skill to
    ``output_dir/skill`` unless ``no_skill`` is set, and writes
    ``eval_metadata.json`` into ``output_dir``. Progress is printed to stdout.

    Args:
        skill_path: Path to the skill directory
        eval_id: Index of the eval in evals.json
        output_dir: Directory to prepare for the eval run
        no_skill: If True, do not copy the skill (for baseline comparison)

    Returns:
        Dictionary with eval metadata (the same content that is written to
        ``eval_metadata.json``). ``"skill_path"`` is None when ``no_skill``
        is True.

    Raises:
        FileNotFoundError: If the skill directory or its ``SKILL.md`` does
            not exist. Errors raised while loading, selecting or normalizing
            the eval propagate unchanged.
    """
    skill_path = Path(skill_path).resolve()
    output_dir = Path(output_dir).resolve()

    # Validate skill path
    if not skill_path.exists():
        raise FileNotFoundError(f"Skill directory not found: {skill_path}")
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        raise FileNotFoundError(f"SKILL.md not found in {skill_path}")

    # Load and get the specific eval before touching the filesystem, so an
    # invalid eval leaves no partially prepared output directory behind.
    evals = load_evals(skill_path)
    eval_data = get_eval(evals, eval_id)
    normalized = normalize_eval(eval_data)

    # Create the directory layout
    output_dir.mkdir(parents=True, exist_ok=True)
    inputs_dir = output_dir / "inputs"
    inputs_dir.mkdir(exist_ok=True)
    staged_files = _stage_input_files(skill_path, normalized["files"], inputs_dir)

    outputs_dir = output_dir / "outputs"
    outputs_dir.mkdir(exist_ok=True)

    # Copy skill if not --no-skill
    skill_copy_path = None
    if not no_skill:
        skill_copy = output_dir / "skill"
        _copy_skill(skill_path, skill_copy, output_dir)
        skill_copy_path = str(skill_copy)
        print(f" Copied skill to: {skill_copy_path}")

    # Build metadata
    metadata = {
        "eval_id": eval_id,
        "prompt": normalized["prompt"],
        "assertions": normalized["assertions"],
        "input_files": staged_files,
        "skill_path": skill_copy_path,
        "output_dir": str(output_dir),
        "inputs_dir": str(inputs_dir),
        "outputs_dir": str(outputs_dir),
        "no_skill": no_skill,
        "original_skill_path": str(skill_path),
    }

    # Write metadata file
    metadata_path = output_dir / "eval_metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f" Wrote: {metadata_path}")
    return metadata
def determine_workspace(skill_path: Path) -> Path:
    """
    Pick the workspace directory to use for a skill.

    A writable skill directory gets ``<skill>/workspace``. Otherwise the
    workspace is ``<base>/<skill-name>-workspace``, where ``<base>`` is the
    nearest directory at or above the skill (the filesystem root itself is
    not considered) that contains a ``.git`` entry, or the skill's parent
    directory when no such directory is found.
    """
    skill_dir = Path(skill_path).resolve()

    if is_writable(skill_dir):
        return skill_dir / "workspace"

    # Candidates run from the skill directory upwards; the final element of
    # the chain is the filesystem root, which is left out of the search.
    ancestors = [skill_dir, *skill_dir.parents][:-1]
    repo_root = next(
        (candidate for candidate in ancestors if (candidate / ".git").exists()),
        None,
    )

    # No repository marker found: fall back to the skill's parent directory.
    base_dir = skill_dir.parent if repo_root is None else repo_root
    return base_dir / f"{skill_dir.name}-workspace"
def main():
    """Command-line entry point.

    Reads ``<skill-path> <eval-id> --output-dir <dir> [--no-skill]`` from
    ``sys.argv``, prepares the eval and prints a short summary. Flags may
    appear before, between or after the two positional arguments.

    Exits with status 1 after printing the module usage text or an
    ``Error: ...`` line when the arguments are invalid or preparation fails.
    """
    args = sys.argv[1:]
    if len(args) < 4 or "--output-dir" not in args:
        print(__doc__)
        sys.exit(1)

    # Find --output-dir value
    output_dir_idx = args.index("--output-dir")
    if output_dir_idx + 1 >= len(args):
        print("Error: --output-dir requires a value")
        sys.exit(1)
    output_dir = args[output_dir_idx + 1]
    no_skill = "--no-skill" in args

    # Positionals are whatever remains once the flags and the --output-dir
    # value are set aside, so their position relative to the flags is free.
    flag_slots = (output_dir_idx, output_dir_idx + 1)
    positionals = [
        arg
        for idx, arg in enumerate(args)
        if idx not in flag_slots and arg != "--no-skill"
    ]
    if len(positionals) < 2:
        print(__doc__)
        sys.exit(1)
    skill_path, raw_eval_id = positionals[:2]

    try:
        eval_id = int(raw_eval_id)
    except ValueError:
        # Report a bad eval id like every other error instead of a traceback.
        print(f"Error: <eval-id> must be an integer, got {raw_eval_id!r}")
        sys.exit(1)

    print(f"Preparing eval {eval_id} for skill: {skill_path}")
    print(f"Output directory: {output_dir}")
    if no_skill:
        print("Mode: without skill (baseline)")
    else:
        print("Mode: with skill")
    print()

    # Top-level boundary: any failure becomes a one-line error and exit 1.
    try:
        metadata = prepare_eval(
            skill_path=Path(skill_path),
            eval_id=eval_id,
            output_dir=Path(output_dir),
            no_skill=no_skill,
        )
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

    prompt = metadata["prompt"]
    # Long prompts are cut to 60 characters in the summary.
    preview = f"{prompt[:60]}..." if len(prompt) > 60 else prompt
    print()
    print("Eval prepared successfully!")
    print(f" Prompt: {preview}")
    print(f" Assertions: {len(metadata['assertions'])}")
    print(f" Input files: {len(metadata['input_files'])}")


if __name__ == "__main__":
    main()