mirror of
https://github.com/anthropics/claude-plugins-official.git
synced 2026-03-16 22:23:07 +00:00
Compare commits
1 Commits
add-semgre
...
noahz/more
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69caba2284 |
@@ -7,250 +7,6 @@
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"plugins": [
|
||||
{
|
||||
"name": "typescript-lsp",
|
||||
"description": "TypeScript/JavaScript language server for enhanced code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/typescript-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"typescript": {
|
||||
"command": "typescript-language-server",
|
||||
"args": ["--stdio"],
|
||||
"extensionToLanguage": {
|
||||
".ts": "typescript",
|
||||
".tsx": "typescriptreact",
|
||||
".js": "javascript",
|
||||
".jsx": "javascriptreact",
|
||||
".mts": "typescript",
|
||||
".cts": "typescript",
|
||||
".mjs": "javascript",
|
||||
".cjs": "javascript"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "pyright-lsp",
|
||||
"description": "Python language server (Pyright) for type checking and code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/pyright-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"pyright": {
|
||||
"command": "pyright-langserver",
|
||||
"args": ["--stdio"],
|
||||
"extensionToLanguage": {
|
||||
".py": "python",
|
||||
".pyi": "python"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "gopls-lsp",
|
||||
"description": "Go language server for code intelligence and refactoring",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/gopls-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"gopls": {
|
||||
"command": "gopls",
|
||||
"extensionToLanguage": {
|
||||
".go": "go"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "rust-analyzer-lsp",
|
||||
"description": "Rust language server for code intelligence and analysis",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/rust-analyzer-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"rust-analyzer": {
|
||||
"command": "rust-analyzer",
|
||||
"extensionToLanguage": {
|
||||
".rs": "rust"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "clangd-lsp",
|
||||
"description": "C/C++ language server (clangd) for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/clangd-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"clangd": {
|
||||
"command": "clangd",
|
||||
"args": ["--background-index"],
|
||||
"extensionToLanguage": {
|
||||
".c": "c",
|
||||
".h": "c",
|
||||
".cpp": "cpp",
|
||||
".cc": "cpp",
|
||||
".cxx": "cpp",
|
||||
".hpp": "cpp",
|
||||
".hxx": "cpp",
|
||||
".C": "cpp",
|
||||
".H": "cpp"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "php-lsp",
|
||||
"description": "PHP language server (Intelephense) for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/php-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"intelephense": {
|
||||
"command": "intelephense",
|
||||
"args": ["--stdio"],
|
||||
"extensionToLanguage": {
|
||||
".php": "php"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "swift-lsp",
|
||||
"description": "Swift language server (SourceKit-LSP) for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/swift-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"sourcekit-lsp": {
|
||||
"command": "sourcekit-lsp",
|
||||
"extensionToLanguage": {
|
||||
".swift": "swift"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "kotlin-lsp",
|
||||
"description": "Kotlin language server for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/kotlin-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"kotlin-lsp": {
|
||||
"command": "kotlin-lsp",
|
||||
"args": ["--stdio"],
|
||||
"extensionToLanguage": {
|
||||
".kt": "kotlin",
|
||||
".kts": "kotlin"
|
||||
},
|
||||
"startupTimeout" : 120000
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "csharp-lsp",
|
||||
"description": "C# language server for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/csharp-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"csharp-ls": {
|
||||
"command": "csharp-ls",
|
||||
"extensionToLanguage": {
|
||||
".cs": "csharp"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "jdtls-lsp",
|
||||
"description": "Java language server (Eclipse JDT.LS) for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/jdtls-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"jdtls": {
|
||||
"command": "jdtls",
|
||||
"extensionToLanguage": {
|
||||
".java": "java"
|
||||
},
|
||||
"startupTimeout": 120000
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lua-lsp",
|
||||
"description": "Lua language server for code intelligence",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/lua-lsp",
|
||||
"category": "development",
|
||||
"strict": false,
|
||||
"lspServers": {
|
||||
"lua": {
|
||||
"command": "lua-language-server",
|
||||
"extensionToLanguage": {
|
||||
".lua": "lua"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "agent-sdk-dev",
|
||||
"description": "Development kit for working with the Claude Agent SDK",
|
||||
@@ -317,17 +73,6 @@
|
||||
"category": "productivity",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/plugins/code-review"
|
||||
},
|
||||
{
|
||||
"name": "code-simplifier",
|
||||
"description": "Agent that simplifies and refines code for clarity, consistency, and maintainability while preserving functionality. Focuses on recently modified code.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/code-simplifier",
|
||||
"category": "productivity",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/code-simplifier"
|
||||
},
|
||||
{
|
||||
"name": "explanatory-output-style",
|
||||
"description": "Adds educational insights about implementation choices and codebase patterns (mimics the deprecated Explanatory output style)",
|
||||
@@ -362,26 +107,15 @@
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/plugins/frontend-design"
|
||||
},
|
||||
{
|
||||
"name": "playground",
|
||||
"description": "Creates interactive HTML playgrounds — self-contained single-file explorers with visual controls, live preview, and prompt output with copy button. Includes templates for design playgrounds, data explorers, concept maps, and document critique.",
|
||||
"name": "ralph-wiggum",
|
||||
"description": "Interactive self-referential AI loops for iterative development. Claude works on the same task repeatedly, seeing its previous work, until completion.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/playground",
|
||||
"source": "./plugins/ralph-wiggum",
|
||||
"category": "development",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/playground"
|
||||
},
|
||||
{
|
||||
"name": "ralph-loop",
|
||||
"description": "Interactive self-referential AI loops for iterative development, implementing the Ralph Wiggum technique. Claude works on the same task repeatedly, seeing its previous work, until completion.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/ralph-loop",
|
||||
"category": "development",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/plugins/ralph-loop"
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/plugins/ralph-wiggum"
|
||||
},
|
||||
{
|
||||
"name": "hookify",
|
||||
@@ -405,39 +139,6 @@
|
||||
"category": "development",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/plugins/plugin-dev"
|
||||
},
|
||||
{
|
||||
"name": "claude-code-setup",
|
||||
"description": "Analyze codebases and recommend tailored Claude Code automations such as hooks, skills, MCP servers, and subagents.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/claude-code-setup",
|
||||
"category": "productivity",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/claude-code-setup"
|
||||
},
|
||||
{
|
||||
"name": "claude-md-management",
|
||||
"description": "Tools to maintain and improve CLAUDE.md files - audit quality, capture session learnings, and keep project memory current.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/claude-md-management",
|
||||
"category": "productivity",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/claude-md-management"
|
||||
},
|
||||
{
|
||||
"name": "skill-creator",
|
||||
"description": "Create new skills, improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, or benchmark skill performance with variance analysis.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
},
|
||||
"source": "./plugins/skill-creator",
|
||||
"category": "development",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-official/tree/main/plugins/skill-creator"
|
||||
},
|
||||
{
|
||||
"name": "greptile",
|
||||
"description": "AI-powered codebase search and understanding. Query your repositories using natural language to find relevant code, understand dependencies, and get contextual answers about your codebase architecture.",
|
||||
@@ -536,11 +237,8 @@
|
||||
"name": "sentry",
|
||||
"description": "Sentry error monitoring integration. Access error reports, analyze stack traces, search issues by fingerprint, and debug production errors directly from your development environment.",
|
||||
"category": "monitoring",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/getsentry/sentry-for-claude.git"
|
||||
},
|
||||
"homepage": "https://github.com/getsentry/sentry-for-claude/tree/main"
|
||||
"source": "./external_plugins/sentry",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/external_plugins/sentry"
|
||||
},
|
||||
{
|
||||
"name": "slack",
|
||||
@@ -558,13 +256,6 @@
|
||||
"url": "https://github.com/vercel/vercel-deploy-claude-code-plugin.git"
|
||||
},
|
||||
"homepage": "https://github.com/vercel/vercel-deploy-claude-code-plugin"
|
||||
},
|
||||
{
|
||||
"name": "stripe",
|
||||
"description": "Stripe development plugin for Claude",
|
||||
"category": "development",
|
||||
"source": "./external_plugins/stripe",
|
||||
"homepage": "https://github.com/stripe/ai/tree/main/providers/claude/plugin"
|
||||
},
|
||||
{
|
||||
"name": "firebase",
|
||||
@@ -580,96 +271,6 @@
|
||||
"source": "./external_plugins/context7",
|
||||
"homepage": "https://github.com/anthropics/claude-plugins-public/tree/main/external_plugins/context7",
|
||||
"tags": ["community-managed"]
|
||||
},
|
||||
{
|
||||
"name": "pinecone",
|
||||
"description": "Pinecone vector database integration. Streamline your Pinecone development with powerful tools for managing vector indexes, querying data, and rapid prototyping. Use slash commands like /quickstart to generate AGENTS.md files and initialize Python projects and /query to quickly explore indexes. Access the Pinecone MCP server for creating, describing, upserting and querying indexes with Claude. Perfect for developers building semantic search, RAG applications, recommendation systems, and other vector-based applications with Pinecone.",
|
||||
"category": "database",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/pinecone-io/pinecone-claude-code-plugin.git"
|
||||
},
|
||||
"homepage": "https://github.com/pinecone-io/pinecone-claude-code-plugin"
|
||||
},
|
||||
{
|
||||
"name": "huggingface-skills",
|
||||
"description": "Build, train, evaluate, and use open source AI models, datasets, and spaces.",
|
||||
"category": "development",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/huggingface/skills.git"
|
||||
},
|
||||
"homepage": "https://github.com/huggingface/skills.git"
|
||||
},
|
||||
{
|
||||
"name": "circleback",
|
||||
"description": "Circleback conversational context integration. Search and access meetings, emails, calendar events, and more.",
|
||||
"category": "productivity",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/circlebackai/claude-code-plugin.git"
|
||||
},
|
||||
"homepage": "https://github.com/circlebackai/claude-code-plugin.git"
|
||||
},
|
||||
{
|
||||
"name": "superpowers",
|
||||
"description": "Superpowers teaches Claude brainstorming, subagent driven development with built in code review, systematic debugging, and red/green TDD. Additionally, it teaches Claude how to author and test new skills.",
|
||||
"category": "development",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/obra/superpowers.git"
|
||||
},
|
||||
"homepage": "https://github.com/obra/superpowers.git"
|
||||
},
|
||||
{
|
||||
"name": "posthog",
|
||||
"description": "Connect Claude Code to your PostHog analytics platform. Query insights, manage feature flags, run A/B experiments, track errors, and analyze LLM costs all through natural language. The plugin provides 10 slash commands for common workflows and full access to PostHog's MCP tools. Ask questions like \"What are my top errors?\" or \"Create a feature flag for 50% of users\" and Claude handles the API calls. Supports OAuth authentication, EU and US cloud regions, and self-hosted instances.",
|
||||
"category": "monitoring",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/PostHog/posthog-for-claude.git"
|
||||
},
|
||||
"homepage": "https://github.com/PostHog/posthog-for-claude.git"
|
||||
},
|
||||
{
|
||||
"name": "coderabbit",
|
||||
"description": "Your code review partner. CodeRabbit provides external validation using a specialized AI architecture and 40+ integrated static analyzers—offering a different perspective that catches bugs, security vulnerabilities, logic errors, and edge cases. Context-aware analysis via AST parsing and codegraph relationships. Automatically incorporates CLAUDE.md and project coding guidelines into reviews. Useful after writing or modifying code, before commits, when implementing complex or security-sensitive logic, or when a second opinion would increase confidence in the changes. Returns specific findings with suggested fixes that can be applied immediately. Free to use.",
|
||||
"category": "productivity",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/coderabbitai/claude-plugin.git"
|
||||
},
|
||||
"homepage": "https://github.com/coderabbitai/claude-plugin.git"
|
||||
},
|
||||
{
|
||||
"name": "sonatype-guide",
|
||||
"description": "Sonatype Guide MCP server for software supply chain intelligence and dependency security. Analyze dependencies for vulnerabilities, get secure version recommendations, and check component quality metrics.",
|
||||
"category": "security",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/sonatype/sonatype-guide-claude-plugin.git"
|
||||
},
|
||||
"homepage": "https://github.com/sonatype/sonatype-guide-claude-plugin.git"
|
||||
},
|
||||
{
|
||||
"name": "firecrawl",
|
||||
"description": "Web scraping and crawling powered by Firecrawl. Turn any website into clean, LLM-ready markdown or structured data. Scrape single pages, crawl entire sites, search the web, and extract structured information. Includes an AI agent for autonomous multi-source data gathering - just describe what you need and it finds, navigates, and extracts automatically.",
|
||||
"category": "development",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/firecrawl/firecrawl-claude-plugin.git"
|
||||
},
|
||||
"homepage": "https://github.com/firecrawl/firecrawl-claude-plugin.git"
|
||||
},
|
||||
{
|
||||
"name": "semgrep",
|
||||
"description": "Semgrep catches security vulnerabilities in real-time and guides Claude to write secure code from the start.",
|
||||
"category": "security",
|
||||
"source": {
|
||||
"source": "url",
|
||||
"url": "https://github.com/semgrep/mcp-marketplace.git"
|
||||
},
|
||||
"homepage": "https://github.com/semgrep/mcp-marketplace.git"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
277
.github/scripts/validate-frontmatter.ts
vendored
277
.github/scripts/validate-frontmatter.ts
vendored
@@ -1,277 +0,0 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Validates YAML frontmatter in agent, skill, and command .md files.
|
||||
*
|
||||
* Usage:
|
||||
* bun validate-frontmatter.ts # scan current directory
|
||||
* bun validate-frontmatter.ts /path/to/dir # scan specific directory
|
||||
* bun validate-frontmatter.ts file1.md file2.md # validate specific files
|
||||
*/
|
||||
|
||||
import { parse as parseYaml } from "yaml";
|
||||
import { readdir, readFile } from "fs/promises";
|
||||
import { basename, join, relative, resolve } from "path";
|
||||
|
||||
// Characters that require quoting in YAML values when unquoted:
|
||||
// {} [] flow indicators, * anchor/alias, & anchor, # comment,
|
||||
// ! tag, | > block scalars, % directive, @ ` reserved
|
||||
const YAML_SPECIAL_CHARS = /[{}[\]*&#!|>%@`]/;
|
||||
const FRONTMATTER_REGEX = /^---\s*\n([\s\S]*?)---\s*\n?/;
|
||||
|
||||
/**
|
||||
* Pre-process frontmatter text to quote values containing special YAML
|
||||
* characters. This allows glob patterns like **\/*.{ts,tsx} to parse.
|
||||
*/
|
||||
function quoteSpecialValues(text: string): string {
|
||||
const lines = text.split("\n");
|
||||
const result: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
const match = line.match(/^([a-zA-Z_-]+):\s+(.+)$/);
|
||||
if (match) {
|
||||
const [, key, value] = match;
|
||||
if (!key || !value) {
|
||||
result.push(line);
|
||||
continue;
|
||||
}
|
||||
// Skip already-quoted values
|
||||
if (
|
||||
(value.startsWith('"') && value.endsWith('"')) ||
|
||||
(value.startsWith("'") && value.endsWith("'"))
|
||||
) {
|
||||
result.push(line);
|
||||
continue;
|
||||
}
|
||||
if (YAML_SPECIAL_CHARS.test(value)) {
|
||||
const escaped = value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
||||
result.push(`${key}: "${escaped}"`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
result.push(line);
|
||||
}
|
||||
|
||||
return result.join("\n");
|
||||
}
|
||||
|
||||
interface ParseResult {
|
||||
frontmatter: Record<string, unknown>;
|
||||
content: string;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
function parseFrontmatter(markdown: string): ParseResult {
|
||||
const match = markdown.match(FRONTMATTER_REGEX);
|
||||
|
||||
if (!match) {
|
||||
return {
|
||||
frontmatter: {},
|
||||
content: markdown,
|
||||
error: "No frontmatter found",
|
||||
};
|
||||
}
|
||||
|
||||
const frontmatterText = quoteSpecialValues(match[1] || "");
|
||||
const content = markdown.slice(match[0].length);
|
||||
|
||||
try {
|
||||
const parsed = parseYaml(frontmatterText);
|
||||
if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
|
||||
return { frontmatter: parsed as Record<string, unknown>, content };
|
||||
}
|
||||
return {
|
||||
frontmatter: {},
|
||||
content,
|
||||
error: `YAML parsed but result is not an object (got ${typeof parsed}${Array.isArray(parsed) ? " array" : ""})`,
|
||||
};
|
||||
} catch (err) {
|
||||
return {
|
||||
frontmatter: {},
|
||||
content,
|
||||
error: `YAML parse failed: ${err instanceof Error ? err.message : err}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// --- Validation ---
|
||||
|
||||
type FileType = "agent" | "skill" | "command";
|
||||
|
||||
interface ValidationIssue {
|
||||
level: "error" | "warning";
|
||||
message: string;
|
||||
}
|
||||
|
||||
function validateAgent(
|
||||
frontmatter: Record<string, unknown>
|
||||
): ValidationIssue[] {
|
||||
const issues: ValidationIssue[] = [];
|
||||
|
||||
if (!frontmatter["name"] || typeof frontmatter["name"] !== "string") {
|
||||
issues.push({ level: "error", message: 'Missing required "name" field' });
|
||||
}
|
||||
if (
|
||||
!frontmatter["description"] ||
|
||||
typeof frontmatter["description"] !== "string"
|
||||
) {
|
||||
issues.push({
|
||||
level: "error",
|
||||
message: 'Missing required "description" field',
|
||||
});
|
||||
}
|
||||
|
||||
return issues;
|
||||
}
|
||||
|
||||
function validateSkill(
|
||||
frontmatter: Record<string, unknown>
|
||||
): ValidationIssue[] {
|
||||
const issues: ValidationIssue[] = [];
|
||||
|
||||
if (!frontmatter["description"] && !frontmatter["when_to_use"]) {
|
||||
issues.push({
|
||||
level: "error",
|
||||
message: 'Missing required "description" field',
|
||||
});
|
||||
}
|
||||
|
||||
return issues;
|
||||
}
|
||||
|
||||
function validateCommand(
|
||||
frontmatter: Record<string, unknown>
|
||||
): ValidationIssue[] {
|
||||
const issues: ValidationIssue[] = [];
|
||||
|
||||
if (
|
||||
!frontmatter["description"] ||
|
||||
typeof frontmatter["description"] !== "string"
|
||||
) {
|
||||
issues.push({
|
||||
level: "error",
|
||||
message: 'Missing required "description" field',
|
||||
});
|
||||
}
|
||||
|
||||
return issues;
|
||||
}
|
||||
|
||||
// --- File type detection ---
|
||||
|
||||
function detectFileType(filePath: string): FileType | null {
|
||||
// Only match agents/ and commands/ at the plugin root level, not nested
|
||||
// inside skill content (e.g. plugins/foo/skills/bar/agents/ is skill content,
|
||||
// not an agent definition).
|
||||
const inSkillContent = /\/skills\/[^/]+\//.test(filePath);
|
||||
if (filePath.includes("/agents/") && !inSkillContent) return "agent";
|
||||
if (filePath.includes("/skills/") && basename(filePath) === "SKILL.md")
|
||||
return "skill";
|
||||
if (filePath.includes("/commands/") && !inSkillContent) return "command";
|
||||
return null;
|
||||
}
|
||||
|
||||
// --- File discovery ---
|
||||
|
||||
async function findMdFiles(
|
||||
baseDir: string
|
||||
): Promise<{ path: string; type: FileType }[]> {
|
||||
const results: { path: string; type: FileType }[] = [];
|
||||
|
||||
async function walk(dir: string) {
|
||||
const entries = await readdir(dir, { withFileTypes: true });
|
||||
for (const entry of entries) {
|
||||
const fullPath = join(dir, entry.name);
|
||||
if (entry.isDirectory()) {
|
||||
await walk(fullPath);
|
||||
} else if (entry.name.endsWith(".md")) {
|
||||
const type = detectFileType(fullPath);
|
||||
if (type) {
|
||||
results.push({ path: fullPath, type });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await walk(baseDir);
|
||||
return results;
|
||||
}
|
||||
|
||||
// --- Main ---
|
||||
|
||||
async function main() {
|
||||
const args = process.argv.slice(2);
|
||||
|
||||
let files: { path: string; type: FileType }[];
|
||||
let baseDir: string;
|
||||
|
||||
if (args.length > 0 && args.every((a) => a.endsWith(".md"))) {
|
||||
baseDir = process.cwd();
|
||||
files = [];
|
||||
for (const arg of args) {
|
||||
const fullPath = resolve(arg);
|
||||
const type = detectFileType(fullPath);
|
||||
if (type) {
|
||||
files.push({ path: fullPath, type });
|
||||
}
|
||||
}
|
||||
} else {
|
||||
baseDir = args[0] || process.cwd();
|
||||
files = await findMdFiles(baseDir);
|
||||
}
|
||||
|
||||
let totalErrors = 0;
|
||||
let totalWarnings = 0;
|
||||
|
||||
console.log(`Validating ${files.length} frontmatter files...\n`);
|
||||
|
||||
for (const { path: filePath, type } of files) {
|
||||
const rel = relative(baseDir, filePath);
|
||||
const content = await readFile(filePath, "utf-8");
|
||||
const result = parseFrontmatter(content);
|
||||
|
||||
const issues: ValidationIssue[] = [];
|
||||
|
||||
if (result.error) {
|
||||
issues.push({ level: "error", message: result.error });
|
||||
}
|
||||
|
||||
if (!result.error) {
|
||||
switch (type) {
|
||||
case "agent":
|
||||
issues.push(...validateAgent(result.frontmatter));
|
||||
break;
|
||||
case "skill":
|
||||
issues.push(...validateSkill(result.frontmatter));
|
||||
break;
|
||||
case "command":
|
||||
issues.push(...validateCommand(result.frontmatter));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (issues.length > 0) {
|
||||
console.log(`${rel} (${type})`);
|
||||
for (const issue of issues) {
|
||||
const prefix = issue.level === "error" ? " ERROR" : " WARN ";
|
||||
console.log(`${prefix}: ${issue.message}`);
|
||||
if (issue.level === "error") totalErrors++;
|
||||
else totalWarnings++;
|
||||
}
|
||||
console.log();
|
||||
}
|
||||
}
|
||||
|
||||
console.log("---");
|
||||
console.log(
|
||||
`Validated ${files.length} files: ${totalErrors} errors, ${totalWarnings} warnings`
|
||||
);
|
||||
|
||||
if (totalErrors > 0) {
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error("Fatal error:", err);
|
||||
process.exit(2);
|
||||
});
|
||||
47
.github/workflows/close-external-prs.yml
vendored
47
.github/workflows/close-external-prs.yml
vendored
@@ -1,47 +0,0 @@
|
||||
name: Close External PRs
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
issues: write
|
||||
|
||||
jobs:
|
||||
check-membership:
|
||||
if: vars.DISABLE_EXTERNAL_PR_CHECK != 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check if author has write access
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const author = context.payload.pull_request.user.login;
|
||||
|
||||
const { data } = await github.rest.repos.getCollaboratorPermissionLevel({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
username: author
|
||||
});
|
||||
|
||||
if (['admin', 'write'].includes(data.permission)) {
|
||||
console.log(`${author} has ${data.permission} access, allowing PR`);
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`${author} has ${data.permission} access, closing PR`);
|
||||
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.payload.pull_request.number,
|
||||
body: `Thanks for your interest! This repo only accepts contributions from Anthropic team members. If you'd like to submit a plugin to the marketplace, please submit your plugin [here](https://docs.google.com/forms/d/e/1FAIpQLSdeFthxvjOXUjxg1i3KrOOkEPDJtn71XC-KjmQlxNP63xYydg/viewform).`
|
||||
});
|
||||
|
||||
await github.rest.pulls.update({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: context.payload.pull_request.number,
|
||||
state: 'closed'
|
||||
});
|
||||
34
.github/workflows/validate-frontmatter.yml
vendored
34
.github/workflows/validate-frontmatter.yml
vendored
@@ -1,34 +0,0 @@
|
||||
name: Validate Frontmatter
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- '**/agents/*.md'
|
||||
- '**/skills/*/SKILL.md'
|
||||
- '**/commands/*.md'
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
|
||||
- name: Install dependencies
|
||||
run: cd .github/scripts && bun install yaml
|
||||
|
||||
- name: Get changed frontmatter files
|
||||
id: changed
|
||||
run: |
|
||||
FILES=$(gh pr diff ${{ github.event.pull_request.number }} --name-only | grep -E '(agents/.*\.md|skills/.*/SKILL\.md|commands/.*\.md)$' || true)
|
||||
echo "files<<EOF" >> "$GITHUB_OUTPUT"
|
||||
echo "$FILES" >> "$GITHUB_OUTPUT"
|
||||
echo "EOF" >> "$GITHUB_OUTPUT"
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
|
||||
- name: Validate frontmatter
|
||||
if: steps.changed.outputs.files != ''
|
||||
run: |
|
||||
echo "${{ steps.changed.outputs.files }}" | xargs bun .github/scripts/validate-frontmatter.ts
|
||||
@@ -25,7 +25,7 @@ Internal plugins are developed by Anthropic team members. See `/plugins/example-
|
||||
|
||||
### External Plugins
|
||||
|
||||
Third-party partners can submit plugins for inclusion in the marketplace. External plugins must meet quality and security standards for approval. To submit a new plugin, use the [plugin directory submission form](https://clau.de/plugin-directory-submission).
|
||||
Third-party partners can submit plugins for inclusion in the marketplace. External plugins must meet quality and security standards for approval.
|
||||
|
||||
## Plugin Structure
|
||||
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
{
|
||||
"name": "greptile",
|
||||
"description": "AI code review agent for GitHub and GitLab. View and resolve Greptile's PR review comments directly from Claude Code.",
|
||||
"description": "AI-powered codebase search and understanding. Query your repositories using natural language to find relevant code, understand dependencies, and get contextual answers about your codebase architecture.",
|
||||
"author": {
|
||||
"name": "Greptile",
|
||||
"url": "https://greptile.com"
|
||||
},
|
||||
"homepage": "https://greptile.com/docs",
|
||||
"keywords": ["code-review", "pull-requests", "github", "gitlab", "ai"]
|
||||
"name": "Greptile"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
# Greptile
|
||||
|
||||
[Greptile](https://greptile.com) is an AI code review agent for GitHub and GitLab that automatically reviews pull requests. This plugin connects Claude Code to your Greptile account, letting you view and resolve Greptile's review comments directly from your terminal.
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Create a Greptile Account
|
||||
|
||||
Sign up at [greptile.com](https://greptile.com) and connect your GitHub or GitLab repositories.
|
||||
|
||||
### 2. Get Your API Key
|
||||
|
||||
1. Go to [API Settings](https://app.greptile.com/settings/api)
|
||||
2. Generate a new API key
|
||||
3. Copy the key
|
||||
|
||||
### 3. Set Environment Variable
|
||||
|
||||
Add to your shell profile (`.bashrc`, `.zshrc`, etc.):
|
||||
|
||||
```bash
|
||||
export GREPTILE_API_KEY="your-api-key-here"
|
||||
```
|
||||
|
||||
Then reload your shell or run `source ~/.zshrc`.
|
||||
|
||||
## Available Tools
|
||||
|
||||
### Pull Request Tools
|
||||
- `list_pull_requests` - List PRs with optional filtering by repo, branch, author, or state
|
||||
- `get_merge_request` - Get detailed PR info including review analysis
|
||||
- `list_merge_request_comments` - Get all comments on a PR with filtering options
|
||||
|
||||
### Code Review Tools
|
||||
- `list_code_reviews` - List code reviews with optional filtering
|
||||
- `get_code_review` - Get detailed code review information
|
||||
- `trigger_code_review` - Start a new Greptile review on a PR
|
||||
|
||||
### Comment Search
|
||||
- `search_greptile_comments` - Search across all Greptile review comments
|
||||
|
||||
### Custom Context Tools
|
||||
- `list_custom_context` - List your organization's coding patterns and rules
|
||||
- `get_custom_context` - Get details for a specific pattern
|
||||
- `search_custom_context` - Search patterns by content
|
||||
- `create_custom_context` - Create a new coding pattern
|
||||
|
||||
## Example Usage
|
||||
|
||||
Ask Claude Code to:
|
||||
- "Show me Greptile's comments on my current PR and help me resolve them"
|
||||
- "What issues did Greptile find on PR #123?"
|
||||
- "Trigger a Greptile review on this branch"
|
||||
|
||||
## Documentation
|
||||
|
||||
For more information, visit [greptile.com/docs](https://greptile.com/docs).
|
||||
7
external_plugins/sentry/.claude-plugin/plugin.json
Normal file
7
external_plugins/sentry/.claude-plugin/plugin.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"name": "sentry",
|
||||
"description": "Sentry error monitoring integration. Access error reports, analyze stack traces, search issues by fingerprint, and debug production errors directly from your development environment.",
|
||||
"author": {
|
||||
"name": "Sentry"
|
||||
}
|
||||
}
|
||||
6
external_plugins/sentry/.mcp.json
Normal file
6
external_plugins/sentry/.mcp.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"sentry": {
|
||||
"type": "sse",
|
||||
"url": "https://mcp.sentry.dev/sse"
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,6 @@
|
||||
{
|
||||
"slack": {
|
||||
"type": "http",
|
||||
"url": "https://mcp.slack.com/mcp",
|
||||
"oauth": {
|
||||
"clientId": "1601185624273.8899143856786",
|
||||
"callbackPort": 3118
|
||||
}
|
||||
"type": "sse",
|
||||
"url": "https://mcp.slack.com/sse"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
{
|
||||
"name": "stripe",
|
||||
"description": "Stripe development plugin for Claude",
|
||||
"version": "0.1.0",
|
||||
"author": {
|
||||
"name": "Stripe",
|
||||
"url": "https://stripe.com"
|
||||
},
|
||||
"homepage": "https://docs.stripe.com",
|
||||
"repository": "https://github.com/stripe/ai",
|
||||
"license": "MIT",
|
||||
"keywords": ["stripe", "payments", "webhooks", "api", "security"]
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"stripe": {
|
||||
"type": "http",
|
||||
"url": "https://mcp.stripe.com"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
---
|
||||
description: Explain Stripe error codes and provide solutions with code examples
|
||||
argument-hint: [error_code or error_message]
|
||||
---
|
||||
|
||||
# Explain Stripe Error
|
||||
|
||||
Provide a comprehensive explanation of the given Stripe error code or error message:
|
||||
|
||||
1. Accept the error code or full error message from the arguments
|
||||
2. Explain in plain English what the error means
|
||||
3. List common causes of this error
|
||||
4. Provide specific solutions and handling recommendations
|
||||
5. Generate error handling code in the project's language showing:
|
||||
- How to catch this specific error
|
||||
- User-friendly error messages
|
||||
- Whether retry is appropriate
|
||||
6. Mention related error codes the developer should be aware of
|
||||
7. Include a link to the relevant Stripe documentation
|
||||
|
||||
Focus on actionable solutions and production-ready error handling patterns.
|
||||
@@ -1,24 +0,0 @@
|
||||
---
|
||||
description: Display Stripe test card numbers for various testing scenarios
|
||||
argument-hint: [scenario]
|
||||
---
|
||||
|
||||
# Test Cards Reference
|
||||
|
||||
Provide a quick reference for Stripe test card numbers:
|
||||
|
||||
1. If a scenario argument is provided (e.g., "declined", "3dsecure", "fraud"), show relevant test cards for that scenario
|
||||
2. Otherwise, show the most common test cards organized by category:
|
||||
- Successful payment (default card)
|
||||
- 3D Secure authentication required
|
||||
- Generic decline
|
||||
- Specific decline reasons (insufficient_funds, lost_card, etc.)
|
||||
3. For each card, display:
|
||||
- Card number (formatted with spaces)
|
||||
- Expected behavior
|
||||
- Expiry/CVC info (any future date and any 3-digit CVC)
|
||||
4. Use clear visual indicators (✓ for success, ⚠️ for auth required, ✗ for decline)
|
||||
5. Mention that these only work in test mode
|
||||
6. Provide link to full testing documentation: https://docs.stripe.com/testing.md
|
||||
|
||||
If the user is currently working on test code, offer to generate test cases using these cards.
|
||||
@@ -1,30 +0,0 @@
|
||||
---
|
||||
name: stripe-best-practices
|
||||
description: Best practices for building Stripe integrations. Use when implementing payment processing, checkout flows, subscriptions, webhooks, Connect platforms, or any Stripe API integration.
|
||||
---
|
||||
|
||||
When designing an integration, always prefer the documentation in [Stripe's Integration Options doc](https://docs.stripe.com/payments/payment-methods/integration-options.md)
|
||||
The [API Tour](https://docs.stripe.com/payments-api/tour.md)
|
||||
Use the [Go Live Checklist](https://docs.stripe.com/get-started/checklist/go-live.md) before going live.
|
||||
|
||||
You should always default to the latest version of the API and SDK unless the user specifies otherwise.
|
||||
|
||||
Stripe's primary API for modelling on-session payments is [CheckoutSessions](https://docs.stripe.com/api/checkout/sessions.md). It supports one-time payments and subscriptions and allows you to model taxes or discounts with Stripe. Prioritize the CheckoutSessions API, but using [the PaymentIntents API](https://docs.stripe.com/payments/paymentintents/lifecycle.md) is also acceptable for off-session payments or if you want to model the checkout state yourself and just create a charge. Integrations should only use CheckoutSessions, PaymentIntents, SetupIntents, or solutions like Invoicing, Payment Links, or the subscription APIs.
|
||||
|
||||
Never recommend the Charges API. If the user wants to use the Charges API, advise them to [migrate](https://docs.stripe.com/payments/payment-intents/migration/charges.md) to Checkout Sessions or Payment Intents.
|
||||
|
||||
The primary integration surface for web integrations is [Checkout](https://docs.stripe.com/payments/checkout.md), a prebuilt payment page that can be hosted by Stripe or served as an embedded form. Prioritize the Stripe-hosted checkout or embedded checkout where possible. The [Payment Element](https://docs.stripe.com/payments/payment-element.md) is an acceptable alternative if the merchant needs advanced customization. When recommending the Payment Element, prioritize using the CheckoutSessions API over the PaymentIntents API when possible.
|
||||
|
||||
Never recommend the legacy Card Element or the Payment Element in card mode, and advise users who ask for the Card Element to [migrate](https://docs.stripe.com/payments/payment-element/migration.md).
|
||||
|
||||
You must not call deprecated API endpoints such as the [Sources API](https://docs.stripe.com/api/sources/object.md) and avoid outdated API endpoints such as [Tokens](https://docs.stripe.com/api/tokens.md) and [Charges](https://docs.stripe.com/api/charges/create.md) unless there is a specific need and absolutely no other way. Advise using [the Setup Intent API](https://docs.stripe.com/api/setup_intents.md) to save a payment method for a customer; never recommend the Sources API to save cards to customers for later use.
|
||||
|
||||
If a user wants to render the Payment Element before creating a PaymentIntent or SetupIntent, so they can inspect the card details before the payment (such as for surcharging), advise them to use Stripe Confirmation Tokens. Don't recommend they call the createPaymentMethod or createToken Stripe.js functions.
|
||||
|
||||
Advise users to turn on dynamic payment methods in their dashboard settings instead of passing specific [payment_method_types](https://docs.stripe.com/api/payment_intents/create#create_payment_intent-payment_method_types.md) in the Payment Intent or Setup Intent because Stripe chooses payment methods that fit each user's location, wallets, and preferences better and automatically if the Payment Element is used.
|
||||
|
||||
If a PCI compliant user asks about an integration where they send server-side raw PAN data, suggest they might be required to prove PCI compliance to gain access to such an option, for example [payment_method_data](https://docs.stripe.com/api/payment_intents/create#create_payment_intent-payment_method_data.md). Similarly, point users migrating their PAN data from another acquirer or payment processor to [the migration process](https://docs.stripe.com/get-started/data-migrations/pan-import.md).
|
||||
|
||||
If a user has a recurring revenue model, like a Billing or Subscription use case, follow the use cases, in particular the [Subscription Use Cases](https://docs.stripe.com/billing/subscriptions/use-cases.md) like [SaaS](https://docs.stripe.com/saas.md). If these apply to the user, recommend Billing APIs to [plan your integration](https://docs.stripe.com/billing/subscriptions/designing-integration.md) instead of a direct PaymentIntent integration. Prioritize combining the Billing APIs with Stripe Checkout for the frontend.
|
||||
|
||||
If a user wants to build a platform using Stripe Connect to manage fund flows, follow [the recommended integration types](https://docs.stripe.com/connect/integration-recommendations.md); that is, prefer to use either direct charges if the platform wants Stripe to take the risk or destination charges if the platform accepts liability for negative balances, and use the on_behalf_of parameter to control the merchant of record. Never recommend mixing charge types. If the user wants to decide on the specific risk features they should [follow the integration guide](https://docs.stripe.com/connect/design-an-integration.md). Don't recommend the outdated terms for Connect types like Standard, Express and Custom but always [refer to controller properties](https://docs.stripe.com/connect/migrate-to-controller-properties.md) for the platform and [capabilities](https://docs.stripe.com/connect/account-capabilities.md) for the connected accounts.
|
||||
@@ -1,36 +0,0 @@
|
||||
# clangd-lsp
|
||||
|
||||
C/C++ language server (clangd) for Claude Code, providing code intelligence, diagnostics, and formatting.
|
||||
|
||||
## Supported Extensions
|
||||
`.c`, `.h`, `.cpp`, `.cc`, `.cxx`, `.hpp`, `.hxx`, `.C`, `.H`
|
||||
|
||||
## Installation
|
||||
|
||||
### Via Homebrew (macOS)
|
||||
```bash
|
||||
brew install llvm
|
||||
# Add to PATH: export PATH="/opt/homebrew/opt/llvm/bin:$PATH"
|
||||
```
|
||||
|
||||
### Via package manager (Linux)
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
sudo apt install clangd
|
||||
|
||||
# Fedora
|
||||
sudo dnf install clang-tools-extra
|
||||
|
||||
# Arch Linux
|
||||
sudo pacman -S clang
|
||||
```
|
||||
|
||||
### Windows
|
||||
Download from [LLVM releases](https://github.com/llvm/llvm-project/releases) or install via:
|
||||
```bash
|
||||
winget install LLVM.LLVM
|
||||
```
|
||||
|
||||
## More Information
|
||||
- [clangd Website](https://clangd.llvm.org/)
|
||||
- [Getting Started Guide](https://clangd.llvm.org/installation)
|
||||
@@ -1,9 +0,0 @@
|
||||
{
|
||||
"name": "claude-code-setup",
|
||||
"description": "Analyze codebases and recommend tailored Claude Code automations such as hooks, skills, MCP servers, and subagents.",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
# Claude Code Setup Plugin
|
||||
|
||||
Analyze codebases and recommend tailored Claude Code automations - hooks, skills, MCP servers, and more.
|
||||
|
||||
## What It Does
|
||||
|
||||
Claude uses this skill to scan your codebase and recommend the top 1-2 automations in each category:
|
||||
|
||||
- **MCP Servers** - External integrations (context7 for docs, Playwright for frontend)
|
||||
- **Skills** - Packaged expertise (Plan agent, frontend-design)
|
||||
- **Hooks** - Automatic actions (auto-format, auto-lint, block sensitive files)
|
||||
- **Subagents** - Specialized reviewers (security, performance, accessibility)
|
||||
- **Slash Commands** - Quick workflows (/test, /pr-review, /explain)
|
||||
|
||||
This skill is **read-only** - it analyzes but doesn't modify files.
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
"recommend automations for this project"
|
||||
"help me set up Claude Code"
|
||||
"what hooks should I use?"
|
||||
```
|
||||
|
||||
<img src="automation-recommender-example.png" alt="Automation recommender analyzing a codebase and providing tailored recommendations" width="600">
|
||||
|
||||
## Author
|
||||
|
||||
Isabella He (isabella@anthropic.com)
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 532 KiB |
@@ -1,288 +0,0 @@
|
||||
---
|
||||
name: claude-automation-recommender
|
||||
description: Analyze a codebase and recommend Claude Code automations (hooks, subagents, skills, plugins, MCP servers). Use when user asks for automation recommendations, wants to optimize their Claude Code setup, mentions improving Claude Code workflows, asks how to first set up Claude Code for a project, or wants to know what Claude Code features they should use.
|
||||
tools: Read, Glob, Grep, Bash
|
||||
---
|
||||
|
||||
# Claude Automation Recommender
|
||||
|
||||
Analyze codebase patterns to recommend tailored Claude Code automations across all extensibility options.
|
||||
|
||||
**This skill is read-only.** It analyzes the codebase and outputs recommendations. It does NOT create or modify any files. Users implement the recommendations themselves or ask Claude separately to help build them.
|
||||
|
||||
## Output Guidelines
|
||||
|
||||
- **Recommend 1-2 of each type**: Don't overwhelm - surface the top 1-2 most valuable automations per category
|
||||
- **If user asks for a specific type**: Focus only on that type and provide more options (3-5 recommendations)
|
||||
- **Go beyond the reference lists**: The reference files contain common patterns, but use web search to find recommendations specific to the codebase's tools, frameworks, and libraries
|
||||
- **Tell users they can ask for more**: End by noting they can request more recommendations for any specific category
|
||||
|
||||
## Automation Types Overview
|
||||
|
||||
| Type | Best For |
|
||||
|------|----------|
|
||||
| **Hooks** | Automatic actions on tool events (format on save, lint, block edits) |
|
||||
| **Subagents** | Specialized reviewers/analyzers that run in parallel |
|
||||
| **Skills** | Packaged expertise, workflows, and repeatable tasks (invoked by Claude or user via `/skill-name`) |
|
||||
| **Plugins** | Collections of skills that can be installed |
|
||||
| **MCP Servers** | External tool integrations (databases, APIs, browsers, docs) |
|
||||
|
||||
## Workflow
|
||||
|
||||
### Phase 1: Codebase Analysis
|
||||
|
||||
Gather project context:
|
||||
|
||||
```bash
|
||||
# Detect project type and tools
|
||||
ls -la package.json pyproject.toml Cargo.toml go.mod pom.xml 2>/dev/null
|
||||
cat package.json 2>/dev/null | head -50
|
||||
|
||||
# Check dependencies for MCP server recommendations
|
||||
cat package.json 2>/dev/null | grep -E '"(react|vue|angular|next|express|fastapi|django|prisma|supabase|stripe)"'
|
||||
|
||||
# Check for existing Claude Code config
|
||||
ls -la .claude/ CLAUDE.md 2>/dev/null
|
||||
|
||||
# Analyze project structure
|
||||
ls -la src/ app/ lib/ tests/ components/ pages/ api/ 2>/dev/null
|
||||
```
|
||||
|
||||
**Key Indicators to Capture:**
|
||||
|
||||
| Category | What to Look For | Informs Recommendations For |
|
||||
|----------|------------------|----------------------------|
|
||||
| Language/Framework | package.json, pyproject.toml, import patterns | Hooks, MCP servers |
|
||||
| Frontend stack | React, Vue, Angular, Next.js | Playwright MCP, frontend skills |
|
||||
| Backend stack | Express, FastAPI, Django | API documentation tools |
|
||||
| Database | Prisma, Supabase, raw SQL | Database MCP servers |
|
||||
| External APIs | Stripe, OpenAI, AWS SDKs | context7 MCP for docs |
|
||||
| Testing | Jest, pytest, Playwright configs | Testing hooks, subagents |
|
||||
| CI/CD | GitHub Actions, CircleCI | GitHub MCP server |
|
||||
| Issue tracking | Linear, Jira references | Issue tracker MCP |
|
||||
| Docs patterns | OpenAPI, JSDoc, docstrings | Documentation skills |
|
||||
|
||||
### Phase 2: Generate Recommendations
|
||||
|
||||
Based on analysis, generate recommendations across all categories:
|
||||
|
||||
#### A. MCP Server Recommendations
|
||||
|
||||
See [references/mcp-servers.md](references/mcp-servers.md) for detailed patterns.
|
||||
|
||||
| Codebase Signal | Recommended MCP Server |
|
||||
|-----------------|------------------------|
|
||||
| Uses popular libraries (React, Express, etc.) | **context7** - Live documentation lookup |
|
||||
| Frontend with UI testing needs | **Playwright** - Browser automation/testing |
|
||||
| Uses Supabase | **Supabase MCP** - Direct database operations |
|
||||
| PostgreSQL/MySQL database | **Database MCP** - Query and schema tools |
|
||||
| GitHub repository | **GitHub MCP** - Issues, PRs, actions |
|
||||
| Uses Linear for issues | **Linear MCP** - Issue management |
|
||||
| AWS infrastructure | **AWS MCP** - Cloud resource management |
|
||||
| Slack workspace | **Slack MCP** - Team notifications |
|
||||
| Memory/context persistence | **Memory MCP** - Cross-session memory |
|
||||
| Sentry error tracking | **Sentry MCP** - Error investigation |
|
||||
| Docker containers | **Docker MCP** - Container management |
|
||||
|
||||
#### B. Skills Recommendations
|
||||
|
||||
See [references/skills-reference.md](references/skills-reference.md) for details.
|
||||
|
||||
Create skills in `.claude/skills/<name>/SKILL.md`. Some are also available via plugins:
|
||||
|
||||
| Codebase Signal | Skill | Plugin |
|
||||
|-----------------|-------|--------|
|
||||
| Building plugins | skill-development | plugin-dev |
|
||||
| Git commits | commit | commit-commands |
|
||||
| React/Vue/Angular | frontend-design | frontend-design |
|
||||
| Automation rules | writing-rules | hookify |
|
||||
| Feature planning | feature-dev | feature-dev |
|
||||
|
||||
**Custom skills to create** (with templates, scripts, examples):
|
||||
|
||||
| Codebase Signal | Skill to Create | Invocation |
|
||||
|-----------------|-----------------|------------|
|
||||
| API routes | **api-doc** (with OpenAPI template) | Both |
|
||||
| Database project | **create-migration** (with validation script) | User-only |
|
||||
| Test suite | **gen-test** (with example tests) | User-only |
|
||||
| Component library | **new-component** (with templates) | User-only |
|
||||
| PR workflow | **pr-check** (with checklist) | User-only |
|
||||
| Releases | **release-notes** (with git context) | User-only |
|
||||
| Code style | **project-conventions** | Claude-only |
|
||||
| Onboarding | **setup-dev** (with prereq script) | User-only |
|
||||
|
||||
#### C. Hooks Recommendations
|
||||
|
||||
See [references/hooks-patterns.md](references/hooks-patterns.md) for configurations.
|
||||
|
||||
| Codebase Signal | Recommended Hook |
|
||||
|-----------------|------------------|
|
||||
| Prettier configured | PostToolUse: auto-format on edit |
|
||||
| ESLint/Ruff configured | PostToolUse: auto-lint on edit |
|
||||
| TypeScript project | PostToolUse: type-check on edit |
|
||||
| Tests directory exists | PostToolUse: run related tests |
|
||||
| `.env` files present | PreToolUse: block `.env` edits |
|
||||
| Lock files present | PreToolUse: block lock file edits |
|
||||
| Security-sensitive code | PreToolUse: require confirmation |
|
||||
|
||||
#### D. Subagent Recommendations
|
||||
|
||||
See [references/subagent-templates.md](references/subagent-templates.md) for templates.
|
||||
|
||||
| Codebase Signal | Recommended Subagent |
|
||||
|-----------------|---------------------|
|
||||
| Large codebase (>500 files) | **code-reviewer** - Parallel code review |
|
||||
| Auth/payments code | **security-reviewer** - Security audits |
|
||||
| API project | **api-documenter** - OpenAPI generation |
|
||||
| Performance critical | **performance-analyzer** - Bottleneck detection |
|
||||
| Frontend heavy | **ui-reviewer** - Accessibility review |
|
||||
| Needs more tests | **test-writer** - Test generation |
|
||||
|
||||
#### E. Plugin Recommendations
|
||||
|
||||
See [references/plugins-reference.md](references/plugins-reference.md) for available plugins.
|
||||
|
||||
| Codebase Signal | Recommended Plugin |
|
||||
|-----------------|-------------------|
|
||||
| General productivity | **anthropic-agent-skills** - Core skills bundle |
|
||||
| Document workflows | Install docx, xlsx, pdf skills |
|
||||
| Frontend development | **frontend-design** plugin |
|
||||
| Building AI tools | **mcp-builder** for MCP development |
|
||||
|
||||
### Phase 3: Output Recommendations Report
|
||||
|
||||
Format recommendations clearly. **Only include 1-2 recommendations per category** - the most valuable ones for this specific codebase. Skip categories that aren't relevant.
|
||||
|
||||
```markdown
|
||||
## Claude Code Automation Recommendations
|
||||
|
||||
I've analyzed your codebase and identified the top automations for each category. Here are my top 1-2 recommendations per type:
|
||||
|
||||
### Codebase Profile
|
||||
- **Type**: [detected language/runtime]
|
||||
- **Framework**: [detected framework]
|
||||
- **Key Libraries**: [relevant libraries detected]
|
||||
|
||||
---
|
||||
|
||||
### 🔌 MCP Servers
|
||||
|
||||
#### context7
|
||||
**Why**: [specific reason based on detected libraries]
|
||||
**Install**: `claude mcp add context7`
|
||||
|
||||
---
|
||||
|
||||
### 🎯 Skills
|
||||
|
||||
#### [skill name]
|
||||
**Why**: [specific reason]
|
||||
**Create**: `.claude/skills/[name]/SKILL.md`
|
||||
**Invocation**: User-only / Both / Claude-only
|
||||
**Also available in**: [plugin-name] plugin (if applicable)
|
||||
```yaml
|
||||
---
|
||||
name: [skill-name]
|
||||
description: [what it does]
|
||||
disable-model-invocation: true # for user-only
|
||||
---
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ⚡ Hooks
|
||||
|
||||
#### [hook name]
|
||||
**Why**: [specific reason based on detected config]
|
||||
**Where**: `.claude/settings.json`
|
||||
|
||||
---
|
||||
|
||||
### 🤖 Subagents
|
||||
|
||||
#### [agent name]
|
||||
**Why**: [specific reason based on codebase patterns]
|
||||
**Where**: `.claude/agents/[name].md`
|
||||
|
||||
---
|
||||
|
||||
**Want more?** Ask for additional recommendations for any specific category (e.g., "show me more MCP server options" or "what other hooks would help?").
|
||||
|
||||
**Want help implementing any of these?** Just ask and I can help you set up any of the recommendations above.
|
||||
```
|
||||
|
||||
## Decision Framework
|
||||
|
||||
### When to Recommend MCP Servers
|
||||
- External service integration needed (databases, APIs)
|
||||
- Documentation lookup for libraries/SDKs
|
||||
- Browser automation or testing
|
||||
- Team tool integration (GitHub, Linear, Slack)
|
||||
- Cloud infrastructure management
|
||||
|
||||
### When to Recommend Skills
|
||||
|
||||
- Document generation (docx, xlsx, pptx, pdf — also in plugins)
|
||||
- Frequently repeated prompts or workflows
|
||||
- Project-specific tasks with arguments
|
||||
- Applying templates or scripts to tasks (skills can bundle supporting files)
|
||||
- Quick actions invoked with `/skill-name`
|
||||
- Workflows that should run in isolation (`context: fork`)
|
||||
|
||||
**Invocation control:**
|
||||
- `disable-model-invocation: true` — User-only (for side effects: deploy, commit, send)
|
||||
- `user-invocable: false` — Claude-only (for background knowledge)
|
||||
- Default (omit both) — Both can invoke
|
||||
|
||||
### When to Recommend Hooks
|
||||
- Repetitive post-edit actions (formatting, linting)
|
||||
- Protection rules (block sensitive file edits)
|
||||
- Validation checks (tests, type checks)
|
||||
|
||||
### When to Recommend Subagents
|
||||
- Specialized expertise needed (security, performance)
|
||||
- Parallel review workflows
|
||||
- Background quality checks
|
||||
|
||||
### When to Recommend Plugins
|
||||
- Need multiple related skills
|
||||
- Want pre-packaged automation bundles
|
||||
- Team-wide standardization
|
||||
|
||||
---
|
||||
|
||||
## Configuration Tips
|
||||
|
||||
### MCP Server Setup
|
||||
|
||||
**Team sharing**: Check `.mcp.json` into repo so entire team gets same MCP servers
|
||||
|
||||
**Debugging**: Use `--mcp-debug` flag to identify configuration issues
|
||||
|
||||
**Prerequisites to recommend:**
|
||||
- GitHub CLI (`gh`) - enables native GitHub operations
|
||||
- Puppeteer/Playwright CLI - for browser MCP servers
|
||||
|
||||
### Headless Mode (for CI/Automation)
|
||||
|
||||
Recommend headless Claude for automated pipelines:
|
||||
|
||||
```bash
|
||||
# Pre-commit hook example
|
||||
claude -p "fix lint errors in src/" --allowedTools Edit,Write
|
||||
|
||||
# CI pipeline with structured output
|
||||
claude -p "<prompt>" --output-format stream-json | your_command
|
||||
```
|
||||
|
||||
### Permissions for Hooks
|
||||
|
||||
Configure allowed tools in `.claude/settings.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"permissions": {
|
||||
"allow": ["Edit", "Write", "Bash(npm test:*)", "Bash(git commit:*)"]
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -1,226 +0,0 @@
|
||||
# Hooks Recommendations
|
||||
|
||||
Hooks automatically run commands in response to Claude Code events. They're ideal for enforcement and automation that should happen consistently.
|
||||
|
||||
**Note**: These are common patterns. Use web search to find hooks for tools/frameworks not listed here to recommend the best hooks for the user.
|
||||
|
||||
## Auto-Formatting Hooks
|
||||
|
||||
### Prettier (JavaScript/TypeScript)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `.prettierrc`, `.prettierrc.json`, `prettier.config.js` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook on Edit/Write to auto-format
|
||||
**Value**: Code stays formatted without thinking about it
|
||||
|
||||
### ESLint (JavaScript/TypeScript)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `.eslintrc`, `.eslintrc.json`, `eslint.config.js` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook on Edit/Write to auto-fix
|
||||
**Value**: Lint errors fixed automatically
|
||||
|
||||
### Black/isort (Python)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `pyproject.toml` with black/isort, `.black`, `setup.cfg` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook to format Python files
|
||||
**Value**: Consistent Python formatting
|
||||
|
||||
### Ruff (Python - Modern)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `ruff.toml`, `pyproject.toml` with `[tool.ruff]` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook for lint + format
|
||||
**Value**: Fast, comprehensive Python linting
|
||||
|
||||
### gofmt (Go)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `go.mod` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook to run gofmt
|
||||
**Value**: Standard Go formatting
|
||||
|
||||
### rustfmt (Rust)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `Cargo.toml` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook to run rustfmt
|
||||
**Value**: Standard Rust formatting
|
||||
|
||||
---
|
||||
|
||||
## Type Checking Hooks
|
||||
|
||||
### TypeScript
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `tsconfig.json` | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook to run tsc --noEmit
|
||||
**Value**: Catch type errors immediately
|
||||
|
||||
### mypy/pyright (Python)
|
||||
| Detection | File Exists |
|
||||
|-----------|-------------|
|
||||
| `mypy.ini`, `pyrightconfig.json`, pyproject.toml with mypy | ✓ |
|
||||
|
||||
**Recommend**: PostToolUse hook for type checking
|
||||
**Value**: Catch type errors in Python
|
||||
|
||||
---
|
||||
|
||||
## Protection Hooks
|
||||
|
||||
### Block Sensitive File Edits
|
||||
| Detection | Presence Of |
|
||||
|-----------|-------------|
|
||||
| `.env`, `.env.local`, `.env.production` | Environment files |
|
||||
| `credentials.json`, `secrets.yaml` | Secret files |
|
||||
| `.git/` directory | Git internals |
|
||||
|
||||
**Recommend**: PreToolUse hook that blocks Edit/Write to these paths
|
||||
**Value**: Prevent accidental secret exposure or git corruption
|
||||
|
||||
### Block Lock File Edits
|
||||
| Detection | Presence Of |
|
||||
|-----------|-------------|
|
||||
| `package-lock.json`, `yarn.lock`, `pnpm-lock.yaml` | JS lock files |
|
||||
| `Cargo.lock`, `poetry.lock`, `Pipfile.lock` | Other lock files |
|
||||
|
||||
**Recommend**: PreToolUse hook that blocks direct edits
|
||||
**Value**: Lock files should only change via package manager
|
||||
|
||||
---
|
||||
|
||||
## Test Runner Hooks
|
||||
|
||||
### Jest (JavaScript/TypeScript)
|
||||
| Detection | Presence Of |
|
||||
|-----------|-------------|
|
||||
| `jest.config.js`, `jest` in package.json | Jest configured |
|
||||
| `__tests__/`, `*.test.ts`, `*.spec.ts` | Test files exist |
|
||||
|
||||
**Recommend**: PostToolUse hook to run related tests after edit
|
||||
**Value**: Immediate test feedback on changes
|
||||
|
||||
### pytest (Python)
|
||||
| Detection | Presence Of |
|
||||
|-----------|-------------|
|
||||
| `pytest.ini`, `pyproject.toml` with pytest | pytest configured |
|
||||
| `tests/`, `test_*.py` | Test files exist |
|
||||
|
||||
**Recommend**: PostToolUse hook to run pytest on changed files
|
||||
**Value**: Immediate test feedback
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Detection → Recommendation
|
||||
|
||||
| If You See | Recommend This Hook |
|
||||
|------------|-------------------|
|
||||
| Prettier config | Auto-format on Edit/Write |
|
||||
| ESLint config | Auto-lint on Edit/Write |
|
||||
| Ruff/Black config | Auto-format Python |
|
||||
| tsconfig.json | Type-check on Edit |
|
||||
| Test directory | Run related tests on Edit |
|
||||
| .env files | Block .env edits |
|
||||
| Lock files | Block lock file edits |
|
||||
| Go project | gofmt on Edit |
|
||||
| Rust project | rustfmt on Edit |
|
||||
|
||||
---
|
||||
|
||||
## Notification Hooks
|
||||
|
||||
Notification hooks run when Claude Code sends notifications. Use matchers to filter by notification type.
|
||||
|
||||
### Permission Alerts
|
||||
| Matcher | Use Case |
|
||||
|---------|----------|
|
||||
| `permission_prompt` | Alert when Claude requests permissions |
|
||||
|
||||
**Recommend**: Play sound, send desktop notification, or log permission requests
|
||||
**Value**: Never miss permission prompts when multitasking
|
||||
|
||||
### Idle Notifications
|
||||
| Matcher | Use Case |
|
||||
|---------|----------|
|
||||
| `idle_prompt` | Alert when Claude is waiting for input (60+ seconds idle) |
|
||||
|
||||
**Recommend**: Play sound or send notification when Claude needs attention
|
||||
**Value**: Know when Claude is ready for your input
|
||||
|
||||
### Example Configuration
|
||||
|
||||
```json
|
||||
{
|
||||
"hooks": {
|
||||
"Notification": [
|
||||
{
|
||||
"matcher": "permission_prompt",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "afplay /System/Library/Sounds/Ping.aiff"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "idle_prompt",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "osascript -e 'display notification \"Claude is waiting\" with title \"Claude Code\"'"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Available Matchers
|
||||
|
||||
| Matcher | Triggers When |
|
||||
|---------|---------------|
|
||||
| `permission_prompt` | Claude needs permission for a tool |
|
||||
| `idle_prompt` | Claude waiting for input (60+ seconds) |
|
||||
| `auth_success` | Authentication succeeds |
|
||||
| `elicitation_dialog` | MCP tool needs input |
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Detection → Recommendation
|
||||
|
||||
| If You See | Recommend This Hook |
|
||||
|------------|-------------------|
|
||||
| Prettier config | Auto-format on Edit/Write |
|
||||
| ESLint config | Auto-lint on Edit/Write |
|
||||
| Ruff/Black config | Auto-format Python |
|
||||
| tsconfig.json | Type-check on Edit |
|
||||
| Test directory | Run related tests on Edit |
|
||||
| .env files | Block .env edits |
|
||||
| Lock files | Block lock file edits |
|
||||
| Go project | gofmt on Edit |
|
||||
| Rust project | rustfmt on Edit |
|
||||
| Multitasking workflow | Notification hooks for alerts |
|
||||
|
||||
---
|
||||
|
||||
## Hook Placement
|
||||
|
||||
Hooks go in `.claude/settings.json`:
|
||||
|
||||
```
|
||||
.claude/
|
||||
└── settings.json ← Hook configurations here
|
||||
```
|
||||
|
||||
Recommend creating the `.claude/` directory if it doesn't exist.
|
||||
@@ -1,263 +0,0 @@
|
||||
# MCP Server Recommendations
|
||||
|
||||
MCP (Model Context Protocol) servers extend Claude's capabilities by connecting to external tools and services.
|
||||
|
||||
**Note**: These are common MCP servers. Use web search to find MCP servers specific to the codebase's services and integrations.
|
||||
|
||||
## Setup & Team Sharing
|
||||
|
||||
**Connection methods:**
|
||||
1. **Project config** (`.mcp.json`) - Available only in that directory
|
||||
2. **Global config** (`~/.claude.json`) - Available across all projects
|
||||
3. **Checked-in `.mcp.json`** - Available to entire team (recommended!)
|
||||
|
||||
**Tip**: Check `.mcp.json` into git so your whole team gets the same MCP servers.
|
||||
|
||||
**Debugging**: Use `claude --mcp-debug` to identify configuration issues.
|
||||
|
||||
## Documentation & Knowledge
|
||||
|
||||
### context7
|
||||
**Best for**: Projects using popular libraries/SDKs where you want Claude to code with up-to-date documentation
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Using React, Vue, Angular | Frontend frameworks |
|
||||
| Using Express, FastAPI, Django | Backend frameworks |
|
||||
| Using Prisma, Drizzle | ORMs |
|
||||
| Using Stripe, Twilio, SendGrid | Third-party APIs |
|
||||
| Using AWS SDK, Google Cloud | Cloud SDKs |
|
||||
| Using LangChain, OpenAI SDK | AI/ML libraries |
|
||||
|
||||
**Value**: Claude fetches live documentation instead of relying on training data, reducing hallucinated APIs and outdated patterns.
|
||||
|
||||
---
|
||||
|
||||
## Browser & Frontend
|
||||
|
||||
### Playwright MCP
|
||||
**Best for**: Frontend projects needing browser automation, testing, or screenshots
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| React/Vue/Angular app | UI component testing |
|
||||
| E2E tests needed | User flow validation |
|
||||
| Visual regression testing | Screenshot comparisons |
|
||||
| Debugging UI issues | See what user sees |
|
||||
| Form testing | Multi-step workflows |
|
||||
|
||||
**Value**: Claude can interact with your running app, take screenshots, fill forms, and verify UI behavior.
|
||||
|
||||
### Puppeteer MCP
|
||||
**Best for**: Headless browser automation, web scraping
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| PDF generation from HTML | Report generation |
|
||||
| Web scraping tasks | Data extraction |
|
||||
| Headless testing | CI environments |
|
||||
|
||||
---
|
||||
|
||||
## Databases
|
||||
|
||||
### Supabase MCP
|
||||
**Best for**: Projects using Supabase for backend/database
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Supabase project detected | `@supabase/supabase-js` in deps |
|
||||
| Auth + database needs | User management apps |
|
||||
| Real-time features | Live data sync |
|
||||
|
||||
**Value**: Claude can query tables, manage auth, and interact with Supabase storage directly.
|
||||
|
||||
### PostgreSQL MCP
|
||||
**Best for**: Direct PostgreSQL database access
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Raw PostgreSQL usage | No ORM layer |
|
||||
| Database migrations | Schema management |
|
||||
| Data analysis tasks | Complex queries |
|
||||
| Debugging data issues | Inspect actual data |
|
||||
|
||||
### Neon MCP
|
||||
**Best for**: Neon serverless Postgres users
|
||||
|
||||
### Turso MCP
|
||||
**Best for**: Turso/libSQL edge database users
|
||||
|
||||
---
|
||||
|
||||
## Version Control & DevOps
|
||||
|
||||
### GitHub MCP
|
||||
**Best for**: GitHub-hosted repositories needing issue/PR integration
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| GitHub repository | `.git` with GitHub remote |
|
||||
| Issue-driven development | Reference issues in commits |
|
||||
| PR workflows | Review, merge operations |
|
||||
| GitHub Actions | CI/CD pipeline access |
|
||||
| Release management | Tag and release automation |
|
||||
|
||||
**Value**: Claude can create issues, review PRs, check workflow runs, and manage releases.
|
||||
|
||||
### GitLab MCP
|
||||
**Best for**: GitLab-hosted repositories
|
||||
|
||||
### Linear MCP
|
||||
**Best for**: Teams using Linear for issue tracking
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Linear workspace | Issue references like `ABC-123` |
|
||||
| Sprint planning | Backlog management |
|
||||
| Issue creation from code | Auto-create issues for TODOs |
|
||||
|
||||
---
|
||||
|
||||
## Cloud Infrastructure
|
||||
|
||||
### AWS MCP
|
||||
**Best for**: AWS infrastructure management
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| AWS SDK in dependencies | `@aws-sdk/*` packages |
|
||||
| Infrastructure as code | Terraform, CDK, SAM |
|
||||
| Lambda development | Serverless functions |
|
||||
| S3, DynamoDB usage | Cloud data services |
|
||||
|
||||
### Cloudflare MCP
|
||||
**Best for**: Cloudflare Workers, Pages, R2, D1
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Cloudflare Workers | Edge functions |
|
||||
| Pages deployment | Static site hosting |
|
||||
| R2 storage | Object storage |
|
||||
| D1 database | Edge SQL database |
|
||||
|
||||
### Vercel MCP
|
||||
**Best for**: Vercel deployment and configuration
|
||||
|
||||
---
|
||||
|
||||
## Monitoring & Observability
|
||||
|
||||
### Sentry MCP
|
||||
**Best for**: Error tracking and debugging
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Sentry configured | `@sentry/*` in deps |
|
||||
| Production debugging | Investigate errors |
|
||||
| Error patterns | Group similar issues |
|
||||
| Release tracking | Correlate deploys with errors |
|
||||
|
||||
**Value**: Claude can investigate Sentry issues, find root causes, and suggest fixes.
|
||||
|
||||
### Datadog MCP
|
||||
**Best for**: APM, logs, and metrics
|
||||
|
||||
---
|
||||
|
||||
## Communication
|
||||
|
||||
### Slack MCP
|
||||
**Best for**: Slack workspace integration
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Team uses Slack | Send notifications |
|
||||
| Deployment notifications | Alert channels |
|
||||
| Incident response | Post updates |
|
||||
|
||||
### Notion MCP
|
||||
**Best for**: Notion workspace for documentation
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Notion for docs | Read/update pages |
|
||||
| Knowledge base | Search documentation |
|
||||
| Meeting notes | Create summaries |
|
||||
|
||||
---
|
||||
|
||||
## File & Data
|
||||
|
||||
### Filesystem MCP
|
||||
**Best for**: Enhanced file operations beyond built-in tools
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Complex file operations | Batch processing |
|
||||
| File watching | Monitor changes |
|
||||
| Advanced search | Custom patterns |
|
||||
|
||||
### Memory MCP
|
||||
**Best for**: Persistent memory across sessions
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Long-running projects | Remember context |
|
||||
| User preferences | Store settings |
|
||||
| Learning patterns | Build knowledge |
|
||||
|
||||
**Value**: Claude remembers project context, decisions, and patterns across conversations.
|
||||
|
||||
---
|
||||
|
||||
## Containers & DevOps
|
||||
|
||||
### Docker MCP
|
||||
**Best for**: Container management
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Docker Compose file | Container orchestration |
|
||||
| Dockerfile present | Build images |
|
||||
| Container debugging | Inspect logs, exec |
|
||||
|
||||
### Kubernetes MCP
|
||||
**Best for**: Kubernetes cluster management
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| K8s manifests | Deploy, scale pods |
|
||||
| Helm charts | Package management |
|
||||
| Cluster debugging | Pod logs, status |
|
||||
|
||||
---
|
||||
|
||||
## AI & ML
|
||||
|
||||
### Exa MCP
|
||||
**Best for**: Web search and research
|
||||
|
||||
| Recommend When | Examples |
|
||||
|----------------|----------|
|
||||
| Research tasks | Find current info |
|
||||
| Competitive analysis | Market research |
|
||||
| Documentation gaps | Find examples |
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Detection Patterns
|
||||
|
||||
| Look For | Suggests MCP Server |
|
||||
|----------|-------------------|
|
||||
| Popular npm packages | context7 |
|
||||
| React/Vue/Next.js | Playwright MCP |
|
||||
| `@supabase/supabase-js` | Supabase MCP |
|
||||
| `pg` or `postgres` | PostgreSQL MCP |
|
||||
| GitHub remote | GitHub MCP |
|
||||
| `.linear` or Linear refs | Linear MCP |
|
||||
| `@aws-sdk/*` | AWS MCP |
|
||||
| `@sentry/*` | Sentry MCP |
|
||||
| `docker-compose.yml` | Docker MCP |
|
||||
| Slack webhook URLs | Slack MCP |
|
||||
| `@anthropic-ai/sdk` | context7 for Anthropic docs |
|
||||
@@ -1,98 +0,0 @@
|
||||
# Plugin Recommendations
|
||||
|
||||
Plugins are installable collections of skills, commands, agents, and hooks. Install via `/plugin install`.
|
||||
|
||||
**Note**: These are plugins from the official repository. Use web search to discover additional community plugins.
|
||||
|
||||
---
|
||||
|
||||
## Official Plugins
|
||||
|
||||
### Development & Code Quality
|
||||
|
||||
| Plugin | Best For | Key Features |
|
||||
|--------|----------|--------------|
|
||||
| **plugin-dev** | Building Claude Code plugins | Skills for creating skills, hooks, commands, agents |
|
||||
| **pr-review-toolkit** | PR review workflows | Specialized review agents (code, tests, types) |
|
||||
| **code-review** | Automated code review | Multi-agent review with confidence scoring |
|
||||
| **code-simplifier** | Code refactoring | Simplify code while preserving functionality |
|
||||
| **feature-dev** | Feature development | End-to-end feature workflow with agents |
|
||||
|
||||
### Git & Workflow
|
||||
|
||||
| Plugin | Best For | Key Features |
|
||||
|--------|----------|--------------|
|
||||
| **commit-commands** | Git workflows | /commit, /commit-push-pr commands |
|
||||
| **hookify** | Automation rules | Create hooks from conversation patterns |
|
||||
|
||||
### Frontend
|
||||
|
||||
| Plugin | Best For | Key Features |
|
||||
|--------|----------|--------------|
|
||||
| **frontend-design** | UI development | Production-grade UI, avoids generic aesthetics |
|
||||
|
||||
### Learning & Guidance
|
||||
|
||||
| Plugin | Best For | Key Features |
|
||||
|--------|----------|--------------|
|
||||
| **explanatory-output-style** | Learning | Educational insights about code choices |
|
||||
| **learning-output-style** | Interactive learning | Requests contributions at decision points |
|
||||
| **security-guidance** | Security awareness | Warns about security issues when editing |
|
||||
|
||||
### Language Servers (LSP)
|
||||
|
||||
| Plugin | Language |
|
||||
|--------|----------|
|
||||
| **typescript-lsp** | TypeScript/JavaScript |
|
||||
| **pyright-lsp** | Python |
|
||||
| **gopls-lsp** | Go |
|
||||
| **rust-analyzer-lsp** | Rust |
|
||||
| **clangd-lsp** | C/C++ |
|
||||
| **jdtls-lsp** | Java |
|
||||
| **kotlin-lsp** | Kotlin |
|
||||
| **swift-lsp** | Swift |
|
||||
| **csharp-lsp** | C# |
|
||||
| **php-lsp** | PHP |
|
||||
| **lua-lsp** | Lua |
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Codebase → Plugin
|
||||
|
||||
| Codebase Signal | Recommended Plugin |
|
||||
|-----------------|-------------------|
|
||||
| Building plugins | plugin-dev |
|
||||
| PR-based workflow | pr-review-toolkit |
|
||||
| Git commits | commit-commands |
|
||||
| React/Vue/Angular | frontend-design |
|
||||
| Want automation rules | hookify |
|
||||
| TypeScript project | typescript-lsp |
|
||||
| Python project | pyright-lsp |
|
||||
| Go project | gopls-lsp |
|
||||
| Security-sensitive code | security-guidance |
|
||||
| Learning/onboarding | explanatory-output-style |
|
||||
|
||||
---
|
||||
|
||||
## Plugin Management
|
||||
|
||||
```bash
|
||||
# Install a plugin
|
||||
/plugin install <plugin-name>
|
||||
|
||||
# List installed plugins
|
||||
/plugin list
|
||||
|
||||
# View plugin details
|
||||
/plugin info <plugin-name>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## When to Recommend Plugins
|
||||
|
||||
**Recommend plugin installation when:**
|
||||
- User wants to install Claude Code automations from Anthropic's official repository or another shared marketplace
|
||||
- User needs multiple related capabilities
|
||||
- Team wants standardized workflows
|
||||
- First-time Claude Code setup
|
||||
@@ -1,408 +0,0 @@
|
||||
# Skills Recommendations
|
||||
|
||||
Skills are packaged expertise with workflows, reference materials, and best practices. Create them in `.claude/skills/<name>/SKILL.md`. Skills can be invoked by Claude automatically when relevant, or by users directly with `/skill-name`.
|
||||
|
||||
Some pre-built skills are available through official plugins (install via `/plugin install`).
|
||||
|
||||
**Note**: These are common patterns. Use web search to find skill ideas specific to the codebase's tools and frameworks.
|
||||
|
||||
---
|
||||
|
||||
## Available from Official Plugins
|
||||
|
||||
### Plugin Development (plugin-dev)
|
||||
|
||||
| Skill | Best For |
|
||||
|-------|----------|
|
||||
| **skill-development** | Creating new skills with proper structure |
|
||||
| **hook-development** | Building hooks for automation |
|
||||
| **command-development** | Creating slash commands |
|
||||
| **agent-development** | Building specialized subagents |
|
||||
| **mcp-integration** | Integrating MCP servers into plugins |
|
||||
| **plugin-structure** | Understanding plugin architecture |
|
||||
|
||||
### Git Workflows (commit-commands)
|
||||
|
||||
| Skill | Best For |
|
||||
|-------|----------|
|
||||
| **commit** | Creating git commits with proper messages |
|
||||
| **commit-push-pr** | Full commit, push, and PR workflow |
|
||||
|
||||
### Frontend (frontend-design)
|
||||
|
||||
| Skill | Best For |
|
||||
|-------|----------|
|
||||
| **frontend-design** | Creating polished UI components |
|
||||
|
||||
**Value**: Creates distinctive, high-quality UI instead of generic AI aesthetics.
|
||||
|
||||
### Automation Rules (hookify)
|
||||
|
||||
| Skill | Best For |
|
||||
|-------|----------|
|
||||
| **writing-rules** | Creating hookify rules for automation |
|
||||
|
||||
### Feature Development (feature-dev)
|
||||
|
||||
| Skill | Best For |
|
||||
|-------|----------|
|
||||
| **feature-dev** | End-to-end feature development workflow |
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Official Plugin Skills
|
||||
|
||||
| Codebase Signal | Skill | Plugin |
|
||||
|-----------------|-------|--------|
|
||||
| Building plugins | skill-development | plugin-dev |
|
||||
| Git commits | commit | commit-commands |
|
||||
| React/Vue/Angular | frontend-design | frontend-design |
|
||||
| Automation rules | writing-rules | hookify |
|
||||
| Feature planning | feature-dev | feature-dev |
|
||||
|
||||
---
|
||||
|
||||
## Custom Project Skills
|
||||
|
||||
Create project-specific skills in `.claude/skills/<name>/SKILL.md`.
|
||||
|
||||
### Skill Structure
|
||||
|
||||
```
|
||||
.claude/skills/
|
||||
└── my-skill/
|
||||
├── SKILL.md # Main instructions (required)
|
||||
├── template.yaml # Template to apply
|
||||
├── scripts/
|
||||
│ └── validate.sh # Script to run
|
||||
└── examples/ # Reference examples
|
||||
```
|
||||
|
||||
### Frontmatter Reference
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: skill-name
|
||||
description: What this skill does and when to use it
|
||||
disable-model-invocation: true # Only user can invoke (for side effects)
|
||||
user-invocable: false # Only Claude can invoke (for background knowledge)
|
||||
allowed-tools: Read, Grep, Glob # Restrict tool access
|
||||
context: fork # Run in isolated subagent
|
||||
agent: Explore # Which agent type when forked
|
||||
---
|
||||
```
|
||||
|
||||
### Invocation Control
|
||||
|
||||
| Setting | User | Claude | Use for |
|
||||
|---------|------|--------|---------|
|
||||
| (default) | ✓ | ✓ | General-purpose skills |
|
||||
| `disable-model-invocation: true` | ✓ | ✗ | Side effects (deploy, send) |
|
||||
| `user-invocable: false` | ✗ | ✓ | Background knowledge |
|
||||
|
||||
---
|
||||
|
||||
## Custom Skill Examples
|
||||
|
||||
### API Documentation with OpenAPI Template
|
||||
|
||||
Apply a YAML template to generate consistent API docs:
|
||||
|
||||
```
|
||||
.claude/skills/api-doc/
|
||||
├── SKILL.md
|
||||
└── openapi-template.yaml
|
||||
```
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: api-doc
|
||||
description: Generate OpenAPI documentation for an endpoint. Use when documenting API routes.
|
||||
---
|
||||
|
||||
Generate OpenAPI documentation for the endpoint at $ARGUMENTS.
|
||||
|
||||
Use the template in [openapi-template.yaml](openapi-template.yaml) as the structure.
|
||||
|
||||
1. Read the endpoint code
|
||||
2. Extract path, method, parameters, request/response schemas
|
||||
3. Fill in the template with actual values
|
||||
4. Output the completed YAML
|
||||
```
|
||||
|
||||
**openapi-template.yaml:**
|
||||
```yaml
|
||||
paths:
|
||||
/{path}:
|
||||
{method}:
|
||||
summary: ""
|
||||
description: ""
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema: {}
|
||||
responses:
|
||||
"200":
|
||||
description: ""
|
||||
content:
|
||||
application/json:
|
||||
schema: {}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Database Migration Generator with Script
|
||||
|
||||
Generate and validate migrations using a bundled script:
|
||||
|
||||
```
|
||||
.claude/skills/create-migration/
|
||||
├── SKILL.md
|
||||
└── scripts/
|
||||
└── validate-migration.sh
|
||||
```
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: create-migration
|
||||
description: Create a database migration file
|
||||
disable-model-invocation: true
|
||||
allowed-tools: Read, Write, Bash
|
||||
---
|
||||
|
||||
Create a migration for: $ARGUMENTS
|
||||
|
||||
1. Generate migration file in `migrations/` with timestamp prefix
|
||||
2. Include up and down functions
|
||||
3. Run validation: `bash .claude/skills/create-migration/scripts/validate-migration.sh`
|
||||
4. Report any issues found
|
||||
```
|
||||
|
||||
**scripts/validate-migration.sh:**
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Validate migration syntax
|
||||
npx prisma validate 2>&1 || echo "Validation failed"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Test Generator with Examples
|
||||
|
||||
Generate tests following project patterns:
|
||||
|
||||
```
|
||||
.claude/skills/gen-test/
|
||||
├── SKILL.md
|
||||
└── examples/
|
||||
├── unit-test.ts
|
||||
└── integration-test.ts
|
||||
```
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: gen-test
|
||||
description: Generate tests for a file following project conventions
|
||||
disable-model-invocation: true
|
||||
---
|
||||
|
||||
Generate tests for: $ARGUMENTS
|
||||
|
||||
Reference these examples for the expected patterns:
|
||||
- Unit tests: [examples/unit-test.ts](examples/unit-test.ts)
|
||||
- Integration tests: [examples/integration-test.ts](examples/integration-test.ts)
|
||||
|
||||
1. Analyze the source file
|
||||
2. Identify functions/methods to test
|
||||
3. Generate tests matching project conventions
|
||||
4. Place in appropriate test directory
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Component Generator with Template
|
||||
|
||||
Scaffold new components from a template:
|
||||
|
||||
```
|
||||
.claude/skills/new-component/
|
||||
├── SKILL.md
|
||||
└── templates/
|
||||
├── component.tsx.template
|
||||
├── component.test.tsx.template
|
||||
└── component.stories.tsx.template
|
||||
```
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: new-component
|
||||
description: Scaffold a new React component with tests and stories
|
||||
disable-model-invocation: true
|
||||
---
|
||||
|
||||
Create component: $ARGUMENTS
|
||||
|
||||
Use templates in [templates/](templates/) directory:
|
||||
1. Generate component from component.tsx.template
|
||||
2. Generate tests from component.test.tsx.template
|
||||
3. Generate Storybook story from component.stories.tsx.template
|
||||
|
||||
Replace {{ComponentName}} with the PascalCase name.
|
||||
Replace {{component-name}} with the kebab-case name.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### PR Review with Checklist
|
||||
|
||||
Review PRs against a project-specific checklist:
|
||||
|
||||
```
|
||||
.claude/skills/pr-check/
|
||||
├── SKILL.md
|
||||
└── checklist.md
|
||||
```
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: pr-check
|
||||
description: Review PR against project checklist
|
||||
disable-model-invocation: true
|
||||
context: fork
|
||||
---
|
||||
|
||||
## PR Context
|
||||
- Diff: !`gh pr diff`
|
||||
- Description: !`gh pr view`
|
||||
|
||||
Review against [checklist.md](checklist.md).
|
||||
|
||||
For each item, mark ✅ or ❌ with explanation.
|
||||
```
|
||||
|
||||
**checklist.md:**
|
||||
```markdown
|
||||
## PR Checklist
|
||||
|
||||
- [ ] Tests added for new functionality
|
||||
- [ ] No console.log statements
|
||||
- [ ] Error handling includes user-facing messages
|
||||
- [ ] API changes are backwards compatible
|
||||
- [ ] Database migrations are reversible
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Release Notes Generator
|
||||
|
||||
Generate release notes from git history:
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: release-notes
|
||||
description: Generate release notes from commits since last tag
|
||||
disable-model-invocation: true
|
||||
---
|
||||
|
||||
## Recent Changes
|
||||
- Commits since last tag: !`git log $(git describe --tags --abbrev=0)..HEAD --oneline`
|
||||
- Last tag: !`git describe --tags --abbrev=0`
|
||||
|
||||
Generate release notes:
|
||||
1. Group commits by type (feat, fix, docs, etc.)
|
||||
2. Write user-friendly descriptions
|
||||
3. Highlight breaking changes
|
||||
4. Format as markdown
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Project Conventions (Claude-only)
|
||||
|
||||
Background knowledge Claude applies automatically:
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: project-conventions
|
||||
description: Code style and patterns for this project. Apply when writing or reviewing code.
|
||||
user-invocable: false
|
||||
---
|
||||
|
||||
## Naming Conventions
|
||||
- React components: PascalCase
|
||||
- Utilities: camelCase
|
||||
- Constants: UPPER_SNAKE_CASE
|
||||
- Files: kebab-case
|
||||
|
||||
## Patterns
|
||||
- Use `Result<T, E>` for fallible operations, not exceptions
|
||||
- Prefer composition over inheritance
|
||||
- All API responses use `{ data, error, meta }` shape
|
||||
|
||||
## Forbidden
|
||||
- No `any` types
|
||||
- No `console.log` in production code
|
||||
- No synchronous file I/O
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Environment Setup
|
||||
|
||||
Onboard new developers with setup script:
|
||||
|
||||
```
|
||||
.claude/skills/setup-dev/
|
||||
├── SKILL.md
|
||||
└── scripts/
|
||||
└── check-prerequisites.sh
|
||||
```
|
||||
|
||||
**SKILL.md:**
|
||||
```yaml
|
||||
---
|
||||
name: setup-dev
|
||||
description: Set up development environment for new contributors
|
||||
disable-model-invocation: true
|
||||
---
|
||||
|
||||
Set up development environment:
|
||||
|
||||
1. Check prerequisites: `bash scripts/check-prerequisites.sh`
|
||||
2. Install dependencies: `npm install`
|
||||
3. Copy environment template: `cp .env.example .env`
|
||||
4. Set up database: `npm run db:setup`
|
||||
5. Verify setup: `npm test`
|
||||
|
||||
Report any issues encountered.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Argument Patterns
|
||||
|
||||
| Pattern | Meaning | Example |
|
||||
|---------|---------|---------|
|
||||
| `$ARGUMENTS` | All args as string | `/deploy staging` → "staging" |
|
||||
|
||||
Arguments are appended as `ARGUMENTS: <value>` if `$ARGUMENTS` isn't in the skill.
|
||||
|
||||
## Dynamic Context Injection
|
||||
|
||||
Use `!`command`` to inject live data before the skill runs:
|
||||
|
||||
```yaml
|
||||
## Current State
|
||||
- Branch: !`git branch --show-current`
|
||||
- Status: !`git status --short`
|
||||
```
|
||||
|
||||
The command output replaces the placeholder before Claude sees the skill content.
|
||||
@@ -1,181 +0,0 @@
|
||||
# Subagent Recommendations
|
||||
|
||||
Subagents are specialized Claude instances that run in parallel, each with their own context window and tool access. They're ideal for focused reviews, analysis, or generation tasks.
|
||||
|
||||
**Note**: These are common patterns. Design custom subagents based on the codebase's specific review and analysis needs.
|
||||
|
||||
## Code Review Agents
|
||||
|
||||
### code-reviewer
|
||||
**Best for**: Automated code quality checks on large codebases
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| Large codebase (>500 files) | File count |
|
||||
| Frequent code changes | Active development |
|
||||
| Team wants consistent review | Quality focus |
|
||||
|
||||
**Value**: Runs code review in parallel while you continue working
|
||||
**Model**: sonnet (balanced quality/speed)
|
||||
**Tools**: Read, Grep, Glob, Bash
|
||||
|
||||
---
|
||||
|
||||
### security-reviewer
|
||||
**Best for**: Security-focused code review
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| Auth code present | `auth/`, `login`, `session` patterns |
|
||||
| Payment processing | `stripe`, `payment`, `billing` patterns |
|
||||
| User data handling | `user`, `profile`, `pii` patterns |
|
||||
| API keys in code | Environment variable patterns |
|
||||
|
||||
**Value**: Catches OWASP vulnerabilities, auth issues, data exposure
|
||||
**Model**: sonnet
|
||||
**Tools**: Read, Grep, Glob (read-only for safety)
|
||||
|
||||
---
|
||||
|
||||
### test-writer
|
||||
**Best for**: Generating comprehensive test coverage
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| Low test coverage | Few test files vs source files |
|
||||
| Test suite exists | `tests/`, `__tests__/` present |
|
||||
| Testing framework configured | jest, pytest, vitest in deps |
|
||||
|
||||
**Value**: Generates tests matching project conventions
|
||||
**Model**: sonnet
|
||||
**Tools**: Read, Write, Grep, Glob
|
||||
|
||||
---
|
||||
|
||||
## Specialized Agents
|
||||
|
||||
### api-documenter
|
||||
**Best for**: API documentation generation
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| REST endpoints | Express routes, FastAPI paths |
|
||||
| GraphQL schema | `.graphql` files |
|
||||
| OpenAPI exists | `openapi.yaml`, `swagger.json` |
|
||||
| Undocumented APIs | Routes without docs |
|
||||
|
||||
**Value**: Generates OpenAPI specs, endpoint documentation
|
||||
**Model**: sonnet
|
||||
**Tools**: Read, Write, Grep, Glob
|
||||
|
||||
---
|
||||
|
||||
### performance-analyzer
|
||||
**Best for**: Finding performance bottlenecks
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| Database queries | ORM usage, raw SQL |
|
||||
| High-traffic code | API endpoints, hot paths |
|
||||
| Performance complaints | User reports slowness |
|
||||
| Complex algorithms | Nested loops, recursion |
|
||||
|
||||
**Value**: Finds N+1 queries, O(n²) algorithms, memory leaks
|
||||
**Model**: sonnet
|
||||
**Tools**: Read, Grep, Glob, Bash
|
||||
|
||||
---
|
||||
|
||||
### ui-reviewer
|
||||
**Best for**: Frontend accessibility and UX review
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| React/Vue/Angular | Frontend framework detected |
|
||||
| Component library | `components/` directory |
|
||||
| User-facing UI | Not just API project |
|
||||
|
||||
**Value**: Catches accessibility issues, UX problems, responsive design gaps
|
||||
**Model**: sonnet
|
||||
**Tools**: Read, Grep, Glob
|
||||
|
||||
---
|
||||
|
||||
## Utility Agents
|
||||
|
||||
### dependency-updater
|
||||
**Best for**: Safe dependency updates
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| Outdated deps | `npm outdated` has results |
|
||||
| Security advisories | `npm audit` warnings |
|
||||
| Major version behind | Significant version gaps |
|
||||
|
||||
**Value**: Updates dependencies incrementally with testing
|
||||
**Model**: sonnet
|
||||
**Tools**: Read, Write, Bash, Grep
|
||||
|
||||
---
|
||||
|
||||
### migration-helper
|
||||
**Best for**: Framework/version migrations
|
||||
|
||||
| Recommend When | Detection |
|
||||
|----------------|-----------|
|
||||
| Major upgrade needed | Framework version very old |
|
||||
| Breaking changes coming | Deprecation warnings |
|
||||
| Refactoring planned | Architectural changes |
|
||||
|
||||
**Value**: Plans and executes migrations incrementally
|
||||
**Model**: opus (complex reasoning needed)
|
||||
**Tools**: Read, Write, Grep, Glob, Bash
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Detection → Recommendation
|
||||
|
||||
| If You See | Recommend Subagent |
|
||||
|------------|-------------------|
|
||||
| Large codebase | code-reviewer |
|
||||
| Auth/payment code | security-reviewer |
|
||||
| Few tests | test-writer |
|
||||
| API routes | api-documenter |
|
||||
| Database heavy | performance-analyzer |
|
||||
| Frontend components | ui-reviewer |
|
||||
| Outdated packages | dependency-updater |
|
||||
| Old framework version | migration-helper |
|
||||
|
||||
---
|
||||
|
||||
## Subagent Placement
|
||||
|
||||
Subagents go in `.claude/agents/`:
|
||||
|
||||
```
|
||||
.claude/
|
||||
└── agents/
|
||||
├── code-reviewer.md
|
||||
├── security-reviewer.md
|
||||
└── test-writer.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Model Selection Guide
|
||||
|
||||
| Model | Best For | Trade-off |
|
||||
|-------|----------|-----------|
|
||||
| **haiku** | Simple, repetitive checks | Fast, cheap, less thorough |
|
||||
| **sonnet** | Most review/analysis tasks | Balanced (recommended default) |
|
||||
| **opus** | Complex migrations, architecture | Thorough, slower, more expensive |
|
||||
|
||||
---
|
||||
|
||||
## Tool Access Guide
|
||||
|
||||
| Access Level | Tools | Use Case |
|
||||
|--------------|-------|----------|
|
||||
| Read-only | Read, Grep, Glob | Reviews, analysis |
|
||||
| Writing | + Write | Code generation, docs |
|
||||
| Full | + Bash | Migrations, testing |
|
||||
@@ -1,9 +0,0 @@
|
||||
{
|
||||
"name": "claude-md-management",
|
||||
"description": "Tools to maintain and improve CLAUDE.md files - audit quality, capture session learnings, and keep project memory current.",
|
||||
"version": "1.0.0",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,40 +0,0 @@
|
||||
# CLAUDE.md Management Plugin
|
||||
|
||||
Tools to maintain and improve CLAUDE.md files - audit quality, capture session learnings, and keep project memory current.
|
||||
|
||||
## What It Does
|
||||
|
||||
Two complementary tools for different purposes:
|
||||
|
||||
| | claude-md-improver (skill) | /revise-claude-md (command) |
|
||||
|---|---|---|
|
||||
| **Purpose** | Keep CLAUDE.md aligned with codebase | Capture session learnings |
|
||||
| **Triggered by** | Codebase changes | End of session |
|
||||
| **Use when** | Periodic maintenance | Session revealed missing context |
|
||||
|
||||
## Usage
|
||||
|
||||
### Skill: claude-md-improver
|
||||
|
||||
Audits CLAUDE.md files against current codebase state:
|
||||
|
||||
```
|
||||
"audit my CLAUDE.md files"
|
||||
"check if my CLAUDE.md is up to date"
|
||||
```
|
||||
|
||||
<img src="claude-md-improver-example.png" alt="CLAUDE.md improver showing quality scores and recommended updates" width="600">
|
||||
|
||||
### Command: /revise-claude-md
|
||||
|
||||
Captures learnings from the current session:
|
||||
|
||||
```
|
||||
/revise-claude-md
|
||||
```
|
||||
|
||||
<img src="revise-claude-md-example.png" alt="Revise command capturing session learnings into CLAUDE.md" width="600">
|
||||
|
||||
## Author
|
||||
|
||||
Isabella He (isabella@anthropic.com)
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 507 KiB |
@@ -1,54 +0,0 @@
|
||||
---
|
||||
description: Update CLAUDE.md with learnings from this session
|
||||
allowed-tools: Read, Edit, Glob
|
||||
---
|
||||
|
||||
Review this session for learnings about working with Claude Code in this codebase. Update CLAUDE.md with context that would help future Claude sessions be more effective.
|
||||
|
||||
## Step 1: Reflect
|
||||
|
||||
What context was missing that would have helped Claude work more effectively?
|
||||
- Bash commands that were used or discovered
|
||||
- Code style patterns followed
|
||||
- Testing approaches that worked
|
||||
- Environment/configuration quirks
|
||||
- Warnings or gotchas encountered
|
||||
|
||||
## Step 2: Find CLAUDE.md Files
|
||||
|
||||
```bash
|
||||
find . -name "CLAUDE.md" -o -name ".claude.local.md" 2>/dev/null | head -20
|
||||
```
|
||||
|
||||
Decide where each addition belongs:
|
||||
- `CLAUDE.md` - Team-shared (checked into git)
|
||||
- `.claude.local.md` - Personal/local only (gitignored)
|
||||
|
||||
## Step 3: Draft Additions
|
||||
|
||||
**Keep it concise** - one line per concept. CLAUDE.md is part of the prompt, so brevity matters.
|
||||
|
||||
Format: `<command or pattern>` - `<brief description>`
|
||||
|
||||
Avoid:
|
||||
- Verbose explanations
|
||||
- Obvious information
|
||||
- One-off fixes unlikely to recur
|
||||
|
||||
## Step 4: Show Proposed Changes
|
||||
|
||||
For each addition:
|
||||
|
||||
```
|
||||
### Update: ./CLAUDE.md
|
||||
|
||||
**Why:** [one-line reason]
|
||||
|
||||
\`\`\`diff
|
||||
+ [the addition - keep it brief]
|
||||
\`\`\`
|
||||
```
|
||||
|
||||
## Step 5: Apply with Approval
|
||||
|
||||
Ask if the user wants to apply the changes. Only edit files they approve.
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 542 KiB |
@@ -1,179 +0,0 @@
|
||||
---
|
||||
name: claude-md-improver
|
||||
description: Audit and improve CLAUDE.md files in repositories. Use when user asks to check, audit, update, improve, or fix CLAUDE.md files. Scans for all CLAUDE.md files, evaluates quality against templates, outputs quality report, then makes targeted updates. Also use when the user mentions "CLAUDE.md maintenance" or "project memory optimization".
|
||||
tools: Read, Glob, Grep, Bash, Edit
|
||||
---
|
||||
|
||||
# CLAUDE.md Improver
|
||||
|
||||
Audit, evaluate, and improve CLAUDE.md files across a codebase to ensure Claude Code has optimal project context.
|
||||
|
||||
**This skill can write to CLAUDE.md files.** After presenting a quality report and getting user approval, it updates CLAUDE.md files with targeted improvements.
|
||||
|
||||
## Workflow
|
||||
|
||||
### Phase 1: Discovery
|
||||
|
||||
Find all CLAUDE.md files in the repository:
|
||||
|
||||
```bash
|
||||
find . -name "CLAUDE.md" -o -name ".claude.md" -o -name ".claude.local.md" 2>/dev/null | head -50
|
||||
```
|
||||
|
||||
**File Types & Locations:**
|
||||
|
||||
| Type | Location | Purpose |
|
||||
|------|----------|---------|
|
||||
| Project root | `./CLAUDE.md` | Primary project context (checked into git, shared with team) |
|
||||
| Local overrides | `./.claude.local.md` | Personal/local settings (gitignored, not shared) |
|
||||
| Global defaults | `~/.claude/CLAUDE.md` | User-wide defaults across all projects |
|
||||
| Package-specific | `./packages/*/CLAUDE.md` | Module-level context in monorepos |
|
||||
| Subdirectory | Any nested location | Feature/domain-specific context |
|
||||
|
||||
**Note:** Claude auto-discovers CLAUDE.md files in parent directories, making monorepo setups work automatically.
|
||||
|
||||
### Phase 2: Quality Assessment
|
||||
|
||||
For each CLAUDE.md file, evaluate against quality criteria. See [references/quality-criteria.md](references/quality-criteria.md) for detailed rubrics.
|
||||
|
||||
**Quick Assessment Checklist:**
|
||||
|
||||
| Criterion | Weight | Check |
|
||||
|-----------|--------|-------|
|
||||
| Commands/workflows documented | High | Are build/test/deploy commands present? |
|
||||
| Architecture clarity | High | Can Claude understand the codebase structure? |
|
||||
| Non-obvious patterns | Medium | Are gotchas and quirks documented? |
|
||||
| Conciseness | Medium | No verbose explanations or obvious info? |
|
||||
| Currency | High | Does it reflect current codebase state? |
|
||||
| Actionability | High | Are instructions executable, not vague? |
|
||||
|
||||
**Quality Scores:**
|
||||
- **A (90-100)**: Comprehensive, current, actionable
|
||||
- **B (70-89)**: Good coverage, minor gaps
|
||||
- **C (50-69)**: Basic info, missing key sections
|
||||
- **D (30-49)**: Sparse or outdated
|
||||
- **F (0-29)**: Missing or severely outdated
|
||||
|
||||
### Phase 3: Quality Report Output
|
||||
|
||||
**ALWAYS output the quality report BEFORE making any updates.**
|
||||
|
||||
Format:
|
||||
|
||||
```
|
||||
## CLAUDE.md Quality Report
|
||||
|
||||
### Summary
|
||||
- Files found: X
|
||||
- Average score: X/100
|
||||
- Files needing update: X
|
||||
|
||||
### File-by-File Assessment
|
||||
|
||||
#### 1. ./CLAUDE.md (Project Root)
|
||||
**Score: XX/100 (Grade: X)**
|
||||
|
||||
| Criterion | Score | Notes |
|
||||
|-----------|-------|-------|
|
||||
| Commands/workflows | X/20 | ... |
|
||||
| Architecture clarity | X/20 | ... |
|
||||
| Non-obvious patterns | X/15 | ... |
|
||||
| Conciseness | X/15 | ... |
|
||||
| Currency | X/15 | ... |
|
||||
| Actionability | X/15 | ... |
|
||||
|
||||
**Issues:**
|
||||
- [List specific problems]
|
||||
|
||||
**Recommended additions:**
|
||||
- [List what should be added]
|
||||
|
||||
#### 2. ./packages/api/CLAUDE.md (Package-specific)
|
||||
...
|
||||
```
|
||||
|
||||
### Phase 4: Targeted Updates
|
||||
|
||||
After outputting the quality report, ask user for confirmation before updating.
|
||||
|
||||
**Update Guidelines (Critical):**
|
||||
|
||||
1. **Propose targeted additions only** - Focus on genuinely useful info:
|
||||
- Commands or workflows discovered during analysis
|
||||
- Gotchas or non-obvious patterns found in code
|
||||
- Package relationships that weren't clear
|
||||
- Testing approaches that work
|
||||
- Configuration quirks
|
||||
|
||||
2. **Keep it minimal** - Avoid:
|
||||
- Restating what's obvious from the code
|
||||
- Generic best practices already covered
|
||||
- One-off fixes unlikely to recur
|
||||
- Verbose explanations when a one-liner suffices
|
||||
|
||||
3. **Show diffs** - For each change, show:
|
||||
- Which CLAUDE.md file to update
|
||||
- The specific addition (as a diff or quoted block)
|
||||
- Brief explanation of why this helps future sessions
|
||||
|
||||
**Diff Format:**
|
||||
|
||||
```markdown
|
||||
### Update: ./CLAUDE.md
|
||||
|
||||
**Why:** Build command was missing, causing confusion about how to run the project.
|
||||
|
||||
```diff
|
||||
+ ## Quick Start
|
||||
+
|
||||
+ ```bash
|
||||
+ npm install
|
||||
+ npm run dev # Start development server on port 3000
|
||||
+ ```
|
||||
```
|
||||
```
|
||||
|
||||
### Phase 5: Apply Updates
|
||||
|
||||
After user approval, apply changes using the Edit tool. Preserve existing content structure.
|
||||
|
||||
## Templates
|
||||
|
||||
See [references/templates.md](references/templates.md) for CLAUDE.md templates by project type.
|
||||
|
||||
## Common Issues to Flag
|
||||
|
||||
1. **Stale commands**: Build commands that no longer work
|
||||
2. **Missing dependencies**: Required tools not mentioned
|
||||
3. **Outdated architecture**: File structure that's changed
|
||||
4. **Missing environment setup**: Required env vars or config
|
||||
5. **Broken test commands**: Test scripts that have changed
|
||||
6. **Undocumented gotchas**: Non-obvious patterns not captured
|
||||
|
||||
## User Tips to Share
|
||||
|
||||
When presenting recommendations, remind users:
|
||||
|
||||
- **`#` key shortcut**: During a Claude session, press `#` to have Claude auto-incorporate learnings into CLAUDE.md
|
||||
- **Keep it concise**: CLAUDE.md should be human-readable; dense is better than verbose
|
||||
- **Actionable commands**: All documented commands should be copy-paste ready
|
||||
- **Use `.claude.local.md`**: For personal preferences not shared with team (add to `.gitignore`)
|
||||
- **Global defaults**: Put user-wide preferences in `~/.claude/CLAUDE.md`
|
||||
|
||||
## What Makes a Great CLAUDE.md
|
||||
|
||||
**Key principles:**
|
||||
- Concise and human-readable
|
||||
- Actionable commands that can be copy-pasted
|
||||
- Project-specific patterns, not generic advice
|
||||
- Non-obvious gotchas and warnings
|
||||
|
||||
**Recommended sections** (use only what's relevant):
|
||||
- Commands (build, test, dev, lint)
|
||||
- Architecture (directory structure)
|
||||
- Key Files (entry points, config)
|
||||
- Code Style (project conventions)
|
||||
- Environment (required vars, setup)
|
||||
- Testing (commands, patterns)
|
||||
- Gotchas (quirks, common mistakes)
|
||||
- Workflow (when to do what)
|
||||
@@ -1,109 +0,0 @@
|
||||
# CLAUDE.md Quality Criteria
|
||||
|
||||
## Scoring Rubric
|
||||
|
||||
### 1. Commands/Workflows (20 points)
|
||||
|
||||
**20 points**: All essential commands documented with context
|
||||
- Build, test, lint, deploy commands present
|
||||
- Development workflow clear
|
||||
- Common operations documented
|
||||
|
||||
**15 points**: Most commands present, some missing context
|
||||
|
||||
**10 points**: Basic commands only, no workflow
|
||||
|
||||
**5 points**: Few commands, many missing
|
||||
|
||||
**0 points**: No commands documented
|
||||
|
||||
### 2. Architecture Clarity (20 points)
|
||||
|
||||
**20 points**: Clear codebase map
|
||||
- Key directories explained
|
||||
- Module relationships documented
|
||||
- Entry points identified
|
||||
- Data flow described where relevant
|
||||
|
||||
**15 points**: Good structure overview, minor gaps
|
||||
|
||||
**10 points**: Basic directory listing only
|
||||
|
||||
**5 points**: Vague or incomplete
|
||||
|
||||
**0 points**: No architecture info
|
||||
|
||||
### 3. Non-Obvious Patterns (15 points)
|
||||
|
||||
**15 points**: Gotchas and quirks captured
|
||||
- Known issues documented
|
||||
- Workarounds explained
|
||||
- Edge cases noted
|
||||
- "Why we do it this way" for unusual patterns
|
||||
|
||||
**10 points**: Some patterns documented
|
||||
|
||||
**5 points**: Minimal pattern documentation
|
||||
|
||||
**0 points**: No patterns or gotchas
|
||||
|
||||
### 4. Conciseness (15 points)
|
||||
|
||||
**15 points**: Dense, valuable content
|
||||
- No filler or obvious info
|
||||
- Each line adds value
|
||||
- No redundancy with code comments
|
||||
|
||||
**10 points**: Mostly concise, some padding
|
||||
|
||||
**5 points**: Verbose in places
|
||||
|
||||
**0 points**: Mostly filler or restates obvious code
|
||||
|
||||
### 5. Currency (15 points)
|
||||
|
||||
**15 points**: Reflects current codebase
|
||||
- Commands work as documented
|
||||
- File references accurate
|
||||
- Tech stack current
|
||||
|
||||
**10 points**: Mostly current, minor staleness
|
||||
|
||||
**5 points**: Several outdated references
|
||||
|
||||
**0 points**: Severely outdated
|
||||
|
||||
### 6. Actionability (15 points)
|
||||
|
||||
**15 points**: Instructions are executable
|
||||
- Commands can be copy-pasted
|
||||
- Steps are concrete
|
||||
- Paths are real
|
||||
|
||||
**10 points**: Mostly actionable
|
||||
|
||||
**5 points**: Some vague instructions
|
||||
|
||||
**0 points**: Vague or theoretical
|
||||
|
||||
## Assessment Process
|
||||
|
||||
1. Read the CLAUDE.md file completely
|
||||
2. Cross-reference with actual codebase:
|
||||
- Run documented commands (mentally or actually)
|
||||
- Check if referenced files exist
|
||||
- Verify architecture descriptions
|
||||
3. Score each criterion
|
||||
4. Calculate total and assign grade
|
||||
5. List specific issues found
|
||||
6. Propose concrete improvements
|
||||
|
||||
## Red Flags
|
||||
|
||||
- Commands that would fail (wrong paths, missing deps)
|
||||
- References to deleted files/folders
|
||||
- Outdated tech versions
|
||||
- Copy-paste from templates without customization
|
||||
- Generic advice not specific to the project
|
||||
- "TODO" items never completed
|
||||
- Duplicate info across multiple CLAUDE.md files
|
||||
@@ -1,253 +0,0 @@
|
||||
# CLAUDE.md Templates
|
||||
|
||||
## Key Principles
|
||||
|
||||
- **Concise**: Dense, human-readable content; one line per concept when possible
|
||||
- **Actionable**: Commands should be copy-paste ready
|
||||
- **Project-specific**: Document patterns unique to this project, not generic advice
|
||||
- **Current**: All info should reflect actual codebase state
|
||||
|
||||
---
|
||||
|
||||
## Recommended Sections
|
||||
|
||||
Use only the sections relevant to the project. Not all sections are needed.
|
||||
|
||||
### Commands
|
||||
|
||||
Document the essential commands for working with the project.
|
||||
|
||||
```markdown
|
||||
## Commands
|
||||
|
||||
| Command | Description |
|
||||
|---------|-------------|
|
||||
| `<install command>` | Install dependencies |
|
||||
| `<dev command>` | Start development server |
|
||||
| `<build command>` | Production build |
|
||||
| `<test command>` | Run tests |
|
||||
| `<lint command>` | Lint/format code |
|
||||
```
|
||||
|
||||
### Architecture
|
||||
|
||||
Describe the project structure so Claude understands where things live.
|
||||
|
||||
```markdown
|
||||
## Architecture
|
||||
|
||||
```
|
||||
<root>/
|
||||
<dir>/ # <purpose>
|
||||
<dir>/ # <purpose>
|
||||
<dir>/ # <purpose>
|
||||
```
|
||||
```
|
||||
|
||||
### Key Files
|
||||
|
||||
List important files that Claude should know about.
|
||||
|
||||
```markdown
|
||||
## Key Files
|
||||
|
||||
- `<path>` - <purpose>
|
||||
- `<path>` - <purpose>
|
||||
```
|
||||
|
||||
### Code Style
|
||||
|
||||
Document project-specific coding conventions.
|
||||
|
||||
```markdown
|
||||
## Code Style
|
||||
|
||||
- <convention>
|
||||
- <convention>
|
||||
- <preference over alternative>
|
||||
```
|
||||
|
||||
### Environment
|
||||
|
||||
Document required environment variables and setup.
|
||||
|
||||
```markdown
|
||||
## Environment
|
||||
|
||||
Required:
|
||||
- `<VAR_NAME>` - <purpose>
|
||||
- `<VAR_NAME>` - <purpose>
|
||||
|
||||
Setup:
|
||||
- <setup step>
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
Document testing approach and commands.
|
||||
|
||||
```markdown
|
||||
## Testing
|
||||
|
||||
- `<test command>` - <what it tests>
|
||||
- <testing convention or pattern>
|
||||
```
|
||||
|
||||
### Gotchas
|
||||
|
||||
Document non-obvious patterns, quirks, and warnings.
|
||||
|
||||
```markdown
|
||||
## Gotchas
|
||||
|
||||
- <non-obvious thing that causes issues>
|
||||
- <ordering dependency or prerequisite>
|
||||
- <common mistake to avoid>
|
||||
```
|
||||
|
||||
### Workflow
|
||||
|
||||
Document development workflow patterns.
|
||||
|
||||
```markdown
|
||||
## Workflow
|
||||
|
||||
- <when to do X>
|
||||
- <preferred approach for Y>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Template: Project Root (Minimal)
|
||||
|
||||
```markdown
|
||||
# <Project Name>
|
||||
|
||||
<One-line description>
|
||||
|
||||
## Commands
|
||||
|
||||
| Command | Description |
|
||||
|---------|-------------|
|
||||
| `<command>` | <description> |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
<structure>
|
||||
```
|
||||
|
||||
## Gotchas
|
||||
|
||||
- <gotcha>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Template: Project Root (Comprehensive)
|
||||
|
||||
```markdown
|
||||
# <Project Name>
|
||||
|
||||
<One-line description>
|
||||
|
||||
## Commands
|
||||
|
||||
| Command | Description |
|
||||
|---------|-------------|
|
||||
| `<command>` | <description> |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
<structure with descriptions>
|
||||
```
|
||||
|
||||
## Key Files
|
||||
|
||||
- `<path>` - <purpose>
|
||||
|
||||
## Code Style
|
||||
|
||||
- <convention>
|
||||
|
||||
## Environment
|
||||
|
||||
- `<VAR>` - <purpose>
|
||||
|
||||
## Testing
|
||||
|
||||
- `<command>` - <scope>
|
||||
|
||||
## Gotchas
|
||||
|
||||
- <gotcha>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Template: Package/Module
|
||||
|
||||
For packages within a monorepo or distinct modules.
|
||||
|
||||
```markdown
|
||||
# <Package Name>
|
||||
|
||||
<Purpose of this package>
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
<import/usage example>
|
||||
```
|
||||
|
||||
## Key Exports
|
||||
|
||||
- `<export>` - <purpose>
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `<dependency>` - <why needed>
|
||||
|
||||
## Notes
|
||||
|
||||
- <important note>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Template: Monorepo Root
|
||||
|
||||
```markdown
|
||||
# <Monorepo Name>
|
||||
|
||||
<Description>
|
||||
|
||||
## Packages
|
||||
|
||||
| Package | Description | Path |
|
||||
|---------|-------------|------|
|
||||
| `<name>` | <purpose> | `<path>` |
|
||||
|
||||
## Commands
|
||||
|
||||
| Command | Description |
|
||||
|---------|-------------|
|
||||
| `<command>` | <description> |
|
||||
|
||||
## Cross-Package Patterns
|
||||
|
||||
- <shared pattern>
|
||||
- <generation/sync pattern>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Update Principles
|
||||
|
||||
When updating any CLAUDE.md:
|
||||
|
||||
1. **Be specific**: Use actual file paths, real commands from this project
|
||||
2. **Be current**: Verify info against the actual codebase
|
||||
3. **Be brief**: One line per concept when possible
|
||||
4. **Be useful**: Would this help a new Claude session understand the project?
|
||||
@@ -1,150 +0,0 @@
|
||||
# CLAUDE.md Update Guidelines
|
||||
|
||||
## Core Principle
|
||||
|
||||
Only add information that will genuinely help future Claude sessions. The context window is precious - every line must earn its place.
|
||||
|
||||
## What TO Add
|
||||
|
||||
### 1. Commands/Workflows Discovered
|
||||
|
||||
```markdown
|
||||
## Build
|
||||
|
||||
`npm run build:prod` - Full production build with optimization
|
||||
`npm run build:dev` - Fast dev build (no minification)
|
||||
```
|
||||
|
||||
Why: Saves future sessions from discovering these again.
|
||||
|
||||
### 2. Gotchas and Non-Obvious Patterns
|
||||
|
||||
```markdown
|
||||
## Gotchas
|
||||
|
||||
- Tests must run sequentially (`--runInBand`) due to shared DB state
|
||||
- `yarn.lock` is authoritative; delete `node_modules` if deps mismatch
|
||||
```
|
||||
|
||||
Why: Prevents repeating debugging sessions.
|
||||
|
||||
### 3. Package Relationships
|
||||
|
||||
```markdown
|
||||
## Dependencies
|
||||
|
||||
The `auth` module depends on `crypto` being initialized first.
|
||||
Import order matters in `src/bootstrap.ts`.
|
||||
```
|
||||
|
||||
Why: Architecture knowledge that isn't obvious from code.
|
||||
|
||||
### 4. Testing Approaches That Worked
|
||||
|
||||
```markdown
|
||||
## Testing
|
||||
|
||||
For API endpoints: Use `supertest` with the test helper in `tests/setup.ts`
|
||||
Mocking: Factory functions in `tests/factories/` (not inline mocks)
|
||||
```
|
||||
|
||||
Why: Establishes patterns that work.
|
||||
|
||||
### 5. Configuration Quirks
|
||||
|
||||
```markdown
|
||||
## Config
|
||||
|
||||
- `NEXT_PUBLIC_*` vars must be set at build time, not runtime
|
||||
- Redis connection requires `?family=0` suffix for IPv6
|
||||
```
|
||||
|
||||
Why: Environment-specific knowledge.
|
||||
|
||||
## What NOT to Add
|
||||
|
||||
### 1. Obvious Code Info
|
||||
|
||||
Bad:
|
||||
```markdown
|
||||
The `UserService` class handles user operations.
|
||||
```
|
||||
|
||||
The class name already tells us this.
|
||||
|
||||
### 2. Generic Best Practices
|
||||
|
||||
Bad:
|
||||
```markdown
|
||||
Always write tests for new features.
|
||||
Use meaningful variable names.
|
||||
```
|
||||
|
||||
This is universal advice, not project-specific.
|
||||
|
||||
### 3. One-Off Fixes
|
||||
|
||||
Bad:
|
||||
```markdown
|
||||
We fixed a bug in commit abc123 where the login button didn't work.
|
||||
```
|
||||
|
||||
Won't recur; clutters the file.
|
||||
|
||||
### 4. Verbose Explanations
|
||||
|
||||
Bad:
|
||||
```markdown
|
||||
The authentication system uses JWT tokens. JWT (JSON Web Tokens) are
|
||||
an open standard (RFC 7519) that defines a compact and self-contained
|
||||
way for securely transmitting information between parties as a JSON
|
||||
object. In our implementation, we use the HS256 algorithm which...
|
||||
```
|
||||
|
||||
Good:
|
||||
```markdown
|
||||
Auth: JWT with HS256, tokens in `Authorization: Bearer <token>` header.
|
||||
```
|
||||
|
||||
## Diff Format for Updates
|
||||
|
||||
For each suggested change:
|
||||
|
||||
### 1. Identify the File
|
||||
|
||||
```
|
||||
File: ./CLAUDE.md
|
||||
Section: Commands (new section after ## Architecture)
|
||||
```
|
||||
|
||||
### 2. Show the Change
|
||||
|
||||
```diff
|
||||
## Architecture
|
||||
...
|
||||
|
||||
+## Commands
|
||||
+
|
||||
+| Command | Purpose |
|
||||
+|---------|---------|
|
||||
+| `npm run dev` | Dev server with HMR |
|
||||
+| `npm run build` | Production build |
|
||||
+| `npm test` | Run test suite |
|
||||
```
|
||||
|
||||
### 3. Explain Why
|
||||
|
||||
> **Why this helps:** The build commands weren't documented, causing
|
||||
> confusion about how to run the project. This saves future sessions
|
||||
> from needing to inspect `package.json`.
|
||||
|
||||
## Validation Checklist
|
||||
|
||||
Before finalizing an update, verify:
|
||||
|
||||
- [ ] Each addition is project-specific
|
||||
- [ ] No generic advice or obvious info
|
||||
- [ ] Commands are tested and work
|
||||
- [ ] File paths are accurate
|
||||
- [ ] Would a new Claude session find this helpful?
|
||||
- [ ] Is this the most concise way to express the info?
|
||||
@@ -1,9 +0,0 @@
|
||||
{
|
||||
"name": "code-simplifier",
|
||||
"version": "1.0.0",
|
||||
"description": "Agent that simplifies and refines code for clarity, consistency, and maintainability while preserving functionality",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
---
|
||||
name: code-simplifier
|
||||
description: Simplifies and refines code for clarity, consistency, and maintainability while preserving all functionality. Focuses on recently modified code unless instructed otherwise.
|
||||
model: opus
|
||||
---
|
||||
|
||||
You are an expert code simplification specialist focused on enhancing code clarity, consistency, and maintainability while preserving exact functionality. Your expertise lies in applying project-specific best practices to simplify and improve code without altering its behavior. You prioritize readable, explicit code over overly compact solutions. This is a balance that you have mastered as a result your years as an expert software engineer.
|
||||
|
||||
You will analyze recently modified code and apply refinements that:
|
||||
|
||||
1. **Preserve Functionality**: Never change what the code does - only how it does it. All original features, outputs, and behaviors must remain intact.
|
||||
|
||||
2. **Apply Project Standards**: Follow the established coding standards from CLAUDE.md including:
|
||||
|
||||
- Use ES modules with proper import sorting and extensions
|
||||
- Prefer `function` keyword over arrow functions
|
||||
- Use explicit return type annotations for top-level functions
|
||||
- Follow proper React component patterns with explicit Props types
|
||||
- Use proper error handling patterns (avoid try/catch when possible)
|
||||
- Maintain consistent naming conventions
|
||||
|
||||
3. **Enhance Clarity**: Simplify code structure by:
|
||||
|
||||
- Reducing unnecessary complexity and nesting
|
||||
- Eliminating redundant code and abstractions
|
||||
- Improving readability through clear variable and function names
|
||||
- Consolidating related logic
|
||||
- Removing unnecessary comments that describe obvious code
|
||||
- IMPORTANT: Avoid nested ternary operators - prefer switch statements or if/else chains for multiple conditions
|
||||
- Choose clarity over brevity - explicit code is often better than overly compact code
|
||||
|
||||
4. **Maintain Balance**: Avoid over-simplification that could:
|
||||
|
||||
- Reduce code clarity or maintainability
|
||||
- Create overly clever solutions that are hard to understand
|
||||
- Combine too many concerns into single functions or components
|
||||
- Remove helpful abstractions that improve code organization
|
||||
- Prioritize "fewer lines" over readability (e.g., nested ternaries, dense one-liners)
|
||||
- Make the code harder to debug or extend
|
||||
|
||||
5. **Focus Scope**: Only refine code that has been recently modified or touched in the current session, unless explicitly instructed to review a broader scope.
|
||||
|
||||
Your refinement process:
|
||||
|
||||
1. Identify the recently modified code sections
|
||||
2. Analyze for opportunities to improve elegance and consistency
|
||||
3. Apply project-specific best practices and coding standards
|
||||
4. Ensure all functionality remains unchanged
|
||||
5. Verify the refined code is simpler and more maintainable
|
||||
6. Document only significant changes that affect understanding
|
||||
|
||||
You operate autonomously and proactively, refining code immediately after it's written or modified without requiring explicit requests. Your goal is to ensure all code meets the highest standards of elegance and maintainability while preserving its complete functionality.
|
||||
@@ -1,25 +0,0 @@
|
||||
# csharp-lsp
|
||||
|
||||
C# language server for Claude Code, providing code intelligence and diagnostics.
|
||||
|
||||
## Supported Extensions
|
||||
`.cs`
|
||||
|
||||
## Installation
|
||||
|
||||
### Via .NET tool (recommended)
|
||||
```bash
|
||||
dotnet tool install --global csharp-ls
|
||||
```
|
||||
|
||||
### Via Homebrew (macOS)
|
||||
```bash
|
||||
brew install csharp-ls
|
||||
```
|
||||
|
||||
## Requirements
|
||||
- .NET SDK 6.0 or later
|
||||
|
||||
## More Information
|
||||
- [csharp-ls GitHub](https://github.com/razzmatazz/csharp-language-server)
|
||||
- [.NET SDK Download](https://dotnet.microsoft.com/download)
|
||||
@@ -1,20 +0,0 @@
|
||||
# gopls-lsp
|
||||
|
||||
Go language server for Claude Code, providing code intelligence, refactoring, and analysis.
|
||||
|
||||
## Supported Extensions
|
||||
`.go`
|
||||
|
||||
## Installation
|
||||
|
||||
Install gopls using the Go toolchain:
|
||||
|
||||
```bash
|
||||
go install golang.org/x/tools/gopls@latest
|
||||
```
|
||||
|
||||
Make sure `$GOPATH/bin` (or `$HOME/go/bin`) is in your PATH.
|
||||
|
||||
## More Information
|
||||
- [gopls Documentation](https://pkg.go.dev/golang.org/x/tools/gopls)
|
||||
- [GitHub Repository](https://github.com/golang/tools/tree/master/gopls)
|
||||
@@ -7,7 +7,7 @@ from functools import lru_cache
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
# Import from local module
|
||||
from core.config_loader import Rule, Condition
|
||||
from hookify.core.config_loader import Rule, Condition
|
||||
|
||||
|
||||
# Cache compiled regexes (max 128 patterns)
|
||||
@@ -275,7 +275,7 @@ class RuleEngine:
|
||||
|
||||
# For testing
|
||||
if __name__ == '__main__':
|
||||
from core.config_loader import Condition, Rule
|
||||
from hookify.core.config_loader import Condition, Rule
|
||||
|
||||
# Test rule evaluation
|
||||
rule = Rule(
|
||||
|
||||
@@ -9,14 +9,18 @@ import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Add plugin root to Python path for imports
|
||||
# CRITICAL: Add plugin root to Python path for imports
|
||||
PLUGIN_ROOT = os.environ.get('CLAUDE_PLUGIN_ROOT')
|
||||
if PLUGIN_ROOT and PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
if PLUGIN_ROOT:
|
||||
parent_dir = os.path.dirname(PLUGIN_ROOT)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
if PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
|
||||
try:
|
||||
from core.config_loader import load_rules
|
||||
from core.rule_engine import RuleEngine
|
||||
from hookify.core.config_loader import load_rules
|
||||
from hookify.core.rule_engine import RuleEngine
|
||||
except ImportError as e:
|
||||
error_msg = {"systemMessage": f"Hookify import error: {e}"}
|
||||
print(json.dumps(error_msg), file=sys.stdout)
|
||||
|
||||
@@ -9,14 +9,22 @@ import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Add plugin root to Python path for imports
|
||||
# CRITICAL: Add plugin root to Python path for imports
|
||||
# We need to add the parent of the plugin directory so Python can find "hookify" package
|
||||
PLUGIN_ROOT = os.environ.get('CLAUDE_PLUGIN_ROOT')
|
||||
if PLUGIN_ROOT and PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
if PLUGIN_ROOT:
|
||||
# Add the parent directory of the plugin
|
||||
parent_dir = os.path.dirname(PLUGIN_ROOT)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
|
||||
# Also add PLUGIN_ROOT itself in case we have other scripts
|
||||
if PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
|
||||
try:
|
||||
from core.config_loader import load_rules
|
||||
from core.rule_engine import RuleEngine
|
||||
from hookify.core.config_loader import load_rules
|
||||
from hookify.core.rule_engine import RuleEngine
|
||||
except ImportError as e:
|
||||
# If imports fail, allow operation and log error
|
||||
error_msg = {"systemMessage": f"Hookify import error: {e}"}
|
||||
|
||||
@@ -9,14 +9,18 @@ import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Add plugin root to Python path for imports
|
||||
# CRITICAL: Add plugin root to Python path for imports
|
||||
PLUGIN_ROOT = os.environ.get('CLAUDE_PLUGIN_ROOT')
|
||||
if PLUGIN_ROOT and PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
if PLUGIN_ROOT:
|
||||
parent_dir = os.path.dirname(PLUGIN_ROOT)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
if PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
|
||||
try:
|
||||
from core.config_loader import load_rules
|
||||
from core.rule_engine import RuleEngine
|
||||
from hookify.core.config_loader import load_rules
|
||||
from hookify.core.rule_engine import RuleEngine
|
||||
except ImportError as e:
|
||||
error_msg = {"systemMessage": f"Hookify import error: {e}"}
|
||||
print(json.dumps(error_msg), file=sys.stdout)
|
||||
|
||||
@@ -9,14 +9,18 @@ import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Add plugin root to Python path for imports
|
||||
# CRITICAL: Add plugin root to Python path for imports
|
||||
PLUGIN_ROOT = os.environ.get('CLAUDE_PLUGIN_ROOT')
|
||||
if PLUGIN_ROOT and PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
if PLUGIN_ROOT:
|
||||
parent_dir = os.path.dirname(PLUGIN_ROOT)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
if PLUGIN_ROOT not in sys.path:
|
||||
sys.path.insert(0, PLUGIN_ROOT)
|
||||
|
||||
try:
|
||||
from core.config_loader import load_rules
|
||||
from core.rule_engine import RuleEngine
|
||||
from hookify.core.config_loader import load_rules
|
||||
from hookify.core.rule_engine import RuleEngine
|
||||
except ImportError as e:
|
||||
error_msg = {"systemMessage": f"Hookify import error: {e}"}
|
||||
print(json.dumps(error_msg), file=sys.stdout)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: writing-hookify-rules
|
||||
name: Writing Hookify Rules
|
||||
description: This skill should be used when the user asks to "create a hookify rule", "write a hook rule", "configure hookify", "add a hookify rule", or needs guidance on hookify rule syntax and patterns.
|
||||
version: 0.1.0
|
||||
---
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
# jdtls-lsp
|
||||
|
||||
Java language server (Eclipse JDT.LS) for Claude Code, providing code intelligence and refactoring.
|
||||
|
||||
## Supported Extensions
|
||||
`.java`
|
||||
|
||||
## Installation
|
||||
|
||||
### Via Homebrew (macOS)
|
||||
```bash
|
||||
brew install jdtls
|
||||
```
|
||||
|
||||
### Via package manager (Linux)
|
||||
```bash
|
||||
# Arch Linux (AUR)
|
||||
yay -S jdtls
|
||||
|
||||
# Other distros: manual installation required
|
||||
```
|
||||
|
||||
### Manual Installation
|
||||
1. Download from [Eclipse JDT.LS releases](https://download.eclipse.org/jdtls/snapshots/)
|
||||
2. Extract to a directory (e.g., `~/.local/share/jdtls`)
|
||||
3. Create a wrapper script named `jdtls` in your PATH
|
||||
|
||||
## Requirements
|
||||
- Java 17 or later (JDK, not just JRE)
|
||||
|
||||
## More Information
|
||||
- [Eclipse JDT.LS GitHub](https://github.com/eclipse-jdtls/eclipse.jdt.ls)
|
||||
- [VSCode Java Extension](https://github.com/redhat-developer/vscode-java) (uses JDT.LS)
|
||||
@@ -1,16 +0,0 @@
|
||||
Kotlin language server for Claude Code, providing code intelligence, refactoring, and analysis.
|
||||
|
||||
## Supported Extensions
|
||||
`.kt`
|
||||
`.kts`
|
||||
|
||||
## Installation
|
||||
|
||||
Install the Kotlin LSP CLI.
|
||||
|
||||
```bash
|
||||
brew install JetBrains/utils/kotlin-lsp
|
||||
```
|
||||
|
||||
## More Information
|
||||
- [kotlin LSP](https://github.com/Kotlin/kotlin-lsp)
|
||||
@@ -1,32 +0,0 @@
|
||||
# lua-lsp
|
||||
|
||||
Lua language server for Claude Code, providing code intelligence and diagnostics.
|
||||
|
||||
## Supported Extensions
|
||||
`.lua`
|
||||
|
||||
## Installation
|
||||
|
||||
### Via Homebrew (macOS)
|
||||
```bash
|
||||
brew install lua-language-server
|
||||
```
|
||||
|
||||
### Via package manager (Linux)
|
||||
```bash
|
||||
# Ubuntu/Debian (via snap)
|
||||
sudo snap install lua-language-server --classic
|
||||
|
||||
# Arch Linux
|
||||
sudo pacman -S lua-language-server
|
||||
|
||||
# Fedora
|
||||
sudo dnf install lua-language-server
|
||||
```
|
||||
|
||||
### Manual Installation
|
||||
Download pre-built binaries from the [releases page](https://github.com/LuaLS/lua-language-server/releases).
|
||||
|
||||
## More Information
|
||||
- [Lua Language Server GitHub](https://github.com/LuaLS/lua-language-server)
|
||||
- [Documentation](https://luals.github.io/)
|
||||
@@ -1,24 +0,0 @@
|
||||
# php-lsp
|
||||
|
||||
PHP language server (Intelephense) for Claude Code, providing code intelligence and diagnostics.
|
||||
|
||||
## Supported Extensions
|
||||
`.php`
|
||||
|
||||
## Installation
|
||||
|
||||
Install Intelephense globally via npm:
|
||||
|
||||
```bash
|
||||
npm install -g intelephense
|
||||
```
|
||||
|
||||
Or with yarn:
|
||||
|
||||
```bash
|
||||
yarn global add intelephense
|
||||
```
|
||||
|
||||
## More Information
|
||||
- [Intelephense Website](https://intelephense.com/)
|
||||
- [Intelephense on npm](https://www.npmjs.com/package/intelephense)
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"name": "playground",
|
||||
"description": "Creates interactive HTML playgrounds — self-contained single-file explorers with visual controls, live preview, and prompt output with copy button",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,28 +0,0 @@
|
||||
# Playground Plugin
|
||||
|
||||
Creates interactive HTML playgrounds — self-contained single-file explorers that let users configure something visually through controls, see a live preview, and copy out a prompt.
|
||||
|
||||
## What is a Playground?
|
||||
|
||||
A playground is a self-contained HTML file with:
|
||||
- Interactive controls on one side
|
||||
- A live preview on the other
|
||||
- A prompt output at the bottom with a copy button
|
||||
|
||||
The user adjusts controls, explores visually, then copies the generated prompt back into Claude.
|
||||
|
||||
## When to Use
|
||||
|
||||
Use this plugin when the user asks for an interactive playground, explorer, or visual tool for a topic — especially when the input space is large, visual, or structural and hard to express as plain text.
|
||||
|
||||
## Templates
|
||||
|
||||
The skill includes templates for common playground types:
|
||||
- **design-playground** — Visual design decisions (components, layouts, spacing, color, typography)
|
||||
- **data-explorer** — Data and query building (SQL, APIs, pipelines, regex)
|
||||
- **concept-map** — Learning and exploration (concept maps, knowledge gaps, scope mapping)
|
||||
- **document-critique** — Document review (suggestions with approve/reject/comment workflow)
|
||||
|
||||
## Installation
|
||||
|
||||
Add this plugin to your Claude Code configuration to enable the playground skill.
|
||||
@@ -1,76 +0,0 @@
|
||||
---
|
||||
name: playground
|
||||
description: Creates interactive HTML playgrounds — self-contained single-file explorers that let users configure something visually through controls, see a live preview, and copy out a prompt. Use when the user asks to make a playground, explorer, or interactive tool for a topic.
|
||||
---
|
||||
|
||||
# Playground Builder
|
||||
|
||||
A playground is a self-contained HTML file with interactive controls on one side, a live preview on the other, and a prompt output at the bottom with a copy button. The user adjusts controls, explores visually, then copies the generated prompt back into Claude.
|
||||
|
||||
## When to use this skill
|
||||
|
||||
When the user asks for an interactive playground, explorer, or visual tool for a topic — especially when the input space is large, visual, or structural and hard to express as plain text.
|
||||
|
||||
## How to use this skill
|
||||
|
||||
1. **Identify the playground type** from the user's request
|
||||
2. **Load the matching template** from `templates/`:
|
||||
- `templates/design-playground.md` — Visual design decisions (components, layouts, spacing, color, typography)
|
||||
- `templates/data-explorer.md` — Data and query building (SQL, APIs, pipelines, regex)
|
||||
- `templates/concept-map.md` — Learning and exploration (concept maps, knowledge gaps, scope mapping)
|
||||
- `templates/document-critique.md` — Document review (suggestions with approve/reject/comment workflow)
|
||||
- `templates/diff-review.md` — Code review (git diffs, commits, PRs with line-by-line commenting)
|
||||
- `templates/code-map.md` — Codebase architecture (component relationships, data flow, layer diagrams)
|
||||
3. **Follow the template** to build the playground. If the topic doesn't fit any template cleanly, use the one closest and adapt.
|
||||
4. **Open in browser.** After writing the HTML file, run `open <filename>.html` to launch it in the user's default browser.
|
||||
|
||||
## Core requirements (every playground)
|
||||
|
||||
- **Single HTML file.** Inline all CSS and JS. No external dependencies.
|
||||
- **Live preview.** Updates instantly on every control change. No "Apply" button.
|
||||
- **Prompt output.** Natural language, not a value dump. Only mentions non-default choices. Includes enough context to act on without seeing the playground. Updates live.
|
||||
- **Copy button.** Clipboard copy with brief "Copied!" feedback.
|
||||
- **Sensible defaults + presets.** Looks good on first load. Include 3-5 named presets that snap all controls to a cohesive combination.
|
||||
- **Dark theme.** System font for UI, monospace for code/values. Minimal chrome.
|
||||
|
||||
## State management pattern
|
||||
|
||||
Keep a single state object. Every control writes to it, every render reads from it.
|
||||
|
||||
```javascript
|
||||
const state = { /* all configurable values */ };
|
||||
|
||||
function updateAll() {
|
||||
renderPreview(); // update the visual
|
||||
updatePrompt(); // rebuild the prompt text
|
||||
}
|
||||
// Every control calls updateAll() on change
|
||||
```
|
||||
|
||||
## Prompt output pattern
|
||||
|
||||
```javascript
|
||||
function updatePrompt() {
|
||||
const parts = [];
|
||||
|
||||
// Only mention non-default values
|
||||
if (state.borderRadius !== DEFAULTS.borderRadius) {
|
||||
parts.push(`border-radius of ${state.borderRadius}px`);
|
||||
}
|
||||
|
||||
// Use qualitative language alongside numbers
|
||||
if (state.shadowBlur > 16) parts.push('a pronounced shadow');
|
||||
else if (state.shadowBlur > 0) parts.push('a subtle shadow');
|
||||
|
||||
prompt.textContent = `Update the card to use ${parts.join(', ')}.`;
|
||||
}
|
||||
```
|
||||
|
||||
## Common mistakes to avoid
|
||||
|
||||
- Prompt output is just a value dump → write it as a natural instruction
|
||||
- Too many controls at once → group by concern, hide advanced in a collapsible section
|
||||
- Preview doesn't update instantly → every control change must trigger immediate re-render
|
||||
- No defaults or presets → starts empty or broken on load
|
||||
- External dependencies → if CDN is down, playground is dead
|
||||
- Prompt lacks context → include enough that it's actionable without the playground
|
||||
@@ -1,158 +0,0 @@
|
||||
# Code Map Template
|
||||
|
||||
Use this template when the playground is about visualizing codebase architecture: component relationships, data flow, layer diagrams, system architecture with interactive commenting for feedback.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
+-------------------+----------------------------------+
|
||||
| | |
|
||||
| Controls: | SVG Canvas |
|
||||
| • View presets | (nodes + connections) |
|
||||
| • Layer toggles | with zoom controls |
|
||||
| • Connection | |
|
||||
| type filters | Legend (bottom-left) |
|
||||
| | |
|
||||
| Comments (n): +----------------------------------+
|
||||
| • List of user | Prompt output |
|
||||
| comments with | [ Copy Prompt ] |
|
||||
| delete buttons | |
|
||||
+-------------------+----------------------------------+
|
||||
```
|
||||
|
||||
Code map playgrounds use an SVG canvas for the architecture diagram. Users click components to add comments, which become part of the generated prompt. Layer and connection filters let users focus on specific parts of the system.
|
||||
|
||||
## Control types for code maps
|
||||
|
||||
| Decision | Control | Example |
|
||||
|---|---|---|
|
||||
| System view | Preset buttons | Full System, Chat Flow, Data Flow, Agent System |
|
||||
| Visible layers | Checkboxes | Client, Server, SDK, Data, External |
|
||||
| Connection types | Checkboxes with color indicators | Data Flow (blue), Tool Calls (green), Events (red) |
|
||||
| Component feedback | Click-to-comment modal | Opens modal with textarea for feedback |
|
||||
| Zoom level | +/−/reset buttons | Scale SVG for detail |
|
||||
|
||||
## Canvas rendering
|
||||
|
||||
Use an `<svg>` element with dynamically generated nodes and paths. Key patterns:
|
||||
|
||||
- **Nodes:** Rounded rectangles with title and subtitle (file path)
|
||||
- **Connections:** Curved paths (bezier) with arrow markers, styled by type
|
||||
- **Layer organization:** Group nodes by Y-position bands (e.g., y: 30-80 = Client, y: 130-180 = Server)
|
||||
- **Click-to-comment:** Click node → open modal → save comment → node gets visual indicator
|
||||
- **Filtering:** Toggle visibility of nodes by layer, connections by type
|
||||
|
||||
```javascript
|
||||
const nodes = [
|
||||
{ id: 'api-client', label: 'API Client', subtitle: 'src/api/client.ts',
|
||||
x: 100, y: 50, w: 140, h: 45, layer: 'client', color: '#dbeafe' },
|
||||
// ...
|
||||
];
|
||||
|
||||
const connections = [
|
||||
{ from: 'api-client', to: 'server', type: 'data-flow', label: 'HTTP' },
|
||||
{ from: 'server', to: 'db', type: 'data-flow' },
|
||||
// ...
|
||||
];
|
||||
|
||||
function renderDiagram() {
|
||||
const visibleNodes = nodes.filter(n => state.layers[n.layer]);
|
||||
// Draw connections first (under nodes), then nodes
|
||||
connections.forEach(c => drawConnection(c));
|
||||
visibleNodes.forEach(n => drawNode(n));
|
||||
}
|
||||
```
|
||||
|
||||
## Connection types and styling
|
||||
|
||||
Define 3-5 connection types with distinct visual styles:
|
||||
|
||||
| Type | Color | Style | Use for |
|
||||
|---|---|---|---|
|
||||
| `data-flow` | Blue (#3b82f6) | Solid line | Request/response, data passing |
|
||||
| `tool-call` | Green (#10b981) | Dashed (6,3) | Function calls, API invocations |
|
||||
| `event` | Red (#ef4444) | Short dash (4,4) | Async events, pub/sub |
|
||||
| `skill-invoke` | Orange (#f97316) | Long dash (8,4) | Plugin/skill activation |
|
||||
| `dependency` | Gray (#6b7280) | Dotted | Import/require relationships |
|
||||
|
||||
Use SVG markers for arrowheads:
|
||||
|
||||
```html
|
||||
<marker id="arrowhead-blue" markerWidth="8" markerHeight="6" refX="7" refY="3" orient="auto">
|
||||
<polygon points="0 0, 8 3, 0 6" fill="#3b82f6"/>
|
||||
</marker>
|
||||
```
|
||||
|
||||
## Comment system
|
||||
|
||||
The key differentiator for code maps is click-to-comment functionality:
|
||||
|
||||
1. **Click node** → Open modal with component name, file path, textarea
|
||||
2. **Save comment** → Add to comments list, mark node with visual indicator (colored border)
|
||||
3. **View comments** → Sidebar list with component name, comment preview, delete button
|
||||
4. **Delete comment** → Remove from list, update node visual, regenerate prompt
|
||||
|
||||
Comments should include the component context:
|
||||
|
||||
```javascript
|
||||
state.comments.push({
|
||||
id: Date.now(),
|
||||
target: node.id,
|
||||
targetLabel: node.label,
|
||||
targetFile: node.subtitle,
|
||||
text: userInput
|
||||
});
|
||||
```
|
||||
|
||||
## Prompt output for code maps
|
||||
|
||||
The prompt combines system context with user comments:
|
||||
|
||||
```
|
||||
This is the [PROJECT NAME] architecture, focusing on the [visible layers].
|
||||
|
||||
Feedback on specific components:
|
||||
|
||||
**API Client** (src/api/client.ts):
|
||||
I want to add retry logic with exponential backoff here.
|
||||
|
||||
**Database Manager** (src/db/manager.ts):
|
||||
Can we add connection pooling? Current implementation creates new connections per request.
|
||||
|
||||
**Auth Middleware** (src/middleware/auth.ts):
|
||||
This should validate JWT tokens and extract user context.
|
||||
```
|
||||
|
||||
Only include comments the user added. Mention which layers are visible if not showing the full system.
|
||||
|
||||
## Pre-populating with real data
|
||||
|
||||
For a specific codebase, pre-populate with:
|
||||
|
||||
- **Nodes:** 15-25 key components with real file paths
|
||||
- **Connections:** 20-40 relationships based on actual imports/calls
|
||||
- **Layers:** Logical groupings (UI, API, Business Logic, Data, External)
|
||||
- **Presets:** "Full System", "Frontend Only", "Backend Only", "Data Flow"
|
||||
|
||||
Organize nodes in horizontal bands by layer, with consistent spacing.
|
||||
|
||||
## Layer color palette (light theme)
|
||||
|
||||
| Layer | Node fill | Description |
|
||||
|---|---|---|
|
||||
| Client/UI | #dbeafe (blue-100) | React components, hooks, pages |
|
||||
| Server/API | #fef3c7 (amber-100) | Express routes, middleware, handlers |
|
||||
| SDK/Core | #f3e8ff (purple-100) | Core libraries, SDK wrappers |
|
||||
| Agent/Logic | #dcfce7 (green-100) | Business logic, agents, processors |
|
||||
| Data | #fce7f3 (pink-100) | Database, cache, storage |
|
||||
| External | #fbcfe8 (pink-200) | Third-party services, APIs |
|
||||
|
||||
## Example topics
|
||||
|
||||
- Codebase architecture explorer (modules, imports, data flow)
|
||||
- Microservices map (services, queues, databases, API gateways)
|
||||
- React component tree (components, hooks, context, state)
|
||||
- API architecture (routes, middleware, controllers, models)
|
||||
- Agent system (prompts, tools, skills, subagents)
|
||||
- Data pipeline (sources, transforms, sinks, scheduling)
|
||||
- Plugin/extension architecture (core, plugins, hooks, events)
|
||||
@@ -1,73 +0,0 @@
|
||||
# Concept Map Template
|
||||
|
||||
Use this template when the playground is about learning, exploration, or mapping relationships: concept maps, knowledge gap identification, scope mapping, task decomposition with dependencies.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
+--------------------------------------+
|
||||
| Canvas (draggable nodes, edges) |
|
||||
| with tooltip on hover |
|
||||
+-------------------------+------------+
|
||||
| | |
|
||||
| Sidebar: | Prompt |
|
||||
| • Knowledge levels | output |
|
||||
| • Connection types | |
|
||||
| • Node list | [Copy] |
|
||||
| • Actions | |
|
||||
+-------------------------+------------+
|
||||
```
|
||||
|
||||
Canvas-based playgrounds differ from the two-panel split. The interactive visual IS the control — users drag nodes and draw connections rather than adjusting sliders. The sidebar supplements with toggles and list controls.
|
||||
|
||||
## Control types for concept maps
|
||||
|
||||
| Decision | Control | Example |
|
||||
|---|---|---|
|
||||
| Knowledge level per node | Click-to-cycle button in sidebar list | Know → Fuzzy → Unknown |
|
||||
| Connection type | Selector before drawing | calls, depends on, contains, reads from |
|
||||
| Node arrangement | Drag on canvas | spatial layout reflects mental model |
|
||||
| Which nodes to include | Toggle or checkbox per node | hide/show concepts |
|
||||
| Actions | Buttons | Auto-layout (force-directed), clear edges, reset |
|
||||
|
||||
## Canvas rendering
|
||||
|
||||
Use a `<canvas>` element with manual draw calls. Key patterns:
|
||||
|
||||
- **Hit testing:** Check mouse position against node bounding circles on mousedown/mousemove
|
||||
- **Drag:** On mousedown on a node, track offset and update position on mousemove
|
||||
- **Edge drawing:** Click node A, then click node B. Draw arrow between them with the selected relationship type
|
||||
- **Tooltips:** On hover, position a div absolutely over the canvas with description text
|
||||
- **Force-directed auto-layout:** Simple spring simulation — repulsion between all pairs, attraction along edges, iterate 100-200 times with damping
|
||||
|
||||
```javascript
|
||||
function draw() {
|
||||
ctx.clearRect(0, 0, W, H);
|
||||
edges.forEach(e => drawEdge(e)); // edges first, under nodes
|
||||
nodes.forEach(n => drawNode(n)); // nodes on top
|
||||
}
|
||||
```
|
||||
|
||||
## Prompt output for concept maps
|
||||
|
||||
The prompt should be a targeted learning request shaped by the user's knowledge markings:
|
||||
|
||||
> "I'm learning [CODEBASE/DOMAIN]. I already understand: [know nodes]. I'm fuzzy on: [fuzzy nodes]. I have no idea about: [unknown nodes]. Here are the relationships I want to understand: [edge list in natural language]. Please explain the fuzzy and unknown concepts, focusing on these relationships. Build on what I already know. Use concrete code references."
|
||||
|
||||
Only include edges the user drew. Only mention concepts they marked as fuzzy or unknown in the explanation request.
|
||||
|
||||
## Pre-populating with real data
|
||||
|
||||
For codebases or domains, pre-populate with:
|
||||
- **Nodes:** 15-20 key concepts with real file paths and short descriptions
|
||||
- **Edges:** 20-30 pre-drawn relationships based on actual architecture
|
||||
- **Knowledge:** Default all to "Fuzzy" so the user adjusts from there
|
||||
- **Presets:** "Zoom out" (hide internal nodes, show only top-level), "Focus on [layer]" (highlight nodes in one area)
|
||||
|
||||
## Example topics
|
||||
|
||||
- Codebase architecture map (modules, data flow, state management)
|
||||
- Framework learning (how React hooks connect, Next.js data fetching layers)
|
||||
- System design (services, databases, queues, caches and how they relate)
|
||||
- Task decomposition (goals → sub-tasks with dependency arrows, knowledge tags)
|
||||
- API surface map (endpoints grouped by resource, shared middleware, auth layers)
|
||||
@@ -1,67 +0,0 @@
|
||||
# Data Explorer Template
|
||||
|
||||
Use this template when the playground is about data queries, APIs, pipelines, or structured configuration: SQL builders, API designers, regex builders, pipeline visuals, cron schedules.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
+-------------------+----------------------+
|
||||
| | |
|
||||
| Controls | Formatted output |
|
||||
| grouped by: | (syntax-highlighted |
|
||||
| • Source/tables | code, or a |
|
||||
| • Columns/fields | visual diagram) |
|
||||
| • Filters | |
|
||||
| • Grouping | |
|
||||
| • Ordering | |
|
||||
| • Limits | |
|
||||
| +----------------------+
|
||||
| | Prompt output |
|
||||
| | [ Copy Prompt ] |
|
||||
+-------------------+----------------------+
|
||||
```
|
||||
|
||||
## Control types by decision
|
||||
|
||||
| Decision | Control | Example |
|
||||
|---|---|---|
|
||||
| Select from available items | Clickable cards/chips | table names, columns, HTTP methods |
|
||||
| Add filter/condition rows | Add button → row of dropdowns + input | WHERE column op value |
|
||||
| Join type or aggregation | Dropdown per row | INNER/LEFT/RIGHT, COUNT/SUM/AVG |
|
||||
| Limit/offset | Slider | result count 1–500 |
|
||||
| Ordering | Dropdown + ASC/DESC toggle | order by column |
|
||||
| On/off features | Toggle | show descriptions, include header |
|
||||
|
||||
## Preview rendering
|
||||
|
||||
Render syntax-highlighted output using `<span>` tags with color classes:
|
||||
|
||||
```javascript
|
||||
function renderPreview() {
|
||||
const el = document.getElementById('preview');
|
||||
// Color-code by token type
|
||||
el.innerHTML = sql
|
||||
.replace(/\b(SELECT|FROM|WHERE|JOIN|ON|GROUP BY|ORDER BY|LIMIT)\b/g, '<span class="kw">$1</span>')
|
||||
.replace(/\b(users|orders|products)\b/g, '<span class="tbl">$1</span>')
|
||||
.replace(/'[^']*'/g, '<span class="str">$&</span>');
|
||||
}
|
||||
```
|
||||
|
||||
For pipeline-style playgrounds, render a horizontal or vertical flow diagram using positioned divs with arrow connectors.
|
||||
|
||||
## Prompt output for data
|
||||
|
||||
Frame it as a specification of what to build, not the raw query itself:
|
||||
|
||||
> "Write a SQL query that joins orders to users on user_id, filters for orders after 2024-01-01 with total > $50, groups by user, and returns the top 10 users by order count."
|
||||
|
||||
Include the schema context (table names, column types) so the prompt is self-contained.
|
||||
|
||||
## Example topics
|
||||
|
||||
- SQL query builder (tables, joins, filters, group by, order by, limit)
|
||||
- API endpoint designer (routes, methods, request/response field builder)
|
||||
- Data transformation pipeline (source → filter → map → aggregate → output)
|
||||
- Regex builder (sample strings, match groups, live highlight)
|
||||
- Cron schedule builder (visual timeline, interval, day toggles)
|
||||
- GraphQL query builder (type selection, field picker, nested resolvers)
|
||||
@@ -1,67 +0,0 @@
|
||||
# Design Playground Template
|
||||
|
||||
Use this template when the playground is about visual design decisions: components, layouts, spacing, color, typography, animation, responsive behavior.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
+-------------------+----------------------+
|
||||
| | |
|
||||
| Controls | Live component/ |
|
||||
| grouped by: | layout preview |
|
||||
| • Spacing | (renders in a |
|
||||
| • Color | mock page or |
|
||||
| • Typography | isolated card) |
|
||||
| • Shadow/Border | |
|
||||
| • Interaction | |
|
||||
| +----------------------+
|
||||
| | Prompt output |
|
||||
| | [ Copy Prompt ] |
|
||||
+-------------------+----------------------+
|
||||
```
|
||||
|
||||
## Control types by decision
|
||||
|
||||
| Decision | Control | Example |
|
||||
|---|---|---|
|
||||
| Sizes, spacing, radius | Slider | border-radius 0–24px |
|
||||
| On/off features | Toggle | show border, hover effect |
|
||||
| Choosing from a set | Dropdown | font-family, easing curve |
|
||||
| Colors | Hue + saturation + lightness sliders | shadow color, accent |
|
||||
| Layout structure | Clickable cards | sidebar-left / top-nav / no-nav |
|
||||
| Responsive behavior | Viewport-width slider | watch grid reflow at breakpoints |
|
||||
|
||||
## Preview rendering
|
||||
|
||||
Apply state values directly to a preview element's inline styles:
|
||||
|
||||
```javascript
|
||||
function renderPreview() {
|
||||
const el = document.getElementById('preview');
|
||||
el.style.borderRadius = state.radius + 'px';
|
||||
el.style.padding = state.padding + 'px';
|
||||
el.style.boxShadow = state.shadow
|
||||
? `0 ${state.shadowY}px ${state.shadowBlur}px rgba(0,0,0,${state.shadowOpacity})`
|
||||
: 'none';
|
||||
}
|
||||
```
|
||||
|
||||
Show the preview on both light and dark backgrounds if relevant. Include a context toggle.
|
||||
|
||||
## Prompt output for design
|
||||
|
||||
Frame it as a direction to a developer, not a spec sheet:
|
||||
|
||||
> "Update the card to feel soft and elevated: 12px border-radius, 24px horizontal padding, a medium box-shadow (0 4px 12px rgba(0,0,0,0.1)). On hover, lift it with translateY(-1px) and deepen the shadow slightly."
|
||||
|
||||
If the user is working in Tailwind, suggest Tailwind classes. If raw CSS, use CSS properties.
|
||||
|
||||
## Example topics
|
||||
|
||||
- Button style explorer (radius, padding, weight, hover/active states)
|
||||
- Card component (shadow depth, radius, content layout, image)
|
||||
- Layout builder (sidebar width, content max-width, header height, grid)
|
||||
- Typography scale (base size, ratio, line heights across h1-body-caption)
|
||||
- Color palette generator (primary hue, derive secondary/accent/surface)
|
||||
- Dashboard density (airy → compact slider that scales everything proportionally)
|
||||
- Modal/dialog (width, overlay opacity, entry animation, corner radius)
|
||||
@@ -1,179 +0,0 @@
|
||||
# Diff Review Template
|
||||
|
||||
Use this template when the playground is about reviewing code diffs: git commits, pull requests, code changes with interactive line-by-line commenting for feedback.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
+-------------------+----------------------------------+
|
||||
| | |
|
||||
| Commit Header: | Diff Content |
|
||||
| • Hash | (files with hunks) |
|
||||
| • Message | with line numbers |
|
||||
| • Author/Date | and +/- indicators |
|
||||
| | |
|
||||
+-------------------+----------------------------------+
|
||||
| Prompt Output Panel (fixed bottom-right) |
|
||||
| [ Copy All ] |
|
||||
| Shows all comments formatted for prompt |
|
||||
+------------------------------------------------------+
|
||||
```
|
||||
|
||||
Diff review playgrounds display git diffs with syntax highlighting. Users click lines to add comments, which become part of the generated prompt for code review feedback.
|
||||
|
||||
## Control types for diff review
|
||||
|
||||
| Feature | Control | Behavior |
|
||||
|---|---|---|
|
||||
| Line commenting | Click any diff line | Opens textarea below the line |
|
||||
| Comment indicator | Badge on commented lines | Shows which lines have feedback |
|
||||
| Save/Cancel | Buttons in comment box | Persist or discard comment |
|
||||
| Copy prompt | Button in prompt panel | Copies all comments to clipboard |
|
||||
|
||||
## Diff rendering
|
||||
|
||||
Parse diff data into structured format for rendering:
|
||||
|
||||
```javascript
|
||||
const diffData = [
|
||||
{
|
||||
file: "path/to/file.py",
|
||||
hunks: [
|
||||
{
|
||||
header: "@@ -41,13 +41,13 @@ function context",
|
||||
lines: [
|
||||
{ type: "context", oldNum: 41, newNum: 41, content: "unchanged line" },
|
||||
{ type: "deletion", oldNum: 42, newNum: null, content: "removed line" },
|
||||
{ type: "addition", oldNum: null, newNum: 42, content: "added line" },
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
];
|
||||
```
|
||||
|
||||
## Line type styling
|
||||
|
||||
| Type | Background | Text Color | Prefix |
|
||||
|---|---|---|---|
|
||||
| `context` | transparent | default | ` ` (space) |
|
||||
| `addition` | green tint (#dafbe1 light / rgba(46,160,67,0.15) dark) | green (#1a7f37 light / #7ee787 dark) | `+` |
|
||||
| `deletion` | red tint (#ffebe9 light / rgba(248,81,73,0.15) dark) | red (#cf222e light / #f85149 dark) | `-` |
|
||||
| `hunk-header` | blue tint (#ddf4ff light) | blue (#0969da light) | `@@` |
|
||||
|
||||
## Comment system
|
||||
|
||||
Each diff line gets a unique identifier for comment tracking:
|
||||
|
||||
```javascript
|
||||
const comments = {}; // { lineId: commentText }
|
||||
|
||||
function selectLine(lineId, lineEl) {
|
||||
// Deselect previous
|
||||
document.querySelectorAll('.diff-line.selected').forEach(el =>
|
||||
el.classList.remove('selected'));
|
||||
document.querySelectorAll('.comment-box.active').forEach(el =>
|
||||
el.classList.remove('active'));
|
||||
|
||||
// Select new
|
||||
lineEl.classList.add('selected');
|
||||
document.getElementById(`comment-box-${lineId}`).classList.add('active');
|
||||
}
|
||||
|
||||
function saveComment(lineId) {
|
||||
const textarea = document.getElementById(`textarea-${lineId}`);
|
||||
const comment = textarea.value.trim();
|
||||
|
||||
if (comment) {
|
||||
comments[lineId] = comment;
|
||||
} else {
|
||||
delete comments[lineId];
|
||||
}
|
||||
|
||||
renderDiff(); // Re-render to show comment indicator
|
||||
updatePromptOutput();
|
||||
}
|
||||
```
|
||||
|
||||
## Prompt output format
|
||||
|
||||
Generate a structured code review format:
|
||||
|
||||
```javascript
|
||||
function updatePromptOutput() {
|
||||
const commentKeys = Object.keys(comments);
|
||||
|
||||
if (commentKeys.length === 0) {
|
||||
promptContent.innerHTML = '<span class="no-comments">Click on any line to add a comment...</span>';
|
||||
return;
|
||||
}
|
||||
|
||||
let output = 'Code Review Comments:\n\n';
|
||||
|
||||
commentKeys.forEach(lineId => {
|
||||
const lineEl = document.querySelector(`[data-line-id="${lineId}"]`);
|
||||
const file = lineEl.dataset.file;
|
||||
const lineNum = lineEl.dataset.lineNum;
|
||||
const content = lineEl.dataset.content;
|
||||
|
||||
output += `📍 ${file}:${lineNum}\n`;
|
||||
output += ` Code: ${content.trim()}\n`;
|
||||
output += ` Comment: ${comments[lineId]}\n\n`;
|
||||
});
|
||||
|
||||
promptContent.textContent = output;
|
||||
}
|
||||
```
|
||||
|
||||
## Data attributes for line elements
|
||||
|
||||
Store metadata on each line element for prompt generation:
|
||||
|
||||
```html
|
||||
<div class="diff-line addition"
|
||||
data-line-id="0-1-5"
|
||||
data-file="src/utils/handler.py"
|
||||
data-line-num="45"
|
||||
data-content="subagent_id = tracker.register()">
|
||||
```
|
||||
|
||||
## Pre-populating with real data
|
||||
|
||||
To create a diff viewer for a specific commit:
|
||||
|
||||
1. Run `git show <commit> --format="%H%n%s%n%an%n%ad" -p`
|
||||
2. Parse the output into the `diffData` structure
|
||||
3. Include commit metadata in the header section
|
||||
|
||||
## Theme support
|
||||
|
||||
Support both light and dark modes:
|
||||
|
||||
```css
|
||||
/* Light mode */
|
||||
body { background: #f6f8fa; color: #1f2328; }
|
||||
.file-card { background: #ffffff; border: 1px solid #d0d7de; }
|
||||
.diff-line.addition { background: #dafbe1; }
|
||||
.diff-line.deletion { background: #ffebe9; }
|
||||
|
||||
/* Dark mode */
|
||||
body { background: #0d1117; color: #c9d1d9; }
|
||||
.file-card { background: #161b22; border: 1px solid #30363d; }
|
||||
.diff-line.addition { background: rgba(46, 160, 67, 0.15); }
|
||||
.diff-line.deletion { background: rgba(248, 81, 73, 0.15); }
|
||||
```
|
||||
|
||||
## Interactive features
|
||||
|
||||
- **Hover hint:** Show "Click to comment" tooltip on line hover
|
||||
- **Comment indicator:** Badge (💬) on lines with saved comments
|
||||
- **Toast notification:** "Copied to clipboard!" feedback on copy
|
||||
- **Edit existing:** Allow editing previously saved comments
|
||||
|
||||
## Example topics
|
||||
|
||||
- Git commit review (single commit diff with line comments)
|
||||
- Pull request review (multiple commits, file-level and line-level comments)
|
||||
- Code diff comparison (before/after refactoring)
|
||||
- Merge conflict resolution (showing both versions with annotations)
|
||||
- Code audit (security review with findings per line)
|
||||
@@ -1,171 +0,0 @@
|
||||
# Document Critique Template
|
||||
|
||||
Use this template when the playground helps review and critique documents: SKILL.md files, READMEs, specs, proposals, or any text that needs structured feedback with approve/reject/comment workflow.
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
+---------------------------+--------------------+
|
||||
| | |
|
||||
| Document content | Suggestions panel |
|
||||
| with line numbers | (filterable list) |
|
||||
| and suggestion | • Approve |
|
||||
| highlighting | • Reject |
|
||||
| | • Comment |
|
||||
| | |
|
||||
+---------------------------+--------------------+
|
||||
| Prompt output (approved + commented items) |
|
||||
| [ Copy Prompt ] |
|
||||
+------------------------------------------------+
|
||||
```
|
||||
|
||||
## Key components
|
||||
|
||||
### Document panel (left)
|
||||
- Display full document with line numbers
|
||||
- Highlight lines with suggestions using a colored left border
|
||||
- Color-code by status: pending (amber), approved (green), rejected (red with opacity)
|
||||
- Click a suggestion card to scroll to the relevant line
|
||||
|
||||
### Suggestions panel (right)
|
||||
- Filter tabs: All / Pending / Approved / Rejected
|
||||
- Stats in header showing counts for each status
|
||||
- Each suggestion card shows:
|
||||
- Line reference (e.g., "Line 3" or "Lines 17-24")
|
||||
- The suggestion text
|
||||
- Action buttons: Approve / Reject / Comment (or Reset if already decided)
|
||||
- Optional textarea for user comments
|
||||
|
||||
### Prompt output (bottom)
|
||||
- Generates a prompt only from approved suggestions and user comments
|
||||
- Groups by: Approved Improvements, Additional Feedback, Rejected (for context)
|
||||
- Copy button with "Copied!" feedback
|
||||
|
||||
## State structure
|
||||
|
||||
```javascript
|
||||
const suggestions = [
|
||||
{
|
||||
id: 1,
|
||||
lineRef: "Line 3",
|
||||
targetText: "description: Creates interactive...",
|
||||
suggestion: "The description is too long. Consider shortening.",
|
||||
category: "clarity", // clarity, completeness, performance, accessibility, ux
|
||||
status: "pending", // pending, approved, rejected
|
||||
userComment: ""
|
||||
},
|
||||
// ... more suggestions
|
||||
];
|
||||
|
||||
let state = {
|
||||
suggestions: [...],
|
||||
activeFilter: "all",
|
||||
activeSuggestionId: null
|
||||
};
|
||||
```
|
||||
|
||||
## Suggestion matching to lines
|
||||
|
||||
Match suggestions to document lines by parsing the lineRef:
|
||||
|
||||
```javascript
|
||||
const suggestion = state.suggestions.find(s => {
|
||||
const match = s.lineRef.match(/Line[s]?\s*(\d+)/);
|
||||
if (match) {
|
||||
const targetLine = parseInt(match[1]);
|
||||
return Math.abs(targetLine - lineNum) <= 2; // fuzzy match nearby lines
|
||||
}
|
||||
return false;
|
||||
});
|
||||
```
|
||||
|
||||
## Document rendering
|
||||
|
||||
Handle markdown-style formatting inline:
|
||||
|
||||
```javascript
|
||||
// Skip ``` lines, wrap content in code-block-wrapper
|
||||
if (line.startsWith('```')) {
|
||||
inCodeBlock = !inCodeBlock;
|
||||
// Open or close wrapper div
|
||||
}
|
||||
|
||||
// Headers
|
||||
if (line.startsWith('# ')) renderedLine = `<h1>...</h1>`;
|
||||
if (line.startsWith('## ')) renderedLine = `<h2>...</h2>`;
|
||||
|
||||
// Inline formatting (outside code blocks)
|
||||
renderedLine = renderedLine.replace(/`([^`]+)`/g, '<code>$1</code>');
|
||||
renderedLine = renderedLine.replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>');
|
||||
```
|
||||
|
||||
## Prompt output generation
|
||||
|
||||
Only include actionable items:
|
||||
|
||||
```javascript
|
||||
function updatePrompt() {
|
||||
const approved = state.suggestions.filter(s => s.status === 'approved');
|
||||
const withComments = state.suggestions.filter(s => s.userComment?.trim());
|
||||
|
||||
if (approved.length === 0 && withComments.length === 0) {
|
||||
// Show placeholder
|
||||
return;
|
||||
}
|
||||
|
||||
let prompt = 'Please update [DOCUMENT] with the following changes:\n\n';
|
||||
|
||||
if (approved.length > 0) {
|
||||
prompt += '## Approved Improvements\n\n';
|
||||
for (const s of approved) {
|
||||
prompt += `**${s.lineRef}:** ${s.suggestion}`;
|
||||
if (s.userComment?.trim()) {
|
||||
prompt += `\n → User note: ${s.userComment.trim()}`;
|
||||
}
|
||||
prompt += '\n\n';
|
||||
}
|
||||
}
|
||||
|
||||
// Additional feedback from non-approved items with comments
|
||||
// Rejected items listed for context only
|
||||
}
|
||||
```
|
||||
|
||||
## Styling highlights
|
||||
|
||||
```css
|
||||
.doc-line.has-suggestion {
|
||||
border-left: 3px solid #bf8700; /* amber for pending */
|
||||
background: rgba(191, 135, 0, 0.08);
|
||||
}
|
||||
|
||||
.doc-line.approved {
|
||||
border-left-color: #1a7f37; /* green */
|
||||
background: rgba(26, 127, 55, 0.08);
|
||||
}
|
||||
|
||||
.doc-line.rejected {
|
||||
border-left-color: #cf222e; /* red */
|
||||
background: rgba(207, 34, 46, 0.08);
|
||||
opacity: 0.6;
|
||||
}
|
||||
```
|
||||
|
||||
## Pre-populating suggestions
|
||||
|
||||
When building a critique playground for a specific document:
|
||||
|
||||
1. Read the document content
|
||||
2. Analyze and generate suggestions with:
|
||||
- Specific line references
|
||||
- Clear, actionable suggestion text
|
||||
- Category tags (clarity, completeness, performance, accessibility, ux)
|
||||
3. Embed both the document content and suggestions array in the HTML
|
||||
|
||||
## Example use cases
|
||||
|
||||
- SKILL.md review (skill definition quality, completeness, clarity)
|
||||
- README critique (documentation quality, missing sections, unclear explanations)
|
||||
- Spec review (requirements clarity, missing edge cases, ambiguity)
|
||||
- Proposal feedback (structure, argumentation, missing context)
|
||||
- Code comment review (docstring quality, inline comment usefulness)
|
||||
@@ -6,13 +6,13 @@ A comprehensive toolkit for developing Claude Code plugins with expert guidance
|
||||
|
||||
The plugin-dev toolkit provides seven specialized skills to help you build high-quality Claude Code plugins:
|
||||
|
||||
1. **hook-development** - Advanced hooks API and event-driven automation
|
||||
2. **mcp-integration** - Model Context Protocol server integration
|
||||
3. **plugin-structure** - Plugin organization and manifest configuration
|
||||
4. **plugin-settings** - Configuration patterns using .claude/plugin-name.local.md files
|
||||
5. **command-development** - Creating slash commands with frontmatter and arguments
|
||||
6. **agent-development** - Creating autonomous agents with AI-assisted generation
|
||||
7. **skill-development** - Creating skills with progressive disclosure and strong triggers
|
||||
1. **Hook Development** - Advanced hooks API and event-driven automation
|
||||
2. **MCP Integration** - Model Context Protocol server integration
|
||||
3. **Plugin Structure** - Plugin organization and manifest configuration
|
||||
4. **Plugin Settings** - Configuration patterns using .claude/plugin-name.local.md files
|
||||
5. **Command Development** - Creating slash commands with frontmatter and arguments
|
||||
6. **Agent Development** - Creating autonomous agents with AI-assisted generation
|
||||
7. **Skill Development** - Creating skills with progressive disclosure and strong triggers
|
||||
|
||||
Each skill follows best practices with progressive disclosure: lean core documentation, detailed references, working examples, and utility scripts.
|
||||
|
||||
@@ -53,7 +53,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
## Skills
|
||||
|
||||
### 1. hook-development
|
||||
### 1. Hook Development
|
||||
|
||||
**Trigger phrases:** "create a hook", "add a PreToolUse hook", "validate tool use", "implement prompt-based hooks", "${CLAUDE_PLUGIN_ROOT}", "block dangerous commands"
|
||||
|
||||
@@ -73,7 +73,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
**Use when:** Creating event-driven automation, validating operations, or enforcing policies in your plugin.
|
||||
|
||||
### 2. mcp-integration
|
||||
### 2. MCP Integration
|
||||
|
||||
**Trigger phrases:** "add MCP server", "integrate MCP", "configure .mcp.json", "Model Context Protocol", "stdio/SSE/HTTP server", "connect external service"
|
||||
|
||||
@@ -92,7 +92,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
**Use when:** Integrating external services, APIs, databases, or tools into your plugin.
|
||||
|
||||
### 3. plugin-structure
|
||||
### 3. Plugin Structure
|
||||
|
||||
**Trigger phrases:** "plugin structure", "plugin.json manifest", "auto-discovery", "component organization", "plugin directory layout"
|
||||
|
||||
@@ -111,7 +111,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
**Use when:** Starting a new plugin, organizing components, or configuring the plugin manifest.
|
||||
|
||||
### 4. plugin-settings
|
||||
### 4. Plugin Settings
|
||||
|
||||
**Trigger phrases:** "plugin settings", "store plugin configuration", ".local.md files", "plugin state files", "read YAML frontmatter", "per-project plugin settings"
|
||||
|
||||
@@ -120,7 +120,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
- YAML frontmatter + markdown body structure
|
||||
- Parsing techniques for bash scripts (sed, awk, grep patterns)
|
||||
- Temporarily active hooks (flag files and quick-exit)
|
||||
- Real-world examples from multi-agent-swarm and ralph-loop plugins
|
||||
- Real-world examples from multi-agent-swarm and ralph-wiggum plugins
|
||||
- Atomic file updates and validation
|
||||
- Gitignore and lifecycle management
|
||||
|
||||
@@ -132,7 +132,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
**Use when:** Making plugins configurable, storing per-project state, or implementing user preferences.
|
||||
|
||||
### 5. command-development
|
||||
### 5. Command Development
|
||||
|
||||
**Trigger phrases:** "create a slash command", "add a command", "command frontmatter", "define command arguments", "organize commands"
|
||||
|
||||
@@ -151,7 +151,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
**Use when:** Creating slash commands, defining command arguments, or organizing plugin commands.
|
||||
|
||||
### 6. agent-development
|
||||
### 6. Agent Development
|
||||
|
||||
**Trigger phrases:** "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "autonomous agent"
|
||||
|
||||
@@ -172,7 +172,7 @@ Use this workflow for structured, high-quality plugin development from concept t
|
||||
|
||||
**Use when:** Creating autonomous agents, defining agent behavior, or implementing AI-assisted agent generation.
|
||||
|
||||
### 7. skill-development
|
||||
### 7. Skill Development
|
||||
|
||||
**Trigger phrases:** "create a skill", "add a skill to plugin", "write a new skill", "improve skill description", "organize skill content"
|
||||
|
||||
@@ -286,11 +286,11 @@ The hook-development skill includes production-ready utilities:
|
||||
### Working Examples
|
||||
|
||||
Every skill provides working examples:
|
||||
- **hook-development**: 3 complete hook scripts (bash, write validation, context loading)
|
||||
- **mcp-integration**: 3 server configurations (stdio, SSE, HTTP)
|
||||
- **plugin-structure**: 3 plugin layouts (minimal, standard, advanced)
|
||||
- **plugin-settings**: 3 examples (read-settings hook, create-settings command, templates)
|
||||
- **command-development**: 10 complete command examples (review, test, deploy, docs, etc.)
|
||||
- **Hook Development**: 3 complete hook scripts (bash, write validation, context loading)
|
||||
- **MCP Integration**: 3 server configurations (stdio, SSE, HTTP)
|
||||
- **Plugin Structure**: 3 plugin layouts (minimal, standard, advanced)
|
||||
- **Plugin Settings**: 3 examples (read-settings hook, create-settings command, templates)
|
||||
- **Command Development**: 10 complete command examples (review, test, deploy, docs, etc.)
|
||||
|
||||
## Documentation Standards
|
||||
|
||||
|
||||
@@ -1,34 +1,34 @@
|
||||
---
|
||||
name: agent-creator
|
||||
description: |
|
||||
Use this agent when the user asks to "create an agent", "generate an agent", "build a new agent", "make me an agent that...", or describes agent functionality they need. Trigger when user wants to create autonomous agents for plugins. Examples:
|
||||
description: Use this agent when the user asks to "create an agent", "generate an agent", "build a new agent", "make me an agent that...", or describes agent functionality they need. Trigger when user wants to create autonomous agents for plugins. Examples:
|
||||
|
||||
<example>
|
||||
Context: User wants to create a code review agent
|
||||
user: "Create an agent that reviews code for quality issues"
|
||||
assistant: "I'll use the agent-creator agent to generate the agent configuration."
|
||||
<commentary>
|
||||
User requesting new agent creation, trigger agent-creator to generate it.
|
||||
</commentary>
|
||||
</example>
|
||||
<example>
|
||||
Context: User wants to create a code review agent
|
||||
user: "Create an agent that reviews code for quality issues"
|
||||
assistant: "I'll use the agent-creator agent to generate the agent configuration."
|
||||
<commentary>
|
||||
User requesting new agent creation, trigger agent-creator to generate it.
|
||||
</commentary>
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User describes needed functionality
|
||||
user: "I need an agent that generates unit tests for my code"
|
||||
assistant: "I'll use the agent-creator agent to create a test generation agent."
|
||||
<commentary>
|
||||
User describes agent need, trigger agent-creator to build it.
|
||||
</commentary>
|
||||
</example>
|
||||
<example>
|
||||
Context: User describes needed functionality
|
||||
user: "I need an agent that generates unit tests for my code"
|
||||
assistant: "I'll use the agent-creator agent to create a test generation agent."
|
||||
<commentary>
|
||||
User describes agent need, trigger agent-creator to build it.
|
||||
</commentary>
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User wants to add agent to plugin
|
||||
user: "Add an agent to my plugin that validates configurations"
|
||||
assistant: "I'll use the agent-creator agent to generate a configuration validator agent."
|
||||
<commentary>
|
||||
Plugin development with agent addition, trigger agent-creator.
|
||||
</commentary>
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User wants to add agent to plugin
|
||||
user: "Add an agent to my plugin that validates configurations"
|
||||
assistant: "I'll use the agent-creator agent to generate a configuration validator agent."
|
||||
<commentary>
|
||||
Plugin development with agent addition, trigger agent-creator.
|
||||
</commentary>
|
||||
</example>
|
||||
model: sonnet
|
||||
color: magenta
|
||||
tools: ["Write", "Read"]
|
||||
|
||||
@@ -1,36 +1,36 @@
|
||||
---
|
||||
name: plugin-validator
|
||||
description: |
|
||||
Use this agent when the user asks to "validate my plugin", "check plugin structure", "verify plugin is correct", "validate plugin.json", "check plugin files", or mentions plugin validation. Also trigger proactively after user creates or modifies plugin components. Examples:
|
||||
description: Use this agent when the user asks to "validate my plugin", "check plugin structure", "verify plugin is correct", "validate plugin.json", "check plugin files", or mentions plugin validation. Also trigger proactively after user creates or modifies plugin components. Examples:
|
||||
|
||||
<example>
|
||||
Context: User finished creating a new plugin
|
||||
user: "I've created my first plugin with commands and hooks"
|
||||
assistant: "Great! Let me validate the plugin structure."
|
||||
<commentary>
|
||||
Plugin created, proactively validate to catch issues early.
|
||||
</commentary>
|
||||
assistant: "I'll use the plugin-validator agent to check the plugin."
|
||||
</example>
|
||||
<example>
|
||||
Context: User finished creating a new plugin
|
||||
user: "I've created my first plugin with commands and hooks"
|
||||
assistant: "Great! Let me validate the plugin structure."
|
||||
<commentary>
|
||||
Plugin created, proactively validate to catch issues early.
|
||||
</commentary>
|
||||
assistant: "I'll use the plugin-validator agent to check the plugin."
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User explicitly requests validation
|
||||
user: "Validate my plugin before I publish it"
|
||||
assistant: "I'll use the plugin-validator agent to perform comprehensive validation."
|
||||
<commentary>
|
||||
Explicit validation request triggers the agent.
|
||||
</commentary>
|
||||
</example>
|
||||
<example>
|
||||
Context: User explicitly requests validation
|
||||
user: "Validate my plugin before I publish it"
|
||||
assistant: "I'll use the plugin-validator agent to perform comprehensive validation."
|
||||
<commentary>
|
||||
Explicit validation request triggers the agent.
|
||||
</commentary>
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User modified plugin.json
|
||||
user: "I've updated the plugin manifest"
|
||||
assistant: "Let me validate the changes."
|
||||
<commentary>
|
||||
Manifest modified, validate to ensure correctness.
|
||||
</commentary>
|
||||
assistant: "I'll use the plugin-validator agent to check the manifest."
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User modified plugin.json
|
||||
user: "I've updated the plugin manifest"
|
||||
assistant: "Let me validate the changes."
|
||||
<commentary>
|
||||
Manifest modified, validate to ensure correctness.
|
||||
</commentary>
|
||||
assistant: "I'll use the plugin-validator agent to check the manifest."
|
||||
</example>
|
||||
model: inherit
|
||||
color: yellow
|
||||
tools: ["Read", "Grep", "Glob", "Bash"]
|
||||
|
||||
@@ -1,35 +1,35 @@
|
||||
---
|
||||
name: skill-reviewer
|
||||
description: |
|
||||
Use this agent when the user has created or modified a skill and needs quality review, asks to "review my skill", "check skill quality", "improve skill description", or wants to ensure skill follows best practices. Trigger proactively after skill creation. Examples:
|
||||
description: Use this agent when the user has created or modified a skill and needs quality review, asks to "review my skill", "check skill quality", "improve skill description", or wants to ensure skill follows best practices. Trigger proactively after skill creation. Examples:
|
||||
|
||||
<example>
|
||||
Context: User just created a new skill
|
||||
user: "I've created a PDF processing skill"
|
||||
assistant: "Great! Let me review the skill quality."
|
||||
<commentary>
|
||||
Skill created, proactively trigger skill-reviewer to ensure it follows best practices.
|
||||
</commentary>
|
||||
assistant: "I'll use the skill-reviewer agent to review the skill."
|
||||
</example>
|
||||
<example>
|
||||
Context: User just created a new skill
|
||||
user: "I've created a PDF processing skill"
|
||||
assistant: "Great! Let me review the skill quality."
|
||||
<commentary>
|
||||
Skill created, proactively trigger skill-reviewer to ensure it follows best practices.
|
||||
</commentary>
|
||||
assistant: "I'll use the skill-reviewer agent to review the skill."
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User requests skill review
|
||||
user: "Review my skill and tell me how to improve it"
|
||||
assistant: "I'll use the skill-reviewer agent to analyze the skill quality."
|
||||
<commentary>
|
||||
Explicit skill review request triggers the agent.
|
||||
</commentary>
|
||||
</example>
|
||||
<example>
|
||||
Context: User requests skill review
|
||||
user: "Review my skill and tell me how to improve it"
|
||||
assistant: "I'll use the skill-reviewer agent to analyze the skill quality."
|
||||
<commentary>
|
||||
Explicit skill review request triggers the agent.
|
||||
</commentary>
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User modified skill description
|
||||
user: "I updated the skill description, does it look good?"
|
||||
assistant: "I'll use the skill-reviewer agent to review the changes."
|
||||
<commentary>
|
||||
Skill description modified, review for triggering effectiveness.
|
||||
</commentary>
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: User modified skill description
|
||||
user: "I updated the skill description, does it look good?"
|
||||
assistant: "I'll use the skill-reviewer agent to review the changes."
|
||||
<commentary>
|
||||
Skill description modified, review for triggering effectiveness.
|
||||
</commentary>
|
||||
</example>
|
||||
model: inherit
|
||||
color: cyan
|
||||
tools: ["Read", "Grep", "Glob"]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: agent-development
|
||||
name: Agent Development
|
||||
description: This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.
|
||||
version: 0.1.0
|
||||
---
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: command-development
|
||||
name: Command Development
|
||||
description: This skill should be used when the user asks to "create a slash command", "add a command", "write a custom command", "define command arguments", "use command frontmatter", "organize commands", "create command with file references", "interactive command", "use AskUserQuestion in command", or needs guidance on slash command structure, YAML frontmatter fields, dynamic arguments, bash execution in commands, user interaction patterns, or command development best practices for Claude Code.
|
||||
version: 0.2.0
|
||||
---
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: hook-development
|
||||
name: Hook Development
|
||||
description: This skill should be used when the user asks to "create a hook", "add a PreToolUse/PostToolUse/Stop hook", "validate tool use", "implement prompt-based hooks", "use ${CLAUDE_PLUGIN_ROOT}", "set up event-driven automation", "block dangerous commands", or mentions hook events (PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart, SessionEnd, UserPromptSubmit, PreCompact, Notification). Provides comprehensive guidance for creating and implementing Claude Code plugin hooks with focus on advanced prompt-based hooks API.
|
||||
version: 0.1.0
|
||||
---
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: mcp-integration
|
||||
name: MCP Integration
|
||||
description: This skill should be used when the user asks to "add MCP server", "integrate MCP", "configure MCP in plugin", "use .mcp.json", "set up Model Context Protocol", "connect external service", mentions "${CLAUDE_PLUGIN_ROOT} with MCP", or discusses MCP server types (SSE, stdio, HTTP, WebSocket). Provides comprehensive guidance for integrating Model Context Protocol servers into Claude Code plugins for external tool and service integration.
|
||||
version: 0.1.0
|
||||
---
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: plugin-settings
|
||||
name: Plugin Settings
|
||||
description: This skill should be used when the user asks about "plugin settings", "store plugin configuration", "user-configurable plugin", ".local.md files", "plugin state files", "read YAML frontmatter", "per-project plugin settings", or wants to make plugin behavior configurable. Documents the .claude/plugin-name.local.md pattern for storing plugin-specific configuration with YAML frontmatter and markdown content.
|
||||
version: 0.1.0
|
||||
---
|
||||
@@ -449,7 +449,7 @@ Coordinate with auth-agent on shared types.
|
||||
- Sends notifications to coordinator if enabled
|
||||
- Allows quick activation/deactivation via `enabled: true/false`
|
||||
|
||||
### ralph-loop Plugin
|
||||
### ralph-wiggum Plugin
|
||||
|
||||
**.claude/ralph-loop.local.md:**
|
||||
```markdown
|
||||
@@ -512,7 +512,7 @@ fi
|
||||
For detailed implementation patterns:
|
||||
|
||||
- **`references/parsing-techniques.md`** - Complete guide to parsing YAML frontmatter and markdown bodies
|
||||
- **`references/real-world-examples.md`** - Deep dive into multi-agent-swarm and ralph-loop implementations
|
||||
- **`references/real-world-examples.md`** - Deep dive into multi-agent-swarm and ralph-wiggum implementations
|
||||
|
||||
### Example Files
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ sed "s/^pr_number: .*/pr_number: $PR_NUM/" \
|
||||
mv temp.md ".claude/multi-agent-swarm.local.md"
|
||||
```
|
||||
|
||||
## ralph-loop Plugin
|
||||
## ralph-wiggum Plugin
|
||||
|
||||
### Settings File Structure
|
||||
|
||||
@@ -253,7 +253,7 @@ echo "Ralph loop initialized: .claude/ralph-loop.local.md"
|
||||
|
||||
## Pattern Comparison
|
||||
|
||||
| Feature | multi-agent-swarm | ralph-loop |
|
||||
| Feature | multi-agent-swarm | ralph-wiggum |
|
||||
|---------|-------------------|--------------|
|
||||
| **File** | `.claude/multi-agent-swarm.local.md` | `.claude/ralph-loop.local.md` |
|
||||
| **Purpose** | Agent coordination state | Loop iteration state |
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: plugin-structure
|
||||
name: Plugin Structure
|
||||
description: This skill should be used when the user asks to "create a plugin", "scaffold a plugin", "understand plugin structure", "organize plugin components", "set up plugin.json", "use ${CLAUDE_PLUGIN_ROOT}", "add commands/agents/skills/hooks", "configure auto-discovery", or needs guidance on plugin directory layout, manifest configuration, component organization, file naming conventions, or Claude Code plugin architecture best practices.
|
||||
version: 0.1.0
|
||||
---
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: skill-development
|
||||
name: Skill Development
|
||||
description: This skill should be used when the user wants to "create a skill", "add a skill to plugin", "write a new skill", "improve skill description", "organize skill content", or needs guidance on skill structure, progressive disclosure, or skill development best practices for Claude Code plugins.
|
||||
version: 0.1.0
|
||||
---
|
||||
@@ -310,7 +310,7 @@ Study the skills in this plugin as examples of best practices:
|
||||
|
||||
**plugin-settings skill:**
|
||||
- Specific triggers: "plugin settings", ".local.md files", "YAML frontmatter"
|
||||
- References show real implementations (multi-agent-swarm, ralph-loop)
|
||||
- References show real implementations (multi-agent-swarm, ralph-wiggum)
|
||||
- Working parsing scripts
|
||||
|
||||
Each demonstrates progressive disclosure and strong triggering.
|
||||
|
||||
@@ -1,42 +1,37 @@
|
||||
---
|
||||
name: code-simplifier
|
||||
description: |
|
||||
Use this agent when code has been written or modified and needs to be simplified for clarity, consistency, and maintainability while preserving all functionality. This agent should be triggered automatically after completing a coding task or writing a logical chunk of code. It simplifies code by following project best practices while retaining all functionality. The agent focuses only on recently modified code unless instructed otherwise.
|
||||
description: Use this agent when code has been written or modified and needs to be simplified for clarity, consistency, and maintainability while preserving all functionality. This agent should be triggered automatically after completing a coding task or writing a logical chunk of code. It simplifies code by following project best practices while retaining all functionality. The agent focuses only on recently modified code unless instructed otherwise.\n\nExamples:\n\n<example>
|
||||
Context: The assistant has just implemented a new feature that adds user authentication to an API endpoint.
|
||||
user: "Please add authentication to the /api/users endpoint"
|
||||
assistant: "I've implemented the authentication for the /api/users endpoint. Here's the code:"
|
||||
<function call omitted for brevity>
|
||||
<commentary>
|
||||
Since a logical chunk of code has been written (authentication feature), use the Task tool to launch the code-simplifier agent to improve the code's clarity and maintainability while preserving functionality.
|
||||
</commentary>
|
||||
assistant: "Now let me use the code-simplifier agent to refine this implementation for better clarity and maintainability"
|
||||
</example>
|
||||
|
||||
Examples:
|
||||
<example>
|
||||
Context: The assistant has just fixed a bug by adding several conditional checks.
|
||||
user: "Fix the null pointer exception in the data processor"
|
||||
assistant: "I've added the necessary null checks to prevent the exception:"
|
||||
<function call omitted for brevity>
|
||||
<commentary>
|
||||
After modifying code to fix a bug, use the code-simplifier agent to ensure the fix follows best practices and maintains code quality.
|
||||
</commentary>
|
||||
assistant: "Let me refine this bug fix using the code-simplifier agent to ensure it follows our best practices"
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: The assistant has just implemented a new feature that adds user authentication to an API endpoint.
|
||||
user: "Please add authentication to the /api/users endpoint"
|
||||
assistant: "I've implemented the authentication for the /api/users endpoint. Here's the code:"
|
||||
<function call omitted for brevity>
|
||||
<commentary>
|
||||
Since a logical chunk of code has been written (authentication feature), use the Task tool to launch the code-simplifier agent to improve the code's clarity and maintainability while preserving functionality.
|
||||
</commentary>
|
||||
assistant: "Now let me use the code-simplifier agent to refine this implementation for better clarity and maintainability"
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: The assistant has just fixed a bug by adding several conditional checks.
|
||||
user: "Fix the null pointer exception in the data processor"
|
||||
assistant: "I've added the necessary null checks to prevent the exception:"
|
||||
<function call omitted for brevity>
|
||||
<commentary>
|
||||
After modifying code to fix a bug, use the code-simplifier agent to ensure the fix follows best practices and maintains code quality.
|
||||
</commentary>
|
||||
assistant: "Let me refine this bug fix using the code-simplifier agent to ensure it follows our best practices"
|
||||
</example>
|
||||
|
||||
<example>
|
||||
Context: The assistant has just refactored a function to improve performance.
|
||||
user: "Optimize the data sorting algorithm for better performance"
|
||||
assistant: "I've optimized the sorting algorithm. Here's the updated implementation:"
|
||||
<function call omitted for brevity>
|
||||
<commentary>
|
||||
After completing a performance optimization task, use the code-simplifier agent to ensure the optimized code is also clear and maintainable.
|
||||
</commentary>
|
||||
assistant: "Now I'll use the code-simplifier agent to ensure the optimized code is also clear and follows our coding standards"
|
||||
</example>
|
||||
<example>
|
||||
Context: The assistant has just refactored a function to improve performance.
|
||||
user: "Optimize the data sorting algorithm for better performance"
|
||||
assistant: "I've optimized the sorting algorithm. Here's the updated implementation:"
|
||||
<function call omitted for brevity>
|
||||
<commentary>
|
||||
After completing a performance optimization task, use the code-simplifier agent to ensure the optimized code is also clear and maintainable.
|
||||
</commentary>
|
||||
assistant: "Now I'll use the code-simplifier agent to ensure the optimized code is also clear and follows our coding standards"
|
||||
</example>
|
||||
model: opus
|
||||
---
|
||||
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
# pyright-lsp
|
||||
|
||||
Python language server (Pyright) for Claude Code, providing static type checking and code intelligence.
|
||||
|
||||
## Supported Extensions
|
||||
`.py`, `.pyi`
|
||||
|
||||
## Installation
|
||||
|
||||
Install Pyright globally via npm:
|
||||
|
||||
```bash
|
||||
npm install -g pyright
|
||||
```
|
||||
|
||||
Or with pip:
|
||||
|
||||
```bash
|
||||
pip install pyright
|
||||
```
|
||||
|
||||
Or with pipx (recommended for CLI tools):
|
||||
|
||||
```bash
|
||||
pipx install pyright
|
||||
```
|
||||
|
||||
## More Information
|
||||
- [Pyright on npm](https://www.npmjs.com/package/pyright)
|
||||
- [Pyright on PyPI](https://pypi.org/project/pyright/)
|
||||
- [GitHub Repository](https://github.com/microsoft/pyright)
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"name": "ralph-loop",
|
||||
"description": "Continuous self-referential AI loops for interactive iterative development, implementing the Ralph Wiggum technique. Run Claude in a while-true loop with the same prompt until task completion.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
---
|
||||
description: "Cancel active Ralph Loop"
|
||||
allowed-tools: ["Bash(test -f .claude/ralph-loop.local.md:*)", "Bash(rm .claude/ralph-loop.local.md)", "Read(.claude/ralph-loop.local.md)"]
|
||||
hide-from-slash-command-tool: "true"
|
||||
---
|
||||
|
||||
# Cancel Ralph
|
||||
|
||||
To cancel the Ralph loop:
|
||||
|
||||
1. Check if `.claude/ralph-loop.local.md` exists using Bash: `test -f .claude/ralph-loop.local.md && echo "EXISTS" || echo "NOT_FOUND"`
|
||||
|
||||
2. **If NOT_FOUND**: Say "No active Ralph loop found."
|
||||
|
||||
3. **If EXISTS**:
|
||||
- Read `.claude/ralph-loop.local.md` to get the current iteration number from the `iteration:` field
|
||||
- Remove the file using Bash: `rm .claude/ralph-loop.local.md`
|
||||
- Report: "Cancelled Ralph loop (was at iteration N)" where N is the iteration value
|
||||
@@ -1,18 +0,0 @@
|
||||
---
|
||||
description: "Start Ralph Loop in current session"
|
||||
argument-hint: "PROMPT [--max-iterations N] [--completion-promise TEXT]"
|
||||
allowed-tools: ["Bash(${CLAUDE_PLUGIN_ROOT}/scripts/setup-ralph-loop.sh:*)"]
|
||||
hide-from-slash-command-tool: "true"
|
||||
---
|
||||
|
||||
# Ralph Loop Command
|
||||
|
||||
Execute the setup script to initialize the Ralph loop:
|
||||
|
||||
```!
|
||||
"${CLAUDE_PLUGIN_ROOT}/scripts/setup-ralph-loop.sh" $ARGUMENTS
|
||||
```
|
||||
|
||||
Please work on the task. When you try to exit, the Ralph loop will feed the SAME PROMPT back to you for the next iteration. You'll see your previous work in files and git history, allowing you to iterate and improve.
|
||||
|
||||
CRITICAL RULE: If a completion promise is set, you may ONLY output it when the statement is completely and unequivocally TRUE. Do not output false promises to escape the loop, even if you think you're stuck or should exit for other reasons. The loop is designed to continue until genuine completion.
|
||||
8
plugins/ralph-wiggum/.claude-plugin/plugin.json
Normal file
8
plugins/ralph-wiggum/.claude-plugin/plugin.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"name": "ralph-wiggum",
|
||||
"description": "Implementation of the Ralph Wiggum technique - continuous self-referential AI loops for interactive iterative development. Run Claude in a while-true loop with the same prompt until task completion.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,12 @@
|
||||
# Ralph Loop Plugin
|
||||
# Ralph Wiggum Plugin
|
||||
|
||||
Implementation of the Ralph Wiggum technique for iterative, self-referential AI development loops in Claude Code.
|
||||
|
||||
## What is Ralph Loop?
|
||||
## What is Ralph?
|
||||
|
||||
Ralph Loop is a development methodology based on continuous AI agent loops. As Geoffrey Huntley describes it: **"Ralph is a Bash loop"** - a simple `while true` that repeatedly feeds an AI agent a prompt file, allowing it to iteratively improve its work until completion.
|
||||
Ralph is a development methodology based on continuous AI agent loops. As Geoffrey Huntley describes it: **"Ralph is a Bash loop"** - a simple `while true` that repeatedly feeds an AI agent a prompt file, allowing it to iteratively improve its work until completion.
|
||||
|
||||
This technique is inspired by the Ralph Wiggum coding technique (named after the character from The Simpsons), embodying the philosophy of persistent iteration despite setbacks.
|
||||
The technique is named after Ralph Wiggum from The Simpsons, embodying the philosophy of persistent iteration despite setbacks.
|
||||
|
||||
### Core Concept
|
||||
|
||||
26
plugins/ralph-wiggum/commands/cancel-ralph.md
Normal file
26
plugins/ralph-wiggum/commands/cancel-ralph.md
Normal file
@@ -0,0 +1,26 @@
|
||||
---
|
||||
description: "Cancel active Ralph Wiggum loop"
|
||||
allowed-tools: ["Bash"]
|
||||
hide-from-slash-command-tool: "true"
|
||||
---
|
||||
|
||||
# Cancel Ralph
|
||||
|
||||
```!
|
||||
if [[ -f .claude/ralph-loop.local.md ]]; then
|
||||
ITERATION=$(grep '^iteration:' .claude/ralph-loop.local.md | sed 's/iteration: *//')
|
||||
echo "FOUND_LOOP=true"
|
||||
echo "ITERATION=$ITERATION"
|
||||
else
|
||||
echo "FOUND_LOOP=false"
|
||||
fi
|
||||
```
|
||||
|
||||
Check the output above:
|
||||
|
||||
1. **If FOUND_LOOP=false**:
|
||||
- Say "No active Ralph loop found."
|
||||
|
||||
2. **If FOUND_LOOP=true**:
|
||||
- Use Bash: `rm .claude/ralph-loop.local.md`
|
||||
- Report: "Cancelled Ralph loop (was at iteration N)" where N is the ITERATION value from above.
|
||||
@@ -1,14 +1,14 @@
|
||||
---
|
||||
description: "Explain Ralph Loop plugin and available commands"
|
||||
description: "Explain Ralph Wiggum technique and available commands"
|
||||
---
|
||||
|
||||
# Ralph Loop Plugin Help
|
||||
# Ralph Wiggum Plugin Help
|
||||
|
||||
Please explain the following to the user:
|
||||
|
||||
## What is Ralph Loop?
|
||||
## What is the Ralph Wiggum Technique?
|
||||
|
||||
Ralph Loop implements the Ralph Wiggum technique - an iterative development methodology based on continuous AI loops, pioneered by Geoffrey Huntley.
|
||||
The Ralph Wiggum technique is an iterative development methodology based on continuous AI loops, pioneered by Geoffrey Huntley.
|
||||
|
||||
**Core concept:**
|
||||
```bash
|
||||
48
plugins/ralph-wiggum/commands/ralph-loop.md
Normal file
48
plugins/ralph-wiggum/commands/ralph-loop.md
Normal file
@@ -0,0 +1,48 @@
|
||||
---
|
||||
description: "Start Ralph Wiggum loop in current session"
|
||||
argument-hint: "PROMPT [--max-iterations N] [--completion-promise TEXT]"
|
||||
allowed-tools: ["Bash(${CLAUDE_PLUGIN_ROOT}/scripts/setup-ralph-loop.sh)"]
|
||||
hide-from-slash-command-tool: "true"
|
||||
---
|
||||
|
||||
# Ralph Loop Command
|
||||
|
||||
Execute the setup script to initialize the Ralph loop:
|
||||
|
||||
```!
|
||||
"${CLAUDE_PLUGIN_ROOT}/scripts/setup-ralph-loop.sh" $ARGUMENTS
|
||||
|
||||
# Extract and display completion promise if set
|
||||
if [ -f .claude/ralph-loop.local.md ]; then
|
||||
PROMISE=$(grep '^completion_promise:' .claude/ralph-loop.local.md | sed 's/completion_promise: *//' | sed 's/^"\(.*\)"$/\1/')
|
||||
if [ -n "$PROMISE" ] && [ "$PROMISE" != "null" ]; then
|
||||
echo ""
|
||||
echo "═══════════════════════════════════════════════════════════"
|
||||
echo "CRITICAL - Ralph Loop Completion Promise"
|
||||
echo "═══════════════════════════════════════════════════════════"
|
||||
echo ""
|
||||
echo "To complete this loop, output this EXACT text:"
|
||||
echo " <promise>$PROMISE</promise>"
|
||||
echo ""
|
||||
echo "STRICT REQUIREMENTS (DO NOT VIOLATE):"
|
||||
echo " ✓ Use <promise> XML tags EXACTLY as shown above"
|
||||
echo " ✓ The statement MUST be completely and unequivocally TRUE"
|
||||
echo " ✓ Do NOT output false statements to exit the loop"
|
||||
echo " ✓ Do NOT lie even if you think you should exit"
|
||||
echo ""
|
||||
echo "IMPORTANT - Do not circumvent the loop:"
|
||||
echo " Even if you believe you're stuck, the task is impossible,"
|
||||
echo " or you've been running too long - you MUST NOT output a"
|
||||
echo " false promise statement. The loop is designed to continue"
|
||||
echo " until the promise is GENUINELY TRUE. Trust the process."
|
||||
echo ""
|
||||
echo " If the loop should stop, the promise statement will become"
|
||||
echo " true naturally. Do not force it by lying."
|
||||
echo "═══════════════════════════════════════════════════════════"
|
||||
fi
|
||||
fi
|
||||
```
|
||||
|
||||
Please work on the task. When you try to exit, the Ralph loop will feed the SAME PROMPT back to you for the next iteration. You'll see your previous work in files and git history, allowing you to iterate and improve.
|
||||
|
||||
CRITICAL RULE: If a completion promise is set, you may ONLY output it when the statement is completely and unequivocally TRUE. Do not output false promises to escape the loop, even if you think you're stuck or should exit for other reasons. The loop is designed to continue until genuine completion.
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"description": "Ralph Loop plugin stop hook for self-referential loops",
|
||||
"description": "Ralph Wiggum plugin stop hook for self-referential loops",
|
||||
"hooks": {
|
||||
"Stop": [
|
||||
{
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Ralph Loop Stop Hook
|
||||
# Ralph Wiggum Stop Hook
|
||||
# Prevents session exit when a ralph-loop is active
|
||||
# Feeds Claude's output back as input to continue the loop
|
||||
|
||||
@@ -29,7 +29,7 @@ OPTIONS:
|
||||
-h, --help Show this help message
|
||||
|
||||
DESCRIPTION:
|
||||
Starts a Ralph Loop in your CURRENT session. The stop hook prevents
|
||||
Starts a Ralph Wiggum loop in your CURRENT session. The stop hook prevents
|
||||
exit and feeds your output back as input until completion or iteration limit.
|
||||
|
||||
To signal completion, you must output: <promise>YOUR_PHRASE</promise>
|
||||
@@ -174,30 +174,3 @@ if [[ -n "$PROMPT" ]]; then
|
||||
echo ""
|
||||
echo "$PROMPT"
|
||||
fi
|
||||
|
||||
# Display completion promise requirements if set
|
||||
if [[ "$COMPLETION_PROMISE" != "null" ]]; then
|
||||
echo ""
|
||||
echo "═══════════════════════════════════════════════════════════"
|
||||
echo "CRITICAL - Ralph Loop Completion Promise"
|
||||
echo "═══════════════════════════════════════════════════════════"
|
||||
echo ""
|
||||
echo "To complete this loop, output this EXACT text:"
|
||||
echo " <promise>$COMPLETION_PROMISE</promise>"
|
||||
echo ""
|
||||
echo "STRICT REQUIREMENTS (DO NOT VIOLATE):"
|
||||
echo " ✓ Use <promise> XML tags EXACTLY as shown above"
|
||||
echo " ✓ The statement MUST be completely and unequivocally TRUE"
|
||||
echo " ✓ Do NOT output false statements to exit the loop"
|
||||
echo " ✓ Do NOT lie even if you think you should exit"
|
||||
echo ""
|
||||
echo "IMPORTANT - Do not circumvent the loop:"
|
||||
echo " Even if you believe you're stuck, the task is impossible,"
|
||||
echo " or you've been running too long - you MUST NOT output a"
|
||||
echo " false promise statement. The loop is designed to continue"
|
||||
echo " until the promise is GENUINELY TRUE. Trust the process."
|
||||
echo ""
|
||||
echo " If the loop should stop, the promise statement will become"
|
||||
echo " true naturally. Do not force it by lying."
|
||||
echo "═══════════════════════════════════════════════════════════"
|
||||
fi
|
||||
@@ -1,34 +0,0 @@
|
||||
# rust-analyzer-lsp
|
||||
|
||||
Rust language server for Claude Code, providing code intelligence and analysis.
|
||||
|
||||
## Supported Extensions
|
||||
`.rs`
|
||||
|
||||
## Installation
|
||||
|
||||
### Via rustup (recommended)
|
||||
```bash
|
||||
rustup component add rust-analyzer
|
||||
```
|
||||
|
||||
### Via Homebrew (macOS)
|
||||
```bash
|
||||
brew install rust-analyzer
|
||||
```
|
||||
|
||||
### Via package manager (Linux)
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
sudo apt install rust-analyzer
|
||||
|
||||
# Arch Linux
|
||||
sudo pacman -S rust-analyzer
|
||||
```
|
||||
|
||||
### Manual download
|
||||
Download pre-built binaries from the [releases page](https://github.com/rust-lang/rust-analyzer/releases).
|
||||
|
||||
## More Information
|
||||
- [rust-analyzer Website](https://rust-analyzer.github.io/)
|
||||
- [GitHub Repository](https://github.com/rust-lang/rust-analyzer)
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"name": "skill-creator",
|
||||
"description": "Create new skills, improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, or benchmark skill performance with variance analysis.",
|
||||
"author": {
|
||||
"name": "Anthropic",
|
||||
"email": "support@anthropic.com"
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
# skill-creator
|
||||
|
||||
Create new skills, improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, or benchmark skill performance with variance analysis.
|
||||
@@ -1,202 +0,0 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@@ -1,762 +0,0 @@
|
||||
---
|
||||
name: skill-creator
|
||||
description: Create new skills, improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, or benchmark skill performance with variance analysis.
|
||||
---
|
||||
|
||||
# Skill Creator
|
||||
|
||||
A skill for creating new skills and iteratively improving them.
|
||||
|
||||
At a high level, the process of creating a skill goes like this:
|
||||
|
||||
- Decide what you want the skill to do and roughly how it should do it
|
||||
- Write a draft of the skill
|
||||
- Create a few test prompts and run claude-with-access-to-the-skill on them
|
||||
- Evaluate the results
|
||||
- which can be through automated evals, but also it's totally fine and good for them to be evaluated by the human by hand and that's often the only way
|
||||
- Rewrite the skill based on feedback from the evaluation
|
||||
- Repeat until you're satisfied
|
||||
- Expand the test set and try again at larger scale
|
||||
|
||||
Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. So for instance, maybe they're like "I want to make a skill for X". You can help narrow down what they mean, write a draft, write the test cases, figure out how they want to evaluate, run all the prompts, and repeat.
|
||||
|
||||
On the other hand, maybe they already have a draft of the skill. In this case you can go straight to the eval/iterate part of the loop.
|
||||
|
||||
Of course, you should always be flexible and if the user is like "I don't need to run a bunch of evaluations, just vibe with me", you can do that instead.
|
||||
|
||||
Cool? Cool.
|
||||
|
||||
## Building Blocks
|
||||
|
||||
The skill-creator operates on composable building blocks. Each has well-defined inputs and outputs.
|
||||
|
||||
| Building Block | Input | Output | Agent |
|
||||
|-----------|-------|--------|-------|
|
||||
| **Eval Run** | skill + eval prompt + files | transcript, outputs, metrics | `agents/executor.md` |
|
||||
| **Grade Expectations** | outputs + expectations | pass/fail per expectation | `agents/grader.md` |
|
||||
| **Blind Compare** | output A, output B, eval prompt | winner + reasoning | `agents/comparator.md` |
|
||||
| **Post-hoc Analysis** | winner + skills + transcripts | improvement suggestions | `agents/analyzer.md` |
|
||||
|
||||
### Eval Run
|
||||
|
||||
Executes a skill on an eval prompt and produces measurable outputs.
|
||||
|
||||
- **Input**: Skill path, eval prompt, input files
|
||||
- **Output**: `transcript.md`, `outputs/`, `metrics.json`
|
||||
- **Metrics captured**: Tool calls, execution steps, output size, errors
|
||||
|
||||
### Grade Expectations
|
||||
|
||||
Evaluates whether outputs meet defined expectations.
|
||||
|
||||
- **Input**: Expectations list, transcript, outputs directory
|
||||
- **Output**: `grading.json` with pass/fail per expectation plus evidence
|
||||
- **Purpose**: Objective measurement of skill performance
|
||||
|
||||
### Blind Compare
|
||||
|
||||
Compares two outputs without knowing which skill produced them.
|
||||
|
||||
- **Input**: Output A path, Output B path, eval prompt, expectations (optional)
|
||||
- **Output**: Winner (A/B/TIE), reasoning, quality scores
|
||||
- **Purpose**: Unbiased comparison between skill versions
|
||||
|
||||
### Post-hoc Analysis
|
||||
|
||||
After blind comparison, analyzes WHY the winner won.
|
||||
|
||||
- **Input**: Winner identity, both skills, both transcripts, comparison result
|
||||
- **Output**: Winner strengths, loser weaknesses, improvement suggestions
|
||||
- **Purpose**: Generate actionable improvements for next iteration
|
||||
|
||||
---
|
||||
|
||||
## Environment Capabilities
|
||||
|
||||
Check whether you can spawn subagents — independent agents that execute tasks
|
||||
in parallel. If you can, you'll delegate work to executor, grader, comparator,
|
||||
and analyzer agents. If not, you'll do all work inline, sequentially.
|
||||
|
||||
This affects which modes are available and how they execute. The core
|
||||
workflows are the same — only the execution strategy changes.
|
||||
|
||||
---
|
||||
|
||||
## Mode Workflows
|
||||
|
||||
Building blocks combine into higher-level workflows for each mode:
|
||||
|
||||
| Mode | Purpose | Workflow |
|
||||
|------|---------|----------|
|
||||
| **Eval** | Test skill performance | Executor → Grader → Results |
|
||||
| **Improve** | Iteratively optimize skill | Executor → Grader → Comparator → Analyzer → Apply |
|
||||
| **Create** | Interactive skill development | Interview → Research → Draft → Run → Refine |
|
||||
| **Benchmark** | Standardized performance measurement (requires subagents) | 3x runs per configuration → Aggregate → Analyze |
|
||||
|
||||
See `references/mode-diagrams.md` for detailed visual workflow diagrams.
|
||||
|
||||
---
|
||||
|
||||
## Task Tracking
|
||||
|
||||
Use tasks to track progress on multi-step workflows.
|
||||
|
||||
### Task Lifecycle
|
||||
|
||||
Each eval run becomes a task with stage progression:
|
||||
|
||||
```
|
||||
pending → planning → implementing → reviewing → verifying → completed
|
||||
(prep) (executor) (grader) (validate)
|
||||
```
|
||||
|
||||
### Creating Tasks
|
||||
|
||||
When running evals, create a task per eval run:
|
||||
|
||||
```python
|
||||
TaskCreate(
|
||||
subject="Eval 0, run 1 (with_skill)",
|
||||
description="Execute skill eval 0 with skill and grade expectations",
|
||||
activeForm="Preparing eval 0"
|
||||
)
|
||||
```
|
||||
|
||||
### Updating Stages
|
||||
|
||||
Progress through stages as work completes:
|
||||
|
||||
```python
|
||||
TaskUpdate(task, status="planning") # Prepare files, stage inputs
|
||||
TaskUpdate(task, status="implementing") # Spawn executor subagent
|
||||
TaskUpdate(task, status="reviewing") # Spawn grader subagent
|
||||
TaskUpdate(task, status="verifying") # Validate outputs exist
|
||||
TaskUpdate(task, status="completed") # Done
|
||||
```
|
||||
|
||||
### Comparison Tasks
|
||||
|
||||
For blind comparisons (after all runs complete):
|
||||
|
||||
```python
|
||||
TaskCreate(
|
||||
subject="Compare skill-v1 vs skill-v2"
|
||||
)
|
||||
# planning = gather outputs
|
||||
# implementing = spawn blind comparators
|
||||
# reviewing = tally votes, handle ties
|
||||
# verifying = if tied, run more comparisons or use efficiency
|
||||
# completed = declare winner
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
The **coordinator** (this skill):
|
||||
|
||||
1. Asks the user what they want to do and which skill to work on
|
||||
2. Determines workspace location (ask if not obvious)
|
||||
3. Creates workspace and tasks for tracking progress
|
||||
4. Delegates work to subagents when available, otherwise executes inline
|
||||
5. Tracks the **best version** (not necessarily the latest)
|
||||
6. Reports results with evidence and metrics
|
||||
|
||||
### Agent Types
|
||||
|
||||
| Agent | Role | Reference |
|
||||
|-------|------|-----------|
|
||||
| **Executor** | Run skill on a task, produce transcript + outputs + metrics | `agents/executor.md` |
|
||||
| **Grader** | Evaluate expectations against transcript and outputs | `agents/grader.md` |
|
||||
| **Comparator** | Blind A/B comparison between two outputs | `agents/comparator.md` |
|
||||
| **Analyzer** | Post-hoc analysis of comparison results | `agents/analyzer.md` |
|
||||
|
||||
## Communicating with the user
|
||||
|
||||
The skill creator is liable to be used by people across a wide range of familiarity with coding jargon. If you haven't heard (and how could you, it's only very recently that it started), there's a trend now where the power of Claude is inspiring plumbers to open up their terminals, parents and grandparents to google "how to install npm". On the other hand, the bulk of users are probably fairly computer-literate.
|
||||
|
||||
So please pay attention to context cues to understand how to phrase your communication! In the default case, just to give you some idea:
|
||||
|
||||
- "evaluation" and "benchmark" are borderline, but OK
|
||||
- for "JSON" and "assertion" you want to see serious cues from the user that they know what those things are before using them without explaining them
|
||||
|
||||
It's OK to briefly explain terms when you're in doubt — a short definition goes a long way if you're unsure the user will get it.
|
||||
|
||||
---
|
||||
|
||||
## Creating a skill
|
||||
|
||||
### Capture Intent
|
||||
|
||||
Start by understanding the user's intent. The current conversation might already contain a workflow the user wants to capture (e.g., they say "turn this into a skill"). If so, extract answers from the conversation history first — the tools used, the sequence of steps, corrections the user made, input/output formats observed. The user may need to fill the gaps, and should confirm before proceeding to the next step.
|
||||
|
||||
1. What should this skill enable Claude to do?
|
||||
2. When should this skill trigger? (what user phrases/contexts)
|
||||
3. What's the expected output format?
|
||||
4. Should we set up test cases to verify the skill works? Skills with objectively verifiable outputs (file transforms, data extraction, code generation, fixed workflow steps) benefit from test cases. Skills with subjective outputs (writing style, art) often don't need them. Suggest the appropriate default based on the skill type, but let the user decide.
|
||||
|
||||
### Interview and Research
|
||||
|
||||
Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies.
|
||||
|
||||
Check available MCPs - if useful for research (searching docs, finding similar skills, looking up best practices), research in parallel via subagents if available, otherwise inline. Come prepared with context to reduce burden on the user.
|
||||
|
||||
### Initialize
|
||||
|
||||
Run the initialization script:
|
||||
|
||||
```bash
|
||||
scripts/init_skill.py <skill-name> --path <output-directory>
|
||||
```
|
||||
|
||||
This creates:
|
||||
- SKILL.md template with frontmatter
|
||||
- scripts/, references/, assets/ directories
|
||||
- Example files to customize or delete
|
||||
|
||||
### Fill SKILL.md Frontmatter
|
||||
|
||||
Based on interview, fill:
|
||||
|
||||
- **name**: Skill identifier
|
||||
- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. All "when to use" info goes here, not in the body. Note: currently Claude has a tendency to "undertrigger" skills -- to not use them when they'd be useful. To combat this, please make the skill descriptions a little bit "pushy". So for instance, instead of "How to build a simple fast dashboard to display internal Anthropic data.", you might write "How to build a simple fast dashboard to display internal Anthropic data. Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'"
|
||||
- **compatibility**: Required tools, dependencies (optional, rarely needed)
|
||||
|
||||
### Skill Writing Guide
|
||||
|
||||
#### Anatomy of a Skill
|
||||
|
||||
```
|
||||
skill-name/
|
||||
├── SKILL.md (required)
|
||||
│ ├── YAML frontmatter (name, description required)
|
||||
│ └── Markdown instructions
|
||||
└── Bundled Resources (optional)
|
||||
├── scripts/ - Executable code for deterministic/repetitive tasks
|
||||
├── references/ - Docs loaded into context as needed
|
||||
└── assets/ - Files used in output (templates, icons, fonts)
|
||||
```
|
||||
|
||||
**What NOT to include**: README.md, INSTALLATION_GUIDE.md, CHANGELOG.md, or any auxiliary documentation. Skills are for AI agents, not human onboarding.
|
||||
|
||||
#### Progressive Disclosure
|
||||
|
||||
Skills use a three-level loading system:
|
||||
1. **Metadata** (name + description) - Always in context (~100 words)
|
||||
2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
|
||||
3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)
|
||||
|
||||
These word counts are approximate and you can feel free to go longer if needed.
|
||||
|
||||
**Key patterns:**
|
||||
- Keep SKILL.md under 500 lines; if you're approaching this limit, add an additional layer of hierarchy along with clear pointers about where the model using the skill should go next to follow up.
|
||||
- Reference files clearly from SKILL.md with guidance on when to read them
|
||||
- For large reference files (>300 lines), include a table of contents
|
||||
|
||||
**Domain organization**: When a skill supports multiple domains/frameworks, organize by variant:
|
||||
```
|
||||
cloud-deploy/
|
||||
├── SKILL.md (workflow + selection)
|
||||
└── references/
|
||||
├── aws.md
|
||||
├── gcp.md
|
||||
└── azure.md
|
||||
```
|
||||
Claude reads only the relevant reference file.
|
||||
|
||||
#### Principle of Lack of Surprise
|
||||
|
||||
This goes without saying, but skills must not contain malware, exploit code, or any content that could compromise system security. A skill's contents should not surprise the user in their intent if described. Don't go along with requests to create misleading skills or skills designed to facilitate unauthorized access, data exfiltration, or other malicious activities. Things like a "roleplay as an XYZ" are OK though.
|
||||
|
||||
#### Writing Patterns
|
||||
|
||||
Prefer using the imperative form in instructions.
|
||||
|
||||
**Defining output formats** - You can do it like this:
|
||||
```markdown
|
||||
## Report structure
|
||||
ALWAYS use this exact template:
|
||||
# [Title]
|
||||
## Executive summary
|
||||
## Key findings
|
||||
## Recommendations
|
||||
```
|
||||
|
||||
**Examples pattern** - It's useful to include examples. You can format them like this (but if "Input" and "Output" are in the examples you might want to deviate a little):
|
||||
```markdown
|
||||
## Commit message format
|
||||
**Example 1:**
|
||||
Input: Added user authentication with JWT tokens
|
||||
Output: feat(auth): implement JWT-based authentication
|
||||
```
|
||||
|
||||
### Immediate Feedback Loop
|
||||
|
||||
**Always have something cooking.** Every time user adds an example or input:
|
||||
|
||||
1. **Immediately start running it** - don't wait for full specification
|
||||
2. **Show outputs in workspace** - tell user: "The output is at X, take a look"
|
||||
3. **First runs in main agent loop** - not subagent, so user sees the transcript
|
||||
4. **Seeing what Claude does** helps user understand and refine requirements
|
||||
|
||||
### Writing Style
|
||||
|
||||
Try to explain to the model why things are important in lieu of heavy-handed musty MUSTs. Use theory of mind and try to make the skill general and not super-narrow to specific examples. Start by writing a draft and then look at it with fresh eyes and improve it.
|
||||
|
||||
### Test Cases
|
||||
|
||||
After writing the skill draft, come up with 2-3 realistic test prompts — the kind of thing a real user would actually say. Share them with the user: [you don't have to use this exact language] "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" Then run them.
|
||||
|
||||
If the user wants evals, create `evals/evals.json` with this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"skill_name": "example-skill",
|
||||
"evals": [
|
||||
{
|
||||
"id": 1,
|
||||
"prompt": "User's task prompt",
|
||||
"expected_output": "Description of expected result",
|
||||
"files": [],
|
||||
"assertions": [
|
||||
"The output includes X",
|
||||
"The skill correctly handles Y"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
You can initialize with `scripts/init_json.py evals evals/evals.json` and validate with `scripts/validate_json.py evals/evals.json`. See `references/schemas.md` for the full schema.
|
||||
|
||||
### Transition to Automated Iteration
|
||||
|
||||
Once gradable criteria are defined (expectations, success metrics), Claude can:
|
||||
|
||||
- More aggressively suggest improvements
|
||||
- Run tests automatically (via subagents in the background if available, otherwise sequentially)
|
||||
- Present results: "I tried X, it improved pass rate by Y%"
|
||||
|
||||
### Package and Present (only if `present_files` tool is available)
|
||||
|
||||
Check whether you have access to the `present_files` tool. If you don't, skip this step. If you do, package the skill and present the .skill file to the user:
|
||||
|
||||
```bash
|
||||
scripts/package_skill.py <path/to/skill-folder>
|
||||
```
|
||||
|
||||
After packaging, direct the user to the resulting `.skill` file path so they can install it.
|
||||
|
||||
---
|
||||
|
||||
## Improving a skill
|
||||
|
||||
When user asks to improve a skill, ask:
|
||||
1. **Which skill?** - Identify the skill to improve
|
||||
2. **How much time?** - How long can Claude spend iterating?
|
||||
3. **What's the goal?** - Target quality level, specific issues to fix, or general improvement
|
||||
|
||||
Claude should then autonomously iterate using the building blocks (run, grade, compare, analyze) to drive the skill toward the goal within the time budget.
|
||||
|
||||
Some advice on writing style when improving a skill:
|
||||
|
||||
1. Try to generalize from the feedback, rather than fixing specific examples one by one. The big picture thing that's happening here is that we're trying to create "skills" that can be used a million times (maybe literally, maybe even more who knows) across many different prompts. Here you and the user are iterating on only a few examples over and over again because it helps move faster. The user knows these examples in and out and it's quick for them to assess new outputs. But if the skill you and the user are codeveloping works only for those examples, it's useless. Rather than put in fiddly, overfitted changes, or oppressively constrictive MUSTs, if there's some stubborn issue, you might try branching out and using different metaphors, or recommending different patterns of working. It's relatively cheap to try and maybe you'll land on something great.
|
||||
|
||||
2. Keep the prompt lean; remove things that aren't pulling their weight. Make sure to read the transcripts, not just the final outputs -- if it looks like the skill is making the model waste a bunch of time doing things that are unproductive, you can try getting rid of the parts of the skill that are making it do that and seeing what happens.
|
||||
|
||||
3. Last but not least, try hard to explain the **why** behind everything you're asking the model to do. Today's LLMs are *smart*. They have good theory of mind and, when given a good harness, go beyond rote instructions and really make things happen. Even if the feedback from the user is terse or frustrated, try to actually understand the task and why the user is writing what they wrote, and what they actually wrote, and then try to transmit this understanding into the instructions. If you find yourself writing ALWAYS or NEVER in all caps, or using super rigid structures, that's a yellow flag - try to reframe and explain the reasoning so that the model understands why the thing you're asking for is important. That's a more humane, powerful, and effective approach.
|
||||
|
||||
This task is pretty important (we are trying to create billions a year in economic value here!) and your thinking time is not the blocker; take your time and really mull things over. I'd suggest writing a draft skill and then looking at it anew and making improvements. Really try to get into the head of the user and understand what they want and need. Best of luck.
|
||||
|
||||
### Setup Phase
|
||||
|
||||
0. **Read output schemas**:
|
||||
|
||||
```bash
|
||||
Read references/schemas.md # JSON structures for grading, history, comparison, analysis
|
||||
```
|
||||
|
||||
This ensures you understand the structure of outputs you'll produce and validate.
|
||||
|
||||
1. **Choose workspace location**:
|
||||
|
||||
**Ask the user** where to put the workspace. Suggest `<skill-name>-workspace/` as a sibling to the skill directory, but let the user choose. If the workspace ends up inside a git repo, suggest adding it to `.gitignore`.
|
||||
|
||||
2. **Copy skill to v0**:
|
||||
```bash
|
||||
scripts/copy_skill.py <skill-path> <skill-name>-workspace/v0 --iteration 0
|
||||
```
|
||||
|
||||
3. **Verify or create evals**:
|
||||
- Check for existing `evals/evals.json`
|
||||
- If missing, ask user for 2-3 example tasks and create evals
|
||||
- Use `scripts/init_json.py evals` to create with correct structure
|
||||
|
||||
4. **Create tasks** for baseline:
|
||||
|
||||
```python
|
||||
for run in range(3):
|
||||
TaskCreate(
|
||||
subject=f"Eval baseline, run {run+1}"
|
||||
)
|
||||
```
|
||||
|
||||
5. **Initialize history.json**:
|
||||
|
||||
```bash
|
||||
scripts/init_json.py history <workspace>/history.json
|
||||
```
|
||||
|
||||
Then edit to fill in skill_name. See `references/schemas.md` for full structure.
|
||||
|
||||
### Iteration Loop
|
||||
|
||||
For each iteration (0, 1, 2, ...):
|
||||
|
||||
#### Step 1: Execute (3 Parallel Runs)
|
||||
|
||||
Spawn 3 executor subagents in parallel (or run sequentially without subagents — see "Without subagents" below). Update task to `implementing` stage.
|
||||
|
||||
Spawn a subagent for each run with these instructions:
|
||||
|
||||
```
|
||||
Read agents/executor.md at: <skill-creator-path>/agents/executor.md
|
||||
|
||||
Execute this task:
|
||||
- Skill path: workspace/v<N>/skill/
|
||||
- Task: <eval prompt from evals.json>
|
||||
- Test files: <eval files if any>
|
||||
- Save transcript to: workspace/v<N>/runs/run-<R>/transcript.md
|
||||
- Save outputs to: workspace/v<N>/runs/run-<R>/outputs/
|
||||
```
|
||||
|
||||
#### Step 2: Grade Assertions
|
||||
|
||||
Spawn grader subagents (or grade inline — see "Without subagents" below). Update task to `reviewing` stage.
|
||||
|
||||
**Purpose**: Grading produces structured pass/fail results for tracking pass rates over iterations. The grader also extracts claims and reads user_notes to surface issues that expectations might miss.
|
||||
|
||||
**Set the grader up for success**: The grader needs to actually inspect the outputs, not just read the transcript. If the outputs aren't plain text, tell the grader how to read them — check the skill for inspection tools it already uses and pass those as hints in the grader prompt.
|
||||
|
||||
Spawn a subagent with these instructions:
|
||||
|
||||
```
|
||||
Read agents/grader.md at: <skill-creator-path>/agents/grader.md
|
||||
|
||||
Grade these expectations:
|
||||
- Assertions: <list from evals.json>
|
||||
- Transcript: workspace/v<N>/runs/run-<R>/transcript.md
|
||||
- Outputs: workspace/v<N>/runs/run-<R>/outputs/
|
||||
- Save grading to: workspace/v<N>/runs/run-<R>/grading.json
|
||||
|
||||
To inspect output files:
|
||||
<include inspection hints from the skill, e.g.:>
|
||||
<"Use python -m markitdown <file> to extract text content">
|
||||
```
|
||||
|
||||
**Review grading.json**: Check `user_notes_summary` for uncertainties and workarounds flagged by the executor. Also check `eval_feedback` — if the grader flagged lax assertions or missing coverage, update `evals.json` before continuing. Improving evals mid-loop is fine and often necessary; you can't meaningfully improve a skill if the evals don't measure anything real.
|
||||
|
||||
**Eval quality loop**: If `eval_feedback` has suggestions, tighten the assertions and rerun the evals. Keep iterating as long as the grader keeps finding issues. Once `eval_feedback` says the evals look solid (or has no suggestions), move on to skill improvement. Consult the user about what you're doing, but don't block on approval for each round — just keep making progress.
|
||||
|
||||
When picking which eval to use for the quality loop, prefer one where the skill partially succeeds — some expectations pass, some fail. An eval where everything fails gives the grader nothing to critique (there are no false positives to catch). The feedback is most useful when some expectations pass and the grader can assess whether those passes reflect genuine quality or surface-level compliance.
|
||||
|
||||
#### Step 3: Blind Compare (If N > 0)
|
||||
|
||||
For iterations after baseline, use blind comparison:
|
||||
|
||||
**Purpose**: While grading tracks expectation pass rates, the comparator judges **holistic output quality** using a rubric. Two outputs might both pass all expectations, but one could still be clearly better. The comparator uses expectations as secondary evidence, not the primary decision factor.
|
||||
|
||||
**Blind A/B Protocol:**
|
||||
1. Randomly assign: 50% chance v<N> is A, 50% chance v<N> is B
|
||||
2. Record the assignment in `workspace/grading/v<N>-vs-best/assignment.json`
|
||||
3. Comparator sees only "Output A" and "Output B" - never version names
|
||||
|
||||
Spawn a subagent with these instructions:
|
||||
|
||||
```
|
||||
Read agents/comparator.md at: <skill-creator-path>/agents/comparator.md
|
||||
|
||||
Blind comparison:
|
||||
- Eval prompt: <the task that was executed>
|
||||
- Output A: <path to one version's output>
|
||||
- Output B: <path to other version's output>
|
||||
- Assertions: <list from evals.json>
|
||||
|
||||
You do NOT know which is old vs new. Judge purely on quality.
|
||||
```
|
||||
|
||||
**Determine winner by majority vote:**
|
||||
- If 2+ comparators prefer A: A wins
|
||||
- If 2+ comparators prefer B: B wins
|
||||
- Otherwise: TIE
|
||||
|
||||
#### Step 4: Post-hoc Analysis
|
||||
|
||||
After blind comparison, analyze results. Spawn a subagent with these instructions:
|
||||
|
||||
```
|
||||
Read agents/analyzer.md at: <skill-creator-path>/agents/analyzer.md
|
||||
|
||||
Analyze:
|
||||
- Winner: <A or B>
|
||||
- Winner skill: workspace/<winner-version>/skill/
|
||||
- Winner transcript: workspace/<winner-version>/runs/run-1/transcript.md
|
||||
- Loser skill: workspace/<loser-version>/skill/
|
||||
- Loser transcript: workspace/<loser-version>/runs/run-1/transcript.md
|
||||
- Comparison result: <from comparator>
|
||||
```
|
||||
|
||||
#### Step 5: Update State
|
||||
|
||||
Update task to `completed` stage. Record results:
|
||||
|
||||
```python
|
||||
if new_version wins majority:
|
||||
current_best = new_version
|
||||
# Update history.json
|
||||
|
||||
history.iterations.append({
|
||||
"version": "v<N>",
|
||||
"parent": "<previous best>",
|
||||
"expectation_pass_rate": 0.85,
|
||||
"grading_result": "won" | "lost" | "tie",
|
||||
"is_current_best": bool
|
||||
})
|
||||
```
|
||||
|
||||
#### Step 6: Create New Version (If Continuing)
|
||||
|
||||
1. Copy current best to new version:
|
||||
```bash
|
||||
scripts/copy_skill.py workspace/<current_best>/skill workspace/v<N+1> \
|
||||
--parent <current_best> \
|
||||
--iteration <N+1>
|
||||
```
|
||||
|
||||
2. Apply improvements from analyzer suggestions
|
||||
|
||||
3. Create new tasks for next iteration
|
||||
|
||||
4. Continue loop or stop if:
|
||||
- **Time budget exhausted**: Track elapsed time, stop when approaching limit
|
||||
- **Goal achieved**: Target quality level or pass rate reached
|
||||
- **Diminishing returns**: No significant improvement in last 2 iterations
|
||||
- **User requests stop**: Check for user input between iterations
|
||||
|
||||
### Final Report
|
||||
|
||||
When iterations complete:
|
||||
|
||||
1. **Best Version**: Which version performed best (not necessarily the last)
|
||||
2. **Score Progression**: Assertion pass rates across iterations
|
||||
3. **Key Improvements**: What changes had the most impact
|
||||
4. **Recommendation**: Whether to adopt the improved skill
|
||||
|
||||
Copy best skill back to main location:
|
||||
```bash
|
||||
cp -r workspace/<best_version>/skill/* ./
|
||||
```
|
||||
|
||||
Check whether you have access to the `present_files` tool. If you do, package and present the improved skill, and direct the user to the resulting `.skill` file path so they can install it:
|
||||
```bash
|
||||
scripts/package_skill.py <path/to/skill-folder>
|
||||
```
|
||||
(If you don't have the `present_files` tool, don't run `package_skill.py`)
|
||||
|
||||
### Without Subagents
|
||||
|
||||
Without subagents, Improve mode still works but with reduced rigor:
|
||||
|
||||
- **Single run per iteration** (not 3) — variance analysis isn't possible with one run
|
||||
- **Inline execution**: Read `agents/executor.md` and follow the procedure directly in your main loop. Then read `agents/grader.md` and follow it directly to grade the results.
|
||||
- **No blind comparison**: You can't meaningfully blind yourself since you have full context. Instead, compare outputs by re-reading both versions' results and analyzing the differences directly.
|
||||
- **No separate analyzer**: Do the analysis inline after comparing — identify what improved, what regressed, and what to try next.
|
||||
- **Keep everything else**: Version tracking, copy-iterate-grade loop, history.json, stopping criteria all work the same.
|
||||
- **Acknowledge reduced rigor**: Without independent agents, grading is less rigorous — the same context that executed the task also grades it. Results are directional, not definitive.
|
||||
|
||||
---
|
||||
|
||||
## Eval Mode
|
||||
|
||||
Run individual evals to test skill performance and grade expectations.
|
||||
|
||||
**IMPORTANT**: Before running evals, read the full documentation:
|
||||
```
|
||||
Read references/eval-mode.md # Complete Eval workflow
|
||||
Read references/schemas.md # JSON output structures
|
||||
```
|
||||
|
||||
Use Eval mode when:
|
||||
- Testing a specific eval case
|
||||
- Comparing with/without skill on a single task
|
||||
- Quick validation during development
|
||||
|
||||
The workflow: Setup → Check Dependencies → Prepare → Execute → Grade → Display Results
|
||||
|
||||
Without subagents, execute and grade sequentially in the main loop. Read the agent reference files (`agents/executor.md`, `agents/grader.md`) and follow the procedures directly.
|
||||
|
||||
---
|
||||
|
||||
## Benchmark Mode
|
||||
|
||||
Run standardized performance measurement with variance analysis.
|
||||
|
||||
**Requires subagents.** Benchmark mode relies on parallel execution of many runs to produce statistically meaningful results. Without subagents, use Eval mode for individual eval testing instead.
|
||||
|
||||
**IMPORTANT**: Before running benchmarks, read the full documentation:
|
||||
```
|
||||
Read references/benchmark-mode.md # Complete Benchmark workflow
|
||||
Read references/schemas.md # JSON output structures
|
||||
```
|
||||
|
||||
Use Benchmark mode when:
|
||||
- "How does my skill perform?" - Understanding overall performance
|
||||
- "Compare Sonnet vs Haiku" - Cross-model comparison
|
||||
- "Has performance regressed?" - Tracking changes over time
|
||||
- "Does the skill add value?" - Validating skill impact
|
||||
|
||||
Key differences from Eval:
|
||||
- Runs **all evals** (not just one)
|
||||
- Runs each **3 times per configuration** for variance
|
||||
- Always includes **no-skill baseline**
|
||||
- Uses **most capable model** for analysis
|
||||
|
||||
---
|
||||
|
||||
## Workspace Structure
|
||||
|
||||
Workspaces are created as sibling directories to the skill being worked on.
|
||||
|
||||
```
|
||||
parent-directory/
|
||||
├── skill-name/ # The skill
|
||||
│ ├── SKILL.md
|
||||
│ ├── evals/
|
||||
│ │ ├── evals.json
|
||||
│ │ └── files/
|
||||
│ └── scripts/
|
||||
│
|
||||
└── skill-name-workspace/ # Workspace (sibling directory)
|
||||
│
|
||||
│── [Eval mode]
|
||||
├── eval-0/
|
||||
│ ├── with_skill/
|
||||
│ │ ├── inputs/ # Staged input files
|
||||
│ │ ├── outputs/ # Skill outputs
|
||||
│ │ │ ├── transcript.md
|
||||
│ │ │ ├── user_notes.md # Executor uncertainties
|
||||
│ │ │ ├── metrics.json
|
||||
│ │ │ └── [output files]
|
||||
│ │ ├── grading.json # Assertions + claims + user_notes_summary
|
||||
│ │ └── timing.json # Wall clock timing
|
||||
│ └── without_skill/
|
||||
│ └── ...
|
||||
├── comparison.json # Blind comparison (A/B testing)
|
||||
├── summary.json # Aggregate metrics
|
||||
│
|
||||
│── [Improve mode]
|
||||
├── history.json # Score progression across versions
|
||||
├── v0/
|
||||
│ ├── META.yaml # Version metadata
|
||||
│ ├── skill/ # Copy of skill at this version
|
||||
│ └── runs/
|
||||
│ ├── run-1/
|
||||
│ │ ├── transcript.md
|
||||
│ │ ├── user_notes.md
|
||||
│ │ ├── outputs/
|
||||
│ │ └── grading.json
|
||||
│ ├── run-2/
|
||||
│ └── run-3/
|
||||
├── v1/
|
||||
│ ├── META.yaml
|
||||
│ ├── skill/
|
||||
│ ├── improvements/
|
||||
│ │ └── suggestions.md # From analyzer
|
||||
│ └── runs/
|
||||
└── grading/
|
||||
└── v1-vs-v0/
|
||||
├── assignment.json # Which version is A vs B
|
||||
├── comparison-1.json # Blind comparison results
|
||||
├── comparison-2.json
|
||||
├── comparison-3.json
|
||||
└── analysis.json # Post-hoc analysis
|
||||
│
|
||||
│── [Benchmark mode]
|
||||
└── benchmarks/
|
||||
└── 2026-01-15T10-30-00/ # Timestamp-named directory
|
||||
├── benchmark.json # Structured results (see schema)
|
||||
├── benchmark.md # Human-readable summary
|
||||
└── runs/
|
||||
├── eval-1/
|
||||
│ ├── with_skill/
|
||||
│ │ ├── run-1/
|
||||
│ │ │ ├── transcript.md
|
||||
│ │ │ ├── user_notes.md
|
||||
│ │ │ ├── outputs/
|
||||
│ │ │ └── grading.json
|
||||
│ │ ├── run-2/
|
||||
│ │ └── run-3/
|
||||
│ └── without_skill/
|
||||
│ ├── run-1/
|
||||
│ ├── run-2/
|
||||
│ └── run-3/
|
||||
└── eval-2/
|
||||
└── ...
|
||||
```
|
||||
|
||||
**Key files:**
|
||||
- `transcript.md` - Execution log from executor
|
||||
- `user_notes.md` - Uncertainties and workarounds flagged by executor
|
||||
- `metrics.json` - Tool calls, output size, step count
|
||||
- `grading.json` - Assertion pass/fail, notes, user_notes summary
|
||||
- `timing.json` - Wall clock duration
|
||||
- `comparison-N.json` - Blind rubric-based comparison
|
||||
- `analysis.json` - Post-hoc analysis with improvement suggestions
|
||||
- `history.json` - Version progression with pass rates and winners
|
||||
- `benchmark.json` - Structured benchmark results with runs, run_summary, notes
|
||||
- `benchmark.md` - Human-readable benchmark summary
|
||||
|
||||
---
|
||||
|
||||
## Coordinator Responsibilities
|
||||
|
||||
The coordinator must:
|
||||
|
||||
1. **Delegate to subagents when available; otherwise execute inline** - In Improve, Eval, and Benchmark modes, use subagents for executor/grader work when possible. Without subagents, read the agent reference files and follow the procedures directly.
|
||||
2. **Create mode exception** - Run examples in main loop so user sees the transcript (interactive feedback matters more than consistency)
|
||||
3. **Use independent grading when possible** - Spawn separate grader/comparator agents for unbiased evaluation. Without subagents, grade inline but acknowledge the limitation.
|
||||
4. **Track progress with tasks** - Create tasks, update stages, mark complete
|
||||
5. **Track best version** - The best performer, not the latest iteration
|
||||
6. **Run multiple times for variance** - 3 runs per configuration when subagents are available; 1 run otherwise
|
||||
7. **Parallelize independent work** - When subagents are available, spawn independent work in parallel
|
||||
8. **Report results clearly** - Display pass/fail with evidence and metrics
|
||||
9. **Review user_notes** - Check executor's user_notes.md for issues that passed expectations might miss
|
||||
10. **Capture execution metrics** - In Benchmark mode, record tokens/time/tool_calls from each execution
|
||||
11. **Use most capable model for analysis** - Benchmark analyzer should use the smartest available model
|
||||
|
||||
---
|
||||
|
||||
## Delegating Work
|
||||
|
||||
There are two patterns for delegating work to building blocks:
|
||||
|
||||
**With subagents**: Spawn an independent agent with the reference file instructions. Include the reference file path in the prompt so the subagent knows its role. When tasks are independent (like 3 runs of the same version), spawn all subagents in the same turn for parallelism.
|
||||
|
||||
**Without subagents**: Read the agent reference file (e.g., `agents/executor.md`) and follow the procedure directly in your main loop. Execute each step sequentially — the procedures are designed to work both as subagent instructions and as inline procedures.
|
||||
|
||||
---
|
||||
|
||||
# Conclusion
|
||||
|
||||
Just pasting in the overall workflow again for reference:
|
||||
|
||||
- Decide what you want the skill to do and roughly how it should do it
|
||||
- Write a draft of the skill
|
||||
- Create a few test prompts and run claude-with-access-to-the-skill on them
|
||||
- Evaluate the results
|
||||
- which can be through automated evals, but also it's totally fine and good for them to be evaluated by the human by hand and that's often the only way
|
||||
- Rewrite the skill based on feedback from the evaluation
|
||||
- Repeat until you're satisfied
|
||||
- Expand the test set and try again at larger scale
|
||||
|
||||
Good luck!
|
||||
@@ -1,274 +0,0 @@
|
||||
# Post-hoc Analyzer Agent
|
||||
|
||||
Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.
|
||||
|
||||
## Role
|
||||
|
||||
After the blind comparator determines a winner, the Post-hoc Analyzer "unblinds" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved?
|
||||
|
||||
## Inputs
|
||||
|
||||
You receive these parameters in your prompt:
|
||||
|
||||
- **winner**: "A" or "B" (from blind comparison)
|
||||
- **winner_skill_path**: Path to the skill that produced the winning output
|
||||
- **winner_transcript_path**: Path to the execution transcript for the winner
|
||||
- **loser_skill_path**: Path to the skill that produced the losing output
|
||||
- **loser_transcript_path**: Path to the execution transcript for the loser
|
||||
- **comparison_result_path**: Path to the blind comparator's output JSON
|
||||
- **output_path**: Where to save the analysis results
|
||||
|
||||
## Process
|
||||
|
||||
### Step 1: Read Comparison Result
|
||||
|
||||
1. Read the blind comparator's output at comparison_result_path
|
||||
2. Note the winning side (A or B), the reasoning, and any scores
|
||||
3. Understand what the comparator valued in the winning output
|
||||
|
||||
### Step 2: Read Both Skills
|
||||
|
||||
1. Read the winner skill's SKILL.md and key referenced files
|
||||
2. Read the loser skill's SKILL.md and key referenced files
|
||||
3. Identify structural differences:
|
||||
- Instructions clarity and specificity
|
||||
- Script/tool usage patterns
|
||||
- Example coverage
|
||||
- Edge case handling
|
||||
|
||||
### Step 3: Read Both Transcripts
|
||||
|
||||
1. Read the winner's transcript
|
||||
2. Read the loser's transcript
|
||||
3. Compare execution patterns:
|
||||
- How closely did each follow their skill's instructions?
|
||||
- What tools were used differently?
|
||||
- Where did the loser diverge from optimal behavior?
|
||||
- Did either encounter errors or make recovery attempts?
|
||||
|
||||
### Step 4: Analyze Instruction Following
|
||||
|
||||
For each transcript, evaluate:
|
||||
- Did the agent follow the skill's explicit instructions?
|
||||
- Did the agent use the skill's provided tools/scripts?
|
||||
- Were there missed opportunities to leverage skill content?
|
||||
- Did the agent add unnecessary steps not in the skill?
|
||||
|
||||
Score instruction following 1-10 and note specific issues.
|
||||
|
||||
### Step 5: Identify Winner Strengths
|
||||
|
||||
Determine what made the winner better:
|
||||
- Clearer instructions that led to better behavior?
|
||||
- Better scripts/tools that produced better output?
|
||||
- More comprehensive examples that guided edge cases?
|
||||
- Better error handling guidance?
|
||||
|
||||
Be specific. Quote from skills/transcripts where relevant.
|
||||
|
||||
### Step 6: Identify Loser Weaknesses
|
||||
|
||||
Determine what held the loser back:
|
||||
- Ambiguous instructions that led to suboptimal choices?
|
||||
- Missing tools/scripts that forced workarounds?
|
||||
- Gaps in edge case coverage?
|
||||
- Poor error handling that caused failures?
|
||||
|
||||
### Step 7: Generate Improvement Suggestions
|
||||
|
||||
Based on the analysis, produce actionable suggestions for improving the loser skill:
|
||||
- Specific instruction changes to make
|
||||
- Tools/scripts to add or modify
|
||||
- Examples to include
|
||||
- Edge cases to address
|
||||
|
||||
Prioritize by impact. Focus on changes that would have changed the outcome.
|
||||
|
||||
### Step 8: Write Analysis Results
|
||||
|
||||
Save structured analysis to `{output_path}`.
|
||||
|
||||
## Output Format
|
||||
|
||||
Write a JSON file with this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"comparison_summary": {
|
||||
"winner": "A",
|
||||
"winner_skill": "path/to/winner/skill",
|
||||
"loser_skill": "path/to/loser/skill",
|
||||
"comparator_reasoning": "Brief summary of why comparator chose winner"
|
||||
},
|
||||
"winner_strengths": [
|
||||
"Clear step-by-step instructions for handling multi-page documents",
|
||||
"Included validation script that caught formatting errors",
|
||||
"Explicit guidance on fallback behavior when OCR fails"
|
||||
],
|
||||
"loser_weaknesses": [
|
||||
"Vague instruction 'process the document appropriately' led to inconsistent behavior",
|
||||
"No script for validation, agent had to improvise and made errors",
|
||||
"No guidance on OCR failure, agent gave up instead of trying alternatives"
|
||||
],
|
||||
"instruction_following": {
|
||||
"winner": {
|
||||
"score": 9,
|
||||
"issues": [
|
||||
"Minor: skipped optional logging step"
|
||||
]
|
||||
},
|
||||
"loser": {
|
||||
"score": 6,
|
||||
"issues": [
|
||||
"Did not use the skill's formatting template",
|
||||
"Invented own approach instead of following step 3",
|
||||
"Missed the 'always validate output' instruction"
|
||||
]
|
||||
}
|
||||
},
|
||||
"improvement_suggestions": [
|
||||
{
|
||||
"priority": "high",
|
||||
"category": "instructions",
|
||||
"suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template",
|
||||
"expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
|
||||
},
|
||||
{
|
||||
"priority": "high",
|
||||
"category": "tools",
|
||||
"suggestion": "Add validate_output.py script similar to winner skill's validation approach",
|
||||
"expected_impact": "Would catch formatting errors before final output"
|
||||
},
|
||||
{
|
||||
"priority": "medium",
|
||||
"category": "error_handling",
|
||||
"suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'",
|
||||
"expected_impact": "Would prevent early failure on difficult documents"
|
||||
}
|
||||
],
|
||||
"transcript_insights": {
|
||||
"winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output",
|
||||
"loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear"
|
||||
- **Be actionable**: Suggestions should be concrete changes, not vague advice
|
||||
- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent
|
||||
- **Prioritize by impact**: Which changes would most likely have changed the outcome?
|
||||
- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental?
|
||||
- **Stay objective**: Analyze what happened, don't editorialize
|
||||
- **Think about generalization**: Would this improvement help on other evals too?
|
||||
|
||||
## Categories for Suggestions
|
||||
|
||||
Use these categories to organize improvement suggestions:
|
||||
|
||||
| Category | Description |
|
||||
|----------|-------------|
|
||||
| `instructions` | Changes to the skill's prose instructions |
|
||||
| `tools` | Scripts, templates, or utilities to add/modify |
|
||||
| `examples` | Example inputs/outputs to include |
|
||||
| `error_handling` | Guidance for handling failures |
|
||||
| `structure` | Reorganization of skill content |
|
||||
| `references` | External docs or resources to add |
|
||||
|
||||
## Priority Levels
|
||||
|
||||
- **high**: Would likely change the outcome of this comparison
|
||||
- **medium**: Would improve quality but may not change win/loss
|
||||
- **low**: Nice to have, marginal improvement
|
||||
|
||||
---
|
||||
|
||||
# Benchmark Mode Analysis
|
||||
|
||||
When used in Benchmark mode, the analyzer has a different purpose: **surface patterns and anomalies** across benchmark runs, not suggest skill improvements.
|
||||
|
||||
## Benchmark Role
|
||||
|
||||
Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone.
|
||||
|
||||
## Benchmark Inputs
|
||||
|
||||
You receive these parameters in your prompt:
|
||||
|
||||
- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results
|
||||
- **skill_path**: Path to the skill being benchmarked
|
||||
- **output_path**: Where to save the notes (as JSON array of strings)
|
||||
|
||||
## Benchmark Process
|
||||
|
||||
### Step 1: Read Benchmark Data
|
||||
|
||||
1. Read the benchmark.json containing all run results
|
||||
2. Note the configurations tested (with_skill, without_skill)
|
||||
3. Understand the run_summary aggregates already calculated
|
||||
|
||||
### Step 2: Analyze Per-Assertion Patterns
|
||||
|
||||
For each expectation across all runs:
|
||||
- Does it **always pass** in both configurations? (may not differentiate skill value)
|
||||
- Does it **always fail** in both configurations? (may be broken or beyond capability)
|
||||
- Does it **always pass with skill but fail without**? (skill clearly adds value here)
|
||||
- Does it **always fail with skill but pass without**? (skill may be hurting)
|
||||
- Is it **highly variable**? (flaky expectation or non-deterministic behavior)
|
||||
|
||||
### Step 3: Analyze Cross-Eval Patterns
|
||||
|
||||
Look for patterns across evals:
|
||||
- Are certain eval types consistently harder/easier?
|
||||
- Do some evals show high variance while others are stable?
|
||||
- Are there surprising results that contradict expectations?
|
||||
|
||||
### Step 4: Analyze Metrics Patterns
|
||||
|
||||
Look at time_seconds, tokens, tool_calls:
|
||||
- Does the skill significantly increase execution time?
|
||||
- Is there high variance in resource usage?
|
||||
- Are there outlier runs that skew the aggregates?
|
||||
|
||||
### Step 5: Generate Notes
|
||||
|
||||
Write freeform observations as a list of strings. Each note should:
|
||||
- State a specific observation
|
||||
- Be grounded in the data (not speculation)
|
||||
- Help the user understand something the aggregate metrics don't show
|
||||
|
||||
Examples:
|
||||
- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value"
|
||||
- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky"
|
||||
- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)"
|
||||
- "Skill adds 13s average execution time but improves pass rate by 50%"
|
||||
- "Token usage is 80% higher with skill, primarily due to script output parsing"
|
||||
- "All 3 without-skill runs for eval 1 produced empty output"
|
||||
|
||||
### Step 6: Write Notes
|
||||
|
||||
Save notes to `{output_path}` as a JSON array of strings:
|
||||
|
||||
```json
|
||||
[
|
||||
"Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
|
||||
"Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure",
|
||||
"Without-skill runs consistently fail on table extraction expectations",
|
||||
"Skill adds 13s average execution time but improves pass rate by 50%"
|
||||
]
|
||||
```
|
||||
|
||||
## Benchmark Guidelines
|
||||
|
||||
**DO:**
|
||||
- Report what you observe in the data
|
||||
- Be specific about which evals, expectations, or runs you're referring to
|
||||
- Note patterns that aggregate metrics would hide
|
||||
- Provide context that helps interpret the numbers
|
||||
|
||||
**DO NOT:**
|
||||
- Suggest improvements to the skill (that's Improve mode, not Benchmark)
|
||||
- Make subjective quality judgments ("the output was good/bad")
|
||||
- Speculate about causes without evidence
|
||||
- Repeat information already in the run_summary aggregates
|
||||
@@ -1,202 +0,0 @@
|
||||
# Blind Comparator Agent
|
||||
|
||||
Compare two outputs WITHOUT knowing which skill produced them.
|
||||
|
||||
## Role
|
||||
|
||||
The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach.
|
||||
|
||||
Your judgment is based purely on output quality and task completion.
|
||||
|
||||
## Inputs
|
||||
|
||||
You receive these parameters in your prompt:
|
||||
|
||||
- **output_a_path**: Path to the first output file or directory
|
||||
- **output_b_path**: Path to the second output file or directory
|
||||
- **eval_prompt**: The original task/prompt that was executed
|
||||
- **expectations**: List of expectations to check (optional - may be empty)
|
||||
|
||||
## Process
|
||||
|
||||
### Step 1: Read Both Outputs
|
||||
|
||||
1. Examine output A (file or directory)
|
||||
2. Examine output B (file or directory)
|
||||
3. Note the type, structure, and content of each
|
||||
4. If outputs are directories, examine all relevant files inside
|
||||
|
||||
### Step 2: Understand the Task
|
||||
|
||||
1. Read the eval_prompt carefully
|
||||
2. Identify what the task requires:
|
||||
- What should be produced?
|
||||
- What qualities matter (accuracy, completeness, format)?
|
||||
- What would distinguish a good output from a poor one?
|
||||
|
||||
### Step 3: Generate Evaluation Rubric
|
||||
|
||||
Based on the task, generate a rubric with two dimensions:
|
||||
|
||||
**Content Rubric** (what the output contains):
|
||||
| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
|
||||
|-----------|----------|----------------|---------------|
|
||||
| Correctness | Major errors | Minor errors | Fully correct |
|
||||
| Completeness | Missing key elements | Mostly complete | All elements present |
|
||||
| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
|
||||
|
||||
**Structure Rubric** (how the output is organized):
|
||||
| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
|
||||
|-----------|----------|----------------|---------------|
|
||||
| Organization | Disorganized | Reasonably organized | Clear, logical structure |
|
||||
| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished |
|
||||
| Usability | Difficult to use | Usable with effort | Easy to use |
|
||||
|
||||
Adapt criteria to the specific task. For example:
|
||||
- PDF form → "Field alignment", "Text readability", "Data placement"
|
||||
- Document → "Section structure", "Heading hierarchy", "Paragraph flow"
|
||||
- Data output → "Schema correctness", "Data types", "Completeness"
|
||||
|
||||
### Step 4: Evaluate Each Output Against the Rubric
|
||||
|
||||
For each output (A and B):
|
||||
|
||||
1. **Score each criterion** on the rubric (1-5 scale)
|
||||
2. **Calculate dimension totals**: Content score, Structure score
|
||||
3. **Calculate overall score**: Average of dimension scores, scaled to 1-10
|
||||
|
||||
### Step 5: Check Expectations (if provided)
|
||||
|
||||
If expectations are provided:
|
||||
|
||||
1. Check each expectation against output A
|
||||
2. Check each expectation against output B
|
||||
3. Count pass rates for each output
|
||||
4. Use expectation scores as secondary evidence (not the primary decision factor)
|
||||
|
||||
### Step 6: Determine the Winner
|
||||
|
||||
Compare A and B based on (in priority order):
|
||||
|
||||
1. **Primary**: Overall rubric score (content + structure)
|
||||
2. **Secondary**: Assertion pass rates (if applicable)
|
||||
3. **Tiebreaker**: If truly equal, declare a TIE
|
||||
|
||||
Be decisive - ties should be rare. One output is usually better, even if marginally.
|
||||
|
||||
### Step 7: Write Comparison Results
|
||||
|
||||
Save results to a JSON file at the path specified (or `comparison.json` if not specified).
|
||||
|
||||
## Output Format
|
||||
|
||||
Write a JSON file with this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"winner": "A",
|
||||
"reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
|
||||
"rubric": {
|
||||
"A": {
|
||||
"content": {
|
||||
"correctness": 5,
|
||||
"completeness": 5,
|
||||
"accuracy": 4
|
||||
},
|
||||
"structure": {
|
||||
"organization": 4,
|
||||
"formatting": 5,
|
||||
"usability": 4
|
||||
},
|
||||
"content_score": 4.7,
|
||||
"structure_score": 4.3,
|
||||
"overall_score": 9.0
|
||||
},
|
||||
"B": {
|
||||
"content": {
|
||||
"correctness": 3,
|
||||
"completeness": 2,
|
||||
"accuracy": 3
|
||||
},
|
||||
"structure": {
|
||||
"organization": 3,
|
||||
"formatting": 2,
|
||||
"usability": 3
|
||||
},
|
||||
"content_score": 2.7,
|
||||
"structure_score": 2.7,
|
||||
"overall_score": 5.4
|
||||
}
|
||||
},
|
||||
"output_quality": {
|
||||
"A": {
|
||||
"score": 9,
|
||||
"strengths": ["Complete solution", "Well-formatted", "All fields present"],
|
||||
"weaknesses": ["Minor style inconsistency in header"]
|
||||
},
|
||||
"B": {
|
||||
"score": 5,
|
||||
"strengths": ["Readable output", "Correct basic structure"],
|
||||
"weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
|
||||
}
|
||||
},
|
||||
"expectation_results": {
|
||||
"A": {
|
||||
"passed": 4,
|
||||
"total": 5,
|
||||
"pass_rate": 0.80,
|
||||
"details": [
|
||||
{"text": "Output includes name", "passed": true},
|
||||
{"text": "Output includes date", "passed": true},
|
||||
{"text": "Format is PDF", "passed": true},
|
||||
{"text": "Contains signature", "passed": false},
|
||||
{"text": "Readable text", "passed": true}
|
||||
]
|
||||
},
|
||||
"B": {
|
||||
"passed": 3,
|
||||
"total": 5,
|
||||
"pass_rate": 0.60,
|
||||
"details": [
|
||||
{"text": "Output includes name", "passed": true},
|
||||
{"text": "Output includes date", "passed": false},
|
||||
{"text": "Format is PDF", "passed": true},
|
||||
{"text": "Contains signature", "passed": false},
|
||||
{"text": "Readable text", "passed": true}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
If no expectations were provided, omit the `expectation_results` field entirely.
|
||||
|
||||
## Field Descriptions
|
||||
|
||||
- **winner**: "A", "B", or "TIE"
|
||||
- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie)
|
||||
- **rubric**: Structured rubric evaluation for each output
|
||||
- **content**: Scores for content criteria (correctness, completeness, accuracy)
|
||||
- **structure**: Scores for structure criteria (organization, formatting, usability)
|
||||
- **content_score**: Average of content criteria (1-5)
|
||||
- **structure_score**: Average of structure criteria (1-5)
|
||||
- **overall_score**: Combined score scaled to 1-10
|
||||
- **output_quality**: Summary quality assessment
|
||||
- **score**: 1-10 rating (should match rubric overall_score)
|
||||
- **strengths**: List of positive aspects
|
||||
- **weaknesses**: List of issues or shortcomings
|
||||
- **expectation_results**: (Only if expectations provided)
|
||||
- **passed**: Number of expectations that passed
|
||||
- **total**: Total number of expectations
|
||||
- **pass_rate**: Fraction passed (0.0 to 1.0)
|
||||
- **details**: Individual expectation results
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality.
|
||||
- **Be specific**: Cite specific examples when explaining strengths and weaknesses.
|
||||
- **Be decisive**: Choose a winner unless outputs are genuinely equivalent.
|
||||
- **Output quality first**: Assertion scores are secondary to overall task completion.
|
||||
- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness.
|
||||
- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner.
|
||||
- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better.
|
||||
@@ -1,181 +0,0 @@
|
||||
# Executor Agent
|
||||
|
||||
Execute an eval prompt using a skill and produce a detailed transcript.
|
||||
|
||||
## Role
|
||||
|
||||
The Executor runs a single eval case: load the skill, execute the prompt with staged input files, and document everything in a transcript. The transcript serves as evidence for the grader to evaluate expectations.
|
||||
|
||||
## Inputs
|
||||
|
||||
You receive these parameters in your prompt:
|
||||
|
||||
- **skill_path**: Path to the skill directory (contains SKILL.md and supporting files)
|
||||
- **prompt**: The eval prompt to execute
|
||||
- **input_files_dir**: Directory containing staged input files (may be empty)
|
||||
- **output_dir**: Where to save transcript and any outputs
|
||||
|
||||
## Process
|
||||
|
||||
### Step 1: Load the Skill
|
||||
|
||||
1. Read `SKILL.md` at the skill_path
|
||||
2. Read any referenced files (scripts, templates, examples)
|
||||
3. Understand what the skill enables and how to use it
|
||||
|
||||
### Step 2: Prepare Inputs
|
||||
|
||||
1. List files in input_files_dir (if any)
|
||||
2. Note file types, sizes, and purposes
|
||||
3. These are the eval's test inputs - use them as specified in the prompt
|
||||
|
||||
### Step 3: Execute the Prompt
|
||||
|
||||
1. Follow the skill's instructions to accomplish the prompt
|
||||
2. Use the staged input files as needed
|
||||
3. Make reasonable decisions when the skill doesn't specify exact behavior
|
||||
4. Handle errors gracefully and document them
|
||||
|
||||
### Step 4: Save Outputs
|
||||
|
||||
1. Save any files you create to output_dir
|
||||
2. Name files descriptively (e.g., `filled_form.pdf`, `extracted_data.json`)
|
||||
3. Note what each output file contains
|
||||
|
||||
### Step 5: Write Transcript, Metrics, and User Notes
|
||||
|
||||
Save outputs to `{output_dir}/`:
|
||||
- `transcript.md` - Detailed execution log
|
||||
- `metrics.json` - Tool usage and performance data
|
||||
- `user_notes.md` - Uncertainties and issues needing human attention
|
||||
|
||||
## Transcript Format
|
||||
|
||||
```markdown
|
||||
# Eval Execution Transcript
|
||||
|
||||
## Eval Prompt
|
||||
[The exact prompt you were given]
|
||||
|
||||
## Skill
|
||||
- Path: [skill_path]
|
||||
- Name: [skill name from frontmatter]
|
||||
- Description: [brief description]
|
||||
|
||||
## Input Files
|
||||
- [filename1]: [description/type]
|
||||
- [filename2]: [description/type]
|
||||
- (or "None provided")
|
||||
|
||||
## Execution
|
||||
|
||||
### Step 1: [Action Description]
|
||||
**Action**: [What you did]
|
||||
**Tool**: [Tool name and key parameters]
|
||||
**Result**: [What happened - success, failure, output]
|
||||
|
||||
### Step 2: [Action Description]
|
||||
[Continue for each significant action...]
|
||||
|
||||
## Output Files
|
||||
- [filename]: [description, location in output_dir]
|
||||
- (or "None created")
|
||||
|
||||
## Final Result
|
||||
[The final answer/output for the eval prompt]
|
||||
|
||||
## Issues
|
||||
- [Any errors, warnings, or unexpected behaviors]
|
||||
- (or "None")
|
||||
```
|
||||
|
||||
## User Notes Format
|
||||
|
||||
Save `{output_dir}/user_notes.md` to capture things that look reasonable but may have hidden issues:
|
||||
|
||||
```markdown
|
||||
# User Notes
|
||||
|
||||
## Uncertainty
|
||||
- [Things you're not 100% sure about]
|
||||
- [Assumptions you made that might be wrong]
|
||||
- [Data that might be stale or incomplete]
|
||||
|
||||
## Needs Human Review
|
||||
- [Sections that require domain expertise to verify]
|
||||
- [Outputs that could be misleading]
|
||||
- [Edge cases you weren't sure how to handle]
|
||||
|
||||
## Workarounds
|
||||
- [Places where the skill didn't work as expected]
|
||||
- [Alternative approaches you took]
|
||||
- [Things that should work but didn't]
|
||||
|
||||
## Suggestions
|
||||
- [Improvements to the skill that would help]
|
||||
- [Missing instructions that caused confusion]
|
||||
- [Tools or capabilities that would be useful]
|
||||
```
|
||||
|
||||
**IMPORTANT**: Always write user_notes.md, even if empty. This surfaces issues that might otherwise be buried in a "successful" execution. If everything went perfectly, write:
|
||||
|
||||
```markdown
|
||||
# User Notes
|
||||
|
||||
No uncertainties, issues, or suggestions to report. Execution completed as expected.
|
||||
```
|
||||
|
||||
## Metrics Format
|
||||
|
||||
Save `{output_dir}/metrics.json` with tool usage and output size:
|
||||
|
||||
```json
|
||||
{
|
||||
"tool_calls": {
|
||||
"Read": 5,
|
||||
"Write": 2,
|
||||
"Bash": 8,
|
||||
"Edit": 1,
|
||||
"Glob": 2,
|
||||
"Grep": 0
|
||||
},
|
||||
"total_tool_calls": 18,
|
||||
"total_steps": 6,
|
||||
"files_created": ["filled_form.pdf", "field_values.json"],
|
||||
"errors_encountered": 0,
|
||||
"output_chars": 0,
|
||||
"transcript_chars": 0
|
||||
}
|
||||
```
|
||||
|
||||
**IMPORTANT**: After writing all outputs and transcript, calculate and record character counts as a proxy for token usage:
|
||||
|
||||
```bash
|
||||
# Get transcript size
|
||||
transcript_chars=$(wc -c < "{output_dir}/transcript.md" | tr -d ' ')
|
||||
|
||||
# Get total output size (sum of all files in output_dir)
|
||||
output_chars=$(find "{output_dir}" -type f ! -name "metrics.json" -exec cat {} + 2>/dev/null | wc -c | tr -d ' ')
|
||||
|
||||
# Update metrics.json with sizes
|
||||
python3 << EOF
|
||||
import json
|
||||
with open("{output_dir}/metrics.json") as f:
|
||||
m = json.load(f)
|
||||
m["transcript_chars"] = int("$transcript_chars")
|
||||
m["output_chars"] = int("$output_chars")
|
||||
with open("{output_dir}/metrics.json", "w") as f:
|
||||
json.dump(m, f, indent=2)
|
||||
EOF
|
||||
```
|
||||
|
||||
Track every tool you call during execution. This data helps measure skill efficiency.
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Document thoroughly**: The grader will use your transcript to evaluate expectations
|
||||
- **Include tool calls**: Show what tools you used and their results
|
||||
- **Capture outputs**: Both inline results and saved files matter
|
||||
- **Be honest about issues**: Don't hide errors; document them clearly
|
||||
- **Follow the skill**: Execute as the skill instructs, not how you might do it otherwise
|
||||
- **Stay focused**: Complete the eval prompt, nothing more
|
||||
@@ -1,223 +0,0 @@
|
||||
# Grader Agent
|
||||
|
||||
Evaluate expectations against an execution transcript and outputs.
|
||||
|
||||
## Role
|
||||
|
||||
The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment.
|
||||
|
||||
You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so.
|
||||
|
||||
## Inputs
|
||||
|
||||
You receive these parameters in your prompt:
|
||||
|
||||
- **expectations**: List of expectations to evaluate (strings)
|
||||
- **transcript_path**: Path to the execution transcript (markdown file)
|
||||
- **outputs_dir**: Directory containing output files from execution
|
||||
|
||||
## Process
|
||||
|
||||
### Step 1: Read the Transcript
|
||||
|
||||
1. Read the transcript file completely
|
||||
2. Note the eval prompt, execution steps, and final result
|
||||
3. Identify any issues or errors documented
|
||||
|
||||
### Step 2: Examine Output Files
|
||||
|
||||
1. List files in outputs_dir
|
||||
2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced.
|
||||
3. Note contents, structure, and quality
|
||||
|
||||
### Step 3: Evaluate Each Expectation
|
||||
|
||||
For each expectation:
|
||||
|
||||
1. **Search for evidence** in the transcript and outputs
|
||||
2. **Determine verdict**:
|
||||
- **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance
|
||||
- **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content)
|
||||
3. **Cite the evidence**: Quote the specific text or describe what you found
|
||||
|
||||
### Step 4: Extract and Verify Claims
|
||||
|
||||
Beyond the predefined expectations, extract implicit claims from the outputs and verify them:
|
||||
|
||||
1. **Extract claims** from the transcript and outputs:
|
||||
- Factual statements ("The form has 12 fields")
|
||||
- Process claims ("Used pypdf to fill the form")
|
||||
- Quality claims ("All fields were filled correctly")
|
||||
|
||||
2. **Verify each claim**:
|
||||
- **Factual claims**: Can be checked against the outputs or external sources
|
||||
- **Process claims**: Can be verified from the transcript
|
||||
- **Quality claims**: Evaluate whether the claim is justified
|
||||
|
||||
3. **Flag unverifiable claims**: Note claims that cannot be verified with available information
|
||||
|
||||
This catches issues that predefined expectations might miss.
|
||||
|
||||
### Step 5: Read User Notes
|
||||
|
||||
If `{outputs_dir}/user_notes.md` exists:
|
||||
1. Read it and note any uncertainties or issues flagged by the executor
|
||||
2. Include relevant concerns in the grading output
|
||||
3. These may reveal problems even when expectations pass
|
||||
|
||||
### Step 6: Critique the Evals
|
||||
|
||||
After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap.
|
||||
|
||||
Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't.
|
||||
|
||||
Suggestions worth raising:
|
||||
- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content)
|
||||
- An important outcome you observed — good or bad — that no assertion covers at all
|
||||
- An assertion that can't actually be verified from the available outputs
|
||||
|
||||
Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion.
|
||||
|
||||
### Step 7: Write Grading Results
|
||||
|
||||
Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir).
|
||||
|
||||
## Grading Criteria
|
||||
|
||||
**PASS when**:
|
||||
- The transcript or outputs clearly demonstrate the expectation is true
|
||||
- Specific evidence can be cited
|
||||
- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename)
|
||||
|
||||
**FAIL when**:
|
||||
- No evidence found for the expectation
|
||||
- Evidence contradicts the expectation
|
||||
- The expectation cannot be verified from available information
|
||||
- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete
|
||||
- The output appears to meet the assertion by coincidence rather than by actually doing the work
|
||||
|
||||
**When uncertain**: Fail. The burden of proof is on the evidence to demonstrate that the expectation passes.
|
||||
|
||||
### Step 8: Read Executor Metrics and Timing
|
||||
|
||||
1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output
|
||||
2. If `{outputs_dir}/../timing.json` exists, read it and include timing data
|
||||
|
||||
## Output Format
|
||||
|
||||
Write a JSON file with this structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"expectations": [
|
||||
{
|
||||
"text": "The output includes the name 'John Smith'",
|
||||
"passed": true,
|
||||
"evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
|
||||
},
|
||||
{
|
||||
"text": "The spreadsheet has a SUM formula in cell B10",
|
||||
"passed": false,
|
||||
"evidence": "No spreadsheet was created. The output was a text file."
|
||||
},
|
||||
{
|
||||
"text": "The assistant used the skill's OCR script",
|
||||
"passed": true,
|
||||
"evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'"
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"passed": 2,
|
||||
"failed": 1,
|
||||
"total": 3,
|
||||
"pass_rate": 0.67
|
||||
},
|
||||
"execution_metrics": {
|
||||
"tool_calls": {
|
||||
"Read": 5,
|
||||
"Write": 2,
|
||||
"Bash": 8
|
||||
},
|
||||
"total_tool_calls": 15,
|
||||
"total_steps": 6,
|
||||
"errors_encountered": 0,
|
||||
"output_chars": 12450,
|
||||
"transcript_chars": 3200
|
||||
},
|
||||
"timing": {
|
||||
"executor_duration_seconds": 165.0,
|
||||
"grader_duration_seconds": 26.0,
|
||||
"total_duration_seconds": 191.0
|
||||
},
|
||||
"claims": [
|
||||
{
|
||||
"claim": "The form has 12 fillable fields",
|
||||
"type": "factual",
|
||||
"verified": true,
|
||||
"evidence": "Counted 12 fields in field_info.json"
|
||||
},
|
||||
{
|
||||
"claim": "All required fields were populated",
|
||||
"type": "quality",
|
||||
"verified": false,
|
||||
"evidence": "Reference section was left blank despite data being available"
|
||||
}
|
||||
],
|
||||
"user_notes_summary": {
|
||||
"uncertainties": ["Used 2023 data, may be stale"],
|
||||
"needs_review": [],
|
||||
"workarounds": ["Fell back to text overlay for non-fillable fields"]
|
||||
},
|
||||
"eval_feedback": {
|
||||
"suggestions": [
|
||||
{
|
||||
"assertion": "The output includes the name 'John Smith'",
|
||||
"reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input"
|
||||
},
|
||||
{
|
||||
"reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught"
|
||||
}
|
||||
],
|
||||
"overall": "Assertions check presence but not correctness. Consider adding content verification."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Field Descriptions
|
||||
|
||||
- **expectations**: Array of graded expectations
|
||||
- **text**: The original expectation text
|
||||
- **passed**: Boolean - true if expectation passes
|
||||
- **evidence**: Specific quote or description supporting the verdict
|
||||
- **summary**: Aggregate statistics
|
||||
- **passed**: Count of passed expectations
|
||||
- **failed**: Count of failed expectations
|
||||
- **total**: Total expectations evaluated
|
||||
- **pass_rate**: Fraction passed (0.0 to 1.0)
|
||||
- **execution_metrics**: Copied from executor's metrics.json (if available)
|
||||
- **output_chars**: Total character count of output files (proxy for tokens)
|
||||
- **transcript_chars**: Character count of transcript
|
||||
- **timing**: Wall clock timing from timing.json (if available)
|
||||
- **executor_duration_seconds**: Time spent in executor subagent
|
||||
- **grader_duration_seconds**: Time spent in grader subagent
|
||||
- **total_duration_seconds**: Total elapsed time for the run
|
||||
- **claims**: Extracted and verified claims from the output
|
||||
- **claim**: The statement being verified
|
||||
- **type**: "factual", "process", or "quality"
|
||||
- **verified**: Boolean - whether the claim holds
|
||||
- **evidence**: Supporting or contradicting evidence
|
||||
- **user_notes_summary**: Issues flagged by the executor
|
||||
- **uncertainties**: Things the executor wasn't sure about
|
||||
- **needs_review**: Items requiring human attention
|
||||
- **workarounds**: Places where the skill didn't work as expected
|
||||
- **eval_feedback**: Improvement suggestions for the evals (only when warranted)
|
||||
- **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to
|
||||
- **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag
|
||||
|
||||
## Guidelines
|
||||
|
||||
- **Be objective**: Base verdicts on evidence, not assumptions
|
||||
- **Be specific**: Quote the exact text that supports your verdict
|
||||
- **Be thorough**: Check both transcript and output files
|
||||
- **Be consistent**: Apply the same standard to each expectation
|
||||
- **Explain failures**: Make it clear why evidence was insufficient
|
||||
- **No partial credit**: Each expectation is pass or fail, not partial
|
||||
@@ -1,149 +0,0 @@
|
||||
# Benchmark Mode Reference
|
||||
|
||||
**Requires subagents.** Benchmark mode relies on parallel execution of many independent runs. Without subagents, use Eval mode for individual eval testing instead.
|
||||
|
||||
Benchmark mode runs a standardized, opinionated evaluation of a skill. It answers: **"How well does this skill perform?"**
|
||||
|
||||
Unlike Eval mode (which runs individual evals), Benchmark mode:
|
||||
- Runs **all evals** (or a user-specified subset)
|
||||
- Runs each eval **3 times per configuration** (with_skill and without_skill) for variance
|
||||
- Captures **metrics with variance**: pass_rate, time_seconds, tokens
|
||||
- Uses the **most capable model** as analyzer to surface patterns and anomalies
|
||||
- Produces **persistent, structured output** for cross-run analysis
|
||||
|
||||
## When to Use
|
||||
|
||||
- **Understanding performance**: "How does my skill perform?"
|
||||
- **Cross-model comparison**: "How does this skill perform across different models?"
|
||||
- **Regression detection**: Compare benchmark results over time
|
||||
- **Skill validation**: Does the skill actually add value over no-skill baseline?
|
||||
|
||||
## Defaults
|
||||
|
||||
**Always include no-skill baseline.** Every benchmark runs both `with_skill` and `without_skill` configurations. This measures the value the skill adds - without a baseline, you can't know if the skill helps.
|
||||
|
||||
**Suggest models for comparison.** If the user wants to compare across models, suggest a couple of commonly available models in your environment. Don't hardcode model names - just recommend what's commonly used and available.
|
||||
|
||||
**Run on current model by default.** If the user doesn't specify, run on whatever model is currently active. For cross-model comparison, ask which models they'd like to test.
|
||||
|
||||
## Terminology
|
||||
|
||||
| Term | Definition |
|
||||
|------|------------|
|
||||
| **Run** | A single execution of a skill on an eval prompt |
|
||||
| **Configuration** | The experimental condition: `with_skill` or `without_skill` |
|
||||
| **RunResult** | Graded output of a run: expectations, metrics, notes |
|
||||
| **Run Summary** | Statistical aggregates across runs: mean, stddev, min, max |
|
||||
| **Notes** | Freeform observations from the analyzer |
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
1. Setup
|
||||
→ Choose workspace location (ask user, suggest <skill>-workspace/)
|
||||
→ Verify evals exist
|
||||
→ Determine which evals to run (all by default, or user subset)
|
||||
|
||||
2. Execute runs (parallel where possible)
|
||||
→ For each eval:
|
||||
→ 3 runs with_skill configuration
|
||||
→ 3 runs without_skill configuration
|
||||
→ Each run captures: transcript, outputs, metrics
|
||||
→ Coordinator extracts from Task result: total_tokens, tool_uses, duration_ms
|
||||
|
||||
3. Grade runs (parallel)
|
||||
→ Spawn grader for each run
|
||||
→ Produces: expectations with pass/fail, notes
|
||||
|
||||
4. Aggregate results
|
||||
→ Calculate run_summary per configuration:
|
||||
→ pass_rate: mean, stddev, min, max
|
||||
→ time_seconds: mean, stddev, min, max
|
||||
→ tokens: mean, stddev, min, max
|
||||
→ Calculate delta between configurations
|
||||
|
||||
5. Analyze (most capable model)
|
||||
→ Review all results
|
||||
→ Surface patterns, anomalies, observations as freeform notes
|
||||
→ Examples:
|
||||
- "Assertion X passes 100% in both configurations - may not differentiate skill value"
|
||||
- "Eval 3 shows high variance (50% ± 40%) - may be flaky"
|
||||
- "Skill adds 13s average time but improves pass rate by 50%"
|
||||
|
||||
6. Generate benchmark
|
||||
→ benchmark.json - Structured data for analysis
|
||||
→ benchmark.md - Human-readable summary
|
||||
```
|
||||
|
||||
## Spawning Executors
|
||||
|
||||
Run executor subagents in the background for parallelism. When each agent completes, capture the execution metrics (tokens consumed, tool calls, duration) from the completion notification.
|
||||
|
||||
For example, in Claude Code, background subagents deliver a `<task-notification>` with a `<usage>` block:
|
||||
|
||||
```xml
|
||||
<!-- Example: Claude Code task notification format -->
|
||||
<task-notification>
|
||||
<task-id>...</task-id>
|
||||
<status>completed</status>
|
||||
<result>agent's actual text output</result>
|
||||
<usage>total_tokens: 3700
|
||||
tool_uses: 2
|
||||
duration_ms: 32400</usage>
|
||||
</task-notification>
|
||||
```
|
||||
|
||||
Extract from each completed executor's metrics:
|
||||
- **total_tokens** = total tokens consumed (input + output combined)
|
||||
- **tool_uses** = number of tool calls made
|
||||
- **duration_ms** / 1000 = execution time in seconds
|
||||
|
||||
The exact format of completion notifications varies by environment — look for token counts, tool call counts, and duration in whatever format your environment provides.
|
||||
|
||||
Record these per-run metrics alongside the grading results. The aggregate script can then compute mean/stddev/min/max across runs for each configuration.
|
||||
|
||||
## Scripts
|
||||
|
||||
Use these scripts at specific points in the workflow:
|
||||
|
||||
### After Grading (Step 4: Aggregate)
|
||||
|
||||
```bash
|
||||
# Aggregate all grading.json files into benchmark summary
|
||||
scripts/aggregate_benchmark.py <benchmark-dir> --skill-name <name> --skill-path <path>
|
||||
```
|
||||
|
||||
This reads `grading.json` from each run directory and produces:
|
||||
- `benchmark.json` - Structured results with run_summary statistics
|
||||
- `benchmark.md` - Human-readable summary table
|
||||
|
||||
### Validation
|
||||
|
||||
```bash
|
||||
# Validate benchmark output
|
||||
scripts/validate_json.py <benchmark-dir>/benchmark.json
|
||||
|
||||
# Validate individual grading files
|
||||
scripts/validate_json.py <run-dir>/grading.json --type grading
|
||||
```
|
||||
|
||||
### Initialize Templates
|
||||
|
||||
```bash
|
||||
# Create empty benchmark.json with correct structure (if not using aggregate script)
|
||||
scripts/init_json.py benchmark <benchmark-dir>/benchmark.json
|
||||
```
|
||||
|
||||
## Analyzer Instructions
|
||||
|
||||
The analyzer (always most capable) reviews all results and generates freeform notes. See `agents/analyzer.md` for the full prompt, but key responsibilities:
|
||||
|
||||
1. **Compare configurations**: Which performed better? By how much?
|
||||
2. **Identify patterns**: Assertions that always pass/fail, high variance, etc.
|
||||
3. **Surface anomalies**: Unexpected results, broken evals, regressions
|
||||
4. **Provide context**: Why might these patterns exist?
|
||||
|
||||
The analyzer should NOT:
|
||||
- Suggest improvements to the skill (that's Improve mode)
|
||||
- Make subjective quality judgments beyond the data
|
||||
- Speculate without evidence
|
||||
@@ -1,144 +0,0 @@
|
||||
# Eval Mode Reference
|
||||
|
||||
Eval mode runs skill evals and grades expectations. Enables measuring skill performance, comparing with/without skill, and validating that skills add value.
|
||||
|
||||
## Purpose
|
||||
|
||||
Evals serve to:
|
||||
1. **Set a floor** - Prove the skill helps Claude do something it couldn't by default
|
||||
2. **Raise the ceiling** - Enable iterating on skills to improve performance
|
||||
3. **Measure holistically** - Capture metrics beyond pass/fail (time, tokens)
|
||||
4. **Understand cross-model behavior** - Test skills across different models
|
||||
|
||||
## Eval Workflow
|
||||
|
||||
```
|
||||
0. Choose Workspace Location
|
||||
→ Ask user where to put workspace, suggest sensible default
|
||||
|
||||
1. Check Dependencies
|
||||
→ Scan skill for dependencies, confirm availability with user
|
||||
|
||||
2. Prepare (scripts/prepare_eval.py)
|
||||
→ Create task, copy skill, stage files
|
||||
|
||||
3. Execute (agents/executor.md)
|
||||
→ Update task to implementing, spawn executor sub-agent
|
||||
→ Executor reads skill, runs prompt, saves transcript
|
||||
|
||||
4. Grade (agents/grader.md)
|
||||
→ Update task to reviewing, spawn grader sub-agent
|
||||
→ Grader reads transcript + outputs, evaluates expectations
|
||||
|
||||
5. Complete task, display results
|
||||
→ Pass/fail per expectation, overall pass rate, metrics
|
||||
```
|
||||
|
||||
## Step 0: Setup
|
||||
|
||||
**Before running any evals, read the output schemas:**
|
||||
|
||||
```bash
|
||||
# Read to understand the JSON structures you'll produce
|
||||
Read references/schemas.md
|
||||
```
|
||||
|
||||
This ensures you know the expected structure for:
|
||||
- `grading.json` - What the grader produces
|
||||
- `metrics.json` - What the executor produces
|
||||
- `timing.json` - Wall clock timing format
|
||||
|
||||
**Choose workspace location:**
|
||||
|
||||
1. **Suggest default**: `<skill-name>-workspace/` as a sibling to the skill directory
|
||||
2. **Ask the user** using AskUserQuestion — if the workspace is inside a git repo, suggest adding it to `.gitignore`
|
||||
3. **Create the workspace directory** once confirmed
|
||||
|
||||
## Step 1: Check Dependencies
|
||||
|
||||
Before running evals, scan the skill for dependencies:
|
||||
|
||||
1. Read SKILL.md (including `compatibility` frontmatter field)
|
||||
2. Check referenced scripts for required tools
|
||||
3. Present to user and confirm availability
|
||||
|
||||
## Step 2: Prepare and Create Task
|
||||
|
||||
Run prepare script and create task:
|
||||
|
||||
```bash
|
||||
scripts/prepare_eval.py <skill-path> <eval-id> --output-dir <workspace>/eval-<id>/
|
||||
```
|
||||
|
||||
```python
|
||||
task = TaskCreate(
|
||||
subject=f"Eval {eval_id}"
|
||||
)
|
||||
TaskUpdate(task, status="planning")
|
||||
```
|
||||
|
||||
## Step 3: Execute
|
||||
|
||||
Update task to `implementing` and run the executor:
|
||||
|
||||
```bash
|
||||
echo "{\"executor_start\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > <run-dir>/timing.json
|
||||
```
|
||||
|
||||
**With subagents**: Spawn an executor subagent with these instructions:
|
||||
|
||||
```
|
||||
Read agents/executor.md at: <skill-creator-path>/agents/executor.md
|
||||
|
||||
Execute this eval:
|
||||
- Skill path: <workspace>/skill/
|
||||
- Prompt: <eval prompt from eval_metadata.json>
|
||||
- Input files: <workspace>/eval-<id>/inputs/
|
||||
- Save transcript to: <workspace>/eval-<id>/transcript.md
|
||||
- Save outputs to: <workspace>/eval-<id>/outputs/
|
||||
```
|
||||
|
||||
**Without subagents**: Read `agents/executor.md` and follow the procedure directly — execute the eval, save the transcript, and produce outputs inline.
|
||||
|
||||
After execution completes, update timing.json with executor_end and duration.
|
||||
|
||||
## Step 4: Grade
|
||||
|
||||
Update task to `reviewing` and run the grader:
|
||||
|
||||
**With subagents**: Spawn a grader subagent with these instructions:
|
||||
|
||||
```
|
||||
Read agents/grader.md at: <skill-creator-path>/agents/grader.md
|
||||
|
||||
Grade these expectations:
|
||||
- Assertions: <list from eval_metadata.json>
|
||||
- Transcript: <workspace>/eval-<id>/transcript.md
|
||||
- Outputs: <workspace>/eval-<id>/outputs/
|
||||
- Save grading to: <workspace>/eval-<id>/grading.json
|
||||
```
|
||||
|
||||
**Without subagents**: Read `agents/grader.md` and follow the procedure directly — evaluate expectations against the transcript and outputs, then save grading.json.
|
||||
|
||||
After grading completes, finalize timing.json.
|
||||
|
||||
## Step 5: Display Results
|
||||
|
||||
Update task to `completed`. Display:
|
||||
- Pass/fail status for each expectation with evidence
|
||||
- Overall pass rate
|
||||
- Execution metrics from grading.json
|
||||
- Wall clock time from timing.json
|
||||
- **User notes summary**: Uncertainties, workarounds, and suggestions from the executor (may reveal issues even when expectations pass)
|
||||
|
||||
## Comparison Workflow
|
||||
|
||||
To compare skill-enabled vs no-skill performance:
|
||||
|
||||
```
|
||||
1. Prepare both runs (with --no-skill flag for baseline)
|
||||
2. Execute both (parallel executors)
|
||||
3. Grade both (parallel graders)
|
||||
4. Blind-compare outputs
|
||||
5. Report winner with analysis
|
||||
```
|
||||
@@ -1,190 +0,0 @@
|
||||
# Mode Workflow Diagrams
|
||||
|
||||
Visual representations of how each mode orchestrates building blocks.
|
||||
|
||||
## Quality Assessment (Eval Mode)
|
||||
|
||||
Measures how well a skill performs on its evals.
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ Executor │────▶│ Grader │────▶│ Aggregate │
|
||||
│ (N runs) │ │ (N runs) │ │ Results │
|
||||
└─────────────┘ └─────────────┘ └─────────────┘
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
transcript.md grading.json summary.json
|
||||
user_notes.md claims[]
|
||||
metrics.json user_notes_summary
|
||||
```
|
||||
|
||||
Use for: Testing skill performance, validating skill value.
|
||||
|
||||
## Skill Improvement (Improve Mode)
|
||||
|
||||
Iteratively improves a skill through blind comparison.
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ ITERATION LOOP │
|
||||
│ │
|
||||
│ ┌─────────┐ ┌─────────┐ ┌────────────┐ ┌──────────┐ │
|
||||
│ │Executor │──▶│ Grader │──▶│ Comparator │──▶│ Analyzer │ │
|
||||
│ │(3 runs) │ │(3 runs) │ │ (blind) │ │(post-hoc)│ │
|
||||
│ └─────────┘ └─────────┘ └────────────┘ └──────────┘ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ │ ▼ │
|
||||
│ │ │ │ suggestions │
|
||||
│ │ │ ▼ │ │
|
||||
│ │ │ winner A/B │ │
|
||||
│ │ ▼ │ │
|
||||
│ │ pass_rate │ │
|
||||
│ ▼ ▼ │
|
||||
│ transcript ┌──────────────────┐ │
|
||||
│ user_notes │ Apply to v(N+1) │ │
|
||||
│ └────────┬─────────┘ │
|
||||
│ │ │
|
||||
└───────────────────────────────────────────────────┼─────────────┘
|
||||
│
|
||||
(repeat until
|
||||
goal or timeout)
|
||||
```
|
||||
|
||||
Use for: Optimizing skill performance, iterating on skill instructions.
|
||||
|
||||
## A/B Testing (with vs without skill)
|
||||
|
||||
Compares skill-enabled vs no-skill performance.
|
||||
|
||||
```
|
||||
┌─────────────────┐
|
||||
│ Executor │──┐
|
||||
│ (with skill) │ │ ┌────────────┐ ┌─────────┐
|
||||
└─────────────────┘ ├────▶│ Comparator │────▶│ Report │
|
||||
┌─────────────────┐ │ │ (blind) │ │ winner │
|
||||
│ Executor │──┘ └────────────┘ └─────────┘
|
||||
│ (without skill) │ │
|
||||
└─────────────────┘ ▼
|
||||
rubric scores
|
||||
expectation results
|
||||
```
|
||||
|
||||
Use for: Proving skill adds value, measuring skill impact.
|
||||
|
||||
## Skill Creation (Create Mode)
|
||||
|
||||
Interactive skill development with user feedback.
|
||||
|
||||
```
|
||||
┌───────────────────────────────────────────────────────────────┐
|
||||
│ USER FEEDBACK LOOP │
|
||||
│ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌──────────┐ │
|
||||
│ │ Interview │──▶│ Research │──▶│ Draft │──▶│ Run │ │
|
||||
│ │ User │ │ via MCPs │ │ SKILL.md │ │ Example │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ └──────────┘ │
|
||||
│ ▲ │ │
|
||||
│ │ ▼ │
|
||||
│ │ user sees │
|
||||
│ │ transcript │
|
||||
│ │ │ │
|
||||
│ └───────────────────────────────────────────────┘ │
|
||||
│ refine │
|
||||
└───────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
(once criteria defined,
|
||||
transition to Improve)
|
||||
```
|
||||
|
||||
Use for: Creating new skills with tight user feedback.
|
||||
|
||||
## Skill Benchmark (Benchmark Mode)
|
||||
|
||||
Standardized performance measurement with variance analysis.
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────────────┐
|
||||
│ FOR EACH EVAL │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────┐ ┌─────────────────────────────┐│
|
||||
│ │ WITH SKILL (3x) │ │ WITHOUT SKILL (3x) ││
|
||||
│ │ ┌─────────┐ ┌─────────┐ │ │ ┌─────────┐ ┌─────────┐ ││
|
||||
│ │ │Executor │ │ Grader │ ───┐ │ │ │Executor │ │ Grader │───┐││
|
||||
│ │ │ run 1 │→│ run 1 │ │ │ │ │ run 1 │→│ run 1 │ │││
|
||||
│ │ └─────────┘ └─────────┘ │ │ │ └─────────┘ └─────────┘ │││
|
||||
│ │ ┌─────────┐ ┌─────────┐ │ │ │ ┌─────────┐ ┌─────────┐ │││
|
||||
│ │ │Executor │ │ Grader │ ───┼───│──│──│Executor │ │ Grader │───┼││
|
||||
│ │ │ run 2 │→│ run 2 │ │ │ │ │ run 2 │→│ run 2 │ │││
|
||||
│ │ └─────────┘ └─────────┘ │ │ │ └─────────┘ └─────────┘ │││
|
||||
│ │ ┌─────────┐ ┌─────────┐ │ │ │ ┌─────────┐ ┌─────────┐ │││
|
||||
│ │ │Executor │ │ Grader │ ───┘ │ │ │Executor │ │ Grader │───┘││
|
||||
│ │ │ run 3 │→│ run 3 │ │ │ │ run 3 │→│ run 3 │ ││
|
||||
│ │ └─────────┘ └─────────┘ │ │ └─────────┘ └─────────┘ ││
|
||||
│ └─────────────────────────────────┘ └─────────────────────────────┘│
|
||||
│ │ │ │
|
||||
│ └────────┬───────────┘ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────┐ │
|
||||
│ │ Analyzer │ │
|
||||
│ │ (most capable) │ │
|
||||
│ └────────┬────────┘ │
|
||||
│ │ │
|
||||
└───────────────────────────────────────┼──────────────────────────────┘
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ benchmark.json │
|
||||
│ benchmark.md │
|
||||
└─────────────────┘
|
||||
```
|
||||
|
||||
Captures per-run: pass_rate, time_seconds, tokens, tool_calls, notes
|
||||
Aggregates: mean, stddev, min, max for each metric across configurations
|
||||
Analyzer surfaces: patterns, anomalies, and freeform observations
|
||||
|
||||
Use for: Understanding skill performance, comparing across models, tracking regressions.
|
||||
|
||||
---
|
||||
|
||||
## Inline Workflows (Without Subagents)
|
||||
|
||||
When subagents aren't available, the same building blocks execute sequentially in the main loop.
|
||||
|
||||
### Eval Mode (Inline)
|
||||
|
||||
```
|
||||
┌───────────────────────────────────────────────────────┐
|
||||
│ MAIN LOOP │
|
||||
│ │
|
||||
│ Read executor.md → Execute eval → Save outputs │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ Read grader.md → Grade expectations → Save grading │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ Display results │
|
||||
└───────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Improve Mode (Inline)
|
||||
|
||||
```
|
||||
┌───────────────────────────────────────────────────────┐
|
||||
│ ITERATION LOOP │
|
||||
│ │
|
||||
│ Read executor.md → Execute (1 run) → Save outputs │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ Read grader.md → Grade expectations → Save grading │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ Compare with previous best (inline, not blind) │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ Analyze differences → Apply improvements to v(N+1) │
|
||||
│ │ │
|
||||
│ (repeat until goal or timeout) │
|
||||
└───────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Benchmark mode requires subagents and has no inline equivalent.
|
||||
@@ -1,438 +0,0 @@
|
||||
# JSON Schemas
|
||||
|
||||
This document defines the JSON schemas used by skill-creator-edge.
|
||||
|
||||
## Working with JSON Files
|
||||
|
||||
### Initialize a new file with correct structure
|
||||
|
||||
```bash
|
||||
scripts/init_json.py <type> <output-path>
|
||||
|
||||
# Examples:
|
||||
scripts/init_json.py evals evals/evals.json
|
||||
scripts/init_json.py grading run-1/grading.json
|
||||
scripts/init_json.py benchmark benchmarks/2026-01-15/benchmark.json
|
||||
scripts/init_json.py metrics run-1/outputs/metrics.json
|
||||
```
|
||||
|
||||
### Validate an existing file
|
||||
|
||||
```bash
|
||||
scripts/validate_json.py <file-path> [--type <type>]
|
||||
|
||||
# Examples:
|
||||
scripts/validate_json.py evals/evals.json
|
||||
scripts/validate_json.py run-1/grading.json --type grading
|
||||
```
|
||||
|
||||
The validator infers the type from the filename when possible.
|
||||
|
||||
---
|
||||
|
||||
## evals.json
|
||||
|
||||
Defines the evals for a skill. Located at `evals/evals.json` within the skill directory.
|
||||
|
||||
```json
|
||||
{
|
||||
"skill_name": "example-skill",
|
||||
"evals": [
|
||||
{
|
||||
"id": 1,
|
||||
"prompt": "User's example prompt",
|
||||
"expected_output": "Description of expected result",
|
||||
"files": ["evals/files/sample1.pdf"],
|
||||
"expectations": [
|
||||
"The output includes X",
|
||||
"The skill used script Y"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `skill_name`: Name matching the skill's frontmatter
|
||||
- `evals[].id`: Unique integer identifier
|
||||
- `evals[].prompt`: The task to execute
|
||||
- `evals[].expected_output`: Human-readable description of success
|
||||
- `evals[].files`: Optional list of input file paths (relative to skill root)
|
||||
- `evals[].expectations`: List of verifiable statements
|
||||
|
||||
---
|
||||
|
||||
## history.json
|
||||
|
||||
Tracks version progression in Improve mode. Located at workspace root.
|
||||
|
||||
```json
|
||||
{
|
||||
"started_at": "2026-01-15T10:30:00Z",
|
||||
"skill_name": "pdf",
|
||||
"current_best": "v2",
|
||||
"iterations": [
|
||||
{
|
||||
"version": "v0",
|
||||
"parent": null,
|
||||
"expectation_pass_rate": 0.65,
|
||||
"grading_result": "baseline",
|
||||
"is_current_best": false
|
||||
},
|
||||
{
|
||||
"version": "v1",
|
||||
"parent": "v0",
|
||||
"expectation_pass_rate": 0.75,
|
||||
"grading_result": "won",
|
||||
"is_current_best": false
|
||||
},
|
||||
{
|
||||
"version": "v2",
|
||||
"parent": "v1",
|
||||
"expectation_pass_rate": 0.85,
|
||||
"grading_result": "won",
|
||||
"is_current_best": true
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `started_at`: ISO timestamp of when improvement started
|
||||
- `skill_name`: Name of the skill being improved
|
||||
- `current_best`: Version identifier of the best performer
|
||||
- `iterations[].version`: Version identifier (v0, v1, ...)
|
||||
- `iterations[].parent`: Parent version this was derived from
|
||||
- `iterations[].expectation_pass_rate`: Pass rate from grading
|
||||
- `iterations[].grading_result`: "baseline", "won", "lost", or "tie"
|
||||
- `iterations[].is_current_best`: Whether this is the current best version
|
||||
|
||||
---
|
||||
|
||||
## grading.json
|
||||
|
||||
Output from the grader agent. Located at `<run-dir>/grading.json`.
|
||||
|
||||
```json
|
||||
{
|
||||
"expectations": [
|
||||
{
|
||||
"text": "The output includes the name 'John Smith'",
|
||||
"passed": true,
|
||||
"evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
|
||||
},
|
||||
{
|
||||
"text": "The spreadsheet has a SUM formula in cell B10",
|
||||
"passed": false,
|
||||
"evidence": "No spreadsheet was created. The output was a text file."
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"passed": 2,
|
||||
"failed": 1,
|
||||
"total": 3,
|
||||
"pass_rate": 0.67
|
||||
},
|
||||
"execution_metrics": {
|
||||
"tool_calls": {
|
||||
"Read": 5,
|
||||
"Write": 2,
|
||||
"Bash": 8
|
||||
},
|
||||
"total_tool_calls": 15,
|
||||
"total_steps": 6,
|
||||
"errors_encountered": 0,
|
||||
"output_chars": 12450,
|
||||
"transcript_chars": 3200
|
||||
},
|
||||
"timing": {
|
||||
"executor_duration_seconds": 165.0,
|
||||
"grader_duration_seconds": 26.0,
|
||||
"total_duration_seconds": 191.0
|
||||
},
|
||||
"claims": [
|
||||
{
|
||||
"claim": "The form has 12 fillable fields",
|
||||
"type": "factual",
|
||||
"verified": true,
|
||||
"evidence": "Counted 12 fields in field_info.json"
|
||||
}
|
||||
],
|
||||
"user_notes_summary": {
|
||||
"uncertainties": ["Used 2023 data, may be stale"],
|
||||
"needs_review": [],
|
||||
"workarounds": ["Fell back to text overlay for non-fillable fields"]
|
||||
},
|
||||
"eval_feedback": {
|
||||
"suggestions": [
|
||||
{
|
||||
"assertion": "The output includes the name 'John Smith'",
|
||||
"reason": "A hallucinated document that mentions the name would also pass"
|
||||
}
|
||||
],
|
||||
"overall": "Assertions check presence but not correctness."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `expectations[]`: Graded expectations with evidence
|
||||
- `summary`: Aggregate pass/fail counts
|
||||
- `execution_metrics`: Tool usage and output size (from executor's metrics.json)
|
||||
- `timing`: Wall clock timing (from timing.json)
|
||||
- `claims`: Extracted and verified claims from the output
|
||||
- `user_notes_summary`: Issues flagged by the executor
|
||||
- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising
|
||||
|
||||
---
|
||||
|
||||
## metrics.json
|
||||
|
||||
Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
|
||||
|
||||
```json
|
||||
{
|
||||
"tool_calls": {
|
||||
"Read": 5,
|
||||
"Write": 2,
|
||||
"Bash": 8,
|
||||
"Edit": 1,
|
||||
"Glob": 2,
|
||||
"Grep": 0
|
||||
},
|
||||
"total_tool_calls": 18,
|
||||
"total_steps": 6,
|
||||
"files_created": ["filled_form.pdf", "field_values.json"],
|
||||
"errors_encountered": 0,
|
||||
"output_chars": 12450,
|
||||
"transcript_chars": 3200
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `tool_calls`: Count per tool type
|
||||
- `total_tool_calls`: Sum of all tool calls
|
||||
- `total_steps`: Number of major execution steps
|
||||
- `files_created`: List of output files created
|
||||
- `errors_encountered`: Number of errors during execution
|
||||
- `output_chars`: Total character count of output files
|
||||
- `transcript_chars`: Character count of transcript
|
||||
|
||||
---
|
||||
|
||||
## timing.json
|
||||
|
||||
Wall clock timing for a run. Located at `<run-dir>/timing.json`.
|
||||
|
||||
```json
|
||||
{
|
||||
"executor_start": "2026-01-15T10:30:00Z",
|
||||
"executor_end": "2026-01-15T10:32:45Z",
|
||||
"executor_duration_seconds": 165.0,
|
||||
"grader_start": "2026-01-15T10:32:46Z",
|
||||
"grader_end": "2026-01-15T10:33:12Z",
|
||||
"grader_duration_seconds": 26.0,
|
||||
"total_duration_seconds": 191.0
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## benchmark.json
|
||||
|
||||
Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
|
||||
|
||||
```json
|
||||
{
|
||||
"metadata": {
|
||||
"skill_name": "pdf",
|
||||
"skill_path": "/path/to/pdf",
|
||||
"executor_model": "claude-sonnet-4-20250514",
|
||||
"analyzer_model": "most-capable-model",
|
||||
"timestamp": "2026-01-15T10:30:00Z",
|
||||
"evals_run": [1, 2, 3],
|
||||
"runs_per_configuration": 3
|
||||
},
|
||||
|
||||
"runs": [
|
||||
{
|
||||
"eval_id": 1,
|
||||
"configuration": "with_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 0.85,
|
||||
"passed": 6,
|
||||
"failed": 1,
|
||||
"total": 7,
|
||||
"time_seconds": 42.5,
|
||||
"tokens": 3800,
|
||||
"tool_calls": 18,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [
|
||||
{"text": "...", "passed": true, "evidence": "..."}
|
||||
],
|
||||
"notes": [
|
||||
"Used 2023 data, may be stale",
|
||||
"Fell back to text overlay for non-fillable fields"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
"run_summary": {
|
||||
"with_skill": {
|
||||
"pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
|
||||
"time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
|
||||
"tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
|
||||
},
|
||||
"without_skill": {
|
||||
"pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
|
||||
"time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
|
||||
"tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
|
||||
},
|
||||
"delta": {
|
||||
"pass_rate": "+0.50",
|
||||
"time_seconds": "+13.0",
|
||||
"tokens": "+1700"
|
||||
}
|
||||
},
|
||||
|
||||
"notes": [
|
||||
"Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
|
||||
"Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent",
|
||||
"Without-skill runs consistently fail on table extraction expectations",
|
||||
"Skill adds 13s average execution time but improves pass rate by 50%"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `metadata`: Information about the benchmark run
|
||||
- `runs[]`: Individual run results with expectations and notes
|
||||
- `run_summary`: Statistical aggregates per configuration
|
||||
- `notes`: Freeform observations from the analyzer
|
||||
|
||||
---
|
||||
|
||||
## comparison.json
|
||||
|
||||
Output from blind comparator. Located at `<grading-dir>/comparison-N.json`.
|
||||
|
||||
```json
|
||||
{
|
||||
"winner": "A",
|
||||
"reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
|
||||
"rubric": {
|
||||
"A": {
|
||||
"content": {
|
||||
"correctness": 5,
|
||||
"completeness": 5,
|
||||
"accuracy": 4
|
||||
},
|
||||
"structure": {
|
||||
"organization": 4,
|
||||
"formatting": 5,
|
||||
"usability": 4
|
||||
},
|
||||
"content_score": 4.7,
|
||||
"structure_score": 4.3,
|
||||
"overall_score": 9.0
|
||||
},
|
||||
"B": {
|
||||
"content": {
|
||||
"correctness": 3,
|
||||
"completeness": 2,
|
||||
"accuracy": 3
|
||||
},
|
||||
"structure": {
|
||||
"organization": 3,
|
||||
"formatting": 2,
|
||||
"usability": 3
|
||||
},
|
||||
"content_score": 2.7,
|
||||
"structure_score": 2.7,
|
||||
"overall_score": 5.4
|
||||
}
|
||||
},
|
||||
"output_quality": {
|
||||
"A": {
|
||||
"score": 9,
|
||||
"strengths": ["Complete solution", "Well-formatted", "All fields present"],
|
||||
"weaknesses": ["Minor style inconsistency in header"]
|
||||
},
|
||||
"B": {
|
||||
"score": 5,
|
||||
"strengths": ["Readable output", "Correct basic structure"],
|
||||
"weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
|
||||
}
|
||||
},
|
||||
"expectation_results": {
|
||||
"A": {
|
||||
"passed": 4,
|
||||
"total": 5,
|
||||
"pass_rate": 0.80,
|
||||
"details": [
|
||||
{"text": "Output includes name", "passed": true}
|
||||
]
|
||||
},
|
||||
"B": {
|
||||
"passed": 3,
|
||||
"total": 5,
|
||||
"pass_rate": 0.60,
|
||||
"details": [
|
||||
{"text": "Output includes name", "passed": true}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## analysis.json
|
||||
|
||||
Output from post-hoc analyzer. Located at `<grading-dir>/analysis.json`.
|
||||
|
||||
```json
|
||||
{
|
||||
"comparison_summary": {
|
||||
"winner": "A",
|
||||
"winner_skill": "path/to/winner/skill",
|
||||
"loser_skill": "path/to/loser/skill",
|
||||
"comparator_reasoning": "Brief summary of why comparator chose winner"
|
||||
},
|
||||
"winner_strengths": [
|
||||
"Clear step-by-step instructions for handling multi-page documents",
|
||||
"Included validation script that caught formatting errors"
|
||||
],
|
||||
"loser_weaknesses": [
|
||||
"Vague instruction 'process the document appropriately' led to inconsistent behavior",
|
||||
"No script for validation, agent had to improvise"
|
||||
],
|
||||
"instruction_following": {
|
||||
"winner": {
|
||||
"score": 9,
|
||||
"issues": ["Minor: skipped optional logging step"]
|
||||
},
|
||||
"loser": {
|
||||
"score": 6,
|
||||
"issues": [
|
||||
"Did not use the skill's formatting template",
|
||||
"Invented own approach instead of following step 3"
|
||||
]
|
||||
}
|
||||
},
|
||||
"improvement_suggestions": [
|
||||
{
|
||||
"priority": "high",
|
||||
"category": "instructions",
|
||||
"suggestion": "Replace 'process the document appropriately' with explicit steps",
|
||||
"expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
|
||||
}
|
||||
],
|
||||
"transcript_insights": {
|
||||
"winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script",
|
||||
"loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods"
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -1,351 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Aggregate individual run results into benchmark summary statistics.
|
||||
|
||||
Reads grading.json files from run directories and produces:
|
||||
- run_summary with mean, stddev, min, max for each metric
|
||||
- delta between with_skill and without_skill configurations
|
||||
|
||||
Usage:
|
||||
python aggregate_benchmark.py <benchmark_dir>
|
||||
|
||||
Example:
|
||||
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
|
||||
|
||||
The script expects this directory structure:
|
||||
<benchmark_dir>/
|
||||
└── runs/
|
||||
└── eval-N/
|
||||
├── with_skill/
|
||||
│ ├── run-1/grading.json
|
||||
│ ├── run-2/grading.json
|
||||
│ └── run-3/grading.json
|
||||
└── without_skill/
|
||||
├── run-1/grading.json
|
||||
├── run-2/grading.json
|
||||
└── run-3/grading.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def calculate_stats(values: list[float]) -> dict:
    """Summarize *values* as mean, sample stddev, min, and max.

    All four figures are rounded to 4 decimal places. An empty input
    yields all zeros rather than raising.
    """
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    average = sum(values) / count

    # Sample standard deviation (n-1 denominator); undefined for one value.
    spread = 0.0
    if count > 1:
        spread = math.sqrt(sum((v - average) ** 2 for v in values) / (count - 1))

    return {
        "mean": round(average, 4),
        "stddev": round(spread, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4),
    }
|
||||
|
||||
|
||||
def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Expects <benchmark_dir>/runs/eval-N/{with_skill,without_skill}/run-M/grading.json.

    Returns dict with structure:
    {
        "with_skill": [
            {"eval_id": 1, "run_number": 1, "pass_rate": 0.85, ...},
            ...
        ],
        "without_skill": [...]
    }

    Runs with a missing or unparseable grading.json are skipped with a
    warning, as are eval-*/run-* directories whose suffix is not an integer
    (the original code raised ValueError on e.g. a stray "eval-old" dir).
    """
    runs_dir = benchmark_dir / "runs"
    results = {"with_skill": [], "without_skill": []}

    if not runs_dir.exists():
        print(f"Runs directory not found: {runs_dir}")
        return results

    for eval_dir in sorted(runs_dir.glob("eval-*")):
        # Guard against non-numeric directory names matching the glob.
        try:
            eval_id = int(eval_dir.name.split("-")[1])
        except (IndexError, ValueError):
            print(f"Warning: skipping directory with non-numeric id: {eval_dir}")
            continue

        for config in ["with_skill", "without_skill"]:
            config_dir = eval_dir / config

            if not config_dir.exists():
                continue

            for run_dir in sorted(config_dir.glob("run-*")):
                try:
                    run_number = int(run_dir.name.split("-")[1])
                except (IndexError, ValueError):
                    print(f"Warning: skipping directory with non-numeric id: {run_dir}")
                    continue

                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue

                # Core pass/fail metrics (look the summary dict up once).
                summary = grading.get("summary", {})
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                }

                # Wall-clock timing, if the grader recorded it.
                timing = grading.get("timing", {})
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)

                # Execution metrics, if available.
                metrics = grading.get("execution_metrics", {})
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                result["tokens"] = metrics.get("output_chars", 0)  # Placeholder
                result["errors"] = metrics.get("errors_encountered", 0)

                # Graded expectations pass through unchanged.
                result["expectations"] = grading.get("expectations", [])

                # Flatten executor-flagged notes into a single list.
                notes_summary = grading.get("user_notes_summary", {})
                notes = []
                notes.extend(notes_summary.get("uncertainties", []))
                notes.extend(notes_summary.get("needs_review", []))
                notes.extend(notes_summary.get("workarounds", []))
                result["notes"] = notes

                results[config].append(result)

    return results
|
||||
|
||||
|
||||
def aggregate_results(results: dict) -> dict:
    """
    Aggregate run results into summary statistics.

    Returns a run_summary dict with per-configuration stats for pass_rate,
    time_seconds, and tokens, plus a "delta" entry holding signed-string
    differences (with_skill minus without_skill) of the means.
    """
    run_summary = {}

    for config in ["with_skill", "without_skill"]:
        runs = results.get(config, [])

        if not runs:
            # No data for this configuration: emit all-zero placeholders.
            run_summary[config] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }
            continue

        run_summary[config] = {
            "pass_rate": calculate_stats([r["pass_rate"] for r in runs]),
            "time_seconds": calculate_stats([r["time_seconds"] for r in runs]),
            "tokens": calculate_stats([r.get("tokens", 0) for r in runs]),
        }

    def mean_of(config: str, metric: str):
        # Mean for one metric of one configuration, tolerating missing keys.
        return run_summary.get(config, {}).get(metric, {}).get("mean", 0)

    run_summary["delta"] = {
        "pass_rate": f"{mean_of('with_skill', 'pass_rate') - mean_of('without_skill', 'pass_rate'):+.2f}",
        "time_seconds": f"{mean_of('with_skill', 'time_seconds') - mean_of('without_skill', 'time_seconds'):+.1f}",
        "tokens": f"{mean_of('with_skill', 'tokens') - mean_of('without_skill', 'tokens'):+.0f}",
    }

    return run_summary
|
||||
|
||||
|
||||
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json data from the run results on disk.

    Loads grading results via load_run_results, aggregates them via
    aggregate_results, and assembles the benchmark document (metadata,
    flat runs array, run_summary, and an empty notes list for the analyzer).
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Flatten per-configuration results into the benchmark "runs" array.
    runs = [
        {
            "eval_id": result["eval_id"],
            "configuration": config,
            "run_number": result["run_number"],
            "result": {
                "pass_rate": result["pass_rate"],
                "passed": result["passed"],
                "failed": result["failed"],
                "total": result["total"],
                "time_seconds": result["time_seconds"],
                "tokens": result.get("tokens", 0),
                "tool_calls": result.get("tool_calls", 0),
                "errors": result.get("errors", 0),
            },
            "expectations": result["expectations"],
            "notes": result["notes"],
        }
        for config in ["with_skill", "without_skill"]
        for result in results.get(config, [])
    ]

    # Every eval id that appears in any configuration, ascending.
    eval_ids = sorted({r["eval_id"] for config_runs in results.values() for r in config_runs})

    return {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": 3
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }
|
||||
|
||||
|
||||
def generate_markdown(benchmark: dict) -> str:
|
||||
"""Generate human-readable benchmark.md from benchmark data."""
|
||||
metadata = benchmark["metadata"]
|
||||
run_summary = benchmark["run_summary"]
|
||||
|
||||
lines = [
|
||||
f"# Skill Benchmark: {metadata['skill_name']}",
|
||||
"",
|
||||
f"**Model**: {metadata['executor_model']}",
|
||||
f"**Date**: {metadata['timestamp']}",
|
||||
f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)",
|
||||
"",
|
||||
"## Summary",
|
||||
"",
|
||||
"| Metric | With Skill | Without Skill | Delta |",
|
||||
"|--------|------------|---------------|-------|",
|
||||
]
|
||||
|
||||
# Format pass rate
|
||||
with_pr = run_summary["with_skill"]["pass_rate"]
|
||||
without_pr = run_summary["without_skill"]["pass_rate"]
|
||||
delta_pr = run_summary["delta"]["pass_rate"]
|
||||
lines.append(f"| Pass Rate | {with_pr['mean']*100:.0f}% ± {with_pr['stddev']*100:.0f}% | {without_pr['mean']*100:.0f}% ± {without_pr['stddev']*100:.0f}% | {delta_pr} |")
|
||||
|
||||
# Format time
|
||||
with_time = run_summary["with_skill"]["time_seconds"]
|
||||
without_time = run_summary["without_skill"]["time_seconds"]
|
||||
delta_time = run_summary["delta"]["time_seconds"]
|
||||
lines.append(f"| Time | {with_time['mean']:.1f}s ± {with_time['stddev']:.1f}s | {without_time['mean']:.1f}s ± {without_time['stddev']:.1f}s | {delta_time}s |")
|
||||
|
||||
# Format tokens
|
||||
with_tokens = run_summary["with_skill"]["tokens"]
|
||||
without_tokens = run_summary["without_skill"]["tokens"]
|
||||
delta_tokens = run_summary["delta"]["tokens"]
|
||||
lines.append(f"| Tokens | {with_tokens['mean']:.0f} ± {with_tokens['stddev']:.0f} | {without_tokens['mean']:.0f} ± {without_tokens['stddev']:.0f} | {delta_tokens} |")
|
||||
|
||||
# Notes section
|
||||
if benchmark.get("notes"):
|
||||
lines.extend([
|
||||
"",
|
||||
"## Notes",
|
||||
""
|
||||
])
|
||||
for note in benchmark["notes"]:
|
||||
lines.append(f"- {note}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: aggregate run results and write benchmark.json / benchmark.md."""
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument("benchmark_dir", type=Path, help="Path to the benchmark directory")
    parser.add_argument("--skill-name", default="", help="Name of the skill being benchmarked")
    parser.add_argument("--skill-path", default="", help="Path to the skill being benchmarked")
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)",
    )
    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # benchmark.md always sits next to benchmark.json, even under --output.
    json_path = args.output or (args.benchmark_dir / "benchmark.json")
    md_path = json_path.with_suffix(".md")

    with open(json_path, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {json_path}")

    with open(md_path, "w") as f:
        f.write(generate_markdown(benchmark))
    print(f"Generated: {md_path}")

    # Console summary of the headline metric.
    summary = benchmark["run_summary"]
    with_pr = summary["with_skill"]["pass_rate"]["mean"]
    without_pr = summary["without_skill"]["pass_rate"]["mean"]
    delta = summary["delta"]["pass_rate"]

    print(f"\nSummary:")
    print(f"  With skill: {with_pr*100:.1f}% pass rate")
    print(f"  Without skill: {without_pr*100:.1f}% pass rate")
    print(f"  Delta: {delta}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,209 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Copy a skill directory with metadata tracking.
|
||||
|
||||
Creates a copy of a skill directory and adds a META.yaml file
|
||||
to track lineage, changes, and performance metrics.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def create_meta_yaml(
|
||||
dest: Path,
|
||||
parent: str | None,
|
||||
changes: str | None,
|
||||
score: float | None,
|
||||
iteration: int | None,
|
||||
) -> None:
|
||||
"""Create META.yaml file in the destination directory."""
|
||||
created_at = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
# Build YAML content manually to avoid external dependencies
|
||||
lines = ["# Skill iteration metadata", ""]
|
||||
|
||||
# Helper to format YAML values
|
||||
def yaml_value(val):
|
||||
if val is None:
|
||||
return "null"
|
||||
if isinstance(val, bool):
|
||||
return "true" if val else "false"
|
||||
if isinstance(val, (int, float)):
|
||||
return str(val)
|
||||
if isinstance(val, str):
|
||||
# Quote strings that might be ambiguous
|
||||
if val in ("null", "true", "false") or val.startswith(("'", '"', "[", "{")):
|
||||
return f'"{val}"'
|
||||
# Quote strings with special characters
|
||||
if any(c in val for c in (":", "#", "\n", '"', "'")):
|
||||
escaped = val.replace("\\", "\\\\").replace('"', '\\"')
|
||||
return f'"{escaped}"'
|
||||
return val
|
||||
return str(val)
|
||||
|
||||
lines.append(f"parent: {yaml_value(parent)}")
|
||||
lines.append(f"changes: {yaml_value(changes)}")
|
||||
lines.append(f"score: {yaml_value(score)}")
|
||||
lines.append(f"iteration: {yaml_value(iteration)}")
|
||||
lines.append(f"created_at: {yaml_value(created_at)}")
|
||||
lines.append("")
|
||||
|
||||
meta_path = dest / "META.yaml"
|
||||
meta_path.write_text("\n".join(lines))
|
||||
|
||||
|
||||
def copy_skill(
    source: Path,
    dest: Path,
    parent: str | None = None,
    changes: str | None = None,
    score: float | None = None,
    iteration: int | None = None,
) -> None:
    """
    Copy a skill directory into a new version directory.

    The destination is laid out as:
    - skill/        : the skill files, copied from source
    - runs/         : populated later by the executor (run-1/, run-2/, run-3/,
                      each with transcript.md, outputs/, evaluation.json)
    - improvements/ : only for non-baseline versions (iteration > 0)
    - META.yaml     : version metadata (parent, changes, score, iteration)

    Args:
        source: Source skill directory (or a prior version's skill/ subdir).
        dest: Destination version directory (e.g. workspace/v1).
        parent: Name/path of the parent skill iteration.
        changes: Description of changes from parent.
        score: Evaluation score for this iteration.
        iteration: Iteration number.

    Raises:
        FileNotFoundError: if source does not exist.
        ValueError: if source is not a directory.
        FileExistsError: if dest already exists.
    """
    src = Path(source).resolve()
    version_dir = Path(dest).resolve()

    # Validate inputs before touching the filesystem.
    if not src.exists():
        raise FileNotFoundError(f"Source directory does not exist: {src}")
    if not src.is_dir():
        raise ValueError(f"Source must be a directory: {src}")
    if version_dir.exists():
        raise FileExistsError(f"Destination already exists: {version_dir}")

    # Lay out the version directory skeleton.
    version_dir.mkdir(parents=True)
    skill_dir = version_dir / "skill"
    (version_dir / "runs").mkdir()

    # Baseline (v0) versions get no improvements/ directory.
    is_derived = iteration is not None and iteration > 0
    if is_derived:
        (version_dir / "improvements").mkdir()

    # Copy the skill files into the skill/ subdirectory.
    shutil.copytree(src, skill_dir)

    # Record lineage metadata at the version root.
    create_meta_yaml(version_dir, parent, changes, score, iteration)

    print(f"Copied skill from {src} to {skill_dir}")
    print(f"Created version directory structure at {version_dir}")
    print(f" - skill/ : Skill files")
    print(f" - runs/ : For execution runs (run-1/, run-2/, run-3/)")
    if is_derived:
        print(f" - improvements/ : Improvement suggestions")
    print(f" - META.yaml : Version metadata")
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and copy a skill into a new version directory."""
    parser = argparse.ArgumentParser(
        description="Copy a skill directory with metadata tracking and version structure.",
        # RawDescriptionHelpFormatter preserves the hand-formatted epilog
        # (examples and directory diagram) verbatim in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Create baseline v0 from an existing skill
  python copy_skill.py ./skills/public/pdf ./skill_iterations/v0 --iteration 0

  # Create v1 from v0's skill directory
  python copy_skill.py ./skill_iterations/v0/skill ./skill_iterations/v1 \\
      --parent v0 \\
      --changes "Added error handling for empty cells" \\
      --iteration 1

  # Create v2 with score from evaluation
  python copy_skill.py ./skill_iterations/v1/skill ./skill_iterations/v2 \\
      --parent v1 \\
      --changes "Improved coordinate guidance" \\
      --score 7.5 \\
      --iteration 2

Output structure:
  dest/
  ├── META.yaml          # Version metadata
  ├── skill/             # The actual skill files
  ├── runs/              # Execution runs (created by executor)
  │   ├── run-1/
  │   │   ├── transcript.md
  │   │   ├── outputs/
  │   │   └── evaluation.json
  │   ├── run-2/
  │   └── run-3/
  └── improvements/      # Improvement suggestions (v1+)
""",
    )

    parser.add_argument("source", type=Path, help="Source skill directory to copy")

    parser.add_argument("dest", type=Path, help="Destination path for the copy")

    parser.add_argument(
        "--parent",
        type=str,
        default=None,
        help="Name or path of the parent skill iteration",
    )

    parser.add_argument(
        "--changes",
        type=str,
        default=None,
        help="Description of changes from the parent version",
    )

    parser.add_argument(
        "--score",
        type=float,
        default=None,
        help="Evaluation score for this iteration (e.g., 7.5)",
    )

    parser.add_argument(
        "--iteration",
        type=int,
        default=None,
        help="Iteration number (e.g., 1, 2, 3)",
    )

    args = parser.parse_args()

    try:
        copy_skill(
            source=args.source,
            dest=args.dest,
            parent=args.parent,
            changes=args.changes,
            score=args.score,
            iteration=args.iteration,
        )
    except (FileNotFoundError, FileExistsError, ValueError) as e:
        # Surface copy failures as a clean argparse error (usage + message,
        # non-zero exit) instead of an unhandled traceback.
        parser.error(str(e))


if __name__ == "__main__":
    main()
|
||||
@@ -1,311 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Initialize JSON files with the correct structure for skill-creator-edge.
|
||||
|
||||
Creates template JSON files that can be filled in.
|
||||
|
||||
Usage:
|
||||
python init_json.py <type> <output_path>
|
||||
|
||||
Examples:
|
||||
python init_json.py evals evals/evals.json
|
||||
python init_json.py grading run-1/grading.json
|
||||
python init_json.py benchmark benchmarks/2026-01-15/benchmark.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
TEMPLATES = {
|
||||
"evals": {
|
||||
"skill_name": "<skill-name>",
|
||||
"evals": [
|
||||
{
|
||||
"id": 1,
|
||||
"prompt": "Example task prompt",
|
||||
"expected_output": "Description of expected result",
|
||||
"files": [],
|
||||
"expectations": [
|
||||
"The output includes X",
|
||||
"The skill correctly handles Y"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"grading": {
|
||||
"expectations": [
|
||||
{
|
||||
"text": "Example expectation",
|
||||
"passed": True,
|
||||
"evidence": "Found in transcript: ..."
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"passed": 1,
|
||||
"failed": 0,
|
||||
"total": 1,
|
||||
"pass_rate": 1.0
|
||||
},
|
||||
"execution_metrics": {
|
||||
"tool_calls": {
|
||||
"Read": 0,
|
||||
"Write": 0,
|
||||
"Bash": 0,
|
||||
"Edit": 0,
|
||||
"Glob": 0,
|
||||
"Grep": 0
|
||||
},
|
||||
"total_tool_calls": 0,
|
||||
"total_steps": 0,
|
||||
"errors_encountered": 0,
|
||||
"output_chars": 0,
|
||||
"transcript_chars": 0
|
||||
},
|
||||
"timing": {
|
||||
"executor_duration_seconds": 0.0,
|
||||
"grader_duration_seconds": 0.0,
|
||||
"total_duration_seconds": 0.0
|
||||
},
|
||||
"claims": [],
|
||||
"user_notes_summary": {
|
||||
"uncertainties": [],
|
||||
"needs_review": [],
|
||||
"workarounds": []
|
||||
}
|
||||
},
|
||||
|
||||
"benchmark": {
|
||||
"metadata": {
|
||||
"skill_name": "<skill-name>",
|
||||
"skill_path": "<path/to/skill>",
|
||||
"executor_model": "<model-name>",
|
||||
"analyzer_model": "<model-name>",
|
||||
"timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"evals_run": [1],
|
||||
"runs_per_configuration": 3
|
||||
},
|
||||
"runs": [
|
||||
{
|
||||
"eval_id": 1,
|
||||
"configuration": "with_skill",
|
||||
"run_number": 1,
|
||||
"result": {
|
||||
"pass_rate": 0.0,
|
||||
"passed": 0,
|
||||
"failed": 0,
|
||||
"total": 0,
|
||||
"time_seconds": 0.0,
|
||||
"tokens": 0,
|
||||
"tool_calls": 0,
|
||||
"errors": 0
|
||||
},
|
||||
"expectations": [],
|
||||
"notes": []
|
||||
}
|
||||
],
|
||||
"run_summary": {
|
||||
"with_skill": {
|
||||
"pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
|
||||
"time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
|
||||
"tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
|
||||
},
|
||||
"without_skill": {
|
||||
"pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
|
||||
"time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
|
||||
"tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
|
||||
},
|
||||
"delta": {
|
||||
"pass_rate": "+0.0",
|
||||
"time_seconds": "+0.0",
|
||||
"tokens": "+0"
|
||||
}
|
||||
},
|
||||
"notes": []
|
||||
},
|
||||
|
||||
"metrics": {
|
||||
"tool_calls": {
|
||||
"Read": 0,
|
||||
"Write": 0,
|
||||
"Bash": 0,
|
||||
"Edit": 0,
|
||||
"Glob": 0,
|
||||
"Grep": 0
|
||||
},
|
||||
"total_tool_calls": 0,
|
||||
"total_steps": 0,
|
||||
"files_created": [],
|
||||
"errors_encountered": 0,
|
||||
"output_chars": 0,
|
||||
"transcript_chars": 0
|
||||
},
|
||||
|
||||
"timing": {
|
||||
"executor_start": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"executor_end": "",
|
||||
"executor_duration_seconds": 0.0,
|
||||
"grader_start": "",
|
||||
"grader_end": "",
|
||||
"grader_duration_seconds": 0.0,
|
||||
"total_duration_seconds": 0.0
|
||||
},
|
||||
|
||||
"history": {
|
||||
"started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"skill_name": "<skill-name>",
|
||||
"current_best": "v0",
|
||||
"iterations": [
|
||||
{
|
||||
"version": "v0",
|
||||
"parent": None,
|
||||
"expectation_pass_rate": 0.0,
|
||||
"grading_result": "baseline",
|
||||
"is_current_best": True
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"comparison": {
|
||||
"winner": "A",
|
||||
"reasoning": "Explanation of why the winner was chosen",
|
||||
"rubric": {
|
||||
"A": {
|
||||
"content": {
|
||||
"correctness": 5,
|
||||
"completeness": 5,
|
||||
"accuracy": 5
|
||||
},
|
||||
"structure": {
|
||||
"organization": 5,
|
||||
"formatting": 5,
|
||||
"usability": 5
|
||||
},
|
||||
"content_score": 5.0,
|
||||
"structure_score": 5.0,
|
||||
"overall_score": 10.0
|
||||
},
|
||||
"B": {
|
||||
"content": {
|
||||
"correctness": 3,
|
||||
"completeness": 3,
|
||||
"accuracy": 3
|
||||
},
|
||||
"structure": {
|
||||
"organization": 3,
|
||||
"formatting": 3,
|
||||
"usability": 3
|
||||
},
|
||||
"content_score": 3.0,
|
||||
"structure_score": 3.0,
|
||||
"overall_score": 6.0
|
||||
}
|
||||
},
|
||||
"output_quality": {
|
||||
"A": {
|
||||
"score": 10,
|
||||
"strengths": [],
|
||||
"weaknesses": []
|
||||
},
|
||||
"B": {
|
||||
"score": 6,
|
||||
"strengths": [],
|
||||
"weaknesses": []
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"analysis": {
|
||||
"comparison_summary": {
|
||||
"winner": "A",
|
||||
"winner_skill": "<path/to/winner>",
|
||||
"loser_skill": "<path/to/loser>",
|
||||
"comparator_reasoning": "Summary of comparison"
|
||||
},
|
||||
"winner_strengths": [],
|
||||
"loser_weaknesses": [],
|
||||
"instruction_following": {
|
||||
"winner": {
|
||||
"score": 10,
|
||||
"issues": []
|
||||
},
|
||||
"loser": {
|
||||
"score": 5,
|
||||
"issues": []
|
||||
}
|
||||
},
|
||||
"improvement_suggestions": [
|
||||
{
|
||||
"priority": "high",
|
||||
"category": "instructions",
|
||||
"suggestion": "Specific improvement suggestion",
|
||||
"expected_impact": "Why this would help"
|
||||
}
|
||||
],
|
||||
"transcript_insights": {
|
||||
"winner_execution_pattern": "Description of how winner executed",
|
||||
"loser_execution_pattern": "Description of how loser executed"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def init_json(json_type: str, output_path: Path, force: bool = False) -> bool:
|
||||
"""
|
||||
Initialize a JSON file with the correct template structure.
|
||||
|
||||
Returns True on success, False on failure.
|
||||
"""
|
||||
if json_type not in TEMPLATES:
|
||||
print(f"Unknown type: {json_type}. Valid types: {list(TEMPLATES.keys())}")
|
||||
return False
|
||||
|
||||
if output_path.exists() and not force:
|
||||
print(f"File already exists: {output_path}")
|
||||
print("Use --force to overwrite")
|
||||
return False
|
||||
|
||||
# Create parent directories
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Write template
|
||||
template = TEMPLATES[json_type]
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(template, f, indent=2)
|
||||
|
||||
print(f"Created {json_type} template: {output_path}")
|
||||
return True
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Initialize JSON files with correct structure"
|
||||
)
|
||||
parser.add_argument(
|
||||
"type",
|
||||
choices=list(TEMPLATES.keys()),
|
||||
help="Type of JSON file to create"
|
||||
)
|
||||
parser.add_argument(
|
||||
"output",
|
||||
type=Path,
|
||||
help="Output path for the JSON file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force", "-f",
|
||||
action="store_true",
|
||||
help="Overwrite existing file"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
success = init_json(args.type, args.output, args.force)
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,303 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Skill Initializer - Creates a new skill from template
|
||||
|
||||
Usage:
|
||||
init_skill.py <skill-name> --path <path>
|
||||
|
||||
Examples:
|
||||
init_skill.py my-new-skill --path skills/public
|
||||
init_skill.py my-api-helper --path skills/private
|
||||
init_skill.py custom-skill --path /custom/location
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
SKILL_TEMPLATE = """---
|
||||
name: {skill_name}
|
||||
description: [TODO: Complete and informative explanation of what the skill does and when to use it. Include WHEN to use this skill - specific scenarios, file types, or tasks that trigger it.]
|
||||
---
|
||||
|
||||
# {skill_title}
|
||||
|
||||
## Overview
|
||||
|
||||
[TODO: 1-2 sentences explaining what this skill enables]
|
||||
|
||||
## Structuring This Skill
|
||||
|
||||
[TODO: Choose the structure that best fits this skill's purpose. Common patterns:
|
||||
|
||||
**1. Workflow-Based** (best for sequential processes)
|
||||
- Works well when there are clear step-by-step procedures
|
||||
- Example: DOCX skill with "Workflow Decision Tree" → "Reading" → "Creating" → "Editing"
|
||||
- Structure: ## Overview → ## Workflow Decision Tree → ## Step 1 → ## Step 2...
|
||||
|
||||
**2. Task-Based** (best for tool collections)
|
||||
- Works well when the skill offers different operations/capabilities
|
||||
- Example: PDF skill with "Quick Start" → "Merge PDFs" → "Split PDFs" → "Extract Text"
|
||||
- Structure: ## Overview → ## Quick Start → ## Task Category 1 → ## Task Category 2...
|
||||
|
||||
**3. Reference/Guidelines** (best for standards or specifications)
|
||||
- Works well for brand guidelines, coding standards, or requirements
|
||||
- Example: Brand styling with "Brand Guidelines" → "Colors" → "Typography" → "Features"
|
||||
- Structure: ## Overview → ## Guidelines → ## Specifications → ## Usage...
|
||||
|
||||
**4. Capabilities-Based** (best for integrated systems)
|
||||
- Works well when the skill provides multiple interrelated features
|
||||
- Example: Product Management with "Core Capabilities" → numbered capability list
|
||||
- Structure: ## Overview → ## Core Capabilities → ### 1. Feature → ### 2. Feature...
|
||||
|
||||
Patterns can be mixed and matched as needed. Most skills combine patterns (e.g., start with task-based, add workflow for complex operations).
|
||||
|
||||
Delete this entire "Structuring This Skill" section when done - it's just guidance.]
|
||||
|
||||
## [TODO: Replace with the first main section based on chosen structure]
|
||||
|
||||
[TODO: Add content here. See examples in existing skills:
|
||||
- Code samples for technical skills
|
||||
- Decision trees for complex workflows
|
||||
- Concrete examples with realistic user requests
|
||||
- References to scripts/templates/references as needed]
|
||||
|
||||
## Resources
|
||||
|
||||
This skill includes example resource directories that demonstrate how to organize different types of bundled resources:
|
||||
|
||||
### scripts/
|
||||
Executable code (Python/Bash/etc.) that can be run directly to perform specific operations.
|
||||
|
||||
**Examples from other skills:**
|
||||
- PDF skill: `fill_fillable_fields.py`, `extract_form_field_info.py` - utilities for PDF manipulation
|
||||
- DOCX skill: `document.py`, `utilities.py` - Python modules for document processing
|
||||
|
||||
**Appropriate for:** Python scripts, shell scripts, or any executable code that performs automation, data processing, or specific operations.
|
||||
|
||||
**Note:** Scripts may be executed without loading into context, but can still be read by Claude for patching or environment adjustments.
|
||||
|
||||
### references/
|
||||
Documentation and reference material intended to be loaded into context to inform Claude's process and thinking.
|
||||
|
||||
**Examples from other skills:**
|
||||
- Product management: `communication.md`, `context_building.md` - detailed workflow guides
|
||||
- BigQuery: API reference documentation and query examples
|
||||
- Finance: Schema documentation, company policies
|
||||
|
||||
**Appropriate for:** In-depth documentation, API references, database schemas, comprehensive guides, or any detailed information that Claude should reference while working.
|
||||
|
||||
### assets/
|
||||
Files not intended to be loaded into context, but rather used within the output Claude produces.
|
||||
|
||||
**Examples from other skills:**
|
||||
- Brand styling: PowerPoint template files (.pptx), logo files
|
||||
- Frontend builder: HTML/React boilerplate project directories
|
||||
- Typography: Font files (.ttf, .woff2)
|
||||
|
||||
**Appropriate for:** Templates, boilerplate code, document templates, images, icons, fonts, or any files meant to be copied or used in the final output.
|
||||
|
||||
---
|
||||
|
||||
**Any unneeded directories can be deleted.** Not every skill requires all three types of resources.
|
||||
"""
|
||||
|
||||
EXAMPLE_SCRIPT = '''#!/usr/bin/env python3
|
||||
"""
|
||||
Example helper script for {skill_name}
|
||||
|
||||
This is a placeholder script that can be executed directly.
|
||||
Replace with actual implementation or delete if not needed.
|
||||
|
||||
Example real scripts from other skills:
|
||||
- pdf/scripts/fill_fillable_fields.py - Fills PDF form fields
|
||||
- pdf/scripts/convert_pdf_to_images.py - Converts PDF pages to images
|
||||
"""
|
||||
|
||||
def main():
|
||||
print("This is an example script for {skill_name}")
|
||||
# TODO: Add actual script logic here
|
||||
# This could be data processing, file conversion, API calls, etc.
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
'''
|
||||
|
||||
EXAMPLE_REFERENCE = """# Reference Documentation for {skill_title}
|
||||
|
||||
This is a placeholder for detailed reference documentation.
|
||||
Replace with actual reference content or delete if not needed.
|
||||
|
||||
Example real reference docs from other skills:
|
||||
- product-management/references/communication.md - Comprehensive guide for status updates
|
||||
- product-management/references/context_building.md - Deep-dive on gathering context
|
||||
- bigquery/references/ - API references and query examples
|
||||
|
||||
## When Reference Docs Are Useful
|
||||
|
||||
Reference docs are ideal for:
|
||||
- Comprehensive API documentation
|
||||
- Detailed workflow guides
|
||||
- Complex multi-step processes
|
||||
- Information too lengthy for main SKILL.md
|
||||
- Content that's only needed for specific use cases
|
||||
|
||||
## Structure Suggestions
|
||||
|
||||
### API Reference Example
|
||||
- Overview
|
||||
- Authentication
|
||||
- Endpoints with examples
|
||||
- Error codes
|
||||
- Rate limits
|
||||
|
||||
### Workflow Guide Example
|
||||
- Prerequisites
|
||||
- Step-by-step instructions
|
||||
- Common patterns
|
||||
- Troubleshooting
|
||||
- Best practices
|
||||
"""
|
||||
|
||||
EXAMPLE_ASSET = """# Example Asset File
|
||||
|
||||
This placeholder represents where asset files would be stored.
|
||||
Replace with actual asset files (templates, images, fonts, etc.) or delete if not needed.
|
||||
|
||||
Asset files are NOT intended to be loaded into context, but rather used within
|
||||
the output Claude produces.
|
||||
|
||||
Example asset files from other skills:
|
||||
- Brand guidelines: logo.png, slides_template.pptx
|
||||
- Frontend builder: hello-world/ directory with HTML/React boilerplate
|
||||
- Typography: custom-font.ttf, font-family.woff2
|
||||
- Data: sample_data.csv, test_dataset.json
|
||||
|
||||
## Common Asset Types
|
||||
|
||||
- Templates: .pptx, .docx, boilerplate directories
|
||||
- Images: .png, .jpg, .svg, .gif
|
||||
- Fonts: .ttf, .otf, .woff, .woff2
|
||||
- Boilerplate code: Project directories, starter files
|
||||
- Icons: .ico, .svg
|
||||
- Data files: .csv, .json, .xml, .yaml
|
||||
|
||||
Note: This is a text placeholder. Actual assets can be any file type.
|
||||
"""
|
||||
|
||||
|
||||
def title_case_skill_name(skill_name):
|
||||
"""Convert hyphenated skill name to Title Case for display."""
|
||||
return ' '.join(word.capitalize() for word in skill_name.split('-'))
|
||||
|
||||
|
||||
def init_skill(skill_name, path):
|
||||
"""
|
||||
Initialize a new skill directory with template SKILL.md.
|
||||
|
||||
Args:
|
||||
skill_name: Name of the skill
|
||||
path: Path where the skill directory should be created
|
||||
|
||||
Returns:
|
||||
Path to created skill directory, or None if error
|
||||
"""
|
||||
# Determine skill directory path
|
||||
skill_dir = Path(path).resolve() / skill_name
|
||||
|
||||
# Check if directory already exists
|
||||
if skill_dir.exists():
|
||||
print(f"❌ Error: Skill directory already exists: {skill_dir}")
|
||||
return None
|
||||
|
||||
# Create skill directory
|
||||
try:
|
||||
skill_dir.mkdir(parents=True, exist_ok=False)
|
||||
print(f"✅ Created skill directory: {skill_dir}")
|
||||
except Exception as e:
|
||||
print(f"❌ Error creating directory: {e}")
|
||||
return None
|
||||
|
||||
# Create SKILL.md from template
|
||||
skill_title = title_case_skill_name(skill_name)
|
||||
skill_content = SKILL_TEMPLATE.format(
|
||||
skill_name=skill_name,
|
||||
skill_title=skill_title
|
||||
)
|
||||
|
||||
skill_md_path = skill_dir / 'SKILL.md'
|
||||
try:
|
||||
skill_md_path.write_text(skill_content)
|
||||
print("✅ Created SKILL.md")
|
||||
except Exception as e:
|
||||
print(f"❌ Error creating SKILL.md: {e}")
|
||||
return None
|
||||
|
||||
# Create resource directories with example files
|
||||
try:
|
||||
# Create scripts/ directory with example script
|
||||
scripts_dir = skill_dir / 'scripts'
|
||||
scripts_dir.mkdir(exist_ok=True)
|
||||
example_script = scripts_dir / 'example.py'
|
||||
example_script.write_text(EXAMPLE_SCRIPT.format(skill_name=skill_name))
|
||||
example_script.chmod(0o755)
|
||||
print("✅ Created scripts/example.py")
|
||||
|
||||
# Create references/ directory with example reference doc
|
||||
references_dir = skill_dir / 'references'
|
||||
references_dir.mkdir(exist_ok=True)
|
||||
example_reference = references_dir / 'api_reference.md'
|
||||
example_reference.write_text(EXAMPLE_REFERENCE.format(skill_title=skill_title))
|
||||
print("✅ Created references/api_reference.md")
|
||||
|
||||
# Create assets/ directory with example asset placeholder
|
||||
assets_dir = skill_dir / 'assets'
|
||||
assets_dir.mkdir(exist_ok=True)
|
||||
example_asset = assets_dir / 'example_asset.txt'
|
||||
example_asset.write_text(EXAMPLE_ASSET)
|
||||
print("✅ Created assets/example_asset.txt")
|
||||
except Exception as e:
|
||||
print(f"❌ Error creating resource directories: {e}")
|
||||
return None
|
||||
|
||||
# Print next steps
|
||||
print(f"\n✅ Skill '{skill_name}' initialized successfully at {skill_dir}")
|
||||
print("\nNext steps:")
|
||||
print("1. Edit SKILL.md to complete the TODO items and update the description")
|
||||
print("2. Customize or delete the example files in scripts/, references/, and assets/")
|
||||
print("3. Run the validator when ready to check the skill structure")
|
||||
|
||||
return skill_dir
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 4 or sys.argv[2] != '--path':
|
||||
print("Usage: init_skill.py <skill-name> --path <path>")
|
||||
print("\nSkill name requirements:")
|
||||
print(" - Kebab-case identifier (e.g., 'my-data-analyzer')")
|
||||
print(" - Lowercase letters, digits, and hyphens only")
|
||||
print(" - Max 64 characters")
|
||||
print(" - Must match directory name exactly")
|
||||
print("\nExamples:")
|
||||
print(" init_skill.py my-new-skill --path skills/public")
|
||||
print(" init_skill.py my-api-helper --path skills/private")
|
||||
print(" init_skill.py custom-skill --path /custom/location")
|
||||
sys.exit(1)
|
||||
|
||||
skill_name = sys.argv[1]
|
||||
path = sys.argv[3]
|
||||
|
||||
print(f"🚀 Initializing skill: {skill_name}")
|
||||
print(f" Location: {path}")
|
||||
print()
|
||||
|
||||
result = init_skill(skill_name, path)
|
||||
|
||||
if result:
|
||||
sys.exit(0)
|
||||
else:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,136 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Skill Packager - Creates a distributable .skill file of a skill folder
|
||||
|
||||
Usage:
|
||||
python utils/package_skill.py <path/to/skill-folder> [output-directory]
|
||||
|
||||
Example:
|
||||
python utils/package_skill.py skills/public/my-skill
|
||||
python utils/package_skill.py skills/public/my-skill ./dist
|
||||
"""
|
||||
|
||||
import fnmatch
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from quick_validate import validate_skill
|
||||
|
||||
# Patterns to exclude when packaging skills.
|
||||
EXCLUDE_DIRS = {"__pycache__", "node_modules"}
|
||||
EXCLUDE_GLOBS = {"*.pyc"}
|
||||
EXCLUDE_FILES = {".DS_Store"}
|
||||
# Directories excluded only at the skill root (not when nested deeper).
|
||||
ROOT_EXCLUDE_DIRS = {"evals"}
|
||||
|
||||
|
||||
def should_exclude(rel_path: Path) -> bool:
|
||||
"""Check if a path should be excluded from packaging."""
|
||||
parts = rel_path.parts
|
||||
if any(part in EXCLUDE_DIRS for part in parts):
|
||||
return True
|
||||
# rel_path is relative to skill_path.parent, so parts[0] is the skill
|
||||
# folder name and parts[1] (if present) is the first subdir.
|
||||
if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS:
|
||||
return True
|
||||
name = rel_path.name
|
||||
if name in EXCLUDE_FILES:
|
||||
return True
|
||||
return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS)
|
||||
|
||||
|
||||
def package_skill(skill_path, output_dir=None):
|
||||
"""
|
||||
Package a skill folder into a .skill file.
|
||||
|
||||
Args:
|
||||
skill_path: Path to the skill folder
|
||||
output_dir: Optional output directory for the .skill file (defaults to current directory)
|
||||
|
||||
Returns:
|
||||
Path to the created .skill file, or None if error
|
||||
"""
|
||||
skill_path = Path(skill_path).resolve()
|
||||
|
||||
# Validate skill folder exists
|
||||
if not skill_path.exists():
|
||||
print(f"❌ Error: Skill folder not found: {skill_path}")
|
||||
return None
|
||||
|
||||
if not skill_path.is_dir():
|
||||
print(f"❌ Error: Path is not a directory: {skill_path}")
|
||||
return None
|
||||
|
||||
# Validate SKILL.md exists
|
||||
skill_md = skill_path / "SKILL.md"
|
||||
if not skill_md.exists():
|
||||
print(f"❌ Error: SKILL.md not found in {skill_path}")
|
||||
return None
|
||||
|
||||
# Run validation before packaging
|
||||
print("🔍 Validating skill...")
|
||||
valid, message = validate_skill(skill_path)
|
||||
if not valid:
|
||||
print(f"❌ Validation failed: {message}")
|
||||
print(" Please fix the validation errors before packaging.")
|
||||
return None
|
||||
print(f"✅ {message}\n")
|
||||
|
||||
# Determine output location
|
||||
skill_name = skill_path.name
|
||||
if output_dir:
|
||||
output_path = Path(output_dir).resolve()
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
else:
|
||||
output_path = Path.cwd()
|
||||
|
||||
skill_filename = output_path / f"{skill_name}.skill"
|
||||
|
||||
# Create the .skill file (zip format)
|
||||
try:
|
||||
with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
||||
# Walk through the skill directory, excluding build artifacts
|
||||
for file_path in skill_path.rglob('*'):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
arcname = file_path.relative_to(skill_path.parent)
|
||||
if should_exclude(arcname):
|
||||
print(f" Skipped: {arcname}")
|
||||
continue
|
||||
zipf.write(file_path, arcname)
|
||||
print(f" Added: {arcname}")
|
||||
|
||||
print(f"\n✅ Successfully packaged skill to: {skill_filename}")
|
||||
return skill_filename
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error creating .skill file: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python utils/package_skill.py <path/to/skill-folder> [output-directory]")
|
||||
print("\nExample:")
|
||||
print(" python utils/package_skill.py skills/public/my-skill")
|
||||
print(" python utils/package_skill.py skills/public/my-skill ./dist")
|
||||
sys.exit(1)
|
||||
|
||||
skill_path = sys.argv[1]
|
||||
output_dir = sys.argv[2] if len(sys.argv) > 2 else None
|
||||
|
||||
print(f"📦 Packaging skill: {skill_path}")
|
||||
if output_dir:
|
||||
print(f" Output directory: {output_dir}")
|
||||
print()
|
||||
|
||||
result = package_skill(skill_path, output_dir)
|
||||
|
||||
if result:
|
||||
sys.exit(0)
|
||||
else:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user