Add skill-creator plugin

2026-03-20 23:43:07 +00:00 · 2026-02-17 17:02:51 -08:00
parent 261ce4fba4
commit 30975e61e3
20 changed files with 4879 additions and 0 deletions
--- a/plugins/skill-creator/skills/skill-creator/references/schemas.md
+++ b/plugins/skill-creator/skills/skill-creator/references/schemas.md
@@ -0,0 +1,438 @@
+# JSON Schemas
+
+This document defines the JSON schemas used by skill-creator.
+
+## Working with JSON Files
+
+### Initialize a new file with correct structure
+
+```bash
+scripts/init_json.py <type> <output-path>
+
+# Examples:
+scripts/init_json.py evals evals/evals.json
+scripts/init_json.py grading run-1/grading.json
+scripts/init_json.py benchmark benchmarks/2026-01-15/benchmark.json
+scripts/init_json.py metrics run-1/outputs/metrics.json
+```
+
+### Validate an existing file
+
+```bash
+scripts/validate_json.py <file-path> [--type <type>]
+
+# Examples:
+scripts/validate_json.py evals/evals.json
+scripts/validate_json.py run-1/grading.json --type grading
+```
+
+The validator infers the type from the filename when possible.
+
+---
+
+## evals.json
+
+Defines the evals for a skill. Located at `evals/evals.json` within the skill directory.
+
+```json
+{
+  "skill_name": "example-skill",
+  "evals": [
+    {
+      "id": 1,
+      "prompt": "User's example prompt",
+      "expected_output": "Description of expected result",
+      "files": ["evals/files/sample1.pdf"],
+      "expectations": [
+        "The output includes X",
+        "The skill used script Y"
+      ]
+    }
+  ]
+}
+```
+
+**Fields:**
+- `skill_name`: Name matching the skill's frontmatter
+- `evals[].id`: Unique integer identifier
+- `evals[].prompt`: The task to execute
+- `evals[].expected_output`: Human-readable description of success
+- `evals[].files`: Optional list of input file paths (relative to skill root)
+- `evals[].expectations`: List of verifiable statements
+
+---
+
+## history.json
+
+Tracks version progression in Improve mode. Located at workspace root.
+
+```json
+{
+  "started_at": "2026-01-15T10:30:00Z",
+  "skill_name": "pdf",
+  "current_best": "v2",
+  "iterations": [
+    {
+      "version": "v0",
+      "parent": null,
+      "expectation_pass_rate": 0.65,
+      "grading_result": "baseline",
+      "is_current_best": false
+    },
+    {
+      "version": "v1",
+      "parent": "v0",
+      "expectation_pass_rate": 0.75,
+      "grading_result": "won",
+      "is_current_best": false
+    },
+    {
+      "version": "v2",
+      "parent": "v1",
+      "expectation_pass_rate": 0.85,
+      "grading_result": "won",
+      "is_current_best": true
+    }
+  ]
+}
+```
+
+**Fields:**
+- `started_at`: ISO timestamp of when improvement started
+- `skill_name`: Name of the skill being improved
+- `current_best`: Version identifier of the best performer
+- `iterations[].version`: Version identifier (v0, v1, ...)
+- `iterations[].parent`: Parent version this was derived from (`null` for the baseline version)
+- `iterations[].expectation_pass_rate`: Pass rate from grading
+- `iterations[].grading_result`: "baseline", "won", "lost", or "tie"
+- `iterations[].is_current_best`: Whether this is the current best version
+
+---
+
+## grading.json
+
+Output from the grader agent. Located at `<run-dir>/grading.json`.
+
+```json
+{
+  "expectations": [
+    {
+      "text": "The output includes the name 'John Smith'",
+      "passed": true,
+      "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
+    },
+    {
+      "text": "The spreadsheet has a SUM formula in cell B10",
+      "passed": false,
+      "evidence": "No spreadsheet was created. The output was a text file."
+    }
+  ],
+  "summary": {
+    "passed": 1,
+    "failed": 1,
+    "total": 2,
+    "pass_rate": 0.50
+  },
+  "execution_metrics": {
+    "tool_calls": {
+      "Read": 5,
+      "Write": 2,
+      "Bash": 8
+    },
+    "total_tool_calls": 15,
+    "total_steps": 6,
+    "errors_encountered": 0,
+    "output_chars": 12450,
+    "transcript_chars": 3200
+  },
+  "timing": {
+    "executor_duration_seconds": 165.0,
+    "grader_duration_seconds": 26.0,
+    "total_duration_seconds": 191.0
+  },
+  "claims": [
+    {
+      "claim": "The form has 12 fillable fields",
+      "type": "factual",
+      "verified": true,
+      "evidence": "Counted 12 fields in field_info.json"
+    }
+  ],
+  "user_notes_summary": {
+    "uncertainties": ["Used 2023 data, may be stale"],
+    "needs_review": [],
+    "workarounds": ["Fell back to text overlay for non-fillable fields"]
+  },
+  "eval_feedback": {
+    "suggestions": [
+      {
+        "assertion": "The output includes the name 'John Smith'",
+        "reason": "A hallucinated document that mentions the name would also pass"
+      }
+    ],
+    "overall": "Assertions check presence but not correctness."
+  }
+}
+```
+
+**Fields:**
+- `expectations[]`: Graded expectations with evidence
+- `summary`: Aggregate pass/fail counts
+- `execution_metrics`: Tool usage and output size (from executor's metrics.json)
+- `timing`: Wall clock timing (from timing.json)
+- `claims`: Extracted and verified claims from the output
+- `user_notes_summary`: Issues flagged by the executor
+- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising
+
+---
+
+## metrics.json
+
+Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
+
+```json
+{
+  "tool_calls": {
+    "Read": 5,
+    "Write": 2,
+    "Bash": 8,
+    "Edit": 1,
+    "Glob": 2,
+    "Grep": 0
+  },
+  "total_tool_calls": 18,
+  "total_steps": 6,
+  "files_created": ["filled_form.pdf", "field_values.json"],
+  "errors_encountered": 0,
+  "output_chars": 12450,
+  "transcript_chars": 3200
+}
+```
+
+**Fields:**
+- `tool_calls`: Count per tool type
+- `total_tool_calls`: Sum of all tool calls
+- `total_steps`: Number of major execution steps
+- `files_created`: List of output files created
+- `errors_encountered`: Number of errors during execution
+- `output_chars`: Total character count of output files
+- `transcript_chars`: Character count of transcript
+
+---
+
+## timing.json
+
+Wall clock timing for a run. Located at `<run-dir>/timing.json`.
+
+```json
+{
+  "executor_start": "2026-01-15T10:30:00Z",
+  "executor_end": "2026-01-15T10:32:45Z",
+  "executor_duration_seconds": 165.0,
+  "grader_start": "2026-01-15T10:32:46Z",
+  "grader_end": "2026-01-15T10:33:12Z",
+  "grader_duration_seconds": 26.0,
+  "total_duration_seconds": 191.0
+}
+```
+
+---
+
+## benchmark.json
+
+Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
+
+```json
+{
+  "metadata": {
+    "skill_name": "pdf",
+    "skill_path": "/path/to/pdf",
+    "executor_model": "claude-sonnet-4-20250514",
+    "analyzer_model": "most-capable-model",
+    "timestamp": "2026-01-15T10:30:00Z",
+    "evals_run": [1, 2, 3],
+    "runs_per_configuration": 3
+  },
+
+  "runs": [
+    {
+      "eval_id": 1,
+      "configuration": "with_skill",
+      "run_number": 1,
+      "result": {
+        "pass_rate": 0.86,
+        "passed": 6,
+        "failed": 1,
+        "total": 7,
+        "time_seconds": 42.5,
+        "tokens": 3800,
+        "tool_calls": 18,
+        "errors": 0
+      },
+      "expectations": [
+        {"text": "...", "passed": true, "evidence": "..."}
+      ],
+      "notes": [
+        "Used 2023 data, may be stale",
+        "Fell back to text overlay for non-fillable fields"
+      ]
+    }
+  ],
+
+  "run_summary": {
+    "with_skill": {
+      "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
+      "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
+      "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
+    },
+    "without_skill": {
+      "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
+      "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
+      "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
+    },
+    "delta": {
+      "pass_rate": "+0.50",
+      "time_seconds": "+13.0",
+      "tokens": "+1700"
+    }
+  },
+
+  "notes": [
+    "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
+    "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent",
+    "Without-skill runs consistently fail on table extraction expectations",
+    "Skill adds 13s average execution time but improves pass rate by 50%"
+  ]
+}
+```
+
+**Fields:**
+- `metadata`: Information about the benchmark run
+- `runs[]`: Individual run results with expectations and notes
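+- `run_summary`: Statistical aggregates (mean, stddev, min, max) per configuration, plus a `delta` entry giving the signed difference of the means (`with_skill` minus `without_skill`) as strings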
+- `notes`: Freeform observations from the analyzer
+
+---
+
+## comparison.json
+
+Output from the blind comparator. Located at `<grading-dir>/comparison-N.json`.
+
+```json
+{
+  "winner": "A",
+  "reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
+  "rubric": {
+    "A": {
+      "content": {
+        "correctness": 5,
+        "completeness": 5,
+        "accuracy": 4
+      },
+      "structure": {
+        "organization": 4,
+        "formatting": 5,
+        "usability": 4
+      },
+      "content_score": 4.7,
+      "structure_score": 4.3,
+      "overall_score": 9.0
+    },
+    "B": {
+      "content": {
+        "correctness": 3,
+        "completeness": 2,
+        "accuracy": 3
+      },
+      "structure": {
+        "organization": 3,
+        "formatting": 2,
+        "usability": 3
+      },
+      "content_score": 2.7,
+      "structure_score": 2.7,
+      "overall_score": 5.4
+    }
+  },
+  "output_quality": {
+    "A": {
+      "score": 9,
+      "strengths": ["Complete solution", "Well-formatted", "All fields present"],
+      "weaknesses": ["Minor style inconsistency in header"]
+    },
+    "B": {
+      "score": 5,
+      "strengths": ["Readable output", "Correct basic structure"],
+      "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
+    }
+  },
+  "expectation_results": {
+    "A": {
+      "passed": 4,
+      "total": 5,
+      "pass_rate": 0.80,
+      "details": [
+        {"text": "Output includes name", "passed": true}
+      ]
+    },
+    "B": {
+      "passed": 3,
+      "total": 5,
+      "pass_rate": 0.60,
+      "details": [
+        {"text": "Output includes name", "passed": true}
+      ]
+    }
+  }
+}
+```
+
+---
+
+## analysis.json
+
+Output from the post-hoc analyzer. Located at `<grading-dir>/analysis.json`.
+
+```json
+{
+  "comparison_summary": {
+    "winner": "A",
+    "winner_skill": "path/to/winner/skill",
+    "loser_skill": "path/to/loser/skill",
+    "comparator_reasoning": "Brief summary of why comparator chose winner"
+  },
+  "winner_strengths": [
+    "Clear step-by-step instructions for handling multi-page documents",
+    "Included validation script that caught formatting errors"
+  ],
+  "loser_weaknesses": [
+    "Vague instruction 'process the document appropriately' led to inconsistent behavior",
+    "No script for validation, agent had to improvise"
+  ],
+  "instruction_following": {
+    "winner": {
+      "score": 9,
+      "issues": ["Minor: skipped optional logging step"]
+    },
+    "loser": {
+      "score": 6,
+      "issues": [
+        "Did not use the skill's formatting template",
+        "Invented own approach instead of following step 3"
+      ]
+    }
+  },
+  "improvement_suggestions": [
+    {
+      "priority": "high",
+      "category": "instructions",
+      "suggestion": "Replace 'process the document appropriately' with explicit steps",
+      "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
+    }
+  ],
+  "transcript_insights": {
+    "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script",
+    "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods"
+  }
+}
+```