
# Python API

Programmatic interface for Skillet.

## Installation

```bash
pip install pyskillet
```

## Quick Start

```python
import asyncio
from skillet import evaluate, tune, create_skill

# Run evals
results = asyncio.run(evaluate("my-skill"))
print(f"Pass rate: {results['pass_rate']}%")
```

## Functions

### evaluate()

Run evaluations against a baseline or with a skill.

```python
async def evaluate(
    name: str,
    skill_path: Path | None = None,
    samples: int = 3,
    max_evals: int | None = None,
    allowed_tools: list[str] | None = None,
    parallel: int = 3,
    on_status: Callable | None = None,
    skip_cache: bool = False,
    evals_list: list[dict] | None = None,
) -> dict
```

**Parameters:**

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `name` | `str` | required | Eval set name or path |
| `skill_path` | `Path` | `None` | Path to skill (`None` for baseline) |
| `samples` | `int` | `3` | Iterations per eval |
| `max_evals` | `int` | `None` | Max evals to run (random sample) |
| `allowed_tools` | `list` | `None` | Restrict available tools |
| `parallel` | `int` | `3` | Parallel workers |
| `on_status` | `Callable` | `None` | Progress callback |
| `skip_cache` | `bool` | `False` | Ignore cached results |
| `evals_list` | `list[dict]` | `None` | Pre-loaded evals (skips the `load_evals()` call) |

**Returns:**

```python
{
    "results": list[dict],      # Per-iteration results
    "tasks": list[dict],        # Original task definitions
    "pass_rate": float,         # Percentage (0-100)
    "total_runs": int,
    "total_pass": int,
    "cached_count": int,
    "fresh_count": int,
    "total_evals": int,
    "sampled_evals": int,
    "per_eval_metrics": list[dict],  # Per-eval pass@k and pass^k metrics
}
```

**Example:**

```python
import asyncio
from pathlib import Path
from skillet import evaluate

async def main():
    # Baseline
    baseline = await evaluate("conventional-comments")
    print(f"Baseline: {baseline['pass_rate']}%")

    # With skill
    with_skill = await evaluate(
        "conventional-comments",
        skill_path=Path("~/.claude/skills/conventional-comments").expanduser(),
    )
    print(f"With skill: {with_skill['pass_rate']}%")

asyncio.run(main())
```
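
Both `evaluate()` and `tune()` accept `evals_list` to skip the internal `load_evals()` call. Below is a sketch of loading once and reusing the list across a baseline run and a skill run; the import path and signature of `load_evals` are assumptions, since it is only mentioned by name above:

```python
import asyncio
from pathlib import Path

from skillet import evaluate
from skillet.evals import load_evals  # assumed import path

async def main():
    # Load the evals once, then reuse them for both runs (assumed signature).
    evals = load_evals("conventional-comments")
    baseline = await evaluate("conventional-comments", evals_list=evals)
    with_skill = await evaluate(
        "conventional-comments",
        skill_path=Path("~/.claude/skills/conventional-comments").expanduser(),
        evals_list=evals,
    )
    print(baseline["pass_rate"], with_skill["pass_rate"])

asyncio.run(main())
```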

### tune()

Iteratively improve a skill using DSPy optimization.

```python
async def tune(
    name: str,
    skill_path: Path,
    config: TuneConfig | None = None,
    callbacks: TuneCallbacks | None = None,
    evals_list: list[dict] | None = None,
) -> TuneResult
```

**Parameters:**

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `name` | `str` | required | Eval set name |
| `skill_path` | `Path` | required | Path to skill |
| `config` | `TuneConfig` | `None` | Tuning options |
| `callbacks` | `TuneCallbacks` | `None` | Progress callbacks |
| `evals_list` | `list[dict]` | `None` | Pre-loaded evals (skips the `load_evals()` call) |

**TuneConfig:**

```python
from dataclasses import dataclass

@dataclass
class TuneConfig:
    max_rounds: int = 5
    target_pass_rate: float = 100.0
    samples: int = 1
    parallel: int = 3
```

**TuneResult:**

```python
@dataclass
class TuneResult:
    metadata: TuneMetadata       # Timestamps, paths
    config: TuneConfig           # Input config
    result: TuneResultSummary    # Success, final pass rate
    original_skill: str          # Original content
    best_skill: str              # Best found
    rounds: list[RoundResult]    # All rounds
```

**Example:**

```python
import asyncio
from pathlib import Path
from skillet import tune
from skillet.tune import TuneConfig

async def main():
    result = await tune(
        "conventional-comments",
        Path("~/.claude/skills/conventional-comments").expanduser(),
        config=TuneConfig(
            max_rounds=10,
            target_pass_rate=90.0,
        ),
    )
    print(f"Final pass rate: {result.result.final_pass_rate}%")
    print(f"Rounds: {len(result.rounds)}")

asyncio.run(main())
```
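
Since `best_skill` is the full improved skill content as a string, you can persist it yourself. A minimal sketch continuing the example above; the `SKILL.md` filename under the skill directory is an assumption:

```python
# Continues the example above; assumes the skill file is <skill_path>/SKILL.md.
skill_md = Path("~/.claude/skills/conventional-comments/SKILL.md").expanduser()
skill_md.write_text(result.best_skill)
```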

### create_skill()

Generate a skill from captured evals.

```python
async def create_skill(
    name: str,
    output_dir: Path,
    extra_prompt: str | None = None,
    overwrite: bool = False,
) -> dict
```

**Parameters:**

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `name` | `str` | required | Eval set name |
| `output_dir` | `Path` | required | Base directory for skill |
| `extra_prompt` | `str` | `None` | Additional generation instructions |
| `overwrite` | `bool` | `False` | Replace existing skill |

**Returns:**

```python
{
    "skill_dir": Path,         # Created skill directory
    "skill_content": str,      # Generated SKILL.md content
    "eval_count": int,         # Evals used
}
```

**Example:**

```python
import asyncio
from pathlib import Path
from skillet import create_skill

async def main():
    result = await create_skill(
        "conventional-comments",
        Path.home(),  # Creates ~/.claude/skills/conventional-comments/
        extra_prompt="Be concise, max 30 lines",
    )
    print(f"Created: {result['skill_dir']}")

asyncio.run(main())
```
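
With `overwrite=False` (the default), an existing skill is not replaced. Below is a sketch of retrying with `overwrite=True`; the assumption that this case raises `SkillError` (the skill-creation error listed under Exceptions below) is not confirmed above:

```python
from pathlib import Path

from skillet import SkillError, create_skill

async def regenerate() -> dict:
    try:
        return await create_skill("conventional-comments", Path.home())
    except SkillError:
        # Assumption: an existing skill dir raises SkillError when overwrite=False.
        return await create_skill(
            "conventional-comments", Path.home(), overwrite=True
        )
```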

### generate_evals()

Generate candidate eval files from a SKILL.md.

```python
async def generate_evals(
    skill_path: Path,
    *,
    output_dir: Path | None = None,
    use_lint: bool = True,
    max_per_category: int = 5,
    domains: list[EvalDomain] | None = None,
) -> GenerateResult
```

**Parameters:**

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `skill_path` | `Path` | required | Path to skill directory or SKILL.md file |
| `output_dir` | `Path` | `None` | Output directory for candidate files |
| `use_lint` | `bool` | `True` | Incorporate lint findings |
| `max_per_category` | `int` | `5` | Max evals per category |
| `domains` | `list[EvalDomain]` | `None` | Filter to specific domains (`None` = all) |

**EvalDomain:**

```python
from skillet.generate import EvalDomain

EvalDomain.TRIGGERING    # "triggering"
EvalDomain.FUNCTIONAL    # "functional"
EvalDomain.PERFORMANCE   # "performance"
```

**Example:**

```python
import asyncio
from pathlib import Path
from skillet import generate_evals
from skillet.generate import EvalDomain

async def main():
    # Generate all domains
    result = await generate_evals(
        Path("~/.claude/skills/browser-fallback").expanduser(),
        output_dir=Path("./candidates"),
    )
    print(f"Generated {len(result.candidates)} candidates")

    # Only triggering evals
    result = await generate_evals(
        Path("~/.claude/skills/browser-fallback").expanduser(),
        domains=[EvalDomain.TRIGGERING],
    )

asyncio.run(main())
```

### show()

Retrieve cached eval results without re-running evals.

```python
def show(
    name: str,
    eval_source: str | None = None,
    skill_path: Path | None = None,
) -> dict
```

**Parameters:**

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `name` | `str` | required | Eval set name |
| `eval_source` | `str` | `None` | Filter to a specific eval file |
| `skill_path` | `Path` | `None` | Show results with skill instead of baseline |

**Returns:**

```python
{
    "name": str,
    # One entry per eval file:
    "evals": [
        {
            "source": str,
            "iterations": list[dict],
            "pass_rate": float | None,
        },
    ],
}
```

**Example:**

```python
from skillet.show import show

results = show("conventional-comments")
for eval_result in results["evals"]:
    rate = eval_result["pass_rate"]
    if rate is not None:
        print(f"{eval_result['source']}: {rate:.0f}%")
    else:
        print(f"{eval_result['source']}: no data")
```
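
The two filters can be combined, e.g. to inspect one eval file's cached results with the skill applied; the `eval_source` value here is a hypothetical filename:

```python
from pathlib import Path

from skillet.show import show

results = show(
    "conventional-comments",
    eval_source="001.md",  # hypothetical eval filename
    skill_path=Path("~/.claude/skills/conventional-comments").expanduser(),
)
```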

### lint_skill()

Lint a SKILL.md file for common issues.

```python
async def lint_skill(
    path: Path,
    *,
    include_llm: bool = True,
) -> LintResult
```

**Parameters:**

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `path` | `Path` | required | Path to SKILL.md file |
| `include_llm` | `bool` | `True` | Include LLM-assisted lint rules |

**Returns:**

A `LintResult` with `path` and `findings` (a list of `LintFinding`, each with `rule`, `message`, `severity`, and an optional `line`).
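
A rough sketch of those shapes, inferred from the description above and the example below; the actual definitions in `skillet.lint` (including the severity members) may differ:

```python
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path

class Severity(Enum):        # the example below reads finding.severity.value
    ERROR = "error"          # members are assumptions
    WARNING = "warning"

@dataclass
class LintFinding:
    rule: str
    message: str
    severity: Severity
    line: int | None = None  # optional source line

@dataclass
class LintResult:
    path: Path
    findings: list[LintFinding] = field(default_factory=list)
```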

**Example:**

```python
import asyncio
from pathlib import Path
from skillet.lint import lint_skill

async def main():
    result = await lint_skill(Path("~/.claude/skills/my-skill/SKILL.md").expanduser())
    for finding in result.findings:
        print(f"[{finding.severity.value}] {finding.rule}: {finding.message}")

asyncio.run(main())
```

## Exceptions

```python
from skillet import (
    SkilletError,           # Base exception
    EvalError,              # Eval loading/processing error
    EvalValidationError,    # Invalid eval format
    EmptyFolderError,       # No evals found
    SkillError,             # Skill creation error
)
```

**Example:**

```python
import asyncio

from skillet import SkilletError, evaluate

async def main():
    try:
        results = await evaluate("nonexistent")
    except SkilletError as e:  # catches any Skillet exception
        print(f"Skillet error: {e}")

asyncio.run(main())
```

## Callbacks

### Progress callback for evaluate()

```python
async def on_status(task: dict, state: str, result: dict | None):
    """
    task: The eval task being run
    state: "cached", "running", or "done"
    result: Result dict when state is "done"
    """
    if state == "running":
        print(f"Running: {task['eval_source']}")
    elif state == "done":
        status = "PASS" if result["pass"] else "FAIL"
        print(f"  {status}: {result['judgment']['reasoning'][:50]}")
```
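
Pass it to `evaluate()` through the `on_status` parameter documented above:

```python
results = await evaluate("my-skill", on_status=on_status)
```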

### Callbacks for tune()

```python
from skillet.tune import TuneCallbacks

callbacks = TuneCallbacks(
    on_round_start=lambda round, total: print(f"Round {round}/{total}"),
    on_round_complete=lambda round, rate, results: print(f"  Pass rate: {rate}%"),
    on_improved=lambda instruction, path: print(f"  Improved: {path}"),
    on_complete=lambda path: print(f"Done! Best skill: {path}"),
)
```
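
Then hand them to `tune()` via its `callbacks` parameter (the skill name and path here are placeholders):

```python
result = await tune(
    "my-skill",
    Path("~/.claude/skills/my-skill").expanduser(),
    callbacks=callbacks,
)
```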
