diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0cdc750 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,25 @@ +name: CI + +on: + pull_request: + push: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.11" + - "3.12" + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install + run: python -m pip install -e . + - name: Test + run: python -m unittest discover -s tests -v diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c518ce3 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# Contributing + +## Development Setup + +```bash +python3 -m pip install -e . +PYTHONPATH=src python3 -m unittest discover -s tests -v +``` + +## Change Requirements + +- Add or update tests for behavior changes. +- Keep experiment claims tied to reproducible artifacts: study spec, trial spec, + result JSON, probe history, and per-request probe details. +- Do not publish benchmark conclusions from bounded or time-compressed replays + without clearly labeling the replay controls. +- Keep example configs free of private credentials and prefer explicit, + reproducible endpoint settings. + +## Commit Hygiene + +Use small commits grouped by behavior: measurement integrity, orchestration +logic, documentation, or infrastructure. 
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..547c280 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 AITuner contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..47a756d --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +# AITuner + +AITuner is a small study orchestrator for OpenAI-compatible serving engines. It +replays trace windows, searches for the highest feasible offered load under +configured SLOs, and records enough trial context for LLM- or harness-guided +configuration proposals. + +## Status + +This repository is research tooling. Treat reported experiment numbers as valid +only when the matching study spec, trial artifacts, probe history, and +`probe_details.jsonl` files are available for audit. + +## Install + +```bash +python3 -m pip install -e . 
+``` + +## Test + +The test suite uses the Python standard library `unittest` runner: + +```bash +PYTHONPATH=src python3 -m unittest discover -s tests -v +``` + +If the package is installed in editable mode, `PYTHONPATH=src` is optional. + +## Basic Workflow + +Initialize a study: + +```bash +aituner study init --spec configs/examples/study.example.json +``` + +Run a local tuning loop: + +```bash +aituner study tune --spec configs/examples/study.example.json --max-trials 2 +``` + +Run a compare: + +```bash +aituner compare run --spec configs/examples/compare.example.json +``` + +Remote experiment notes for this checkout live in `AGENTS.md`. The default +remote host is `dash0`, and code should be synchronized through Git before +remote runs. + +## Experiment Integrity + +- Fixed-length replay requests are scored only when completion token usage is + verifiable and matches the trace expectation. +- Each trial writes aggregate probe history and per-request probe details. +- `request_rate_per_gpu` is the primary cross-topology metric: + `best_feasible_request_rate / (tensor_parallel_size * data_parallel_size)`. +- Compare reports include failed and no-feasible window counts; do not interpret + mean request rates without those counts. +- Bounded replays using `max_requests_per_probe`, `completion_tokens_override`, + or `replay_time_scale` are convergence tests for that bounded workload, not + production benchmarks. + +## Configuration Notes + +Example specs that use `llm.endpoint.provider=codex` resolve the endpoint from +the local Codex configuration unless `llm.endpoint.base_url` or +`AITUNER_CODEX_BASE_URL` is set. Public, reproducible examples should prefer an +explicit endpoint or omit the LLM endpoint and use proposal files. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..870f51a --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,19 @@ +# Security + +AITuner launches local or remote serving engines and may replay trace payloads. 
+Do not commit secrets, API keys, private trace content, or private model access +tokens. + +## Reporting + +Report security issues privately to the project maintainers. If this repository +is mirrored to a public forge, use that forge's private vulnerability reporting +flow when available. + +## Operational Guidance + +- Keep `.env` files local; `.env.example` documents expected variable names. +- Review generated trial artifacts before publishing them, because request + payloads may contain trace text. +- Treat remote execution configs as sensitive when they include internal host + names, paths, or scheduler details. diff --git a/docs/aituner-harness-summary.md b/docs/aituner-harness-summary.md index a4742eb..1e21d3e 100644 --- a/docs/aituner-harness-summary.md +++ b/docs/aituner-harness-summary.md @@ -60,7 +60,7 @@ The speedup comes from reducing wasted proposal families, not from changing the - Engine relaunch after early stop is available as opt-in for faster smoke studies, but it is not the default because it can change warm-state comparability. 5. Search-high saturation stop - - If the incumbent's highest measured probe is feasible, has no SLO failures, and is within the configured binary-search resolution of `search.high`, the harness stops before asking the LLM for another proposal. + - If the incumbent's highest measured probe is feasible and is within the configured binary-search resolution of `search.high`, the harness stops before asking the LLM for another proposal. Individual request failures can be present when the aggregate probe still meets the configured pass-rate SLO. - This is not a model-specific threshold. It means the workload search range, not the engine config, is currently the limiting measurement bound. 6. 
Deterministic first probes diff --git a/docs/qwen235b-thinking-decode/harness-20260428.md b/docs/qwen235b-thinking-decode/harness-20260428.md index af5d282..029d483 100644 --- a/docs/qwen235b-thinking-decode/harness-20260428.md +++ b/docs/qwen235b-thinking-decode/harness-20260428.md @@ -118,7 +118,7 @@ A second generic diagnosis bug was fixed: non-SLO bookkeeping counts such as `pr The base-relative patch issue is now guarded in code, not only in the LLM prompt. When `StudyStore.materialize_trial` sees a runtime/env-only proposal after a non-base incumbent has been found, it inherits the incumbent topology patch into the trial spec unless the proposal explicitly provides a topology. This keeps same-topology runtime validation on the actual incumbent while preserving the ability to test the base topology by stating it explicitly. -Local verification: `PYTHONPATH=src python3 -m unittest discover -s tests` passed 68 tests. +Local verification at that commit: `PYTHONPATH=src python3 -m unittest discover -s tests` passed. The current repository suite has since grown; rerun the command rather than relying on a historical test count. ## Current Harness Judgment diff --git a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md index e49b0f7..937c65e 100644 --- a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md +++ b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md @@ -64,7 +64,7 @@ This run tests a stricter early-stop harness: - the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts. - If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal. - A single-family all-infeasible plateau is not enough to stop deterministically. 
It only blocks repeating that family; the LLM must either justify a different family or later satisfy the validation/convergence stop rule. -- A search-high saturation guard stops immediately when the incumbent's highest measured probe is feasible, has no SLO failures, and is within the configured binary-search resolution of `search.high`. In that case the current study cannot measure a better config without increasing the workload search range, so more config proposals only waste tuning iterations. +- A search-high saturation guard stops immediately when the incumbent's highest measured probe is feasible and is within the configured binary-search resolution of `search.high`. A feasible probe may still contain individual SLO failures as long as it meets the configured pass-rate target. When this guard fires, the current study cannot measure a better config without increasing the workload search range, so more config proposals only waste tuning iterations. This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number. @@ -76,7 +76,7 @@ Local test command: PYTHONPATH=src python3 -m unittest tests.test_core_flow -q ``` -Result: passed, 77 tests. +Result at the time of this note: passed. The current repository test count may be higher; use the command above as the source of truth. The added coverage checks: diff --git a/docs/superpowers/plans/2026-05-06-repo-audit-repair.md b/docs/superpowers/plans/2026-05-06-repo-audit-repair.md new file mode 100644 index 0000000..aa7723e --- /dev/null +++ b/docs/superpowers/plans/2026-05-06-repo-audit-repair.md @@ -0,0 +1,59 @@ +# Repo Audit Repair Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Repair the audit findings that affect measurement integrity, state correctness, documentation accuracy, and open-source readiness. + +**Architecture:** Keep changes localized to the existing stdlib-only Python package. Measurement validation lives at the HTTP/worker boundary, study state fixes remain in `StudyStore`, compare reporting gains explicit failed/no-feasible accounting, and project metadata/docs are added at repo root. + +**Tech Stack:** Python 3.11+ stdlib, `unittest`, setuptools `pyproject.toml`. + +--- + +### Task 1: Measurement Integrity + +**Files:** +- Modify: `src/aituner/http_client.py` +- Modify: `src/aituner/slo.py` +- Modify: `src/aituner/worker.py` +- Test: `tests/test_core_flow.py` + +- [ ] Write failing tests for completion token source/mismatch failures and persisted per-request probe details. +- [ ] Run the targeted tests and confirm they fail for the expected reason. +- [ ] Add token source metadata to streamed metrics and request outcomes. +- [ ] Fail requests when configured completion length cannot be verified from usage or differs from expected. +- [ ] Persist probe outcome details under each trial artifact directory. +- [ ] Run targeted tests and the full unittest suite. + +### Task 2: State, Spec, And Compare Guards + +**Files:** +- Modify: `src/aituner/spec.py` +- Modify: `src/aituner/store.py` +- Modify: `src/aituner/compare.py` +- Modify: `scripts/run_multi_compare.py` +- Test: `tests/test_core_flow.py` + +- [ ] Write failing tests for state list isolation, invalid trace numeric bounds, and compare aggregate failure accounting. +- [ ] Run targeted tests and confirm expected failures. +- [ ] Deep-copy/replace trial lists when materializing trials. +- [ ] Validate positive trace controls in `TraceSpec.from_dict`. +- [ ] Report failed/no-feasible counts in compare aggregates without changing existing winner semantics. +- [ ] Run targeted tests and the full unittest suite. 
+ +### Task 3: Docs And Open-Source Readiness + +**Files:** +- Create: `README.md` +- Create: `LICENSE` +- Create: `CONTRIBUTING.md` +- Create: `SECURITY.md` +- Modify: `pyproject.toml` +- Modify: selected docs under `docs/` + +- [ ] Add concise repo usage, verification, and experiment integrity guidance. +- [ ] Add MIT license and contribution/security notes. +- [ ] Add project metadata and optional test extra. +- [ ] Update stale docs about high-stop behavior and current test count. +- [ ] Run JSON validation and full unittest suite. +- [ ] Commit changes in logical groups. diff --git a/pyproject.toml b/pyproject.toml index 557483e..6dddece 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,8 +6,23 @@ build-backend = "setuptools.build_meta" name = "aituner" version = "0.1.0" description = "AITuner study orchestrator for OpenAI-compatible serving engines" +readme = "README.md" requires-python = ">=3.11" +license = {text = "MIT"} +authors = [{name = "AITuner contributors"}] dependencies = [] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +test = [] [project.scripts] aituner = "aituner.cli:main"