commit 7fa69b1354b71130f503f9cf826e5b03cf51911b Author: Gahow Wang Date: Fri Apr 10 13:22:19 2026 +0000 Initial project scaffold diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..832c7f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +.venv/ +__pycache__/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ +.nox/ +.tox/ +*.pyc +*.pyo +*.so +*.o +*.out +*.log +build/ +dist/ +*.egg-info/ +compile_commands.json +cmake-build-*/ +.cache/ +.idea/ +.vscode/ +.DS_Store +results/ +profile-output/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..725fbeb --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +PYTHON ?= python +KERNEL_LAB_CUDA_ARCH ?= 120 + +.PHONY: sync env info test bench build-cuda-ext + +sync: + uv sync + +env: + $(PYTHON) tools/check_env.py + +info: + $(PYTHON) tools/print_device_info.py + +test: + pytest -q + +bench: + ./tools/run_all_benchmarks.sh + +build-cuda-ext: + cmake -S kernels/cuda -B build/cuda -DCMAKE_CUDA_ARCHITECTURES=$(KERNEL_LAB_CUDA_ARCH) + cmake --build build/cuda -j diff --git a/README.md b/README.md new file mode 100644 index 0000000..1a59726 --- /dev/null +++ b/README.md @@ -0,0 +1,173 @@ +# kernel-lab + +`kernel-lab` is a learning-first GPU kernel workbook for studying the same operator across four layers: + +1. PyTorch reference code +2. Triton kernels +3. Native CUDA C++ kernels +4. PyTorch custom operator integration + +The repository is intentionally not a finished kernel library. The core Triton and CUDA implementations are left as TODO-driven lab exercises so you can study indexing, reductions, tiling, memory movement, correctness checks, and profiling in a controlled way. + +## Why This Repo Exists + +This lab is aimed at a modern NVIDIA workflow with a Blackwell-class consumer GPU such as an RTX 5090. The exercises themselves are mostly architecture-generic, so the project name stays broad while the build and docs keep hardware targeting explicit. + +Each operator exists for a reason: + +- `vector_add`: launch geometry, indexing, bounds checks +- `row_softmax`: reductions, numerical stability, bandwidth limits +- `tiled_matmul`: tiling, data reuse, memory hierarchy +- `online_softmax`: running max / running sum recurrence +- `flash_attention_fwd`: blockwise attention, masking, online normalization +- `pytorch_custom_op`: how kernels get surfaced as framework operators +- `profiling`: how to measure what actually happened + +## Learning Roadmap + +Start with the environment sanity task, then implement kernels in this order: + +1. `tasks/00_env_sanity` +2. `tasks/01_vector_add` +3. `tasks/02_row_softmax` +4. `tasks/03_tiled_matmul` +5. `tasks/04_online_softmax` +6. `tasks/05_flash_attention_fwd` +7. `tasks/06_pytorch_custom_op` +8. `tasks/07_profiling` + +The detailed week-1 plan and implementation order live in `docs/roadmap.md`. + +## Triton To CUDA Mapping + +The core mental model is: + +- Triton `program_id` maps to CUDA block-level work assignment +- Triton blocked tensor operations map to manual thread/block index arithmetic +- Triton masks map to explicit boundary checks in CUDA +- Triton load/store helpers abstract pointer math that CUDA exposes directly +- Triton hides synchronization details that CUDA requires you to reason about + +See `docs/triton_vs_cuda.md` for a longer concept table. 
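+
+As a minimal, doc-only illustration of the first rows of that mapping, here is a
+Triton fragment with the corresponding CUDA expression noted per line. It is a
+sketch for orientation, not one of the lab kernels, and `BLOCK` is an assumed
+compile-time tile size:
+
+```python
+@triton.jit
+def scale_by_two(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
+    pid = tl.program_id(axis=0)                   # CUDA: blockIdx.x
+    offs = pid * BLOCK + tl.arange(0, BLOCK)      # CUDA: blockIdx.x * blockDim.x + threadIdx.x
+    mask = offs < n                               # CUDA: if (idx < n) { ... }
+    x = tl.load(x_ptr + offs, mask=mask)          # masked load replaces the bounds branch
+    tl.store(out_ptr + offs, 2.0 * x, mask=mask)  # masked store guards the tail block
+```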
+ +## Repository Layout + +```text +docs/ concept notes, roadmap, profiling guidance +reference/ plain PyTorch reference implementations +kernels/ Triton and CUDA learner skeletons +tasks/ workbook specs, TODO skeletons, task-local tests and benches +tests/ repository-wide checks and correctness scaffolding +bench/ cross-implementation benchmark harnesses +tools/ environment checks, profiling helpers, comparison scripts +``` + +## Environment Assumptions + +- Python 3.10+ +- PyTorch with CUDA support +- Triton installed if you want to run Triton tasks +- CUDA toolkit installed if you want to build the native extension +- A recent NVIDIA driver and a Blackwell-capable software stack + +Architecture targeting is configurable: + +- `KERNEL_LAB_CUDA_ARCH=120` for Python extension loading helpers +- `-DCMAKE_CUDA_ARCHITECTURES=120` for direct CMake builds + +If your toolkit, driver, or local environment does not yet expose Blackwell exactly as expected, keep the architecture explicit and adjust it instead of editing kernel source files. + +## Install + +```bash +uv sync +``` + +If you want commands to run inside the uv-managed environment without activating it manually, use `uv run`, for example: + +```bash +uv run pytest -q +uv run python tools/check_env.py +``` + +## Run Environment Checks + +```bash +uv run python tools/check_env.py +uv run python tools/print_device_info.py +``` + +## Run Tests + +The default test suite validates references and scaffolding. Triton/CUDA task tests skip gracefully until you implement the learner TODOs. + +```bash +uv run pytest -q +``` + +You can also use: + +```bash +make sync +uv run pytest -q +uv run ./tools/run_all_tests.sh +``` + +## Run Benchmarks + +Benchmarks compare PyTorch, Triton, and CUDA when available. Incomplete implementations are reported and skipped. + +```bash +uv run python bench/bench_vector_add.py --device cuda +uv run python bench/bench_softmax.py --device cuda +uv run python bench/bench_matmul.py --device cuda +uv run python bench/bench_attention.py --device cuda +uv run python bench/compare_impls.py --task vector_add +``` + +Or run the helper: + +```bash +uv run ./tools/run_all_benchmarks.sh +``` + +## Build The CUDA Extension + +Two paths are provided: + +1. CMake-first native build: + +```bash +cmake -S kernels/cuda -B build/cuda -DCMAKE_CUDA_ARCHITECTURES=${KERNEL_LAB_CUDA_ARCH:-120} +cmake --build build/cuda -j +``` + +2. Python-driven extension loading for lab experiments: + +```bash +uv run python tasks/06_pytorch_custom_op/extension_skeleton.py +``` + +The binding and CUDA source files build a minimal extension skeleton. The learner is expected to fill in operator registration and kernel dispatch details. + +## Profile Kernels + +Start from one kernel, one shape, one implementation: + +```bash +uv run ./tools/profile_ncu.sh python bench/bench_vector_add.py --device cuda --mode cuda +uv run ./tools/profile_nsys.sh python bench/bench_attention.py --device cuda --mode triton +``` + +See `docs/profiling_guide.md` for warmup, synchronization, and first metrics to inspect. + +## How To Use The Workbook + +- Read the `spec.md` for the current task. +- Run the reference implementation and tests first. +- Read the Triton and CUDA skeleton side by side. +- Fill in one TODO at a time. +- Re-run correctness tests before looking at benchmark numbers. +- Only profile after the kernel is correct on small shapes. + +This repo is designed to make the learning path visible. The TODOs are the point. 
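+
+## Locating Torch For The CMake Build
+
+`find_package(Torch REQUIRED)` in `kernels/cuda/CMakeLists.txt` has to find the
+CMake config that ships inside the installed `torch` wheel. If configuration
+fails with a "Could not find Torch" style error, pointing `CMAKE_PREFIX_PATH` at
+the wheel usually resolves it. A sketch, assuming the uv-managed environment
+from above:
+
+```bash
+uv run cmake -S kernels/cuda -B build/cuda \
+  -DCMAKE_PREFIX_PATH="$(uv run python -c 'import torch; print(torch.utils.cmake_prefix_path)')" \
+  -DCMAKE_CUDA_ARCHITECTURES=${KERNEL_LAB_CUDA_ARCH:-120}
+uv run cmake --build build/cuda -j
+```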
diff --git a/bench/bench_attention.py b/bench/bench_attention.py new file mode 100644 index 0000000..d8d188b --- /dev/null +++ b/bench/bench_attention.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from reference.torch_attention import torch_attention +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20, **kwargs) -> float: + for _ in range(warmup): + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--batch", type=int, default=2) + parser.add_argument("--heads", type=int, default=8) + parser.add_argument("--seq", type=int, default=128) + parser.add_argument("--dim", type=int, default=64) + parser.add_argument("--causal", action="store_true") + args = parser.parse_args() + + q = torch.randn(args.batch, args.heads, args.seq, args.dim, device=args.device) + k = torch.randn(args.batch, args.heads, args.seq, args.dim, device=args.device) + v = torch.randn(args.batch, args.heads, args.seq, args.dim, device=args.device) + + if args.mode in {"all", "torch"}: + elapsed_ms = benchmark(torch_attention, q, k, v, causal=args.causal) + print(f"torch: {elapsed_ms:.3f} ms") + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + elapsed_ms = benchmark(triton_flash_attention_fwd, q, k, v, causal=args.causal) + print(f"triton: {elapsed_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print("cuda: skipped (extension unavailable)") + else: + try: + elapsed_ms = benchmark( + torch.ops.kernel_lab.flash_attention_fwd, q, k, v, args.causal + ) + print(f"cuda: {elapsed_ms:.3f} ms") + except Exception as exc: + print(f"cuda: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/bench_matmul.py b/bench/bench_matmul.py new file mode 100644 index 0000000..62445f7 --- /dev/null +++ b/bench/bench_matmul.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.tiled_matmul import triton_tiled_matmul +from reference.torch_matmul import torch_matmul +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + 
torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def report(name: str, elapsed_ms: float, m: int, n: int, k: int) -> None: + tflops = (2.0 * m * n * k) / (elapsed_ms * 1e-3) / 1e12 + print(f"{name}: {elapsed_ms:.3f} ms | throughput {tflops:.3f} TFLOP/s") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--m", type=int, default=1024) + parser.add_argument("--n", type=int, default=1024) + parser.add_argument("--k", type=int, default=1024) + args = parser.parse_args() + + a = torch.randn(args.m, args.k, device=args.device) + b = torch.randn(args.k, args.n, device=args.device) + + if args.mode in {"all", "torch"}: + report("torch", benchmark(torch_matmul, a, b), args.m, args.n, args.k) + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + report("triton", benchmark(triton_tiled_matmul, a, b), args.m, args.n, args.k) + except (NotImplementedError, RuntimeError) as exc: + print(f"triton: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print("cuda: skipped (extension unavailable)") + else: + try: + report( + "cuda", + benchmark(torch.ops.kernel_lab.tiled_matmul, a, b), + args.m, + args.n, + args.k, + ) + except Exception as exc: + print(f"cuda: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/bench_softmax.py b/bench/bench_softmax.py new file mode 100644 index 0000000..ab146bb --- /dev/null +++ b/bench/bench_softmax.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.online_softmax import triton_online_softmax +from kernels.triton.row_softmax import triton_row_softmax +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def report(name: str, elapsed_ms: float, x: torch.Tensor) -> None: + logical_bytes = 3 * x.numel() * x.element_size() + gbps = logical_bytes / (elapsed_ms * 1e-3) / 1e9 + print(f"{name}: {elapsed_ms:.3f} ms | logical bandwidth {gbps:.2f} GB/s") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--variant", choices=["row", "online"], default="row") + parser.add_argument("--rows", type=int, default=4096) + parser.add_argument("--cols", 
type=int, default=1024) + args = parser.parse_args() + + x = torch.randn(args.rows, args.cols, device=args.device) + + ref_fn = torch_row_softmax if args.variant == "row" else torch_online_softmax + triton_fn = triton_row_softmax if args.variant == "row" else triton_online_softmax + cuda_name = "row_softmax" if args.variant == "row" else "online_softmax" + + if args.mode in {"all", "torch"}: + report(f"torch_{args.variant}_softmax", benchmark(ref_fn, x), x) + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + report(f"triton_{args.variant}_softmax", benchmark(triton_fn, x), x) + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_{args.variant}_softmax: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print(f"cuda_{args.variant}_softmax: skipped (extension unavailable)") + else: + try: + cuda_fn = getattr(torch.ops.kernel_lab, cuda_name) + report(f"cuda_{args.variant}_softmax", benchmark(cuda_fn, x), x) + except Exception as exc: + print(f"cuda_{args.variant}_softmax: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/bench_vector_add.py b/bench/bench_vector_add.py new file mode 100644 index 0000000..5cb3dc9 --- /dev/null +++ b/bench/bench_vector_add.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import argparse +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.vector_add import triton_vector_add +from reference.torch_vector_add import torch_vector_add +from tools.lab_extension import build_extension + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 30) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def report(name: str, elapsed_ms: float, x: torch.Tensor) -> None: + bytes_moved = 3 * x.numel() * x.element_size() + gbps = bytes_moved / (elapsed_ms * 1e-3) / 1e9 + print(f"{name}: {elapsed_ms:.3f} ms | effective bandwidth {gbps:.2f} GB/s") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all") + parser.add_argument("--numel", type=int, default=1 << 24) + args = parser.parse_args() + + x = torch.randn(args.numel, device=args.device) + y = torch.randn(args.numel, device=args.device) + + if args.mode in {"all", "torch"}: + report("torch", benchmark(torch_vector_add, x, y), x) + + if args.device == "cuda" and args.mode in {"all", "triton"}: + try: + report("triton", benchmark(triton_vector_add, x, y), x) + except (NotImplementedError, RuntimeError) as exc: + print(f"triton: skipped ({exc})") + + if args.device == "cuda" and args.mode in {"all", "cuda"}: + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + print("cuda: skipped (extension unavailable)") + else: + try: + report("cuda", benchmark(torch.ops.kernel_lab.vector_add, x, y), x) + except Exception as exc: + 
print(f"cuda: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/bench/compare_impls.py b/bench/compare_impls.py new file mode 100644 index 0000000..deb58db --- /dev/null +++ b/bench/compare_impls.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] + + +TASK_TO_SCRIPT = { + "vector_add": ROOT / "bench" / "bench_vector_add.py", + "softmax": ROOT / "bench" / "bench_softmax.py", + "matmul": ROOT / "bench" / "bench_matmul.py", + "attention": ROOT / "bench" / "bench_attention.py", +} + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--task", choices=sorted(TASK_TO_SCRIPT), required=True) + parser.add_argument("extra_args", nargs="*") + args = parser.parse_args() + + cmd = [sys.executable, str(TASK_TO_SCRIPT[args.task]), *args.extra_args] + subprocess.run(cmd, check=True) + + +if __name__ == "__main__": + main() + diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..91b2417 --- /dev/null +++ b/conftest.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) diff --git a/docs/blackwell_notes.md b/docs/blackwell_notes.md new file mode 100644 index 0000000..647ee15 --- /dev/null +++ b/docs/blackwell_notes.md @@ -0,0 +1,20 @@ +# Blackwell Notes + +This repository targets a Blackwell-style workflow, but keeps the build configuration explicit because local toolchain support may differ across systems. + +## Build Guidance + +- Prefer explicit architecture selection over hidden defaults. +- Use `KERNEL_LAB_CUDA_ARCH=120` for Python-side build helpers when your local environment supports it. +- Use `-DCMAKE_CUDA_ARCHITECTURES=120` with CMake for direct native builds. +- If your toolkit does not yet accept the exact architecture value you want, adjust the build flag rather than editing the kernels. + +## What To Watch On A New GPU Generation + +- compiler support for the target architecture +- PyTorch wheel compatibility +- Triton support level +- driver/toolkit mismatch +- profiler tool compatibility + +Treat environment validation as part of the lab, not as a one-time setup nuisance. diff --git a/docs/cuda_execution_model.md b/docs/cuda_execution_model.md new file mode 100644 index 0000000..b29ea4c --- /dev/null +++ b/docs/cuda_execution_model.md @@ -0,0 +1,40 @@ +# CUDA Execution Model + +## How To Read A CUDA Kernel + +Use this short checklist every time: + +1. Find the logical work unit. + Ask what one thread, warp, or block is responsible for. +2. Decode the index math. + Look for `blockIdx`, `threadIdx`, `blockDim`, and any derived offsets. +3. Inspect the memory accesses. + Separate global loads, shared memory loads, stores, and reductions. +4. Find synchronization points. + Every `__syncthreads()` should protect a clear shared-memory phase boundary. +5. Check boundary conditions. + Out-of-range reads and stores are a common first bug. +6. Compare against the reference implementation. + Make sure the math, masking, and shape conventions still match. 
+ +## Execution Hierarchy + +- Grid: all blocks launched for one kernel +- Block: a cooperating team of threads +- Thread: one scalar execution context + +CUDA makes several things explicit that Triton abstracts: + +- manual thread/block decomposition +- pointer arithmetic +- shared-memory allocation and reuse +- synchronization +- launch configuration choices + +## Reading Order For This Lab + +- `vector_add.cu`: pure indexing +- `row_softmax.cu`: reduction structure +- `tiled_matmul.cu`: shared-memory tiling +- `online_softmax.cu`: stateful reduction recurrence +- `flash_attention_fwd.cu`: composition of multiple ideas diff --git a/docs/flashattention_notes.md b/docs/flashattention_notes.md new file mode 100644 index 0000000..bf170f5 --- /dev/null +++ b/docs/flashattention_notes.md @@ -0,0 +1,28 @@ +# FlashAttention Notes + +FlashAttention-style kernels are useful because the naive attention pipeline materializes large score tensors and spends too much bandwidth moving them. + +## The Core Idea + +Instead of: + +1. computing the full score matrix +2. writing it out +3. running softmax +4. reading it back +5. multiplying by `V` + +you process attention block by block and keep more intermediate state on chip. + +## Why Online Softmax Matters + +Blockwise processing changes the normalization problem. You cannot assume you have seen the full row. The running max / running sum recurrence lets you update normalization state incrementally without losing numerical stability. + +## What This Lab Covers + +- forward pass only +- small-shape correctness first +- optional causal masking +- side-by-side Triton and CUDA skeletons + +This repo intentionally stops short of a polished production FlashAttention implementation. The point is to expose the algorithmic structure. diff --git a/docs/gpu_basics.md b/docs/gpu_basics.md new file mode 100644 index 0000000..6b4e57d --- /dev/null +++ b/docs/gpu_basics.md @@ -0,0 +1,30 @@ +# GPU Basics + +This lab assumes you are learning GPU kernels as structured data-parallel programs. + +## Core Ideas + +- GPU throughput comes from massive parallelism, not a single fast thread. +- Launch geometry determines which logical elements each thread or program instance owns. +- Global memory is large and slow relative to on-chip storage. +- Kernel design is often about reducing memory traffic and increasing reuse. + +## Terms To Keep Straight + +- thread: the smallest execution entity in CUDA +- warp: a hardware scheduling group, usually 32 threads +- block: a cooperating group of threads with shared memory access +- grid: the full launch of all blocks +- program instance: Triton's block-level work abstraction + +## Mental Model For This Repo + +Each task asks the same questions in both Triton and CUDA: + +- What data does one unit of work own? +- How is that ownership computed from launch indices? +- Which reads are coalesced or contiguous? +- Which intermediate values must be reduced? +- Which values should be reused on chip? + +Keep a notebook. Write down the answers before you code. diff --git a/docs/profiling_guide.md b/docs/profiling_guide.md new file mode 100644 index 0000000..3252dfb --- /dev/null +++ b/docs/profiling_guide.md @@ -0,0 +1,87 @@ +# Profiling Guide + +## Profile One Kernel At A Time + +Good profiling starts narrow: + +- one implementation +- one shape +- one dtype +- one device +- one command you can rerun + +If you profile a full training script too early, you will not know which kernel you are looking at. 
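+
+One cheap way to stay narrow is to wrap the single call you care about in an
+NVTX range so it is easy to find on a timeline. A sketch using PyTorch's
+built-in NVTX helpers (`fn`, `x`, and `y` stand for whichever kernel and inputs
+you are studying):
+
+```python
+import torch
+
+torch.cuda.nvtx.range_push("vector_add_one_shape")  # appears in the nsys timeline
+out = fn(x, y)
+torch.cuda.synchronize()
+torch.cuda.nvtx.range_pop()
+```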
+ +## Why Warmup Matters + +The first iterations may include: + +- lazy module loading +- JIT compilation +- cache effects +- allocator setup + +Warm up first, then measure. + +## Why Synchronization Matters + +GPU work is asynchronous with respect to Python. If you do not synchronize before stopping a timer, you usually measure launch overhead instead of kernel runtime. + +Use `torch.cuda.synchronize()` around timed regions. + +## How To Avoid Misleading Timings + +- keep shapes fixed +- use multiple repetitions +- report median, not only minimum +- separate correctness from performance testing +- compare implementations under the same dtype and device conditions +- check that all inputs are already on the GPU + +## First Metrics To Inspect + +- kernel duration +- achieved memory throughput +- occupancy +- DRAM transactions or bandwidth +- shared-memory throughput when tiling is relevant +- eligible warps per cycle when investigating latency hiding + +## Practical `ncu` Examples + +```bash +ncu --set full --target-processes all \ + python bench/bench_vector_add.py --device cuda --mode cuda +``` + +```bash +ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,\ +dram__throughput.avg.pct_of_peak_sustained_elapsed \ + python bench/bench_softmax.py --device cuda --mode triton +``` + +## Practical `nsys` Examples + +```bash +nsys profile --trace=cuda,nvtx,osrt --sample=none \ + -o profile-output/attention_triton \ + python bench/bench_attention.py --device cuda --mode triton +``` + +```bash +nsys profile --trace=cuda,nvtx,osrt --sample=none \ + -o profile-output/matmul_cuda \ + python bench/bench_matmul.py --device cuda --mode cuda +``` + +## Checklist Before Trusting A Benchmark Result + +- Was there a warmup phase? +- Was the device synchronized before and after timing? +- Did all implementations run the same math? +- Were outputs checked against a reference? +- Were shapes and dtypes identical? +- Was one implementation silently skipped or falling back to CPU? +- Did you report median time over several repetitions? +- Is the measured quantity bandwidth-bound or compute-bound? +- Did you accidentally include setup or compilation time? 
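+
+## Cross-Checking With CUDA Events
+
+As a sanity check on host-side wall-clock numbers, CUDA events time the work on
+the device itself. A minimal sketch, assuming `fn(x)` is the call under study
+and warmup has already happened:
+
+```python
+start = torch.cuda.Event(enable_timing=True)
+end = torch.cuda.Event(enable_timing=True)
+
+start.record()
+fn(x)
+end.record()
+torch.cuda.synchronize()  # wait until `end` has actually been recorded
+print(f"{start.elapsed_time(end):.3f} ms")
+```
+
+If the event-based number and the synchronized wall-clock number disagree badly,
+revisit the checklist above before trusting either.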
diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 0000000..53716d7 --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,75 @@ +# Roadmap + +## Week 1 Study Plan + +Day 1: + +- Run `tools/check_env.py` +- Read `docs/gpu_basics.md` +- Read `docs/cuda_execution_model.md` +- Inspect `reference/torch_vector_add.py` +- Implement or partially implement `tasks/01_vector_add/triton_skeleton.py` + +Day 2: + +- Read `docs/triton_vs_cuda.md` +- Inspect `kernels/cuda/src/vector_add.cu` +- Fill in vector add indexing TODOs in Triton and CUDA +- Run `pytest -q tasks/01_vector_add/test_task.py` + +Day 3: + +- Read `reference/torch_row_softmax.py` +- Read `tasks/02_row_softmax/spec.md` +- Implement numerically stable row softmax in Triton first +- Compare against the CUDA skeleton and map the reduction strategy + +Day 4: + +- Study `tasks/03_tiled_matmul/spec.md` +- Draw the tile decomposition on paper +- Implement one matmul tile path with correctness-only priorities + +Day 5: + +- Read `docs/flashattention_notes.md` +- Read `tasks/04_online_softmax/spec.md` +- Derive the running max / running sum recurrence informally + +Day 6: + +- Inspect `tasks/05_flash_attention_fwd/spec.md` +- Trace the PyTorch reference line by line +- Annotate where Q/K/V loads, score computation, normalization, and output accumulation happen + +Day 7: + +- Read `docs/profiling_guide.md` +- Run one benchmark and one profiler command +- Write down which numbers changed after warmup and synchronization + +## Recommended TODO Order + +1. Environment checks +2. Vector add Triton +3. Vector add CUDA +4. Row softmax Triton +5. Row softmax CUDA +6. Tiled matmul Triton +7. Tiled matmul CUDA +8. Online softmax Triton +9. Online softmax CUDA +10. Flash attention forward Triton +11. Flash attention forward CUDA +12. PyTorch custom op binding +13. Profiling passes and benchmark validation + +## What To Focus On First + +- Correctness on tiny shapes +- Clear index math +- Explicit shape assumptions +- Numerically stable reductions +- Repeatable measurement + +Do not chase peak performance before you can explain the memory traffic and launch geometry of your kernel. 
diff --git a/docs/triton_vs_cuda.md b/docs/triton_vs_cuda.md new file mode 100644 index 0000000..722254e --- /dev/null +++ b/docs/triton_vs_cuda.md @@ -0,0 +1,30 @@ +# Triton Vs CUDA + +## Concept Mapping Table + +| Triton concept | CUDA concept | What to notice | +| --- | --- | --- | +| `tl.program_id(axis=0)` | `blockIdx.x` and block ownership | Both assign a chunk of logical work to a block-scale unit | +| `tl.arange(0, BLOCK)` | `threadIdx.x` or manual lane-local offsets | Triton expresses vectors of indices directly | +| masked `tl.load` / `tl.store` | explicit `if (idx < n)` checks | Same boundary problem, different syntax | +| blocked tensor operations | thread/block decomposition plus loops | Triton lifts index sets into tensor expressions | +| pointer arithmetic in element units | byte-addressed pointer math and indexing | CUDA makes layout mechanics more visible | +| implicit vectorized math | manual scalar or vector intrinsics | Triton often reads like array algebra | +| autotuned launch parameters | manual block-size tuning | Both still depend on the memory hierarchy | +| block pointers and tile views | shared memory tiles and cooperative loads | The same reuse idea shows up with different APIs | +| reduction combinators | warp/block reductions | Same algorithmic structure, different implementation burden | +| masks and predicates | control flow and bounds checks | Divergence and predication still matter | + +## How To Compare Side By Side + +1. Start from the reference PyTorch function and identify the mathematical operator. +2. In the Triton version, ask what one program instance owns. +3. In the CUDA version, ask what one block and one thread own. +4. Match the memory reads and writes, not just the variable names. +5. Write down where reduction state lives in each version. +6. For tiled code, identify when data moves from global memory to on-chip storage. +7. Only then compare performance. + +## Rule Of Thumb + +Triton usually compresses the "how" so you can focus on the blocked tensor math. CUDA exposes the "how" directly, which is why it is valuable to study both. 
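+
+As one worked row of the table, consider a row-max reduction. The Triton side is
+a doc-only sketch (assumptions: one program per row, `BLOCK` covers the whole
+row, out-of-range lanes are padded with `-inf`):
+
+```python
+row = tl.load(row_ptr + offs, mask=offs < num_cols, other=float("-inf"))
+row_max = tl.max(row, axis=0)  # one reduction combinator
+```
+
+The CUDA counterpart has to decide where the reduction state lives: a
+shared-memory tree with `__syncthreads()` between halving steps, warp shuffles
+such as `__shfl_down_sync` for the final 32 lanes, or both. Same mathematics,
+much more explicit machinery, which is the gap this document wants you to see.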
diff --git a/kernels/__init__.py b/kernels/__init__.py
new file mode 100644
index 0000000..86d2a32
--- /dev/null
+++ b/kernels/__init__.py
@@ -0,0 +1,2 @@
+"""Kernel modules for Triton and CUDA learning tasks."""
+
diff --git a/kernels/cuda/CMakeLists.txt b/kernels/cuda/CMakeLists.txt
new file mode 100644
index 0000000..a3d42a8
--- /dev/null
+++ b/kernels/cuda/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.25)
+project(kernel_lab LANGUAGES CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 120 CACHE STRING "Target CUDA architectures")
+endif()
+
+find_package(Torch REQUIRED)
+
+add_library(kernel_lab_extension SHARED
+  binding/binding.cpp
+  src/vector_add.cu
+  src/row_softmax.cu
+  src/tiled_matmul.cu
+  src/online_softmax.cu
+  src/flash_attention_fwd.cu
+)
+
+target_include_directories(kernel_lab_extension PRIVATE include)
+target_link_libraries(kernel_lab_extension PRIVATE "${TORCH_LIBRARIES}")
+target_compile_features(kernel_lab_extension PRIVATE cxx_std_17)
+set_target_properties(kernel_lab_extension PROPERTIES
+  PREFIX ""
+  CUDA_SEPARABLE_COMPILATION ON
+)
diff --git a/kernels/cuda/binding/binding.cpp b/kernels/cuda/binding/binding.cpp
new file mode 100644
index 0000000..abfd7f8
--- /dev/null
+++ b/kernels/cuda/binding/binding.cpp
@@ -0,0 +1,69 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+#include <torch/extension.h>
+
+namespace kernel_lab {
+
+torch::Tensor vector_add_dispatch(torch::Tensor x, torch::Tensor y) {
+  check_cuda_pair(x, y);
+  LAB_CHECK_SAME_SHAPE(x, y);
+  return vector_add_cuda(x, y);
+}
+
+torch::Tensor row_softmax_dispatch(torch::Tensor x) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CONTIGUOUS(x);
+  return row_softmax_cuda(x);
+}
+
+torch::Tensor tiled_matmul_dispatch(torch::Tensor a, torch::Tensor b) {
+  check_cuda_pair(a, b);
+  return tiled_matmul_cuda(a, b);
+}
+
+torch::Tensor online_softmax_dispatch(torch::Tensor x) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CONTIGUOUS(x);
+  return online_softmax_cuda(x);
+}
+
+torch::Tensor flash_attention_fwd_dispatch(
+    torch::Tensor q,
+    torch::Tensor k,
+    torch::Tensor v,
+    bool causal) {
+  LAB_CHECK_CUDA(q);
+  LAB_CHECK_CUDA(k);
+  LAB_CHECK_CUDA(v);
+  return flash_attention_fwd_cuda(q, k, v, causal);
+}
+
+} // namespace kernel_lab
+
+TORCH_LIBRARY(kernel_lab, m) {
+  m.def("vector_add(Tensor x, Tensor y) -> Tensor");
+  m.def("row_softmax(Tensor x) -> Tensor");
+  m.def("tiled_matmul(Tensor a, Tensor b) -> Tensor");
+  m.def("online_softmax(Tensor x) -> Tensor");
+  m.def("flash_attention_fwd(Tensor q, Tensor k, Tensor v, bool causal=False) -> Tensor");
+}
+
+TORCH_LIBRARY_IMPL(kernel_lab, CUDA, m) {
+  m.impl("vector_add", &kernel_lab::vector_add_dispatch);
+  m.impl("row_softmax", &kernel_lab::row_softmax_dispatch);
+  m.impl("tiled_matmul", &kernel_lab::tiled_matmul_dispatch);
+  m.impl("online_softmax", &kernel_lab::online_softmax_dispatch);
+  m.impl("flash_attention_fwd", &kernel_lab::flash_attention_fwd_dispatch);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("vector_add_dispatch", &kernel_lab::vector_add_dispatch, "Vector add dispatch");
+  m.def("row_softmax_dispatch", &kernel_lab::row_softmax_dispatch, "Row softmax dispatch");
+  m.def("tiled_matmul_dispatch", &kernel_lab::tiled_matmul_dispatch, "Tiled matmul dispatch");
+  m.def("online_softmax_dispatch", &kernel_lab::online_softmax_dispatch, "Online softmax dispatch");
+  m.def(
+      "flash_attention_fwd_dispatch",
+      &kernel_lab::flash_attention_fwd_dispatch,
+      "Flash attention forward dispatch");
+}
diff --git a/kernels/cuda/include/common.h b/kernels/cuda/include/common.h
new file mode 100644
index 0000000..e8b58db
--- /dev/null
+++ b/kernels/cuda/include/common.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <torch/extension.h>
+
+namespace kernel_lab {
+
+torch::Tensor vector_add_cuda(torch::Tensor x, torch::Tensor y);
+torch::Tensor row_softmax_cuda(torch::Tensor x);
+torch::Tensor tiled_matmul_cuda(torch::Tensor a, torch::Tensor b);
+torch::Tensor online_softmax_cuda(torch::Tensor x);
+torch::Tensor flash_attention_fwd_cuda(
+    torch::Tensor q,
+    torch::Tensor k,
+    torch::Tensor v,
+    bool causal);
+
+} // namespace kernel_lab
diff --git a/kernels/cuda/include/cuda_utils.h b/kernels/cuda/include/cuda_utils.h
new file mode 100644
index 0000000..0b48677
--- /dev/null
+++ b/kernels/cuda/include/cuda_utils.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <torch/extension.h>
+
+#define LAB_CHECK_CUDA(x) TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")
+#define LAB_CHECK_CONTIGUOUS(x) TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
+#define LAB_CHECK_SAME_SHAPE(x, y) TORCH_CHECK((x).sizes() == (y).sizes(), #x " and " #y " must have the same shape")
+
+inline void check_cuda_pair(const torch::Tensor& x, const torch::Tensor& y) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CUDA(y);
+  LAB_CHECK_CONTIGUOUS(x);
+  LAB_CHECK_CONTIGUOUS(y);
+}
+
diff --git a/kernels/cuda/src/flash_attention_fwd.cu b/kernels/cuda/src/flash_attention_fwd.cu
new file mode 100644
index 0000000..1af35dd
--- /dev/null
+++ b/kernels/cuda/src/flash_attention_fwd.cu
@@ -0,0 +1,54 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+namespace kernel_lab {
+
+__global__ void flash_attention_fwd_kernel(
+    const float* q,
+    const float* k,
+    const float* v,
+    float* out,
+    int64_t batch,
+    int64_t heads,
+    int64_t seq_len,
+    int64_t head_dim,
+    bool causal) {
+  (void)q;
+  (void)k;
+  (void)v;
+  (void)out;
+  (void)batch;
+  (void)heads;
+  (void)seq_len;
+  (void)head_dim;
+  (void)causal;
+
+  // TODO(student): assign each block to a batch/head/query tile.
+  // TODO(student): cooperatively load K/V tiles.
+  // TODO(student): compute score blocks and apply causal masking when requested.
+  // TODO(student): maintain online softmax state and accumulate the output tile.
+} + +torch::Tensor flash_attention_fwd_cuda( + torch::Tensor q, + torch::Tensor k, + torch::Tensor v, + bool causal) { + LAB_CHECK_CUDA(q); + LAB_CHECK_CUDA(k); + LAB_CHECK_CUDA(v); + LAB_CHECK_CONTIGUOUS(q); + LAB_CHECK_CONTIGUOUS(k); + LAB_CHECK_CONTIGUOUS(v); + TORCH_CHECK(q.sizes() == k.sizes(), "q and k must match"); + TORCH_CHECK(q.sizes() == v.sizes(), "q and v must match"); + TORCH_CHECK(q.dim() == 4, "flash_attention_fwd_cuda expects [batch, heads, seq, dim]"); + TORCH_CHECK(q.scalar_type() == torch::kFloat32, "flash_attention_fwd_cuda currently assumes float32"); + + TORCH_CHECK( + false, + "TODO(student): implement flash_attention_fwd_cuda in kernels/cuda/src/flash_attention_fwd.cu."); + return torch::Tensor(); +} + +} // namespace kernel_lab diff --git a/kernels/cuda/src/online_softmax.cu b/kernels/cuda/src/online_softmax.cu new file mode 100644 index 0000000..ad6e975 --- /dev/null +++ b/kernels/cuda/src/online_softmax.cu @@ -0,0 +1,36 @@ +#include "../include/common.h" +#include "../include/cuda_utils.h" + +namespace kernel_lab { + +__global__ void online_softmax_kernel( + const float* x, + float* out, + int64_t num_rows, + int64_t num_cols) { + int row = blockIdx.x; + if (row >= num_rows) { + return; + } + + // TODO(student): maintain running max and running sum across column tiles. + // TODO(student): write the normalized row after finishing the recurrence. + (void)x; + (void)out; + (void)num_rows; + (void)num_cols; +} + +torch::Tensor online_softmax_cuda(torch::Tensor x) { + LAB_CHECK_CUDA(x); + LAB_CHECK_CONTIGUOUS(x); + TORCH_CHECK(x.dim() == 2, "online_softmax_cuda expects a 2D tensor"); + TORCH_CHECK(x.scalar_type() == torch::kFloat32, "online_softmax_cuda currently assumes float32"); + + TORCH_CHECK( + false, + "TODO(student): implement online_softmax_cuda in kernels/cuda/src/online_softmax.cu."); + return torch::Tensor(); +} + +} // namespace kernel_lab diff --git a/kernels/cuda/src/row_softmax.cu b/kernels/cuda/src/row_softmax.cu new file mode 100644 index 0000000..2a64f70 --- /dev/null +++ b/kernels/cuda/src/row_softmax.cu @@ -0,0 +1,37 @@ +#include "../include/common.h" +#include "../include/cuda_utils.h" + +namespace kernel_lab { + +__global__ void row_softmax_kernel( + const float* x, + float* out, + int64_t num_rows, + int64_t num_cols) { + int row = blockIdx.x; + if (row >= num_rows) { + return; + } + + // TODO(student): decide whether one block owns one row or one row tile. + // TODO(student): compute the row max for numerical stability. + // TODO(student): compute exp(x - max), reduce the sum, and normalize. 
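+  //
+  // A common plan, assuming one block owns one full row (a hint, not a
+  // requirement):
+  //   1. each thread strides across the row keeping a private max;
+  //   2. reduce the per-thread maxima through shared memory, with
+  //      __syncthreads() separating the write and read phases;
+  //   3. repeat the strided-loop-plus-reduction pattern for sum(exp(x - max));
+  //   4. normalize and store with one final strided loop.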
+  (void)x;
+  (void)out;
+  (void)num_rows;
+  (void)num_cols;
+}
+
+torch::Tensor row_softmax_cuda(torch::Tensor x) {
+  LAB_CHECK_CUDA(x);
+  LAB_CHECK_CONTIGUOUS(x);
+  TORCH_CHECK(x.dim() == 2, "row_softmax_cuda expects a 2D tensor");
+  TORCH_CHECK(x.scalar_type() == torch::kFloat32, "row_softmax_cuda currently assumes float32");
+
+  TORCH_CHECK(
+      false,
+      "TODO(student): implement row_softmax_cuda in kernels/cuda/src/row_softmax.cu.");
+  return torch::Tensor();
+}
+
+} // namespace kernel_lab
diff --git a/kernels/cuda/src/tiled_matmul.cu b/kernels/cuda/src/tiled_matmul.cu
new file mode 100644
index 0000000..2679bb6
--- /dev/null
+++ b/kernels/cuda/src/tiled_matmul.cu
@@ -0,0 +1,40 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+namespace kernel_lab {
+
+__global__ void tiled_matmul_kernel(
+    const float* a,
+    const float* b,
+    float* c,
+    int64_t m,
+    int64_t n,
+    int64_t k) {
+  // TODO(student): map blockIdx/threadIdx to a C tile.
+  // TODO(student): cooperatively load A and B tiles into shared memory.
+  // TODO(student): accumulate partial products across the K dimension.
+  (void)a;
+  (void)b;
+  (void)c;
+  (void)m;
+  (void)n;
+  (void)k;
+}
+
+torch::Tensor tiled_matmul_cuda(torch::Tensor a, torch::Tensor b) {
+  LAB_CHECK_CUDA(a);
+  LAB_CHECK_CUDA(b);
+  LAB_CHECK_CONTIGUOUS(a);
+  LAB_CHECK_CONTIGUOUS(b);
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2, "tiled_matmul_cuda expects 2D tensors");
+  TORCH_CHECK(a.size(1) == b.size(0), "inner dimensions must match");
+  TORCH_CHECK(a.scalar_type() == torch::kFloat32, "tiled_matmul_cuda currently assumes float32");
+  TORCH_CHECK(b.scalar_type() == torch::kFloat32, "tiled_matmul_cuda currently assumes float32");
+
+  TORCH_CHECK(
+      false,
+      "TODO(student): implement tiled_matmul_cuda in kernels/cuda/src/tiled_matmul.cu.");
+  return torch::Tensor();
+}
+
+} // namespace kernel_lab
diff --git a/kernels/cuda/src/vector_add.cu b/kernels/cuda/src/vector_add.cu
new file mode 100644
index 0000000..99c77a1
--- /dev/null
+++ b/kernels/cuda/src/vector_add.cu
@@ -0,0 +1,35 @@
+#include "../include/common.h"
+#include "../include/cuda_utils.h"
+
+namespace kernel_lab {
+
+__global__ void vector_add_kernel(
+    const float* x,
+    const float* y,
+    float* out,
+    int64_t numel) {
+  int64_t global_idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (global_idx >= numel) {
+    return;
+  }
+
+  (void)x;
+  (void)y;
+  (void)out;
+  (void)numel;
+  // TODO(student): replace this placeholder with the real vector-add math.
+  // Hint: one thread should own one element for the first implementation.
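+  //
+  // A variant worth trying afterwards: the grid-stride loop, where one thread
+  // owns several elements instead of exactly one:
+  //   for (int64_t i = global_idx; i < numel;
+  //        i += static_cast<int64_t>(gridDim.x) * blockDim.x) { ... }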
+} + +torch::Tensor vector_add_cuda(torch::Tensor x, torch::Tensor y) { + check_cuda_pair(x, y); + LAB_CHECK_SAME_SHAPE(x, y); + TORCH_CHECK(x.scalar_type() == torch::kFloat32, "vector_add_cuda currently assumes float32"); + + TORCH_CHECK( + false, + "TODO(student): implement vector_add_cuda in kernels/cuda/src/vector_add.cu and then launch the kernel."); + return torch::Tensor(); +} + +} // namespace kernel_lab diff --git a/kernels/triton/__init__.py b/kernels/triton/__init__.py new file mode 100644 index 0000000..bc218aa --- /dev/null +++ b/kernels/triton/__init__.py @@ -0,0 +1,2 @@ +"""Triton learner skeletons.""" + diff --git a/kernels/triton/flash_attention_fwd.py b/kernels/triton/flash_attention_fwd.py new file mode 100644 index 0000000..30df4a4 --- /dev/null +++ b/kernels/triton/flash_attention_fwd.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def flash_attention_fwd_kernel( + q_ptr, + k_ptr, + v_ptr, + out_ptr, + seq_len, + head_dim, + stride_q_batch, + stride_q_head, + stride_q_seq, + stride_q_dim, + stride_k_batch, + stride_k_head, + stride_k_seq, + stride_k_dim, + stride_v_batch, + stride_v_head, + stride_v_seq, + stride_v_dim, + stride_out_batch, + stride_out_head, + stride_out_seq, + stride_out_dim, + causal, + block_q: tl.constexpr, + block_k: tl.constexpr, + block_d: tl.constexpr, + ): + pid_q = tl.program_id(axis=0) + pid_bh = tl.program_id(axis=1) + # TODO(student): map pid_q and pid_bh to a batch/head/query tile. + # TODO(student): load Q, K, and V blocks. + # TODO(student): compute scores for the current block pair. + # TODO(student): apply optional causal masking. + # TODO(student): update online softmax state and accumulate the output block. + # TODO(student): store the final output tile. + pass + + +def triton_flash_attention_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + causal: bool = False, + block_q: int = 64, + block_k: int = 64, +) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if q.shape != k.shape or q.shape != v.shape: + raise ValueError(f"q, k, v must match; got {q.shape}, {k.shape}, {v.shape}") + if q.ndim != 4: + raise ValueError("expected [batch, heads, seq, dim] inputs") + if not q.is_cuda or not k.is_cuda or not v.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement the FlashAttention forward launch.") + diff --git a/kernels/triton/online_softmax.py b/kernels/triton/online_softmax.py new file mode 100644 index 0000000..328c818 --- /dev/null +++ b/kernels/triton/online_softmax.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def online_softmax_kernel( + x_ptr, + out_ptr, + num_cols, + stride_x_row, + stride_out_row, + block_size: tl.constexpr, + ): + row_idx = tl.program_id(axis=0) + # TODO(student): maintain running max and running sum for this row. + # TODO(student): process the row in blocks rather than assuming all columns fit at once. 
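+        # A workable per-block recurrence (a hint; it mirrors the column loop in
+        # reference/torch_online_softmax.py, just over blocks instead of single
+        # columns):
+        #   m_new = tl.maximum(m_old, tl.max(block, axis=0))
+        #   l_new = l_old * tl.exp(m_old - m_new) + tl.sum(tl.exp(block - m_new), axis=0)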
+ # TODO(student): write the final normalized probabilities. + pass + + +def triton_online_softmax(x: torch.Tensor, block_size: int = 128) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if x.ndim != 2: + raise ValueError(f"expected 2D input, got {tuple(x.shape)}") + if not x.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement online softmax in Triton.") + diff --git a/kernels/triton/row_softmax.py b/kernels/triton/row_softmax.py new file mode 100644 index 0000000..db8dd27 --- /dev/null +++ b/kernels/triton/row_softmax.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def row_softmax_kernel( + x_ptr, + out_ptr, + num_cols, + stride_x_row, + stride_out_row, + block_size: tl.constexpr, + ): + row_idx = tl.program_id(axis=0) + col_offsets = tl.arange(0, block_size) + # TODO(student): convert row_idx and col_offsets into pointers for this row. + # TODO(student): load a row with masking. + # TODO(student): subtract the row max for stability. + # TODO(student): exponentiate, sum, and normalize. + # TODO(student): store the normalized row. + pass + + +def triton_row_softmax(x: torch.Tensor, block_size: int = 128) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if x.ndim != 2: + raise ValueError(f"expected 2D input, got {tuple(x.shape)}") + if not x.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement row-wise softmax launch logic.") + diff --git a/kernels/triton/tiled_matmul.py b/kernels/triton/tiled_matmul.py new file mode 100644 index 0000000..9059458 --- /dev/null +++ b/kernels/triton/tiled_matmul.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def tiled_matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + m, + n, + k, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + block_m: tl.constexpr, + block_n: tl.constexpr, + block_k: tl.constexpr, + ): + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + # TODO(student): compute the tile owned by this program instance. + # TODO(student): loop over K tiles and accumulate partial products. + # TODO(student): use masking on edge tiles. + # TODO(student): store the output tile. 
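+        # Shape of the classic K loop (a hint, assuming float32 accumulation):
+        #   acc = tl.zeros((block_m, block_n), dtype=tl.float32)
+        #   for k0 in range(0, k, block_k):
+        #       acc += tl.dot(a_tile, b_tile)  # masked loads on edge tiles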
+ pass + + +def triton_tiled_matmul( + a: torch.Tensor, + b: torch.Tensor, + block_m: int = 64, + block_n: int = 64, + block_k: int = 32, +) -> torch.Tensor: + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if a.ndim != 2 or b.ndim != 2: + raise ValueError("expected two 2D tensors") + if a.shape[1] != b.shape[0]: + raise ValueError(f"incompatible shapes: {a.shape} and {b.shape}") + if not a.is_cuda or not b.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): implement the tiled Triton matmul path.") + diff --git a/kernels/triton/vector_add.py b/kernels/triton/vector_add.py new file mode 100644 index 0000000..d5a2bfc --- /dev/null +++ b/kernels/triton/vector_add.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import torch + +try: + import triton + import triton.language as tl +except ImportError: # pragma: no cover - depends on local environment + triton = None + tl = None + + +TRITON_AVAILABLE = triton is not None + + +if TRITON_AVAILABLE: + + @triton.jit + def vector_add_kernel( + x_ptr, + y_ptr, + out_ptr, + num_elements, + block_size: tl.constexpr, + ): + pid = tl.program_id(axis=0) + offsets = pid * block_size + tl.arange(0, block_size) + mask = offsets < num_elements + # TODO(student): load x and y using masked tl.load calls. + # TODO(student): add the vectors. + # TODO(student): write the result with tl.store. + pass + + +def triton_vector_add(x: torch.Tensor, y: torch.Tensor, block_size: int = 1024) -> torch.Tensor: + """Student entrypoint for the Triton vector add task.""" + if not TRITON_AVAILABLE: + raise RuntimeError("Triton is not installed in this environment.") + if x.shape != y.shape: + raise ValueError(f"shape mismatch: {x.shape} vs {y.shape}") + if not x.is_cuda or not y.is_cuda: + raise ValueError("Triton kernels in this lab expect CUDA tensors.") + raise NotImplementedError("TODO(student): launch vector_add_kernel and return the output tensor.") + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..69a527e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "kernel-lab" +version = "0.1.0" +description = "Educational GPU kernel lab for PyTorch, Triton, and CUDA." 
+readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "numpy>=1.26", + "torch>=2.10", + "triton>=3.0", + "pytest>=8.0", + "packaging>=24.0", + "cmake>=3.25", + "ninja>=1.11", +] + +[tool.uv.sources] +torch = { index = "pytorch-cu128" } + +[[tool.uv.index]] +name = "aliyun-pypi" +url = "https://mirrors.aliyun.com/pypi/simple/" +default = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[tool.setuptools.packages.find] +include = ["reference*", "kernels*"] + +[tool.pytest.ini_options] +addopts = "-ra --import-mode=importlib" +testpaths = ["tests", "tasks"] +markers = [ + "reference: tests for plain PyTorch references", + "skeleton: tests validating learner skeleton behavior", + "cuda_required: tests that need CUDA", + "triton_required: tests that need Triton", +] diff --git a/reference/__init__.py b/reference/__init__.py new file mode 100644 index 0000000..1626e19 --- /dev/null +++ b/reference/__init__.py @@ -0,0 +1,2 @@ +"""Reference PyTorch implementations used throughout the lab.""" + diff --git a/reference/torch_attention.py b/reference/torch_attention.py new file mode 100644 index 0000000..0ed80a1 --- /dev/null +++ b/reference/torch_attention.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import math + +import torch + + +def torch_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + causal: bool = False, +) -> torch.Tensor: + """Reference scaled dot-product attention forward pass.""" + if q.ndim != 4 or k.ndim != 4 or v.ndim != 4: + raise ValueError("expected tensors shaped [batch, heads, seq, dim]") + if q.shape != k.shape or q.shape != v.shape: + raise ValueError(f"q, k, v must have matching shapes; got {q.shape}, {k.shape}, {v.shape}") + + dim = q.shape[-1] + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dim) + if causal: + seq = q.shape[-2] + mask = torch.triu( + torch.ones((seq, seq), dtype=torch.bool, device=q.device), diagonal=1 + ) + scores = scores.masked_fill(mask, float("-inf")) + probs = torch.softmax(scores, dim=-1) + return torch.matmul(probs, v) + diff --git a/reference/torch_matmul.py b/reference/torch_matmul.py new file mode 100644 index 0000000..5242b86 --- /dev/null +++ b/reference/torch_matmul.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import torch + + +def torch_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """Reference matrix multiplication with simple shape validation.""" + if a.ndim != 2 or b.ndim != 2: + raise ValueError("torch_matmul expects two 2D tensors") + if a.shape[1] != b.shape[0]: + raise ValueError(f"incompatible shapes: {a.shape} and {b.shape}") + return a @ b + diff --git a/reference/torch_online_softmax.py b/reference/torch_online_softmax.py new file mode 100644 index 0000000..96d8d45 --- /dev/null +++ b/reference/torch_online_softmax.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +import torch + + +def torch_online_softmax(x: torch.Tensor) -> torch.Tensor: + """Reference online-softmax derivation implemented with an explicit loop.""" + if x.ndim != 2: + raise ValueError(f"expected a 2D tensor, got shape {tuple(x.shape)}") + + running_max = torch.full( + (x.shape[0],), float("-inf"), dtype=x.dtype, device=x.device + ) + running_sum = torch.zeros((x.shape[0],), dtype=x.dtype, device=x.device) + + for col in range(x.shape[1]): + current = x[:, col] + new_max = torch.maximum(running_max, current) + old_scale = torch.exp(running_max - new_max) + current_scale = torch.exp(current - new_max) 
+ running_sum = running_sum * old_scale + current_scale + running_max = new_max + + return torch.exp(x - running_max[:, None]) / running_sum[:, None] + diff --git a/reference/torch_row_softmax.py b/reference/torch_row_softmax.py new file mode 100644 index 0000000..ecd183b --- /dev/null +++ b/reference/torch_row_softmax.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torch + + +def torch_row_softmax(x: torch.Tensor) -> torch.Tensor: + """Numerically stable row-wise softmax for 2D inputs.""" + if x.ndim != 2: + raise ValueError(f"expected a 2D tensor, got shape {tuple(x.shape)}") + row_max = x.max(dim=1, keepdim=True).values + shifted = x - row_max + exp_shifted = shifted.exp() + row_sum = exp_shifted.sum(dim=1, keepdim=True) + return exp_shifted / row_sum + diff --git a/reference/torch_vector_add.py b/reference/torch_vector_add.py new file mode 100644 index 0000000..0924224 --- /dev/null +++ b/reference/torch_vector_add.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +import torch + + +def torch_vector_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """Reference vector add with explicit shape checks.""" + if x.shape != y.shape: + raise ValueError(f"shape mismatch: {x.shape} vs {y.shape}") + return x + y + diff --git a/tasks/00_env_sanity/__init__.py b/tasks/00_env_sanity/__init__.py new file mode 100644 index 0000000..247e6fc --- /dev/null +++ b/tasks/00_env_sanity/__init__.py @@ -0,0 +1,2 @@ +"""Environment sanity task.""" + diff --git a/tasks/00_env_sanity/checklist.md b/tasks/00_env_sanity/checklist.md new file mode 100644 index 0000000..fa5a415 --- /dev/null +++ b/tasks/00_env_sanity/checklist.md @@ -0,0 +1,13 @@ +# Environment Checklist + +- PyTorch imports successfully +- `torch.cuda.is_available()` is `True` +- At least one CUDA device is visible +- The GPU name matches the machine you expect to be using +- Device capability is printed and recorded +- Triton imports successfully, or you know why it does not +- `torch.version.cuda` is visible when using CUDA-enabled PyTorch +- `nvcc --version` works if you plan to build the CUDA extension +- `nvidia-smi` works if the driver stack is installed + +If any line above fails, fix that before working on later tasks. diff --git a/tasks/00_env_sanity/spec.md b/tasks/00_env_sanity/spec.md new file mode 100644 index 0000000..9810069 --- /dev/null +++ b/tasks/00_env_sanity/spec.md @@ -0,0 +1,46 @@ +# Task 00: Environment Sanity + +## 1. Problem Statement + +Confirm that your machine can see the GPU software stack needed for the rest of the lab. + +## 2. Expected Input/Output Shapes + +This task is informational rather than tensor-shaped. The outputs are environment facts: + +- PyTorch version +- CUDA availability +- Triton import status +- GPU name +- device capability +- toolkit and driver hints when available + +## 3. Performance Intuition + +Do not benchmark anything yet. First confirm that the environment is what you think it is. + +## 4. Memory Access Discussion + +Not applicable yet. The point is to avoid debugging kernels when the real problem is a mismatched driver or toolkit. + +## 5. What Triton Is Abstracting + +Even importing Triton depends on a compatible Python, PyTorch, driver, and GPU stack. + +## 6. What CUDA Makes Explicit + +CUDA makes the toolkit and architecture targeting explicit. Keep that explicit throughout this repo. + +## 7. Reflection Questions + +- What exact GPU name does the system report? +- What device capability does PyTorch report? +- Does Triton import cleanly? 
+- Which part of the stack would you inspect first if CUDA is unavailable? + +## 8. Implementation Checklist + +- Run `python tools/check_env.py` +- Run `python tools/print_device_info.py` +- Write down the reported capability +- Set `KERNEL_LAB_CUDA_ARCH` explicitly if you need to change architecture targeting diff --git a/tasks/01_vector_add/__init__.py b/tasks/01_vector_add/__init__.py new file mode 100644 index 0000000..9a154c3 --- /dev/null +++ b/tasks/01_vector_add/__init__.py @@ -0,0 +1,2 @@ +"""Vector add task.""" + diff --git a/tasks/01_vector_add/bench.py b/tasks/01_vector_add/bench.py new file mode 100644 index 0000000..6d7f9c8 --- /dev/null +++ b/tasks/01_vector_add/bench.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.vector_add import triton_vector_add +from reference.torch_vector_add import torch_vector_add + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + x = torch.randn(1 << 20, device=device) + y = torch.randn(1 << 20, device=device) + ref_ms = benchmark(torch_vector_add, x, y) + print(f"torch_vector_add: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_vector_add, x, y) + print(f"triton_vector_add: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_vector_add: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/01_vector_add/cuda_skeleton.cu b/tasks/01_vector_add/cuda_skeleton.cu new file mode 100644 index 0000000..736f4cf --- /dev/null +++ b/tasks/01_vector_add/cuda_skeleton.cu @@ -0,0 +1,10 @@ +// Workbook-local CUDA sketch for vector add. +// +// The repository-level implementation lives in kernels/cuda/src/vector_add.cu. +// Read this side by side with the Triton version. + +// TODO(student): +// 1. Compute global_idx from blockIdx.x, blockDim.x, and threadIdx.x. +// 2. Guard the tail with if (global_idx < numel). +// 3. Load x[global_idx] and y[global_idx]. +// 4. Store the sum. diff --git a/tasks/01_vector_add/spec.md b/tasks/01_vector_add/spec.md new file mode 100644 index 0000000..4aaa336 --- /dev/null +++ b/tasks/01_vector_add/spec.md @@ -0,0 +1,40 @@ +# Task 01: Vector Add + +## 1. Problem Statement + +Implement `out[i] = x[i] + y[i]` in both Triton and CUDA, then compare both against the PyTorch reference. + +## 2. Expected Input/Output Shapes + +- Input: two tensors with identical 1D or flattened shapes +- Output: one tensor with the same shape + +## 3. Performance Intuition + +Vector add is simple enough that launch overhead and memory bandwidth dominate quickly. It is a good place to learn indexing before the math becomes interesting. + +## 4. Memory Access Discussion + +This kernel should read `x[i]` and `y[i]` once and write `out[i]` once. The main thing to inspect is whether neighboring threads or lanes access neighboring elements. + +## 5. 
What Triton Is Abstracting + +Triton lets you express one block of contiguous offsets with `program_id` and `tl.arange`, then apply a mask on the tail. + +## 6. What CUDA Makes Explicit + +CUDA makes you compute `global_idx` from block and thread indices yourself and write the boundary check explicitly. + +## 7. Reflection Questions + +- What is the exact correspondence between `program_id` and `blockIdx.x` here? +- Why is a mask or bounds check required on the final block? +- How would the ownership change if one thread handled multiple elements? + +## 8. Implementation Checklist + +- Confirm the reference implementation +- Fill in the Triton masked loads, add, and store +- Fill in the CUDA thread ownership and store +- Test small and non-multiple-of-block-size shapes +- Benchmark bandwidth on larger vectors diff --git a/tasks/01_vector_add/test_task.py b/tasks/01_vector_add/test_task.py new file mode 100644 index 0000000..36f1289 --- /dev/null +++ b/tasks/01_vector_add/test_task.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.vector_add import triton_vector_add +from reference.torch_vector_add import torch_vector_add + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_vector_add_reference_matches_torch(): + x = torch.randn(257) + y = torch.randn(257) + out = torch_vector_add(x, y) + torch.testing.assert_close(out, x + y) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_vector_add_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + x = torch.randn(513, device="cuda") + y = torch.randn(513, device="cuda") + out = _run_impl_or_skip(triton_vector_add, x, y) + torch.testing.assert_close(out, x + y) diff --git a/tasks/01_vector_add/triton_skeleton.py b/tasks/01_vector_add/triton_skeleton.py new file mode 100644 index 0000000..661d0a8 --- /dev/null +++ b/tasks/01_vector_add/triton_skeleton.py @@ -0,0 +1,19 @@ +"""Workbook-local Triton sketch for vector add. + +The repository-level implementation lives in kernels/triton/vector_add.py. +Use this file as a short-form scratchpad before editing the real kernel. +""" + + +def notes() -> str: + return """ +TODO(student): +1. Map one Triton program instance to one contiguous block of elements. +2. Compute offsets with pid * BLOCK_SIZE + arange. +3. Mask the tail. +4. Load x and y, add them, store the result. 
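+
+A minimal sketch of the kernel body, assuming placeholder names such as
+x_ptr, y_ptr, out_ptr, n_elements, and BLOCK_SIZE (the real signature in
+kernels/triton/vector_add.py may differ):
+
+    pid = tl.program_id(axis=0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    tl.store(out_ptr + offsets, x + y, mask=mask)
+
+Each program instance owns BLOCK_SIZE contiguous elements, the same
+ownership CUDA spells out as blockIdx.x * blockDim.x + threadIdx.x.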
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/02_row_softmax/__init__.py b/tasks/02_row_softmax/__init__.py new file mode 100644 index 0000000..2440342 --- /dev/null +++ b/tasks/02_row_softmax/__init__.py @@ -0,0 +1,2 @@ +"""Row softmax task.""" + diff --git a/tasks/02_row_softmax/bench.py b/tasks/02_row_softmax/bench.py new file mode 100644 index 0000000..ce272e8 --- /dev/null +++ b/tasks/02_row_softmax/bench.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.row_softmax import triton_row_softmax +from reference.torch_row_softmax import torch_row_softmax + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + x = torch.randn(4096, 1024, device=device) + ref_ms = benchmark(torch_row_softmax, x) + print(f"torch_row_softmax: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_row_softmax, x) + print(f"triton_row_softmax: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_row_softmax: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/02_row_softmax/cuda_skeleton.cu b/tasks/02_row_softmax/cuda_skeleton.cu new file mode 100644 index 0000000..bc1529a --- /dev/null +++ b/tasks/02_row_softmax/cuda_skeleton.cu @@ -0,0 +1,11 @@ +// Workbook-local CUDA sketch for row softmax. +// +// Reflection prompt: +// Softmax is usually bandwidth-bound because the math is cheap but the rows are read and written a lot. +// Keep track of how many global-memory passes your implementation needs. + +// TODO(student): +// 1. Assign one block or block tile to a row. +// 2. Compute the row max. +// 3. Compute the sum of exp(x - row_max). +// 4. Normalize the row. diff --git a/tasks/02_row_softmax/spec.md b/tasks/02_row_softmax/spec.md new file mode 100644 index 0000000..bfaec7d --- /dev/null +++ b/tasks/02_row_softmax/spec.md @@ -0,0 +1,40 @@ +# Task 02: Row Softmax + +## 1. Problem Statement + +Implement a row-wise softmax with numerical stability and compare naive and fused viewpoints. + +## 2. Expected Input/Output Shapes + +- Input: a 2D tensor `[num_rows, num_cols]` +- Output: a 2D tensor with the same shape + +## 3. Performance Intuition + +Softmax is often bandwidth-bound because each element is read several times unless you fuse work carefully. The arithmetic is cheap relative to the data movement. + +## 4. Memory Access Discussion + +A naive implementation may read rows multiple times: once for the max, once for the sum of exponentials, and once for normalization. Think about which intermediate values can stay on chip. + +## 5. What Triton Is Abstracting + +Triton makes it easy to load a row block, apply masked operations, and reduce across the block with tensor-style code. + +## 6. 
What CUDA Makes Explicit + +CUDA forces you to decide where the row reduction lives: one block per row, multiple warps per row, or a tiled strategy. Shared-memory use and synchronization become explicit design choices. + +## 7. Reflection Questions + +- Why is max subtraction required for stable softmax? +- Why is softmax often bandwidth-bound rather than compute-bound? +- Which intermediate quantities would you prefer not to write back to global memory? + +## 8. Implementation Checklist + +- Validate the reference row softmax +- Fill in Triton row loading, max reduction, sum reduction, and normalization +- Fill in the CUDA reduction structure +- Test large positive and negative values +- Compare against `torch.softmax` diff --git a/tasks/02_row_softmax/test_task.py b/tasks/02_row_softmax/test_task.py new file mode 100644 index 0000000..d369508 --- /dev/null +++ b/tasks/02_row_softmax/test_task.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.row_softmax import triton_row_softmax +from reference.torch_row_softmax import torch_row_softmax + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_row_softmax_reference_matches_torch(): + x = torch.randn(8, 17) + out = torch_row_softmax(x) + torch.testing.assert_close(out, torch.softmax(x, dim=1)) + + +@pytest.mark.reference +def test_row_softmax_reference_is_numerically_stable(): + x = torch.tensor([[1000.0, 1001.0, 1002.0], [-1000.0, -999.0, -998.0]]) + out = torch_row_softmax(x) + torch.testing.assert_close(out.sum(dim=1), torch.ones(2), atol=1e-6, rtol=1e-6) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_row_softmax_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + x = torch.randn(16, 63, device="cuda") + out = _run_impl_or_skip(triton_row_softmax, x) + torch.testing.assert_close(out, torch.softmax(x, dim=1), atol=1e-4, rtol=1e-4) diff --git a/tasks/02_row_softmax/triton_skeleton.py b/tasks/02_row_softmax/triton_skeleton.py new file mode 100644 index 0000000..37e714a --- /dev/null +++ b/tasks/02_row_softmax/triton_skeleton.py @@ -0,0 +1,20 @@ +"""Workbook-local Triton notes for row softmax.""" + + +def notes() -> str: + return """ +TODO(student): +1. Decide what one program instance owns: a whole row or a row tile. +2. Load a row with masking. +3. Compute row_max = max(x). +4. Compute exp(x - row_max), then the row sum. +5. Normalize and store. + +Reflection: +- Why does numerical stability matter here more than in vector add? +- Where does extra memory traffic appear in a naive multi-kernel approach? 
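+
+A whole-row sketch, assuming one program per row and a power-of-two
+BLOCK_SIZE >= num_cols (x_ptr, out_ptr, and row_stride are placeholders,
+not the repo's actual kernel signature):
+
+    row = tl.program_id(axis=0)
+    cols = tl.arange(0, BLOCK_SIZE)
+    mask = cols < num_cols
+    x = tl.load(x_ptr + row * row_stride + cols, mask=mask, other=float("-inf"))
+    x = x - tl.max(x, axis=0)
+    num = tl.exp(x)
+    tl.store(out_ptr + row * row_stride + cols, num / tl.sum(num, axis=0), mask=mask)
+
+Padding lanes load -inf, so they turn into zeros after exp and never
+pollute the row sum.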
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/03_tiled_matmul/__init__.py b/tasks/03_tiled_matmul/__init__.py new file mode 100644 index 0000000..a15c94f --- /dev/null +++ b/tasks/03_tiled_matmul/__init__.py @@ -0,0 +1,2 @@ +"""Tiled matmul task.""" + diff --git a/tasks/03_tiled_matmul/bench.py b/tasks/03_tiled_matmul/bench.py new file mode 100644 index 0000000..184b5bc --- /dev/null +++ b/tasks/03_tiled_matmul/bench.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.tiled_matmul import triton_tiled_matmul +from reference.torch_matmul import torch_matmul + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + for m, k, n in [(128, 128, 128), (512, 512, 512)]: + a = torch.randn(m, k, device=device) + b = torch.randn(k, n, device=device) + ref_ms = benchmark(torch_matmul, a, b) + print(f"torch_matmul {m}x{k}x{n}: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_tiled_matmul, a, b) + print(f"triton_tiled_matmul {m}x{k}x{n}: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_tiled_matmul {m}x{k}x{n}: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/03_tiled_matmul/cuda_skeleton.cu b/tasks/03_tiled_matmul/cuda_skeleton.cu new file mode 100644 index 0000000..73da134 --- /dev/null +++ b/tasks/03_tiled_matmul/cuda_skeleton.cu @@ -0,0 +1,9 @@ +// Workbook-local CUDA sketch for tiled matmul. +// +// TODO(student): +// 1. Choose a block tile size, for example 16x16 or 32x32. +// 2. Load one A tile and one B tile into shared memory. +// 3. Synchronize. +// 4. Accumulate partial products. +// 5. Synchronize before loading the next tile. +// 6. Store the final C element or tile. diff --git a/tasks/03_tiled_matmul/spec.md b/tasks/03_tiled_matmul/spec.md new file mode 100644 index 0000000..898339a --- /dev/null +++ b/tasks/03_tiled_matmul/spec.md @@ -0,0 +1,51 @@ +# Task 03: Tiled Matmul + +## 1. Problem Statement + +Implement a tiled matrix multiplication and compare the tile abstraction in Triton with the explicit shared-memory strategy in CUDA. + +## 2. Expected Input/Output Shapes + +- Input `A`: `[M, K]` +- Input `B`: `[K, N]` +- Output `C`: `[M, N]` + +## 3. Performance Intuition + +Matmul becomes interesting once data reuse matters. Re-reading the same `A` and `B` values from global memory is expensive; tiling exists to reuse those values across many multiply-accumulate operations. + +## 4. Memory Access Discussion + +Think about which `A` tile and `B` tile each work unit needs. The performance win comes from moving those tiles into on-chip storage and reusing them before fetching the next tile. + +## 5. What Triton Is Abstracting + +Triton lets you think in output tiles and blocked pointer arithmetic. The tile loads and accumulations read like tensor operations. + +## 6. 
What CUDA Makes Explicit + +CUDA makes you choose block dimensions, allocate shared memory, manage cooperative loads, and synchronize between load and compute phases. + +## 7. Reflection Questions + +- Which values in `A` and `B` are reused across multiple output elements? +- Why does tiling reduce global-memory traffic? +- How does a Triton tile map to CUDA shared-memory tiles and threads? + +## 8. Implementation Checklist + +- Confirm the reference matmul +- Draw a block/tile diagram before coding +- Implement the Triton tile loop over `K` +- Implement the CUDA shared-memory tile loop +- Benchmark against `torch.matmul` on small and medium sizes + +## Tile Diagram Prompt + +Sketch: + +- one output tile `C[m0:m1, n0:n1]` +- the matching `A[m0:m1, k0:k1]` +- the matching `B[k0:k1, n0:n1]` + +That sketch should tell you what belongs in shared memory. diff --git a/tasks/03_tiled_matmul/test_task.py b/tasks/03_tiled_matmul/test_task.py new file mode 100644 index 0000000..a066714 --- /dev/null +++ b/tasks/03_tiled_matmul/test_task.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.tiled_matmul import triton_tiled_matmul +from reference.torch_matmul import torch_matmul + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_tiled_matmul_reference_matches_torch(): + a = torch.randn(8, 16) + b = torch.randn(16, 12) + out = torch_matmul(a, b) + torch.testing.assert_close(out, a @ b) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_tiled_matmul_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + a = torch.randn(32, 48, device="cuda") + b = torch.randn(48, 40, device="cuda") + out = _run_impl_or_skip(triton_tiled_matmul, a, b) + torch.testing.assert_close(out, a @ b, atol=1e-3, rtol=1e-3) diff --git a/tasks/03_tiled_matmul/triton_skeleton.py b/tasks/03_tiled_matmul/triton_skeleton.py new file mode 100644 index 0000000..da70002 --- /dev/null +++ b/tasks/03_tiled_matmul/triton_skeleton.py @@ -0,0 +1,16 @@ +"""Workbook-local Triton notes for tiled matmul.""" + + +def notes() -> str: + return """ +TODO(student): +1. Map one program instance to one output tile. +2. Build row/col offsets for the tile. +3. Loop over K in block_k chunks. +4. Load A and B tiles, accumulate partial products. +5. Store the output tile with masking on edges. 
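+
+The K loop in shape only (BLOCK_M/BLOCK_N/BLOCK_K, the strides, and
+c_ptrs/c_mask are placeholders; edge masking on the A/B tile loads is
+omitted for brevity):
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k0 in range(0, K, BLOCK_K):
+        a = tl.load(a_ptrs)  # [BLOCK_M, BLOCK_K] tile of A
+        b = tl.load(b_ptrs)  # [BLOCK_K, BLOCK_N] tile of B
+        acc += tl.dot(a, b)
+        a_ptrs += BLOCK_K * stride_ak
+        b_ptrs += BLOCK_K * stride_bk
+    tl.store(c_ptrs, acc, mask=c_mask)
+
+acc stays in registers for the entire K loop; that residency is the data
+reuse the CUDA version has to buy explicitly with shared-memory tiles.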
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/04_online_softmax/__init__.py b/tasks/04_online_softmax/__init__.py new file mode 100644 index 0000000..3e500cf --- /dev/null +++ b/tasks/04_online_softmax/__init__.py @@ -0,0 +1,2 @@ +"""Online softmax task.""" + diff --git a/tasks/04_online_softmax/bench.py b/tasks/04_online_softmax/bench.py new file mode 100644 index 0000000..8cf1057 --- /dev/null +++ b/tasks/04_online_softmax/bench.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.online_softmax import triton_online_softmax +from reference.torch_online_softmax import torch_online_softmax + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 25) -> float: + for _ in range(warmup): + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + x = torch.randn(2048, 2048, device=device) + ref_ms = benchmark(torch_online_softmax, x) + print(f"torch_online_softmax: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_online_softmax, x) + print(f"triton_online_softmax: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_online_softmax: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/04_online_softmax/cuda_skeleton.cu b/tasks/04_online_softmax/cuda_skeleton.cu new file mode 100644 index 0000000..3f8b431 --- /dev/null +++ b/tasks/04_online_softmax/cuda_skeleton.cu @@ -0,0 +1,7 @@ +// Workbook-local CUDA sketch for online softmax. +// +// TODO(student): +// 1. Choose how one block owns one row or row tile. +// 2. Keep running_max and running_sum across column tiles. +// 3. Update the recurrence carefully for numerical stability. +// 4. Normalize the final row. diff --git a/tasks/04_online_softmax/spec.md b/tasks/04_online_softmax/spec.md new file mode 100644 index 0000000..c1ea4a5 --- /dev/null +++ b/tasks/04_online_softmax/spec.md @@ -0,0 +1,49 @@ +# Task 04: Online Softmax + +## 1. Problem Statement + +Implement the running max / running sum formulation of softmax and connect it to blockwise attention. + +## 2. Expected Input/Output Shapes + +- Input: `[num_rows, num_cols]` +- Output: `[num_rows, num_cols]` + +## 3. Performance Intuition + +The main goal is algorithmic structure rather than raw speed. Online softmax becomes powerful because it lets you process a row incrementally without materializing the full reduction context at once. + +## 4. Memory Access Discussion + +Think in column tiles. Each tile updates the running normalization state. This matters later when attention scores are processed block by block. + +## 5. What Triton Is Abstracting + +Triton can express the blocked recurrence with vectorized loads and tensor math while still letting you reason about per-row state. + +## 6. What CUDA Makes Explicit + +CUDA forces you to decide where the running max and running sum live and how threads cooperate to update them across tiles. + +## 7. 
Reflection Questions + +- Why is a running max needed instead of only a running sum? +- Why does online softmax enable FlashAttention-style blockwise computation? +- Which values must persist from one tile to the next? + +## 8. Implementation Checklist + +- Read the reference online softmax +- Derive the recurrence informally +- Implement the Triton blocked recurrence +- Implement the CUDA blocked recurrence +- Compare against full softmax on small shapes first + +## Informal Recurrence + +Given a previous state `(m_prev, l_prev)` and a new tile with max `m_tile` and denominator contribution `l_tile`, define: + +- `m_new = max(m_prev, m_tile)` +- `l_new = l_prev * exp(m_prev - m_new) + l_tile * exp(m_tile - m_new)` + +That is the key idea you will reuse in FlashAttention. diff --git a/tasks/04_online_softmax/test_task.py b/tasks/04_online_softmax/test_task.py new file mode 100644 index 0000000..a6d0899 --- /dev/null +++ b/tasks/04_online_softmax/test_task.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.online_softmax import triton_online_softmax +from reference.torch_online_softmax import torch_online_softmax + + +def _run_impl_or_skip(fn, *args): + try: + return fn(*args) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_online_softmax_reference_matches_torch(): + x = torch.randn(6, 19) + out = torch_online_softmax(x) + torch.testing.assert_close(out, torch.softmax(x, dim=1), atol=1e-5, rtol=1e-5) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_online_softmax_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + x = torch.randn(8, 97, device="cuda") + out = _run_impl_or_skip(triton_online_softmax, x) + torch.testing.assert_close(out, torch.softmax(x, dim=1), atol=1e-4, rtol=1e-4) diff --git a/tasks/04_online_softmax/triton_skeleton.py b/tasks/04_online_softmax/triton_skeleton.py new file mode 100644 index 0000000..03d42cc --- /dev/null +++ b/tasks/04_online_softmax/triton_skeleton.py @@ -0,0 +1,15 @@ +"""Workbook-local Triton notes for online softmax.""" + + +def notes() -> str: + return """ +TODO(student): +1. Keep running_max and running_sum for one row. +2. Process the row in blocks. +3. Update the recurrence after each block. +4. Normalize once the full row has been seen. 
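+
+A blocked restatement of the reference loop in plain PyTorch (BLOCK is a
+placeholder tile width; m and l are exactly the state the Triton kernel
+must carry across column blocks):
+
+    m = torch.full((num_rows,), float("-inf"))
+    l = torch.zeros(num_rows)
+    for x_blk in x.split(BLOCK, dim=1):
+        m_new = torch.maximum(m, x_blk.max(dim=1).values)
+        l = l * torch.exp(m - m_new) + torch.exp(x_blk - m_new[:, None]).sum(dim=1)
+        m = m_new
+    out = torch.exp(x - m[:, None]) / l[:, None]
+
+Only m and l persist from one block to the next; everything else can be
+recomputed per tile.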
+""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/05_flash_attention_fwd/__init__.py b/tasks/05_flash_attention_fwd/__init__.py new file mode 100644 index 0000000..1ccc440 --- /dev/null +++ b/tasks/05_flash_attention_fwd/__init__.py @@ -0,0 +1,2 @@ +"""Flash attention forward task.""" + diff --git a/tasks/05_flash_attention_fwd/bench.py b/tasks/05_flash_attention_fwd/bench.py new file mode 100644 index 0000000..d4a95f5 --- /dev/null +++ b/tasks/05_flash_attention_fwd/bench.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import statistics +import sys +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from reference.torch_attention import torch_attention + + +def benchmark(fn, *args, warmup: int = 5, reps: int = 20, **kwargs) -> float: + for _ in range(warmup): + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms = [] + for _ in range(reps): + if args[0].is_cuda: + torch.cuda.synchronize() + start = time.perf_counter() + fn(*args, **kwargs) + if args[0].is_cuda: + torch.cuda.synchronize() + times_ms.append((time.perf_counter() - start) * 1e3) + return statistics.median(times_ms) + + +def main() -> None: + device = "cuda" if torch.cuda.is_available() else "cpu" + q = torch.randn(2, 8, 128, 64, device=device) + k = torch.randn(2, 8, 128, 64, device=device) + v = torch.randn(2, 8, 128, 64, device=device) + ref_ms = benchmark(torch_attention, q, k, v, causal=False) + print(f"torch_attention: {ref_ms:.3f} ms") + if device == "cuda": + try: + triton_ms = benchmark(triton_flash_attention_fwd, q, k, v, causal=False) + print(f"triton_flash_attention_fwd: {triton_ms:.3f} ms") + except (NotImplementedError, RuntimeError) as exc: + print(f"triton_flash_attention_fwd: skipped ({exc})") + + +if __name__ == "__main__": + main() diff --git a/tasks/05_flash_attention_fwd/cuda_skeleton.cu b/tasks/05_flash_attention_fwd/cuda_skeleton.cu new file mode 100644 index 0000000..9db6de2 --- /dev/null +++ b/tasks/05_flash_attention_fwd/cuda_skeleton.cu @@ -0,0 +1,14 @@ +// Workbook-local CUDA sketch for FlashAttention forward. +// +// Map this against the Triton sketch: +// - Triton program_id for query tile -> CUDA block ownership +// - Triton block pointer loads -> CUDA cooperative global-to-shared loads +// - Triton masks -> explicit edge and causal checks +// - Triton implicit block math -> thread/block index arithmetic + +// TODO(student): +// 1. Assign a block to one batch/head/query tile. +// 2. Load a Q tile and loop over K/V tiles. +// 3. Compute score tiles and causal masking. +// 4. Update online softmax state. +// 5. Accumulate the output tile. diff --git a/tasks/05_flash_attention_fwd/spec.md b/tasks/05_flash_attention_fwd/spec.md new file mode 100644 index 0000000..b26687e --- /dev/null +++ b/tasks/05_flash_attention_fwd/spec.md @@ -0,0 +1,59 @@ +# Task 05: Flash Attention Forward + +## 1. Problem Statement + +Implement a learning-oriented forward-only FlashAttention-style kernel in both Triton and CUDA. + +## 2. Expected Input/Output Shapes + +- `Q`: `[batch, heads, seq_len, head_dim]` +- `K`: `[batch, heads, seq_len, head_dim]` +- `V`: `[batch, heads, seq_len, head_dim]` +- `Output`: `[batch, heads, seq_len, head_dim]` + +## 3. Performance Intuition + +The goal is to reduce memory traffic by avoiding full materialization of the score matrix. 
Correctness comes first. Performance work only matters after the blockwise algorithm is correct. + +## 4. Memory Access Discussion + +This task is about staged movement: + +- load a `Q` block +- iterate over `K` and `V` blocks +- compute score blocks +- update online normalization +- accumulate the output block + +Track where each quantity lives: global memory, registers, or shared memory. + +## 5. What Triton Is Abstracting + +Triton makes block pointers, program IDs, and masked block operations compact. Those abstractions still correspond to explicit memory ownership decisions. + +## 6. What CUDA Makes Explicit + +CUDA exposes thread-block mapping, shared-memory staging, synchronization, and reduction details directly. This is where the same algorithm becomes visibly lower level. + +## 7. Reflection Questions + +- How does online softmax avoid writing out the full score matrix? +- Which loop corresponds to iterating over key/value blocks? +- Where do causal masking and normalization interact? +- How does a Triton block pointer map to a CUDA shared-memory load phase? + +## 8. Implementation Checklist + +- Confirm the PyTorch reference on tiny shapes +- Trace the online softmax state update +- Implement one Triton blockwise forward path +- Implement one CUDA blockwise forward path +- Test non-causal first, then causal +- Benchmark only after small-shape correctness passes + +## Explicit Triton To CUDA Mapping + +- Triton `program_id(axis=0)` for query tiles maps to CUDA query-tile block ownership +- Triton `program_id(axis=1)` for batch/head maps to a flattened batch-head block index +- Triton block pointer math maps to shared-memory staging and pointer arithmetic +- Triton masked edge handling maps to explicit tail checks and mask branches diff --git a/tasks/05_flash_attention_fwd/test_task.py b/tasks/05_flash_attention_fwd/test_task.py new file mode 100644 index 0000000..3e4d563 --- /dev/null +++ b/tasks/05_flash_attention_fwd/test_task.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import pytest +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from reference.torch_attention import torch_attention + + +def _run_impl_or_skip(fn, *args, **kwargs): + try: + return fn(*args, **kwargs) + except NotImplementedError: + pytest.skip("implementation is still TODO") + except RuntimeError as exc: + pytest.skip(str(exc)) + + +@pytest.mark.reference +def test_attention_reference_small_shape(): + q = torch.randn(1, 2, 8, 16) + k = torch.randn(1, 2, 8, 16) + v = torch.randn(1, 2, 8, 16) + out = torch_attention(q, k, v, causal=False) + expected = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=False) + torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) + + +@pytest.mark.reference +def test_attention_reference_causal_small_shape(): + q = torch.randn(1, 1, 8, 16) + k = torch.randn(1, 1, 8, 16) + v = torch.randn(1, 1, 8, 16) + out = torch_attention(q, k, v, causal=True) + expected = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True) + torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) + + +@pytest.mark.triton_required +@pytest.mark.skeleton +def test_triton_flash_attention_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + q = torch.randn(1, 2, 16, 32, device="cuda") + k = torch.randn(1, 2, 16, 32, device="cuda") + v = torch.randn(1, 2, 16, 32, device="cuda") + out = _run_impl_or_skip(triton_flash_attention_fwd, q, k, v, causal=False) + 
expected = torch_attention(q, k, v, causal=False) + torch.testing.assert_close(out, expected, atol=2e-3, rtol=2e-3) diff --git a/tasks/05_flash_attention_fwd/triton_skeleton.py b/tasks/05_flash_attention_fwd/triton_skeleton.py new file mode 100644 index 0000000..13b3c7f --- /dev/null +++ b/tasks/05_flash_attention_fwd/triton_skeleton.py @@ -0,0 +1,19 @@ +"""Workbook-local Triton notes for FlashAttention forward.""" + + +def notes() -> str: + return """ +TODO(student): +1. Assign one program instance to one query block for one batch/head. +2. Load a Q block. +3. Iterate over K/V blocks. +4. Compute score blocks. +5. Apply optional causal masking. +6. Update running max and running sum. +7. Accumulate the output block. +8. Store the final output. +""" + + +if __name__ == "__main__": + print(notes()) diff --git a/tasks/06_pytorch_custom_op/__init__.py b/tasks/06_pytorch_custom_op/__init__.py new file mode 100644 index 0000000..42c05ea --- /dev/null +++ b/tasks/06_pytorch_custom_op/__init__.py @@ -0,0 +1,2 @@ +"""PyTorch custom op task.""" + diff --git a/tasks/06_pytorch_custom_op/extension_skeleton.py b/tasks/06_pytorch_custom_op/extension_skeleton.py new file mode 100644 index 0000000..9a4841d --- /dev/null +++ b/tasks/06_pytorch_custom_op/extension_skeleton.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from tools.lab_extension import build_extension + + +def main() -> None: + ext = build_extension(verbose=True) + if ext is None: + return + print("Extension loaded.") + print("Available torch.ops namespace:", hasattr(torch.ops, "kernel_lab")) + if hasattr(torch.ops, "kernel_lab"): + print("Registered ops:", dir(torch.ops.kernel_lab)) + + +if __name__ == "__main__": + main() diff --git a/tasks/06_pytorch_custom_op/opcheck_test.py b/tasks/06_pytorch_custom_op/opcheck_test.py new file mode 100644 index 0000000..b4e66be --- /dev/null +++ b/tasks/06_pytorch_custom_op/opcheck_test.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import pytest +import torch + +from tools.lab_extension import build_extension + + +@pytest.mark.cuda_required +@pytest.mark.skeleton +def test_vector_add_opcheck_if_available(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + ext = build_extension(verbose=False) + if ext is None or not hasattr(torch.ops, "kernel_lab"): + pytest.skip("extension is unavailable") + if not hasattr(torch.library, "opcheck"): + pytest.skip("torch.library.opcheck is unavailable") + + x = torch.randn(32, device="cuda") + y = torch.randn(32, device="cuda") + try: + torch.ops.kernel_lab.vector_add(x, y) + except Exception as exc: + pytest.skip(f"operator is not implemented yet: {exc}") + + torch.library.opcheck(torch.ops.kernel_lab.vector_add, (x, y)) diff --git a/tasks/06_pytorch_custom_op/spec.md b/tasks/06_pytorch_custom_op/spec.md new file mode 100644 index 0000000..f93414a --- /dev/null +++ b/tasks/06_pytorch_custom_op/spec.md @@ -0,0 +1,45 @@ +# Task 06: PyTorch Custom Op + +## 1. Problem Statement + +Expose a CUDA kernel as a PyTorch operator so Python code can call it and test it like any other operator. + +## 2. Expected Input/Output Shapes + +For the starter binding, use vector add: + +- `x`: `[N]` +- `y`: `[N]` +- output: `[N]` + +The same pattern can later be extended to the other operators. + +## 3. 
Performance Intuition + +The binding layer is not usually where the kernel time goes, but it determines whether you can test, benchmark, and profile the CUDA implementation from Python. + +## 4. Memory Access Discussion + +The binding itself does not optimize memory traffic; it passes tensors and dispatches the kernel. Still, the binding must preserve shape, dtype, device, and contiguity assumptions. + +## 5. What Triton Is Abstracting + +Triton often avoids a separate C++ binding layer because Python can launch the JIT kernel directly. + +## 6. What CUDA Makes Explicit + +CUDA plus PyTorch binding requires you to define function signatures, operator registration, and build integration explicitly. + +## 7. Reflection Questions + +- What assumptions should the binding validate before calling a CUDA kernel? +- Why is operator registration useful for testing and benchmarking? +- What changes once you want autograd support? + +## 8. Implementation Checklist + +- Read `kernels/cuda/binding/binding.cpp` +- Build or load the extension from Python +- Call the operator from `torch.ops.kernel_lab` +- Add correctness checks once the CUDA kernel is implemented +- Try `torch.library.opcheck` if your PyTorch build provides it diff --git a/tasks/07_profiling/__init__.py b/tasks/07_profiling/__init__.py new file mode 100644 index 0000000..83fe414 --- /dev/null +++ b/tasks/07_profiling/__init__.py @@ -0,0 +1 @@ +"""Profiling task.""" diff --git a/tasks/07_profiling/profile_examples.md b/tasks/07_profiling/profile_examples.md new file mode 100644 index 0000000..f46ee79 --- /dev/null +++ b/tasks/07_profiling/profile_examples.md @@ -0,0 +1,23 @@ +# Profiling Examples + +## Nsight Compute + +```bash +./tools/profile_ncu.sh python bench/bench_vector_add.py --device cuda --mode triton +./tools/profile_ncu.sh python bench/bench_softmax.py --device cuda --mode torch +``` + +## Nsight Systems + +```bash +./tools/profile_nsys.sh python bench/bench_matmul.py --device cuda --mode triton +./tools/profile_nsys.sh python bench/bench_attention.py --device cuda --mode torch +``` + +## First Things To Inspect + +- median runtime from the benchmark harness +- whether warmup was excluded +- whether kernels overlap or serialize +- whether memory throughput is near a practical ceiling +- whether a kernel launch is tiny enough that launch overhead matters diff --git a/tasks/07_profiling/spec.md b/tasks/07_profiling/spec.md new file mode 100644 index 0000000..c371ced --- /dev/null +++ b/tasks/07_profiling/spec.md @@ -0,0 +1,40 @@ +# Task 07: Profiling + +## 1. Problem Statement + +Profile one kernel at a time and learn to interpret the first few metrics before tuning anything. + +## 2. Expected Input/Output Shapes + +Use the same shapes as your benchmark harness so measurements stay comparable. + +## 3. Performance Intuition + +Profiling is how you turn guesses into evidence. Use it after correctness is established. + +## 4. Memory Access Discussion + +Profilers can tell you whether the kernel is limited by memory throughput, occupancy, or something else. Interpret those numbers in terms of the operator's access pattern. + +## 5. What Triton Is Abstracting + +Triton hides low-level details in code, but profilers still show the resulting kernels and hardware behavior. + +## 6. What CUDA Makes Explicit + +CUDA kernels expose their launch shapes, synchronization behavior, and memory hierarchy choices more directly, which can make profiler results easier to map back to code. + +## 7. 
Reflection Questions + +- Did you profile a single kernel or an entire script? +- Did you warm up before timing? +- Which metric was the first signal that the kernel was bandwidth-bound or compute-bound? + +## 8. Implementation Checklist + +- Pick one benchmark and one implementation +- Warm up first +- Synchronize before and after timing +- Run `ncu` and inspect a small set of metrics +- Run `nsys` and inspect the timeline +- Write down what you learned before changing the kernel diff --git a/tasks/__init__.py b/tasks/__init__.py new file mode 100644 index 0000000..19fcdc7 --- /dev/null +++ b/tasks/__init__.py @@ -0,0 +1,2 @@ +"""Workbook tasks.""" + diff --git a/tests/test_correctness.py b/tests/test_correctness.py new file mode 100644 index 0000000..9597507 --- /dev/null +++ b/tests/test_correctness.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import math + +import torch + +from reference.torch_attention import torch_attention +from reference.torch_matmul import torch_matmul +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from reference.torch_vector_add import torch_vector_add + + +def test_vector_add_matches_torch(): + x = torch.randn(257) + y = torch.randn(257) + torch.testing.assert_close(torch_vector_add(x, y), x + y) + + +def test_row_softmax_matches_torch(): + x = torch.randn(32, 65) + torch.testing.assert_close(torch_row_softmax(x), torch.softmax(x, dim=1)) + + +def test_matmul_matches_torch(): + a = torch.randn(16, 24) + b = torch.randn(24, 8) + torch.testing.assert_close(torch_matmul(a, b), a @ b) + + +def test_online_softmax_matches_torch(): + x = torch.randn(12, 33) + torch.testing.assert_close( + torch_online_softmax(x), torch.softmax(x, dim=1), atol=1e-5, rtol=1e-5 + ) + + +def test_attention_matches_manual_formula(): + q = torch.randn(1, 2, 8, 16) + k = torch.randn(1, 2, 8, 16) + v = torch.randn(1, 2, 8, 16) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.shape[-1]) + expected = torch.matmul(torch.softmax(scores, dim=-1), v) + torch.testing.assert_close(torch_attention(q, k, v), expected, atol=1e-5, rtol=1e-5) + diff --git a/tests/test_extension_import.py b/tests/test_extension_import.py new file mode 100644 index 0000000..759aa3a --- /dev/null +++ b/tests/test_extension_import.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import pytest +import torch + +from tools.lab_extension import build_extension + + +@pytest.mark.cuda_required +@pytest.mark.skeleton +def test_extension_can_build_or_skip(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + ext = build_extension(verbose=False) + if ext is None: + pytest.skip("extension build/load is unavailable in this environment") + assert hasattr(torch.ops, "kernel_lab") + assert hasattr(torch.ops.kernel_lab, "vector_add") diff --git a/tests/test_numerics.py b/tests/test_numerics.py new file mode 100644 index 0000000..dd3911e --- /dev/null +++ b/tests/test_numerics.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import torch + +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax + + +def test_row_softmax_handles_large_values(): + x = torch.tensor([[10000.0, 10001.0, 9999.0]], dtype=torch.float32) + out = torch_row_softmax(x) + torch.testing.assert_close(out.sum(dim=1), torch.ones(1), atol=1e-6, rtol=1e-6) + assert torch.isfinite(out).all() + + +def test_online_softmax_handles_large_negative_values(): + x = 
torch.tensor([[-10000.0, -9998.0, -9999.0]], dtype=torch.float32) + out = torch_online_softmax(x) + torch.testing.assert_close(out.sum(dim=1), torch.ones(1), atol=1e-6, rtol=1e-6) + assert torch.isfinite(out).all() + + +def test_row_and_online_softmax_agree(): + x = torch.randn(10, 40) * 8.0 + torch.testing.assert_close( + torch_row_softmax(x), torch_online_softmax(x), atol=1e-5, rtol=1e-5 + ) + diff --git a/tests/test_shapes.py b/tests/test_shapes.py new file mode 100644 index 0000000..6691882 --- /dev/null +++ b/tests/test_shapes.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import torch + +from reference.torch_attention import torch_attention +from reference.torch_matmul import torch_matmul +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from reference.torch_vector_add import torch_vector_add + + +def test_vector_add_shape(): + x = torch.randn(11) + y = torch.randn(11) + assert torch_vector_add(x, y).shape == x.shape + + +def test_row_softmax_shape(): + x = torch.randn(4, 9) + assert torch_row_softmax(x).shape == x.shape + + +def test_matmul_shape(): + a = torch.randn(5, 7) + b = torch.randn(7, 3) + assert torch_matmul(a, b).shape == (5, 3) + + +def test_online_softmax_shape(): + x = torch.randn(3, 13) + assert torch_online_softmax(x).shape == x.shape + + +def test_attention_shape(): + q = torch.randn(2, 4, 8, 16) + k = torch.randn(2, 4, 8, 16) + v = torch.randn(2, 4, 8, 16) + assert torch_attention(q, k, v).shape == q.shape + diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..eecb0de --- /dev/null +++ b/tools/__init__.py @@ -0,0 +1,2 @@ +"""Helper modules and scripts for the lab.""" + diff --git a/tools/check_env.py b/tools/check_env.py new file mode 100644 index 0000000..b0c0686 --- /dev/null +++ b/tools/check_env.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import platform +import shutil +import subprocess + +import torch + + +def run_command(cmd: list[str]) -> str: + if shutil.which(cmd[0]) is None: + return "not found" + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + text = (result.stdout or result.stderr).strip() + return text or f"command exited with code {result.returncode}" + except Exception as exc: # pragma: no cover - defensive + return f"error: {exc}" + + +def main() -> None: + print("=== System ===") + print("python:", platform.python_version()) + print("platform:", platform.platform()) + + print("\n=== PyTorch ===") + print("torch:", torch.__version__) + print("torch.cuda.is_available():", torch.cuda.is_available()) + print("torch.version.cuda:", torch.version.cuda) + + if torch.cuda.is_available(): + device_count = torch.cuda.device_count() + print("cuda device count:", device_count) + for idx in range(device_count): + name = torch.cuda.get_device_name(idx) + capability = torch.cuda.get_device_capability(idx) + print(f"device {idx}: {name} | capability={capability[0]}.{capability[1]}") + else: + print("no CUDA device visible to PyTorch") + + print("\n=== Triton ===") + try: + import triton # type: ignore + + print("triton:", triton.__version__) + except Exception as exc: + print("triton import failed:", exc) + + print("\n=== Toolkit / Driver Hints ===") + print("nvcc --version:") + print(run_command(["nvcc", "--version"])) + print("\nnvidia-smi:") + print(run_command(["nvidia-smi"])) + + +if __name__ == "__main__": + main() + diff --git a/tools/compare_against_torch.py 
b/tools/compare_against_torch.py new file mode 100644 index 0000000..554fba7 --- /dev/null +++ b/tools/compare_against_torch.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import torch + +from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd +from kernels.triton.online_softmax import triton_online_softmax +from kernels.triton.row_softmax import triton_row_softmax +from kernels.triton.tiled_matmul import triton_tiled_matmul +from kernels.triton.vector_add import triton_vector_add +from reference.torch_attention import torch_attention +from reference.torch_matmul import torch_matmul +from reference.torch_online_softmax import torch_online_softmax +from reference.torch_row_softmax import torch_row_softmax +from reference.torch_vector_add import torch_vector_add +from tools.lab_extension import build_extension + + +def compare_vector_add(device: str) -> None: + x = torch.randn(4097, device=device) + y = torch.randn(4097, device=device) + ref = torch_vector_add(x, y) + print("torch reference ready") + try: + torch.testing.assert_close(triton_vector_add(x, y), ref) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close(torch.ops.kernel_lab.vector_add(x, y), ref) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def compare_softmax(device: str, variant: str) -> None: + x = torch.randn(128, 257, device=device) + ref = torch_row_softmax(x) if variant == "row" else torch_online_softmax(x) + print("torch reference ready") + triton_fn = triton_row_softmax if variant == "row" else triton_online_softmax + try: + torch.testing.assert_close(triton_fn(x), ref, atol=1e-4, rtol=1e-4) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + op_name = "row_softmax" if variant == "row" else "online_softmax" + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close( + getattr(torch.ops.kernel_lab, op_name)(x), ref, atol=1e-4, rtol=1e-4 + ) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def compare_matmul(device: str) -> None: + a = torch.randn(64, 96, device=device) + b = torch.randn(96, 48, device=device) + ref = torch_matmul(a, b) + print("torch reference ready") + try: + torch.testing.assert_close(triton_tiled_matmul(a, b), ref, atol=1e-3, rtol=1e-3) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close( + torch.ops.kernel_lab.tiled_matmul(a, b), ref, atol=1e-3, rtol=1e-3 + ) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def compare_attention(device: str) -> None: + q = torch.randn(1, 2, 16, 32, device=device) + k = torch.randn(1, 2, 16, 32, device=device) + v = torch.randn(1, 2, 16, 32, device=device) + ref = torch_attention(q, k, v, causal=False) + print("torch reference ready") + try: + 
torch.testing.assert_close( + triton_flash_attention_fwd(q, k, v, causal=False), ref, atol=2e-3, rtol=2e-3 + ) + print("triton matches torch") + except Exception as exc: + print(f"triton unavailable: {exc}") + ext = build_extension(verbose=False) if device == "cuda" else None + if ext is not None and hasattr(torch.ops, "kernel_lab"): + try: + torch.testing.assert_close( + torch.ops.kernel_lab.flash_attention_fwd(q, k, v, False), + ref, + atol=2e-3, + rtol=2e-3, + ) + print("cuda op matches torch") + except Exception as exc: + print(f"cuda op unavailable: {exc}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + choices=["vector_add", "row_softmax", "online_softmax", "matmul", "attention"], + required=True, + ) + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + args = parser.parse_args() + + if args.task == "vector_add": + compare_vector_add(args.device) + elif args.task == "row_softmax": + compare_softmax(args.device, "row") + elif args.task == "online_softmax": + compare_softmax(args.device, "online") + elif args.task == "matmul": + compare_matmul(args.device) + else: + compare_attention(args.device) + + +if __name__ == "__main__": + main() diff --git a/tools/lab_extension.py b/tools/lab_extension.py new file mode 100644 index 0000000..09f6518 --- /dev/null +++ b/tools/lab_extension.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Optional + +import torch + +try: + from torch.utils.cpp_extension import load +except ImportError: # pragma: no cover - depends on torch install + load = None + + +ROOT = Path(__file__).resolve().parents[1] +CUDA_DIR = ROOT / "kernels" / "cuda" + + +def _format_torch_cuda_arch(raw_arch: str) -> str: + if raw_arch.isdigit() and len(raw_arch) == 3: + return f"{raw_arch[:2]}.{raw_arch[2]}" + return raw_arch + + +def build_extension(verbose: bool = True) -> Optional[object]: + """Build or load the lab extension if the local environment allows it.""" + if load is None: + print("torch.utils.cpp_extension.load is unavailable in this PyTorch build.") + return None + if not torch.cuda.is_available(): + print("CUDA is not available; skipping extension build.") + return None + + arch = _format_torch_cuda_arch(os.environ.get("KERNEL_LAB_CUDA_ARCH", "120")) + os.environ.setdefault("TORCH_CUDA_ARCH_LIST", arch) + + sources = [ + str(CUDA_DIR / "binding" / "binding.cpp"), + str(CUDA_DIR / "src" / "vector_add.cu"), + str(CUDA_DIR / "src" / "row_softmax.cu"), + str(CUDA_DIR / "src" / "tiled_matmul.cu"), + str(CUDA_DIR / "src" / "online_softmax.cu"), + str(CUDA_DIR / "src" / "flash_attention_fwd.cu"), + ] + + try: + return load( + name="kernel_lab_ext", + sources=sources, + extra_include_paths=[str(CUDA_DIR / "include")], + extra_cflags=["-O0", "-std=c++17"], + extra_cuda_cflags=["-O0", "-lineinfo"], + verbose=verbose, + ) + except Exception as exc: # pragma: no cover - environment-dependent + print(f"Extension build/load failed: {exc}") + return None diff --git a/tools/print_device_info.py b/tools/print_device_info.py new file mode 100644 index 0000000..d6fc737 --- /dev/null +++ b/tools/print_device_info.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import torch + + +def main() -> None: + if not torch.cuda.is_available(): + print("CUDA is not available.") + return + + for idx in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(idx) + print(f"device {idx}: {props.name}") + print(f" 
capability: {props.major}.{props.minor}") + print(f" total memory (GB): {props.total_memory / 1e9:.2f}") + print(f" multiprocessors: {props.multi_processor_count}") + print(f" max threads per block: {props.max_threads_per_block}") + print(f" warp size: {props.warp_size}") + + +if __name__ == "__main__": + main() + diff --git a/tools/profile_ncu.sh b/tools/profile_ncu.sh new file mode 100755 index 0000000..2e00e0f --- /dev/null +++ b/tools/profile_ncu.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -eq 0 ]]; then + echo "usage: $0 <command> [args...]" + exit 1 +fi + +ncu --set full --target-processes all "$@" + diff --git a/tools/profile_nsys.sh b/tools/profile_nsys.sh new file mode 100755 index 0000000..ac45853 --- /dev/null +++ b/tools/profile_nsys.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -eq 0 ]]; then + echo "usage: $0 <command> [args...]" + exit 1 +fi + +mkdir -p profile-output +nsys profile --trace=cuda,nvtx,osrt --sample=none -o profile-output/profile "$@" + diff --git a/tools/run_all_benchmarks.sh b/tools/run_all_benchmarks.sh new file mode 100755 index 0000000..957bd38 --- /dev/null +++ b/tools/run_all_benchmarks.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +python bench/bench_vector_add.py "$@" +python bench/bench_softmax.py "$@" +python bench/bench_matmul.py "$@" +python bench/bench_attention.py "$@" + diff --git a/tools/run_all_tests.sh b/tools/run_all_tests.sh new file mode 100755 index 0000000..7a84c11 --- /dev/null +++ b/tools/run_all_tests.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +pytest -q + diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..82d736f --- /dev/null +++ b/uv.lock @@ -0,0 +1,832 @@ +version = 1 +revision = 3 +requires-python = ">=3.10" +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version < '3.11'", +] + +[[package]] +name = "cmake" +version = "4.3.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/40/72/c295e190193af5f41d583517db1ca1cf43eaa2af8140856dca114fa6486d/cmake-4.3.1.tar.gz", hash = "sha256:6fe523413cdd2568a19a6ec297b8f869a95a3f8edaf0dd73731b81412216e00e" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/43/07/05be57c389f8f0c3d0d7b878685ec1eed863b77787d65584c9399e294531/cmake-4.3.1-py3-none-macosx_10_10_universal2.whl", hash = "sha256:976337df534f4eea6b100a7af39f9a7a538aa5fd65b7d770cf2a07907439dca8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9b/80/09c381519855d53a5cce0e5e10e184f9e89caf6a6f1f7d7b42c17bd68d2e/cmake-4.3.1-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb7b7ef74beb69c099c39d7f151cfc94256bba1b75354e48ea87d6bf0dcb3007" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/0d/eab407c3592442711584d09bef5de17df93f39ea69baaa310c4564436177/cmake-4.3.1-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:8874aac9fbd07d71c506fcaf57255ff2cc015a15ea44146c0d1e694843d5e312" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/de/c7f487b21b33918c0af1dabfdf8d858799e01d62c2bd139fc871b86b21a2/cmake-4.3.1-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f439501f3f3ecf1dbafbee6226fc6cc680203202ddfe59586357d076c417ae8f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c4/9c/2de3a3b5a5983b72c3e2eeaa23a6c8d251ebae79d15cedb9818e708a4caf/cmake-4.3.1-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = 
"sha256:345f5e4ee783cd7691c6b54a631b43bb0c2efabf45afa64ac9000f9b0885d250" }, + { url = "https://mirrors.aliyun.com/pypi/packages/87/32/542ed40b3393bced9af073402f75fb6ae3e57d6656cc38a9470942de7b8c/cmake-4.3.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:80fc99c1958dbd926f529448dfcdbb1b176ecf31a8d485a3d68bd469487e8933" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7f/ec/4f99b7414984d668aaa9c6214df84af689db8756f1536ea81720bb2fef91/cmake-4.3.1-py3-none-manylinux_2_31_armv7l.whl", hash = "sha256:fdb744921ff4739e755faee005b36b97223d2bb8591ebd7b57abb5cf97300925" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bf/b4/627e18c8acf6219b1c8c521e7d702bca36edab21994992b64e68e1007430/cmake-4.3.1-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:fdc39eb421177bfa946af3600c797612ee76bfe6daa6036ff8958c504a99937b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cc/e1/e3b3dd1c81e72329f1ec3350a02154f74547eab2c1db8270eb2bc344edc1/cmake-4.3.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8d1d224f9df9e82f154ad31b9798b7b4c0d509a11ccdca695d0ee4d140c30c6d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/3e/7a1ca1992de268fe1284b3738549ea8859f665dec294d584ae9200d66ce6/cmake-4.3.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:056483831febe0934f25959bc74da077b18f23c7a064a0417432447ec27b8fb2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/16/ce/9b0e79835b674d2680fae7e57d3229abac0765c3d80ddcc6c5c67ec78fd8/cmake-4.3.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9567aac26ee7a0594e6b71a8f94e907c7ce957cdeedeca1404504228f2c9885f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bf/3a/eb15faf1ce961431017cda585650bd37a259fe53e836eae34263ea3647ab/cmake-4.3.1-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:8cadd35606d3e4f9a43173f7236cd8947fb9652fdfb272a916f0600ad169fbf2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/48/34/b76e2c7aa0c1aa833430a2b4a1caff3f4163b9db7c38d208a4b6e0287d54/cmake-4.3.1-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:75622c1e7266e60fab9d4f1f4b4dc25f22a4b57e902d9792c39538f46a997269" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3a/3b/afd7d51aabec951e8881812d9bf1d4c74c1882434e97eb9ed6f097591dd1/cmake-4.3.1-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e43acaa51bb8fe57a914424edb0efa91eb82d577fb74ecc6ff67da47a1d23524" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4f/be/6d9c4f0ef5383622c3d7fd508acb531b1cbaee530e7cf4196c415f548131/cmake-4.3.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e8bc1c42517ed9cb26eb293e720449830940ef7be6dcc101638b9cc65ece98c8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3a/49/ca3671c1b6859b5efe5b9f1bebf95cad823d48a3a8f366a72e207a1d7a02/cmake-4.3.1-py3-none-win32.whl", hash = "sha256:cd9058d730da5fa68394c41b26036b18850de494d730a0a85cde51558138b70b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/2a/a8db7d73a6941e9d4a177011137378b222cd6dcec383f1998f3594c73a0c/cmake-4.3.1-py3-none-win_amd64.whl", hash = "sha256:73fb3851fe760b0395983c5d3dd6da2364b1ce324f8546aee2078d162d96005a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9b/be/d62156e55b8a9614f29cd8e576e9bf925ee2b428e43bb0c4f02b5cb97c65/cmake-4.3.1-py3-none-win_arm64.whl", hash = "sha256:86e97fed7c9a61638b08937981fdc9bca9caec9df9c88b87aa0a47442583e02a" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6" }, +] + +[[package]] +name = "cuda-bindings" +version = "12.9.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "cuda-pathfinder" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/08/9d/dd87e1071bcb2e438c14e2e4497aa0037faf2c9775ac1d172f578f448668/cuda_bindings-12.9.6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bb2f1eedc8f65902b34e807c21a3b7c922dc8de1f51d0829ecbb5c6a5e9c5ff1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/1d/5631df2faa5e5f6bd3e8fef098d6fc1b7c6f38811821332ef28ad82ce0d4/cuda_bindings-12.9.6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9f9031e7a265e74f1517668139987253552d1677d995da4b0d990aa19b9b9b0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1f/a5/e9d37c10f6c27c9c65d53c6cd6d9763e1df99c004780585fc2ad9041fbe3/cuda_bindings-12.9.6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2662f59db67d9aeaf8959c593c91f600792c2970cf02cae2814387fc687b115a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/66/d5/bd4c03e9516d3cf788a270debe28d687e5c48b13a9931599bbddf01de302/cuda_bindings-12.9.6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8519707644ea630a365b101703a9136f4cb144760cc2c73281c38a05e07d08d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/50/04/8a4d45dc154a8a32982658cc55be291e9778d1197834b15d33427e2f65c1/cuda_bindings-12.9.6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea331bc47d9988cc61f0ecc5fa8df9dd188b4493ae1c6688bb1ee8ce8ba1af4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3b/69/4b0375e1b120dfa7427c31c8420cfdee596ecd03955fd291a96116fa375d/cuda_bindings-12.9.6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b2b54b95a47104eff56b5155818ab5790e3ccdba8dd51e2928ae56782aaf5b02" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dd/ad/2d9b80c28deae971ce4bbe991c23b81347a2a8918b2672020d07f070a596/cuda_bindings-12.9.6-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da30d89db8188b9beb5a6467d72b2f11d1b667ab901d2d373bcde51b97765b21" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b2/ca/729781d11445cfbacd1af1bf0edfe147c311212cfdf1d5c292e0565fabef/cuda_bindings-12.9.6-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3d1be8bd80b34f51dcbaf138dafd817e888cf2d12c47833019fd933beb32d7ef" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fe/f3/51768221aade33e711dcf7e4a52fdc0d0446c1baf39f6bcc9d69cfbceb0b/cuda_bindings-12.9.6-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48666e666f083a4c4387ffe20594b05e092b535a4453d1e4817d71237d02aa13" }, + { url = "https://mirrors.aliyun.com/pypi/packages/71/34/14afff4aabe3b5bd84c647dea4a4dfb917c94b8a8df0adb6b1622c2b465b/cuda_bindings-12.9.6-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b4f82f8f8061f3a39446bf854c4edd9bcc2d0da3f58d8f6f54541b3e4d5c933d" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/3d/d3/a29faf4fb371c2f43ffda23a938ec0bebf6dbab676350e137ae0f61e5ec0/cuda_bindings-12.9.6-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f00290f9468d2cfeee92aaad2275be32dfd2f4967a97ac0f12314b7e6281ad78" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/97/71e66b2ed65d80f7b70a1538af72d73cd798e22bc93d240d7e69f2366322/cuda_bindings-12.9.6-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3bc6e28cf5d133f72050c515db72876870fb009f1431bcbf45b54a179be2284" }, + { url = "https://mirrors.aliyun.com/pypi/packages/49/91/c10b575a001aad39c036efd649869aac8d97ef0ba9f1d8ad17b4946b3366/cuda_bindings-12.9.6-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e88d38fdf07cc777dec1afaba8139c2eedb3819063f6b42f1e2ea8516bdd6806" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/9a/998471e76bea78e96d3d7fdf0bc5f46c3210858e81e6d13d8186a9dbb636/cuda_bindings-12.9.6-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4df01e34cefd3275170b2ac0426d325271ab435e85f59a69300eacd8ff23d34c" }, +] + +[[package]] +name = "cuda-pathfinder" +version = "1.5.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa" }, +] + +[[package]] +name = "cuda-toolkit" +version = "12.8.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d4/c8/7dce3a0b15b42a3b58e7d96eb22a687d3bf2c44e01d149a6874629cd9938/cuda_toolkit-12.8.1-py2.py3-none-any.whl", hash = "sha256:adc7906af4ecbf9a352f9dca5734eceb21daec281ccfcf5675e1d2f724fc2cba" }, +] + +[package.optional-dependencies] +cublas = [ + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cudart = [ + { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cufft = [ + { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cufile = [ + { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, +] +cupti = [ + { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +curand = [ + { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cusolver = [ + { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +cusparse = [ + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvjitlink = [ + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvrtc = [ + { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +nvtx = [ + { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] + +[[package]] +name = "exceptiongroup" +version = "1.3.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598" }, +] + +[[package]] +name = "filelock" +version = "3.25.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70" }, +] + +[[package]] +name = "fsspec" +version = "2026.3.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/e1/cf/b50ddf667c15276a9ab15a70ef5f257564de271957933ffea49d2cdbcdfb/fsspec-2026.3.0.tar.gz", hash = "sha256:1ee6a0e28677557f8c2f994e3eea77db6392b4de9cd1f5d7a9e87a0ae9d01b41" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67" }, +] + +[[package]] +name = "kernel-lab" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "cmake" }, + { name = "ninja" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "packaging" }, + { name = "pytest" }, + { name = "torch" }, + { name = "triton" }, +] + +[package.metadata] +requires-dist = [ + { name = "cmake", specifier = ">=3.25" }, + { name = "ninja", specifier = ">=1.11" }, + { name = "numpy", specifier = ">=1.26" }, + { name = "packaging", specifier = ">=24.0" }, + { name = "pytest", specifier = ">=8.0" }, + { name = "torch", specifier = ">=2.10", index = "https://download.pytorch.org/whl/cu128" }, + { name = "triton", specifier = ">=3.0" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/e8/4b/3541d44f3937ba468b75da9eebcae497dcf67adb65caa16760b0a6807ebb/markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/1b/fbd8eed11021cabd9226c37342fa6ca4e8a98d8188a8d9b66740494960e4/markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419" }, + { url = "https://mirrors.aliyun.com/pypi/packages/40/01/e560d658dc0bb8ab762670ece35281dec7b6c1b33f5fbc09ebb57a185519/markupsafe-3.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ba88449deb3de88bd40044603fafffb7bc2b055d626a330323a9ed736661695" }, + { url = "https://mirrors.aliyun.com/pypi/packages/af/cd/ce6e848bbf2c32314c9b237839119c5a564a59725b53157c856e90937b7a/markupsafe-3.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f42d0984e947b8adf7dd6dde396e720934d12c506ce84eea8476409563607591" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c9/2a/b5c12c809f1c3045c4d580b035a743d12fcde53cf685dbc44660826308da/markupsafe-3.0.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c0c0b3ade1c0b13b936d7970b1d37a57acde9199dc2aecc4c336773e1d86049c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cf/e3/9427a68c82728d0a88c50f890d0fc072a1484de2f3ac1ad0bfc1a7214fd5/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/36/23578f29e9e582a4d0278e009b38081dbe363c5e7165113fad546918a232/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d2ee202e79d8ed691ceebae8e0486bd9a2cd4794cec4824e1c99b6f5009502f6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/56/21/dca11354e756ebd03e036bd8ad58d6d7168c80ce1fe5e75218e4945cbab7/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:177b5253b2834fe3678cb4a5f0059808258584c559193998be2601324fdeafb1" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/87/99/faba9369a7ad6e4d10b6a5fbf71fa2a188fe4a593b15f0963b73859a1bbd/markupsafe-3.0.3-cp310-cp310-win32.whl", hash = "sha256:2a15a08b17dd94c53a1da0438822d70ebcd13f8c3a95abe3a9ef9f11a94830aa" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/25/55dc3ab959917602c96985cb1253efaa4ff42f71194bddeb61eb7278b8be/markupsafe-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:c4ffb7ebf07cfe8931028e3e4c85f0357459a3f9f9490886198848f4fa002ec8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d0/9e/0a02226640c255d1da0b8d12e24ac2aa6734da68bff14c05dd53b94a0fc3/markupsafe-3.0.3-cp310-cp310-win_arm64.whl", hash = "sha256:e2103a929dfa2fcaf9bb4e7c091983a49c9ac3b19c9061b6d5427dd7d14d81a1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50" }, + { url = "https://mirrors.aliyun.com/pypi/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115" }, + { url = "https://mirrors.aliyun.com/pypi/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219" }, + { url = "https://mirrors.aliyun.com/pypi/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12" }, + { url = "https://mirrors.aliyun.com/pypi/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009" }, + { url = "https://mirrors.aliyun.com/pypi/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287" }, + { url = "https://mirrors.aliyun.com/pypi/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026" }, + { url = "https://mirrors.aliyun.com/pypi/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737" }, + { url = "https://mirrors.aliyun.com/pypi/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe" }, + { url = "https://mirrors.aliyun.com/pypi/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab" }, + { url = "https://mirrors.aliyun.com/pypi/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634" }, + { url = "https://mirrors.aliyun.com/pypi/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50" }, + { url = "https://mirrors.aliyun.com/pypi/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c" }, +] + +[[package]] +name = "networkx" +version = "3.4.2" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762" }, +] + +[[package]] +name = "ninja" +version = "1.13.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = 
"sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa" }, + { url = "https://mirrors.aliyun.com/pypi/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200" }, + { url = "https://mirrors.aliyun.com/pypi/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = "sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9" }, +] + +[[package]] +name = "numpy" +version = "2.2.6" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb" }, + { url = "https://mirrors.aliyun.com/pypi/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915" }, + { url = "https://mirrors.aliyun.com/pypi/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680" }, + { url = "https://mirrors.aliyun.com/pypi/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491" }, + { url = "https://mirrors.aliyun.com/pypi/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303" }, + { url = "https://mirrors.aliyun.com/pypi/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff" }, + { url = "https://mirrors.aliyun.com/pypi/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249" }, + { url = "https://mirrors.aliyun.com/pypi/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de" }, + { url = "https://mirrors.aliyun.com/pypi/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566" }, + { url = "https://mirrors.aliyun.com/pypi/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868" }, + { url = "https://mirrors.aliyun.com/pypi/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40" }, + { url = "https://mirrors.aliyun.com/pypi/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa" }, + { url = "https://mirrors.aliyun.com/pypi/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571" }, + { url = "https://mirrors.aliyun.com/pypi/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff" }, + { url = "https://mirrors.aliyun.com/pypi/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = 
"sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db" }, + { url = "https://mirrors.aliyun.com/pypi/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543" }, + { url = "https://mirrors.aliyun.com/pypi/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00" }, +] + +[[package]] +name = "numpy" +version = "2.4.4" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/ef/c6/4218570d8c8ecc9704b5157a3348e486e84ef4be0ed3e38218ab473c83d2/numpy-2.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f983334aea213c99992053ede6168500e5f086ce74fbc4acc3f2b00f5762e9db" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dd/92/b4d922c4a5f5dab9ed44e6153908a5c665b71acf183a83b93b690996e39b/numpy-2.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72944b19f2324114e9dc86a159787333b77874143efcf89a5167ef83cfee8af0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8a/dc/df98c095978fa6ee7b9a9387d1d58cbb3d232d0e69ad169a4ce784bde4fd/numpy-2.4.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:86b6f55f5a352b48d7fbfd2dbc3d5b780b2d79f4d3c121f33eb6efb22e9a2015" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/34/b3fdcec6e725409223dd27356bdf5a3c2cc2282e428218ecc9cb7acc9763/numpy-2.4.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:ba1f4fc670ed79f876f70082eff4f9583c15fb9a4b89d6188412de4d18ae2f40" }, + { url = "https://mirrors.aliyun.com/pypi/packages/68/62/63417c13aa35d57bee1337c67446761dc25ea6543130cf868eace6e8157b/numpy-2.4.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a87ec22c87be071b6bdbd27920b129b94f2fc964358ce38f3822635a3e2e03d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cf/c5/9fcb7e0e69cef59cf10c746b84f7d58b08bc66a6b7d459783c5a4f6101a6/numpy-2.4.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df3775294accfdd75f32c74ae39fcba920c9a378a2fc18a12b6820aa8c1fb502" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7e/43/80020edacb3f84b9efdd1591120a4296462c23fd8db0dde1666f6ef66f13/numpy-2.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d4e437e295f18ec29bc79daf55e8a47a9113df44d66f702f02a293d93a2d6dd" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/fd/06/af0658593b18a5f73532d377188b964f239eb0894e664a6c12f484472f97/numpy-2.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6aa3236c78803afbcb255045fbef97a9e25a1f6c9888357d205ddc42f4d6eba5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e6/ce/13a09ed65f5d0ce5c7dd0669250374c6e379910f97af2c08c57b0608eee4/numpy-2.4.4-cp311-cp311-win32.whl", hash = "sha256:30caa73029a225b2d40d9fae193e008e24b2026b7ee1a867b7ee8d96ca1a448e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bd/63/05d193dbb4b5eec1eca73822d80da98b511f8328ad4ae3ca4caf0f4db91d/numpy-2.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6bbe4eb67390b0a0265a2c25458f6b90a409d5d069f1041e6aff1e27e3d9a79e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/87/c5/8168052f080c26fa984c413305012be54741c9d0d74abd7fbeeccae3889f/numpy-2.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:fcfe2045fd2e8f3cb0ce9d4ba6dba6333b8fa05bb8a4939c908cd43322d14c7e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842" }, + { url = "https://mirrors.aliyun.com/pypi/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec" }, + { url = "https://mirrors.aliyun.com/pypi/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af" }, + { url = "https://mirrors.aliyun.com/pypi/packages/cd/da/464d551604320d1491bc345efed99b4b7034143a85787aab78d5691d5a0e/numpy-2.4.4-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:d2a8490669bfe99a233298348acc2d824d496dee0e66e31b66a6022c2ad74a5c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7d/90/8d23e3b0dafd024bf31bdec225b3bb5c2dbfa6912f8a53b8659f21216cbf/numpy-2.4.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45dbed2ab436a9e826e302fcdcbe9133f9b0006e5af7168afb8963a6520da103" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d1/73/a9d864e42a01896bb5974475438f16086be9ba1f0d19d0bb7a07427c4a8b/numpy-2.4.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c901b15172510173f5cb310eae652908340f8dede90fff9e3bf6c0d8dfd92f83" }, + { url = "https://mirrors.aliyun.com/pypi/packages/34/fb/14570d65c3bde4e202a031210475ae9cde9b7686a2e7dc97ee67d2833b35/numpy-2.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:99d838547ace2c4aace6c4f76e879ddfe02bb58a80c1549928477862b7a6d6ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8a/77/2ba9d87081fd41f6d640c83f26fb7351e536b7ce6dd9061b6af5904e8e46/numpy-2.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0aec54fd785890ecca25a6003fd9a5aed47ad607bbac5cd64f836ad8666f4959" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a2/23/52666c9a41708b0853fa3b1a12c90da38c507a3074883823126d4e9d5b30/numpy-2.4.4-cp313-cp313-win32.whl", hash = "sha256:07077278157d02f65c43b1b26a3886bce886f95d20aabd11f87932750dfb14ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/57/fb/48649b4971cde70d817cf97a2a2fdc0b4d8308569f1dd2f2611959d2e0cf/numpy-2.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:5c70f1cc1c4efbe316a572e2d8b9b9cc44e89b95f79ca3331553fbb63716e2bf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ba/d8/11490cddd564eb4de97b4579ef6bfe6a736cc07e94c1598590ae25415e01/numpy-2.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:ef4059d6e5152fa1a39f888e344c73fdc926e1b2dd58c771d67b0acfbf2aa67d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/5d/dab4339177a905aad3e2221c915b35202f1ec30d750dd2e5e9d9a72b804b/numpy-2.4.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4bbc7f303d125971f60ec0aaad5e12c62d0d2c925f0ab1273debd0e4ba37aba5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/e4/0564a65e7d3d97562ed6f9b0fd0fb0a6f559ee444092f105938b50043876/numpy-2.4.4-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:4d6d57903571f86180eb98f8f0c839fa9ebbfb031356d87f1361be91e433f5b7" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/29/8d/35a3a6ce5ad371afa58b4700f1c820f8f279948cca32524e0a695b0ded83/numpy-2.4.4-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:4636de7fd195197b7535f231b5de9e4b36d2c440b6e566d2e4e4746e6af0ca93" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f4/da/477731acbd5a58a946c736edfdabb2ac5b34c3d08d1ba1a7b437fa0884df/numpy-2.4.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad2e2ef14e0b04e544ea2fa0a36463f847f113d314aa02e5b402fdf910ef309e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e6/db/338535d9b152beabeb511579598418ba0212ce77cf9718edd70262cc4370/numpy-2.4.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a285b3b96f951841799528cd1f4f01cd70e7e0204b4abebac9463eecfcf2a40" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e2/a9/ad248e8f58beb7a0219b413c9c7d8151c5d285f7f946c3e26695bdbbe2df/numpy-2.4.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f8474c4241bc18b750be2abea9d7a9ec84f46ef861dbacf86a4f6e043401f79e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/1a/3b88ccd3694681356f70da841630e4725a7264d6a885c8d442a697e1146b/numpy-2.4.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4e874c976154687c1f71715b034739b45c7711bec81db01914770373d125e392" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c2/c9/fcfd5d0639222c6eac7f304829b04892ef51c96a75d479214d77e3ce6e33/numpy-2.4.4-cp313-cp313t-win32.whl", hash = "sha256:9c585a1790d5436a5374bac930dad6ed244c046ed91b2b2a3634eb2971d21008" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d5/e3/3938a61d1c538aaec8ed6fd6323f57b0c2d2d2219512434c5c878db76553/numpy-2.4.4-cp313-cp313t-win_amd64.whl", hash = "sha256:93e15038125dc1e5345d9b5b68aa7f996ec33b98118d18c6ca0d0b7d6198b7e8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/97/6a/7e345032cc60501721ef94e0e30b60f6b0bd601f9174ebd36389a2b86d40/numpy-2.4.4-cp313-cp313t-win_arm64.whl", hash = "sha256:0dfd3f9d3adbe2920b68b5cd3d51444e13a10792ec7154cd0a2f6e74d4ab3233" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6e/06/c54062f85f673dd5c04cbe2f14c3acb8c8b95e3384869bb8cc9bff8cb9df/numpy-2.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f169b9a863d34f5d11b8698ead99febeaa17a13ca044961aa8e2662a6c7766a0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4c/39/8a320264a84404c74cc7e79715de85d6130fa07a0898f67fb5cd5bd79908/numpy-2.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2483e4584a1cb3092da4470b38866634bafb223cbcd551ee047633fd2584599a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/91/fb/287076b2614e1d1044235f50f03748f31fa287e3dbe6abeb35cdfa351eca/numpy-2.4.4-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2d19e6e2095506d1736b7d80595e0f252d76b89f5e715c35e06e937679ea7d7a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/63/eb/fcc338595309910de6ecabfcef2419a9ce24399680bfb149421fa2df1280/numpy-2.4.4-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6a246d5914aa1c820c9443ddcee9c02bec3e203b0c080349533fae17727dfd1b" }, + { url = "https://mirrors.aliyun.com/pypi/packages/44/5d/e7e9044032a716cdfaa3fba27a8e874bf1c5f1912a1ddd4ed071bf8a14a6/numpy-2.4.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:989824e9faf85f96ec9c7761cd8d29c531ad857bfa1daa930cba85baaecf1a9a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/98/7c/21252050676612625449b4807d6b695b9ce8a7c9e1c197ee6216c8a65c7c/numpy-2.4.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:27a8d92cd10f1382a67d7cf4db7ce18341b66438bdd9f691d7b0e48d104c2a9d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b1/29/56d2bbef9465db24ef25393383d761a1af4f446a1df9b8cded4fe3a5a5d7/numpy-2.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e44319a2953c738205bf3354537979eaa3998ed673395b964c1176083dd46252" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e3/2b/a35a6d7589d21f44cea7d0a98de5ddcbb3d421b2622a5c96b1edf18707c3/numpy-2.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e892aff75639bbef0d2a2cfd55535510df26ff92f63c92cd84ef8d4ba5a5557f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/64/c9/d52ec581f2390e0f5f85cbfd80fb83d965fc15e9f0e1aec2195faa142cde/numpy-2.4.4-cp314-cp314-win32.whl", hash = "sha256:1378871da56ca8943c2ba674530924bb8ca40cd228358a3b5f302ad60cf875fc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fa/22/4cc31a62a6c7b74a8730e31a4274c5dc80e005751e277a2ce38e675e4923/numpy-2.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:715d1c092715954784bc79e1174fc2a90093dc4dc84ea15eb14dad8abdcdeb74" }, + { url = "https://mirrors.aliyun.com/pypi/packages/70/2e/14cda6f4d8e396c612d1bf97f22958e92148801d7e4f110cabebdc0eef4b/numpy-2.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:2c194dd721e54ecad9ad387c1d35e63dce5c4450c6dc7dd5611283dda239aabb" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b1/e8/8fed8c8d848d7ecea092dc3469643f9d10bc3a134a815a3b033da1d2039b/numpy-2.4.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2aa0613a5177c264ff5921051a5719d20095ea586ca88cc802c5c218d1c67d3e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/05/1a/d8007a5138c179c2bf33ef44503e83d70434d2642877ee8fbb230e7c0548/numpy-2.4.4-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:42c16925aa5a02362f986765f9ebabf20de75cdefdca827d14315c568dcab113" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/64/ffb99ac6ae93faf117bcbd5c7ba48a7f45364a33e8e458545d3633615dda/numpy-2.4.4-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:874f200b2a981c647340f841730fc3a2b54c9d940566a3c4149099591e2c4c3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6e/6e/795cc078b78a384052e73b2f6281ff7a700e9bf53bcce2ee579d4f6dd879/numpy-2.4.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9b39d38a9bd2ae1becd7eac1303d031c5c110ad31f2b319c6e7d98b135c934d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5f/86/2acbda8cc2af5f3d7bfc791192863b9e3e19674da7b5e533fded124d1299/numpy-2.4.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b268594bccac7d7cf5844c7732e3f20c50921d94e36d7ec9b79e9857694b1b2f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/59/cafd83018f4aa55e0ac6fa92aa066c0a1877b77a615ceff1711c260ffae8/numpy-2.4.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ac6b31e35612a26483e20750126d30d0941f949426974cace8e6b5c58a3657b0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f0/85/a42548db84e65ece46ab2caea3d3f78b416a47af387fcbb47ec28e660dc2/numpy-2.4.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8e3ed142f2728df44263aaf5fb1f5b0b99f4070c553a0d7f033be65338329150" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ed/ad/483d9e262f4b831000062e5d8a45e342166ec8aaa1195264982bca267e62/numpy-2.4.4-cp314-cp314t-win32.whl", hash = "sha256:dddbbd259598d7240b18c9d87c56a9d2fb3b02fe266f49a7c101532e78c1d871" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c7/03/2fc4e14c7bd4ff2964b74ba90ecb8552540b6315f201df70f137faa5c589/numpy-2.4.4-cp314-cp314t-win_amd64.whl", hash = 
"sha256:a7164afb23be6e37ad90b2f10426149fd75aee07ca55653d2aa41e66c4ef697e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/58/78/548fb8e07b1a341746bfbecb32f2c268470f45fa028aacdbd10d9bc73aab/numpy-2.4.4-cp314-cp314t-win_arm64.whl", hash = "sha256:ba203255017337d39f89bdd58417f03c4426f12beed0440cfd933cb15f8669c7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6b/33/8fae8f964a4f63ed528264ddf25d2b683d0b663e3cba26961eb838a7c1bd/numpy-2.4.4-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:58c8b5929fcb8287cbd6f0a3fae19c6e03a5c48402ae792962ac465224a629a4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bc/d0/1aabee441380b981cf8cdda3ae7a46aa827d1b5a8cce84d14598bc94d6d9/numpy-2.4.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:eea7ac5d2dce4189771cedb559c738a71512768210dc4e4753b107a2048b3d0e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a5/b8/aafb0d1065416894fccf4df6b49ef22b8db045187949545bced89c034b8e/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:51fc224f7ca4d92656d5a5eb315f12eb5fe2c97a66249aa7b5f562528a3be38c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/77/063baa20b08b431038c7f9ff5435540c7b7265c78cf56012a483019ca72d/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:28a650663f7314afc3e6ec620f44f333c386aad9f6fc472030865dc0ebb26ee3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c7/a8/379542d45a14f149444c5c4c4e7714707239ce9cc1de8c2803958889da14/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19710a9ca9992d7174e9c52f643d4272dcd1558c5f7af7f6f8190f633bd651a7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a2/c8/f0a45426d6d21e7ea3310a15cf90c43a14d9232c31a837702dba437f3373/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b2aec6af35c113b05695ebb5749a787acd63cafc83086a05771d1e1cd1e555f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/04/74/f4c001f4714c3ad9ce037e18cf2b9c64871a84951eaa0baf683a9ca9301c/numpy-2.4.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f2cf083b324a467e1ab358c105f6cad5ea950f50524668a80c486ff1db24e119" }, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.19.0.56" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/09/b8/277c51962ee46fa3e5b203ac5f76107c650f781d6891e681e28e6f3e9fe6/nvidia_cudnn_cu12-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:08caaf27fe556aca82a3ee3b5aa49a77e7de0cfcb7ff4e5c29da426387a8267e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c5/41/65225d42fba06fb3dd3972485ea258e7dd07a40d6e01c95da6766ad87354/nvidia_cudnn_cu12-9.19.0.56-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:ac6ad90a075bb33a94f2b4cf4622eac13dd4dc65cf6dd9c7572a318516a36625" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.28.9" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = 
"https://mirrors.aliyun.com/pypi/packages/08/c4/120d2dfd92dff2c776d68f361ff8705fdea2ca64e20b612fab0fd3f581ac/nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:50a36e01c4a090b9f9c47d92cec54964de6b9fcb3362d0e19b8ffc6323c21b60" }, + { url = "https://mirrors.aliyun.com/pypi/packages/4a/4e/44dbb46b3d1b0ec61afda8e84837870f2f9ace33c564317d59b70bc19d3e/nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:485776daa8447da5da39681af455aa3b2c2586ddcf4af8772495e7c532c7e5ab" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88" }, + { url = "https://mirrors.aliyun.com/pypi/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615" }, + { url = "https://mirrors.aliyun.com/pypi/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176" }, +] + +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9" }, +] + +[[package]] +name = "setuptools" +version = "81.0.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/0d/1c/73e719955c59b8e424d015ab450f51c0af856ae46ea2da83eba51cc88de1/setuptools-81.0.0.tar.gz", hash = "sha256:487b53915f52501f0a79ccfd0c02c165ffe06631443a886740b91af4b7a5845a" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6" }, +] + +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5" }, +] + +[[package]] +name = "tomli" +version = "2.4.1" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = 
"sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076" }, + { url = "https://mirrors.aliyun.com/pypi/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049" }, + { url = "https://mirrors.aliyun.com/pypi/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece" }, + { url = "https://mirrors.aliyun.com/pypi/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585" }, + { url = "https://mirrors.aliyun.com/pypi/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917" }, + { url = "https://mirrors.aliyun.com/pypi/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257" }, + { url = "https://mirrors.aliyun.com/pypi/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54" }, + { url = "https://mirrors.aliyun.com/pypi/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897" }, + { url = "https://mirrors.aliyun.com/pypi/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = 
"sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf" }, + { url = "https://mirrors.aliyun.com/pypi/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662" }, + { url = "https://mirrors.aliyun.com/pypi/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853" }, + { url = "https://mirrors.aliyun.com/pypi/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15" }, + { url = "https://mirrors.aliyun.com/pypi/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232" }, + { url = "https://mirrors.aliyun.com/pypi/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8" }, + { url = "https://mirrors.aliyun.com/pypi/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396" }, + { url = "https://mirrors.aliyun.com/pypi/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe" }, +] + +[[package]] +name = "torch" +version = "2.11.0+cu128" +source = { registry = "https://download.pytorch.org/whl/cu128" } +dependencies = [ + { name = "cuda-bindings", marker = "sys_platform == 'linux'" }, + { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" }, + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "networkx", version = "3.6.1", source = { registry = "https://mirrors.aliyun.com/pypi/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, + { name = "setuptools" }, + { name = "sympy" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp310-cp310-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp311-cp311-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" }, + { url = 
"https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp312-cp312-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp313-cp313t-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314-win_amd64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.11.0%2Bcu128-cp314-cp314t-win_amd64.whl" }, +] + +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/44/ba/b1b04f4b291a3205d95ebd24465de0e5bf010a2df27a4e58a9b5f039d8f2/triton-3.6.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c723cfb12f6842a0ae94ac307dba7e7a44741d720a40cf0e270ed4a4e3be781" }, + { url = "https://mirrors.aliyun.com/pypi/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea" }, + { url = "https://mirrors.aliyun.com/pypi/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3" }, + { url = "https://mirrors.aliyun.com/pypi/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ce/4e/41b0c8033b503fd3cfcd12392cdd256945026a91ff02452bef40ec34bee7/triton-3.6.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1722e172d34e32abc3eb7711d0025bb69d7959ebea84e3b7f7a341cd7ed694d6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/49/55/5ecf0dcaa0f2fbbd4420f7ef227ee3cb172e91e5fede9d0ecaddc43363b4/triton-3.6.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef5523241e7d1abca00f1d240949eebdd7c673b005edbbce0aca95b8191f1d43" }, + { url = "https://mirrors.aliyun.com/pypi/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803" }, + { url = "https://mirrors.aliyun.com/pypi/packages/48/db/56ee649cab5eaff4757541325aca81f52d02d4a7cd3506776cad2451e060/triton-3.6.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b3a97e8ed304dfa9bd23bb41ca04cdf6b2e617d5e782a8653d616037a5d537d" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple/" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548" }, +]